9685 KPTI %cr3 handling needs fixes

Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Patrick Mooney <patrick.mooney@joyent.com> Approved by: Richard Lowe <richlowe@richlowe.net>
author: John Levon <john.levon@joyent.com> 2018-07-30 10:05:48 +0100
committer: Richard Lowe <richlowe@richlowe.net> 2018-08-07 19:46:08 +0000
commit: eea802b0a2c12269d15276d4657e5cd64dd541a4 (patch)
tree: 428299fc23a7fc774b25363db24f80dfc085937c
parent: 72dc11568f48cd37cf8182a82d9cb55b22d7c805 (diff)
download: illumos-joyent-eea802b0a2c12269d15276d4657e5cd64dd541a4.tar.gz
4 files changed, 78 insertions, 40 deletions
diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s
index 036c254d08..83cbd69048 100644
--- a/usr/src/uts/i86pc/ml/kpti_trampolines.s
+++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s
@@ -251,6 +251,11 @@ kpti_kbase:
  * This is used for all interrupts that can plausibly be taken inside another
  * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
  *
+ * We also use this for #NP, even though it uses the standard IST: the
+ * additional %rsp checks below will catch when we get an exception doing an
+ * iret to userspace with a bad %cs/%ss.  This appears as a kernel trap, and
+ * only later gets redirected via kern_gpfault().
+ *
  * We check for whether we took the interrupt while in another trampoline, in
  * which case we need to use the kthread stack.
  */
@@ -649,7 +654,7 @@ tr_intr_ret_end:
 	MK_INTR_TRAMPOLINE_NOERR(invoptrap)
 	MK_INTR_TRAMPOLINE_NOERR(ndptrap)
 	MK_INTR_TRAMPOLINE(invtsstrap)
-	MK_INTR_TRAMPOLINE(segnptrap)
+	MK_DBG_INTR_TRAMPOLINE(segnptrap)
 	MK_DBG_INTR_TRAMPOLINE(stktrap)
 	MK_DBG_INTR_TRAMPOLINE(gptrap)
 	MK_DBG_INTR_TRAMPOLINE(pftrap)
diff --git a/usr/src/uts/i86pc/ml/locore.s b/usr/src/uts/i86pc/ml/locore.s
index acd96e271a..10db95ab51 100644
--- a/usr/src/uts/i86pc/ml/locore.s
+++ b/usr/src/uts/i86pc/ml/locore.s
@@ -159,7 +159,7 @@ _locore_start(struct boot_syscalls *sysp, ulong_t rsi, struct bootops *bop)
 	 * %rdi = boot services (should die someday)
 	 * %rdx = bootops
 	 * end
-	 */	
+	 */
 
 	leaq	edata(%rip), %rbp	/* reference edata for ksyms */
 	movq	$0, (%rbp)		/* limit stack back trace */
@@ -178,7 +178,7 @@ _locore_start(struct boot_syscalls *sysp, ulong_t rsi, struct bootops *bop)
 #endif
 	/*
 	 * Save call back for special x86 boot services vector
-	 */	
+	 */
 	movq	%rdi, sysp(%rip)
 
 	movq	%rdx, bootops(%rip)		/* save bootops */
@@ -208,7 +208,7 @@ _locore_start(struct boot_syscalls *sysp, ulong_t rsi, struct bootops *bop)
 #endif	/* __xpv */
 
 	/*
-	 * (We just assert this works by virtue of being here) 
+	 * (We just assert this works by virtue of being here)
 	 */
 	bts	$X86FSET_CPUID, x86_featureset(%rip)
 
@@ -268,7 +268,7 @@ _locore_start(struct boot_syscalls *sysp, struct bootops *bop)
 	/*
 	 *	%ecx = boot services (should die someday)
 	 *	%ebx = bootops
-	 */	
+	 */
 	mov	$edata, %ebp		/ edata needs to be defined for ksyms
 	movl	$0, (%ebp)		/ limit stack back trace
 
@@ -283,14 +283,14 @@ _locore_start(struct boot_syscalls *sysp, struct bootops *bop)
 	 */
 	mov	%ecx, sysp		/ save call back for boot services
 
- 	mov	%ebx, bootops		/ save bootops
+	mov	%ebx, bootops		/ save bootops
 	movl	$bootops, bootopsp
 
 
 	/*
 	 * Save all registers and flags
 	 */
-	pushal	
+	pushal
 	pushfl
 
 #if !defined(__xpv)
@@ -443,7 +443,7 @@ port_22_free:
 	 * cycle. If the CCR index was not valid for this Cyrix model, we may
 	 * have performed an external I/O cycle as well. In these cases and
 	 * if the motherboard/chipset vendor ignores I/O address line A1,
-	 * then the PIC will have IRQ3 set at the lowest priority as a side	
+	 * then the PIC will have IRQ3 set at the lowest priority as a side
 	 * effect of the above outb. We are reasonalbly confident that there
 	 * is not an unknown device on I/O port 0x22, so there should have been
 	 * no unpredictable side-effect of the above outb.
@@ -892,7 +892,7 @@ likelyM3:
 	 * now we will call anything with a DIR0 of 0x80 or higher an MIII.
 	 * The MIII is supposed to support large pages, but we will believe
 	 * it when we see it. For now we just enable and test for MII features.
-	 */	
+	 */
 	movl	$X86_TYPE_VIA_CYRIX_III, x86_type
 	jmp	likeMII
 
@@ -930,7 +930,7 @@ coma_bug:
  * fixed this bug sometime late in 1997 and no other exploits other than
  * xchgl have been discovered is good indication that this workaround is
  * reasonable.
- */	
+ */
 
 	.set	CYRIX_DBR0, 0x30	/ Debug Register 0
 	.set	CYRIX_DBR1, 0x31	/ Debug Register 1
@@ -939,7 +939,7 @@ coma_bug:
 	.set	CYRIX_DOR, 0x3c		/ Debug Opcode Register
 
 	/*
- 	 * What is known about DBR1, DBR2, DBR3, and DOR is that for normal
+	 * What is known about DBR1, DBR2, DBR3, and DOR is that for normal
 	 * cpu execution DBR1, DBR2, and DBR3 are set to 0. To obtain opcode
 	 * serialization, DBR1, DBR2, and DBR3 are loaded with 0xb8, 0x7f,
 	 * and 0xff. Then, DOR is loaded with the one byte opcode.
@@ -999,7 +999,7 @@ coma_bug:
 	/*
 	 * write DBR1
 	 */
-	movb	$CYRIX_DBR1, %al 
+	movb	$CYRIX_DBR1, %al
 	outb	$CYRIX_CRI
 	movb	$0xf8, %al
 	outb	$CYRIX_CRD
@@ -1201,6 +1201,7 @@ cmntrap()
 	leaq	dtrace_badtrap(%rip), %rdi
 	xorl	%eax, %eax
 	call	panic
+	SET_SIZE(cmntrap_pushed)
 	SET_SIZE(cmntrap)
 	SET_SIZE(_cmntrap)
 
diff --git a/usr/src/uts/intel/kdi/kdi_asm.s b/usr/src/uts/intel/kdi/kdi_asm.s
index 5180dbb1b2..edfbea10c7 100644
--- a/usr/src/uts/intel/kdi/kdi_asm.s
+++ b/usr/src/uts/intel/kdi/kdi_asm.s
@@ -452,8 +452,8 @@
 
 	pushq	%rdi
 	call	kdi_trap_pass
-	cmpq	$1, %rax
-	je	kdi_pass_to_kernel
+	testq	%rax, %rax
+	jnz	kdi_pass_to_kernel
 	popq	%rax /* cpusave in %rax */
 
 	SAVE_IDTGDT
@@ -569,17 +569,9 @@
 	/*NOTREACHED*/
 	SET_SIZE(kdi_resume)
 
-	ENTRY_NP(kdi_pass_to_kernel)
-
-	popq	%rdi /* cpusave */
-
-	movq	$KDI_CPU_STATE_NONE, KRS_CPU_STATE(%rdi)
 
 	/*
-	 * Find the trap and vector off the right kernel handler.  The trap
-	 * handler will expect the stack to be in trap order, with %rip being
-	 * the last entry, so we'll need to restore all our regs.  On i86xpv
-	 * we'll need to compensate for XPV_TRAP_POP.
+	 * We took a trap that should be handled by the kernel, not KMDB.
 	 *
 	 * We're hard-coding the three cases where KMDB has installed permanent
 	 * handlers, since after we KDI_RESTORE_REGS(), we don't have registers
@@ -587,15 +579,53 @@
 	 * through here at the same time.
 	 *
 	 * Note that we handle T_DBGENTR since userspace might have tried it.
+	 *
+	 * The trap handler will expect the stack to be in trap order, with %rip
+	 * being the last entry, so we'll need to restore all our regs.  On
+	 * i86xpv we'll need to compensate for XPV_TRAP_POP.
+	 *
+	 * %rax on entry is either 1 or 2, which is from kdi_trap_pass().
+	 * kdi_cmnint stashed the original %cr3 into KDIREG_CR3, then (probably)
+	 * switched us to the CPU's kf_kernel_cr3. But we're about to call, for
+	 * example:
+	 *
+	 * dbgtrap->trap()->tr_iret_kernel
+	 *
+	 * which, unlike, tr_iret_kdi, doesn't restore the original %cr3, so
+	 * we'll do so here if needed.
+	 *
+	 * This isn't just a matter of tidiness: for example, consider:
+	 *
+	 * hat_switch(oldhat=kas.a_hat, newhat=prochat)
+	 *  setcr3()
+	 *  reset_kpti()
+	 *   *brktrap* due to fbt on reset_kpti:entry
+	 *
+	 * Here, we have the new hat's %cr3, but we haven't yet updated
+	 * kf_kernel_cr3 (so its currently kas's). So if we don't restore here,
+	 * we'll stay on kas's cr3 value on returning from the trap: not good if
+	 * we fault on a userspace address.
 	 */
+	ENTRY_NP(kdi_pass_to_kernel)
+
+	popq	%rdi /* cpusave */
+	movq	$KDI_CPU_STATE_NONE, KRS_CPU_STATE(%rdi)
 	movq	KRS_GREGS(%rdi), %rsp
+
+	cmpq	$2, %rax
+	jne	no_restore_cr3
+	movq	REG_OFF(KDIREG_CR3)(%rsp), %r11
+	movq	%r11, %cr3
+
+no_restore_cr3:
 	movq	REG_OFF(KDIREG_TRAPNO)(%rsp), %rdi
+
 	cmpq	$T_SGLSTP, %rdi
-	je	1f
+	je	kdi_pass_dbgtrap
 	cmpq	$T_BPTFLT, %rdi
-	je	2f
+	je	kdi_pass_brktrap
 	cmpq	$T_DBGENTR, %rdi
-	je	3f
+	je	kdi_pass_invaltrap
 	/*
 	 * Hmm, unknown handler.  Somebody forgot to update this when they
 	 * added a new trap interposition... try to drop back into kmdb.
@@ -609,13 +639,13 @@
 	XPV_TRAP_PUSH; \
 	jmp	%cs:name
 
-1:
+kdi_pass_dbgtrap:
 	CALL_TRAP_HANDLER(dbgtrap)
 	/*NOTREACHED*/
-2:
+kdi_pass_brktrap:
 	CALL_TRAP_HANDLER(brktrap)
 	/*NOTREACHED*/
-3:
+kdi_pass_invaltrap:
 	CALL_TRAP_HANDLER(invaltrap)
 	/*NOTREACHED*/
 
diff --git a/usr/src/uts/intel/kdi/kdi_idt.c b/usr/src/uts/intel/kdi/kdi_idt.c
index d801588954..6ea4681bce 100644
--- a/usr/src/uts/intel/kdi/kdi_idt.c
+++ b/usr/src/uts/intel/kdi/kdi_idt.c
@@ -164,8 +164,8 @@ struct idt_description {
 	{ T_GPFLT, 0,		kdi_traperr13, NULL },
 	{ T_PGFLT, 0,		kdi_traperr14, NULL },
 	{ 15, 0,		kdi_invaltrap, NULL },
-	{ T_EXTERRFLT, 0, 	kdi_trap16, NULL },
-	{ T_ALIGNMENT, 0, 	kdi_traperr17, NULL },
+	{ T_EXTERRFLT, 0,	kdi_trap16, NULL },
+	{ T_ALIGNMENT, 0,	kdi_traperr17, NULL },
 	{ T_MCE, 0,		kdi_trap18, NULL },
 	{ T_SIMDFPE, 0,		kdi_trap19, NULL },
 	{ T_DBGENTR, 0,		kdi_trap20, NULL },
@@ -366,14 +366,16 @@ kdi_deactivate(void)
 }
 
 /*
- * We receive all breakpoints and single step traps.  Some of them,
- * including those from userland and those induced by DTrace providers,
- * are intended for the kernel, and must be processed there.  We adopt
- * this ours-until-proven-otherwise position due to the painful
- * consequences of sending the kernel an unexpected breakpoint or
- * single step.  Unless someone can prove to us that the kernel is
- * prepared to handle the trap, we'll assume there's a problem and will
- * give the user a chance to debug it.
+ * We receive all breakpoints and single step traps.  Some of them, including
+ * those from userland and those induced by DTrace providers, are intended for
+ * the kernel, and must be processed there.  We adopt this
+ * ours-until-proven-otherwise position due to the painful consequences of
+ * sending the kernel an unexpected breakpoint or single step.  Unless someone
+ * can prove to us that the kernel is prepared to handle the trap, we'll assume
+ * there's a problem and will give the user a chance to debug it.
+ *
+ * If we return 2, then the calling code should restore the trap-time %cr3: that
+ * is, it really is a kernel-originated trap.
  */
 int
 kdi_trap_pass(kdi_cpusave_t *cpusave)
@@ -390,7 +392,7 @@ kdi_trap_pass(kdi_cpusave_t *cpusave)
 
 	if (tt == T_BPTFLT && kdi_dtrace_get_state() ==
 	    KDI_DTSTATE_DTRACE_ACTIVE)
-		return (1);
+		return (2);
 
 	/*
 	 * See the comments in the kernel's T_SGLSTP handler for why we need to
author	John Levon <john.levon@joyent.com>	2018-07-30 10:05:48 +0100
committer	Richard Lowe <richlowe@richlowe.net>	2018-08-07 19:46:08 +0000
commit	eea802b0a2c12269d15276d4657e5cd64dd541a4 (patch)
tree	428299fc23a7fc774b25363db24f80dfc085937c
parent	72dc11568f48cd37cf8182a82d9cb55b22d7c805 (diff)
download	illumos-joyent-eea802b0a2c12269d15276d4657e5cd64dd541a4.tar.gz