summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author sethg <none@none> 2007-03-30 19:27:19 -0700
committer sethg <none@none> 2007-03-30 19:27:19 -0700
commit ddece0baf7ff3a228bcd106c2bb2303ac0c9af89 (patch)
tree 332634e3cac8098e7d82e14de41ffdfac419e694
parent d71dbb732372504daff1f1783bc0d8864ce9bd50 (diff)
download illumos-joyent-ddece0baf7ff3a228bcd106c2bb2303ac0c9af89.tar.gz
6533678 dtrace incorrectly resumes execution in the middle of 2-byte int 3 instructions
6533954 kmdb destroys usermode GSBASE when entering
6534035 dtrace_user_probe fails to consider alternate code selectors when checking for int3
6534061 amd64 DEBUG kernel's __SAVE_REGS should save the {gs,fs}base registers
6534200 syscall rewriting code is not thread-safe
6534218 Incorrect ASSERT logic in prnldt induces panic
6534277 syscall rewriting code in trap() should be aware of alternate segment selectors
-rw-r--r-- usr/src/uts/i86pc/os/dtrace_subr.c 18
-rw-r--r-- usr/src/uts/i86pc/os/machdep.c 98
-rw-r--r-- usr/src/uts/i86pc/os/trap.c 114
-rw-r--r-- usr/src/uts/i86pc/sys/machsystm.h 2
-rw-r--r-- usr/src/uts/intel/amd64/ml/mach_offsets.in 4
-rw-r--r-- usr/src/uts/intel/amd64/sys/privregs.h 29
-rw-r--r-- usr/src/uts/intel/fs/proc/prmachdep.c 5
-rw-r--r-- usr/src/uts/intel/kdi/amd64/kdi_asm.s 16
8 files changed, 235 insertions, 51 deletions
diff --git a/usr/src/uts/i86pc/os/dtrace_subr.c b/usr/src/uts/i86pc/os/dtrace_subr.c
index 7e369be269..b96d16fbfa 100644
--- a/usr/src/uts/i86pc/os/dtrace_subr.c
+++ b/usr/src/uts/i86pc/os/dtrace_subr.c
@@ -240,7 +240,8 @@ dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
rp->r_pc = npc;
} else if (rp->r_trapno == T_BPTFLT) {
- uint8_t instr;
+ uint8_t instr, instr2;
+ caddr_t linearpc;
rwp = &CPU->cpu_ft_lock;
/*
@@ -258,14 +259,25 @@ dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
}
rw_exit(rwp);
+ if (dtrace_linear_pc(rp, p, &linearpc) != 0) {
+ trap(rp, addr, cpuid);
+ return;
+ }
+
/*
* If the instruction that caused the breakpoint trap doesn't
* look like an int 3 anymore, it may be that this tracepoint
* was removed just after the user thread executed it. In
* that case, return to user land to retry the instruction.
+ * Note that we assume the length of the instruction to retry
+ * is 1 byte because that's the length of FASTTRAP_INSTR.
+ * We check for r_pc > 0 and r_pc >= 2 so that we don't have
+ * to deal with segment wraparound.
*/
- if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 &&
- instr != FASTTRAP_INSTR) {
+ if (rp->r_pc > 0 && fuword8(linearpc - 1, &instr) == 0 &&
+ instr != FASTTRAP_INSTR &&
+ (instr != 3 || (rp->r_pc >= 2 &&
+ (fuword8(linearpc - 2, &instr2) != 0 || instr2 != 0xCD)))) {
rp->r_pc--;
return;
}
diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c
index bc2da7f363..f96722f64f 100644
--- a/usr/src/uts/i86pc/os/machdep.c
+++ b/usr/src/uts/i86pc/os/machdep.c
@@ -29,6 +29,7 @@
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
+#include <sys/segments.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/systm.h>
@@ -1019,3 +1020,100 @@ dump_plat_data(void *dump_cbuf)
{
return (0);
}
+
+/*
+ * Calculates a linear address, given the CS selector and PC values
+ * by looking up the %cs selector in the process's LDT or the CPU's GDT.
+ * proc->p_ldtlock must be held across this call.
+ */
+int
+linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
+{
+ user_desc_t *descrp;
+ caddr_t baseaddr;
+ uint16_t idx = SELTOIDX(rp->r_cs);
+
+ ASSERT(rp->r_cs <= 0xFFFF);
+ ASSERT(MUTEX_HELD(&p->p_ldtlock));
+
+ if (SELISLDT(rp->r_cs)) {
+ /*
+ * Currently 64 bit processes cannot have private LDTs.
+ */
+ ASSERT(p->p_model != DATAMODEL_LP64);
+
+ if (p->p_ldt == NULL)
+ return (-1);
+
+ descrp = &p->p_ldt[idx];
+ baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+
+ /*
+ * Calculate the linear address (wraparound is not only ok,
+ * it's expected behavior). The cast to uint32_t is because
+ * LDT selectors are only allowed in 32-bit processes.
+ */
+ *linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
+ rp->r_pc);
+ } else {
+#ifdef DEBUG
+ descrp = &CPU->cpu_gdt[idx];
+ baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+ /* GDT-based descriptors' base addresses should always be 0 */
+ ASSERT(baseaddr == 0);
+#endif
+ *linearp = (caddr_t)(uintptr_t)rp->r_pc;
+ }
+
+ return (0);
+}
+
+/*
+ * The implementation of dtrace_linear_pc is similar to that of
+ * linear_pc, above, but here we acquire p_ldtlock before accessing
+ * p_ldt. This implementation is used by the pid provider; we prefix
+ * it with "dtrace_" to avoid inducing spurious tracing events.
+ */
+int
+dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
+{
+ user_desc_t *descrp;
+ caddr_t baseaddr;
+ uint16_t idx = SELTOIDX(rp->r_cs);
+
+ ASSERT(rp->r_cs <= 0xFFFF);
+
+ if (SELISLDT(rp->r_cs)) {
+ /*
+ * Currently 64 bit processes cannot have private LDTs.
+ */
+ ASSERT(p->p_model != DATAMODEL_LP64);
+
+ mutex_enter(&p->p_ldtlock);
+ if (p->p_ldt == NULL) {
+ mutex_exit(&p->p_ldtlock);
+ return (-1);
+ }
+ descrp = &p->p_ldt[idx];
+ baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+ mutex_exit(&p->p_ldtlock);
+
+ /*
+ * Calculate the linear address (wraparound is not only ok,
+ * it's expected behavior). The cast to uint32_t is because
+ * LDT selectors are only allowed in 32-bit processes.
+ */
+ *linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
+ rp->r_pc);
+ } else {
+#ifdef DEBUG
+ descrp = &CPU->cpu_gdt[idx];
+ baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+ /* GDT-based descriptors' base addresses should always be 0 */
+ ASSERT(baseaddr == 0);
+#endif
+ *linearp = (caddr_t)(uintptr_t)rp->r_pc;
+ }
+
+ return (0);
+}
diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c
index 38451ba8ad..2d0a093435 100644
--- a/usr/src/uts/i86pc/os/trap.c
+++ b/usr/src/uts/i86pc/os/trap.c
@@ -132,6 +132,9 @@ static const char *trap_type[] = {
#define TRAP_TYPES (sizeof (trap_type) / sizeof (trap_type[0]))
+#define SLOW_SCALL_SIZE 2
+#define FAST_SCALL_SIZE 2
+
int tudebug = 0;
int tudebugbpt = 0;
int tudebugfpe = 0;
@@ -206,8 +209,6 @@ die(uint_t type, struct regs *rp, caddr_t addr, processorid_t cpuid)
* int <vector> is two bytes: 0xCD <vector>
*/
-#define SLOW_SCALL_SIZE 2
-
static int
rewrite_syscall(caddr_t pc)
{
@@ -227,28 +228,88 @@ rewrite_syscall(caddr_t pc)
*
* sysenter is two bytes: 0x0F 0x34
* syscall is two bytes: 0x0F 0x05
+ * int $T_SYSCALLINT is two bytes: 0xCD 0x91
*/
-#define FAST_SCALL_SIZE 2
-
static int
-instr_is_fast_syscall(caddr_t pc, int which)
+instr_is_other_syscall(caddr_t pc, int which)
{
uchar_t instr[FAST_SCALL_SIZE];
- ASSERT(which == X86_SEP || which == X86_ASYSC);
+ ASSERT(which == X86_SEP || which == X86_ASYSC || which == 0xCD);
- if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0 ||
- instr[0] != 0x0F)
+ if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0)
return (0);
- if ((which == X86_SEP && instr[1] == 0x34) ||
- (which == X86_ASYSC && instr[1] == 0x05))
- return (1);
+ switch (which) {
+ case X86_SEP:
+ if (instr[0] == 0x0F && instr[1] == 0x34)
+ return (1);
+ break;
+ case X86_ASYSC:
+ if (instr[0] == 0x0F && instr[1] == 0x05)
+ return (1);
+ break;
+ case 0xCD:
+ if (instr[0] == 0xCD && instr[1] == T_SYSCALLINT)
+ return (1);
+ break;
+ }
return (0);
}
+static const char *
+syscall_insn_string(int syscall_insn)
+{
+ switch (syscall_insn) {
+ case X86_SEP:
+ return ("sysenter");
+ case X86_ASYSC:
+ return ("syscall");
+ case 0xCD:
+ return ("int");
+ default:
+ return ("Unknown");
+ }
+}
+
+static int
+ldt_rewrite_syscall(struct regs *rp, proc_t *p, int syscall_insn)
+{
+ caddr_t linearpc;
+ int return_code = 0;
+
+ mutex_enter(&p->p_ldtlock); /* Must be held across linear_pc() */
+
+ if (linear_pc(rp, p, &linearpc) == 0) {
+
+ /*
+ * If another thread beat us here, it already changed
+ * this site to the slower (int) syscall instruction.
+ */
+ if (instr_is_other_syscall(linearpc, 0xCD)) {
+ return_code = 1;
+ } else if (instr_is_other_syscall(linearpc, syscall_insn)) {
+
+ if (rewrite_syscall(linearpc) == 0) {
+ return_code = 1;
+ }
+#ifdef DEBUG
+ else
+ cmn_err(CE_WARN, "failed to rewrite %s "
+ "instruction in process %d",
+ syscall_insn_string(syscall_insn),
+ p->p_pid);
+#endif /* DEBUG */
+ }
+ }
+
+ mutex_exit(&p->p_ldtlock); /* Must be held across linear_pc() */
+
+ return (return_code);
+}
+
/*
* Test to see if the instruction at pc is a system call instruction.
*
@@ -260,7 +321,7 @@ instr_is_fast_syscall(caddr_t pc, int which)
#define LCALLSIZE 7
static int
-instr_is_syscall(caddr_t pc)
+instr_is_lcall_syscall(caddr_t pc)
{
uchar_t instr[LCALLSIZE];
@@ -704,7 +765,7 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
* trap gate sequence. We just have to adjust the pc.
*/
if (watchpage && addr == (caddr_t)rp->r_sp &&
- rw == S_READ && instr_is_syscall((caddr_t)rp->r_pc)) {
+ rw == S_READ && instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
extern void watch_syscall(void);
rp->r_pc += LCALLSIZE;
@@ -828,16 +889,8 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
* be to emulate that particular instruction.
*/
if (p->p_ldt != NULL &&
- instr_is_fast_syscall((caddr_t)rp->r_pc, X86_ASYSC)) {
- if (rewrite_syscall((caddr_t)rp->r_pc) == 0)
- goto out;
-#ifdef DEBUG
- else
- cmn_err(CE_WARN, "failed to rewrite syscall "
- "instruction in process %d",
- curthread->t_procp->p_pid);
-#endif /* DEBUG */
- }
+ ldt_rewrite_syscall(rp, p, X86_ASYSC))
+ goto out;
#ifdef __amd64
/*
@@ -1114,7 +1167,7 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
#ifdef _SYSCALL32_IMPL
if (p->p_model != DATAMODEL_NATIVE) {
#endif /* _SYSCALL32_IMPL */
- if (instr_is_syscall((caddr_t)rp->r_pc)) {
+ if (instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
if (type == T_SEGFLT + USER)
ASSERT(p->p_ldt != NULL);
@@ -1161,16 +1214,9 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
* this will be to emulate that particular instruction.
*/
if (p->p_ldt != NULL &&
- instr_is_fast_syscall((caddr_t)rp->r_pc, X86_SEP)) {
- if (rewrite_syscall((caddr_t)rp->r_pc) == 0)
- goto out;
-#ifdef DEBUG
- else
- cmn_err(CE_WARN, "failed to rewrite sysenter "
- "instruction in process %d",
- curthread->t_procp->p_pid);
-#endif /* DEBUG */
- }
+ ldt_rewrite_syscall(rp, p, X86_SEP))
+ goto out;
+
/*FALLTHROUGH*/
case T_BOUNDFLT + USER: /* bound fault */
diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h
index caacf71e30..6132815959 100644
--- a/usr/src/uts/i86pc/sys/machsystm.h
+++ b/usr/src/uts/i86pc/sys/machsystm.h
@@ -121,6 +121,8 @@ struct memlist;
extern void memlist_add(uint64_t, uint64_t, struct memlist *,
struct memlist **);
extern page_t *page_get_physical(uintptr_t);
+extern int linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
+extern int dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
#endif /* _KERNEL */
diff --git a/usr/src/uts/intel/amd64/ml/mach_offsets.in b/usr/src/uts/intel/amd64/ml/mach_offsets.in
index 0e41e5c098..018b9ef813 100644
--- a/usr/src/uts/intel/amd64/ml/mach_offsets.in
+++ b/usr/src/uts/intel/amd64/ml/mach_offsets.in
@@ -77,6 +77,10 @@ regs REGSIZE
r_r13 REGOFF_R13
r_r14 REGOFF_R14
r_r15 REGOFF_R15
+\#if DEBUG
+ __r_fsbase REGOFF_FSBASE
+ __r_gsbase REGOFF_GSBASE
+\#endif
r_ds REGOFF_DS
r_es REGOFF_ES
r_fs REGOFF_FS
diff --git a/usr/src/uts/intel/amd64/sys/privregs.h b/usr/src/uts/intel/amd64/sys/privregs.h
index 38672670fe..2e76668a32 100644
--- a/usr/src/uts/intel/amd64/sys/privregs.h
+++ b/usr/src/uts/intel/amd64/sys/privregs.h
@@ -78,15 +78,11 @@ struct regs {
greg_t r_r15; /* callee-saved */
/*
- * XX64
- * We used to sample fsbase and gsbase on every exception
- * with expensive rdmsr's. Yet this was only useful at
- * best for debugging during amd64 bringup. We should take
- * these away but for now simply rename them to avoid any
- * flag days.
+ * fsbase and gsbase are sampled on every exception in DEBUG kernels
+ * only. They remain in the non-DEBUG kernel to avoid any flag days.
*/
- greg_t __r_fsbase; /* XX64 no longer used by the kernel */
- greg_t __r_gsbase; /* XX64 no longer used by the kernel */
+ greg_t __r_fsbase; /* no longer used in non-DEBUG builds */
+ greg_t __r_gsbase; /* no longer used in non-DEBUG builds */
greg_t r_ds;
greg_t r_es;
greg_t r_fs; /* %fs is *never* used by the kernel */
@@ -123,6 +119,20 @@ struct regs {
#include <sys/machprivregs.h>
#include <sys/pcb.h>
+#ifdef DEBUG
+#define __SAVE_BASES \
+ movl $MSR_AMD_FSBASE, %ecx; \
+ rdmsr; \
+ movl %eax, REGOFF_FSBASE(%rsp); \
+ movl %edx, REGOFF_FSBASE+4(%rsp); \
+ movl $MSR_AMD_GSBASE, %ecx; \
+ rdmsr; \
+ movl %eax, REGOFF_GSBASE(%rsp); \
+ movl %edx, REGOFF_GSBASE+4(%rsp)
+#else
+#define __SAVE_BASES
+#endif
+
/*
* Create a struct regs on the stack suitable for an
* interrupt trap.
@@ -157,7 +167,8 @@ struct regs {
movw %es, %cx; \
movq %rcx, REGOFF_ES(%rsp); \
movw %ds, %cx; \
- movq %rcx, REGOFF_DS(%rsp)
+ movq %rcx, REGOFF_DS(%rsp); \
+ __SAVE_BASES
#define __RESTORE_REGS \
movq REGOFF_RDI(%rsp), %rdi; \
diff --git a/usr/src/uts/intel/fs/proc/prmachdep.c b/usr/src/uts/intel/fs/proc/prmachdep.c
index caebfab916..267f6423a9 100644
--- a/usr/src/uts/intel/fs/proc/prmachdep.c
+++ b/usr/src/uts/intel/fs/proc/prmachdep.c
@@ -508,10 +508,9 @@ prnldt(proc_t *p)
ASSERT(MUTEX_HELD(&p->p_ldtlock));
/*
- * Currently 64 bit processes cannot have a private ldt.
+ * Currently 64 bit processes cannot have private LDTs.
*/
- ASSERT(get_udatamodel() != DATAMODEL_LP64 || p->p_ldt == NULL);
-
+ ASSERT(p->p_model != DATAMODEL_LP64 || p->p_ldt == NULL);
if (p->p_ldt == NULL)
return (0);
diff --git a/usr/src/uts/intel/kdi/amd64/kdi_asm.s b/usr/src/uts/intel/kdi/amd64/kdi_asm.s
index 365d4fdaea..f8042ae544 100644
--- a/usr/src/uts/intel/kdi/amd64/kdi_asm.s
+++ b/usr/src/uts/intel/kdi/amd64/kdi_asm.s
@@ -104,10 +104,20 @@
movw %fs, %ax; \
movq %rax, REG_OFF(KDIREG_FS)(base); \
movw %gs, %ax; \
- movq %rax, REG_OFF(KDIREG_GS)(base)
+ movq %rax, REG_OFF(KDIREG_GS)(base); \
+ movl $MSR_AMD_GSBASE, %ecx; \
+ rdmsr; \
+ shlq $32, %rdx; \
+ orq %rax, %rdx; \
+ movq %rdx, REG_OFF(KDIREG_GSBASE)(base)
#define KDI_RESTORE_REGS(base) \
movq base, %rdi; \
+ movq REG_OFF(KDIREG_GSBASE)(%rdi), %rdx; \
+ movq %rdx, %rax; \
+ shrq $32, %rdx; \
+ movl $MSR_AMD_GSBASE, %ecx; \
+ wrmsr; \
movq REG_OFF(KDIREG_ES)(%rdi), %rax; \
movw %ax, %es; \
movq REG_OFF(KDIREG_DS)(%rdi), %rax; \
@@ -193,7 +203,9 @@ kdi_cmnint(void)
* Switch to the kernel's GSBASE. Neither GSBASE nor the ill-named
* KGSBASE can be trusted, as the kernel may or may not have already
* done a swapgs. All is not lost, as the kernel can divine the correct
- * value for us.
+ * value for us. Note that the previous GSBASE is saved in the
+ * KDI_SAVE_REGS macro to prevent a usermode process's GSBASE from being
+ * blown away.
*/
subq $10, %rsp
sgdt (%rsp)