| author | sethg <none@none> | 2007-03-30 19:27:19 -0700 |
|---|---|---|
| committer | sethg <none@none> | 2007-03-30 19:27:19 -0700 |
| commit | ddece0baf7ff3a228bcd106c2bb2303ac0c9af89 (patch) | |
| tree | 332634e3cac8098e7d82e14de41ffdfac419e694 | |
| parent | d71dbb732372504daff1f1783bc0d8864ce9bd50 (diff) | |
| download | illumos-joyent-ddece0baf7ff3a228bcd106c2bb2303ac0c9af89.tar.gz | |
6533678 dtrace incorrectly resumes execution in the middle of 2-byte int 3 instructions
6533954 kmdb destroys usermode GSBASE when entering
6534035 dtrace_user_probe fails to consider alternate code selectors when checking for int3
6534061 amd64 DEBUG kernel's __SAVE_REGS should save the {gs,fs}base registers
6534200 syscall rewriting code is not thread-safe
6534218 Incorrect ASSERT logic in prnldt induces panic
6534277 syscall rewriting code in trap() should be aware of alternate segment selectors
| -rw-r--r-- | usr/src/uts/i86pc/os/dtrace_subr.c | 18 |
| -rw-r--r-- | usr/src/uts/i86pc/os/machdep.c | 98 |
| -rw-r--r-- | usr/src/uts/i86pc/os/trap.c | 114 |
| -rw-r--r-- | usr/src/uts/i86pc/sys/machsystm.h | 2 |
| -rw-r--r-- | usr/src/uts/intel/amd64/ml/mach_offsets.in | 4 |
| -rw-r--r-- | usr/src/uts/intel/amd64/sys/privregs.h | 29 |
| -rw-r--r-- | usr/src/uts/intel/fs/proc/prmachdep.c | 5 |
| -rw-r--r-- | usr/src/uts/intel/kdi/amd64/kdi_asm.s | 16 |
8 files changed, 235 insertions, 51 deletions
diff --git a/usr/src/uts/i86pc/os/dtrace_subr.c b/usr/src/uts/i86pc/os/dtrace_subr.c
index 7e369be269..b96d16fbfa 100644
--- a/usr/src/uts/i86pc/os/dtrace_subr.c
+++ b/usr/src/uts/i86pc/os/dtrace_subr.c
@@ -240,7 +240,8 @@ dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
 		rp->r_pc = npc;
 
 	} else if (rp->r_trapno == T_BPTFLT) {
-		uint8_t instr;
+		uint8_t instr, instr2;
+		caddr_t linearpc;
 		rwp = &CPU->cpu_ft_lock;
 
 		/*
@@ -258,14 +259,25 @@ dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
 		}
 		rw_exit(rwp);
 
+		if (dtrace_linear_pc(rp, p, &linearpc) != 0) {
+			trap(rp, addr, cpuid);
+			return;
+		}
+
 		/*
 		 * If the instruction that caused the breakpoint trap doesn't
 		 * look like an int 3 anymore, it may be that this tracepoint
 		 * was removed just after the user thread executed it. In
 		 * that case, return to user land to retry the instuction.
+		 * Note that we assume the length of the instruction to retry
+		 * is 1 byte because that's the length of FASTTRAP_INSTR.
+		 * We check for r_pc > 0 and > 2 so that we don't have to
+		 * deal with segment wraparound.
 		 */
-		if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 &&
-		    instr != FASTTRAP_INSTR) {
+		if (rp->r_pc > 0 && fuword8(linearpc - 1, &instr) == 0 &&
+		    instr != FASTTRAP_INSTR &&
+		    (instr != 3 || (rp->r_pc >= 2 &&
+		    (fuword8(linearpc - 2, &instr2) != 0 || instr2 != 0xCD)))) {
 			rp->r_pc--;
 			return;
 		}
diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c
index bc2da7f363..f96722f64f 100644
--- a/usr/src/uts/i86pc/os/machdep.c
+++ b/usr/src/uts/i86pc/os/machdep.c
@@ -29,6 +29,7 @@
 #include <sys/types.h>
 #include <sys/t_lock.h>
 #include <sys/param.h>
+#include <sys/segments.h>
 #include <sys/sysmacros.h>
 #include <sys/signal.h>
 #include <sys/systm.h>
@@ -1019,3 +1020,100 @@ dump_plat_data(void *dump_cbuf)
 {
 	return (0);
 }
+
+/*
+ * Calculates a linear address, given the CS selector and PC values
+ * by looking up the %cs selector process's LDT or the CPU's GDT.
+ * proc->p_ldtlock must be held across this call.
+ */
+int
+linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
+{
+	user_desc_t *descrp;
+	caddr_t baseaddr;
+	uint16_t idx = SELTOIDX(rp->r_cs);
+
+	ASSERT(rp->r_cs <= 0xFFFF);
+	ASSERT(MUTEX_HELD(&p->p_ldtlock));
+
+	if (SELISLDT(rp->r_cs)) {
+		/*
+		 * Currently 64 bit processes cannot have private LDTs.
+		 */
+		ASSERT(p->p_model != DATAMODEL_LP64);
+
+		if (p->p_ldt == NULL)
+			return (-1);
+
+		descrp = &p->p_ldt[idx];
+		baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+
+		/*
+		 * Calculate the linear address (wraparound is not only ok,
+		 * it's expected behavior). The cast to uint32_t is because
+		 * LDT selectors are only allowed in 32-bit processes.
+		 */
+		*linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
+		    rp->r_pc);
+	} else {
+#ifdef DEBUG
+		descrp = &CPU->cpu_gdt[idx];
+		baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+		/* GDT-based descriptors' base addresses should always be 0 */
+		ASSERT(baseaddr == 0);
+#endif
+		*linearp = (caddr_t)(uintptr_t)rp->r_pc;
+	}
+
+	return (0);
+}
+
+/*
+ * The implementation of dtrace_linear_pc is similar to the that of
+ * linear_pc, above, but here we acquire p_ldtlock before accessing
+ * p_ldt. This implementation is used by the pid provider; we prefix
+ * it with "dtrace_" to avoid inducing spurious tracing events.
+ */
+int
+dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
+{
+	user_desc_t *descrp;
+	caddr_t baseaddr;
+	uint16_t idx = SELTOIDX(rp->r_cs);
+
+	ASSERT(rp->r_cs <= 0xFFFF);
+
+	if (SELISLDT(rp->r_cs)) {
+		/*
+		 * Currently 64 bit processes cannot have private LDTs.
+		 */
+		ASSERT(p->p_model != DATAMODEL_LP64);
+
+		mutex_enter(&p->p_ldtlock);
+		if (p->p_ldt == NULL) {
+			mutex_exit(&p->p_ldtlock);
+			return (-1);
+		}
+		descrp = &p->p_ldt[idx];
+		baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+		mutex_exit(&p->p_ldtlock);
+
+		/*
+		 * Calculate the linear address (wraparound is not only ok,
+		 * it's expected behavior). The cast to uint32_t is because
+		 * LDT selectors are only allowed in 32-bit processes.
+		 */
+		*linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
+		    rp->r_pc);
+	} else {
+#ifdef DEBUG
+		descrp = &CPU->cpu_gdt[idx];
+		baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
+		/* GDT-based descriptors' base addresses should always be 0 */
+		ASSERT(baseaddr == 0);
+#endif
+		*linearp = (caddr_t)(uintptr_t)rp->r_pc;
+	}
+
+	return (0);
+}
diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c
index 38451ba8ad..2d0a093435 100644
--- a/usr/src/uts/i86pc/os/trap.c
+++ b/usr/src/uts/i86pc/os/trap.c
@@ -132,6 +132,9 @@ static const char *trap_type[] = {
 
 #define	TRAP_TYPES	(sizeof (trap_type) / sizeof (trap_type[0]))
 
+#define	SLOW_SCALL_SIZE	2
+#define	FAST_SCALL_SIZE	2
+
 int tudebug = 0;
 int tudebugbpt = 0;
 int tudebugfpe = 0;
@@ -206,8 +209,6 @@ die(uint_t type, struct regs *rp, caddr_t addr, processorid_t cpuid)
  * int <vector> is two bytes: 0xCD <vector>
  */
 
-#define	SLOW_SCALL_SIZE	2
-
 static int
 rewrite_syscall(caddr_t pc)
 {
@@ -227,28 +228,88 @@ rewrite_syscall(caddr_t pc)
  *
  * sysenter is two bytes: 0x0F 0x34
  * syscall is two bytes: 0x0F 0x05
+ * int $T_SYSCALLINT is two bytes: 0xCD 0x91
  */
 
-#define	FAST_SCALL_SIZE	2
-
 static int
-instr_is_fast_syscall(caddr_t pc, int which)
+instr_is_other_syscall(caddr_t pc, int which)
 {
 	uchar_t instr[FAST_SCALL_SIZE];
 
-	ASSERT(which == X86_SEP || which == X86_ASYSC);
+	ASSERT(which == X86_SEP || which == X86_ASYSC || which == 0xCD);
 
-	if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0 ||
-	    instr[0] != 0x0F)
+	if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0)
 		return (0);
 
-	if ((which == X86_SEP && instr[1] == 0x34) ||
-	    (which == X86_ASYSC && instr[1] == 0x05))
-		return (1);
+	switch (which) {
+	case X86_SEP:
+		if (instr[0] == 0x0F && instr[1] == 0x34)
+			return (1);
+		break;
+	case X86_ASYSC:
+		if (instr[0] == 0x0F && instr[1] == 0x05)
+			return (1);
+		break;
+	case 0xCD:
+		if (instr[0] == 0xCD && instr[1] == T_SYSCALLINT)
+			return (1);
+		break;
+	}
 
 	return (0);
 }
 
+static const char *
+syscall_insn_string(int syscall_insn)
+{
+	switch (syscall_insn) {
+	case X86_SEP:
+		return ("sysenter");
+	case X86_ASYSC:
+		return ("syscall");
+	case 0xCD:
+		return ("int");
+	default:
+		return ("Unknown");
+	}
+}
+
+static int
+ldt_rewrite_syscall(struct regs *rp, proc_t *p, int syscall_insn)
+{
+	caddr_t	linearpc;
+	int return_code = 0;
+
+	mutex_enter(&p->p_ldtlock);	/* Must be held across linear_pc() */
+
+	if (linear_pc(rp, p, &linearpc) == 0) {
+
+		/*
+		 * If another thread beat us here, it already changed
+		 * this site to the slower (int) syscall instruction.
+		 */
+		if (instr_is_other_syscall(linearpc, 0xCD)) {
+			return_code = 1;
+		} else if (instr_is_other_syscall(linearpc, syscall_insn)) {
+
+			if (rewrite_syscall(linearpc) == 0) {
+				return_code = 1;
+			}
+#ifdef DEBUG
+			else
+				cmn_err(CE_WARN, "failed to rewrite %s "
+				    "instruction in process %d",
+				    syscall_insn_string(syscall_insn),
+				    p->p_pid);
+#endif	/* DEBUG */
+		}
+	}
+
+	mutex_exit(&p->p_ldtlock);	/* Must be held across linear_pc() */
+
+	return (return_code);
+}
+
 /*
  * Test to see if the instruction at pc is a system call instruction.
  *
@@ -260,7 +321,7 @@ instr_is_fast_syscall(caddr_t pc, int which)
 #define	LCALLSIZE	7
 
 static int
-instr_is_syscall(caddr_t pc)
+instr_is_lcall_syscall(caddr_t pc)
 {
 	uchar_t instr[LCALLSIZE];
 
@@ -704,7 +765,7 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 		 * trap gate sequence. We just have to adjust the pc.
 		 */
 		if (watchpage && addr == (caddr_t)rp->r_sp &&
-		    rw == S_READ && instr_is_syscall((caddr_t)rp->r_pc)) {
+		    rw == S_READ && instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
 			extern void watch_syscall(void);
 
 			rp->r_pc += LCALLSIZE;
@@ -828,16 +889,8 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 		 * be to emulate that particular instruction.
 		 */
 		if (p->p_ldt != NULL &&
-		    instr_is_fast_syscall((caddr_t)rp->r_pc, X86_ASYSC)) {
-			if (rewrite_syscall((caddr_t)rp->r_pc) == 0)
-				goto out;
-#ifdef DEBUG
-			else
-				cmn_err(CE_WARN, "failed to rewrite syscall "
-				    "instruction in process %d",
-				    curthread->t_procp->p_pid);
-#endif	/* DEBUG */
-		}
+		    ldt_rewrite_syscall(rp, p, X86_ASYSC))
+			goto out;
 
 #ifdef __amd64
 		/*
@@ -1114,7 +1167,7 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 #ifdef _SYSCALL32_IMPL
 		if (p->p_model != DATAMODEL_NATIVE) {
 #endif /* _SYSCALL32_IMPL */
-		if (instr_is_syscall((caddr_t)rp->r_pc)) {
+		if (instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
 			if (type == T_SEGFLT + USER)
 				ASSERT(p->p_ldt != NULL);
 
@@ -1161,16 +1214,9 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 		 * this will be to emulate that particular instruction.
 		 */
 		if (p->p_ldt != NULL &&
-		    instr_is_fast_syscall((caddr_t)rp->r_pc, X86_SEP)) {
-			if (rewrite_syscall((caddr_t)rp->r_pc) == 0)
-				goto out;
-#ifdef DEBUG
-			else
-				cmn_err(CE_WARN, "failed to rewrite sysenter "
-				    "instruction in process %d",
-				    curthread->t_procp->p_pid);
-#endif	/* DEBUG */
-		}
+		    ldt_rewrite_syscall(rp, p, X86_SEP))
+			goto out;
+
 		/*FALLTHROUGH*/
 
 	case T_BOUNDFLT + USER:	/* bound fault */
diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h
index caacf71e30..6132815959 100644
--- a/usr/src/uts/i86pc/sys/machsystm.h
+++ b/usr/src/uts/i86pc/sys/machsystm.h
@@ -121,6 +121,8 @@ struct memlist;
 extern void memlist_add(uint64_t, uint64_t, struct memlist *,
     struct memlist **);
 extern page_t *page_get_physical(uintptr_t);
+extern int linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
+extern int dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
 
 #endif /* _KERNEL */
diff --git a/usr/src/uts/intel/amd64/ml/mach_offsets.in b/usr/src/uts/intel/amd64/ml/mach_offsets.in
index 0e41e5c098..018b9ef813 100644
--- a/usr/src/uts/intel/amd64/ml/mach_offsets.in
+++ b/usr/src/uts/intel/amd64/ml/mach_offsets.in
@@ -77,6 +77,10 @@ regs REGSIZE
 	r_r13	REGOFF_R13
 	r_r14	REGOFF_R14
 	r_r15	REGOFF_R15
+\#if DEBUG
+	__r_fsbase	REGOFF_FSBASE
+	__r_gsbase	REGOFF_GSBASE
+\#endif
 	r_ds	REGOFF_DS
 	r_es	REGOFF_ES
 	r_fs	REGOFF_FS
diff --git a/usr/src/uts/intel/amd64/sys/privregs.h b/usr/src/uts/intel/amd64/sys/privregs.h
index 38672670fe..2e76668a32 100644
--- a/usr/src/uts/intel/amd64/sys/privregs.h
+++ b/usr/src/uts/intel/amd64/sys/privregs.h
@@ -78,15 +78,11 @@ struct regs {
 	greg_t	r_r15;		/* callee-saved */
 
 	/*
-	 * XX64
-	 * We used to sample fsbase and gsbase on every exception
-	 * with expensive rdmsr's. Yet this was only useful at
-	 * best for debugging during amd64 bringup. We should take
-	 * these away but for now simply rename them to avoid any
-	 * flag days.
+	 * fsbase and gsbase are sampled on every exception in DEBUG kernels
+	 * only. They remain in the non-DEBUG kernel to avoid any flag days.
 	 */
-	greg_t	__r_fsbase;	/* XX64 no longer used by the kernel */
-	greg_t	__r_gsbase;	/* XX64 no longer used by the kernel */
+	greg_t	__r_fsbase;	/* no longer used in non-DEBUG builds */
+	greg_t	__r_gsbase;	/* no longer used in non-DEBUG builds */
 	greg_t	r_ds;
 	greg_t	r_es;
 	greg_t	r_fs;		/* %fs is *never* used by the kernel */
@@ -123,6 +119,20 @@ struct regs {
 #include <sys/machprivregs.h>
 #include <sys/pcb.h>
 
+#ifdef DEBUG
+#define	__SAVE_BASES				\
+	movl	$MSR_AMD_FSBASE, %ecx;		\
+	rdmsr;					\
+	movl	%eax, REGOFF_FSBASE(%rsp);	\
+	movl	%edx, REGOFF_FSBASE+4(%rsp);	\
+	movl	$MSR_AMD_GSBASE, %ecx;		\
+	rdmsr;					\
+	movl	%eax, REGOFF_GSBASE(%rsp);	\
+	movl	%edx, REGOFF_GSBASE+4(%rsp)
+#else
+#define	__SAVE_BASES
+#endif
+
 /*
  * Create a struct regs on the stack suitable for an
  * interrupt trap.
@@ -157,7 +167,8 @@ struct regs {
 	movw	%es, %cx;		\
 	movq	%rcx, REGOFF_ES(%rsp);	\
 	movw	%ds, %cx;		\
-	movq	%rcx, REGOFF_DS(%rsp)
+	movq	%rcx, REGOFF_DS(%rsp);	\
+	__SAVE_BASES
 
 #define	__RESTORE_REGS			\
 	movq	REGOFF_RDI(%rsp), %rdi;	\
diff --git a/usr/src/uts/intel/fs/proc/prmachdep.c b/usr/src/uts/intel/fs/proc/prmachdep.c
index caebfab916..267f6423a9 100644
--- a/usr/src/uts/intel/fs/proc/prmachdep.c
+++ b/usr/src/uts/intel/fs/proc/prmachdep.c
@@ -508,10 +508,9 @@ prnldt(proc_t *p)
 	ASSERT(MUTEX_HELD(&p->p_ldtlock));
 
 	/*
-	 * Currently 64 bit processes cannot have a private ldt.
+	 * Currently 64 bit processes cannot have private LDTs.
 	 */
-	ASSERT(get_udatamodel() != DATAMODEL_LP64 || p->p_ldt == NULL);
-
+	ASSERT(p->p_model != DATAMODEL_LP64 || p->p_ldt == NULL);
 	if (p->p_ldt == NULL)
 		return (0);
diff --git a/usr/src/uts/intel/kdi/amd64/kdi_asm.s b/usr/src/uts/intel/kdi/amd64/kdi_asm.s
index 365d4fdaea..f8042ae544 100644
--- a/usr/src/uts/intel/kdi/amd64/kdi_asm.s
+++ b/usr/src/uts/intel/kdi/amd64/kdi_asm.s
@@ -104,10 +104,20 @@
 	movw	%fs, %ax;				\
 	movq	%rax, REG_OFF(KDIREG_FS)(base);		\
 	movw	%gs, %ax;				\
-	movq	%rax, REG_OFF(KDIREG_GS)(base)
+	movq	%rax, REG_OFF(KDIREG_GS)(base);		\
+	movl	$MSR_AMD_GSBASE, %ecx;			\
+	rdmsr;						\
+	shlq	$32, %rdx;				\
+	orq	%rax, %rdx;				\
+	movq	%rdx, REG_OFF(KDIREG_GSBASE)(base)
 
 #define	KDI_RESTORE_REGS(base) \
 	movq	base, %rdi;				\
+	movq	REG_OFF(KDIREG_GSBASE)(%rdi), %rdx;	\
+	movq	%rdx, %rax;				\
+	shrq	$32, %rdx;				\
+	movl	$MSR_AMD_GSBASE, %ecx;			\
+	wrmsr;						\
 	movq	REG_OFF(KDIREG_ES)(%rdi), %rax;		\
 	movw	%ax, %es;				\
 	movq	REG_OFF(KDIREG_DS)(%rdi), %rax;		\
@@ -193,7 +203,9 @@ kdi_cmnint(void)
 	 * Switch to the kernel's GSBASE.  Neither GSBASE nor the ill-named
	 * KGSBASE can be trusted, as the kernel may or may not have already
 	 * done a swapgs.  All is not lost, as the kernel can divine the correct
-	 * value for us.
+	 * value for us.  Note that the previous GSBASE is saved in the
+	 * KDI_SAVE_REGS macro to prevent a usermode process's GSBASE from being
+	 * blown away.
 	 */
 	subq	$10, %rsp
 	sgdt	(%rsp)