author     josephb <none@none>    2007-05-09 11:29:53 -0700
committer  josephb <none@none>    2007-05-09 11:29:53 -0700
commit     95c0a3c85cc8a224af0bc2bc1f7400be641f43fc (patch)
tree       52c57d45ab01fb5c9f65fd68c24df0159bdb3b4f /usr/src
parent     db5ca0bda7f0c1698f5046285dec0f0dce9d3704 (diff)
download   illumos-gate-95c0a3c85cc8a224af0bc2bc1f7400be641f43fc.tar.gz
6520773 faster TLB shootdown on x86
Diffstat (limited to 'usr/src')

-rw-r--r--  usr/src/uts/i86pc/os/intr.c           6
-rw-r--r--  usr/src/uts/i86pc/os/mp_pc.c          3
-rw-r--r--  usr/src/uts/i86pc/os/x_call.c        52
-rw-r--r--  usr/src/uts/i86pc/sys/machcpuvar.h    2
-rw-r--r--  usr/src/uts/i86pc/sys/x_call.h        2
-rw-r--r--  usr/src/uts/i86pc/vm/hat_i86.c      153
-rw-r--r--  usr/src/uts/i86pc/vm/hat_i86.h       45
-rw-r--r--  usr/src/uts/i86pc/vm/htable.c        75

8 files changed, 167 insertions(+), 171 deletions(-)
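The core idea of the change: when the initiator of a TLB shootdown finds a target CPU parked in the idle loop (its mcpu_tlb_info word holds TLB_CPU_HALTED), it tags that word with TLB_INVAL_ALL via compare-and-swap instead of sending a cross-call interrupt; the idle CPU then does one full TLB flush in tlb_service() when it wakes. Below is a minimal user-space model of that handshake using C11 atomics. The flag names mirror the patch, but the function names, the atomic_exchange in service() (the kernel uses a CAS loop instead), and the printf scaffolding are illustrative only, not illumos code.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TLB_CPU_HALTED 0x1ul
#define TLB_INVAL_ALL  0x2ul

static _Atomic unsigned long tlb_info;  /* stand-in for mcpu_tlb_info */

/* Idle side: set TLB_CPU_HALTED before halting (cf. tlb_going_idle()). */
static void going_idle(void)
{
	atomic_fetch_or(&tlb_info, TLB_CPU_HALTED);
}

/*
 * Initiator side: try to tag a halted CPU with TLB_INVAL_ALL instead of
 * sending it a cross-call interrupt.  Returns true if the flush was
 * successfully deferred.
 */
static bool request_delayed_flush(void)
{
	unsigned long info = atomic_load(&tlb_info);

	while (info == TLB_CPU_HALTED) {
		if (atomic_compare_exchange_weak(&tlb_info, &info,
		    TLB_CPU_HALTED | TLB_INVAL_ALL))
			return true;	/* tagged while still halted */
	}
	/* Either someone already tagged it, or the CPU woke up. */
	return info == (TLB_CPU_HALTED | TLB_INVAL_ALL);
}

/*
 * Wakeup side: atomically swap the state back to 0 and honor a pending
 * request (cf. tlb_service()).
 */
static void service(void)
{
	unsigned long old = atomic_exchange(&tlb_info, 0);

	if (old & TLB_INVAL_ALL)
		puts("flush_all_tlb_entries()");	/* whole-TLB flush */
}

int main(void)
{
	going_idle();
	printf("deferred = %d\n", request_delayed_flush());
	service();	/* the "CPU" honors the flush on wakeup */
	return 0;
}
```

The CAS is what makes the race with wakeup safe: if the target clears TLB_CPU_HALTED first, the initiator's compare-and-swap fails and it falls back to a real cross call, so a flush can never be lost.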
```diff
diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c
index 598d608f1a..ea15df6828 100644
--- a/usr/src/uts/i86pc/os/intr.c
+++ b/usr/src/uts/i86pc/os/intr.c
@@ -53,6 +53,7 @@
 #include <sys/ontrap.h>
 #include <sys/x86_archext.h>
 #include <sys/promif.h>
+#include <vm/hat_i86.h>
 
 /*
@@ -901,6 +902,11 @@ do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
 #endif	/* TRAPTRACE */
 
 	/*
+	 * Handle any pending TLB flushing
+	 */
+	tlb_service();
+
+	/*
 	 * If it's a softint go do it now.
 	 */
 	if (rp->r_trapno == T_SOFTINT) {
diff --git a/usr/src/uts/i86pc/os/mp_pc.c b/usr/src/uts/i86pc/os/mp_pc.c
index 293750e1ec..eb07c7bd4f 100644
--- a/usr/src/uts/i86pc/os/mp_pc.c
+++ b/usr/src/uts/i86pc/os/mp_pc.c
@@ -43,6 +43,7 @@
 #include <sys/mach_mmu.h>
 #include <sys/promif.h>
 #include <sys/cpu.h>
+#include <vm/hat_i86.h>
 
 extern void real_mode_start(void);
 extern void real_mode_end(void);
@@ -244,7 +245,9 @@ mach_cpu_halt(char *msg)
 void
 mach_cpu_idle(void)
 {
+	tlb_going_idle();
 	i86_halt();
+	tlb_service();
 }
 
 void
diff --git a/usr/src/uts/i86pc/os/x_call.c b/usr/src/uts/i86pc/os/x_call.c
index 71aed5032b..a7f7066dd5 100644
--- a/usr/src/uts/i86pc/os/x_call.c
+++ b/usr/src/uts/i86pc/os/x_call.c
@@ -131,7 +131,6 @@ xc_serv(caddr_t arg1, caddr_t arg2)
 	int pri = (int)(uintptr_t)arg1;
 	struct cpu *cpup = CPU;
 	xc_arg_t arg2val;
-	uint_t initiator_first;
 
 	XC_TRACE(TT_XC_SVC_BEGIN, pri, (ulong_t)arg2);
 
@@ -168,22 +167,12 @@ xc_serv(caddr_t arg1, caddr_t arg2)
 		op = cpup->cpu_m.xc_state[pri];
 
 		/*
-		 * Special handling for xc_wait_sync(). The cross call is used
-		 * to allow the initiating CPU to wait until all other CPUs are
-		 * captured in the cross call. Then the initiator invokes the
-		 * service function before any other CPU. Then other CPUs can
-		 * invoke the service function.
-		 */
-		initiator_first = (cpup->cpu_m.xc_wait[pri] == 2);
-
-		/*
 		 * Don't invoke a null function.
 		 */
 		if (xc_mboxes[pri].func != NULL) {
-			if (!initiator_first)
-				cpup->cpu_m.xc_retval[pri] = (*xc_mboxes[pri].func)
-				    (xc_mboxes[pri].arg1, xc_mboxes[pri].arg2,
-				    xc_mboxes[pri].arg3);
+			cpup->cpu_m.xc_retval[pri] =
+			    (*xc_mboxes[pri].func)(xc_mboxes[pri].arg1,
+			    xc_mboxes[pri].arg2, xc_mboxes[pri].arg3);
 		} else
 			cpup->cpu_m.xc_retval[pri] = 0;
 
@@ -204,12 +193,6 @@ xc_serv(caddr_t arg1, caddr_t arg2)
 		while (cpup->cpu_m.xc_state[pri] != XC_DONE)
 			SMT_PAUSE();
 
-		if (xc_mboxes[pri].func != NULL && initiator_first) {
-			cpup->cpu_m.xc_retval[pri] = (*xc_mboxes[pri].func)
-			    (xc_mboxes[pri].arg1, xc_mboxes[pri].arg2,
-			    xc_mboxes[pri].arg3);
-		}
-
 		/*
 		 * Acknowledge that we have received the directive to continue.
 		 */
@@ -295,24 +278,6 @@ xc_sync(
 }
 
 /*
- * xc_sync_wait: similar to xc_sync(), except that the starting
- * cpu waits for all other cpus to check in before running its
- * service locally.
- */
-void
-xc_wait_sync(
-	xc_arg_t arg1,
-	xc_arg_t arg2,
-	xc_arg_t arg3,
-	int pri,
-	cpuset_t set,
-	xc_func_t func)
-{
-	xc_do_call(arg1, arg2, arg3, pri, set, func, 2);
-}
-
-
-/*
  * The routines xc_capture_cpus and xc_release_cpus
  * can be used in place of xc_sync in order to implement a critical
  * code section where all CPUs in the system can be controlled.
@@ -463,7 +428,6 @@ xc_release_cpus(void)
 *	-1 - no waiting, don't release remotes
 *	0 - no waiting, release remotes immediately
 *	1 - run service locally w/o waiting for remotes.
- *	2 - wait for remotes before running locally
 */
 static void
 xc_common(
@@ -521,9 +485,9 @@ xc_common(
 	}
 
 	/*
-	 * Run service locally if not waiting for remotes.
+	 * Run service locally
 	 */
-	if (sync != 2 && CPU_IN_SET(set, lcx) && func != NULL) {
+	if (CPU_IN_SET(set, lcx) && func != NULL) {
 		XC_TRACE(TT_XC_START, pri, CPU->cpu_id);
 		CPU->cpu_m.xc_retval[pri] = (*func)(arg1, arg2, arg3);
 	}
@@ -544,12 +508,6 @@ xc_common(
 		}
 	}
 
-	/*
-	 * Run service locally if waiting for remotes.
-	 */
-	if (sync == 2 && CPU_IN_SET(set, lcx) && func != NULL)
-		CPU->cpu_m.xc_retval[pri] = (*func)(arg1, arg2, arg3);
-
 	if (sync == 0)
 		return;
diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h
index 3217a674ab..894c089aaf 100644
--- a/usr/src/uts/i86pc/sys/machcpuvar.h
+++ b/usr/src/uts/i86pc/sys/machcpuvar.h
@@ -71,6 +71,8 @@ struct machcpu {
 
 	struct hat_cpu_info	*mcpu_hat_info;
 
+	volatile ulong_t	mcpu_tlb_info;
+
 	/* i86 hardware table addresses that cannot be shared */
 
 	user_desc_t	*mcpu_gdt;	/* GDT */
diff --git a/usr/src/uts/i86pc/sys/x_call.h b/usr/src/uts/i86pc/sys/x_call.h
index e6bd41bc77..37cd7e39af 100644
--- a/usr/src/uts/i86pc/sys/x_call.h
+++ b/usr/src/uts/i86pc/sys/x_call.h
@@ -79,8 +79,6 @@ extern uint_t xc_serv(caddr_t, caddr_t);
 extern void xc_call(xc_arg_t, xc_arg_t, xc_arg_t, int, cpuset_t, xc_func_t);
 extern void xc_trycall(xc_arg_t, xc_arg_t, xc_arg_t, cpuset_t, xc_func_t);
 extern void xc_sync(xc_arg_t, xc_arg_t, xc_arg_t, int, cpuset_t, xc_func_t);
-extern void xc_wait_sync(xc_arg_t, xc_arg_t, xc_arg_t, int, cpuset_t,
-	xc_func_t);
 extern void xc_capture_cpus(cpuset_t);
 extern void xc_release_cpus(void);
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index a585f73ced..d90478bd3d 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -62,6 +62,7 @@
 #include <sys/bootconf.h>
 #include <sys/bootsvcs.h>
 #include <sys/bootinfo.h>
+#include <sys/archsystm.h>
 
 #include <vm/seg_kmem.h>
 #include <vm/hat_i86.h>
@@ -185,26 +186,6 @@ extern void atomic_andb(uchar_t *addr, uchar_t val);
 #define	PP_CLRALL(pp)		PP_CLRRM(pp, P_MOD | P_REF | P_RO)
 
 /*
- * some useful tracing macros
- */
-
-int hattrace = 0;
-#ifdef DEBUG
-
-#define	HATIN(r, h, a, l)	\
-	if (hattrace) prom_printf("->%s hat=%p, adr=%p, len=%lx\n", #r, h, a, l)
-
-#define	HATOUT(r, h, a)		\
-	if (hattrace) prom_printf("<-%s hat=%p, adr=%p\n", #r, h, a)
-#else
-
-#define	HATIN(r, h, a, l)
-#define	HATOUT(r, h, a)
-
-#endif
-
-
-/*
  * kmem cache constructor for struct hat
  */
 /*ARGSUSED*/
@@ -218,8 +199,6 @@ hati_constructor(void *buf, void *handle, int kmflags)
 	    sizeof (pgcnt_t) * (mmu.max_page_level + 1));
 	hat->hat_stats = 0;
 	hat->hat_flags = 0;
-	mutex_init(&hat->hat_switch_mutex, NULL, MUTEX_DRIVER,
-	    (void *)ipltospl(DISP_LEVEL));
 	CPUSET_ZERO(hat->hat_cpus);
 	hat->hat_htable = NULL;
 	hat->hat_ht_hash = NULL;
@@ -913,13 +892,10 @@ hat_switch(hat_t *hat)
 	}
 
 	/*
-	 * Wait for any in flight pagetable invalidates on this hat to finish.
-	 * This is a spin lock at DISP_LEVEL
+	 * Add this CPU to the active set for this HAT.
 	 */
 	if (hat != kas.a_hat) {
-		mutex_enter(&hat->hat_switch_mutex);
 		CPUSET_ATOMIC_ADD(hat->hat_cpus, cpu->cpu_id);
-		mutex_exit(&hat->hat_switch_mutex);
 	}
 	cpu->cpu_current_hat = hat;
@@ -1460,7 +1436,6 @@ hat_memload(
 	level_t		level = 0;
 	pfn_t		pfn = page_pptonum(pp);
 
-	HATIN(hat_memload, hat, addr, (size_t)MMU_PAGESIZE);
 	ASSERT(IS_PAGEALIGNED(va));
 	ASSERT(hat == kas.a_hat || va < _userlimit);
 	ASSERT(hat == kas.a_hat ||
@@ -1486,7 +1461,6 @@ hat_memload(
 		attr |= HAT_STORECACHING_OK;
 	if (hati_load_common(hat, va, pp, attr, flags, level, pfn) != 0)
 		panic("unexpected hati_load_common() failure");
-	HATOUT(hat_memload, hat, addr);
 }
 
 /*
@@ -1509,7 +1483,6 @@ hat_memload_array(
 	pfn_t		pfn;
 	pgcnt_t		i;
 
-	HATIN(hat_memload_array, hat, addr, len);
 	ASSERT(IS_PAGEALIGNED(va));
 	ASSERT(hat == kas.a_hat || va + len <= _userlimit);
 	ASSERT(hat == kas.a_hat ||
@@ -1581,7 +1554,6 @@ hat_memload_array(
 		va += pgsize;
 		pgindx += mmu_btop(pgsize);
 	}
-	HATOUT(hat_memload_array, hat, addr);
 }
 
 /*
@@ -1631,7 +1603,6 @@ hat_devload(
 	int		f;	/* per PTE copy of flags - maybe modified */
 	uint_t		a;	/* per PTE copy of attr */
 
-	HATIN(hat_devload, hat, addr, len);
 	ASSERT(IS_PAGEALIGNED(va));
 	ASSERT(hat == kas.a_hat || eva <= _userlimit);
 	ASSERT(hat == kas.a_hat ||
@@ -1694,7 +1665,6 @@ hat_devload(
 		va += pgsize;
 		pfn += mmu_btop(pgsize);
 	}
-	HATOUT(hat_devload, hat, addr);
 }
 
 /*
@@ -1786,6 +1756,85 @@ hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
 }
 
 /*
+ * Flush all TLB entries, including global (ie. kernel) ones.
+ */
+static void
+flush_all_tlb_entries(void)
+{
+	ulong_t cr4 = getcr4();
+
+	if (cr4 & CR4_PGE) {
+		setcr4(cr4 & ~(ulong_t)CR4_PGE);
+		setcr4(cr4);
+
+		/*
+		 * 32 bit PAE also needs to always reload_cr3()
+		 */
+		if (mmu.max_level == 2)
+			reload_cr3();
+	} else {
+		reload_cr3();
+	}
+}
+
+#define	TLB_CPU_HALTED	(01ul)
+#define	TLB_INVAL_ALL	(02ul)
+#define	CAS_TLB_INFO(cpu, old, new)	\
+	caslong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new))
+
+/*
+ * Record that a CPU is going idle
+ */
+void
+tlb_going_idle(void)
+{
+	atomic_or_long((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, TLB_CPU_HALTED);
+}
+
+/*
+ * Service a delayed TLB flush if coming out of being idle.
+ */
+void
+tlb_service(void)
+{
+	ulong_t flags = getflags();
+	ulong_t tlb_info;
+	ulong_t found;
+
+	/*
+	 * Be sure interrupts are off while doing this so that
+	 * higher level interrupts correctly wait for flushes to finish.
+	 */
+	if (flags & PS_IE)
+		flags = intr_clear();
+
+	/*
+	 * We only have to do something if coming out of being idle.
+	 */
+	tlb_info = CPU->cpu_m.mcpu_tlb_info;
+	if (tlb_info & TLB_CPU_HALTED) {
+		ASSERT(CPU->cpu_current_hat == kas.a_hat);
+
+		/*
+		 * Atomic clear and fetch of old state.
+		 */
+		while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) {
+			ASSERT(found & TLB_CPU_HALTED);
+			tlb_info = found;
+			SMT_PAUSE();
+		}
+		if (tlb_info & TLB_INVAL_ALL)
+			flush_all_tlb_entries();
+	}
+
+	/*
+	 * Restore interrupt enable control bit.
+	 */
+	if (flags & PS_IE)
+		sti();
+}
+
+/*
  * Internal routine to do cross calls to invalidate a range of pages on
  * all CPUs using a given hat.
  */
@@ -1794,7 +1843,10 @@ hat_tlb_inval(hat_t *hat, uintptr_t va)
 {
 	extern int	flushes_require_xcalls;	/* from mp_startup.c */
 	cpuset_t	justme;
+	cpuset_t	check_cpus;
 	cpuset_t	cpus_to_shootdown;
+	cpu_t		*cpup;
+	int		c;
 
 	/*
 	 * If the hat is being destroyed, there are no more users, so
@@ -1833,6 +1885,34 @@ hat_tlb_inval(hat_t *hat, uintptr_t va)
 	else
 		cpus_to_shootdown = hat->hat_cpus;
 
+	/*
+	 * If any CPUs in the set are idle, just request a delayed flush
+	 * and avoid waking them up.
+	 */
+	check_cpus = cpus_to_shootdown;
+	for (c = 0; c < NCPU && !CPUSET_ISNULL(check_cpus); ++c) {
+		ulong_t tlb_info;
+
+		if (!CPU_IN_SET(check_cpus, c))
+			continue;
+		CPUSET_DEL(check_cpus, c);
+		cpup = cpu[c];
+		if (cpup == NULL)
+			continue;
+
+		tlb_info = cpup->cpu_m.mcpu_tlb_info;
+		while (tlb_info == TLB_CPU_HALTED) {
+			(void) CAS_TLB_INFO(cpup, TLB_CPU_HALTED,
+			    TLB_CPU_HALTED | TLB_INVAL_ALL);
+			SMT_PAUSE();
+			tlb_info = cpup->cpu_m.mcpu_tlb_info;
+		}
+		if (tlb_info == (TLB_CPU_HALTED | TLB_INVAL_ALL)) {
+			HATSTAT_INC(hs_tlb_inval_delayed);
+			CPUSET_DEL(cpus_to_shootdown, c);
+		}
+	}
+
 	if (CPUSET_ISNULL(cpus_to_shootdown) ||
 	    CPUSET_ISEQUAL(cpus_to_shootdown, justme)) {
@@ -2066,7 +2146,6 @@ hat_unload_callback(
 	uint_t		r_cnt = 0;
 	x86pte_t	old_pte;
 
-	HATIN(hat_unload_callback, hat, addr, len);
 	ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
 	ASSERT(IS_PAGEALIGNED(vaddr));
 	ASSERT(IS_PAGEALIGNED(eaddr));
@@ -2128,8 +2207,6 @@ hat_unload_callback(
 	 */
 	if (r_cnt > 0)
 		handle_ranges(cb, r_cnt, r);
-
-	HATOUT(hat_unload_callback, hat, addr);
 }
 
 /*
@@ -2598,7 +2675,6 @@ hat_share(
 	 * valid mappings. That's because it rounds the segment size up to a
 	 * large pagesize, even if the actual memory mapped by ism_hat is less.
 	 */
-	HATIN(hat_share, hat, addr, len);
 	ASSERT(IS_PAGEALIGNED(vaddr_start));
 	ASSERT(IS_PAGEALIGNED(ism_addr_start));
 	ASSERT(ism_hat->hat_flags & HAT_SHARED);
@@ -2714,8 +2790,6 @@ hat_share(
 	}
 	if (ism_ht != NULL)
 		htable_release(ism_ht);
-
-	HATOUT(hat_share, hat, addr);
 	return (0);
 }
 
@@ -2736,7 +2810,6 @@ hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc)
 
 	ASSERT(hat != kas.a_hat);
 	ASSERT(eaddr <= _userlimit);
-	HATIN(hat_unshare, hat, addr, len);
 	ASSERT(IS_PAGEALIGNED(vaddr));
 	ASSERT(IS_PAGEALIGNED(eaddr));
@@ -2781,8 +2854,6 @@ hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc)
 	 * couldn't share pagetables.
 	 */
 	hat_unload(hat, addr, len, HAT_UNLOAD_UNMAP);
-
-	HATOUT(hat_unshare, hat, addr);
 }
diff --git a/usr/src/uts/i86pc/vm/hat_i86.h b/usr/src/uts/i86pc/vm/hat_i86.h
index d127e6ffef..aec1bf8887 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.h
+++ b/usr/src/uts/i86pc/vm/hat_i86.h
@@ -79,7 +79,6 @@ extern "C" {
 */
 struct hat {
 	kmutex_t	hat_mutex;
-	kmutex_t	hat_switch_mutex;
 	struct as	*hat_as;
 	uint_t		hat_stats;
 	pgcnt_t		hat_pages_mapped[MAX_PAGE_LEVEL + 1];
@@ -130,24 +129,29 @@ typedef struct hat hat_t;
 * debugger.
 */
 struct hatstats {
-	uint64_t	hs_reap_attempts;
-	uint64_t	hs_reaped;
-	uint64_t	hs_steals;
-	uint64_t	hs_ptable_allocs;
-	uint64_t	hs_ptable_frees;
-	uint64_t	hs_htable_rgets;	/* allocs from reserve */
-	uint64_t	hs_htable_rputs;	/* putbacks to reserve */
-	uint64_t	hs_htable_shared;	/* number of htables shared */
-	uint64_t	hs_htable_unshared;	/* number of htables unshared */
-	uint64_t	hs_hm_alloc;
-	uint64_t	hs_hm_free;
-	uint64_t	hs_hm_put_reserve;
-	uint64_t	hs_hm_get_reserve;
-	uint64_t	hs_hm_steals;
-	uint64_t	hs_hm_steal_exam;
+	ulong_t		hs_reap_attempts;
+	ulong_t		hs_reaped;
+	ulong_t		hs_steals;
+	ulong_t		hs_ptable_allocs;
+	ulong_t		hs_ptable_frees;
+	ulong_t		hs_htable_rgets;	/* allocs from reserve */
+	ulong_t		hs_htable_rputs;	/* putbacks to reserve */
+	ulong_t		hs_htable_shared;	/* number of htables shared */
+	ulong_t		hs_htable_unshared;	/* number of htables unshared */
+	ulong_t		hs_hm_alloc;
+	ulong_t		hs_hm_free;
+	ulong_t		hs_hm_put_reserve;
+	ulong_t		hs_hm_get_reserve;
+	ulong_t		hs_hm_steals;
+	ulong_t		hs_hm_steal_exam;
+	ulong_t		hs_tlb_inval_delayed;
 };
 extern struct hatstats hatstat;
 
-#define	HATSTAT_INC(x)	(atomic_add_64(&hatstat.x, 1))
+#ifdef DEBUG
+#define	HATSTAT_INC(x)	(++hatstat.x)
+#else
+#define	HATSTAT_INC(x)	(0)
+#endif
 
 #if defined(_KERNEL)
@@ -225,6 +229,13 @@ extern uintptr_t hat_kernelbase(uintptr_t);
 extern void hat_kmap_init(uintptr_t base, size_t len);
 extern hment_t *hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry);
 
+
+/*
+ * routines to deal with delayed TLB invalidations for idle CPUs
+ */
+extern void tlb_going_idle(void);
+extern void tlb_service(void);
+
 /*
 * Hat switch function invoked to load a new context into %cr3
 */
diff --git a/usr/src/uts/i86pc/vm/htable.c b/usr/src/uts/i86pc/vm/htable.c
index 00c197348a..a4c4585c5f 100644
--- a/usr/src/uts/i86pc/vm/htable.c
+++ b/usr/src/uts/i86pc/vm/htable.c
@@ -1857,34 +1857,6 @@ x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
 }
 
 /*
- * data structure for cross call information
- */
-typedef struct xcall_inval {
-	caddr_t		xi_addr;
-	x86pte_t	xi_found;
-	x86pte_t	xi_oldpte;
-	x86pte_t	*xi_pteptr;
-	processorid_t	xi_initiator;
-} xcall_inval_t;
-
-/*
- * Cross call service routine to invalidate TLBs. On the
- * initiating CPU, this first clears the PTE in memory.
- */
-/*ARGSUSED*/
-static int
-x86pte_inval_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
-{
-	xcall_inval_t *xi = (xcall_inval_t *)a1;
-
-	if (CPU->cpu_id == xi->xi_initiator)
-		xi->xi_found = CAS_PTE(xi->xi_pteptr, xi->xi_oldpte, 0);
-
-	mmu_tlbflush_entry(xi->xi_addr);
-	return (0);
-}
-
-/*
 * Invalidate a page table entry as long as it currently maps something that
 * matches the value determined by expect.
 *
@@ -1897,10 +1869,9 @@ x86pte_inval(
     x86pte_t expect,
     x86pte_t *pte_ptr)
 {
-	hat_t		*hat = ht->ht_hat;
 	x86pte_t	*ptep;
-	xcall_inval_t	xi;
-	cpuset_t	cpus;
+	x86pte_t	oldpte;
+	x86pte_t	found;
 
 	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
 	ASSERT(ht->ht_level != VLP_LEVEL);
@@ -1909,48 +1880,24 @@ x86pte_inval(
 		ptep = pte_ptr;
 	else
 		ptep = x86pte_access_pagetable(ht, entry);
-	xi.xi_pteptr = ptep;
-	xi.xi_addr = (caddr_t)htable_e2va(ht, entry);
 
 	/*
-	 * Setup a cross call to any CPUs using this HAT
-	 */
-	kpreempt_disable();
-	xi.xi_initiator = CPU->cpu_id;
-	CPUSET_ZERO(cpus);
-	if (hat == kas.a_hat) {
-		CPUSET_OR(cpus, khat_cpuset);
-	} else {
-		mutex_enter(&hat->hat_switch_mutex);
-		CPUSET_OR(cpus, hat->hat_cpus);
-		CPUSET_ADD(cpus, CPU->cpu_id);
-	}
-
-	/*
-	 * Do the cross call to invalidate the PTE and flush TLBs.
 	 * Note that the loop is needed to handle changes due to h/w updating
 	 * of PT_MOD/PT_REF.
 	 */
 	do {
-		xi.xi_oldpte = GET_PTE(ptep);
-		if (expect != 0 &&
-		    (xi.xi_oldpte & PT_PADDR) != (expect & PT_PADDR))
-			break;
-		if (panicstr == NULL)
-			xc_wait_sync((xc_arg_t)&xi, NULL, NULL, X_CALL_HIPRI,
-			    cpus, x86pte_inval_func);
-		else
-			(void) x86pte_inval_func((xc_arg_t)&xi, NULL, NULL);
-	} while (xi.xi_found != xi.xi_oldpte);
-
-	if (hat != kas.a_hat)
-		mutex_exit(&hat->hat_switch_mutex);
-	kpreempt_enable();
+		oldpte = GET_PTE(ptep);
+		if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
+			goto done;
+		found = CAS_PTE(ptep, oldpte, 0);
+	} while (found != oldpte);
+	if (oldpte & (PT_REF | PT_MOD))
+		hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
 
+done:
 	if (pte_ptr == NULL)
 		x86pte_release_pagetable(ht);
-
-	return (xi.xi_oldpte);
+	return (oldpte);
 }
 
 /*
```
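The htable.c half of the change follows the same philosophy. x86pte_inval() no longer gathers a CPU set and runs a synchronized cross call; it clears the PTE locally with CAS_PTE in a retry loop (the loop is needed because the MMU can set PT_REF/PT_MOD between the read and the CAS) and issues hat_tlb_inval() only when the old PTE had PT_REF or PT_MOD set, the reasoning being that the processor marks a PTE referenced before caching its translation, so an unreferenced PTE cannot be in any TLB. Here is a small user-space sketch of that retry loop, with a thread standing in for the hardware page walker; the PT_REF/PT_MOD values match x86's accessed/dirty bits, but everything else (names, the fake PTE value, the prints) is scaffolding, not the kernel code.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative bit values; x86 uses bit 5 (accessed) and bit 6 (dirty). */
#define PT_REF 0x20ul
#define PT_MOD 0x40ul

static _Atomic unsigned long pte = 0x12345000ul | PT_REF;
static atomic_bool stop;

/*
 * Stand-in for the hardware page walker: keeps setting REF/MOD on the
 * live PTE, racing with the invalidation loop below.
 */
static void *walker(void *unused)
{
	(void)unused;
	while (!atomic_load(&stop)) {
		unsigned long v = atomic_load(&pte);
		if (v != 0)
			(void) atomic_compare_exchange_weak(&pte, &v,
			    v | PT_REF | PT_MOD);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long oldpte, found;

	pthread_create(&t, NULL, walker, NULL);

	/*
	 * The x86pte_inval()-style loop: re-read and retry until the CAS
	 * to zero observes exactly the value just read, so the final
	 * oldpte captures every bit the hardware managed to set.
	 */
	do {
		oldpte = atomic_load(&pte);
		found = oldpte;
		atomic_compare_exchange_strong(&pte, &found, 0ul);
	} while (found != oldpte);

	atomic_store(&stop, true);
	pthread_join(t, NULL);

	/* Shootdown is only needed if the mapping was ever used. */
	if (oldpte & (PT_REF | PT_MOD))
		puts("hat_tlb_inval(hat, va)");
	return 0;
}
```

The net effect across the whole changeset: the common unmap path drops from a synchronized rendezvous of every CPU using the hat to, at worst, one ordinary cross call, and to no interrupt at all for halted CPUs or never-referenced PTEs.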