Diffstat (limited to 'usr/src/uts/common')
-rw-r--r--   usr/src/uts/common/disp/thread.c   |  78
-rw-r--r--   usr/src/uts/common/os/cpu.c        |   8
-rw-r--r--   usr/src/uts/common/os/lwp.c        |  23
-rw-r--r--   usr/src/uts/common/os/mutex.c      | 282
-rw-r--r--   usr/src/uts/common/sys/mutex.h     |  13
5 files changed, 252 insertions, 152 deletions
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index ee2d80834d..928b594602 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -842,6 +842,21 @@ thread_zone_destroy(zoneid_t zoneid, void *unused)
     mutex_exit(&reaplock);
 
     /*
+     * Guard against race condition in mutex_owner_running:
+     *    thread=owner(mutex)
+     *    <interrupt>
+     *    thread exits mutex
+     *    thread exits
+     *    thread reaped
+     *    thread struct freed
+     * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
+     * A cross call to all cpus will cause the interrupt handler
+     * to reset the PC if it is in mutex_owner_running, refreshing
+     * stale thread pointers.
+     */
+    mutex_sync();   /* sync with mutex code */
+
+    /*
      * Reap threads
      */
     thread_reap_list(t);
@@ -874,6 +889,12 @@ thread_reaper()
             cv_wait(&reaper_cv, &reaplock);
             CALLB_CPR_SAFE_END(&cprinfo, &reaplock);
         }
+        /*
+         * mutex_sync() needs to be called when reaping, but
+         * not too often.  We limit reaping rate to once
+         * per second.  Reaplimit is max rate at which threads can
+         * be freed. Does not impact thread destruction/creation.
+         */
         t = thread_deathrow;
         l = lwp_deathrow;
         thread_deathrow = NULL;
@@ -883,6 +904,20 @@ thread_reaper()
         mutex_exit(&reaplock);
 
         /*
+         * Guard against race condition in mutex_owner_running:
+         *    thread=owner(mutex)
+         *    <interrupt>
+         *    thread exits mutex
+         *    thread exits
+         *    thread reaped
+         *    thread struct freed
+         * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
+         * A cross call to all cpus will cause the interrupt handler
+         * to reset the PC if it is in mutex_owner_running, refreshing
+         * stale thread pointers.
+         */
+        mutex_sync();   /* sync with mutex code */
+
         /*
          * Reap threads
          */
         thread_reap_list(t);
@@ -891,13 +926,32 @@ thread_reaper()
          * Reap lwps
          */
         thread_reap_list(l);
+        delay(hz);
     }
 }
 
 /*
+ * This is called by lwpcreate, etc.() to put a lwp_deathrow thread onto
+ * thread_deathrow. The thread's state is changed already TS_FREE to indicate
+ * that is reapable. The thread already holds the reaplock, and was already
+ * freed.
+ */
+void
+reapq_move_lq_to_tq(kthread_t *t)
+{
+    ASSERT(t->t_state == TS_FREE);
+    ASSERT(MUTEX_HELD(&reaplock));
+    t->t_forw = thread_deathrow;
+    thread_deathrow = t;
+    thread_reapcnt++;
+    if (lwp_reapcnt + thread_reapcnt > reaplimit)
+        cv_signal(&reaper_cv);  /* wake the reaper */
+}
+
+/*
  * This is called by resume() to put a zombie thread onto deathrow.
  * The thread's state is changed to TS_FREE to indicate that is reapable.
- * This is called from the idle thread so it must not block (just spin).
+ * This is called from the idle thread so it must not block - just spin.
  */
 void
 reapq_add(kthread_t *t)
@@ -1118,6 +1172,28 @@ freectx(kthread_t *t, int isexec)
 }
 
 /*
+ * freectx_ctx is called from lwp_create() when lwp is reused from
+ * lwp_deathrow and its thread structure is added to thread_deathrow.
+ * The thread structure to which this ctx was attached may be already
+ * freed by the thread reaper so free_op implementations shouldn't rely
+ * on thread structure to which this ctx was attached still being around.
+ */
+void
+freectx_ctx(struct ctxop *ctx)
+{
+    struct ctxop *nctx;
+
+    ASSERT(ctx != NULL);
+
+    do {
+        nctx = ctx->next;
+        if (ctx->free_op != NULL)
+            (ctx->free_op)(ctx->arg, 0);
+        kmem_free(ctx, sizeof (struct ctxop));
+    } while ((ctx = nctx) != NULL);
+}
+
+/*
  * Set the thread running; arrange for it to be swapped in if necessary.
  */
 void
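For illustration only, not part of the changeset: a minimal user-space sketch of the deathrow hand-off that reapq_move_lq_to_tq() performs above, pushing a record onto a singly linked list under a lock and waking a reaper once the backlog passes a limit. The names here (node_t, deathrow_push, the limit of 4) are stand-ins, not kernel identifiers, and no reaper thread actually runs in this demo.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct node {
    struct node *forw;          /* plays the role of t_forw */
    int id;
} node_t;

static node_t *deathrow;        /* plays the role of thread_deathrow */
static int reapcnt;             /* plays the role of thread_reapcnt */
static const int reaplimit = 4; /* assumed small limit for the demo */
static pthread_mutex_t reaplock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reaper_cv = PTHREAD_COND_INITIALIZER;

static void
deathrow_push(node_t *n)
{
    pthread_mutex_lock(&reaplock);
    n->forw = deathrow;         /* O(1) head insert, like t_forw chaining */
    deathrow = n;
    if (++reapcnt > reaplimit)
        pthread_cond_signal(&reaper_cv);    /* wake the reaper */
    pthread_mutex_unlock(&reaplock);
}

int
main(void)
{
    for (int i = 0; i < 6; i++) {
        node_t *n = malloc(sizeof (*n));
        n->id = i;
        deathrow_push(n);
    }
    for (node_t *n = deathrow; n != NULL; n = n->forw)
        printf("queued node %d\n", n->id);
    return (0);
}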
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 13cf752b45..92286f7163 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -58,7 +58,7 @@
 #include <sys/msacct.h>
 #include <sys/time.h>
 #include <sys/archsystm.h>
-#if defined(__x86)
+#if defined(__x86) || defined(__amd64)
 #include <sys/x86_archext.h>
 #endif
 
@@ -728,6 +728,11 @@ weakbinding_start(void)
     weakbindingbarrier = 0;
 }
 
+void
+null_xcall(void)
+{
+}
+
 /*
  * This routine is called to place the CPUs in a safe place so that
  * one of them can be taken off line or placed on line.  What we are
@@ -2797,6 +2802,7 @@ cpu_destroy_bound_threads(cpu_t *cp)
 
     mutex_exit(&pidlock);
 
+    mutex_sync();
     for (t = tlist; t != NULL; t = tnext) {
         tnext = t->t_next;
         thread_free(t);
diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c
index a925f979a4..a9f1aa2588 100644
--- a/usr/src/uts/common/os/lwp.c
+++ b/usr/src/uts/common/os/lwp.c
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -61,6 +61,8 @@
 #include <sys/brand.h>
 
 void *segkp_lwp;        /* cookie for pool of segkp resources */
+extern void reapq_move_lq_to_tq(kthread_t *);
+extern void freectx_ctx(struct ctxop *);
 
 /*
  * Create a thread that appears to be stopped at sys_rtt.
@@ -88,6 +90,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
     int i;
     int rctlfail = 0;
     boolean_t branded = 0;
+    struct ctxop *ctx = NULL;
 
     mutex_enter(&p->p_lock);
     mutex_enter(&p->p_zone->zone_nlwps_lock);
@@ -136,14 +139,18 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p,
             lwp_reapcnt--;
             lwpdata = t->t_swap;
             lwp = t->t_lwp;
-        }
-        mutex_exit(&reaplock);
-        if (t) {
+            ctx = t->t_ctx;
             t->t_swap = NULL;
-            lwp_stk_fini(t->t_lwp);
             t->t_lwp = NULL;
-            t->t_forw = NULL;
-            thread_free(t);
+            t->t_ctx = NULL;
+            reapq_move_lq_to_tq(t);
+        }
+        mutex_exit(&reaplock);
+        if (lwp != NULL) {
+            lwp_stk_fini(lwp);
+        }
+        if (ctx != NULL) {
+            freectx_ctx(ctx);
         }
     }
     if (lwpdata == NULL &&
@@ -250,7 +257,7 @@ grow:
             ldp->ld_next = ldp + 1;
         new_hashsz = (new_dirsz + 2) / 2;
         new_hash = kmem_zalloc(new_hashsz * sizeof (lwpdir_t *),
-            KM_SLEEP);
+            KM_SLEEP);
 
         mutex_enter(&p->p_lock);
         if (p == curproc)
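For illustration only: a user-space sketch of the list walk that freectx_ctx() does for a reused lwp, freeing a chain of context-op records and invoking an optional free callback for each one before releasing the record itself. ctxop_t, free_ctx_chain and the callback names below are stand-ins, not the kernel's struct ctxop interface.

#include <stdio.h>
#include <stdlib.h>

typedef struct ctxop {
    struct ctxop *next;
    void (*free_op)(void *, int);
    void *arg;
} ctxop_t;

static void
free_ctx_chain(ctxop_t *ctx)        /* caller guarantees ctx != NULL */
{
    ctxop_t *nctx;

    do {
        nctx = ctx->next;           /* capture next before freeing */
        if (ctx->free_op != NULL)
            ctx->free_op(ctx->arg, 0);
        free(ctx);
    } while ((ctx = nctx) != NULL);
}

static void
report(void *arg, int isexec)
{
    (void) isexec;
    printf("freeing ctx %s\n", (const char *)arg);
}

int
main(void)
{
    ctxop_t *head = NULL;
    const char *names[] = { "fpu", "dtrace", "brand" };

    for (int i = 0; i < 3; i++) {
        ctxop_t *c = calloc(1, sizeof (*c));
        c->free_op = report;
        c->arg = (void *)names[i];
        c->next = head;
        head = c;
    }
    free_ctx_chain(head);
    return (0);
}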
diff --git a/usr/src/uts/common/os/mutex.c b/usr/src/uts/common/os/mutex.c
index e935436bf6..ab6df83ad1 100644
--- a/usr/src/uts/common/os/mutex.c
+++ b/usr/src/uts/common/os/mutex.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -102,7 +102,8 @@
  *
  *	set waiters bit
  *	membar #StoreLoad (via membar_enter())
- *	check CPU_THREAD for each CPU; abort if owner running
+ *	check CPU_THREAD for owner's t_cpu
+ *	continue if owner running
  *	membar #LoadLoad (via membar_consumer())
  *	check owner and waiters bit; abort if either changed
  *	block
@@ -133,7 +134,9 @@
  *
  * The only requirements of code outside the mutex implementation are
  * (1) mutex_exit() preemption fixup in interrupt handlers or trap return,
- * and (2) a membar #StoreLoad after setting CPU_THREAD in resume().
+ * (2) a membar #StoreLoad after setting CPU_THREAD in resume(),
+ * (3) mutex_owner_running() preemption fixup in interrupt handlers
+ * or trap returns.
  * Note: idle threads cannot grab adaptive locks (since they cannot block),
  * so the membar may be safely omitted when resuming an idle thread.
 *
@@ -199,27 +202,9 @@
  * much reduction in memory traffic, but reduces the potential idle time.
  * The theory of the exponential delay code is to start with a short
  * delay loop and double the waiting time on each iteration, up to
- * a preselected maximum.  The BACKOFF_BASE provides the equivalent
- * of 2 to 3 memory references delay for US-III+ and US-IV architectures.
- * The BACKOFF_CAP is the equivalent of 50 to 100 memory references of
- * time (less than 12 microseconds for a 1000 MHz system).
- *
- * To determine appropriate BACKOFF_BASE and BACKOFF_CAP values,
- * studies on US-III+ and US-IV systems using 1 to 66 threads were
- * done. A range of possible values were studied.
- * Performance differences below 10 threads were not large. For
- * systems with more threads, substantial increases in total lock
- * throughput was observed with the given values. For cases where
- * more than 20 threads were waiting on the same lock, lock throughput
- * increased by a factor of 5 or more using the backoff algorithm.
- *
- * Some platforms may provide their own platform specific delay code,
- * using plat_lock_delay(backoff). If it is available, plat_lock_delay
- * is executed instead of the default delay code.
+ * a preselected maximum.
  */
 
-#pragma weak plat_lock_delay
-
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/cpuvar.h>
@@ -236,9 +221,8 @@
 #include <sys/cpu.h>
 #include <sys/stack.h>
 #include <sys/archsystm.h>
-
-#define	BACKOFF_BASE	50
-#define	BACKOFF_CAP	1600
+#include <sys/machsystm.h>
+#include <sys/x_call.h>
 
 /*
  * The sobj_ops vector exports a set of functions needed when a thread
@@ -268,6 +252,89 @@ mutex_panic(char *msg, mutex_impl_t *lp)
         msg, lp, MUTEX_OWNER(&panic_mutex), curthread);
 }
 
+/* "tunables" for per-platform backoff constants. */
+uint_t mutex_backoff_cap = 0;
+ushort_t mutex_backoff_base = MUTEX_BACKOFF_BASE;
+ushort_t mutex_cap_factor = MUTEX_CAP_FACTOR;
+uchar_t mutex_backoff_shift = MUTEX_BACKOFF_SHIFT;
+
+void
+mutex_sync(void)
+{
+    MUTEX_SYNC();
+}
+
+/* calculate the backoff interval */
+static uint_t
+default_lock_backoff(uint_t backoff)
+{
+    uint_t cap;                 /* backoff cap calculated */
+
+    if (backoff == 0) {
+        backoff = mutex_backoff_base;
+        /* first call just sets the base */
+        return (backoff);
+    }
+
+    /* set cap */
+    if (mutex_backoff_cap == 0) {
+        /*
+         * For a contended lock, in the worst case a load + cas may
+         * be queued at the controller for each contending CPU.
+         * Therefore, to avoid queueing, the accesses for all CPUS must
+         * be spread out in time over an interval of (ncpu *
+         * cap-factor). Maximum backoff is set to this value, and
+         * actual backoff is a random number from 0 to the current max.
+         */
+        cap = ncpus_online * mutex_cap_factor;
+    } else {
+        cap = mutex_backoff_cap;
+    }
+
+    /* calculate new backoff value */
+    backoff <<= mutex_backoff_shift;    /* increase backoff */
+    if (backoff > cap) {
+        if (cap < mutex_backoff_base)
+            backoff = mutex_backoff_base;
+        else
+            backoff = cap;
+    }
+
+    return (backoff);
+}
+
+/*
+ * default delay function for mutexes.
+ */
+static void
+default_lock_delay(uint_t backoff)
+{
+    ulong_t rnd;                /* random factor */
+    uint_t cur_backoff;         /* calculated backoff */
+    uint_t backctr;
+
+    /*
+     * Modify backoff by a random amount to avoid lockstep, and to
+     * make it probable that some thread gets a small backoff, and
+     * re-checks quickly
+     */
+    rnd = (((long)curthread >> PTR24_LSB) ^ (long)MUTEX_GETTICK());
+    cur_backoff = (uint_t)(rnd % (backoff - mutex_backoff_base + 1)) +
+        mutex_backoff_base;
+
+    /*
+     * Delay before trying
+     * to touch the mutex data structure.
+     */
+    for (backctr = cur_backoff; backctr; backctr--) {
+        MUTEX_DELAY();
+    };
+}
+
+uint_t (*mutex_lock_backoff)(uint_t) = default_lock_backoff;
+void (*mutex_lock_delay)(uint_t) = default_lock_delay;
+void (*mutex_delay)(void) = mutex_delay_default;
+
 /*
  * mutex_vector_enter() is called from the assembly mutex_enter() routine
  * if the lock is held or is not of type MUTEX_ADAPTIVE.
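For illustration only: a stand-alone sketch of the exponential backoff calculation added above. Start at a base, shift left on each failed attempt, clamp at ncpus times a cap factor, then pick a random spin count between the base and the current maximum. The constants used here are made up for the demo; the real values come from MUTEX_BACKOFF_BASE, MUTEX_CAP_FACTOR and MUTEX_BACKOFF_SHIFT in the platform headers, and the kernel's random factor comes from the thread pointer and tick register rather than rand().

#include <stdio.h>
#include <stdlib.h>

static unsigned int backoff_base = 50;  /* assumed, not the real tunable */
static unsigned int cap_factor = 64;    /* assumed */
static unsigned int backoff_shift = 1;  /* assumed: double each time */
static unsigned int ncpus = 32;

static unsigned int
lock_backoff(unsigned int backoff)
{
    unsigned int cap = ncpus * cap_factor;

    if (backoff == 0)
        return (backoff_base);          /* first call sets the base */

    backoff <<= backoff_shift;          /* exponential growth */
    if (backoff > cap)
        backoff = (cap < backoff_base) ? backoff_base : cap;
    return (backoff);
}

static unsigned int
lock_delay(unsigned int backoff)
{
    /* random spin count in [base, backoff], as default_lock_delay does */
    return (rand() % (backoff - backoff_base + 1) + backoff_base);
}

int
main(void)
{
    unsigned int b = lock_backoff(0);

    for (int attempt = 1; attempt <= 8; attempt++) {
        printf("attempt %d: max backoff %u, this spin %u\n",
            attempt, b, lock_delay(b));
        b = lock_backoff(b);            /* grow only after a failed try */
    }
    return (0);
}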
@@ -276,15 +343,15 @@
 void
 mutex_vector_enter(mutex_impl_t *lp)
 {
     kthread_id_t owner;
+    kthread_id_t lastowner = MUTEX_NO_OWNER; /* track owner changes */
     hrtime_t sleep_time = 0;    /* how long we slept */
     uint_t spin_count = 0;      /* how many times we spun */
-    cpu_t *cpup, *last_cpu;
-    extern cpu_t *cpu_list;
+    cpu_t *cpup;
     turnstile_t *ts;
     volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
-    int backoff;    /* current backoff */
-    int backctr;    /* ctr for backoff */
+    uint_t backoff = 0; /* current backoff */
     int sleep_count = 0;
+    int changecnt = 0;  /* count of owner changes */
 
     ASSERT_STACK_ALIGNED();
@@ -314,42 +381,31 @@ mutex_vector_enter(mutex_impl_t *lp)
 
     CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);
 
-    if (&plat_lock_delay) {
-        backoff = 0;
-    } else {
-        backoff = BACKOFF_BASE;
-    }
-
+    backoff = mutex_lock_backoff(0);    /* set base backoff */
     for (;;) {
-spin:
         spin_count++;
-        /*
-         * Add an exponential backoff delay before trying again
-         * to touch the mutex data structure.
-         * the spin_count test and call to nulldev are to prevent
-         * the compiler optimizer from eliminating the delay loop.
-         */
-        if (&plat_lock_delay) {
-            plat_lock_delay(&backoff);
-        } else {
-            for (backctr = backoff; backctr; backctr--) {
-                if (!spin_count) (void) nulldev();
-            };  /* delay */
-            backoff = backoff << 1; /* double it */
-            if (backoff > BACKOFF_CAP) {
-                backoff = BACKOFF_CAP;
-            }
-
-            SMT_PAUSE();
-        }
+        mutex_lock_delay(backoff); /* backoff delay */
 
         if (panicstr)
             return;
 
         if ((owner = MUTEX_OWNER(vlp)) == NULL) {
-            if (mutex_adaptive_tryenter(lp))
+            if (mutex_adaptive_tryenter(lp)) {
                 break;
+            }
+            /* increase backoff only on failed attempt. */
+            backoff = mutex_lock_backoff(backoff);
+            changecnt++;
             continue;
+        } else if (lastowner != owner) {
+            lastowner = owner;
+            backoff = mutex_lock_backoff(backoff);
+            changecnt++;
+        }
+
+        if (changecnt >= ncpus_online) {
+            backoff = mutex_lock_backoff(0);
+            changecnt = 0;
         }
 
         if (owner == curthread)
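For illustration only: a sketch of the backoff bookkeeping mutex_vector_enter() now keeps while spinning. Remember the last observed owner, grow the backoff when the acquire fails or the owner changes, and restart from the base once ownership has changed as many times as there are CPUs. The "observed owner" sequence below is synthetic and the doubling backoff is simplified; in the kernel the owner comes from MUTEX_OWNER(vlp) and the backoff from mutex_lock_backoff().

#include <stdio.h>

#define NCPUS 4

static unsigned int
lock_backoff(unsigned int backoff)      /* simplified doubling, capped */
{
    if (backoff == 0)
        return (50);
    backoff <<= 1;
    return (backoff > 1600 ? 1600 : backoff);
}

int
main(void)
{
    /* synthetic owner ids seen on successive spins (0 = lock free) */
    int seen[] = { 7, 7, 9, 9, 3, 5, 7, 0 };
    int lastowner = -1;                 /* MUTEX_NO_OWNER stand-in */
    int changecnt = 0;
    unsigned int backoff = lock_backoff(0);

    for (unsigned int i = 0; i < sizeof (seen) / sizeof (seen[0]); i++) {
        int owner = seen[i];

        if (owner == 0) {
            printf("lock free: try to grab it\n");
            break;
        }
        if (owner != lastowner) {       /* new owner: lock is moving */
            lastowner = owner;
            backoff = lock_backoff(backoff);
            changecnt++;
        }
        if (changecnt >= NCPUS) {       /* heavy churn: start over */
            backoff = lock_backoff(0);
            changecnt = 0;
        }
        printf("owner %d running, spin with backoff %u\n", owner, backoff);
    }
    return (0);
}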
@@ -362,26 +418,9 @@ spin:
         if (owner == MUTEX_NO_OWNER)
             continue;
 
-        /*
-         * When searching the other CPUs, start with the one where
-         * we last saw the owner thread.  If owner is running, spin.
-         *
-         * We must disable preemption at this point to guarantee
-         * that the list doesn't change while we traverse it
-         * without the cpu_lock mutex.  While preemption is
-         * disabled, we must revalidate our cached cpu pointer.
-         */
-        kpreempt_disable();
-        if (cpup->cpu_next == NULL)
-            cpup = cpu_list;
-        last_cpu = cpup;    /* mark end of search */
-        do {
-            if (cpup->cpu_thread == owner) {
-                kpreempt_enable();
-                goto spin;
-            }
-        } while ((cpup = cpup->cpu_next) != last_cpu);
-        kpreempt_enable();
+        if (mutex_owner_running(lp) != NULL) {
+            continue;
+        }
 
         /*
          * The owner appears not to be running, so block.
@@ -394,19 +433,11 @@ spin:
         /*
         * Recheck whether owner is running after waiters bit hits
         * global visibility (above).  If owner is running, spin.
-        *
-        * Since we are at ipl DISP_LEVEL, kernel preemption is
-        * disabled, however we still need to revalidate our cached
-        * cpu pointer to make sure the cpu hasn't been deleted.
         */
-        if (cpup->cpu_next == NULL)
-            last_cpu = cpup = cpu_list;
-        do {
-            if (cpup->cpu_thread == owner) {
-                turnstile_exit(lp);
-                goto spin;
-            }
-        } while ((cpup = cpup->cpu_next) != last_cpu);
+        if (mutex_owner_running(lp) != NULL) {
+            turnstile_exit(lp);
+            continue;
+        }
         membar_consumer();
 
         /*
@@ -418,6 +449,8 @@ spin:
                 &mutex_sobj_ops, NULL, NULL);
             sleep_time += gethrtime();
             sleep_count++;
+            /* reset backoff after turnstile */
+            backoff = mutex_lock_backoff(0);
         } else {
             turnstile_exit(lp);
         }
@@ -436,9 +469,10 @@ spin:
     /*
      * We do not count a sleep as a spin.
      */
-    if (spin_count > sleep_count)
+    if (spin_count > sleep_count) {
         LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp,
             spin_count - sleep_count);
+    }
 
     LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
 }
@@ -585,8 +619,8 @@ void
 lock_set_spin(lock_t *lp)
 {
     int spin_count = 1;
-    int backoff;    /* current backoff */
-    int backctr;    /* ctr for backoff */
+    int loop_count = 0;
+    uint_t backoff = 0; /* current backoff */
 
     if (panicstr)
         return;
@@ -594,36 +628,19 @@ lock_set_spin(lock_t *lp)
     if (ncpus == 1)
         panic("lock_set: %p lock held and only one CPU", lp);
 
-    if (&plat_lock_delay) {
-        backoff = 0;
-    } else {
-        backoff = BACKOFF_BASE;
-    }
-
     while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
         if (panicstr)
             return;
         spin_count++;
-        /*
-         * Add an exponential backoff delay before trying again
-         * to touch the mutex data structure.
-         * the spin_count test and call to nulldev are to prevent
-         * the compiler optimizer from eliminating the delay loop.
-         */
-        if (&plat_lock_delay) {
-            plat_lock_delay(&backoff);
-        } else {
-            /* delay */
-            for (backctr = backoff; backctr; backctr--) {
-                if (!spin_count) (void) nulldev();
-            }
+        loop_count++;
 
-            backoff = backoff << 1; /* double it */
-            if (backoff > BACKOFF_CAP) {
-                backoff = BACKOFF_CAP;
-            }
-            SMT_PAUSE();
+        if (ncpus_online == loop_count) {
+            backoff = mutex_lock_backoff(0);
+            loop_count = 0;
+        } else {
+            backoff = mutex_lock_backoff(backoff);
         }
+        mutex_lock_delay(backoff);
     }
 
     if (spin_count) {
@@ -637,8 +654,8 @@ void
 lock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr, int old_pil)
 {
     int spin_count = 1;
-    int backoff;    /* current backoff */
-    int backctr;    /* ctr for backoff */
+    int loop_count = 0;
+    uint_t backoff = 0; /* current backoff */
 
     if (panicstr)
         return;
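For illustration only: a conceptual user-space rendering of the check that mutex_owner_running() performs in place of the old CPU-list walk above. Read the lock's owner, then see whether the owner's last-known CPU is still running that thread. The structures here are stand-ins; per the comments in the thread.c and cpu.c hunks, the real routine sits in a known code range so a cross call (mutex_sync()) can make interrupt handlers restart it before stale thread pointers are dereferenced.

#include <stdio.h>

struct xthread;

struct xcpu {
    struct xthread *cpu_thread; /* thread currently on this CPU */
};

struct xthread {
    struct xcpu *t_cpu;         /* CPU the thread last ran on */
};

struct xlock {
    struct xthread *owner;      /* NULL when the lock is free */
};

/* return the owner if it is still on CPU, else NULL (caller blocks) */
static struct xthread *
owner_running(struct xlock *lp)
{
    struct xthread *owner = lp->owner;

    if (owner == NULL)
        return (NULL);
    return (owner->t_cpu->cpu_thread == owner ? owner : NULL);
}

int
main(void)
{
    struct xcpu cpu0;
    struct xthread t1 = { &cpu0 };
    struct xlock lock = { &t1 };

    cpu0.cpu_thread = &t1;      /* owner is on CPU: keep spinning */
    printf("owner running: %s\n", owner_running(&lock) ? "yes" : "no");

    cpu0.cpu_thread = NULL;     /* owner switched out: block instead */
    printf("owner running: %s\n", owner_running(&lock) ? "yes" : "no");
    return (0);
}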
@@ -648,38 +665,23 @@ lock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr, int old_pil)
 
     ASSERT(new_pil > LOCK_LEVEL);
 
-    if (&plat_lock_delay) {
-        backoff = 0;
-    } else {
-        backoff = BACKOFF_BASE;
-    }
     do {
         splx(old_pil);
         while (LOCK_HELD(lp)) {
+            spin_count++;
+            loop_count++;
+
             if (panicstr) {
                 *old_pil_addr = (ushort_t)splr(new_pil);
                 return;
             }
-            spin_count++;
-            /*
-             * Add an exponential backoff delay before trying again
-             * to touch the mutex data structure.
-             * spin_count test and call to nulldev are to prevent
-             * compiler optimizer from eliminating the delay loop.
-             */
-            if (&plat_lock_delay) {
-                plat_lock_delay(&backoff);
+            if (ncpus_online == loop_count) {
+                backoff = mutex_lock_backoff(0);
+                loop_count = 0;
             } else {
-                for (backctr = backoff; backctr; backctr--) {
-                    if (!spin_count) (void) nulldev();
-                }
-                backoff = backoff << 1; /* double it */
-                if (backoff > BACKOFF_CAP) {
-                    backoff = BACKOFF_CAP;
-                }
-
-                SMT_PAUSE();
+                backoff = mutex_lock_backoff(backoff);
             }
+            mutex_lock_delay(backoff);
         }
         old_pil = splr(new_pil);
     } while (!lock_spin_try(lp));
diff --git a/usr/src/uts/common/sys/mutex.h b/usr/src/uts/common/sys/mutex.h
index 60e81e88f8..53d1e28e15 100644
--- a/usr/src/uts/common/sys/mutex.h
+++ b/usr/src/uts/common/sys/mutex.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -83,7 +83,16 @@
 extern	int mutex_tryenter(kmutex_t *);
 extern	void mutex_exit(kmutex_t *);
 extern	int mutex_owned(kmutex_t *);
 extern	struct _kthread *mutex_owner(kmutex_t *);
-extern	void plat_lock_delay(int *);
+
+extern	ushort_t mutex_backoff_base;
+extern	uint_t mutex_backoff_cap;
+extern	ushort_t mutex_cap_factor;
+extern	uchar_t mutex_backoff_shift;
+extern	void (*mutex_lock_delay)(uint_t);
+extern	uint_t (*mutex_lock_backoff)(uint_t);
+extern	void (*mutex_delay)(void);
+extern	void mutex_delay_default(void);
+extern	void mutex_sync(void);
 
 #endif	/* _KERNEL */
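For illustration only: a user-space sketch of the hook pattern the revised mutex.h interface exposes. Instead of a weak plat_lock_delay(), the backoff and delay routines are reached through function pointers (mutex_lock_backoff, mutex_lock_delay) that a platform could repoint at its own implementations during startup. Everything below uses made-up names and constants; it is an analogue of the pattern, not the kernel interface itself.

#include <stdio.h>

static unsigned int
default_backoff(unsigned int b)
{
    return (b == 0 ? 50 : b * 2);       /* simple doubling default */
}

static void
default_delay(unsigned int b)
{
    for (volatile unsigned int i = 0; i < b; i++)
        ;                               /* burn roughly b iterations */
}

/* hook points, analogous to mutex_lock_backoff/mutex_lock_delay */
static unsigned int (*lock_backoff)(unsigned int) = default_backoff;
static void (*lock_delay)(unsigned int) = default_delay;

static unsigned int
plat_backoff(unsigned int b)
{
    return (b == 0 ? 100 : b * 4);      /* a platform's steeper curve */
}

int
main(void)
{
    unsigned int b = lock_backoff(0);

    lock_delay(b);
    printf("default backoff after one failure: %u\n", lock_backoff(b));

    lock_backoff = plat_backoff;        /* "platform" overrides the hook */
    b = lock_backoff(0);
    printf("platform backoff after one failure: %u\n", lock_backoff(b));
    return (0);
}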