author    | mv143129 <none@none> | 2008-01-07 18:49:45 -0800
committer | mv143129 <none@none> | 2008-01-07 18:49:45 -0800
commit    | 2850d85b7b93f31e578520dc3b3feb24db609c62 (patch)
tree      | 2a1994e9e8b17b58b251e608de8c1a449291ba8f
parent    | ca29f3da510ba7f712a40977b04aeceda9b70b95 (diff)
download  | illumos-gate-2850d85b7b93f31e578520dc3b3feb24db609c62.tar.gz
6619224 Tick accounting needs to be made scalable
-rw-r--r-- | usr/src/uts/common/Makefile.files | 3
-rw-r--r-- | usr/src/uts/common/conf/param.c | 4
-rw-r--r-- | usr/src/uts/common/disp/thread.c | 50
-rw-r--r-- | usr/src/uts/common/os/clock.c | 264
-rw-r--r-- | usr/src/uts/common/os/clock_tick.c | 699
-rw-r--r-- | usr/src/uts/common/os/cpu.c | 9
-rw-r--r-- | usr/src/uts/common/os/exit.c | 13
-rw-r--r-- | usr/src/uts/common/os/main.c | 7
-rw-r--r-- | usr/src/uts/common/os/task.c | 41
-rw-r--r-- | usr/src/uts/common/sys/clock_tick.h | 118
-rw-r--r-- | usr/src/uts/common/sys/cpuvar.h | 3
-rw-r--r-- | usr/src/uts/common/sys/proc.h | 3
-rw-r--r-- | usr/src/uts/common/sys/task.h | 5
-rw-r--r-- | usr/src/uts/common/sys/thread.h | 18
-rw-r--r-- | usr/src/uts/intel/ia32/ml/lock_prim.s | 10
-rw-r--r-- | usr/src/uts/sun4/os/intr.c | 36
-rw-r--r-- | usr/src/uts/sun4/sys/ivintr.h | 5
-rw-r--r-- | usr/src/uts/sun4u/opl/os/opl.c | 12
-rw-r--r-- | usr/src/uts/sun4v/os/mach_startup.c | 12
19 files changed, 1087 insertions, 225 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 8adce7c1b1..d284271f12 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -67,7 +67,8 @@ COMMON_CORE_OBJS += \ thread_intr.o \ vm_page.o \ vm_pagelist.o \ - zlib_obj.o + zlib_obj.o \ + clock_tick.o CORE_OBJS += $(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS) diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index c35d3e7012..53a4b91775 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -194,6 +194,7 @@ extern void clock_highres_init(void); extern void pg_init(void); extern void pg_cmt_class_init(void); extern void pg_cpu0_init(void); +extern void clock_tick_mp_init(void); void (*init_tbl[])(void) = { system_taskq_init, @@ -238,6 +239,7 @@ void (*mp_init_tbl[])(void) = { #if defined(__sparc) siron_mp_init, #endif + clock_tick_mp_init, 0 }; diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index 95e21ea1bf..ee2d80834d 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -89,11 +89,13 @@ static kcondvar_t reaper_cv; /* synchronization var */ kthread_t *thread_deathrow; /* circular list of reapable threads */ kthread_t *lwp_deathrow; /* circular list of reapable threads */ kmutex_t reaplock; /* protects lwp and thread deathrows */ -kmutex_t thread_free_lock; /* protects clock from reaper */ int thread_reapcnt = 0; /* number of threads on deathrow */ int lwp_reapcnt = 0; /* number of lwps on deathrow */ int reaplimit = 16; /* delay reaping until reaplimit */ +thread_free_lock_t *thread_free_lock; + /* protects tick thread from reaper */ + extern int nthread; id_t syscid; /* system scheduling class ID */ @@ -152,8 +154,16 @@ thread_init(void) extern char sys_name[]; extern void idle(); struct cpu *cpu = CPU; + int i; + kmutex_t *lp; mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL)); + thread_free_lock = + kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP); + for (i = 0; i < THREAD_FREE_NUM; i++) { + lp = &thread_free_lock[i].tf_lock; + mutex_init(lp, NULL, MUTEX_DEFAULT, NULL); + } #if defined(__i386) || defined(__amd64) thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t), @@ -663,6 +673,34 @@ thread_join(kt_did_t tid) } void +thread_free_prevent(kthread_t *t) +{ + kmutex_t *lp; + + lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock; + mutex_enter(lp); +} + +void +thread_free_allow(kthread_t *t) +{ + kmutex_t *lp; + + lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock; + mutex_exit(lp); +} + +static void +thread_free_barrier(kthread_t *t) +{ + kmutex_t *lp; + + lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock; + mutex_enter(lp); + mutex_exit(lp); +} + +void thread_free(kthread_t *t) { ASSERT(t != &t0 && t->t_state == TS_FREE); @@ -714,11 +752,11 @@ thread_free(kthread_t *t) free_afd(&t->t_activefd); /* - * Barrier for clock thread. The clock holds this lock to - * keep the thread from going away while it's looking at it. + * Barrier for the tick accounting code. 
The tick accounting code + * holds this lock to keep the thread from going away while it's + * looking at it. */ - mutex_enter(&thread_free_lock); - mutex_exit(&thread_free_lock); + thread_free_barrier(t); ASSERT(ttoproj(t) == proj0p); project_rele(ttoproj(t)); diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c index 28d5eece05..f73f758bbf 100644 --- a/usr/src/uts/common/os/clock.c +++ b/usr/src/uts/common/os/clock.c @@ -23,7 +23,7 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -253,6 +253,8 @@ cyclic_id_t clock_cyclic; /* clock()'s cyclic_id */ cyclic_id_t deadman_cyclic; /* deadman()'s cyclic_id */ cyclic_id_t ddi_timer_cyclic; /* cyclic_timer()'s cyclic_id */ +extern void clock_tick_schedule(int); + static int lgrp_ticks; /* counter to schedule lgrp load calcs */ /* @@ -306,7 +308,6 @@ static int adj_hist_entry; int64_t clock_adj_hist[CLOCK_ADJ_HIST_SIZE]; -static void clock_tick(kthread_t *); static void calcloadavg(int, uint64_t *); static int genloadavg(struct loadavg_s *); static void loadavg_update(); @@ -314,17 +315,16 @@ static void loadavg_update(); void (*cmm_clock_callout)() = NULL; void (*cpucaps_clock_callout)() = NULL; +extern clock_t clock_tick_proc_max; + static void clock(void) { kthread_t *t; - kmutex_t *plockp; /* pointer to thread's process lock */ - int pinned_intr = 0; - uint_t nrunnable, nrunning; + uint_t nrunnable; uint_t w_io; cpu_t *cp; cpupart_t *cpupart; - int exiting; extern void set_anoninfo(); extern void set_freemem(); void (*funcp)(); @@ -379,22 +379,7 @@ clock(void) * every timer interrupt. * * Continue with the interrupt processing as scheduled. - * - * Did we pin another interrupt thread? Need to check this before - * grabbing any adaptive locks, since if we block on a lock the - * pinned thread could escape. Note that this is just a heuristic; - * if we take multiple laps though clock() without returning from - * the interrupt because we have another clock tick pending, then - * the pinned interrupt could be released by one of the previous - * laps. The only consequence is that the CPU will be counted as - * in idle (or wait) state once the pinned interrupt is released. - * Since this accounting is inaccurate by nature, this isn't a big - * deal --- but we should try to get it right in the common case - * where we only call clock() once per interrupt. */ - if (curthread->t_intr != NULL) - pinned_intr = (curthread->t_intr->t_flag & T_INTR_THREAD); - /* * Count the number of runnable threads and the number waiting * for some form of I/O to complete -- gets added to @@ -448,6 +433,10 @@ clock(void) if (one_sec) { cpupart->cp_nrunnable += cpu_nrunnable; /* + * Update user, system, and idle cpu times. + */ + cpupart->cp_nrunning++; + /* * w_io is used to update sysinfo.waiting during * one_second processing below. Only gather w_io * information when we walk the list of cpus if we're @@ -547,150 +536,7 @@ clock(void) } } while ((cp = cp->cpu_next) != cpu_list); - /* - * Do tick processing for all the active threads running in - * the system. We're trying to be more fair by walking the - * list of CPUs starting from a different CPUs each time. - */ - cp = clock_cpu_list; - nrunning = 0; - do { - klwp_id_t lwp; - int intr; - int thread_away; - - /* - * Don't do any tick processing on CPUs that - * aren't even in the system or aren't up yet. 
- */ - if ((cp->cpu_flags & CPU_EXISTS) == 0) { - continue; - } - - /* - * The locking here is rather tricky. We use - * thread_free_lock to keep the currently running - * thread from being freed or recycled while we're - * looking at it. We can then check if the thread - * is exiting and get the appropriate p_lock if it - * is not. We have to be careful, though, because - * the _process_ can still be freed while we're - * holding thread_free_lock. To avoid touching the - * proc structure we put a pointer to the p_lock in the - * thread structure. The p_lock is persistent so we - * can acquire it even if the process is gone. At that - * point we can check (again) if the thread is exiting - * and either drop the lock or do the tick processing. - */ - mutex_enter(&thread_free_lock); - /* - * We cannot hold the cpu_lock to prevent the - * cpu_list from changing in the clock interrupt. - * As long as we don't block (or don't get pre-empted) - * the cpu_list will not change (all threads are paused - * before list modification). If the list does change - * any deleted cpu structures will remain with cpu_next - * set to NULL, hence the following test. - */ - if (cp->cpu_next == NULL) { - mutex_exit(&thread_free_lock); - break; - } - t = cp->cpu_thread; /* Current running thread */ - if (CPU == cp) { - /* - * 't' will be the clock interrupt thread on this - * CPU. Use the pinned thread (if any) on this CPU - * as the target of the clock tick. If we pinned - * an interrupt, though, just keep using the clock - * interrupt thread since the formerly pinned one - * may have gone away. One interrupt thread is as - * good as another, and this means we don't have - * to continue to check pinned_intr in subsequent - * code. - */ - ASSERT(t == curthread); - if (t->t_intr != NULL && !pinned_intr) - t = t->t_intr; - } - - intr = t->t_flag & T_INTR_THREAD; - lwp = ttolwp(t); - if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) { - /* - * Thread is exiting (or uninteresting) so don't - * do tick processing or grab p_lock. Once we - * drop thread_free_lock we can't look inside the - * thread or lwp structure, since the thread may - * have gone away. - */ - exiting = 1; - } else { - /* - * OK, try to grab the process lock. See - * comments above for why we're not using - * ttoproc(t)->p_lockp here. - */ - plockp = t->t_plockp; - mutex_enter(plockp); - /* See above comment. */ - if (cp->cpu_next == NULL) { - mutex_exit(plockp); - mutex_exit(&thread_free_lock); - break; - } - /* - * The thread may have exited between when we - * checked above, and when we got the p_lock. - */ - if (t->t_proc_flag & TP_LWPEXIT) { - mutex_exit(plockp); - exiting = 1; - } else { - exiting = 0; - } - } - /* - * Either we have the p_lock for the thread's process, - * or we don't care about the thread structure any more. - * Either way we can drop thread_free_lock. - */ - mutex_exit(&thread_free_lock); - - /* - * Update user, system, and idle cpu times. - */ - if (one_sec) { - nrunning++; - cp->cpu_part->cp_nrunning++; - } - /* - * If we haven't done tick processing for this - * lwp, then do it now. Since we don't hold the - * lwp down on a CPU it can migrate and show up - * more than once, hence the lbolt check. - * - * Also, make sure that it's okay to perform the - * tick processing before calling clock_tick. - * Setting thread_away to a TRUE value (ie. not 0) - * results in tick processing not being performed for - * that thread. Or, in other words, keeps the thread - * away from clock_tick processing. 
- */ - thread_away = ((cp->cpu_flags & CPU_QUIESCED) || - CPU_ON_INTR(cp) || intr || - (cp->cpu_dispthread == cp->cpu_idle_thread) || exiting); - - if ((!thread_away) && (lbolt - t->t_lbolt != 0)) { - t->t_lbolt = lbolt; - clock_tick(t); - } - - if (!exiting) - mutex_exit(plockp); - } while ((cp = cp->cpu_next) != clock_cpu_list); - - clock_cpu_list = clock_cpu_list->cpu_next; + clock_tick_schedule(one_sec); /* * bump time in ticks @@ -1522,16 +1368,19 @@ ddi_hardpps(struct timeval *tvp, int usec) * Check for timer action, enforce CPU rlimit, do profiling etc. */ void -clock_tick(kthread_t *t) +clock_tick(kthread_t *t, int pending) { struct proc *pp; klwp_id_t lwp; struct as *as; - clock_t utime; - clock_t stime; + clock_t ticks; int poke = 0; /* notify another CPU */ int user_mode; size_t rss; + int i, total_usec, usec; + rctl_qty_t secs; + + ASSERT(pending > 0); /* Must be operating on a lwp/thread */ if ((lwp = ttolwp(t)) == NULL) { @@ -1539,8 +1388,10 @@ clock_tick(kthread_t *t) /*NOTREACHED*/ } - CL_TICK(t); /* Class specific tick processing */ - DTRACE_SCHED1(tick, kthread_t *, t); + for (i = 0; i < pending; i++) { + CL_TICK(t); /* Class specific tick processing */ + DTRACE_SCHED1(tick, kthread_t *, t); + } pp = ttoproc(t); @@ -1549,17 +1400,18 @@ clock_tick(kthread_t *t) user_mode = (lwp->lwp_state == LWP_USER); + ticks = (pp->p_utime + pp->p_stime) % hz; /* * Update process times. Should use high res clock and state * changes instead of statistical sampling method. XXX */ if (user_mode) { - pp->p_utime++; - pp->p_task->tk_cpu_time++; + pp->p_utime += pending; } else { - pp->p_stime++; - pp->p_task->tk_cpu_time++; + pp->p_stime += pending; } + + pp->p_ttime += pending; as = pp->p_as; /* @@ -1567,45 +1419,73 @@ clock_tick(kthread_t *t) * lwp when the AST happens. */ if (pp->p_prof.pr_scale) { - atomic_add_32(&lwp->lwp_oweupc, 1); + atomic_add_32(&lwp->lwp_oweupc, (int32_t)pending); if (user_mode) { poke = 1; aston(t); } } - utime = pp->p_utime; - stime = pp->p_stime; - /* * If CPU was in user state, process lwp-virtual time - * interval timer. + * interval timer. The value passed to itimerdecr() has to be + * in microseconds and has to be less than one second. Hence + * this loop. */ - if (user_mode && - timerisset(&lwp->lwp_timer[ITIMER_VIRTUAL].it_value) && - itimerdecr(&lwp->lwp_timer[ITIMER_VIRTUAL], usec_per_tick) == 0) { - poke = 1; - sigtoproc(pp, t, SIGVTALRM); + total_usec = usec_per_tick * pending; + while (total_usec > 0) { + usec = MIN(total_usec, (MICROSEC - 1)); + if (user_mode && + timerisset(&lwp->lwp_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&lwp->lwp_timer[ITIMER_VIRTUAL], usec) == 0) { + poke = 1; + sigtoproc(pp, t, SIGVTALRM); + } + total_usec -= usec; } - if (timerisset(&lwp->lwp_timer[ITIMER_PROF].it_value) && - itimerdecr(&lwp->lwp_timer[ITIMER_PROF], usec_per_tick) == 0) { - poke = 1; - sigtoproc(pp, t, SIGPROF); + /* + * If CPU was in user state, process lwp-profile + * interval timer. + */ + total_usec = usec_per_tick * pending; + while (total_usec > 0) { + usec = MIN(total_usec, (MICROSEC - 1)); + if (timerisset(&lwp->lwp_timer[ITIMER_PROF].it_value) && + itimerdecr(&lwp->lwp_timer[ITIMER_PROF], usec) == 0) { + poke = 1; + sigtoproc(pp, t, SIGPROF); + } + total_usec -= usec; } /* * Enforce CPU resource controls: * (a) process.max-cpu-time resource control + * + * Perform the check only if we have accumulated more a second. 
*/ - (void) rctl_test(rctlproc_legacy[RLIMIT_CPU], pp->p_rctls, pp, - (utime + stime)/hz, RCA_UNSAFE_SIGINFO); + if ((ticks + pending) >= hz) { + (void) rctl_test(rctlproc_legacy[RLIMIT_CPU], pp->p_rctls, pp, + (pp->p_utime + pp->p_stime)/hz, RCA_UNSAFE_SIGINFO); + } /* * (b) task.max-cpu-time resource control + * + * If we have accumulated enough ticks, increment the task CPU + * time usage and test for the resource limit. This minimizes the + * number of calls to the rct_test(). The task CPU time mutex + * is highly contentious as many processes can be sharing a task. */ - (void) rctl_test(rc_task_cpu_time, pp->p_task->tk_rctls, pp, 1, - RCA_UNSAFE_SIGINFO); + if (pp->p_ttime >= clock_tick_proc_max) { + secs = task_cpu_time_incr(pp->p_task, pp->p_ttime); + pp->p_ttime = 0; + if (secs) { + (void) rctl_test(rc_task_cpu_time, pp->p_task->tk_rctls, + pp, secs, RCA_UNSAFE_SIGINFO); + } + } /* * Update memory usage for the currently running process. diff --git a/usr/src/uts/common/os/clock_tick.c b/usr/src/uts/common/os/clock_tick.c new file mode 100644 index 0000000000..816f4978b1 --- /dev/null +++ b/usr/src/uts/common/os/clock_tick.c @@ -0,0 +1,699 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/task.h> +#include <sys/cmn_err.h> +#include <sys/class.h> +#include <sys/sdt.h> +#include <sys/atomic.h> +#include <sys/cpu.h> +#include <sys/clock_tick.h> +#include <sys/sysmacros.h> +#include <vm/rm.h> + +/* + * This file contains the implementation of clock tick accounting for threads. + * Every tick, user threads running on various CPUs are located and charged + * with a tick to account for their use of CPU time. + * + * Every tick, the clock() handler calls clock_tick_schedule() to perform tick + * accounting for all the threads in the system. Tick accounting is done in + * two phases: + * + * Tick scheduling Done in clock_tick_schedule(). In this phase, cross + * calls are scheduled to multiple CPUs to perform + * multi-threaded tick accounting. The CPUs are chosen + * on a rotational basis so as to distribute the tick + * accounting load evenly across all CPUs. + * + * Tick execution Done in clock_tick_execute(). In this phase, tick + * accounting is actually performed by softint handlers + * on multiple CPUs. + * + * This implementation gives us a multi-threaded tick processing facility that + * is suitable for configurations with a large number of CPUs. 
On smaller + * configurations it may be desirable to let the processing be single-threaded + * and just allow clock() to do it as it has been done traditionally. To + * facilitate this, a variable, clock_tick_threshold, is defined. Platforms + * that desire multi-threading should set this variable to something + * appropriate. A recommended value may be found in clock_tick.h. At boot time, + * if the number of CPUs is greater than clock_tick_threshold, multi-threading + * kicks in. Note that this is a decision made at boot time. If more CPUs + * are dynamically added later on to exceed the threshold, no attempt is made + * to switch to multi-threaded. Similarly, if CPUs are removed dynamically + * no attempt is made to switch to single-threaded. This is to keep the + * implementation simple. Also note that the threshold can be changed for a + * specific customer configuration via /etc/system. + * + * The boot time decision is reflected in clock_tick_single_threaded. + */ + +/* + * clock_tick_threshold + * If the number of CPUs at boot time exceeds this threshold, + * multi-threaded tick accounting kicks in. + * + * clock_tick_ncpus + * The number of CPUs in a set. Each set is scheduled for tick execution + * on a separate processor. + * + * clock_tick_single_threaded + * Indicates whether or not tick accounting is single threaded. + * + * clock_tick_total_cpus + * Total number of online CPUs. + * + * clock_tick_cpus + * Array of online CPU pointers. + * + * clock_tick_cpu + * Per-CPU, cache-aligned data structures to facilitate multi-threading. + * + * clock_tick_active + * Counter that indicates the number of active tick processing softints + * in the system. + * + * clock_tick_pending + * Number of pending ticks that need to be accounted by the softint + * handlers. + * + * clock_tick_lock + * Mutex to synchronize between clock_tick_schedule() and + * CPU online/offline. + * + * clock_cpu_id + * CPU id of the clock() CPU. Used to detect when the clock CPU + * is offlined. + * + * clock_tick_online_cpuset + * CPU set of all online processors that can be X-called. + * + * clock_tick_proc_max + * Each process is allowed to accumulate a few ticks before checking + * for the task CPU time resource limit. We lower the number of calls + * to rctl_test() to make tick accounting more scalable. The tradeoff + * is that the limit may not get enforced in a timely manner. This is + * typically not a problem. + * + * clock_tick_set + * Per-set structures. Each structure contains the range of CPUs + * to be processed for the set. + * + * clock_tick_nsets; + * Number of sets. + * + * clock_tick_scan + * Where to begin the scan for single-threaded mode. In multi-threaded, + * the clock_tick_set itself contains a field for this. + */ +int clock_tick_threshold; +int clock_tick_ncpus; +int clock_tick_single_threaded; +int clock_tick_total_cpus; +cpu_t *clock_tick_cpus[NCPU]; +clock_tick_cpu_t *clock_tick_cpu[NCPU]; +ulong_t clock_tick_active; +int clock_tick_pending; +kmutex_t clock_tick_lock; +processorid_t clock_cpu_id; +cpuset_t clock_tick_online_cpuset; +clock_t clock_tick_proc_max; +clock_tick_set_t *clock_tick_set; +int clock_tick_nsets; +int clock_tick_scan; + +static uint_t clock_tick_execute(caddr_t, caddr_t); +static void clock_tick_execute_common(int, int, int, clock_t, int); + +#define CLOCK_TICK_ALIGN 64 /* cache alignment */ + +/* + * Clock tick initialization is done in two phases: + * + * 1. 
Before clock_init() is called, clock_tick_init_pre() is called to set + * up single-threading so the clock() can begin to do its job. + * + * 2. After the slave CPUs are initialized at boot time, we know the number + * of CPUs. clock_tick_init_post() is called to set up multi-threading if + * required. + */ +void +clock_tick_init_pre(void) +{ + clock_tick_cpu_t *ctp; + int i, n; + clock_tick_set_t *csp; + uintptr_t buf; + size_t size; + + clock_tick_single_threaded = 1; + + size = P2ROUNDUP(sizeof (clock_tick_cpu_t), CLOCK_TICK_ALIGN); + buf = (uintptr_t)kmem_zalloc(size * NCPU + CLOCK_TICK_ALIGN, KM_SLEEP); + buf = P2ROUNDUP(buf, CLOCK_TICK_ALIGN); + + /* + * Perform initialization in case multi-threading is chosen later. + */ + for (i = 0; i < NCPU; i++, buf += size) { + ctp = (clock_tick_cpu_t *)buf; + clock_tick_cpu[i] = ctp; + mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL); + if (&create_softint != NULL) { + ctp->ct_intr = create_softint(LOCK_LEVEL, + clock_tick_execute, (caddr_t)ctp); + } + ctp->ct_pending = 0; + } + + mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Compute clock_tick_ncpus here. We need it to compute the + * maximum number of tick sets we need to support. + */ + ASSERT(clock_tick_ncpus >= 0); + if (clock_tick_ncpus == 0) + clock_tick_ncpus = CLOCK_TICK_NCPUS; + if (clock_tick_ncpus > max_ncpus) + clock_tick_ncpus = max_ncpus; + + /* + * Allocate and initialize the tick sets. + */ + n = (max_ncpus + clock_tick_ncpus - 1)/clock_tick_ncpus; + clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP); + for (i = 0; i < n; i++) { + csp = &clock_tick_set[i]; + csp->ct_start = i * clock_tick_ncpus; + csp->ct_scan = csp->ct_start; + csp->ct_end = csp->ct_start; + } +} + +void +clock_tick_init_post(void) +{ + /* + * If a platform does not provide create_softint() and invoke_softint(), + * then we assume single threaded. + */ + if (&invoke_softint == NULL) + clock_tick_threshold = 0; + + ASSERT(clock_tick_threshold >= 0); + + if (clock_tick_threshold == 0) + clock_tick_threshold = max_ncpus; + + /* + * If a platform does not specify a threshold or if the number of CPUs + * at boot time does not exceed the threshold, tick accounting remains + * single-threaded. + */ + if (ncpus <= clock_tick_threshold) { + clock_tick_ncpus = max_ncpus; + clock_tick_proc_max = 1; + return; + } + + /* + * OK. Multi-thread tick processing. If a platform has not specified + * the CPU set size for multi-threading, then use the default value. + * This value has been arrived through measurements on large + * configuration systems. + */ + clock_tick_single_threaded = 0; + if (clock_tick_proc_max == 0) { + clock_tick_proc_max = CLOCK_TICK_PROC_MAX; + if (hires_tick) + clock_tick_proc_max *= 10; + } +} + +static void +clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid) +{ + clock_tick_cpu_t *ctp; + + ASSERT(&invoke_softint != NULL); + /* + * Schedule tick accounting for a set of CPUs. + */ + ctp = clock_tick_cpu[cid]; + mutex_enter(&ctp->ct_lock); + ctp->ct_lbolt = lbolt; + ctp->ct_pending += pending; + ctp->ct_start = csp->ct_start; + ctp->ct_end = csp->ct_end; + ctp->ct_scan = csp->ct_scan; + mutex_exit(&ctp->ct_lock); + + invoke_softint(cid, ctp->ct_intr); + /* + * Return without waiting for the softint to finish. + */ +} + +static void +clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending) +{ + kthread_t *t; + kmutex_t *plockp; + int notick, intr; + klwp_id_t lwp; + + /* + * The locking here is rather tricky. 
thread_free_prevent() + * prevents the thread returned from being freed while we + * are looking at it. We can then check if the thread + * is exiting and get the appropriate p_lock if it + * is not. We have to be careful, though, because + * the _process_ can still be freed while we've + * prevented thread free. To avoid touching the + * proc structure we put a pointer to the p_lock in the + * thread structure. The p_lock is persistent so we + * can acquire it even if the process is gone. At that + * point we can check (again) if the thread is exiting + * and either drop the lock or do the tick processing. + */ + t = cp->cpu_thread; /* Current running thread */ + if (CPU == cp) { + /* + * 't' will be the tick processing thread on this + * CPU. Use the pinned thread (if any) on this CPU + * as the target of the clock tick. + */ + if (t->t_intr != NULL) + t = t->t_intr; + } + + /* + * We use thread_free_prevent to keep the currently running + * thread from being freed or recycled while we're + * looking at it. + */ + thread_free_prevent(t); + /* + * We cannot hold the cpu_lock to prevent the + * cpu_active from changing in the clock interrupt. + * As long as we don't block (or don't get pre-empted) + * the cpu_list will not change (all threads are paused + * before list modification). + */ + if (CLOCK_TICK_CPU_OFFLINE(cp)) { + thread_free_allow(t); + return; + } + + /* + * Make sure the thread is still on the CPU. + */ + if ((t != cp->cpu_thread) && + ((cp != CPU) || (t != cp->cpu_thread->t_intr))) { + /* + * We could not locate the thread. Skip this CPU. Race + * conditions while performing these checks are benign. + * These checks are not perfect and they don't need + * to be. + */ + thread_free_allow(t); + return; + } + + intr = t->t_flag & T_INTR_THREAD; + lwp = ttolwp(t); + if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) { + /* + * Thread is exiting (or uninteresting) so don't + * do tick processing. + */ + thread_free_allow(t); + return; + } + + /* + * OK, try to grab the process lock. See + * comments above for why we're not using + * ttoproc(t)->p_lockp here. + */ + plockp = t->t_plockp; + mutex_enter(plockp); + /* See above comment. */ + if (CLOCK_TICK_CPU_OFFLINE(cp)) { + mutex_exit(plockp); + thread_free_allow(t); + return; + } + + /* + * The thread may have exited between when we + * checked above, and when we got the p_lock. + */ + if (t->t_proc_flag & TP_LWPEXIT) { + mutex_exit(plockp); + thread_free_allow(t); + return; + } + + /* + * Either we have the p_lock for the thread's process, + * or we don't care about the thread structure any more. + * Either way we can allow thread free. + */ + thread_free_allow(t); + + /* + * If we haven't done tick processing for this + * lwp, then do it now. Since we don't hold the + * lwp down on a CPU it can migrate and show up + * more than once, hence the lbolt check. mylbolt + * is copied at the time of tick scheduling to prevent + * lbolt mismatches. + * + * Also, make sure that it's okay to perform the + * tick processing before calling clock_tick. + * Setting notick to a TRUE value (ie. not 0) + * results in tick processing not being performed for + * that thread. 
+ */ + notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) || + (cp->cpu_dispthread == cp->cpu_idle_thread)); + + if ((!notick) && (t->t_lbolt < mylbolt)) { + t->t_lbolt = mylbolt; + clock_tick(t, pending); + } + + mutex_exit(plockp); +} + +void +clock_tick_schedule(int one_sec) +{ + ulong_t active; + int i, end; + clock_tick_set_t *csp; + cpu_t *cp; + + if (clock_cpu_id != CPU->cpu_id) + clock_cpu_id = CPU->cpu_id; + + if (clock_tick_single_threaded) { + /* + * Each tick cycle, start the scan from a different + * CPU for the sake of fairness. + */ + end = clock_tick_total_cpus; + clock_tick_scan++; + if (clock_tick_scan >= end) + clock_tick_scan = 0; + + clock_tick_execute_common(0, clock_tick_scan, end, lbolt, 1); + + return; + } + + /* + * If the previous invocation of handlers is not yet finished, then + * simply increment a pending count and return. Eventually when they + * finish, the pending count is passed down to the next set of + * handlers to process. This way, ticks that have already elapsed + * in the past are handled as quickly as possible to minimize the + * chances of threads getting away before their pending ticks are + * accounted. The other benefit is that if the pending count is + * more than one, it can be handled by a single invocation of + * clock_tick(). This is a good optimization for large configuration + * busy systems where tick accounting can get backed up for various + * reasons. + */ + clock_tick_pending++; + + active = clock_tick_active; + active = atomic_cas_ulong(&clock_tick_active, active, active); + if (active) + return; + + /* + * We want to handle the clock CPU here. If we + * scheduled the accounting for the clock CPU to another + * processor, that processor will find only the clock() thread + * running and not account for any user thread below it. Also, + * we want to handle this before we block on anything and allow + * the pinned thread below the current thread to escape. + */ + clock_tick_process(CPU, lbolt, clock_tick_pending); + + mutex_enter(&clock_tick_lock); + + /* + * Schedule each set on a separate processor. + */ + cp = clock_cpu_list; + for (i = 0; i < clock_tick_nsets; i++) { + csp = &clock_tick_set[i]; + + /* + * Pick the next online CPU in list for scheduling tick + * accounting. The clock_tick_lock is held by the caller. + * So, CPU online/offline cannot muck with this while + * we are picking our CPU to X-call. + */ + if (cp == CPU) + cp = cp->cpu_next_onln; + + /* + * Each tick cycle, start the scan from a different + * CPU for the sake of fairness. + */ + csp->ct_scan++; + if (csp->ct_scan >= csp->ct_end) + csp->ct_scan = csp->ct_start; + + clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id); + + cp = cp->cpu_next_onln; + } + + if (one_sec) { + /* + * Move the CPU pointer around every second. This is so + * all the CPUs can be X-called in a round-robin fashion + * to evenly distribute the X-calls. We don't do this + * at a faster rate than this because we don't want + * to affect cache performance negatively. + */ + clock_cpu_list = clock_cpu_list->cpu_next_onln; + } + + mutex_exit(&clock_tick_lock); + + clock_tick_pending = 0; +} + +static void +clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt, + int pending) +{ + cpu_t *cp; + int i; + + ASSERT((start <= scan) && (scan <= end)); + + /* + * Handle the thread on current CPU first. This is to prevent a + * pinned thread from escaping if we ever block on something. + * Note that in the single-threaded mode, this handles the clock + * CPU. 
+ */ + clock_tick_process(CPU, mylbolt, pending); + + /* + * Perform tick accounting for the threads running on + * the scheduled CPUs. + */ + for (i = scan; i < end; i++) { + cp = clock_tick_cpus[i]; + if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id)) + continue; + clock_tick_process(cp, mylbolt, pending); + } + + for (i = start; i < scan; i++) { + cp = clock_tick_cpus[i]; + if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id)) + continue; + clock_tick_process(cp, mylbolt, pending); + } +} + +/*ARGSUSED*/ +static uint_t +clock_tick_execute(caddr_t arg1, caddr_t arg2) +{ + clock_tick_cpu_t *ctp; + int start, scan, end, pending; + clock_t mylbolt; + + /* + * We could have raced with cpu offline. We don't want to + * process anything on an offlined CPU. If we got blocked + * on anything, we may not get scheduled when we wakeup + * later on. + */ + if (!CLOCK_TICK_XCALL_SAFE(CPU)) + return (1); + + atomic_inc_ulong(&clock_tick_active); + + ctp = (clock_tick_cpu_t *)arg1; + mutex_enter(&ctp->ct_lock); + pending = ctp->ct_pending; + if (pending == 0) { + /* + * If a CPU is busy at LOCK_LEVEL, then an invocation + * of this softint may be queued for some time. In that case, + * clock_tick_active will not be incremented. + * clock_tick_schedule() will then assume that the previous + * invocation is done and post a new softint. The first one + * that gets in will reset the pending count so the + * second one is a noop. + */ + mutex_exit(&ctp->ct_lock); + goto out; + } + ctp->ct_pending = 0; + start = ctp->ct_start; + end = ctp->ct_end; + scan = ctp->ct_scan; + mylbolt = ctp->ct_lbolt; + mutex_exit(&ctp->ct_lock); + + clock_tick_execute_common(start, scan, end, mylbolt, pending); + +out: + /* + * Signal completion to the clock handler. + */ + atomic_dec_ulong(&clock_tick_active); + + return (1); +} + +/*ARGSUSED*/ +static int +clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg) +{ + cpu_t *cp, *ncp; + int i, set; + clock_tick_set_t *csp; + + /* + * This function performs some computations at CPU offline/online + * time. The computed values are used during tick scheduling and + * execution phases. This avoids having to compute things on + * an every tick basis. The other benefit is that we perform the + * computations only for onlined CPUs (not offlined ones). As a + * result, no tick processing is attempted for offlined CPUs. + * + * Also, cpu_offline() calls this function before checking for + * active interrupt threads. This allows us to avoid posting + * cross calls to CPUs that are being offlined. 
+ */ + + cp = cpu[cid]; + + mutex_enter(&clock_tick_lock); + + switch (what) { + case CPU_ON: + clock_tick_cpus[clock_tick_total_cpus] = cp; + set = clock_tick_total_cpus / clock_tick_ncpus; + csp = &clock_tick_set[set]; + csp->ct_end++; + clock_tick_total_cpus++; + clock_tick_nsets = + (clock_tick_total_cpus + clock_tick_ncpus - 1) / + clock_tick_ncpus; + CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id); + membar_sync(); + break; + + case CPU_OFF: + if (&sync_softint != NULL) + sync_softint(clock_tick_online_cpuset); + CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id); + clock_tick_total_cpus--; + clock_tick_cpus[clock_tick_total_cpus] = NULL; + clock_tick_nsets = + (clock_tick_total_cpus + clock_tick_ncpus - 1) / + clock_tick_ncpus; + set = clock_tick_total_cpus / clock_tick_ncpus; + csp = &clock_tick_set[set]; + csp->ct_end--; + + i = 0; + ncp = cpu_active; + do { + if (cp == ncp) + continue; + clock_tick_cpus[i] = ncp; + i++; + } while ((ncp = ncp->cpu_next_onln) != cpu_active); + ASSERT(i == clock_tick_total_cpus); + membar_sync(); + break; + + default: + break; + } + + mutex_exit(&clock_tick_lock); + + return (0); +} + + +void +clock_tick_mp_init(void) +{ + cpu_t *cp; + + mutex_enter(&cpu_lock); + + cp = cpu_active; + do { + (void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL); + } while ((cp = cp->cpu_next_onln) != cpu_active); + + register_cpu_setup_func(clock_tick_cpu_setup, NULL); + + mutex_exit(&cpu_lock); +} diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 8988a7b647..13cf752b45 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1785,9 +1785,6 @@ cpu_del_unit(int cpuid) cp->cpu_next->cpu_prev = cp->cpu_prev; if (cp == cpu_list) cpu_list = cpnext; - if (cp == clock_cpu_list) - clock_cpu_list = cpnext; - /* * Signals that the cpu has been deleted (see above). @@ -1882,6 +1879,9 @@ cpu_remove_active(cpu_t *cp) lgrp_config(LGRP_CONFIG_CPU_OFFLINE, (uintptr_t)cp, 0); + if (cp == clock_cpu_list) + clock_cpu_list = cp->cpu_next_onln; + cp->cpu_prev_onln->cpu_next_onln = cp->cpu_next_onln; cp->cpu_next_onln->cpu_prev_onln = cp->cpu_prev_onln; if (cpu_active == cp) { @@ -2797,7 +2797,6 @@ cpu_destroy_bound_threads(cpu_t *cp) mutex_exit(&pidlock); - for (t = tlist; t != NULL; t = tnext) { tnext = t->t_next; thread_free(t); diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index e39be3e9f2..71fc90a767 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -356,6 +356,17 @@ proc_exit(int why, int what) if (exitlwps(0) != 0) return (1); + mutex_enter(&p->p_lock); + if (p->p_ttime > 0) { + /* + * Account any remaining ticks charged to this process + * on its way out. + */ + (void) task_cpu_time_incr(p->p_task, p->p_ttime); + p->p_ttime = 0; + } + mutex_exit(&p->p_lock); + DTRACE_PROC(lwp__exit); DTRACE_PROC1(exit, int, why); diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index d008231021..7109a49cda 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -360,6 +360,8 @@ main(void) extern int netboot; extern void vm_init(void); extern void cbe_init(void); + extern void clock_tick_init_pre(void); + extern void clock_tick_init_post(void); extern void clock_init(void); extern void physio_bufs_init(void); extern void pm_cfb_setup_intr(void); @@ -399,6 +401,7 @@ main(void) callout_init(); /* callout table MUST be init'd before clock starts */ timer_init(); /* timer must be initialized before cyclic starts */ cbe_init(); + clock_tick_init_pre(); clock_init(); /* @@ -544,6 +547,8 @@ main(void) kmem_mp_init(); vmem_update(NULL); + clock_tick_init_post(); + for (initptr = &mp_init_tbl[0]; *initptr; initptr++) (**initptr)(); diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c index b3967546f5..628fcde30b 100644 --- a/usr/src/uts/common/os/task.c +++ b/usr/src/uts/common/os/task.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -202,7 +202,39 @@ task_cpu_time_usage(rctl_t *r, proc_t *p) task_t *t = p->p_task; ASSERT(MUTEX_HELD(&p->p_lock)); - return (t->tk_cpu_time / hz); + return (t->tk_cpu_time); +} + +/* + * int task_cpu_time_incr(task_t *t, rctl_qty_t incr) + * + * Overview + * task_cpu_time_incr() increments the amount of CPU time used + * by this task. + * + * Return values + * 1 if a second or more time is accumulated + * 0 otherwise + * + * Caller's context + * This is called by the clock tick accounting function to charge + * CPU time to a task. + */ +rctl_qty_t +task_cpu_time_incr(task_t *t, rctl_qty_t incr) +{ + rctl_qty_t ret = 0; + + mutex_enter(&t->tk_cpu_time_lock); + t->tk_cpu_ticks += incr; + if (t->tk_cpu_ticks >= hz) { + t->tk_cpu_time += t->tk_cpu_ticks / hz; + t->tk_cpu_ticks = t->tk_cpu_ticks % hz; + ret = t->tk_cpu_time; + } + mutex_exit(&t->tk_cpu_time_lock); + + return (ret); } /* @@ -224,15 +256,12 @@ static int task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags) { - task_t *t; - ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(e->rcep_t == RCENTITY_TASK); if (e->rcep_p.task == NULL) return (0); - t = e->rcep_p.task; - if ((t->tk_cpu_time + incr) / hz >= rcntl->rcv_value) + if (incr >= rcntl->rcv_value) return (1); return (0); diff --git a/usr/src/uts/common/sys/clock_tick.h b/usr/src/uts/common/sys/clock_tick.h new file mode 100644 index 0000000000..b3dc2198ed --- /dev/null +++ b/usr/src/uts/common/sys/clock_tick.h @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CLOCK_TICK_H +#define _SYS_CLOCK_TICK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/mutex.h> +#include <sys/cpuvar.h> +#include <sys/systm.h> +#include <sys/cyclic.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define CLOCK_TICK_NCPUS 32 + +/* + * Per-CPU structure to facilitate multi-threaded tick accounting. + * + * ct_lock + * Mutex for the structure. Used to lock the structure to pass + * arguments to the tick processing softint handler. + * ct_intr + * Tick processing softint handle. For parallelism, each CPU + * needs to have its own softint handle. + * ct_lbolt + * Copy of the lbolt at the time of tick scheduling. + * ct_pending + * Number of ticks to be processed by one invocation of the tick + * processing softint. + * ct_start + * First CPU to do tick processing for. + * ct_end + * Last CPU to do tick processing for. + * ct_scan + * CPU to start the tick processing from. Rotated every tick. + */ +typedef struct clock_tick_cpu { + kmutex_t ct_lock; + ulong_t ct_intr; + clock_t ct_lbolt; + int ct_pending; + int ct_start; + int ct_end; + int ct_scan; +} clock_tick_cpu_t; + +/* + * Per-set structure to facilitate multi-threaded tick accounting. + * clock_tick_lock protects this. + * + * ct_start + * First CPU to do tick processing for. + * ct_end + * Last CPU to do tick processing for. + * ct_scan + * CPU to start the tick processing from. Rotated every tick. + */ +typedef struct clock_tick_set { + int ct_start; + int ct_end; + int ct_scan; +} clock_tick_set_t; + +#define CLOCK_TICK_CPU_OFFLINE(cp) \ + (((cp) != cpu_active) && ((cp)->cpu_next_onln == (cp))) + +#define CLOCK_TICK_XCALL_SAFE(cp) \ + CPU_IN_SET(clock_tick_online_cpuset, cp->cpu_id) + +#define CLOCK_TICK_PROC_MAX 10 + +#ifdef _KERNEL +#pragma weak create_softint +extern ulong_t create_softint(uint_t, uint_t (*)(caddr_t, caddr_t), + caddr_t); +#pragma weak invoke_softint +extern void invoke_softint(processorid_t, ulong_t); +#pragma weak sync_softint +extern void sync_softint(cpuset_t); +extern void clock_tick(kthread_t *, int); +extern void membar_sync(void); + +extern int hires_tick; +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CLOCK_TICK_H */ diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index 4785796781..c7b76b32ea 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -519,6 +519,7 @@ extern cpuset_t cpu_seqid_inuse; extern struct cpu *cpu[]; /* indexed by CPU number */ extern cpu_t *cpu_list; /* list of CPUs */ +extern cpu_t *cpu_active; /* list of active CPUs */ extern int ncpus; /* number of CPUs present */ extern int ncpus_online; /* number of CPUs not quiesced */ extern int max_ncpus; /* max present before ncpus is known */ diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index 5a9b1caf50..46d205749a 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -348,6 +348,7 @@ typedef struct proc { /* protected by p_lock */ rctl_qty_t p_crypto_mem; /* /dev/crypto memory charged to proc */ /* protected by p_lock */ + clock_t p_ttime; /* buffered task time */ } proc_t; #define PROC_T /* headers relying on proc_t are OK */ diff --git a/usr/src/uts/common/sys/task.h b/usr/src/uts/common/sys/task.h index d1bcb04145..a22f64d970 100644 --- a/usr/src/uts/common/sys/task.h +++ b/usr/src/uts/common/sys/task.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -73,6 +73,8 @@ typedef struct task { task_usage_t *tk_inherited; /* task resource usage */ /* inherited with the first */ /* member process */ + rctl_qty_t tk_cpu_ticks; /* accumulated CPU ticks */ + kmutex_t tk_cpu_time_lock; /* accumulated CPU seconds lock */ } task_t; extern task_t *task0p; @@ -91,6 +93,7 @@ extern task_t *task_hold_by_id_zone(taskid_t, zoneid_t); extern void task_rele(task_t *); extern void task_hold(task_t *); extern void task_end(task_t *); +extern rctl_qty_t task_cpu_time_incr(task_t *, rctl_qty_t); #else /* _KERNEL */ diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index d545e093b3..7302289ea1 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -515,10 +515,22 @@ extern struct _kthread t0; /* the scheduler thread */ extern kmutex_t pidlock; /* global process lock */ /* - * thread_free_lock is used by the clock thread to keep a thread + * thread_free_lock is used by the tick accounting thread to keep a thread * from being freed while it is being examined. */ -extern kmutex_t thread_free_lock; +#define THREAD_FREE_NUM 1024 +#define THREAD_FREE_MASK (THREAD_FREE_NUM - 1) +#define THREAD_FREE_SHIFT_BITS 5 +#define THREAD_FREE_SHIFT(t) ((uintptr_t)t >> THREAD_FREE_SHIFT_BITS) +#define THREAD_FREE_HASH(t) (THREAD_FREE_SHIFT(t) & THREAD_FREE_MASK) + +typedef struct thread_free_lock { + kmutex_t tf_lock; + uchar_t tf_pad[64 - sizeof (kmutex_t)]; +} thread_free_lock_t; + +extern void thread_free_prevent(kthread_t *); +extern void thread_free_allow(kthread_t *); /* * Routines to change the priority and effective priority diff --git a/usr/src/uts/intel/ia32/ml/lock_prim.s b/usr/src/uts/intel/ia32/ml/lock_prim.s index 2d74137565..8dc51e3eeb 100644 --- a/usr/src/uts/intel/ia32/ml/lock_prim.s +++ b/usr/src/uts/intel/ia32/ml/lock_prim.s @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1278,6 +1278,10 @@ lockstat_hot_patch(void) /* XX64 membar_*() should be inlines */ void +membar_sync(void) +{} + +void membar_enter(void) {} @@ -1299,8 +1303,10 @@ membar_consumer(void) ENTRY(membar_enter) ALTENTRY(membar_exit) + ALTENTRY(membar_sync) mfence /* lighter weight than lock; xorq $0,(%rsp) */ ret + SET_SIZE(membar_sync) SET_SIZE(membar_exit) SET_SIZE(membar_enter) @@ -1318,9 +1324,11 @@ membar_consumer(void) ENTRY(membar_enter) ALTENTRY(membar_exit) + ALTENTRY(membar_sync) lock xorl $0, (%esp) ret + SET_SIZE(membar_sync) SET_SIZE(membar_exit) SET_SIZE(membar_enter) diff --git a/usr/src/uts/sun4/os/intr.c b/usr/src/uts/sun4/os/intr.c index 7f70424a45..d0830a261e 100644 --- a/usr/src/uts/sun4/os/intr.c +++ b/usr/src/uts/sun4/os/intr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -813,3 +813,37 @@ intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip) cp->cpu_intr_weight = 0; /* sanity */ mutex_exit(&intr_dist_cpu_lock); } + +ulong_t +create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1) +{ + uint64_t inum; + + inum = add_softintr(pil, func, arg1, SOFTINT_ST); + return ((ulong_t)inum); +} + +void +invoke_softint(processorid_t cpuid, ulong_t hdl) +{ + uint64_t inum = hdl; + + if (cpuid == CPU->cpu_id) + setsoftint(inum); + else + xt_one(cpuid, setsoftint_tl1, inum, 0); +} + +void +remove_softint(ulong_t hdl) +{ + uint64_t inum = hdl; + + (void) rem_softintr(inum); +} + +void +sync_softint(cpuset_t set) +{ + xt_sync(set); +} diff --git a/usr/src/uts/sun4/sys/ivintr.h b/usr/src/uts/sun4/sys/ivintr.h index 7ea9ae1d85..eb0a8656e3 100644 --- a/usr/src/uts/sun4/sys/ivintr.h +++ b/usr/src/uts/sun4/sys/ivintr.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,8 +42,9 @@ extern "C" { * interrupts. * * NOTE: Need two single target software interrupts per cpu for cyclics. + * Need one single target software interrupt per cpu for tick accounting. */ -#define MAX_RSVD_IV ((NCPU * 2) + 256) /* HW and Single target SW intrs */ +#define MAX_RSVD_IV ((NCPU * 3) + 256) /* HW and Single target SW intrs */ #define MAX_RSVD_IVX 32 /* Multi target software intrs */ #ifndef _ASM diff --git a/usr/src/uts/sun4u/opl/os/opl.c b/usr/src/uts/sun4u/opl/os/opl.c index f2d3162a93..f33b231117 100644 --- a/usr/src/uts/sun4u/opl/os/opl.c +++ b/usr/src/uts/sun4u/opl/os/opl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -116,6 +116,12 @@ static void pass2xscf_thread(); #define OPL_BOFF_MAX (40 * OPL_BOFF_SLEEP) #define OPL_BOFF_TM 1500 +#define OPL_CLOCK_TICK_THRESHOLD 128 +#define OPL_CLOCK_TICK_NCPUS 64 + +extern int clock_tick_threshold; +extern int clock_tick_ncpus; + int set_platform_max_ncpus(void) { @@ -943,6 +949,10 @@ plat_startup_memlist(caddr_t alloc_base) void startup_platform(void) { + if (clock_tick_threshold == 0) + clock_tick_threshold = OPL_CLOCK_TICK_THRESHOLD; + if (clock_tick_ncpus == 0) + clock_tick_ncpus = OPL_CLOCK_TICK_NCPUS; } void diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c index 78293c17af..333212b4f5 100644 --- a/usr/src/uts/sun4v/os/mach_startup.c +++ b/usr/src/uts/sun4v/os/mach_startup.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -57,6 +57,12 @@ extern void sfmmu_set_tsbs(void); */ static int enable_halt_idle_cpus = 1; +#define SUN4V_CLOCK_TICK_THRESHOLD 64 +#define SUN4V_CLOCK_TICK_NCPUS 64 + +extern int clock_tick_threshold; +extern int clock_tick_ncpus; + void setup_trap_table(void) { @@ -296,6 +302,10 @@ void startup_platform(void) { ip_squeue_soft_ring = B_TRUE; + if (clock_tick_threshold == 0) + clock_tick_threshold = SUN4V_CLOCK_TICK_THRESHOLD; + if (clock_tick_ncpus == 0) + clock_tick_ncpus = SUN4V_CLOCK_TICK_NCPUS; } /* |
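The patch replaces the single global thread_free_lock with a hash of per-bucket locks (THREAD_FREE_NUM buckets, indexed by THREAD_FREE_HASH on the thread pointer) so that tick accounting running on many CPUs no longer serializes on one mutex, while thread_free() can still wait out any examiner with a lock/unlock barrier. The following is a minimal user-space sketch of that hashed-lock idea; pthread mutexes, calloc(), and a plain `void *` thread pointer stand in for the kernel's kmutex_t, kmem_alloc(), and kthread_t, so this illustrates the scheme rather than reproducing the kernel code.

```c
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

#define THREAD_FREE_NUM         1024
#define THREAD_FREE_MASK        (THREAD_FREE_NUM - 1)
#define THREAD_FREE_SHIFT_BITS  5
#define THREAD_FREE_HASH(t) \
        ((((uintptr_t)(t)) >> THREAD_FREE_SHIFT_BITS) & THREAD_FREE_MASK)

typedef struct thread_free_lock {
        pthread_mutex_t tf_lock;        /* kernel version pads to a cache line */
} thread_free_lock_t;

static thread_free_lock_t *thread_free_lock;

/* Allocate and initialize the hash of locks (kernel: kmem_alloc + mutex_init). */
void
thread_free_lock_init(void)
{
        int i;

        thread_free_lock = calloc(THREAD_FREE_NUM, sizeof (thread_free_lock_t));
        for (i = 0; i < THREAD_FREE_NUM; i++)
                (void) pthread_mutex_init(&thread_free_lock[i].tf_lock, NULL);
}

/* Tick accounting holds the thread's bucket lock while examining the thread. */
void
thread_free_prevent(void *t)
{
        (void) pthread_mutex_lock(&thread_free_lock[THREAD_FREE_HASH(t)].tf_lock);
}

void
thread_free_allow(void *t)
{
        (void) pthread_mutex_unlock(&thread_free_lock[THREAD_FREE_HASH(t)].tf_lock);
}

/* thread_free() waits out any current examiner of this bucket before freeing. */
void
thread_free_barrier(void *t)
{
        pthread_mutex_t *lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;

        (void) pthread_mutex_lock(lp);
        (void) pthread_mutex_unlock(lp);
}
```

Only threads that hash to the same bucket contend; a freeing thread and an unrelated tick-accounting thread normally take different mutexes.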
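clock_tick() now buffers per-process ticks in p_ttime and folds them into the shared, heavily contended task counter only after clock_tick_proc_max ticks have accumulated; task_cpu_time_incr() then carries whole seconds into tk_cpu_time and returns the task total so the caller can run the resource-control test. Below is a small stand-alone sketch of that arithmetic, with an illustrative HZ of 100 and printf() standing in for rctl_test(); it is not the kernel implementation.

```c
#include <stdio.h>
#include <stdint.h>

#define HZ                   100        /* illustrative tick rate */
#define CLOCK_TICK_PROC_MAX  10         /* per-process flush threshold */

typedef struct task {
        uint64_t tk_cpu_ticks;  /* sub-second tick remainder */
        uint64_t tk_cpu_time;   /* whole CPU seconds used by the task */
} task_t;

/* Returns the task's CPU seconds if a full second was accumulated, else 0. */
static uint64_t
task_cpu_time_incr(task_t *tk, uint64_t incr)
{
        uint64_t secs = 0;

        tk->tk_cpu_ticks += incr;
        if (tk->tk_cpu_ticks >= HZ) {
                tk->tk_cpu_time += tk->tk_cpu_ticks / HZ;
                tk->tk_cpu_ticks %= HZ;
                secs = tk->tk_cpu_time;
        }
        return (secs);
}

int
main(void)
{
        task_t tk = { 0, 0 };
        uint64_t p_ttime = 0;   /* per-process buffered ticks */
        int tick;

        for (tick = 0; tick < 250; tick++) {
                p_ttime++;                      /* clock_tick() charges a tick */
                if (p_ttime >= CLOCK_TICK_PROC_MAX) {
                        uint64_t secs = task_cpu_time_incr(&tk, p_ttime);
                        p_ttime = 0;
                        if (secs != 0)          /* would call rctl_test() here */
                                printf("tick %3d: task has used %llu s\n",
                                    tick, (unsigned long long)secs);
                }
        }
        return (0);
}
```

The task lock is taken once every CLOCK_TICK_PROC_MAX ticks per process instead of on every tick, which is the scalability win; the trade-off, as the patch comments note, is that the limit may be enforced slightly late.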
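Because itimerdecr() takes its decrement in microseconds and that value must stay below one second, clock_tick() applies a batch of pending ticks to the ITIMER_VIRTUAL and ITIMER_PROF timers in sub-second chunks. The sketch below walks that chunking loop with a hypothetical decrement_timer() standing in for itimerdecr() and an illustrative tick rate of 100 Hz; it only demonstrates the arithmetic.

```c
#include <stdio.h>

#define MICROSEC        1000000
#define HZ              100
#define USEC_PER_TICK   (MICROSEC / HZ)

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

/* Hypothetical stand-in for itimerdecr(): returns 0 once the timer expires. */
static int
decrement_timer(long *remaining_usec, int usec)
{
        if (*remaining_usec <= usec) {
                *remaining_usec = 0;
                return (0);
        }
        *remaining_usec -= usec;
        return (1);
}

int
main(void)
{
        long timer = 2 * MICROSEC;      /* 2-second virtual interval timer */
        int pending = 250;              /* ticks owed to this thread */
        long total_usec = (long)USEC_PER_TICK * pending;

        while (total_usec > 0) {
                int usec = MIN(total_usec, MICROSEC - 1);

                if (decrement_timer(&timer, usec) == 0)
                        printf("timer expired; would post SIGVTALRM\n");
                total_usec -= usec;
        }
        printf("timer remaining: %ld usec\n", timer);
        return (0);
}
```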