author	mv143129 <none@none>	2008-01-07 18:49:45 -0800
committer	mv143129 <none@none>	2008-01-07 18:49:45 -0800
commit	2850d85b7b93f31e578520dc3b3feb24db609c62 (patch)
tree	2a1994e9e8b17b58b251e608de8c1a449291ba8f
parent	ca29f3da510ba7f712a40977b04aeceda9b70b95 (diff)
download	illumos-gate-2850d85b7b93f31e578520dc3b3feb24db609c62.tar.gz
6619224 Tick accounting needs to be made scalable
-rw-r--r--	usr/src/uts/common/Makefile.files	3
-rw-r--r--	usr/src/uts/common/conf/param.c	4
-rw-r--r--	usr/src/uts/common/disp/thread.c	50
-rw-r--r--	usr/src/uts/common/os/clock.c	264
-rw-r--r--	usr/src/uts/common/os/clock_tick.c	699
-rw-r--r--	usr/src/uts/common/os/cpu.c	9
-rw-r--r--	usr/src/uts/common/os/exit.c	13
-rw-r--r--	usr/src/uts/common/os/main.c	7
-rw-r--r--	usr/src/uts/common/os/task.c	41
-rw-r--r--	usr/src/uts/common/sys/clock_tick.h	118
-rw-r--r--	usr/src/uts/common/sys/cpuvar.h	3
-rw-r--r--	usr/src/uts/common/sys/proc.h	3
-rw-r--r--	usr/src/uts/common/sys/task.h	5
-rw-r--r--	usr/src/uts/common/sys/thread.h	18
-rw-r--r--	usr/src/uts/intel/ia32/ml/lock_prim.s	10
-rw-r--r--	usr/src/uts/sun4/os/intr.c	36
-rw-r--r--	usr/src/uts/sun4/sys/ivintr.h	5
-rw-r--r--	usr/src/uts/sun4u/opl/os/opl.c	12
-rw-r--r--	usr/src/uts/sun4v/os/mach_startup.c	12
19 files changed, 1087 insertions, 225 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 8adce7c1b1..d284271f12 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -67,7 +67,8 @@ COMMON_CORE_OBJS += \
thread_intr.o \
vm_page.o \
vm_pagelist.o \
- zlib_obj.o
+ zlib_obj.o \
+ clock_tick.o
CORE_OBJS += $(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS)
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index c35d3e7012..53a4b91775 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -194,6 +194,7 @@ extern void clock_highres_init(void);
extern void pg_init(void);
extern void pg_cmt_class_init(void);
extern void pg_cpu0_init(void);
+extern void clock_tick_mp_init(void);
void (*init_tbl[])(void) = {
system_taskq_init,
@@ -238,6 +239,7 @@ void (*mp_init_tbl[])(void) = {
#if defined(__sparc)
siron_mp_init,
#endif
+ clock_tick_mp_init,
0
};
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index 95e21ea1bf..ee2d80834d 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -89,11 +89,13 @@ static kcondvar_t reaper_cv; /* synchronization var */
kthread_t *thread_deathrow; /* circular list of reapable threads */
kthread_t *lwp_deathrow; /* circular list of reapable threads */
kmutex_t reaplock; /* protects lwp and thread deathrows */
-kmutex_t thread_free_lock; /* protects clock from reaper */
int thread_reapcnt = 0; /* number of threads on deathrow */
int lwp_reapcnt = 0; /* number of lwps on deathrow */
int reaplimit = 16; /* delay reaping until reaplimit */
+thread_free_lock_t *thread_free_lock;
+ /* protects tick thread from reaper */
+
extern int nthread;
id_t syscid; /* system scheduling class ID */
@@ -152,8 +154,16 @@ thread_init(void)
extern char sys_name[];
extern void idle();
struct cpu *cpu = CPU;
+ int i;
+ kmutex_t *lp;
mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
+ thread_free_lock =
+ kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
+ for (i = 0; i < THREAD_FREE_NUM; i++) {
+ lp = &thread_free_lock[i].tf_lock;
+ mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
+ }
#if defined(__i386) || defined(__amd64)
thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
@@ -663,6 +673,34 @@ thread_join(kt_did_t tid)
}
void
+thread_free_prevent(kthread_t *t)
+{
+ kmutex_t *lp;
+
+ lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
+ mutex_enter(lp);
+}
+
+void
+thread_free_allow(kthread_t *t)
+{
+ kmutex_t *lp;
+
+ lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
+ mutex_exit(lp);
+}
+
+static void
+thread_free_barrier(kthread_t *t)
+{
+ kmutex_t *lp;
+
+ lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
+ mutex_enter(lp);
+ mutex_exit(lp);
+}
+
+void
thread_free(kthread_t *t)
{
ASSERT(t != &t0 && t->t_state == TS_FREE);
@@ -714,11 +752,11 @@ thread_free(kthread_t *t)
free_afd(&t->t_activefd);
/*
- * Barrier for clock thread. The clock holds this lock to
- * keep the thread from going away while it's looking at it.
+ * Barrier for the tick accounting code. The tick accounting code
+ * holds this lock to keep the thread from going away while it's
+ * looking at it.
*/
- mutex_enter(&thread_free_lock);
- mutex_exit(&thread_free_lock);
+ thread_free_barrier(t);
ASSERT(ttoproj(t) == proj0p);
project_rele(ttoproj(t));
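
The hunk above replaces the single global thread_free_lock with an array of padded locks hashed by thread pointer, so tick accounting on one CPU no longer serializes against the reaper for every thread in the system. A minimal user-space sketch of the same pattern (hypothetical names; pthread mutexes standing in for kernel mutexes; the 1024/shift-5 constants mirror the THREAD_FREE_NUM and THREAD_FREE_SHIFT_BITS definitions added to thread.h later in this diff, and the 64-byte pad is an assumption):

#include <pthread.h>
#include <stdint.h>

#define FREE_LOCK_NUM    1024                   /* power of two */
#define FREE_LOCK_MASK   (FREE_LOCK_NUM - 1)
#define FREE_LOCK_SHIFT  5                      /* skip low (aligned) pointer bits */
#define FREE_LOCK_HASH(p) ((((uintptr_t)(p)) >> FREE_LOCK_SHIFT) & FREE_LOCK_MASK)

/* Pad each lock so neighboring buckets do not share a cache line. */
typedef struct free_lock {
    pthread_mutex_t fl_lock;
    char            fl_pad[64];
} free_lock_t;

static free_lock_t free_locks[FREE_LOCK_NUM];

static void
free_locks_init(void)
{
    int i;

    for (i = 0; i < FREE_LOCK_NUM; i++)
        (void) pthread_mutex_init(&free_locks[i].fl_lock, NULL);
}

/* Examiner side: pin the object so it cannot be freed while we look at it. */
static void
free_prevent(void *obj)
{
    (void) pthread_mutex_lock(&free_locks[FREE_LOCK_HASH(obj)].fl_lock);
}

static void
free_allow(void *obj)
{
    (void) pthread_mutex_unlock(&free_locks[FREE_LOCK_HASH(obj)].fl_lock);
}

/* Freeing side: enter/exit the bucket so any in-flight examiner finishes first. */
static void
free_barrier(void *obj)
{
    pthread_mutex_t *lp = &free_locks[FREE_LOCK_HASH(obj)].fl_lock;

    (void) pthread_mutex_lock(lp);
    (void) pthread_mutex_unlock(lp);
}

int
main(void)
{
    int obj;

    free_locks_init();
    free_prevent(&obj);     /* examiner pins the object */
    free_allow(&obj);
    free_barrier(&obj);     /* freer waits out any examiner of this bucket */
    return (0);
}
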
diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c
index 28d5eece05..f73f758bbf 100644
--- a/usr/src/uts/common/os/clock.c
+++ b/usr/src/uts/common/os/clock.c
@@ -23,7 +23,7 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -253,6 +253,8 @@ cyclic_id_t clock_cyclic; /* clock()'s cyclic_id */
cyclic_id_t deadman_cyclic; /* deadman()'s cyclic_id */
cyclic_id_t ddi_timer_cyclic; /* cyclic_timer()'s cyclic_id */
+extern void clock_tick_schedule(int);
+
static int lgrp_ticks; /* counter to schedule lgrp load calcs */
/*
@@ -306,7 +308,6 @@ static int adj_hist_entry;
int64_t clock_adj_hist[CLOCK_ADJ_HIST_SIZE];
-static void clock_tick(kthread_t *);
static void calcloadavg(int, uint64_t *);
static int genloadavg(struct loadavg_s *);
static void loadavg_update();
@@ -314,17 +315,16 @@ static void loadavg_update();
void (*cmm_clock_callout)() = NULL;
void (*cpucaps_clock_callout)() = NULL;
+extern clock_t clock_tick_proc_max;
+
static void
clock(void)
{
kthread_t *t;
- kmutex_t *plockp; /* pointer to thread's process lock */
- int pinned_intr = 0;
- uint_t nrunnable, nrunning;
+ uint_t nrunnable;
uint_t w_io;
cpu_t *cp;
cpupart_t *cpupart;
- int exiting;
extern void set_anoninfo();
extern void set_freemem();
void (*funcp)();
@@ -379,22 +379,7 @@ clock(void)
* every timer interrupt.
*
* Continue with the interrupt processing as scheduled.
- *
- * Did we pin another interrupt thread? Need to check this before
- * grabbing any adaptive locks, since if we block on a lock the
- * pinned thread could escape. Note that this is just a heuristic;
- * if we take multiple laps though clock() without returning from
- * the interrupt because we have another clock tick pending, then
- * the pinned interrupt could be released by one of the previous
- * laps. The only consequence is that the CPU will be counted as
- * in idle (or wait) state once the pinned interrupt is released.
- * Since this accounting is inaccurate by nature, this isn't a big
- * deal --- but we should try to get it right in the common case
- * where we only call clock() once per interrupt.
*/
- if (curthread->t_intr != NULL)
- pinned_intr = (curthread->t_intr->t_flag & T_INTR_THREAD);
-
/*
* Count the number of runnable threads and the number waiting
* for some form of I/O to complete -- gets added to
@@ -448,6 +433,10 @@ clock(void)
if (one_sec) {
cpupart->cp_nrunnable += cpu_nrunnable;
/*
+ * Update user, system, and idle cpu times.
+ */
+ cpupart->cp_nrunning++;
+ /*
* w_io is used to update sysinfo.waiting during
* one_second processing below. Only gather w_io
* information when we walk the list of cpus if we're
@@ -547,150 +536,7 @@ clock(void)
}
} while ((cp = cp->cpu_next) != cpu_list);
- /*
- * Do tick processing for all the active threads running in
- * the system. We're trying to be more fair by walking the
- * list of CPUs starting from a different CPUs each time.
- */
- cp = clock_cpu_list;
- nrunning = 0;
- do {
- klwp_id_t lwp;
- int intr;
- int thread_away;
-
- /*
- * Don't do any tick processing on CPUs that
- * aren't even in the system or aren't up yet.
- */
- if ((cp->cpu_flags & CPU_EXISTS) == 0) {
- continue;
- }
-
- /*
- * The locking here is rather tricky. We use
- * thread_free_lock to keep the currently running
- * thread from being freed or recycled while we're
- * looking at it. We can then check if the thread
- * is exiting and get the appropriate p_lock if it
- * is not. We have to be careful, though, because
- * the _process_ can still be freed while we're
- * holding thread_free_lock. To avoid touching the
- * proc structure we put a pointer to the p_lock in the
- * thread structure. The p_lock is persistent so we
- * can acquire it even if the process is gone. At that
- * point we can check (again) if the thread is exiting
- * and either drop the lock or do the tick processing.
- */
- mutex_enter(&thread_free_lock);
- /*
- * We cannot hold the cpu_lock to prevent the
- * cpu_list from changing in the clock interrupt.
- * As long as we don't block (or don't get pre-empted)
- * the cpu_list will not change (all threads are paused
- * before list modification). If the list does change
- * any deleted cpu structures will remain with cpu_next
- * set to NULL, hence the following test.
- */
- if (cp->cpu_next == NULL) {
- mutex_exit(&thread_free_lock);
- break;
- }
- t = cp->cpu_thread; /* Current running thread */
- if (CPU == cp) {
- /*
- * 't' will be the clock interrupt thread on this
- * CPU. Use the pinned thread (if any) on this CPU
- * as the target of the clock tick. If we pinned
- * an interrupt, though, just keep using the clock
- * interrupt thread since the formerly pinned one
- * may have gone away. One interrupt thread is as
- * good as another, and this means we don't have
- * to continue to check pinned_intr in subsequent
- * code.
- */
- ASSERT(t == curthread);
- if (t->t_intr != NULL && !pinned_intr)
- t = t->t_intr;
- }
-
- intr = t->t_flag & T_INTR_THREAD;
- lwp = ttolwp(t);
- if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
- /*
- * Thread is exiting (or uninteresting) so don't
- * do tick processing or grab p_lock. Once we
- * drop thread_free_lock we can't look inside the
- * thread or lwp structure, since the thread may
- * have gone away.
- */
- exiting = 1;
- } else {
- /*
- * OK, try to grab the process lock. See
- * comments above for why we're not using
- * ttoproc(t)->p_lockp here.
- */
- plockp = t->t_plockp;
- mutex_enter(plockp);
- /* See above comment. */
- if (cp->cpu_next == NULL) {
- mutex_exit(plockp);
- mutex_exit(&thread_free_lock);
- break;
- }
- /*
- * The thread may have exited between when we
- * checked above, and when we got the p_lock.
- */
- if (t->t_proc_flag & TP_LWPEXIT) {
- mutex_exit(plockp);
- exiting = 1;
- } else {
- exiting = 0;
- }
- }
- /*
- * Either we have the p_lock for the thread's process,
- * or we don't care about the thread structure any more.
- * Either way we can drop thread_free_lock.
- */
- mutex_exit(&thread_free_lock);
-
- /*
- * Update user, system, and idle cpu times.
- */
- if (one_sec) {
- nrunning++;
- cp->cpu_part->cp_nrunning++;
- }
- /*
- * If we haven't done tick processing for this
- * lwp, then do it now. Since we don't hold the
- * lwp down on a CPU it can migrate and show up
- * more than once, hence the lbolt check.
- *
- * Also, make sure that it's okay to perform the
- * tick processing before calling clock_tick.
- * Setting thread_away to a TRUE value (ie. not 0)
- * results in tick processing not being performed for
- * that thread. Or, in other words, keeps the thread
- * away from clock_tick processing.
- */
- thread_away = ((cp->cpu_flags & CPU_QUIESCED) ||
- CPU_ON_INTR(cp) || intr ||
- (cp->cpu_dispthread == cp->cpu_idle_thread) || exiting);
-
- if ((!thread_away) && (lbolt - t->t_lbolt != 0)) {
- t->t_lbolt = lbolt;
- clock_tick(t);
- }
-
- if (!exiting)
- mutex_exit(plockp);
- } while ((cp = cp->cpu_next) != clock_cpu_list);
-
- clock_cpu_list = clock_cpu_list->cpu_next;
+ clock_tick_schedule(one_sec);
/*
* bump time in ticks
@@ -1522,16 +1368,19 @@ ddi_hardpps(struct timeval *tvp, int usec)
* Check for timer action, enforce CPU rlimit, do profiling etc.
*/
void
-clock_tick(kthread_t *t)
+clock_tick(kthread_t *t, int pending)
{
struct proc *pp;
klwp_id_t lwp;
struct as *as;
- clock_t utime;
- clock_t stime;
+ clock_t ticks;
int poke = 0; /* notify another CPU */
int user_mode;
size_t rss;
+ int i, total_usec, usec;
+ rctl_qty_t secs;
+
+ ASSERT(pending > 0);
/* Must be operating on a lwp/thread */
if ((lwp = ttolwp(t)) == NULL) {
@@ -1539,8 +1388,10 @@ clock_tick(kthread_t *t)
/*NOTREACHED*/
}
- CL_TICK(t); /* Class specific tick processing */
- DTRACE_SCHED1(tick, kthread_t *, t);
+ for (i = 0; i < pending; i++) {
+ CL_TICK(t); /* Class specific tick processing */
+ DTRACE_SCHED1(tick, kthread_t *, t);
+ }
pp = ttoproc(t);
@@ -1549,17 +1400,18 @@ clock_tick(kthread_t *t)
user_mode = (lwp->lwp_state == LWP_USER);
+ ticks = (pp->p_utime + pp->p_stime) % hz;
/*
* Update process times. Should use high res clock and state
* changes instead of statistical sampling method. XXX
*/
if (user_mode) {
- pp->p_utime++;
- pp->p_task->tk_cpu_time++;
+ pp->p_utime += pending;
} else {
- pp->p_stime++;
- pp->p_task->tk_cpu_time++;
+ pp->p_stime += pending;
}
+
+ pp->p_ttime += pending;
as = pp->p_as;
/*
@@ -1567,45 +1419,73 @@ clock_tick(kthread_t *t)
* lwp when the AST happens.
*/
if (pp->p_prof.pr_scale) {
- atomic_add_32(&lwp->lwp_oweupc, 1);
+ atomic_add_32(&lwp->lwp_oweupc, (int32_t)pending);
if (user_mode) {
poke = 1;
aston(t);
}
}
- utime = pp->p_utime;
- stime = pp->p_stime;
-
/*
* If CPU was in user state, process lwp-virtual time
- * interval timer.
+ * interval timer. The value passed to itimerdecr() has to be
+ * in microseconds and has to be less than one second. Hence
+ * this loop.
*/
- if (user_mode &&
- timerisset(&lwp->lwp_timer[ITIMER_VIRTUAL].it_value) &&
- itimerdecr(&lwp->lwp_timer[ITIMER_VIRTUAL], usec_per_tick) == 0) {
- poke = 1;
- sigtoproc(pp, t, SIGVTALRM);
+ total_usec = usec_per_tick * pending;
+ while (total_usec > 0) {
+ usec = MIN(total_usec, (MICROSEC - 1));
+ if (user_mode &&
+ timerisset(&lwp->lwp_timer[ITIMER_VIRTUAL].it_value) &&
+ itimerdecr(&lwp->lwp_timer[ITIMER_VIRTUAL], usec) == 0) {
+ poke = 1;
+ sigtoproc(pp, t, SIGVTALRM);
+ }
+ total_usec -= usec;
}
- if (timerisset(&lwp->lwp_timer[ITIMER_PROF].it_value) &&
- itimerdecr(&lwp->lwp_timer[ITIMER_PROF], usec_per_tick) == 0) {
- poke = 1;
- sigtoproc(pp, t, SIGPROF);
+ /*
+ * If CPU was in user state, process lwp-profile
+ * interval timer.
+ */
+ total_usec = usec_per_tick * pending;
+ while (total_usec > 0) {
+ usec = MIN(total_usec, (MICROSEC - 1));
+ if (timerisset(&lwp->lwp_timer[ITIMER_PROF].it_value) &&
+ itimerdecr(&lwp->lwp_timer[ITIMER_PROF], usec) == 0) {
+ poke = 1;
+ sigtoproc(pp, t, SIGPROF);
+ }
+ total_usec -= usec;
}
/*
* Enforce CPU resource controls:
* (a) process.max-cpu-time resource control
+ *
+ * Perform the check only if we have accumulated more than a second.
*/
- (void) rctl_test(rctlproc_legacy[RLIMIT_CPU], pp->p_rctls, pp,
- (utime + stime)/hz, RCA_UNSAFE_SIGINFO);
+ if ((ticks + pending) >= hz) {
+ (void) rctl_test(rctlproc_legacy[RLIMIT_CPU], pp->p_rctls, pp,
+ (pp->p_utime + pp->p_stime)/hz, RCA_UNSAFE_SIGINFO);
+ }
/*
* (b) task.max-cpu-time resource control
+ *
+ * If we have accumulated enough ticks, increment the task CPU
+ * time usage and test for the resource limit. This minimizes the
+ * number of calls to rctl_test(). The task CPU time mutex
+ * is highly contentious as many processes can be sharing a task.
*/
- (void) rctl_test(rc_task_cpu_time, pp->p_task->tk_rctls, pp, 1,
- RCA_UNSAFE_SIGINFO);
+ if (pp->p_ttime >= clock_tick_proc_max) {
+ secs = task_cpu_time_incr(pp->p_task, pp->p_ttime);
+ pp->p_ttime = 0;
+ if (secs) {
+ (void) rctl_test(rc_task_cpu_time, pp->p_task->tk_rctls,
+ pp, secs, RCA_UNSAFE_SIGINFO);
+ }
+ }
/*
* Update memory usage for the currently running process.
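
Because the rewritten clock_tick() can be handed several pending ticks at once, it feeds the per-lwp interval timers in chunks of less than one second: per the comment in the hunk above, the decrement routine only accepts sub-second microsecond values. A self-contained sketch of that chunking loop, using a hypothetical decrement_timer() in place of itimerdecr() and an assumed tick rate of 100:

#include <stdio.h>

#define MICROSEC        1000000
#define HZ              100                     /* assumed tick rate */
#define USEC_PER_TICK   (MICROSEC / HZ)

/* Hypothetical stand-in for itimerdecr(): accepts strictly less than 1 second. */
static int
decrement_timer(long *remaining_usec, int usec)
{
    if (*remaining_usec > usec) {
        *remaining_usec -= usec;
        return (1);                     /* timer still running */
    }
    *remaining_usec = 0;
    return (0);                         /* timer fired */
}

int
main(void)
{
    long timer = 3 * MICROSEC;          /* a 3-second virtual interval timer */
    int pending = 250;                  /* 250 accumulated ticks = 2.5 seconds */
    long total_usec = (long)USEC_PER_TICK * pending;

    /* Feed the timer in chunks of less than one second each. */
    while (total_usec > 0) {
        int usec = (total_usec < MICROSEC - 1) ? (int)total_usec : MICROSEC - 1;

        if (decrement_timer(&timer, usec) == 0)
            printf("timer expired\n");
        total_usec -= usec;
    }
    printf("remaining: %ld usec\n", timer);     /* 500000 for this input */
    return (0);
}
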
diff --git a/usr/src/uts/common/os/clock_tick.c b/usr/src/uts/common/os/clock_tick.c
new file mode 100644
index 0000000000..816f4978b1
--- /dev/null
+++ b/usr/src/uts/common/os/clock_tick.c
@@ -0,0 +1,699 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/task.h>
+#include <sys/cmn_err.h>
+#include <sys/class.h>
+#include <sys/sdt.h>
+#include <sys/atomic.h>
+#include <sys/cpu.h>
+#include <sys/clock_tick.h>
+#include <sys/sysmacros.h>
+#include <vm/rm.h>
+
+/*
+ * This file contains the implementation of clock tick accounting for threads.
+ * Every tick, user threads running on various CPUs are located and charged
+ * with a tick to account for their use of CPU time.
+ *
+ * Every tick, the clock() handler calls clock_tick_schedule() to perform tick
+ * accounting for all the threads in the system. Tick accounting is done in
+ * two phases:
+ *
+ * Tick scheduling Done in clock_tick_schedule(). In this phase, cross
+ * calls are scheduled to multiple CPUs to perform
+ * multi-threaded tick accounting. The CPUs are chosen
+ * on a rotational basis so as to distribute the tick
+ * accounting load evenly across all CPUs.
+ *
+ * Tick execution Done in clock_tick_execute(). In this phase, tick
+ * accounting is actually performed by softint handlers
+ * on multiple CPUs.
+ *
+ * This implementation gives us a multi-threaded tick processing facility that
+ * is suitable for configurations with a large number of CPUs. On smaller
+ * configurations it may be desirable to let the processing be single-threaded
+ * and just allow clock() to do it as it has been done traditionally. To
+ * facilitate this, a variable, clock_tick_threshold, is defined. Platforms
+ * that desire multi-threading should set this variable to something
+ * appropriate. A recommended value may be found in clock_tick.h. At boot time,
+ * if the number of CPUs is greater than clock_tick_threshold, multi-threading
+ * kicks in. Note that this is a decision made at boot time. If more CPUs
+ * are dynamically added later on to exceed the threshold, no attempt is made
+ * to switch to multi-threaded. Similarly, if CPUs are removed dynamically
+ * no attempt is made to switch to single-threaded. This is to keep the
+ * implementation simple. Also note that the threshold can be changed for a
+ * specific customer configuration via /etc/system.
+ *
+ * The boot time decision is reflected in clock_tick_single_threaded.
+ */
+
+/*
+ * clock_tick_threshold
+ * If the number of CPUs at boot time exceeds this threshold,
+ * multi-threaded tick accounting kicks in.
+ *
+ * clock_tick_ncpus
+ * The number of CPUs in a set. Each set is scheduled for tick execution
+ * on a separate processor.
+ *
+ * clock_tick_single_threaded
+ * Indicates whether or not tick accounting is single threaded.
+ *
+ * clock_tick_total_cpus
+ * Total number of online CPUs.
+ *
+ * clock_tick_cpus
+ * Array of online CPU pointers.
+ *
+ * clock_tick_cpu
+ * Per-CPU, cache-aligned data structures to facilitate multi-threading.
+ *
+ * clock_tick_active
+ * Counter that indicates the number of active tick processing softints
+ * in the system.
+ *
+ * clock_tick_pending
+ * Number of pending ticks that need to be accounted by the softint
+ * handlers.
+ *
+ * clock_tick_lock
+ * Mutex to synchronize between clock_tick_schedule() and
+ * CPU online/offline.
+ *
+ * clock_cpu_id
+ * CPU id of the clock() CPU. Used to detect when the clock CPU
+ * is offlined.
+ *
+ * clock_tick_online_cpuset
+ * CPU set of all online processors that can be X-called.
+ *
+ * clock_tick_proc_max
+ * Each process is allowed to accumulate a few ticks before checking
+ * for the task CPU time resource limit. We lower the number of calls
+ * to rctl_test() to make tick accounting more scalable. The tradeoff
+ * is that the limit may not get enforced in a timely manner. This is
+ * typically not a problem.
+ *
+ * clock_tick_set
+ * Per-set structures. Each structure contains the range of CPUs
+ * to be processed for the set.
+ *
+ * clock_tick_nsets;
+ * Number of sets.
+ *
+ * clock_tick_scan
+ * Where to begin the scan for single-threaded mode. In multi-threaded,
+ * the clock_tick_set itself contains a field for this.
+ */
+int clock_tick_threshold;
+int clock_tick_ncpus;
+int clock_tick_single_threaded;
+int clock_tick_total_cpus;
+cpu_t *clock_tick_cpus[NCPU];
+clock_tick_cpu_t *clock_tick_cpu[NCPU];
+ulong_t clock_tick_active;
+int clock_tick_pending;
+kmutex_t clock_tick_lock;
+processorid_t clock_cpu_id;
+cpuset_t clock_tick_online_cpuset;
+clock_t clock_tick_proc_max;
+clock_tick_set_t *clock_tick_set;
+int clock_tick_nsets;
+int clock_tick_scan;
+
+static uint_t clock_tick_execute(caddr_t, caddr_t);
+static void clock_tick_execute_common(int, int, int, clock_t, int);
+
+#define CLOCK_TICK_ALIGN 64 /* cache alignment */
+
+/*
+ * Clock tick initialization is done in two phases:
+ *
+ * 1. Before clock_init() is called, clock_tick_init_pre() is called to set
+ * up single-threading so the clock() can begin to do its job.
+ *
+ * 2. After the slave CPUs are initialized at boot time, we know the number
+ * of CPUs. clock_tick_init_post() is called to set up multi-threading if
+ * required.
+ */
+void
+clock_tick_init_pre(void)
+{
+ clock_tick_cpu_t *ctp;
+ int i, n;
+ clock_tick_set_t *csp;
+ uintptr_t buf;
+ size_t size;
+
+ clock_tick_single_threaded = 1;
+
+ size = P2ROUNDUP(sizeof (clock_tick_cpu_t), CLOCK_TICK_ALIGN);
+ buf = (uintptr_t)kmem_zalloc(size * NCPU + CLOCK_TICK_ALIGN, KM_SLEEP);
+ buf = P2ROUNDUP(buf, CLOCK_TICK_ALIGN);
+
+ /*
+ * Perform initialization in case multi-threading is chosen later.
+ */
+ for (i = 0; i < NCPU; i++, buf += size) {
+ ctp = (clock_tick_cpu_t *)buf;
+ clock_tick_cpu[i] = ctp;
+ mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL);
+ if (&create_softint != NULL) {
+ ctp->ct_intr = create_softint(LOCK_LEVEL,
+ clock_tick_execute, (caddr_t)ctp);
+ }
+ ctp->ct_pending = 0;
+ }
+
+ mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Compute clock_tick_ncpus here. We need it to compute the
+ * maximum number of tick sets we need to support.
+ */
+ ASSERT(clock_tick_ncpus >= 0);
+ if (clock_tick_ncpus == 0)
+ clock_tick_ncpus = CLOCK_TICK_NCPUS;
+ if (clock_tick_ncpus > max_ncpus)
+ clock_tick_ncpus = max_ncpus;
+
+ /*
+ * Allocate and initialize the tick sets.
+ */
+ n = (max_ncpus + clock_tick_ncpus - 1)/clock_tick_ncpus;
+ clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP);
+ for (i = 0; i < n; i++) {
+ csp = &clock_tick_set[i];
+ csp->ct_start = i * clock_tick_ncpus;
+ csp->ct_scan = csp->ct_start;
+ csp->ct_end = csp->ct_start;
+ }
+}
+
+void
+clock_tick_init_post(void)
+{
+ /*
+ * If a platform does not provide create_softint() and invoke_softint(),
+ * then we assume single threaded.
+ */
+ if (&invoke_softint == NULL)
+ clock_tick_threshold = 0;
+
+ ASSERT(clock_tick_threshold >= 0);
+
+ if (clock_tick_threshold == 0)
+ clock_tick_threshold = max_ncpus;
+
+ /*
+ * If a platform does not specify a threshold or if the number of CPUs
+ * at boot time does not exceed the threshold, tick accounting remains
+ * single-threaded.
+ */
+ if (ncpus <= clock_tick_threshold) {
+ clock_tick_ncpus = max_ncpus;
+ clock_tick_proc_max = 1;
+ return;
+ }
+
+ /*
+ * OK. Multi-thread tick processing. If a platform has not specified
+ * the CPU set size for multi-threading, then use the default value.
+ * This value has been arrived at through measurements on large
+ * configuration systems.
+ */
+ clock_tick_single_threaded = 0;
+ if (clock_tick_proc_max == 0) {
+ clock_tick_proc_max = CLOCK_TICK_PROC_MAX;
+ if (hires_tick)
+ clock_tick_proc_max *= 10;
+ }
+}
+
+static void
+clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid)
+{
+ clock_tick_cpu_t *ctp;
+
+ ASSERT(&invoke_softint != NULL);
+ /*
+ * Schedule tick accounting for a set of CPUs.
+ */
+ ctp = clock_tick_cpu[cid];
+ mutex_enter(&ctp->ct_lock);
+ ctp->ct_lbolt = lbolt;
+ ctp->ct_pending += pending;
+ ctp->ct_start = csp->ct_start;
+ ctp->ct_end = csp->ct_end;
+ ctp->ct_scan = csp->ct_scan;
+ mutex_exit(&ctp->ct_lock);
+
+ invoke_softint(cid, ctp->ct_intr);
+ /*
+ * Return without waiting for the softint to finish.
+ */
+}
+
+static void
+clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
+{
+ kthread_t *t;
+ kmutex_t *plockp;
+ int notick, intr;
+ klwp_id_t lwp;
+
+ /*
+ * The locking here is rather tricky. thread_free_prevent()
+ * prevents the thread returned from being freed while we
+ * are looking at it. We can then check if the thread
+ * is exiting and get the appropriate p_lock if it
+ * is not. We have to be careful, though, because
+ * the _process_ can still be freed while we've
+ * prevented thread free. To avoid touching the
+ * proc structure we put a pointer to the p_lock in the
+ * thread structure. The p_lock is persistent so we
+ * can acquire it even if the process is gone. At that
+ * point we can check (again) if the thread is exiting
+ * and either drop the lock or do the tick processing.
+ */
+ t = cp->cpu_thread; /* Current running thread */
+ if (CPU == cp) {
+ /*
+ * 't' will be the tick processing thread on this
+ * CPU. Use the pinned thread (if any) on this CPU
+ * as the target of the clock tick.
+ */
+ if (t->t_intr != NULL)
+ t = t->t_intr;
+ }
+
+ /*
+ * We use thread_free_prevent to keep the currently running
+ * thread from being freed or recycled while we're
+ * looking at it.
+ */
+ thread_free_prevent(t);
+ /*
+ * We cannot hold the cpu_lock to prevent the
+ * cpu_active from changing in the clock interrupt.
+ * As long as we don't block (or don't get pre-empted)
+ * the cpu_list will not change (all threads are paused
+ * before list modification).
+ */
+ if (CLOCK_TICK_CPU_OFFLINE(cp)) {
+ thread_free_allow(t);
+ return;
+ }
+
+ /*
+ * Make sure the thread is still on the CPU.
+ */
+ if ((t != cp->cpu_thread) &&
+ ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
+ /*
+ * We could not locate the thread. Skip this CPU. Race
+ * conditions while performing these checks are benign.
+ * These checks are not perfect and they don't need
+ * to be.
+ */
+ thread_free_allow(t);
+ return;
+ }
+
+ intr = t->t_flag & T_INTR_THREAD;
+ lwp = ttolwp(t);
+ if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
+ /*
+ * Thread is exiting (or uninteresting) so don't
+ * do tick processing.
+ */
+ thread_free_allow(t);
+ return;
+ }
+
+ /*
+ * OK, try to grab the process lock. See
+ * comments above for why we're not using
+ * ttoproc(t)->p_lockp here.
+ */
+ plockp = t->t_plockp;
+ mutex_enter(plockp);
+ /* See above comment. */
+ if (CLOCK_TICK_CPU_OFFLINE(cp)) {
+ mutex_exit(plockp);
+ thread_free_allow(t);
+ return;
+ }
+
+ /*
+ * The thread may have exited between when we
+ * checked above, and when we got the p_lock.
+ */
+ if (t->t_proc_flag & TP_LWPEXIT) {
+ mutex_exit(plockp);
+ thread_free_allow(t);
+ return;
+ }
+
+ /*
+ * Either we have the p_lock for the thread's process,
+ * or we don't care about the thread structure any more.
+ * Either way we can allow thread free.
+ */
+ thread_free_allow(t);
+
+ /*
+ * If we haven't done tick processing for this
+ * lwp, then do it now. Since we don't hold the
+ * lwp down on a CPU it can migrate and show up
+ * more than once, hence the lbolt check. mylbolt
+ * is copied at the time of tick scheduling to prevent
+ * lbolt mismatches.
+ *
+ * Also, make sure that it's okay to perform the
+ * tick processing before calling clock_tick.
+ * Setting notick to a TRUE value (ie. not 0)
+ * results in tick processing not being performed for
+ * that thread.
+ */
+ notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
+ (cp->cpu_dispthread == cp->cpu_idle_thread));
+
+ if ((!notick) && (t->t_lbolt < mylbolt)) {
+ t->t_lbolt = mylbolt;
+ clock_tick(t, pending);
+ }
+
+ mutex_exit(plockp);
+}
+
+void
+clock_tick_schedule(int one_sec)
+{
+ ulong_t active;
+ int i, end;
+ clock_tick_set_t *csp;
+ cpu_t *cp;
+
+ if (clock_cpu_id != CPU->cpu_id)
+ clock_cpu_id = CPU->cpu_id;
+
+ if (clock_tick_single_threaded) {
+ /*
+ * Each tick cycle, start the scan from a different
+ * CPU for the sake of fairness.
+ */
+ end = clock_tick_total_cpus;
+ clock_tick_scan++;
+ if (clock_tick_scan >= end)
+ clock_tick_scan = 0;
+
+ clock_tick_execute_common(0, clock_tick_scan, end, lbolt, 1);
+
+ return;
+ }
+
+ /*
+ * If the previous invocation of handlers is not yet finished, then
+ * simply increment a pending count and return. Eventually when they
+ * finish, the pending count is passed down to the next set of
+ * handlers to process. This way, ticks that have already elapsed
+ * in the past are handled as quickly as possible to minimize the
+ * chances of threads getting away before their pending ticks are
+ * accounted. The other benefit is that if the pending count is
+ * more than one, it can be handled by a single invocation of
+ * clock_tick(). This is a good optimization for large configuration
+ * busy systems where tick accounting can get backed up for various
+ * reasons.
+ */
+ clock_tick_pending++;
+
+ active = clock_tick_active;
+ active = atomic_cas_ulong(&clock_tick_active, active, active);
+ if (active)
+ return;
+
+ /*
+ * We want to handle the clock CPU here. If we
+ * scheduled the accounting for the clock CPU to another
+ * processor, that processor will find only the clock() thread
+ * running and not account for any user thread below it. Also,
+ * we want to handle this before we block on anything and allow
+ * the pinned thread below the current thread to escape.
+ */
+ clock_tick_process(CPU, lbolt, clock_tick_pending);
+
+ mutex_enter(&clock_tick_lock);
+
+ /*
+ * Schedule each set on a separate processor.
+ */
+ cp = clock_cpu_list;
+ for (i = 0; i < clock_tick_nsets; i++) {
+ csp = &clock_tick_set[i];
+
+ /*
+ * Pick the next online CPU in list for scheduling tick
+ * accounting. The clock_tick_lock is held by the caller.
+ * So, CPU online/offline cannot muck with this while
+ * we are picking our CPU to X-call.
+ */
+ if (cp == CPU)
+ cp = cp->cpu_next_onln;
+
+ /*
+ * Each tick cycle, start the scan from a different
+ * CPU for the sake of fairness.
+ */
+ csp->ct_scan++;
+ if (csp->ct_scan >= csp->ct_end)
+ csp->ct_scan = csp->ct_start;
+
+ clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id);
+
+ cp = cp->cpu_next_onln;
+ }
+
+ if (one_sec) {
+ /*
+ * Move the CPU pointer around every second. This is so
+ * all the CPUs can be X-called in a round-robin fashion
+ * to evenly distribute the X-calls. We don't do this
+ * at a faster rate than this because we don't want
+ * to affect cache performance negatively.
+ */
+ clock_cpu_list = clock_cpu_list->cpu_next_onln;
+ }
+
+ mutex_exit(&clock_tick_lock);
+
+ clock_tick_pending = 0;
+}
+
+static void
+clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt,
+ int pending)
+{
+ cpu_t *cp;
+ int i;
+
+ ASSERT((start <= scan) && (scan <= end));
+
+ /*
+ * Handle the thread on current CPU first. This is to prevent a
+ * pinned thread from escaping if we ever block on something.
+ * Note that in the single-threaded mode, this handles the clock
+ * CPU.
+ */
+ clock_tick_process(CPU, mylbolt, pending);
+
+ /*
+ * Perform tick accounting for the threads running on
+ * the scheduled CPUs.
+ */
+ for (i = scan; i < end; i++) {
+ cp = clock_tick_cpus[i];
+ if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
+ continue;
+ clock_tick_process(cp, mylbolt, pending);
+ }
+
+ for (i = start; i < scan; i++) {
+ cp = clock_tick_cpus[i];
+ if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
+ continue;
+ clock_tick_process(cp, mylbolt, pending);
+ }
+}
+
+/*ARGSUSED*/
+static uint_t
+clock_tick_execute(caddr_t arg1, caddr_t arg2)
+{
+ clock_tick_cpu_t *ctp;
+ int start, scan, end, pending;
+ clock_t mylbolt;
+
+ /*
+ * We could have raced with cpu offline. We don't want to
+ * process anything on an offlined CPU. If we got blocked
+ * on anything, we may not get scheduled when we wakeup
+ * later on.
+ */
+ if (!CLOCK_TICK_XCALL_SAFE(CPU))
+ return (1);
+
+ atomic_inc_ulong(&clock_tick_active);
+
+ ctp = (clock_tick_cpu_t *)arg1;
+ mutex_enter(&ctp->ct_lock);
+ pending = ctp->ct_pending;
+ if (pending == 0) {
+ /*
+ * If a CPU is busy at LOCK_LEVEL, then an invocation
+ * of this softint may be queued for some time. In that case,
+ * clock_tick_active will not be incremented.
+ * clock_tick_schedule() will then assume that the previous
+ * invocation is done and post a new softint. The first one
+ * that gets in will reset the pending count so the
+ * second one is a noop.
+ */
+ mutex_exit(&ctp->ct_lock);
+ goto out;
+ }
+ ctp->ct_pending = 0;
+ start = ctp->ct_start;
+ end = ctp->ct_end;
+ scan = ctp->ct_scan;
+ mylbolt = ctp->ct_lbolt;
+ mutex_exit(&ctp->ct_lock);
+
+ clock_tick_execute_common(start, scan, end, mylbolt, pending);
+
+out:
+ /*
+ * Signal completion to the clock handler.
+ */
+ atomic_dec_ulong(&clock_tick_active);
+
+ return (1);
+}
+
+/*ARGSUSED*/
+static int
+clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg)
+{
+ cpu_t *cp, *ncp;
+ int i, set;
+ clock_tick_set_t *csp;
+
+ /*
+ * This function performs some computations at CPU offline/online
+ * time. The computed values are used during tick scheduling and
+ * execution phases. This avoids having to compute things on
+ * an every tick basis. The other benefit is that we perform the
+ * computations only for onlined CPUs (not offlined ones). As a
+ * result, no tick processing is attempted for offlined CPUs.
+ *
+ * Also, cpu_offline() calls this function before checking for
+ * active interrupt threads. This allows us to avoid posting
+ * cross calls to CPUs that are being offlined.
+ */
+
+ cp = cpu[cid];
+
+ mutex_enter(&clock_tick_lock);
+
+ switch (what) {
+ case CPU_ON:
+ clock_tick_cpus[clock_tick_total_cpus] = cp;
+ set = clock_tick_total_cpus / clock_tick_ncpus;
+ csp = &clock_tick_set[set];
+ csp->ct_end++;
+ clock_tick_total_cpus++;
+ clock_tick_nsets =
+ (clock_tick_total_cpus + clock_tick_ncpus - 1) /
+ clock_tick_ncpus;
+ CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id);
+ membar_sync();
+ break;
+
+ case CPU_OFF:
+ if (&sync_softint != NULL)
+ sync_softint(clock_tick_online_cpuset);
+ CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id);
+ clock_tick_total_cpus--;
+ clock_tick_cpus[clock_tick_total_cpus] = NULL;
+ clock_tick_nsets =
+ (clock_tick_total_cpus + clock_tick_ncpus - 1) /
+ clock_tick_ncpus;
+ set = clock_tick_total_cpus / clock_tick_ncpus;
+ csp = &clock_tick_set[set];
+ csp->ct_end--;
+
+ i = 0;
+ ncp = cpu_active;
+ do {
+ if (cp == ncp)
+ continue;
+ clock_tick_cpus[i] = ncp;
+ i++;
+ } while ((ncp = ncp->cpu_next_onln) != cpu_active);
+ ASSERT(i == clock_tick_total_cpus);
+ membar_sync();
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_exit(&clock_tick_lock);
+
+ return (0);
+}
+
+
+void
+clock_tick_mp_init(void)
+{
+ cpu_t *cp;
+
+ mutex_enter(&cpu_lock);
+
+ cp = cpu_active;
+ do {
+ (void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL);
+ } while ((cp = cp->cpu_next_onln) != cpu_active);
+
+ register_cpu_setup_func(clock_tick_cpu_setup, NULL);
+
+ mutex_exit(&cpu_lock);
+}
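
The new clock_tick.c splits the online CPUs into sets of clock_tick_ncpus, posts one softint per set, and rotates each set's scan start every tick so no CPU is always processed last. A user-space sketch of just that partitioning and rotation arithmetic (hypothetical names and set size; no cross calls, softints, or locking):

#include <stdio.h>

#define TICK_NCPUS      4               /* CPUs handled per softint (set size) */

typedef struct tick_set {
    int ts_start;                       /* first CPU index in the set */
    int ts_end;                         /* one past the last CPU index */
    int ts_scan;                        /* where this set starts scanning */
} tick_set_t;

int
main(void)
{
    int ncpus = 10;
    int nsets = (ncpus + TICK_NCPUS - 1) / TICK_NCPUS;
    tick_set_t sets[8];
    int tick, i, cpu;

    /* Carve the CPU range into contiguous sets. */
    for (i = 0; i < nsets; i++) {
        sets[i].ts_start = i * TICK_NCPUS;
        sets[i].ts_end = (i + 1) * TICK_NCPUS;
        if (sets[i].ts_end > ncpus)
            sets[i].ts_end = ncpus;
        sets[i].ts_scan = sets[i].ts_start;
    }

    /* Two simulated ticks; each set scans from ts_scan to its end, then wraps. */
    for (tick = 0; tick < 2; tick++) {
        for (i = 0; i < nsets; i++) {
            tick_set_t *csp = &sets[i];

            /* Rotate the scan start each tick for fairness. */
            if (++csp->ts_scan >= csp->ts_end)
                csp->ts_scan = csp->ts_start;

            printf("tick %d, set %d:", tick, i);
            for (cpu = csp->ts_scan; cpu < csp->ts_end; cpu++)
                printf(" %d", cpu);
            for (cpu = csp->ts_start; cpu < csp->ts_scan; cpu++)
                printf(" %d", cpu);
            printf("\n");
        }
    }
    return (0);
}
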
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 8988a7b647..13cf752b45 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1785,9 +1785,6 @@ cpu_del_unit(int cpuid)
cp->cpu_next->cpu_prev = cp->cpu_prev;
if (cp == cpu_list)
cpu_list = cpnext;
- if (cp == clock_cpu_list)
- clock_cpu_list = cpnext;
-
/*
* Signals that the cpu has been deleted (see above).
@@ -1882,6 +1879,9 @@ cpu_remove_active(cpu_t *cp)
lgrp_config(LGRP_CONFIG_CPU_OFFLINE, (uintptr_t)cp, 0);
+ if (cp == clock_cpu_list)
+ clock_cpu_list = cp->cpu_next_onln;
+
cp->cpu_prev_onln->cpu_next_onln = cp->cpu_next_onln;
cp->cpu_next_onln->cpu_prev_onln = cp->cpu_prev_onln;
if (cpu_active == cp) {
@@ -2797,7 +2797,6 @@ cpu_destroy_bound_threads(cpu_t *cp)
mutex_exit(&pidlock);
-
for (t = tlist; t != NULL; t = tnext) {
tnext = t->t_next;
thread_free(t);
diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c
index e39be3e9f2..71fc90a767 100644
--- a/usr/src/uts/common/os/exit.c
+++ b/usr/src/uts/common/os/exit.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -356,6 +356,17 @@ proc_exit(int why, int what)
if (exitlwps(0) != 0)
return (1);
+ mutex_enter(&p->p_lock);
+ if (p->p_ttime > 0) {
+ /*
+ * Account any remaining ticks charged to this process
+ * on its way out.
+ */
+ (void) task_cpu_time_incr(p->p_task, p->p_ttime);
+ p->p_ttime = 0;
+ }
+ mutex_exit(&p->p_lock);
+
DTRACE_PROC(lwp__exit);
DTRACE_PROC1(exit, int, why);
diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c
index d008231021..7109a49cda 100644
--- a/usr/src/uts/common/os/main.c
+++ b/usr/src/uts/common/os/main.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -360,6 +360,8 @@ main(void)
extern int netboot;
extern void vm_init(void);
extern void cbe_init(void);
+ extern void clock_tick_init_pre(void);
+ extern void clock_tick_init_post(void);
extern void clock_init(void);
extern void physio_bufs_init(void);
extern void pm_cfb_setup_intr(void);
@@ -399,6 +401,7 @@ main(void)
callout_init(); /* callout table MUST be init'd before clock starts */
timer_init(); /* timer must be initialized before cyclic starts */
cbe_init();
+ clock_tick_init_pre();
clock_init();
/*
@@ -544,6 +547,8 @@ main(void)
kmem_mp_init();
vmem_update(NULL);
+ clock_tick_init_post();
+
for (initptr = &mp_init_tbl[0]; *initptr; initptr++)
(**initptr)();
diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c
index b3967546f5..628fcde30b 100644
--- a/usr/src/uts/common/os/task.c
+++ b/usr/src/uts/common/os/task.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -202,7 +202,39 @@ task_cpu_time_usage(rctl_t *r, proc_t *p)
task_t *t = p->p_task;
ASSERT(MUTEX_HELD(&p->p_lock));
- return (t->tk_cpu_time / hz);
+ return (t->tk_cpu_time);
+}
+
+/*
+ * int task_cpu_time_incr(task_t *t, rctl_qty_t incr)
+ *
+ * Overview
+ * task_cpu_time_incr() increments the amount of CPU time used
+ * by this task.
+ *
+ * Return values
+ * 1 if a second or more time is accumulated
+ * 0 otherwise
+ *
+ * Caller's context
+ * This is called by the clock tick accounting function to charge
+ * CPU time to a task.
+ */
+rctl_qty_t
+task_cpu_time_incr(task_t *t, rctl_qty_t incr)
+{
+ rctl_qty_t ret = 0;
+
+ mutex_enter(&t->tk_cpu_time_lock);
+ t->tk_cpu_ticks += incr;
+ if (t->tk_cpu_ticks >= hz) {
+ t->tk_cpu_time += t->tk_cpu_ticks / hz;
+ t->tk_cpu_ticks = t->tk_cpu_ticks % hz;
+ ret = t->tk_cpu_time;
+ }
+ mutex_exit(&t->tk_cpu_time_lock);
+
+ return (ret);
}
/*
@@ -224,15 +256,12 @@ static int
task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags)
{
- task_t *t;
-
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(e->rcep_t == RCENTITY_TASK);
if (e->rcep_p.task == NULL)
return (0);
- t = e->rcep_p.task;
- if ((t->tk_cpu_time + incr) / hz >= rcntl->rcv_value)
+ if (incr >= rcntl->rcv_value)
return (1);
return (0);
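
task_cpu_time_incr() above batches raw ticks per task and converts them to whole seconds only once hz ticks have accumulated, so the task resource control is now expressed and tested in seconds, and tested far less often. A stand-alone sketch of the same accumulation with the per-task locking elided and hz assumed to be 100:

#include <stdio.h>

#define HZ      100

typedef struct task_time {
    unsigned long tt_ticks;             /* ticks not yet converted to seconds */
    unsigned long tt_seconds;           /* whole CPU seconds charged so far */
} task_time_t;

/*
 * Charge 'incr' ticks to the task. Returns the running total in seconds
 * when at least one new whole second has accumulated, 0 otherwise --
 * the caller only re-tests its limit on a non-zero return.
 */
static unsigned long
task_time_incr(task_time_t *tt, unsigned long incr)
{
    unsigned long ret = 0;

    tt->tt_ticks += incr;
    if (tt->tt_ticks >= HZ) {
        tt->tt_seconds += tt->tt_ticks / HZ;
        tt->tt_ticks %= HZ;
        ret = tt->tt_seconds;
    }
    return (ret);
}

int
main(void)
{
    task_time_t tt = { 0, 0 };
    unsigned long secs;

    (void) task_time_incr(&tt, 70);     /* under one second: returns 0 */
    secs = task_time_incr(&tt, 70);     /* 140 ticks -> 1 second + 40 ticks */
    printf("seconds charged: %lu (leftover %lu ticks)\n", secs, tt.tt_ticks);
    return (0);
}
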
diff --git a/usr/src/uts/common/sys/clock_tick.h b/usr/src/uts/common/sys/clock_tick.h
new file mode 100644
index 0000000000..b3dc2198ed
--- /dev/null
+++ b/usr/src/uts/common/sys/clock_tick.h
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CLOCK_TICK_H
+#define _SYS_CLOCK_TICK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/mutex.h>
+#include <sys/cpuvar.h>
+#include <sys/systm.h>
+#include <sys/cyclic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CLOCK_TICK_NCPUS 32
+
+/*
+ * Per-CPU structure to facilitate multi-threaded tick accounting.
+ *
+ * ct_lock
+ * Mutex for the structure. Used to lock the structure to pass
+ * arguments to the tick processing softint handler.
+ * ct_intr
+ * Tick processing softint handle. For parallelism, each CPU
+ * needs to have its own softint handle.
+ * ct_lbolt
+ * Copy of the lbolt at the time of tick scheduling.
+ * ct_pending
+ * Number of ticks to be processed by one invocation of the tick
+ * processing softint.
+ * ct_start
+ * First CPU to do tick processing for.
+ * ct_end
+ * Last CPU to do tick processing for.
+ * ct_scan
+ * CPU to start the tick processing from. Rotated every tick.
+ */
+typedef struct clock_tick_cpu {
+ kmutex_t ct_lock;
+ ulong_t ct_intr;
+ clock_t ct_lbolt;
+ int ct_pending;
+ int ct_start;
+ int ct_end;
+ int ct_scan;
+} clock_tick_cpu_t;
+
+/*
+ * Per-set structure to facilitate multi-threaded tick accounting.
+ * clock_tick_lock protects this.
+ *
+ * ct_start
+ * First CPU to do tick processing for.
+ * ct_end
+ * Last CPU to do tick processing for.
+ * ct_scan
+ * CPU to start the tick processing from. Rotated every tick.
+ */
+typedef struct clock_tick_set {
+ int ct_start;
+ int ct_end;
+ int ct_scan;
+} clock_tick_set_t;
+
+#define CLOCK_TICK_CPU_OFFLINE(cp) \
+ (((cp) != cpu_active) && ((cp)->cpu_next_onln == (cp)))
+
+#define CLOCK_TICK_XCALL_SAFE(cp) \
+ CPU_IN_SET(clock_tick_online_cpuset, cp->cpu_id)
+
+#define CLOCK_TICK_PROC_MAX 10
+
+#ifdef _KERNEL
+#pragma weak create_softint
+extern ulong_t create_softint(uint_t, uint_t (*)(caddr_t, caddr_t),
+ caddr_t);
+#pragma weak invoke_softint
+extern void invoke_softint(processorid_t, ulong_t);
+#pragma weak sync_softint
+extern void sync_softint(cpuset_t);
+extern void clock_tick(kthread_t *, int);
+extern void membar_sync(void);
+
+extern int hires_tick;
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CLOCK_TICK_H */
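
The CLOCK_TICK_CPU_OFFLINE() macro above relies on an offlined CPU being left pointing at itself in the circular online list, so membership can be tested without holding cpu_lock. A small sketch of that self-link test on a generic circular list (hypothetical node type):

#include <stdio.h>

typedef struct node {
    int         n_id;
    struct node *n_next;                /* circular "online" list linkage */
} node_t;

/* A removed node is left pointing at itself, so the test is purely local. */
static int
node_offline(const node_t *head, const node_t *n)
{
    return (n != head && n->n_next == n);
}

int
main(void)
{
    node_t a = { 0, NULL }, b = { 1, NULL };

    /* Two-node circular list: a -> b -> a. */
    a.n_next = &b;
    b.n_next = &a;
    printf("b offline? %d\n", node_offline(&a, &b));    /* 0 */

    /* Unlink b: the list collapses to a alone, b points at itself. */
    a.n_next = &a;
    b.n_next = &b;
    printf("b offline? %d\n", node_offline(&a, &b));    /* 1 */
    return (0);
}
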
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index 4785796781..c7b76b32ea 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -519,6 +519,7 @@ extern cpuset_t cpu_seqid_inuse;
extern struct cpu *cpu[]; /* indexed by CPU number */
extern cpu_t *cpu_list; /* list of CPUs */
+extern cpu_t *cpu_active; /* list of active CPUs */
extern int ncpus; /* number of CPUs present */
extern int ncpus_online; /* number of CPUs not quiesced */
extern int max_ncpus; /* max present before ncpus is known */
diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h
index 5a9b1caf50..46d205749a 100644
--- a/usr/src/uts/common/sys/proc.h
+++ b/usr/src/uts/common/sys/proc.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -348,6 +348,7 @@ typedef struct proc {
/* protected by p_lock */
rctl_qty_t p_crypto_mem; /* /dev/crypto memory charged to proc */
/* protected by p_lock */
+ clock_t p_ttime; /* buffered task time */
} proc_t;
#define PROC_T /* headers relying on proc_t are OK */
diff --git a/usr/src/uts/common/sys/task.h b/usr/src/uts/common/sys/task.h
index d1bcb04145..a22f64d970 100644
--- a/usr/src/uts/common/sys/task.h
+++ b/usr/src/uts/common/sys/task.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -73,6 +73,8 @@ typedef struct task {
task_usage_t *tk_inherited; /* task resource usage */
/* inherited with the first */
/* member process */
+ rctl_qty_t tk_cpu_ticks; /* accumulated CPU ticks */
+ kmutex_t tk_cpu_time_lock; /* accumulated CPU seconds lock */
} task_t;
extern task_t *task0p;
@@ -91,6 +93,7 @@ extern task_t *task_hold_by_id_zone(taskid_t, zoneid_t);
extern void task_rele(task_t *);
extern void task_hold(task_t *);
extern void task_end(task_t *);
+extern rctl_qty_t task_cpu_time_incr(task_t *, rctl_qty_t);
#else /* _KERNEL */
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index d545e093b3..7302289ea1 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -515,10 +515,22 @@ extern struct _kthread t0; /* the scheduler thread */
extern kmutex_t pidlock; /* global process lock */
/*
- * thread_free_lock is used by the clock thread to keep a thread
+ * thread_free_lock is used by the tick accounting thread to keep a thread
* from being freed while it is being examined.
*/
-extern kmutex_t thread_free_lock;
+#define THREAD_FREE_NUM 1024
+#define THREAD_FREE_MASK (THREAD_FREE_NUM - 1)
+#define THREAD_FREE_SHIFT_BITS 5
+#define THREAD_FREE_SHIFT(t) ((uintptr_t)t >> THREAD_FREE_SHIFT_BITS)
+#define THREAD_FREE_HASH(t) (THREAD_FREE_SHIFT(t) & THREAD_FREE_MASK)
+
+typedef struct thread_free_lock {
+ kmutex_t tf_lock;
+ uchar_t tf_pad[64 - sizeof (kmutex_t)];
+} thread_free_lock_t;
+
+extern void thread_free_prevent(kthread_t *);
+extern void thread_free_allow(kthread_t *);
/*
* Routines to change the priority and effective priority
diff --git a/usr/src/uts/intel/ia32/ml/lock_prim.s b/usr/src/uts/intel/ia32/ml/lock_prim.s
index 2d74137565..8dc51e3eeb 100644
--- a/usr/src/uts/intel/ia32/ml/lock_prim.s
+++ b/usr/src/uts/intel/ia32/ml/lock_prim.s
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1278,6 +1278,10 @@ lockstat_hot_patch(void)
/* XX64 membar_*() should be inlines */
void
+membar_sync(void)
+{}
+
+void
membar_enter(void)
{}
@@ -1299,8 +1303,10 @@ membar_consumer(void)
ENTRY(membar_enter)
ALTENTRY(membar_exit)
+ ALTENTRY(membar_sync)
mfence /* lighter weight than lock; xorq $0,(%rsp) */
ret
+ SET_SIZE(membar_sync)
SET_SIZE(membar_exit)
SET_SIZE(membar_enter)
@@ -1318,9 +1324,11 @@ membar_consumer(void)
ENTRY(membar_enter)
ALTENTRY(membar_exit)
+ ALTENTRY(membar_sync)
lock
xorl $0, (%esp)
ret
+ SET_SIZE(membar_sync)
SET_SIZE(membar_exit)
SET_SIZE(membar_enter)
diff --git a/usr/src/uts/sun4/os/intr.c b/usr/src/uts/sun4/os/intr.c
index 7f70424a45..d0830a261e 100644
--- a/usr/src/uts/sun4/os/intr.c
+++ b/usr/src/uts/sun4/os/intr.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -813,3 +813,37 @@ intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
cp->cpu_intr_weight = 0; /* sanity */
mutex_exit(&intr_dist_cpu_lock);
}
+
+ulong_t
+create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1)
+{
+ uint64_t inum;
+
+ inum = add_softintr(pil, func, arg1, SOFTINT_ST);
+ return ((ulong_t)inum);
+}
+
+void
+invoke_softint(processorid_t cpuid, ulong_t hdl)
+{
+ uint64_t inum = hdl;
+
+ if (cpuid == CPU->cpu_id)
+ setsoftint(inum);
+ else
+ xt_one(cpuid, setsoftint_tl1, inum, 0);
+}
+
+void
+remove_softint(ulong_t hdl)
+{
+ uint64_t inum = hdl;
+
+ (void) rem_softintr(inum);
+}
+
+void
+sync_softint(cpuset_t set)
+{
+ xt_sync(set);
+}
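
The create_softint()/invoke_softint()/sync_softint() hooks defined here for sun4 are declared with #pragma weak in clock_tick.h, and the common code checks the symbol's address before calling it, so platforms that supply no definitions simply fall back to single-threaded tick accounting. A minimal sketch of that weak-reference pattern using GCC's __attribute__((weak)) instead of the pragma (hypothetical function name):

#include <stdio.h>

/*
 * Weak declaration: if nothing in the final link defines platform_softint(),
 * its address compares equal to NULL rather than causing a link failure.
 */
extern void platform_softint(int cpu) __attribute__((weak));

static void
schedule_work(int cpu)
{
    if (&platform_softint != NULL)
        platform_softint(cpu);          /* platform provides the hook */
    else
        printf("no softint support; staying single-threaded\n");
}

int
main(void)
{
    schedule_work(0);
    return (0);
}
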
diff --git a/usr/src/uts/sun4/sys/ivintr.h b/usr/src/uts/sun4/sys/ivintr.h
index 7ea9ae1d85..eb0a8656e3 100644
--- a/usr/src/uts/sun4/sys/ivintr.h
+++ b/usr/src/uts/sun4/sys/ivintr.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,8 +42,9 @@ extern "C" {
* interrupts.
*
* NOTE: Need two single target software interrupts per cpu for cyclics.
+ * Need one single target software interrupt per cpu for tick accounting.
*/
-#define MAX_RSVD_IV ((NCPU * 2) + 256) /* HW and Single target SW intrs */
+#define MAX_RSVD_IV ((NCPU * 3) + 256) /* HW and Single target SW intrs */
#define MAX_RSVD_IVX 32 /* Multi target software intrs */
#ifndef _ASM
diff --git a/usr/src/uts/sun4u/opl/os/opl.c b/usr/src/uts/sun4u/opl/os/opl.c
index f2d3162a93..f33b231117 100644
--- a/usr/src/uts/sun4u/opl/os/opl.c
+++ b/usr/src/uts/sun4u/opl/os/opl.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -116,6 +116,12 @@ static void pass2xscf_thread();
#define OPL_BOFF_MAX (40 * OPL_BOFF_SLEEP)
#define OPL_BOFF_TM 1500
+#define OPL_CLOCK_TICK_THRESHOLD 128
+#define OPL_CLOCK_TICK_NCPUS 64
+
+extern int clock_tick_threshold;
+extern int clock_tick_ncpus;
+
int
set_platform_max_ncpus(void)
{
@@ -943,6 +949,10 @@ plat_startup_memlist(caddr_t alloc_base)
void
startup_platform(void)
{
+ if (clock_tick_threshold == 0)
+ clock_tick_threshold = OPL_CLOCK_TICK_THRESHOLD;
+ if (clock_tick_ncpus == 0)
+ clock_tick_ncpus = OPL_CLOCK_TICK_NCPUS;
}
void
diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c
index 78293c17af..333212b4f5 100644
--- a/usr/src/uts/sun4v/os/mach_startup.c
+++ b/usr/src/uts/sun4v/os/mach_startup.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,6 +57,12 @@ extern void sfmmu_set_tsbs(void);
*/
static int enable_halt_idle_cpus = 1;
+#define SUN4V_CLOCK_TICK_THRESHOLD 64
+#define SUN4V_CLOCK_TICK_NCPUS 64
+
+extern int clock_tick_threshold;
+extern int clock_tick_ncpus;
+
void
setup_trap_table(void)
{
@@ -296,6 +302,10 @@ void
startup_platform(void)
{
ip_squeue_soft_ring = B_TRUE;
+ if (clock_tick_threshold == 0)
+ clock_tick_threshold = SUN4V_CLOCK_TICK_THRESHOLD;
+ if (clock_tick_ncpus == 0)
+ clock_tick_ncpus = SUN4V_CLOCK_TICK_NCPUS;
}
/*