author	mv143129 <none@none>	2008-01-07 18:49:45 -0800
committer	mv143129 <none@none>	2008-01-07 18:49:45 -0800
commit	2850d85b7b93f31e578520dc3b3feb24db609c62 (patch)
tree	2a1994e9e8b17b58b251e608de8c1a449291ba8f
parent	ca29f3da510ba7f712a40977b04aeceda9b70b95 (diff)
download	illumos-gate-2850d85b7b93f31e578520dc3b3feb24db609c62.tar.gz
6619224 Tick accounting needs to be made scalable
-rw-r--r--	usr/src/uts/common/Makefile.files	3
-rw-r--r--	usr/src/uts/common/conf/param.c	4
-rw-r--r--	usr/src/uts/common/disp/thread.c	50
-rw-r--r--	usr/src/uts/common/os/clock.c	264
-rw-r--r--	usr/src/uts/common/os/clock_tick.c	699
-rw-r--r--	usr/src/uts/common/os/cpu.c	9
-rw-r--r--	usr/src/uts/common/os/exit.c	13
-rw-r--r--	usr/src/uts/common/os/main.c	7
-rw-r--r--	usr/src/uts/common/os/task.c	41
-rw-r--r--	usr/src/uts/common/sys/clock_tick.h	118
-rw-r--r--	usr/src/uts/common/sys/cpuvar.h	3
-rw-r--r--	usr/src/uts/common/sys/proc.h	3
-rw-r--r--	usr/src/uts/common/sys/task.h	5
-rw-r--r--	usr/src/uts/common/sys/thread.h	18
-rw-r--r--	usr/src/uts/intel/ia32/ml/lock_prim.s	10
-rw-r--r--	usr/src/uts/sun4/os/intr.c	36
-rw-r--r--	usr/src/uts/sun4/sys/ivintr.h	5
-rw-r--r--	usr/src/uts/sun4u/opl/os/opl.c	12
-rw-r--r--	usr/src/uts/sun4v/os/mach_startup.c	12
19 files changed, 1087 insertions, 225 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 8adce7c1b1..d284271f12 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -67,7 +67,8 @@ COMMON_CORE_OBJS += \
thread_intr.o \
vm_page.o \
vm_pagelist.o \
- zlib_obj.o
+ zlib_obj.o \
+ clock_tick.o
CORE_OBJS += $(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS)
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index c35d3e7012..53a4b91775 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -194,6 +194,7 @@ extern void clock_highres_init(void);
extern void pg_init(void);
extern void pg_cmt_class_init(void);
extern void pg_cpu0_init(void);
+extern void clock_tick_mp_init(void);
void (*init_tbl[])(void) = {
system_taskq_init,
@@ -238,6 +239,7 @@ void (*mp_init_tbl[])(void) = {
#if defined(__sparc)
siron_mp_init,
#endif
+ clock_tick_mp_init,
0
};
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index 95e21ea1bf..ee2d80834d 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -89,11 +89,13 @@ static kcondvar_t reaper_cv; /* synchronization var */
kthread_t *thread_deathrow; /* circular list of reapable threads */
kthread_t *lwp_deathrow; /* circular list of reapable threads */
kmutex_t reaplock; /* protects lwp and thread deathrows */
-kmutex_t thread_free_lock; /* protects clock from reaper */
int thread_reapcnt = 0; /* number of threads on deathrow */
int lwp_reapcnt = 0; /* number of lwps on deathrow */
int reaplimit = 16; /* delay reaping until reaplimit */
+thread_free_lock_t *thread_free_lock;
+ /* protects tick thread from reaper */
+
extern int nthread;
id_t syscid; /* system scheduling class ID */
@@ -152,8 +154,16 @@ thread_init(void)
extern char sys_name[];
extern void idle();
struct cpu *cpu = CPU;
+ int i;
+ kmutex_t *lp;
mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
+ thread_free_lock =
+ kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
+ for (i = 0; i < THREAD_FREE_NUM; i++) {
+ lp = &thread_free_lock[i].tf_lock;
+ mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
+ }
#if defined(__i386) || defined(__amd64)
thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
@@ -663,6 +673,34 @@ thread_join(kt_did_t tid)
}
void
+thread_free_prevent(kthread_t *t)
+{
+ kmutex_t *lp;
+
+ lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
+ mutex_enter(lp);
+}
+
+void
+thread_free_allow(kthread_t *t)
+{
+ kmutex_t *lp;
+
+ lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
+ mutex_exit(lp);
+}
+
+static void
+thread_free_barrier(kthread_t *t)
+{
+ kmutex_t *lp;
+
+ lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
+ mutex_enter(lp);
+ mutex_exit(lp);
+}
+
+void
thread_free(kthread_t *t)
{
ASSERT(t != &t0 && t->t_state == TS_FREE);
@@ -714,11 +752,11 @@ thread_free(kthread_t *t)
free_afd(&t->t_activefd);
/*
- * Barrier for clock thread. The clock holds this lock to
- * keep the thread from going away while it's looking at it.
+ * Barrier for the tick accounting code. The tick accounting code
+ * holds this lock to keep the thread from going away while it's
+ * looking at it.
*/
- mutex_enter(&thread_free_lock);
- mutex_exit(&thread_free_lock);
+ thread_free_barrier(t);
ASSERT(ttoproj(t) == proj0p);
project_rele(ttoproj(t));
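
The hunk above replaces the single global thread_free_lock with an array of padded locks hashed by thread pointer, so tick accounting on one CPU no longer serializes against the reaper for every thread in the system. A minimal user-space sketch of the same pattern (hypothetical names; pthread mutexes standing in for kernel mutexes; the 1024/shift-5 constants mirror the THREAD_FREE_NUM and THREAD_FREE_SHIFT_BITS definitions added to thread.h later in this diff, and the 64-byte pad is an assumption):

#include <pthread.h>
#include <stdint.h>

#define FREE_LOCK_NUM    1024                   /* power of two */
#define FREE_LOCK_MASK   (FREE_LOCK_NUM - 1)
#define FREE_LOCK_SHIFT  5                      /* skip low (aligned) pointer bits */
#define FREE_LOCK_HASH(p) ((((uintptr_t)(p)) >> FREE_LOCK_SHIFT) & FREE_LOCK_MASK)

/* Pad each lock so neighboring buckets do not share a cache line. */
typedef struct free_lock {
    pthread_mutex_t fl_lock;
    char            fl_pad[64];
} free_lock_t;

static free_lock_t free_locks[FREE_LOCK_NUM];

static void
free_locks_init(void)
{
    int i;

    for (i = 0; i < FREE_LOCK_NUM; i++)
        (void) pthread_mutex_init(&free_locks[i].fl_lock, NULL);
}

/* Examiner side: pin the object so it cannot be freed while we look at it. */
static void
free_prevent(void *obj)
{
    (void) pthread_mutex_lock(&free_locks[FREE_LOCK_HASH(obj)].fl_lock);
}

static void
free_allow(void *obj)
{
    (void) pthread_mutex_unlock(&free_locks[FREE_LOCK_HASH(obj)].fl_lock);
}

/* Freeing side: enter/exit the bucket so any in-flight examiner finishes first. */
static void
free_barrier(void *obj)
{
    pthread_mutex_t *lp = &free_locks[FREE_LOCK_HASH(obj)].fl_lock;

    (void) pthread_mutex_lock(lp);
    (void) pthread_mutex_unlock(lp);
}

int
main(void)
{
    int obj;

    free_locks_init();
    free_prevent(&obj);     /* examiner pins the object */
    free_allow(&obj);
    free_barrier(&obj);     /* freer waits out any examiner of this bucket */
    return (0);
}
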
diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c
index 28d5eece05..f73f758bbf 100644
--- a/usr/src/uts/common/os/clock.c
+++ b/usr/src/uts/common/os/clock.c
@@ -23,7 +23,7 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -253,6 +253,8 @@ cyclic_id_t clock_cyclic; /* clock()'s cyclic_id */
cyclic_id_t deadman_cyclic; /* deadman()'s cyclic_id */
cyclic_id_t ddi_timer_cyclic; /* cyclic_timer()'s cyclic_id */
+extern void clock_tick_schedule(int);
+
static int lgrp_ticks; /* counter to schedule lgrp load calcs */
/*
@@ -306,7 +308,6 @@ static int adj_hist_entry;
int64_t clock_adj_hist[CLOCK_ADJ_HIST_SIZE];
-static void clock_tick(kthread_t *);
static void calcloadavg(int, uint64_t *);
static int genloadavg(struct loadavg_s *);
static void loadavg_update();
@@ -314,17 +315,16 @@ static void loadavg_update();
void (*cmm_clock_callout)() = NULL;
void (*cpucaps_clock_callout)() = NULL;
+extern clock_t clock_tick_proc_max;
+
static void
clock(void)
{
kthread_t *t;
- kmutex_t *plockp; /* pointer to thread's process lock */
- int pinned_intr = 0;
- uint_t nrunnable, nrunning;
+ uint_t nrunnable;
uint_t w_io;
cpu_t *cp;
cpupart_t *cpupart;
- int exiting;
extern void set_anoninfo();
extern void set_freemem();
void (*funcp)();
@@ -379,22 +379,7 @@ clock(void)
* every timer interrupt.
*
* Continue with the interrupt processing as scheduled.
- *
- * Did we pin another interrupt thread? Need to check this before
- * grabbing any adaptive locks, since if we block on a lock the
- * pinned thread could escape. Note that this is just a heuristic;
- * if we take multiple laps though clock() without returning from
- * the interrupt because we have another clock tick pending, then
- * the pinned interrupt could be released by one of the previous
- * laps. The only consequence is that the CPU will be counted as
- * in idle (or wait) state once the pinned interrupt is released.
- * Since this accounting is inaccurate by nature, this isn't a big
- * deal --- but we should try to get it right in the common case
- * where we only call clock() once per interrupt.
*/
- if (curthread->t_intr != NULL)
- pinned_intr = (curthread->t_intr->t_flag & T_INTR_THREAD);
-
/*
* Count the number of runnable threads and the number waiting
* for some form of I/O to complete -- gets added to
@@ -448,6 +433,10 @@ clock(void)
if (one_sec) {
cpupart->cp_nrunnable += cpu_nrunnable;
/*
+ * Update user, system, and idle cpu times.
+ */
+ cpupart->cp_nrunning++;
+ /*
* w_io is used to update sysinfo.waiting during
* one_second processing below. Only gather w_io
* information when we walk the list of cpus if we're
@@ -547,150 +536,7 @@ clock(void)
}
} while ((cp = cp->cpu_next) != cpu_list);
- /*
- * Do tick processing for all the active threads running in
- * the system. We're trying to be more fair by walking the
- * list of CPUs starting from a different CPUs each time.
- */
- cp = clock_cpu_list;
- nrunning = 0;
- do {
- klwp_id_t lwp;
- int intr;
- int thread_away;
-
- /*
- * Don't do any tick processing on CPUs that
- * aren't even in the system or aren't up yet.
- */
- if ((cp->cpu_flags & CPU_EXISTS) == 0) {
- continue;
- }
-
- /*
- * The locking here is rather tricky. We use
- * thread_free_lock to keep the currently running
- * thread from being freed or recycled while we're
- * looking at it. We can then check if the thread
- * is exiting and get the appropriate p_lock if it
- * is not. We have to be careful, though, because
- * the _process_ can still be freed while we're
- * holding thread_free_lock. To avoid touching the
- * proc structure we put a pointer to the p_lock in the
- * thread structure. The p_lock is persistent so we
- * can acquire it even if the process is gone. At that
- * point we can check (again) if the thread is exiting
- * and either drop the lock or do the tick processing.
- */
- mutex_enter(&thread_free_lock);
- /*
- * We cannot hold the cpu_lock to prevent the
- * cpu_list from changing in the clock interrupt.
- * As long as we don't block (or don't get pre-empted)
- * the cpu_list will not change (all threads are paused
- * before list modification). If the list does change
- * any deleted cpu structures will remain with cpu_next
- * set to NULL, hence the following test.
- */
- if (cp->cpu_next == NULL) {
- mutex_exit(&thread_free_lock);
- break;
- }
- t = cp->cpu_thread; /* Current running thread */
- if (CPU == cp) {
- /*
- * 't' will be the clock interrupt thread on this
- * CPU. Use the pinned thread (if any) on this CPU
- * as the target of the clock tick. If we pinned
- * an interrupt, though, just keep using the clock
- * interrupt thread since the formerly pinned one
- * may have gone away. One interrupt thread is as
- * good as another, and this means we don't have
- * to continue to check pinned_intr in subsequent
- * code.
- */
- ASSERT(t == curthread);
- if (t->t_intr != NULL && !pinned_intr)
- t = t->t_intr;
- }
-
- intr = t->t_flag & T_INTR_THREAD;
- lwp = ttolwp(t);
- if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
- /*
- * Thread is exiting (or uninteresting) so don't
- * do tick processing or grab p_lock. Once we
- * drop thread_free_lock we can't look inside the
- * thread or lwp structure, since the thread may
- * have gone away.
- */
- exiting = 1;
- } else {
- /*
- * OK, try to grab the process lock. See
- * comments above for why we're not using
- * ttoproc(t)->p_lockp here.
- */
- plockp = t->t_plockp;
- mutex_enter(plockp);
- /* See above comment. */
- if (cp->cpu_next == NULL) {
- mutex_exit(plockp);
- mutex_exit(&thread_free_lock);
- break;
- }
- /*
- * The thread may have exited between when we
- * checked above, and when we got the p_lock.
- */
- if (t->t_proc_flag & TP_LWPEXIT) {
- mutex_exit(plockp);
- exiting = 1;
- } else {
- exiting = 0;
- }
- }
- /*
- * Either we have the p_lock for the thread's process,
- * or we don't care about the thread structure any more.
- * Either way we can drop thread_free_lock.
- */
- mutex_exit(&thread_free_lock);
-
- /*
- * Update user, system, and idle cpu times.
- */
- if (one_sec) {
- nrunning++;
- cp->cpu_part->cp_nrunning++;
- }
- /*
- * If we haven't done tick processing for this
- * lwp, then do it now. Since we don't hold the
- * lwp down on a CPU it can migrate and show up
- * more than once, hence the lbolt check.
- *
- * Also, make sure that it's okay to perform the
- * tick processing before calling clock_tick.
- * Setting thread_away to a TRUE value (ie. not 0)
- * results in tick processing not being performed for
- * that thread. Or, in other words, keeps the thread
- * away from clock_tick processing.
- */
- thread_away = ((cp->cpu_flags & CPU_QUIESCED) ||
- CPU_ON_INTR(cp) || intr ||
- (cp->cpu_dispthread == cp->cpu_idle_thread) || exiting);
-
- if ((!thread_away) && (lbolt - t->t_lbolt != 0)) {
- t->t_lbolt = lbolt;
- clock_tick(t);
- }
-
- if (!exiting)
- mutex_exit(plockp);
- } while ((cp = cp->cpu_next) != clock_cpu_list);
-
- clock_cpu_list = clock_cpu_list->cpu_next;
+ clock_tick_schedule(one_sec);
/*
* bump time in ticks
@@ -1522,16 +1368,19 @@ ddi_hardpps(struct timeval *tvp, int usec)
* Check for timer action, enforce CPU rlimit, do profiling etc.
*/
void
-clock_tick(kthread_t *t)
+clock_tick(kthread_t *t, int pending)
{
struct proc *pp;
klwp_id_t lwp;
struct as *as;
- clock_t utime;
- clock_t stime;
+ clock_t ticks;
int poke = 0; /* notify another CPU */
int user_mode;
size_t rss;
+ int i, total_usec, usec;
+ rctl_qty_t secs;
+
+ ASSERT(pending > 0);
/* Must be operating on a lwp/thread */
if ((lwp = ttolwp(t)) == NULL) {
@@ -1539,8 +1388,10 @@ clock_tick(kthread_t *t)
/*NOTREACHED*/
}
- CL_TICK(t); /* Class specific tick processing */
- DTRACE_SCHED1(tick, kthread_t *, t);
+ for (i = 0; i < pending; i++) {
+ CL_TICK(t); /* Class specific tick processing */
+ DTRACE_SCHED1(tick, kthread_t *, t);
+ }
pp = ttoproc(t);
@@ -1549,17 +1400,18 @@ clock_tick(kthread_t *t)
user_mode = (lwp->lwp_state == LWP_USER);
+ ticks = (pp->p_utime + pp->p_stime) % hz;
/*
* Update process times. Should use high res clock and state
* changes instead of statistical sampling method. XXX
*/
if (user_mode) {
- pp->p_utime++;
- pp->p_task->tk_cpu_time++;
+ pp->p_utime += pending;
} else {
- pp->p_stime++;
- pp->p_task->tk_cpu_time++;
+ pp->p_stime += pending;
}
+
+ pp->p_ttime += pending;
as = pp->p_as;
/*
@@ -1567,45 +1419,73 @@ clock_tick(kthread_t *t)
* lwp when the AST happens.
*/
if (pp->p_prof.pr_scale) {
- atomic_add_32(&lwp->lwp_oweupc, 1);
+ atomic_add_32(&lwp->lwp_oweupc, (int32_t)pending);
if (user_mode) {
poke = 1;
aston(t);
}
}
- utime = pp->p_utime;
- stime = pp->p_stime;
-
/*
* If CPU was in user state, process lwp-virtual time
- * interval timer.
+ * interval timer. The value passed to itimerdecr() has to be
+ * in microseconds and has to be less than one second. Hence
+ * this loop.
*/
- if (user_mode &&
- timerisset(&lwp->lwp_timer[ITIMER_VIRTUAL].it_value) &&
- itimerdecr(&lwp->lwp_timer[ITIMER_VIRTUAL], usec_per_tick) == 0) {
- poke = 1;
- sigtoproc(pp, t, SIGVTALRM);
+ total_usec = usec_per_tick * pending;
+ while (total_usec > 0) {
+ usec = MIN(total_usec, (MICROSEC - 1));
+ if (user_mode &&
+ timerisset(&lwp->lwp_timer[ITIMER_VIRTUAL].it_value) &&
+ itimerdecr(&lwp->lwp_timer[ITIMER_VIRTUAL], usec) == 0) {
+ poke = 1;
+ sigtoproc(pp, t, SIGVTALRM);
+ }
+ total_usec -= usec;
}
- if (timerisset(&lwp->lwp_timer[ITIMER_PROF].it_value) &&
- itimerdecr(&lwp->lwp_timer[ITIMER_PROF], usec_per_tick) == 0) {
- poke = 1;
- sigtoproc(pp, t, SIGPROF);
+ /*
+ * If CPU was in user state, process lwp-profile
+ * interval timer.
+ */
+ total_usec = usec_per_tick * pending;
+ while (total_usec > 0) {
+ usec = MIN(total_usec, (MICROSEC - 1));
+ if (timerisset(&lwp->lwp_timer[ITIMER_PROF].it_value) &&
+ itimerdecr(&lwp->lwp_timer[ITIMER_PROF], usec) == 0) {
+ poke = 1;
+ sigtoproc(pp, t, SIGPROF);
+ }
+ total_usec -= usec;
}
/*
* Enforce CPU resource controls:
* (a) process.max-cpu-time resource control
+ *
+ * Perform the check only if we have accumulated more than a second.
*/
- (void) rctl_test(rctlproc_legacy[RLIMIT_CPU], pp->p_rctls, pp,
- (utime + stime)/hz, RCA_UNSAFE_SIGINFO);
+ if ((ticks + pending) >= hz) {
+ (void) rctl_test(rctlproc_legacy[RLIMIT_CPU], pp->p_rctls, pp,
+ (pp->p_utime + pp->p_stime)/hz, RCA_UNSAFE_SIGINFO);
+ }
/*
* (b) task.max-cpu-time resource control
+ *
+ * If we have accumulated enough ticks, increment the task CPU
+ * time usage and test for the resource limit. This minimizes the
+ * number of calls to rctl_test(). The task CPU time mutex
+ * is highly contentious as many processes can be sharing a task.
*/
- (void) rctl_test(rc_task_cpu_time, pp->p_task->tk_rctls, pp, 1,
- RCA_UNSAFE_SIGINFO);
+ if (pp->p_ttime >= clock_tick_proc_max) {
+ secs = task_cpu_time_incr(pp->p_task, pp->p_ttime);
+ pp->p_ttime = 0;
+ if (secs) {
+ (void) rctl_test(rc_task_cpu_time, pp->p_task->tk_rctls,
+ pp, secs, RCA_UNSAFE_SIGINFO);
+ }
+ }
/*
* Update memory usage for the currently running process.
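
Because the rewritten clock_tick() can be handed several pending ticks at once, it feeds the per-lwp interval timers in chunks of less than one second: per the comment in the hunk above, the decrement routine only accepts sub-second microsecond values. A self-contained sketch of that chunking loop, using a hypothetical decrement_timer() in place of itimerdecr() and an assumed tick rate of 100:

#include <stdio.h>

#define MICROSEC        1000000
#define HZ              100                     /* assumed tick rate */
#define USEC_PER_TICK   (MICROSEC / HZ)

/* Hypothetical stand-in for itimerdecr(): accepts strictly less than 1 second. */
static int
decrement_timer(long *remaining_usec, int usec)
{
    if (*remaining_usec > usec) {
        *remaining_usec -= usec;
        return (1);                     /* timer still running */
    }
    *remaining_usec = 0;
    return (0);                         /* timer fired */
}

int
main(void)
{
    long timer = 3 * MICROSEC;          /* a 3-second virtual interval timer */
    int pending = 250;                  /* 250 accumulated ticks = 2.5 seconds */
    long total_usec = (long)USEC_PER_TICK * pending;

    /* Feed the timer in chunks of less than one second each. */
    while (total_usec > 0) {
        int usec = (total_usec < MICROSEC - 1) ? (int)total_usec : MICROSEC - 1;

        if (decrement_timer(&timer, usec) == 0)
            printf("timer expired\n");
        total_usec -= usec;
    }
    printf("remaining: %ld usec\n", timer);     /* 500000 for this input */
    return (0);
}
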
diff --git a/usr/src/uts/common/os/clock_tick.c b/usr/src/uts/common/os/clock_tick.c
new file mode 100644
index 0000000000..816f4978b1
--- /dev/null
+++ b/usr/src/uts/common/os/clock_tick.c
@@ -0,0 +1,699 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/task.h>
+#include <sys/cmn_err.h>
+#include <sys/class.h>
+#include <sys/sdt.h>
+#include <sys/atomic.h>
+#include <sys/cpu.h>
+#include <sys/clock_tick.h>
+#include <sys/sysmacros.h>
+#include <vm/rm.h>
+
+/*
+ * This file contains the implementation of clock tick accounting for threads.
+ * Every tick, user threads running on various CPUs are located and charged
+ * with a tick to account for their use of CPU time.
+ *
+ * Every tick, the clock() handler calls clock_tick_schedule() to perform tick
+ * accounting for all the threads in the system. Tick accounting is done in
+ * two phases:
+ *
+ * Tick scheduling Done in clock_tick_schedule(). In this phase, cross
+ * calls are scheduled to multiple CPUs to perform
+ * multi-threaded tick accounting. The CPUs are chosen
+ * on a rotational basis so as to distribute the tick
+ * accounting load evenly across all CPUs.
+ *
+ * Tick execution Done in clock_tick_execute(). In this phase, tick
+ * accounting is actually performed by softint handlers
+ * on multiple CPUs.
+ *
+ * This implementation gives us a multi-threaded tick processing facility that
+ * is suitable for configurations with a large number of CPUs. On smaller
+ * configurations it may be desirable to let the processing be single-threaded
+ * and just allow clock() to do it as it has been done traditionally. To
+ * facilitate this, a variable, clock_tick_threshold, is defined. Platforms
+ * that desire multi-threading should set this variable to something
+ * appropriate. A recommended value may be found in clock_tick.h. At boot time,
+ * if the number of CPUs is greater than clock_tick_threshold, multi-threading
+ * kicks in. Note that this is a decision made at boot time. If more CPUs
+ * are dynamically added later on to exceed the threshold, no attempt is made
+ * to switch to multi-threaded. Similarly, if CPUs are removed dynamically
+ * no attempt is made to switch to single-threaded. This is to keep the
+ * implementation simple. Also note that the threshold can be changed for a
+ * specific customer configuration via /etc/system.
+ *
+ * The boot time decision is reflected in clock_tick_single_threaded.
+ */
+
+/*
+ * clock_tick_threshold
+ * If the number of CPUs at boot time exceeds this threshold,
+ * multi-threaded tick accounting kicks in.
+ *
+ * clock_tick_ncpus
+ * The number of CPUs in a set. Each set is scheduled for tick execution
+ * on a separate processor.
+ *
+ * clock_tick_single_threaded
+ * Indicates whether or not tick accounting is single threaded.
+ *
+ * clock_tick_total_cpus
+ * Total number of online CPUs.
+ *
+ * clock_tick_cpus
+ * Array of online CPU pointers.
+ *
+ * clock_tick_cpu
+ * Per-CPU, cache-aligned data structures to facilitate multi-threading.
+ *
+ * clock_tick_active
+ * Counter that indicates the number of active tick processing softints
+ * in the system.
+ *
+ * clock_tick_pending
+ * Number of pending ticks that need to be accounted by the softint
+ * handlers.
+ *
+ * clock_tick_lock
+ * Mutex to synchronize between clock_tick_schedule() and
+ * CPU online/offline.
+ *
+ * clock_cpu_id
+ * CPU id of the clock() CPU. Used to detect when the clock CPU
+ * is offlined.
+ *
+ * clock_tick_online_cpuset
+ * CPU set of all online processors that can be X-called.
+ *
+ * clock_tick_proc_max
+ * Each process is allowed to accumulate a few ticks before checking
+ * for the task CPU time resource limit. We lower the number of calls
+ * to rctl_test() to make tick accounting more scalable. The tradeoff
+ * is that the limit may not get enforced in a timely manner. This is
+ * typically not a problem.
+ *
+ * clock_tick_set
+ * Per-set structures. Each structure contains the range of CPUs
+ * to be processed for the set.
+ *
+ * clock_tick_nsets;
+ * Number of sets.
+ *
+ * clock_tick_scan
+ * Where to begin the scan for single-threaded mode. In multi-threaded,
+ * the clock_tick_set itself contains a field for this.
+ */
+int clock_tick_threshold;
+int clock_tick_ncpus;
+int clock_tick_single_threaded;
+int clock_tick_total_cpus;
+cpu_t *clock_tick_cpus[NCPU];
+clock_tick_cpu_t *clock_tick_cpu[NCPU];
+ulong_t clock_tick_active;
+int clock_tick_pending;
+kmutex_t clock_tick_lock;
+processorid_t clock_cpu_id;
+cpuset_t clock_tick_online_cpuset;
+clock_t clock_tick_proc_max;
+clock_tick_set_t *clock_tick_set;
+int clock_tick_nsets;
+int clock_tick_scan;
+
+static uint_t clock_tick_execute(caddr_t, caddr_t);
+static void clock_tick_execute_common(int, int, int, clock_t, int);
+
+#define CLOCK_TICK_ALIGN 64 /* cache alignment */
+
+/*
+ * Clock tick initialization is done in two phases:
+ *
+ * 1. Before clock_init() is called, clock_tick_init_pre() is called to set
+ * up single-threading so the clock() can begin to do its job.
+ *
+ * 2. After the slave CPUs are initialized at boot time, we know the number
+ * of CPUs. clock_tick_init_post() is called to set up multi-threading if
+ * required.
+ */
+void
+clock_tick_init_pre(void)
+{
+ clock_tick_cpu_t *ctp;
+ int i, n;
+ clock_tick_set_t *csp;
+ uintptr_t buf;
+ size_t size;
+
+ clock_tick_single_threaded = 1;
+
+ size = P2ROUNDUP(sizeof (clock_tick_cpu_t), CLOCK_TICK_ALIGN);
+ buf = (uintptr_t)kmem_zalloc(size * NCPU + CLOCK_TICK_ALIGN, KM_SLEEP);
+ buf = P2ROUNDUP(buf, CLOCK_TICK_ALIGN);
+
+ /*
+ * Perform initialization in case multi-threading is chosen later.
+ */
+ for (i = 0; i < NCPU; i++, buf += size) {
+ ctp = (clock_tick_cpu_t *)buf;
+ clock_tick_cpu[i] = ctp;
+ mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL);
+ if (&create_softint != NULL) {
+ ctp->ct_intr = create_softint(LOCK_LEVEL,
+ clock_tick_execute, (caddr_t)ctp);
+ }
+ ctp->ct_pending = 0;
+ }
+
+ mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Compute clock_tick_ncpus here. We need it to compute the
+ * maximum number of tick sets we need to support.
+ */
+ ASSERT(clock_tick_ncpus >= 0);
+ if (clock_tick_ncpus == 0)
+ clock_tick_ncpus = CLOCK_TICK_NCPUS;
+ if (clock_tick_ncpus > max_ncpus)
+ clock_tick_ncpus = max_ncpus;
+
+ /*
+ * Allocate and initialize the tick sets.
+ */
+ n = (max_ncpus + clock_tick_ncpus - 1)/clock_tick_ncpus;
+ clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP);
+ for (i = 0; i < n; i++) {
+ csp = &clock_tick_set[i];
+ csp->ct_start = i * clock_tick_ncpus;
+ csp->ct_scan = csp->ct_start;
+ csp->ct_end = csp->ct_start;
+ }
+}
+
+void
+clock_tick_init_post(void)
+{
+ /*
+ * If a platform does not provide create_softint() and invoke_softint(),
+ * then we assume single threaded.
+ */
+ if (&invoke_softint == NULL)
+ clock_tick_threshold = 0;
+
+ ASSERT(clock_tick_threshold >= 0);
+
+ if (clock_tick_threshold == 0)
+ clock_tick_threshold = max_ncpus;
+
+ /*
+ * If a platform does not specify a threshold or if the number of CPUs
+ * at boot time does not exceed the threshold, tick accounting remains
+ * single-threaded.
+ */
+ if (ncpus <= clock_tick_threshold) {
+ clock_tick_ncpus = max_ncpus;
+ clock_tick_proc_max = 1;
+ return;
+ }
+
+ /*
+ * OK. Multi-thread tick processing. If a platform has not specified
+ * the CPU set size for multi-threading, then use the default value.
+ * This value has been arrived at through measurements on large
+ * configuration systems.
+ */
+ clock_tick_single_threaded = 0;
+ if (clock_tick_proc_max == 0) {
+ clock_tick_proc_max = CLOCK_TICK_PROC_MAX;
+ if (hires_tick)
+ clock_tick_proc_max *= 10;
+ }
+}
+
+static void
+clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid)
+{
+ clock_tick_cpu_t *ctp;
+
+ ASSERT(&invoke_softint != NULL);
+ /*
+ * Schedule tick accounting for a set of CPUs.
+ */
+ ctp = clock_tick_cpu[cid];
+ mutex_enter(&ctp->ct_lock);
+ ctp->ct_lbolt = lbolt;
+ ctp->ct_pending += pending;
+ ctp->ct_start = csp->ct_start;
+ ctp->ct_end = csp->ct_end;
+ ctp->ct_scan = csp->ct_scan;
+ mutex_exit(&ctp->ct_lock);
+
+ invoke_softint(cid, ctp->ct_intr);
+ /*
+ * Return without waiting for the softint to finish.
+ */
+}
+
+static void
+clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
+{
+ kthread_t *t;
+ kmutex_t *plockp;
+ int notick, intr;
+ klwp_id_t lwp;
+
+ /*
+ * The locking here is rather tricky. thread_free_prevent()
+ * prevents the thread returned from being freed while we
+ * are looking at it. We can then check if the thread
+ * is exiting and get the appropriate p_lock if it
+ * is not. We have to be careful, though, because
+ * the _process_ can still be freed while we've
+ * prevented thread free. To avoid touching the
+ * proc structure we put a pointer to the p_lock in the
+ * thread structure. The p_lock is persistent so we
+ * can acquire it even if the process is gone. At that
+ * point we can check (again) if the thread is exiting
+ * and either drop the lock or do the tick processing.
+ */
+ t = cp->cpu_thread; /* Current running thread */
+ if (CPU == cp) {
+ /*
+ * 't' will be the tick processing thread on this
+ * CPU. Use the pinned thread (if any) on this CPU
+ * as the target of the clock tick.
+ */
+ if (t->t_intr != NULL)
+ t = t->t_intr;
+ }
+
+ /*
+ * We use thread_free_prevent to keep the currently running
+ * thread from being freed or recycled while we're
+ * looking at it.
+ */
+ thread_free_prevent(t);
+ /*
+ * We cannot hold the cpu_lock to prevent the
+ * cpu_active from changing in the clock interrupt.
+ * As long as we don't block (or don't get pre-empted)
+ * the cpu_list will not change (all threads are paused
+ * before list modification).
+ */
+ if (CLOCK_TICK_CPU_OFFLINE(cp)) {
+ thread_free_allow(t);
+ return;
+ }
+
+ /*
+ * Make sure the thread is still on the CPU.
+ */
+ if ((t != cp->cpu_thread) &&
+ ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
+ /*
+ * We could not locate the thread. Skip this CPU. Race
+ * conditions while performing these checks are benign.
+ * These checks are not perfect and they don't need
+ * to be.
+ */
+ thread_free_allow(t);
+ return;
+ }
+
+ intr = t->t_flag & T_INTR_THREAD;
+ lwp = ttolwp(t);
+ if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
+ /*
+ * Thread is exiting (or uninteresting) so don't
+ * do tick processing.
+ */
+ thread_free_allow(t);
+ return;
+ }
+
+ /*
+ * OK, try to grab the process lock. See
+ * comments above for why we're not using
+ * ttoproc(t)->p_lockp here.
+ */
+ plockp = t->t_plockp;
+ mutex_enter(plockp);
+ /* See above comment. */
+ if (CLOCK_TICK_CPU_OFFLINE(cp)) {
+ mutex_exit(plockp);
+ thread_free_allow(t);
+ return;
+ }
+
+ /*
+ * The thread may have exited between when we
+ * checked above, and when we got the p_lock.
+ */
+ if (t->t_proc_flag & TP_LWPEXIT) {
+ mutex_exit(plockp);
+ thread_free_allow(t);
+ return;
+ }
+
+ /*
+ * Either we have the p_lock for the thread's process,
+ * or we don't care about the thread structure any more.
+ * Either way we can allow thread free.
+ */
+ thread_free_allow(t);
+
+ /*
+ * If we haven't done tick processing for this
+ * lwp, then do it now. Since we don't hold the
+ * lwp down on a CPU it can migrate and show up
+ * more than once, hence the lbolt check. mylbolt
+ * is copied at the time of tick scheduling to prevent
+ * lbolt mismatches.
+ *
+ * Also, make sure that it's okay to perform the
+ * tick processing before calling clock_tick.
+ * Setting notick to a TRUE value (ie. not 0)
+ * results in tick processing not being performed for
+ * that thread.
+ */
+ notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
+ (cp->cpu_dispthread == cp->cpu_idle_thread));
+
+ if ((!notick) && (t->t_lbolt < mylbolt)) {
+ t->t_lbolt = mylbolt;
+ clock_tick(t, pending);
+ }
+
+ mutex_exit(plockp);
+}
+
+void
+clock_tick_schedule(int one_sec)
+{
+ ulong_t active;
+ int i, end;
+ clock_tick_set_t *csp;
+ cpu_t *cp;
+
+ if (clock_cpu_id != CPU->cpu_id)
+ clock_cpu_id = CPU->cpu_id;
+
+ if (clock_tick_single_threaded) {
+ /*
+ * Each tick cycle, start the scan from a different
+ * CPU for the sake of fairness.
+ */
+ end = clock_tick_total_cpus;
+ clock_tick_scan++;
+ if (clock_tick_scan >= end)
+ clock_tick_scan = 0;
+
+ clock_tick_execute_common(0, clock_tick_scan, end, lbolt, 1);
+
+ return;
+ }
+
+ /*
+ * If the previous invocation of handlers is not yet finished, then
+ * simply increment a pending count and return. Eventually when they
+ * finish, the pending count is passed down to the next set of
+ * handlers to process. This way, ticks that have already elapsed
+ * in the past are handled as quickly as possible to minimize the
+ * chances of threads getting away before their pending ticks are
+ * accounted. The other benefit is that if the pending count is
+ * more than one, it can be handled by a single invocation of
+ * clock_tick(). This is a good optimization for large configuration
+ * busy systems where tick accounting can get backed up for various
+ * reasons.
+ */
+ clock_tick_pending++;
+
+ active = clock_tick_active;
+ active = atomic_cas_ulong(&clock_tick_active, active, active);
+ if (active)
+ return;
+
+ /*
+ * We want to handle the clock CPU here. If we
+ * scheduled the accounting for the clock CPU to another
+ * processor, that processor will find only the clock() thread
+ * running and not account for any user thread below it. Also,
+ * we want to handle this before we block on anything and allow
+ * the pinned thread below the current thread to escape.
+ */
+ clock_tick_process(CPU, lbolt, clock_tick_pending);
+
+ mutex_enter(&clock_tick_lock);
+
+ /*
+ * Schedule each set on a separate processor.
+ */
+ cp = clock_cpu_list;
+ for (i = 0; i < clock_tick_nsets; i++) {
+ csp = &clock_tick_set[i];
+
+ /*
+ * Pick the next online CPU in list for scheduling tick
+ * accounting. The clock_tick_lock is held by the caller.
+ * So, CPU online/offline cannot muck with this while
+ * we are picking our CPU to X-call.
+ */
+ if (cp == CPU)
+ cp = cp->cpu_next_onln;
+
+ /*
+ * Each tick cycle, start the scan from a different
+ * CPU for the sake of fairness.
+ */
+ csp->ct_scan++;
+ if (csp->ct_scan >= csp->ct_end)
+ csp->ct_scan = csp->ct_start;
+
+ clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id);
+
+ cp = cp->cpu_next_onln;
+ }
+
+ if (one_sec) {
+ /*
+ * Move the CPU pointer around every second. This is so
+ * all the CPUs can be X-called in a round-robin fashion
+ * to evenly distribute the X-calls. We don't do this
+ * at a faster rate than this because we don't want
+ * to affect cache performance negatively.
+ */
+ clock_cpu_list = clock_cpu_list->cpu_next_onln;
+ }
+
+ mutex_exit(&clock_tick_lock);
+
+ clock_tick_pending = 0;
+}
+
+static void
+clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt,
+ int pending)
+{
+ cpu_t *cp;
+ int i;
+
+ ASSERT((start <= scan) && (scan <= end));
+
+ /*
+ * Handle the thread on current CPU first. This is to prevent a
+ * pinned thread from escaping if we ever block on something.
+ * Note that in the single-threaded mode, this handles the clock
+ * CPU.
+ */
+ clock_tick_process(CPU, mylbolt, pending);
+
+ /*
+ * Perform tick accounting for the threads running on
+ * the scheduled CPUs.
+ */
+ for (i = scan; i < end; i++) {
+ cp = clock_tick_cpus[i];
+ if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
+ continue;
+ clock_tick_process(cp, mylbolt, pending);
+ }
+
+ for (i = start; i < scan; i++) {
+ cp = clock_tick_cpus[i];
+ if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
+ continue;
+ clock_tick_process(cp, mylbolt, pending);
+ }
+}
+
+/*ARGSUSED*/
+static uint_t
+clock_tick_execute(caddr_t arg1, caddr_t arg2)
+{
+ clock_tick_cpu_t *ctp;
+ int start, scan, end, pending;
+ clock_t mylbolt;
+
+ /*
+ * We could have raced with cpu offline. We don't want to
+ * process anything on an offlined CPU. If we got blocked
+ * on anything, we may not get scheduled when we wakeup
+ * later on.
+ */
+ if (!CLOCK_TICK_XCALL_SAFE(CPU))
+ return (1);
+
+ atomic_inc_ulong(&clock_tick_active);
+
+ ctp = (clock_tick_cpu_t *)arg1;
+ mutex_enter(&ctp->ct_lock);
+ pending = ctp->ct_pending;
+ if (pending == 0) {
+ /*
+ * If a CPU is busy at LOCK_LEVEL, then an invocation
+ * of this softint may be queued for some time. In that case,
+ * clock_tick_active will not be incremented.
+ * clock_tick_schedule() will then assume that the previous
+ * invocation is done and post a new softint. The first one
+ * that gets in will reset the pending count so the
+ * second one is a noop.
+ */
+ mutex_exit(&ctp->ct_lock);
+ goto out;
+ }
+ ctp->ct_pending = 0;
+ start = ctp->ct_start;
+ end = ctp->ct_end;
+ scan = ctp->ct_scan;
+ mylbolt = ctp->ct_lbolt;
+ mutex_exit(&ctp->ct_lock);
+
+ clock_tick_execute_common(start, scan, end, mylbolt, pending);
+
+out:
+ /*
+ * Signal completion to the clock handler.
+ */
+ atomic_dec_ulong(&clock_tick_active);
+
+ return (1);
+}
+
+/*ARGSUSED*/
+static int
+clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg)
+{
+ cpu_t *cp, *ncp;
+ int i, set;
+ clock_tick_set_t *csp;
+
+ /*
+ * This function performs some computations at CPU offline/online
+ * time. The computed values are used during tick scheduling and
+ * execution phases. This avoids having to compute things on
+ * an every tick basis. The other benefit is that we perform the
+ * computations only for onlined CPUs (not offlined ones). As a
+ * result, no tick processing is attempted for offlined CPUs.
+ *
+ * Also, cpu_offline() calls this function before checking for
+ * active interrupt threads. This allows us to avoid posting
+ * cross calls to CPUs that are being offlined.
+ */
+
+ cp = cpu[cid];
+
+ mutex_enter(&clock_tick_lock);
+
+ switch (what) {
+ case CPU_ON:
+ clock_tick_cpus[clock_tick_total_cpus] = cp;
+ set = clock_tick_total_cpus / clock_tick_ncpus;
+ csp = &clock_tick_set[set];
+ csp->ct_end++;
+ clock_tick_total_cpus++;
+ clock_tick_nsets =
+ (clock_tick_total_cpus + clock_tick_ncpus - 1) /
+ clock_tick_ncpus;
+ CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id);
+ membar_sync();
+ break;
+
+ case CPU_OFF:
+ if (&sync_softint != NULL)
+ sync_softint(clock_tick_online_cpuset);
+ CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id);
+ clock_tick_total_cpus--;
+ clock_tick_cpus[clock_tick_total_cpus] = NULL;
+ clock_tick_nsets =
+ (clock_tick_total_cpus + clock_tick_ncpus - 1) /
+ clock_tick_ncpus;
+ set = clock_tick_total_cpus / clock_tick_ncpus;
+ csp = &clock_tick_set[set];
+ csp->ct_end--;
+
+ i = 0;
+ ncp = cpu_active;
+ do {
+ if (cp == ncp)
+ continue;
+ clock_tick_cpus[i] = ncp;
+ i++;
+ } while ((ncp = ncp->cpu_next_onln) != cpu_active);
+ ASSERT(i == clock_tick_total_cpus);
+ membar_sync();
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_exit(&clock_tick_lock);
+
+ return (0);
+}
+
+
+void
+clock_tick_mp_init(void)
+{
+ cpu_t *cp;
+
+ mutex_enter(&cpu_lock);
+
+ cp = cpu_active;
+ do {
+ (void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL);
+ } while ((cp = cp->cpu_next_onln) != cpu_active);
+
+ register_cpu_setup_func(clock_tick_cpu_setup, NULL);
+
+ mutex_exit(&cpu_lock);
+}
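
The new clock_tick.c splits the online CPUs into sets of clock_tick_ncpus, posts one softint per set, and rotates each set's scan start every tick so no CPU is always processed last. A user-space sketch of just that partitioning and rotation arithmetic (hypothetical names and set size; no cross calls, softints, or locking):

#include <stdio.h>

#define TICK_NCPUS      4               /* CPUs handled per softint (set size) */

typedef struct tick_set {
    int ts_start;                       /* first CPU index in the set */
    int ts_end;                         /* one past the last CPU index */
    int ts_scan;                        /* where this set starts scanning */
} tick_set_t;

int
main(void)
{
    int ncpus = 10;
    int nsets = (ncpus + TICK_NCPUS - 1) / TICK_NCPUS;
    tick_set_t sets[8];
    int tick, i, cpu;

    /* Carve the CPU range into contiguous sets. */
    for (i = 0; i < nsets; i++) {
        sets[i].ts_start = i * TICK_NCPUS;
        sets[i].ts_end = (i + 1) * TICK_NCPUS;
        if (sets[i].ts_end > ncpus)
            sets[i].ts_end = ncpus;
        sets[i].ts_scan = sets[i].ts_start;
    }

    /* Two simulated ticks; each set scans from ts_scan to its end, then wraps. */
    for (tick = 0; tick < 2; tick++) {
        for (i = 0; i < nsets; i++) {
            tick_set_t *csp = &sets[i];

            /* Rotate the scan start each tick for fairness. */
            if (++csp->ts_scan >= csp->ts_end)
                csp->ts_scan = csp->ts_start;

            printf("tick %d, set %d:", tick, i);
            for (cpu = csp->ts_scan; cpu < csp->ts_end; cpu++)
                printf(" %d", cpu);
            for (cpu = csp->ts_start; cpu < csp->ts_scan; cpu++)
                printf(" %d", cpu);
            printf("\n");
        }
    }
    return (0);
}
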
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index 8988a7b647..13cf752b45 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1785,9 +1785,6 @@ cpu_del_unit(int cpuid)
cp->cpu_next->cpu_prev = cp->cpu_prev;
if (cp == cpu_list)
cpu_list = cpnext;
- if (cp == clock_cpu_list)
- clock_cpu_list = cpnext;
-
/*
* Signals that the cpu has been deleted (see above).
@@ -1882,6 +1879,9 @@ cpu_remove_active(cpu_t *cp)
lgrp_config(LGRP_CONFIG_CPU_OFFLINE, (uintptr_t)cp, 0);
+ if (cp == clock_cpu_list)
+ clock_cpu_list = cp->cpu_next_onln;
+
cp->cpu_prev_onln->cpu_next_onln = cp->cpu_next_onln;
cp->cpu_next_onln->cpu_prev_onln = cp->cpu_prev_onln;
if (cpu_active == cp) {
@@ -2797,7 +2797,6 @@ cpu_destroy_bound_threads(cpu_t *cp)
mutex_exit(&pidlock);
-
for (t = tlist; t != NULL; t = tnext) {
tnext = t->t_next;
thread_free(t);
diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c
index e39be3e9f2..71fc90a767 100644
--- a/usr/src/uts/common/os/exit.c
+++ b/usr/src/uts/common/os/exit.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -356,6 +356,17 @@ proc_exit(int why, int what)
if (exitlwps(0) != 0)
return (1);
+ mutex_enter(&p->p_lock);
+ if (p->p_ttime > 0) {
+ /*
+ * Account any remaining ticks charged to this process
+ * on its way out.
+ */
+ (void) task_cpu_time_incr(p->p_task, p->p_ttime);
+ p->p_ttime = 0;
+ }
+ mutex_exit(&p->p_lock);
+
DTRACE_PROC(lwp__exit);
DTRACE_PROC1(exit, int, why);
diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c
index d008231021..7109a49cda 100644
--- a/usr/src/uts/common/os/main.c
+++ b/usr/src/uts/common/os/main.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -360,6 +360,8 @@ main(void)
extern int netboot;
extern void vm_init(void);
extern void cbe_init(void);
+ extern void clock_tick_init_pre(void);
+ extern void clock_tick_init_post(void);
extern void clock_init(void);
extern void physio_bufs_init(void);
extern void pm_cfb_setup_intr(void);
@@ -399,6 +401,7 @@ main(void)
callout_init(); /* callout table MUST be init'd before clock starts */
timer_init(); /* timer must be initialized before cyclic starts */
cbe_init();
+ clock_tick_init_pre();
clock_init();
/*
@@ -544,6 +547,8 @@ main(void)
kmem_mp_init();
vmem_update(NULL);
+ clock_tick_init_post();
+
for (initptr = &mp_init_tbl[0]; *initptr; initptr++)
(**initptr)();
diff --git a/usr/src/uts/common/os/task.c b/usr/src/uts/common/os/task.c
index b3967546f5..628fcde30b 100644
--- a/usr/src/uts/common/os/task.c
+++ b/usr/src/uts/common/os/task.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -202,7 +202,39 @@ task_cpu_time_usage(rctl_t *r, proc_t *p)
task_t *t = p->p_task;
ASSERT(MUTEX_HELD(&p->p_lock));
- return (t->tk_cpu_time / hz);
+ return (t->tk_cpu_time);
+}
+
+/*
+ * int task_cpu_time_incr(task_t *t, rctl_qty_t incr)
+ *
+ * Overview
+ * task_cpu_time_incr() increments the amount of CPU time used
+ * by this task.
+ *
+ * Return values
+ * 1 if a second or more time is accumulated
+ * 0 otherwise
+ *
+ * Caller's context
+ * This is called by the clock tick accounting function to charge
+ * CPU time to a task.
+ */
+rctl_qty_t
+task_cpu_time_incr(task_t *t, rctl_qty_t incr)
+{
+ rctl_qty_t ret = 0;
+
+ mutex_enter(&t->tk_cpu_time_lock);
+ t->tk_cpu_ticks += incr;
+ if (t->tk_cpu_ticks >= hz) {
+ t->tk_cpu_time += t->tk_cpu_ticks / hz;
+ t->tk_cpu_ticks = t->tk_cpu_ticks % hz;
+ ret = t->tk_cpu_time;
+ }
+ mutex_exit(&t->tk_cpu_time_lock);
+
+ return (ret);
}
/*
@@ -224,15 +256,12 @@ static int
task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags)
{
- task_t *t;
-
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(e->rcep_t == RCENTITY_TASK);
if (e->rcep_p.task == NULL)
return (0);
- t = e->rcep_p.task;
- if ((t->tk_cpu_time + incr) / hz >= rcntl->rcv_value)
+ if (incr >= rcntl->rcv_value)
return (1);
return (0);
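
task_cpu_time_incr() above batches raw ticks per task and converts them to whole seconds only once hz ticks have accumulated, so the task resource control is now expressed and tested in seconds, and tested far less often. A stand-alone sketch of the same accumulation with the per-task locking elided and hz assumed to be 100:

#include <stdio.h>

#define HZ      100

typedef struct task_time {
    unsigned long tt_ticks;             /* ticks not yet converted to seconds */
    unsigned long tt_seconds;           /* whole CPU seconds charged so far */
} task_time_t;

/*
 * Charge 'incr' ticks to the task. Returns the running total in seconds
 * when at least one new whole second has accumulated, 0 otherwise --
 * the caller only re-tests its limit on a non-zero return.
 */
static unsigned long
task_time_incr(task_time_t *tt, unsigned long incr)
{
    unsigned long ret = 0;

    tt->tt_ticks += incr;
    if (tt->tt_ticks >= HZ) {
        tt->tt_seconds += tt->tt_ticks / HZ;
        tt->tt_ticks %= HZ;
        ret = tt->tt_seconds;
    }
    return (ret);
}

int
main(void)
{
    task_time_t tt = { 0, 0 };
    unsigned long secs;

    (void) task_time_incr(&tt, 70);     /* under one second: returns 0 */
    secs = task_time_incr(&tt, 70);     /* 140 ticks -> 1 second + 40 ticks */
    printf("seconds charged: %lu (leftover %lu ticks)\n", secs, tt.tt_ticks);
    return (0);
}
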
diff --git a/usr/src/uts/common/sys/clock_tick.h b/usr/src/uts/common/sys/clock_tick.h
new file mode 100644
index 0000000000..b3dc2198ed
--- /dev/null
+++ b/usr/src/uts/common/sys/clock_tick.h
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CLOCK_TICK_H
+#define _SYS_CLOCK_TICK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/mutex.h>
+#include <sys/cpuvar.h>
+#include <sys/systm.h>
+#include <sys/cyclic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CLOCK_TICK_NCPUS 32
+
+/*
+ * Per-CPU structure to facilitate multi-threaded tick accounting.
+ *
+ * ct_lock
+ * Mutex for the structure. Used to lock the structure to pass
+ * arguments to the tick processing softint handler.
+ * ct_intr
+ * Tick processing softint handle. For parallelism, each CPU
+ * needs to have its own softint handle.
+ * ct_lbolt
+ * Copy of the lbolt at the time of tick scheduling.
+ * ct_pending
+ * Number of ticks to be processed by one invocation of the tick
+ * processing softint.
+ * ct_start
+ * First CPU to do tick processing for.
+ * ct_end
+ * Last CPU to do tick processing for.
+ * ct_scan
+ * CPU to start the tick processing from. Rotated every tick.
+ */
+typedef struct clock_tick_cpu {
+ kmutex_t ct_lock;
+ ulong_t ct_intr;
+ clock_t ct_lbolt;
+ int ct_pending;
+ int ct_start;
+ int ct_end;
+ int ct_scan;
+} clock_tick_cpu_t;
+
+/*
+ * Per-set structure to facilitate multi-threaded tick accounting.
+ * clock_tick_lock protects this.
+ *
+ * ct_start
+ * First CPU to do tick processing for.
+ * ct_end
+ * Last CPU to do tick processing for.
+ * ct_scan
+ * CPU to start the tick processing from. Rotated every tick.
+ */
+typedef struct clock_tick_set {
+ int ct_start;
+ int ct_end;
+ int ct_scan;
+} clock_tick_set_t;
+
+#define CLOCK_TICK_CPU_OFFLINE(cp) \
+ (((cp) != cpu_active) && ((cp)->cpu_next_onln == (cp)))
+
+#define CLOCK_TICK_XCALL_SAFE(cp) \
+ CPU_IN_SET(clock_tick_online_cpuset, cp->cpu_id)
+
+#define CLOCK_TICK_PROC_MAX 10
+
+#ifdef _KERNEL
+#pragma weak create_softint
+extern ulong_t create_softint(uint_t, uint_t (*)(caddr_t, caddr_t),
+ caddr_t);
+#pragma weak invoke_softint
+extern void invoke_softint(processorid_t, ulong_t);
+#pragma weak sync_softint
+extern void sync_softint(cpuset_t);
+extern void clock_tick(kthread_t *, int);
+extern void membar_sync(void);
+
+extern int hires_tick;
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CLOCK_TICK_H */
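
The CLOCK_TICK_CPU_OFFLINE() macro above relies on an offlined CPU being left pointing at itself in the circular online list, so membership can be tested without holding cpu_lock. A small sketch of that self-link test on a generic circular list (hypothetical node type):

#include <stdio.h>

typedef struct node {
    int         n_id;
    struct node *n_next;                /* circular "online" list linkage */
} node_t;

/* A removed node is left pointing at itself, so the test is purely local. */
static int
node_offline(const node_t *head, const node_t *n)
{
    return (n != head && n->n_next == n);
}

int
main(void)
{
    node_t a = { 0, NULL }, b = { 1, NULL };

    /* Two-node circular list: a -> b -> a. */
    a.n_next = &b;
    b.n_next = &a;
    printf("b offline? %d\n", node_offline(&a, &b));    /* 0 */

    /* Unlink b: the list collapses to a alone, b points at itself. */
    a.n_next = &a;
    b.n_next = &b;
    printf("b offline? %d\n", node_offline(&a, &b));    /* 1 */
    return (0);
}
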
diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h
index 4785796781..c7b76b32ea 100644
--- a/usr/src/uts/common/sys/cpuvar.h
+++ b/usr/src/uts/common/sys/cpuvar.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -519,6 +519,7 @@ extern cpuset_t cpu_seqid_inuse;
extern struct cpu *cpu[]; /* indexed by CPU number */
extern cpu_t *cpu_list; /* list of CPUs */
+extern cpu_t *cpu_active; /* list of active CPUs */
extern int ncpus; /* number of CPUs present */
extern int ncpus_online; /* number of CPUs not quiesced */
extern int max_ncpus; /* max present before ncpus is known */
diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h
index 5a9b1caf50..46d205749a 100644
--- a/usr/src/uts/common/sys/proc.h
+++ b/usr/src/uts/common/sys/proc.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -348,6 +348,7 @@ typedef struct proc {
/* protected by p_lock */
rctl_qty_t p_crypto_mem; /* /dev/crypto memory charged to proc */
/* protected by p_lock */
+ clock_t p_ttime; /* buffered task time */
} proc_t;
#define PROC_T /* headers relying on proc_t are OK */
diff --git a/usr/src/uts/common/sys/task.h b/usr/src/uts/common/sys/task.h
index d1bcb04145..a22f64d970 100644
--- a/usr/src/uts/common/sys/task.h
+++ b/usr/src/uts/common/sys/task.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -73,6 +73,8 @@ typedef struct task {
task_usage_t *tk_inherited; /* task resource usage */
/* inherited with the first */
/* member process */
+ rctl_qty_t tk_cpu_ticks; /* accumulated CPU ticks */
+ kmutex_t tk_cpu_time_lock; /* accumulated CPU seconds lock */
} task_t;
extern task_t *task0p;
@@ -91,6 +93,7 @@ extern task_t *task_hold_by_id_zone(taskid_t, zoneid_t);
extern void task_rele(task_t *);
extern void task_hold(task_t *);
extern void task_end(task_t *);
+extern rctl_qty_t task_cpu_time_incr(task_t *, rctl_qty_t);
#else /* _KERNEL */
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index d545e093b3..7302289ea1 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -515,10 +515,22 @@ extern struct _kthread t0; /* the scheduler thread */
extern kmutex_t pidlock; /* global process lock */
/*
- * thread_free_lock is used by the clock thread to keep a thread
+ * thread_free_lock is used by the tick accounting thread to keep a thread
* from being freed while it is being examined.
*/
-extern kmutex_t thread_free_lock;
+#define THREAD_FREE_NUM 1024
+#define THREAD_FREE_MASK (THREAD_FREE_NUM - 1)
+#define THREAD_FREE_SHIFT_BITS 5
+#define THREAD_FREE_SHIFT(t) ((uintptr_t)t >> THREAD_FREE_SHIFT_BITS)
+#define THREAD_FREE_HASH(t) (THREAD_FREE_SHIFT(t) & THREAD_FREE_MASK)
+
+typedef struct thread_free_lock {
+ kmutex_t tf_lock;
+ uchar_t tf_pad[64 - sizeof (kmutex_t)];
+} thread_free_lock_t;
+
+extern void thread_free_prevent(kthread_t *);
+extern void thread_free_allow(kthread_t *);
/*
* Routines to change the priority and effective priority
diff --git a/usr/src/uts/intel/ia32/ml/lock_prim.s b/usr/src/uts/intel/ia32/ml/lock_prim.s
index 2d74137565..8dc51e3eeb 100644
--- a/usr/src/uts/intel/ia32/ml/lock_prim.s
+++ b/usr/src/uts/intel/ia32/ml/lock_prim.s
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1278,6 +1278,10 @@ lockstat_hot_patch(void)
/* XX64 membar_*() should be inlines */
void
+membar_sync(void)
+{}
+
+void
membar_enter(void)
{}
@@ -1299,8 +1303,10 @@ membar_consumer(void)
ENTRY(membar_enter)
ALTENTRY(membar_exit)
+ ALTENTRY(membar_sync)
mfence /* lighter weight than lock; xorq $0,(%rsp) */
ret
+ SET_SIZE(membar_sync)
SET_SIZE(membar_exit)
SET_SIZE(membar_enter)
@@ -1318,9 +1324,11 @@ membar_consumer(void)
ENTRY(membar_enter)
ALTENTRY(membar_exit)
+ ALTENTRY(membar_sync)
lock
xorl $0, (%esp)
ret
+ SET_SIZE(membar_sync)
SET_SIZE(membar_exit)
SET_SIZE(membar_enter)
diff --git a/usr/src/uts/sun4/os/intr.c b/usr/src/uts/sun4/os/intr.c
index 7f70424a45..d0830a261e 100644
--- a/usr/src/uts/sun4/os/intr.c
+++ b/usr/src/uts/sun4/os/intr.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -813,3 +813,37 @@ intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
cp->cpu_intr_weight = 0; /* sanity */
mutex_exit(&intr_dist_cpu_lock);
}
+
+ulong_t
+create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1)
+{
+ uint64_t inum;
+
+ inum = add_softintr(pil, func, arg1, SOFTINT_ST);
+ return ((ulong_t)inum);
+}
+
+void
+invoke_softint(processorid_t cpuid, ulong_t hdl)
+{
+ uint64_t inum = hdl;
+
+ if (cpuid == CPU->cpu_id)
+ setsoftint(inum);
+ else
+ xt_one(cpuid, setsoftint_tl1, inum, 0);
+}
+
+void
+remove_softint(ulong_t hdl)
+{
+ uint64_t inum = hdl;
+
+ (void) rem_softintr(inum);
+}
+
+void
+sync_softint(cpuset_t set)
+{
+ xt_sync(set);
+}
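
The create_softint()/invoke_softint()/sync_softint() hooks defined here for sun4 are declared with #pragma weak in clock_tick.h, and the common code checks the symbol's address before calling it, so platforms that supply no definitions simply fall back to single-threaded tick accounting. A minimal sketch of that weak-reference pattern using GCC's __attribute__((weak)) instead of the pragma (hypothetical function name):

#include <stdio.h>

/*
 * Weak declaration: if nothing in the final link defines platform_softint(),
 * its address compares equal to NULL rather than causing a link failure.
 */
extern void platform_softint(int cpu) __attribute__((weak));

static void
schedule_work(int cpu)
{
    if (&platform_softint != NULL)
        platform_softint(cpu);          /* platform provides the hook */
    else
        printf("no softint support; staying single-threaded\n");
}

int
main(void)
{
    schedule_work(0);
    return (0);
}
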
diff --git a/usr/src/uts/sun4/sys/ivintr.h b/usr/src/uts/sun4/sys/ivintr.h
index 7ea9ae1d85..eb0a8656e3 100644
--- a/usr/src/uts/sun4/sys/ivintr.h
+++ b/usr/src/uts/sun4/sys/ivintr.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,8 +42,9 @@ extern "C" {
* interrupts.
*
* NOTE: Need two single target software interrupts per cpu for cyclics.
+ * Need one single target software interrupt per cpu for tick accounting.
*/
-#define MAX_RSVD_IV ((NCPU * 2) + 256) /* HW and Single target SW intrs */
+#define MAX_RSVD_IV ((NCPU * 3) + 256) /* HW and Single target SW intrs */
#define MAX_RSVD_IVX 32 /* Multi target software intrs */
#ifndef _ASM
diff --git a/usr/src/uts/sun4u/opl/os/opl.c b/usr/src/uts/sun4u/opl/os/opl.c
index f2d3162a93..f33b231117 100644
--- a/usr/src/uts/sun4u/opl/os/opl.c
+++ b/usr/src/uts/sun4u/opl/os/opl.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -116,6 +116,12 @@ static void pass2xscf_thread();
#define OPL_BOFF_MAX (40 * OPL_BOFF_SLEEP)
#define OPL_BOFF_TM 1500
+#define OPL_CLOCK_TICK_THRESHOLD 128
+#define OPL_CLOCK_TICK_NCPUS 64
+
+extern int clock_tick_threshold;
+extern int clock_tick_ncpus;
+
int
set_platform_max_ncpus(void)
{
@@ -943,6 +949,10 @@ plat_startup_memlist(caddr_t alloc_base)
void
startup_platform(void)
{
+ if (clock_tick_threshold == 0)
+ clock_tick_threshold = OPL_CLOCK_TICK_THRESHOLD;
+ if (clock_tick_ncpus == 0)
+ clock_tick_ncpus = OPL_CLOCK_TICK_NCPUS;
}
void
diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c
index 78293c17af..333212b4f5 100644
--- a/usr/src/uts/sun4v/os/mach_startup.c
+++ b/usr/src/uts/sun4v/os/mach_startup.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,6 +57,12 @@ extern void sfmmu_set_tsbs(void);
*/
static int enable_halt_idle_cpus = 1;
+#define SUN4V_CLOCK_TICK_THRESHOLD 64
+#define SUN4V_CLOCK_TICK_NCPUS 64
+
+extern int clock_tick_threshold;
+extern int clock_tick_ncpus;
+
void
setup_trap_table(void)
{
@@ -296,6 +302,10 @@ void
startup_platform(void)
{
ip_squeue_soft_ring = B_TRUE;
+ if (clock_tick_threshold == 0)
+ clock_tick_threshold = SUN4V_CLOCK_TICK_THRESHOLD;
+ if (clock_tick_ncpus == 0)
+ clock_tick_ncpus = SUN4V_CLOCK_TICK_NCPUS;
}
/*