OS-5309 TSC sync detection should be NUMA friendly

Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Robert Mustacchi <rm@joyent.com>
author: Patrick Mooney <pmooney@pfmooney.com> 2016-04-07 20:55:35 +0000
committer: Patrick Mooney <pmooney@pfmooney.com> 2016-04-14 20:47:53 +0000
commit: 46243df51ea0c020876ad159b2846b07fcd5a17a (patch)
tree: a1d52144fca1139e9d860deb4ac34e3cd339a041
parent: 4af82cf9e4a405a5ed18ef81ff24d71acde58bdf (diff)
download: illumos-joyent-46243df51ea0c020876ad159b2846b07fcd5a17a.tar.gz
1 files changed, 65 insertions, 81 deletions
diff --git a/usr/src/uts/i86pc/os/timestamp.c b/usr/src/uts/i86pc/os/timestamp.c
index c40159018c..3b478853ee 100644
--- a/usr/src/uts/i86pc/os/timestamp.c
+++ b/usr/src/uts/i86pc/os/timestamp.c
@@ -25,6 +25,7 @@
  *
  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright 2016 Joyent, Inc
  */
 
 #include <sys/types.h>
@@ -146,8 +147,6 @@ typedef struct tsc_sync {
 	volatile hrtime_t master_tsc, slave_tsc;
 } tsc_sync_t;
 static tsc_sync_t *tscp;
-static hrtime_t largest_tsc_delta = 0;
-static ulong_t shortest_write_time = ~0UL;
 
 static hrtime_t	tsc_last = 0;
 static hrtime_t	tsc_last_jumped = 0;
@@ -451,25 +450,27 @@ tsc_gethrtimeunscaled_delta(void)
 }
 
 /*
- * Called by the master in the TSC sync operation (usually the boot CPU).
- * If the slave is discovered to have a skew, gethrtimef will be changed to
- * point to tsc_gethrtime_delta(). Calculating skews is precise only when
- * the master and slave TSCs are read simultaneously; however, there is no
- * algorithm that can read both CPUs in perfect simultaneity. The proposed
- * algorithm is an approximate method based on the behaviour of cache
- * management. The slave CPU continuously reads TSC and then reads a global
- * variable which the master CPU updates. The moment the master's update reaches
- * the slave's visibility (being forced by an mfence operation) we use the TSC
- * reading taken on the slave. A corresponding TSC read will be taken on the
- * master as soon as possible after finishing the mfence operation. But the
- * delay between causing the slave to notice the invalid cache line and the
- * competion of mfence is not repeatable. This error is heuristically assumed
- * to be 1/4th of the total write time as being measured by the two TSC reads
- * on the master sandwiching the mfence. Furthermore, due to the nature of
- * bus arbitration, contention on memory bus, etc., the time taken for the write
- * to reflect globally can vary a lot. So instead of taking a single reading,
- * a set of readings are taken and the one with least write time is chosen
- * to calculate the final skew.
+ * TSC Sync Master
+ *
+ * Typically called on the boot CPU, this attempts to quantify TSC skew between
+ * different CPUs.  If an appreciable difference is found, gethrtimef will be
+ * changed to point to tsc_gethrtime_delta().
+ *
+ * Calculating skews is precise only when the master and slave TSCs are read
+ * simultaneously; however, there is no algorithm that can read both CPUs in
+ * perfect simultaneity.  The proposed algorithm is an approximate method based
+ * on the behaviour of cache management.  The slave CPU continuously polls the
+ * TSC while reading a global variable updated by the master CPU.  The latest
+ * TSC reading is saved when the master's update (forced via mfence) reaches
+ * visibility on the slave.  The master will also take a TSC reading
+ * immediately following the mfence.
+ *
+ * While the delay between cache line invalidation on the slave and mfence
+ * completion on the master is not repeatable, the error is heuristically
+ * assumed to be 1/4th of the write time recorded by the master.  Multiple
+ * samples are taken to control for the variance caused by external factors
+ * such as bus contention.  Each sample set is independent per-CPU to control
+ * for differing memory latency on NUMA systems.
  *
  * TSC sync is disabled in the context of virtualization because the CPUs
  * assigned to the guest are virtual CPUs which means the real CPUs on which
@@ -482,7 +483,7 @@ void
 tsc_sync_master(processorid_t slave)
 {
 	ulong_t flags, source, min_write_time = ~0UL;
-	hrtime_t write_time, x, mtsc_after, tdelta;
+	hrtime_t write_time, mtsc_after, last_delta = 0;
 	tsc_sync_t *tsc = tscp;
 	int cnt;
 	int hwtype;
@@ -505,45 +506,39 @@ tsc_sync_master(processorid_t slave)
 			SMT_PAUSE();
 		write_time =  mtsc_after - tsc->master_tsc;
 		if (write_time <= min_write_time) {
-			min_write_time = write_time;
+			hrtime_t tdelta;
+
+			tdelta = tsc->slave_tsc - mtsc_after;
+			if (tdelta < 0)
+				tdelta = -tdelta;
 			/*
-			 * Apply heuristic adjustment only if the calculated
-			 * delta is > 1/4th of the write time.
+			 * If the margin exists, subtract 1/4th of the measured
+			 * write time from the master's TSC value.  This is an
+			 * estimate of how late the mfence completion came
+			 * after the slave noticed the cache line change.
 			 */
-			x = tsc->slave_tsc - mtsc_after;
-			if (x < 0)
-				x = -x;
-			if (x > (min_write_time/4))
-				/*
-				 * Subtract 1/4th of the measured write time
-				 * from the master's TSC value, as an estimate
-				 * of how late the mfence completion came
-				 * after the slave noticed the cache line
-				 * change.
-				 */
+			if (tdelta > (write_time/4)) {
 				tdelta = tsc->slave_tsc -
-				    (mtsc_after - (min_write_time/4));
-			else
+				    (mtsc_after - (write_time/4));
+			} else {
 				tdelta = tsc->slave_tsc - mtsc_after;
-			tsc_sync_tick_delta[slave] =
-			    tsc_sync_tick_delta[source] - tdelta;
+			}
+			last_delta = tsc_sync_tick_delta[source] - tdelta;
+			tsc_sync_tick_delta[slave] = last_delta;
+			min_write_time = write_time;
 		}
 
 		tsc->master_tsc = tsc->slave_tsc = write_time = 0;
 		membar_enter();
 		tsc_sync_go = TSC_SYNC_STOP;
 	}
-	if (tdelta < 0)
-		tdelta = -tdelta;
-	if (tdelta > largest_tsc_delta)
-		largest_tsc_delta = tdelta;
-	if (min_write_time < shortest_write_time)
-		shortest_write_time = min_write_time;
+
 	/*
-	 * Enable delta variants of tsc functions if the largest of all chosen
-	 * deltas is > smallest of the write time.
+	 * Only enable the delta variants of the TSC functions if the measured
+	 * skew is greater than the fastest write time.
 	 */
-	if (largest_tsc_delta > shortest_write_time) {
+	last_delta = (last_delta < 0) ? -last_delta : last_delta;
+	if (last_delta > min_write_time) {
 		gethrtimef = tsc_gethrtime_delta;
 		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
 	}
@@ -551,11 +546,12 @@ tsc_sync_master(processorid_t slave)
 }
 
 /*
+ * TSC Sync Slave
+ *
  * Called by a CPU which has just been onlined.  It is expected that the CPU
  * performing the online operation will call tsc_sync_master().
  *
- * TSC sync is disabled in the context of virtualization. See comments
- * above tsc_sync_master.
+ * Like tsc_sync_master, this logic is skipped on virtualized platforms.
  */
 void
 tsc_sync_slave(void)
@@ -579,11 +575,9 @@ tsc_sync_slave(void)
 		tsc_sync_go = TSC_SYNC_GO;
 		do {
 			/*
-			 * Do not put an SMT_PAUSE here. For instance,
-			 * if the master and slave are really the same
-			 * hyper-threaded CPU, then you want the master
-			 * to yield to the slave as quickly as possible here,
-			 * but not the other way.
+			 * Do not put an SMT_PAUSE here.  If the master and
+			 * slave are the same hyper-threaded CPU, we want the
+			 * master to yield as quickly as possible to the slave.
 			 */
 			s1 = tsc_read();
 		} while (tsc->master_tsc == 0);
@@ -708,12 +702,10 @@ get_tsc_ready()
 }
 
 /*
- * Adjust all the deltas by adding the passed value to the array.
- * Then use the "delt" versions of the the gethrtime functions.
- * Note that 'tdelta' _could_ be a negative number, which should
- * reduce the values in the array (used, for example, if the Solaris
- * instance was moved by a virtual manager to a machine with a higher
- * value of tsc).
+ * Adjust all the deltas by adding the passed value to the array and activate
+ * the "delta" versions of the gethrtime functions.  It is possible that the
+ * adjustment could be negative.  Such may occur if the SunOS instance was
+ * moved by a virtual manager to a machine with a higher value of TSC.
  */
 void
 tsc_adjust_delta(hrtime_t tdelta)
@@ -732,13 +724,9 @@ tsc_adjust_delta(hrtime_t tdelta)
  * Functions to manage TSC and high-res time on suspend and resume.
  */
 
-/*
- * declarations needed for time adjustment
- */
-extern void	rtcsync(void);
+/* tod_ops from "uts/i86pc/io/todpc_subr.c" */
 extern tod_ops_t *tod_ops;
-/* There must be a better way than exposing nsec_scale! */
-extern uint_t	nsec_scale;
+
 static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
 static timestruc_t tsc_saved_ts;
 static int	tsc_needs_resume = 0;	/* We only want to do this once. */
@@ -748,23 +736,20 @@ int		tsc_suspend_count = 0;
 int		tsc_resume_in_cyclic = 0;
 
 /*
- * Let timestamp.c know that we are suspending.  It needs to take
- * snapshots of the current time, and do any pre-suspend work.
+ * Take snapshots of the current time and do any other pre-suspend work.
  */
 void
 tsc_suspend(void)
 {
-/*
- * What we need to do here, is to get the time we suspended, so that we
- * know how much we should add to the resume.
- * This routine is called by each CPU, so we need to handle reentry.
- */
+	/*
+	 * We need to collect the time at which we suspended here so we know
+	 * now much should be added during the resume.  This is called by each
+	 * CPU, so reentry must be properly handled.
+	 */
 	if (tsc_gethrtime_enable) {
 		/*
-		 * We put the tsc_read() inside the lock as it
-		 * as no locking constraints, and it puts the
-		 * aquired value closer to the time stamp (in
-		 * case we delay getting the lock).
+		 * Perform the tsc_read after acquiring the lock to make it as
+		 * accurate as possible in the face of contention.
 		 */
 		mutex_enter(&tod_lock);
 		tsc_saved_tsc = tsc_read();
@@ -786,8 +771,7 @@ tsc_suspend(void)
 }
 
 /*
- * Restore all timestamp state based on the snapshots taken at
- * suspend time.
+ * Restore all timestamp state based on the snapshots taken at suspend time.
  */
 void
 tsc_resume(void)
author	Patrick Mooney <pmooney@pfmooney.com>	2016-04-07 20:55:35 +0000
committer	Patrick Mooney <pmooney@pfmooney.com>	2016-04-14 20:47:53 +0000
commit	46243df51ea0c020876ad159b2846b07fcd5a17a (patch)
tree	a1d52144fca1139e9d860deb4ac34e3cd339a041
parent	4af82cf9e4a405a5ed18ef81ff24d71acde58bdf (diff)
download	illumos-joyent-46243df51ea0c020876ad159b2846b07fcd5a17a.tar.gz