Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c | 67
1 file changed, 61 insertions(+), 6 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
index 4c2bdccd66..503416b293 100644
--- a/usr/src/uts/common/fs/zfs/zfs_zone.c
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -187,11 +187,38 @@ uint_t zfs_zone_rw_lat_limit = 10;
/*
* The I/O throttle will only start delaying zones when it detects disk
- * utilization has reached a certain level. This tunable controls the threshold
- * at which the throttle will start delaying zones. The calculation should
- * correspond closely with the %b column from iostat.
+ * utilization has reached a certain level. This tunable controls the
+ * threshold at which the throttle will start delaying zones. When the number
+ * of vdevs is small, the calculation should correspond closely with the %b
+ * column from iostat -- but as the number of vdevs becomes large, it will
+ * correlate less and less to any single device (therefore making it a poor
+ * approximation for the actual I/O utilization on such systems). We
+ * therefore use our derived utilization conservatively: we know that low
+ * derived utilization does indeed correlate to low I/O use -- but that a high
+ * rate of derived utilization does not necessarily alone denote saturation;
+ * where we see a high rate of utilization, we also look for laggard I/Os to
+ * attempt to detect saturation.
*/
uint_t zfs_zone_util_threshold = 80;
+uint_t zfs_zone_underutil_threshold = 60;
+
+/*
+ * There are three important tunables here: zfs_zone_laggard_threshold denotes
+ * the threshold at which an I/O is considered to be of notably high latency;
+ * zfs_zone_laggard_recent denotes the number of microseconds before the
+ * current time after which the last laggard is considered to be sufficiently
+ * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
+ * the microseconds before the current time before which the last laggard is
+ * considered to be sufficiently old to merit decreasing the throttle. The
+ * most important tunable of these three is the zfs_zone_laggard_threshold: in
+ * modeling data from a large public cloud, this tunable was found to have a
+ * much greater effect on the throttle than the two time-based thresholds.
+ * This must be set high enough to not result in spurious throttling, but not
+ * so high as to allow pathological I/O to persist in the system.
+ */
+uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */
+uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */
+uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */
/*
* Throughout this subsystem, our timestamps are in microseconds. Our system
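(Annotation, not part of the patch.) The derived utilization that these thresholds are compared against is not computed in this hunk; a minimal sketch of the calculation, assuming the cycle-based accounting implied by zfs_disk_rtime/zfs_disk_last_rtime later in this diff, where busy time is taken as a fraction of the sampling interval and scaled to line up with iostat's %b:

/*
 * Illustrative sketch only. Assumes rtime accumulates nanoseconds of
 * disk-busy time (as zfs_disk_rtime appears to) and that unow and
 * last_checked are microsecond timestamps.
 */
static uint_t
derived_disk_util(hrtime_t rtime, hrtime_t last_rtime,
    hrtime_t unow, hrtime_t last_checked)
{
	hrtime_t busy_nsec = rtime - last_rtime;
	hrtime_t elapsed_usec = unow - last_checked;

	if (elapsed_usec <= 0)
		return (0);

	/* Busy time as a percentage of the elapsed interval. */
	return ((uint_t)((busy_nsec * 100) / (elapsed_usec * 1000)));
}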
@@ -224,6 +251,8 @@ typedef struct {
uint64_t zi_totutil;
int zi_active;
uint_t zi_diskutil;
+ boolean_t zi_underutil;
+ boolean_t zi_overutil;
} zoneio_stats_t;
static sys_lat_cycle_t rd_lat;
@@ -247,6 +276,7 @@ hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
/* time that we last updated per-zone throttle info */
hrtime_t zfs_zone_last_checked = 0;
+hrtime_t zfs_disk_last_laggard = 0;
/*
* Data used to keep track of how often txg sync is running.
@@ -682,13 +712,13 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
* Adjust each IO's delay. If the overall delay becomes too high, avoid
* increasing beyond the ceiling value.
*/
- if (zonep->zone_io_util > fairutil &&
- sp->zi_diskutil > zfs_zone_util_threshold) {
+ if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
zonep->zone_io_util_above_avg = B_TRUE;
if (sp->zi_active > 1)
zfs_zone_delay_inc(zonep);
- } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) {
+ } else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
+ sp->zi_active <= 1) {
zfs_zone_delay_dec(zonep);
}
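(Annotation.) zfs_zone_delay_inc() and zfs_zone_delay_dec() are not shown in this diff; a plausible sketch of their shape is a clamped step adjustment, where the zone_io_delay field and the step and ceiling tunables are assumptions for illustration:

/* Hypothetical sketch; the step/ceiling tunable names are assumed. */
uint_t zfs_zone_delay_step = 5;		/* usec per adjustment */
uint_t zfs_zone_delay_ceiling = 100;	/* max per-zone delay, usec */

static void
zfs_zone_delay_inc(zone_t *zonep)
{
	if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
		zonep->zone_io_delay += zfs_zone_delay_step;
}

static void
zfs_zone_delay_dec(zone_t *zonep)
{
	if (zonep->zone_io_delay > 0)
		zonep->zone_io_delay -= zfs_zone_delay_step;
}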
@@ -716,6 +746,7 @@ static void
zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
{
zoneio_stats_t stats;
+ hrtime_t laggard_udelta = 0;
(void) bzero(&stats, sizeof (stats));
@@ -742,6 +773,23 @@ zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
}
zfs_disk_last_rtime = zfs_disk_rtime;
+ if (unow > zfs_disk_last_laggard)
+ laggard_udelta = unow - zfs_disk_last_laggard;
+
+ /*
+ * To minimize porpoising, we have three separate states for our
+ * assessment of I/O performance: overutilized, underutilized, and
+ * neither overutilized nor underutilized. We will increment the
+ * throttle if a zone is using more than its fair share _and_ I/O
+ * is overutilized; we will decrement the throttle if a zone is using
+ * less than its fair share _or_ I/O is underutilized.
+ */
+ stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
+ laggard_udelta > zfs_zone_laggard_ancient;
+
+ stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
+ laggard_udelta < zfs_zone_laggard_recent;
+
/*
* sdt:::zfs-zone-stats
*
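(Annotation.) With the default tunables, the two assessments above are mutually exclusive and leave a deliberate gap between them: a laggard within the last second can push the system into the overutilized state, no laggard for five seconds pushes it toward underutilized, and anything in between (or a derived utilization between 60 and 80) lands in a neutral state where the throttle holds steady. A standalone restatement of that logic, with a hypothetical enum and helper:

typedef enum {
	ZIO_STATE_UNDERUTIL,	/* decrement zone delays */
	ZIO_STATE_NEUTRAL,	/* hold steady */
	ZIO_STATE_OVERUTIL	/* increment delays for heavy zones */
} io_state_t;

static io_state_t
assess_io_state(uint_t diskutil, hrtime_t laggard_udelta)
{
	/* Low utilization, or no laggard for at least 5 seconds. */
	if (diskutil < zfs_zone_underutil_threshold ||
	    laggard_udelta > zfs_zone_laggard_ancient)
		return (ZIO_STATE_UNDERUTIL);

	/* High utilization and a laggard within the last second. */
	if (diskutil > zfs_zone_util_threshold &&
	    laggard_udelta < zfs_zone_laggard_recent)
		return (ZIO_STATE_OVERUTIL);

	return (ZIO_STATE_NEUTRAL);
}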
@@ -1107,6 +1155,9 @@ zfs_zone_zio_done(zio_t *zp)
if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
return;
+ if (zp->io_dispatched == 0)
+ return;
+
now = gethrtime();
unow = NANO_TO_MICRO(now);
udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
@@ -1137,6 +1188,10 @@ zfs_zone_zio_done(zio_t *zp)
zfs_disk_rcnt--;
zfs_disk_rtime += (now - zfs_disk_rlastupdate);
zfs_disk_rlastupdate = now;
+
+ if (udelta > zfs_zone_laggard_threshold)
+ zfs_disk_last_laggard = unow;
+
mutex_exit(&zfs_disk_lock);
if (zfs_zone_delay_enable) {
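
(Annotation.) Two details of the completion path deserve a note: a zio whose io_dispatched timestamp is zero was never actually issued to disk, so computing udelta from it would yield a meaningless value, hence the early return; and any I/O slower than zfs_zone_laggard_threshold stamps zfs_disk_last_laggard under zfs_disk_lock, which is what the wait-adjust code above consults. Compressed into a single hypothetical helper:

/* Recap of the completion path; the helper name is hypothetical. */
static void
laggard_on_done(zio_t *zp)
{
	hrtime_t now, unow, udelta;

	if (zp->io_dispatched == 0)
		return;		/* never dispatched; no valid latency */

	now = gethrtime();
	unow = NANO_TO_MICRO(now);
	udelta = unow - NANO_TO_MICRO(zp->io_dispatched);

	mutex_enter(&zfs_disk_lock);
	if (udelta > zfs_zone_laggard_threshold)	/* > 50 ms */
		zfs_disk_last_laggard = unow;
	mutex_exit(&zfs_disk_lock);
}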