Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c | 67
1 file changed, 61 insertions(+), 6 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
index 4c2bdccd66..503416b293 100644
--- a/usr/src/uts/common/fs/zfs/zfs_zone.c
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -187,11 +187,38 @@ uint_t zfs_zone_rw_lat_limit = 10;
/*
* The I/O throttle will only start delaying zones when it detects disk
- * utilization has reached a certain level. This tunable controls the threshold
- * at which the throttle will start delaying zones. The calculation should
- * correspond closely with the %b column from iostat.
+ * utilization has reached a certain level. This tunable controls the
+ * threshold at which the throttle will start delaying zones. When the number
+ * of vdevs is small, the calculation should correspond closely with the %b
+ * column from iostat -- but as the number of vdevs becomes large, it will
+ * correlate less and less to any single device (therefore making it a poor
+ * approximation for the actual I/O utilization on such systems). We
+ * therefore use our derived utilization conservatively: we know that low
+ * derived utilization does indeed correlate to low I/O use -- but that a high
+ * rate of derived utilization does not necessarily alone denote saturation;
+ * where we see a high rate of utilization, we also look for laggard I/Os to
+ * attempt to detect saturation.
*/
uint_t zfs_zone_util_threshold = 80;
+uint_t zfs_zone_underutil_threshold = 60;
+
+/*
+ * There are three important tunables here: zfs_zone_laggard_threshold denotes
+ * the threshold at which an I/O is considered to be of notably high latency;
+ * zfs_zone_laggard_recent denotes the number of microseconds before the
+ * current time after which the last laggard is considered to be sufficiently
+ * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
+ * the microseconds before the current time before which the last laggard is
+ * considered to be sufficiently old to merit decreasing the throttle. The
+ * most important tunable of these three is the zfs_zone_laggard_threshold: in
+ * modeling data from a large public cloud, this tunable was found to have a
+ * much greater effect on the throttle than the two time-based thresholds.
+ * This must be set high enough to not result in spurious throttling, but not
+ * so high as to allow pathological I/O to persist in the system.
+ */
+uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */
+uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */
+uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */
/*
* Throughout this subsystem, our timestamps are in microseconds. Our system
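(Annotation, not part of the patch.) The derived utilization that these thresholds are compared against is not computed in this hunk; a minimal sketch of the calculation, assuming the cycle-based accounting implied by zfs_disk_rtime/zfs_disk_last_rtime later in this diff, where busy time is taken as a fraction of the sampling interval and scaled to line up with iostat's %b:

/*
 * Illustrative sketch only. Assumes rtime accumulates nanoseconds of
 * disk-busy time (as zfs_disk_rtime appears to) and that unow and
 * last_checked are microsecond timestamps.
 */
static uint_t
derived_disk_util(hrtime_t rtime, hrtime_t last_rtime,
    hrtime_t unow, hrtime_t last_checked)
{
	hrtime_t busy_nsec = rtime - last_rtime;
	hrtime_t elapsed_usec = unow - last_checked;

	if (elapsed_usec <= 0)
		return (0);

	/* Busy time as a percentage of the elapsed interval. */
	return ((uint_t)((busy_nsec * 100) / (elapsed_usec * 1000)));
}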
@@ -224,6 +251,8 @@ typedef struct {
uint64_t zi_totutil;
int zi_active;
uint_t zi_diskutil;
+ boolean_t zi_underutil;
+ boolean_t zi_overutil;
} zoneio_stats_t;
static sys_lat_cycle_t rd_lat;
@@ -247,6 +276,7 @@ hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
/* time that we last updated per-zone throttle info */
hrtime_t zfs_zone_last_checked = 0;
+hrtime_t zfs_disk_last_laggard = 0;
/*
* Data used to keep track of how often txg sync is running.
@@ -682,13 +712,13 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
* Adjust each IO's delay. If the overall delay becomes too high, avoid
* increasing beyond the ceiling value.
*/
- if (zonep->zone_io_util > fairutil &&
- sp->zi_diskutil > zfs_zone_util_threshold) {
+ if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
zonep->zone_io_util_above_avg = B_TRUE;
if (sp->zi_active > 1)
zfs_zone_delay_inc(zonep);
- } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) {
+ } else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
+ sp->zi_active <= 1) {
zfs_zone_delay_dec(zonep);
}
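(Annotation.) zfs_zone_delay_inc() and zfs_zone_delay_dec() are not shown in this diff; a plausible sketch of their shape is a clamped step adjustment, where the zone_io_delay field and the step and ceiling tunables are assumptions for illustration:

/* Hypothetical sketch; the step/ceiling tunable names are assumed. */
uint_t zfs_zone_delay_step = 5;		/* usec per adjustment */
uint_t zfs_zone_delay_ceiling = 100;	/* max per-zone delay, usec */

static void
zfs_zone_delay_inc(zone_t *zonep)
{
	if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
		zonep->zone_io_delay += zfs_zone_delay_step;
}

static void
zfs_zone_delay_dec(zone_t *zonep)
{
	if (zonep->zone_io_delay > 0)
		zonep->zone_io_delay -= zfs_zone_delay_step;
}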
@@ -716,6 +746,7 @@ static void
zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
{
zoneio_stats_t stats;
+ hrtime_t laggard_udelta = 0;
(void) bzero(&stats, sizeof (stats));
@@ -742,6 +773,23 @@ zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
}
zfs_disk_last_rtime = zfs_disk_rtime;
+ if (unow > zfs_disk_last_laggard)
+ laggard_udelta = unow - zfs_disk_last_laggard;
+
+ /*
+ * To minimize porpoising, we have three separate states for our
+ * assessment of I/O performance: overutilized, underutilized, and
+ * neither overutilized nor underutilized. We will increment the
+ * throttle if a zone is using more than its fair share _and_ I/O
+ * is overutilized; we will decrement the throttle if a zone is using
+ * less than its fair share _or_ I/O is underutilized.
+ */
+ stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
+ laggard_udelta > zfs_zone_laggard_ancient;
+
+ stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
+ laggard_udelta < zfs_zone_laggard_recent;
+
/*
* sdt:::zfs-zone-stats
*
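(Annotation.) With the default tunables, the two assessments above are mutually exclusive and leave a deliberate gap between them: a laggard within the last second can push the system into the overutilized state, no laggard for five seconds pushes it toward underutilized, and anything in between (or a derived utilization between 60 and 80) lands in a neutral state where the throttle holds steady. A standalone restatement of that logic, with a hypothetical enum and helper:

typedef enum {
	ZIO_STATE_UNDERUTIL,	/* decrement zone delays */
	ZIO_STATE_NEUTRAL,	/* hold steady */
	ZIO_STATE_OVERUTIL	/* increment delays for heavy zones */
} io_state_t;

static io_state_t
assess_io_state(uint_t diskutil, hrtime_t laggard_udelta)
{
	/* Low utilization, or no laggard for at least 5 seconds. */
	if (diskutil < zfs_zone_underutil_threshold ||
	    laggard_udelta > zfs_zone_laggard_ancient)
		return (ZIO_STATE_UNDERUTIL);

	/* High utilization and a laggard within the last second. */
	if (diskutil > zfs_zone_util_threshold &&
	    laggard_udelta < zfs_zone_laggard_recent)
		return (ZIO_STATE_OVERUTIL);

	return (ZIO_STATE_NEUTRAL);
}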
@@ -1107,6 +1155,9 @@ zfs_zone_zio_done(zio_t *zp)
if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
return;
+ if (zp->io_dispatched == 0)
+ return;
+
now = gethrtime();
unow = NANO_TO_MICRO(now);
udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
@@ -1137,6 +1188,10 @@ zfs_zone_zio_done(zio_t *zp)
zfs_disk_rcnt--;
zfs_disk_rtime += (now - zfs_disk_rlastupdate);
zfs_disk_rlastupdate = now;
+
+ if (udelta > zfs_zone_laggard_threshold)
+ zfs_disk_last_laggard = unow;
+
mutex_exit(&zfs_disk_lock);
if (zfs_zone_delay_enable) {
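
(Annotation.) Two details of the completion path deserve a note: a zio whose io_dispatched timestamp is zero was never actually issued to disk, so computing udelta from it would yield a meaningless value, hence the early return; and any I/O slower than zfs_zone_laggard_threshold stamps zfs_disk_last_laggard under zfs_disk_lock, which is what the wait-adjust code above consults. Compressed into a single hypothetical helper:

/* Recap of the completion path; the helper name is hypothetical. */
static void
laggard_on_done(zio_t *zp)
{
	hrtime_t now, unow, udelta;

	if (zp->io_dispatched == 0)
		return;		/* never dispatched; no valid latency */

	now = gethrtime();
	unow = NANO_TO_MICRO(now);
	udelta = unow - NANO_TO_MICRO(zp->io_dispatched);

	mutex_enter(&zfs_disk_lock);
	if (udelta > zfs_zone_laggard_threshold)	/* > 50 ms */
		zfs_disk_last_laggard = unow;
	mutex_exit(&zfs_disk_lock);
}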