Diffstat (limited to 'usr/src')
-rw-r--r--	usr/src/uts/common/fs/zfs/zfs_zone.c	67
1 file changed, 61 insertions, 6 deletions
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
index 4c2bdccd66..503416b293 100644
--- a/usr/src/uts/common/fs/zfs/zfs_zone.c
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -187,11 +187,38 @@ uint_t zfs_zone_rw_lat_limit = 10;
 
 /*
  * The I/O throttle will only start delaying zones when it detects disk
- * utilization has reached a certain level. This tunable controls the threshold
- * at which the throttle will start delaying zones. The calculation should
- * correspond closely with the %b column from iostat.
+ * utilization has reached a certain level. This tunable controls the
+ * threshold at which the throttle will start delaying zones. When the number
+ * of vdevs is small, the calculation should correspond closely with the %b
+ * column from iostat -- but as the number of vdevs becomes large, it will
+ * correlate less and less to any single device (therefore making it a poor
+ * approximation for the actual I/O utilization on such systems). We
+ * therefore use our derived utilization conservatively: we know that low
+ * derived utilization does indeed correlate to low I/O use -- but that a high
+ * rate of derived utilization does not necessarily alone denote saturation;
+ * where we see a high rate of utilization, we also look for laggard I/Os to
+ * attempt to detect saturation.
  */
 uint_t zfs_zone_util_threshold = 80;
+uint_t zfs_zone_underutil_threshold = 60;
+
+/*
+ * There are three important tunables here: zfs_zone_laggard_threshold denotes
+ * the threshold at which an I/O is considered to be of notably high latency;
+ * zfs_zone_laggard_recent denotes the number of microseconds before the
+ * current time after which the last laggard is considered to be sufficiently
+ * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
+ * the microseconds before the current time before which the last laggard is
+ * considered to be sufficiently old to merit decreasing the throttle. The
+ * most important tunable of these three is the zfs_zone_laggard_threshold: in
+ * modeling data from a large public cloud, this tunable was found to have a
+ * much greater effect on the throttle than the two time-based thresholds.
+ * This must be set high enough to not result in spurious throttling, but not
+ * so high as to allow pathological I/O to persist in the system.
+ */
+uint_t zfs_zone_laggard_threshold = 50000;	/* 50 ms */
+uint_t zfs_zone_laggard_recent = 1000000;	/* 1000 ms */
+uint_t zfs_zone_laggard_ancient = 5000000;	/* 5000 ms */
 
 /*
  * Throughout this subsystem, our timestamps are in microseconds. Our system
@@ -224,6 +251,8 @@ typedef struct {
 	uint64_t zi_totutil;
 	int zi_active;
 	uint_t zi_diskutil;
+	boolean_t zi_underutil;
+	boolean_t zi_overutil;
 } zoneio_stats_t;
 
 static sys_lat_cycle_t rd_lat;
@@ -247,6 +276,7 @@ hrtime_t zfs_disk_rlastupdate = 0;	/* time last IO dispatched */
 hrtime_t zfs_disk_last_rtime = 0;	/* prev. cycle's zfs_disk_rtime val */
 /* time that we last updated per-zone throttle info */
 hrtime_t zfs_zone_last_checked = 0;
+hrtime_t zfs_disk_last_laggard = 0;
 
 /*
  * Data used to keep track of how often txg sync is running.
@@ -682,13 +712,13 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
 	 * Adjust each IO's delay. If the overall delay becomes too high, avoid
 	 * increasing beyond the ceiling value.
 	 */
-	if (zonep->zone_io_util > fairutil &&
-	    sp->zi_diskutil > zfs_zone_util_threshold) {
+	if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
 		zonep->zone_io_util_above_avg = B_TRUE;
 
 		if (sp->zi_active > 1)
 			zfs_zone_delay_inc(zonep);
-	} else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) {
+	} else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
+	    sp->zi_active <= 1) {
 		zfs_zone_delay_dec(zonep);
 	}
 
@@ -716,6 +746,7 @@ static void
 zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
 {
 	zoneio_stats_t stats;
+	hrtime_t laggard_udelta = 0;
 
 	(void) bzero(&stats, sizeof (stats));
 
@@ -742,6 +773,23 @@ zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
 	}
 	zfs_disk_last_rtime = zfs_disk_rtime;
 
+	if (unow > zfs_disk_last_laggard)
+		laggard_udelta = unow - zfs_disk_last_laggard;
+
+	/*
+	 * To minimize porpoising, we have three separate states for our
+	 * assessment of I/O performance: overutilized, underutilized, and
+	 * neither overutilized nor underutilized. We will increment the
+	 * throttle if a zone is using more than its fair share _and_ I/O
+	 * is overutilized; we will decrement the throttle if a zone is using
+	 * less than its fair share _or_ I/O is underutilized.
+	 */
+	stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
+	    laggard_udelta > zfs_zone_laggard_ancient;
+
+	stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
+	    laggard_udelta < zfs_zone_laggard_recent;
+
 	/*
 	 * sdt:::zfs-zone-stats
 	 *
@@ -1107,6 +1155,9 @@ zfs_zone_zio_done(zio_t *zp)
 	if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
 		return;
 
+	if (zp->io_dispatched == 0)
+		return;
+
 	now = gethrtime();
 	unow = NANO_TO_MICRO(now);
 	udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
 
@@ -1137,6 +1188,10 @@ zfs_zone_zio_done(zio_t *zp)
 		zfs_disk_rcnt--;
 		zfs_disk_rtime += (now - zfs_disk_rlastupdate);
 		zfs_disk_rlastupdate = now;
+
+		if (udelta > zfs_zone_laggard_threshold)
+			zfs_disk_last_laggard = unow;
+
 		mutex_exit(&zfs_disk_lock);
 
 	if (zfs_zone_delay_enable) {
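
To make the mechanism above concrete, here is a minimal userland sketch of the two-signal saturation assessment this change introduces: low derived utilization is trusted on its own, but high derived utilization must be corroborated by a recent laggard I/O before the disks are considered saturated. The constants mirror the tunables in the diff; the function, its types, and its caller are hypothetical stand-ins, not the kernel code.

#include <stdint.h>
#include <stdbool.h>

/* Mirrors of the kernel tunables above (time windows in microseconds). */
#define	UTIL_THRESHOLD		80	/* zfs_zone_util_threshold */
#define	UNDERUTIL_THRESHOLD	60	/* zfs_zone_underutil_threshold */
#define	LAGGARD_RECENT		1000000	/* zfs_zone_laggard_recent */
#define	LAGGARD_ANCIENT		5000000	/* zfs_zone_laggard_ancient */

typedef struct {
	bool underutil;
	bool overutil;
} io_state_t;

/*
 * Hypothetical analogue of the zi_underutil/zi_overutil computation in
 * zfs_zone_wait_adjust(): classify the disks as overutilized,
 * underutilized, or neither, from the derived utilization and the time
 * of the last laggard I/O (both supplied by the caller here).
 */
static io_state_t
assess_io(unsigned int diskutil, uint64_t unow, uint64_t last_laggard)
{
	uint64_t laggard_udelta = 0;
	io_state_t st;

	/* Guard against a timestamp that reads behind the laggard stamp. */
	if (unow > last_laggard)
		laggard_udelta = unow - last_laggard;

	/* Low utilization is trusted alone; so is a sufficiently old laggard. */
	st.underutil = diskutil < UNDERUTIL_THRESHOLD ||
	    laggard_udelta > LAGGARD_ANCIENT;

	/* High utilization must be corroborated by a recent laggard. */
	st.overutil = diskutil > UTIL_THRESHOLD &&
	    laggard_udelta < LAGGARD_RECENT;

	return (st);
}

Note that the two states are deliberately not complementary: between the two utilization thresholds, or between the recent and ancient laggard windows, the assessment is neither, which is what damps the porpoising described in the comment.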
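
The per-zone decision then keys off that assessment together with fair share. A standalone sketch of the logic in zfs_zone_wait_adjust_delay_cb(), in which the delay ceiling, the fair-share inputs, and the delay counter are all hypothetical simplifications of the kernel's zone state:

#include <stdint.h>
#include <stdbool.h>

#define	DELAY_CEILING	100	/* hypothetical cap on the per-zone delay */

/*
 * Bump the delay only when the zone exceeds its fair share _and_ I/O is
 * overutilized; back it off when the zone is under its fair share, I/O
 * is underutilized, or it is the only active zone.
 */
static void
adjust_zone_delay(uint64_t zone_util, uint64_t fairutil, int active_zones,
    bool overutil, bool underutil, unsigned int *delayp)
{
	if (zone_util > fairutil && overutil) {
		/* Never throttle a lone zone, and respect the ceiling. */
		if (active_zones > 1 && *delayp < DELAY_CEILING)
			(*delayp)++;
	} else if (zone_util < fairutil || underutil ||
	    active_zones <= 1) {
		if (*delayp > 0)
			(*delayp)--;
	}
}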
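
Finally, the I/O completion path is what feeds the laggard timestamp. A sketch of that bookkeeping from zfs_zone_zio_done(), with the dispatch and current timestamps as plain microsecond parameters rather than zio fields, and without the kernel's zfs_disk_lock serialization:

#include <stdint.h>

#define	LAGGARD_THRESHOLD	50000	/* zfs_zone_laggard_threshold, usecs */

/* Shared last-laggard stamp; the kernel guards its analogue with a mutex. */
static uint64_t last_laggard;

/*
 * Skip I/Os that were never dispatched (dispatch time of 0), otherwise
 * compute the service time and stamp the laggard time when it is
 * notably high.
 */
static void
io_done(uint64_t dispatched_us, uint64_t unow)
{
	uint64_t udelta;

	if (dispatched_us == 0)
		return;

	udelta = unow - dispatched_us;
	if (udelta > LAGGARD_THRESHOLD)
		last_laggard = unow;
}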