/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright 2018, Joyent, Inc. All rights reserved. */ /* * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to * ZFS I/O resources for each zone. * * I/O contention can be a major pain point on a multi-tenant system. A single * zone can issue a stream of I/O operations, usually synchronous writes, which * disrupt I/O performance for all other zones. This problem is further * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG, * a set of blocks which are atomically synced to disk. The process of * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving * out any pending read operations. * * There are two facets to this capability: the throttle and the scheduler. * * Throttle * * The requirements on the throttle are: * * 1) Ensure consistent and predictable I/O latency across all zones. * 2) Sequential and random workloads have very different characteristics, * so it is a non-starter to track IOPS or throughput. * 3) A zone should be able to use the full disk bandwidth if no other zone * is actively using the disk. * * The throttle has two components: one to track and account for each zone's * I/O requests, and another to throttle each zone's operations when it * exceeds its fair share of disk I/O. When the throttle detects that a zone is * consuming more than is appropriate, each read or write system call is * delayed by up to 100 microseconds, which we've found is sufficient to allow * other zones to interleave I/O requests during those delays. * * Note: The throttle will delay each logical I/O (as opposed to the physical * I/O which will likely be issued asynchronously), so it may be easier to * think of the I/O throttle delaying each read/write syscall instead of the * actual I/O operation. For each zone, the throttle tracks an ongoing average * of read and write operations performed to determine the overall I/O * utilization for each zone. * * The throttle calculates an I/O utilization metric for each zone using the * following formula: * * (# of read syscalls) x (Average read latency) + * (# of write syscalls) x (Average write latency) * * Once each zone has its utilization metric, the I/O throttle will compare I/O * utilization across all zones, and if a zone has a higher-than-average I/O * utilization, system calls from that zone are throttled. That is, if one * zone has a much higher utilization, that zone's delay is increased by 5 * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is * already throttled and has a lower utilization than average, its delay will * be lowered by 5 microseconds. * * The throttle calculation is driven by IO activity, but since IO does not * happen at fixed intervals, timestamps are used to track when the last update * was made and to drive recalculation. * * The throttle recalculates each zone's I/O usage and throttle delay (if any) * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
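 *
 * For illustration only (hypothetical numbers, not measurements): if the
 * disks are saturated and zone A's utilization metric works out to roughly
 * twice its fair share while zone B sits below its share, then on each
 * zfs_zone_adjust_time evaluation zone A's per-syscall delay is stepped up
 * by zfs_zone_delay_step (5 usec) toward zfs_zone_delay_ceiling (100 usec),
 * while zone B's delay is stepped back down toward zero. Once zone A drops
 * back under its share, its delay is stepped back down as well.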
* * Scheduler * * The I/O scheduler manages the vdev queues -- the queues of pending I/Os to * issue to the disks. It only makes scheduling decisions for the two * synchronous I/O queues (read & write). * * The scheduler maintains a count of how many I/Os in the queue are from each zone, and * if one zone has a disproportionately large number of I/Os in the queue, the * scheduler will allow certain I/Os from the underutilized zones to be "bumped" * and pulled from the middle of the queue. This bump allows zones with a small * number of I/Os (so small they may not even be taken into account by the * throttle) to complete quickly instead of waiting behind dozens of I/Os from * other zones. */ #include #include #include #ifndef _KERNEL /* * Stubs for when compiling for user-land. */ void zfs_zone_io_throttle(zfs_zone_iop_type_t type) { } void zfs_zone_zio_init(zio_t *zp) { } void zfs_zone_zio_start(zio_t *zp) { } void zfs_zone_zio_done(zio_t *zp) { } void zfs_zone_zio_dequeue(zio_t *zp) { } void zfs_zone_zio_enqueue(zio_t *zp) { } /*ARGSUSED*/ void zfs_zone_report_txg_sync(void *dp) { } hrtime_t zfs_zone_txg_delay() { return (MSEC2NSEC(10)); } #else /* * The real code. */ #include #include #include #include #include #include #include #include #include #include #include #include /* * The zone throttle delays read and write operations from certain zones based * on each zone's IO utilization. Once a cycle (defined by zfs_zone_cycle_time * below), the delays for each zone are recalculated based on the utilization * over the previous window. */ boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ uint8_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ uint8_t zfs_zone_delay_ceiling = 100; /* usec delay max */ boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ /* * For certain workloads, one zone may be issuing primarily sequential I/O and * another primarily random I/O. The sequential I/O will complete much more * quickly than the random I/O, driving the average system latency for those * operations way down. As a result, the random I/O may be throttled back, even * though the sequential I/O should be throttled to allow the random I/O more * access to the disk. * * This tunable limits the discrepancy between the read and write system * latency. If one becomes excessively high, this tunable prevents the I/O * throttler from exacerbating the imbalance. */ uint_t zfs_zone_rw_lat_limit = 10; /* * The I/O throttle will only start delaying zones when it detects disk * utilization has reached a certain level. This tunable controls the * threshold at which the throttle will start delaying zones. When the number * of vdevs is small, the calculation should correspond closely with the %b * column from iostat -- but as the number of vdevs becomes large, it will * correlate less and less to any single device (therefore making it a poor * approximation for the actual I/O utilization on such systems). We * therefore use our derived utilization conservatively: we know that low * derived utilization does indeed correlate to low I/O use -- but that a high * rate of derived utilization does not necessarily alone denote saturation; * where we see a high rate of utilization, we also look for laggard I/Os to * attempt to detect saturation.
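 *
 * As a purely hypothetical example of the derived utilization: if the
 * aggregate device busy time (zfs_disk_rtime, in nanoseconds) grew by 200 ms
 * over a 250 ms evaluation window, zfs_zone_wait_adjust computes
 * (200,000,000 * 100) / (250,000 * 1000) = 80, i.e. 80% utilization, which
 * just reaches the default zfs_zone_util_threshold below.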
*/ uint_t zfs_zone_util_threshold = 80; uint_t zfs_zone_underutil_threshold = 60; /* * There are three important tunables here: zfs_zone_laggard_threshold denotes * the threshold at which an I/O is considered to be of notably high latency; * zfs_zone_laggard_recent denotes the number of microseconds before the * current time after which the last laggard is considered to be sufficiently * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes * the microseconds before the current time before which the last laggard is * considered to be sufficiently old to merit decreasing the throttle. The * most important tunable of these three is the zfs_zone_laggard_threshold: in * modeling data from a large public cloud, this tunable was found to have a * much greater effect on the throttle than the two time-based thresholds. * This must be set high enough to not result in spurious throttling, but not * so high as to allow pathological I/O to persist in the system. */ uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */ uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */ uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */ /* * Throughout this subsystem, our timestamps are in microseconds. Our system * average cycle is one second or 1 million microseconds. Our zone counter * update cycle is two seconds or 2 million microseconds. We use a longer * duration for that cycle because some ops can see a little over two seconds of * latency when they are being starved by another zone. */ uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ /* * How often the I/O throttle will reevaluate each zone's utilization, in * microseconds. Default is 1/4 sec. */ uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ typedef struct { hrtime_t cycle_start; hrtime_t cycle_lat; hrtime_t sys_avg_lat; uint_t cycle_cnt; } sys_lat_cycle_t; typedef struct { hrtime_t zi_now; uint_t zi_avgrlat; uint_t zi_avgwlat; uint64_t zi_totpri; uint64_t zi_totutil; int zi_active; uint_t zi_diskutil; boolean_t zi_underutil; boolean_t zi_overutil; } zoneio_stats_t; static sys_lat_cycle_t rd_lat; static sys_lat_cycle_t wr_lat; /* * Some basic disk stats to determine disk utilization. The utilization info * for all disks on the system is aggregated into these values. * * Overall disk utilization for the current cycle is calculated as: * * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) * ---------------------------------------------- * ((now - zfs_zone_last_checked) * 1000); */ kmutex_t zfs_disk_lock; /* protects the following: */ uint_t zfs_disk_rcnt; /* Number of outstanding IOs */ hrtime_t zfs_disk_rtime = 0; /* cumulative sum of time performing IO */ hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */ hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */ /* time that we last updated per-zone throttle info */ kmutex_t zfs_last_check_lock; /* protects zfs_zone_last_checked */ hrtime_t zfs_zone_last_checked = 0; hrtime_t zfs_disk_last_laggard = 0; /* * Data used to keep track of how often txg sync is running. */ extern int zfs_txg_timeout; static uint_t txg_last_check; static uint_t txg_cnt; static uint_t txg_sync_rate; boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ /* * Threshold for when zio scheduling should kick in. * * This threshold is based on the zfs_vdev_sync_read_max_active value for the * number of I/Os that can be pending on a device.
If there are more than the * max_active ops already queued up, beyond those already issued to the vdev, * then use zone-based scheduling to get the next synchronous zio. */ uint32_t zfs_zone_schedule_thresh = 10; /* * On each pass of the scheduler we increment the zone's weight (up to this * maximum). The weight is used by the scheduler to prevent starvation so * that zones which haven't been able to do any IO over many iterations * will max out their weight to this value. */ #define SCHED_WEIGHT_MAX 20 /* * Tunables for delay throttling when TXG sync is occurring. * * If the zone is performing a write and we're doing above normal TXG syncing, * then throttle for longer than normal. The zone's wait time is multiplied * by the scale (zfs_zone_txg_throttle_scale). */ int zfs_zone_txg_throttle_scale = 2; hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20); typedef struct { int zq_qdepth; zio_priority_t zq_queue; int zq_priority; int zq_wt; zoneid_t zq_zoneid; } zone_q_bump_t; /* * This uses gethrtime() but returns a value in usecs. */ #define GET_USEC_TIME (gethrtime() / 1000) #define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) /* * Keep track of the zone's ZFS IOPs. * * See the comment on the zfs_zone_io_throttle function for which/how IOPs are * accounted for. * * If the number of ops is >1 then we can just use that value. However, * if the number of ops is <2 then we might have a zone which is trying to do * IO but is not able to get any ops through the system. We don't want to lose * track of this zone so we factor in its decayed count into the current count. * * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. * However, since this calculation is driven by IO activity and since IO does * not happen at fixed intervals, we use a timestamp to see when the last update * was made. If it was more than one cycle ago, then we need to decay the * historical count by the proper number of additional cycles in which no IO was * performed. * * Return a time delta indicating how far into the current cycle we are or 0 * if the last IO was more than a cycle ago. */ static hrtime_t compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) { hrtime_t delta; int gen_cnt; /* * Check if it's time to recompute a new zone count. * If we're still collecting data for the current cycle, return the elapsed time delta. */ delta = unow - cp->cycle_start; if (delta < zfs_zone_cycle_time) return (delta); /* A previous cycle is past, compute the new zone count. */ /* * Figure out how many generations we have to decay the historical * count, since multiple cycles may have elapsed since our last IO. * We depend on int rounding here. */ gen_cnt = (int)(delta / zfs_zone_cycle_time); /* If more than 5 cycles since the last IO, reset count. */ if (gen_cnt > 5) { cp->zone_avg_cnt = 0; } else { /* Update the count. */ int i; /* * If the zone did more than 1 IO, just use its current count * as the historical value, otherwise decay the historical * count and factor that into the new historical count. We * pick a threshold > 1 so that we don't lose track of IO due * to int rounding. */ if (cp->cycle_cnt > 1) cp->zone_avg_cnt = cp->cycle_cnt; else cp->zone_avg_cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); /* * If more than one generation has elapsed since the last * update, decay the values further. */ for (i = 1; i < gen_cnt; i++) cp->zone_avg_cnt = cp->zone_avg_cnt / 2; } /* A new cycle begins. */ cp->cycle_start = unow; cp->cycle_cnt = 0; return (0); } /* * Add IO op data to the zone.
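 *
 * The helper below first lets compute_historical_zone_cnt() (above) roll the
 * cycle over, then bumps the current cycle's count. A worked example of the
 * decay, using made-up numbers: with zone_avg_cnt == 40, a single op
 * (cycle_cnt == 1) in the cycle that just ended, and three full cycles
 * elapsed since cycle_start (gen_cnt == 3), the new historical count is
 * 1 + 40 / 2 = 21, then halved once per additional idle generation:
 * 21 / 2 = 10 and 10 / 2 = 5.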
*/ static void add_zone_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op) { zone_zfs_io_t *iop; mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop == NULL) { mutex_exit(&zpd->zpers_zfs_lock); return; } switch (op) { case ZFS_ZONE_IOP_READ: (void) compute_historical_zone_cnt(unow, &iop->zpers_rd_ops); iop->zpers_rd_ops.cycle_cnt++; break; case ZFS_ZONE_IOP_WRITE: (void) compute_historical_zone_cnt(unow, &iop->zpers_wr_ops); iop->zpers_wr_ops.cycle_cnt++; break; case ZFS_ZONE_IOP_LOGICAL_WRITE: (void) compute_historical_zone_cnt(unow, &iop->zpers_lwr_ops); iop->zpers_lwr_ops.cycle_cnt++; break; } mutex_exit(&zpd->zpers_zfs_lock); } /* * Use a decaying average to keep track of the overall system latency. * * We want to have the recent activity heavily weighted, but if the * activity decreases or stops, then the average should quickly decay * down to the new value. * * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. * However, since this calculation is driven by IO activity and since IO does * not happen at fixed intervals, we use a timestamp to see when the last * update was made. If it was more than one cycle ago, then we need to decay * the average by the proper number of additional cycles in which no IO was * performed. * * Return true if we actually computed a new system average. * If we're still within an active cycle there is nothing to do, return false. */ static boolean_t compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) { hrtime_t delta; int gen_cnt; /* * Check if it's time to recompute a new average. * If we're still collecting data for the current cycle, return false. */ delta = unow - cp->cycle_start; if (delta < zfs_zone_sys_avg_cycle) return (B_FALSE); /* A previous cycle is past, compute a new system average. */ /* * Figure out how many generations we have to decay, since multiple * cycles may have elapsed since our last IO. * We count on int rounding here. */ gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); /* If more than 5 cycles since the last IO, reset average. */ if (gen_cnt > 5) { cp->sys_avg_lat = 0; } else { /* Update the average. */ int i; cp->sys_avg_lat = (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); /* * If more than one generation has elapsed since the last * update, decay the values further. */ for (i = 1; i < gen_cnt; i++) cp->sys_avg_lat = cp->sys_avg_lat / 2; } /* A new cycle begins. */ cp->cycle_start = unow; cp->cycle_cnt = 0; cp->cycle_lat = 0; return (B_TRUE); } static void add_sys_iop(hrtime_t unow, int op, int lat) { switch (op) { case ZFS_ZONE_IOP_READ: (void) compute_new_sys_avg(unow, &rd_lat); atomic_inc_uint(&rd_lat.cycle_cnt); atomic_add_64((uint64_t *)&rd_lat.cycle_lat, (int64_t)lat); break; case ZFS_ZONE_IOP_WRITE: (void) compute_new_sys_avg(unow, &wr_lat); atomic_inc_uint(&wr_lat.cycle_cnt); atomic_add_64((uint64_t *)&wr_lat.cycle_lat, (int64_t)lat); break; } } /* * Get the zone IO counts. */ static uint_t calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) { hrtime_t delta; uint_t cnt; if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { /* * No activity in the current cycle, we already have the * historical data so we'll use that. */ cnt = cp->zone_avg_cnt; } else { /* * If we're less than half way through the cycle then use * the current count plus half the historical count, otherwise * just use the current count.
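 *
 * For example (hypothetical numbers): with cycle_cnt == 10 and
 * zone_avg_cnt == 40, a call 0.8 seconds into the 2 second cycle returns
 * 10 + 40 / 2 = 30, while a call 1.5 seconds in returns just 10.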
*/ if (delta < (zfs_zone_cycle_time / 2)) cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); else cnt = cp->cycle_cnt; } return (cnt); } /* * Get the average read/write latency in usecs for the system. */ static uint_t calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) { if (compute_new_sys_avg(unow, cp)) { /* * No activity in the current cycle, we already have the * historical data so we'll use that. */ return (cp->sys_avg_lat); } else { /* * We're within a cycle; weight the current activity higher * compared to the historical data and use that. */ DTRACE_PROBE3(zfs__zone__calc__wt__avg, uintptr_t, cp->sys_avg_lat, uintptr_t, cp->cycle_lat, uintptr_t, cp->cycle_cnt); return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / (1 + (cp->cycle_cnt * 8))); } } /* * Account for the current IOP on the zone and for the system as a whole. * The latency parameter is in usecs. */ static void add_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) { /* Add op to zone */ add_zone_iop(zpd, unow, op); /* Track system latency */ if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) add_sys_iop(unow, op, lat); } /* * Calculate and return the total number of read ops, write ops and logical * write ops for the given zone. If the zone has issued operations of any type * return a non-zero value, otherwise return 0. */ static int get_zone_io_cnt(hrtime_t unow, zone_zfs_io_t *zpd, uint_t *rops, uint_t *wops, uint_t *lwops) { ASSERT3P(zpd, !=, NULL); *rops = calc_zone_cnt(unow, &zpd->zpers_rd_ops); *wops = calc_zone_cnt(unow, &zpd->zpers_wr_ops); *lwops = calc_zone_cnt(unow, &zpd->zpers_lwr_ops); DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zpd, uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops); return (*rops | *wops | *lwops); } /* * Get the average read/write latency in usecs for the system. */ static void get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) { *rlat = calc_avg_lat(unow, &rd_lat); *wlat = calc_avg_lat(unow, &wr_lat); /* * In an attempt to improve the accuracy of the throttling algorithm, * assume that IO operations can't have zero latency. Instead, assume * a reasonable lower bound for each operation type. If the actual * observed latencies are non-zero, use those latency values instead. */ if (*rlat == 0) *rlat = 1000; if (*wlat == 0) *wlat = 1000; DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat, uintptr_t, *wlat); } /* * Find disk utilization for each zone and average utilization for all active * zones. 
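 *
 * The per-zone metric computed below is
 * rops * avg_read_lat + (wops + lwops) * avg_write_lat. As a hypothetical
 * example: with a 1200 usec average read latency and an 800 usec average
 * write latency, a zone that did 50 reads, 10 physical writes and 40 logical
 * writes over the window scores 50 * 1200 + (10 + 40) * 800 = 100,000.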
*/ static int zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) { zoneio_stats_t *sp = arg; uint_t rops, wops, lwops; zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; zone_zfs_io_t *iop = zpd->zpers_zfsp; ASSERT3P(iop, !=, NULL); mutex_enter(&zpd->zpers_zfs_lock); if (zonep->zone_id == GLOBAL_ZONEID || get_zone_io_cnt(sp->zi_now, iop, &rops, &wops, &lwops) == 0) { mutex_exit(&zpd->zpers_zfs_lock); return (0); } iop->zpers_io_util = (rops * sp->zi_avgrlat) + (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); sp->zi_totutil += iop->zpers_io_util; if (iop->zpers_io_util > 0) { sp->zi_active++; sp->zi_totpri += iop->zpers_zfs_io_pri; } /* * sdt:::zfs-zone-utilization * * arg0: zone ID * arg1: read operations observed during time window * arg2: physical write operations observed during time window * arg3: logical write ops observed during time window * arg4: calculated utilization given read and write ops * arg5: I/O priority assigned to this zone */ DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id, uint_t, rops, uint_t, wops, uint_t, lwops, uint64_t, iop->zpers_io_util, uint16_t, iop->zpers_zfs_io_pri); mutex_exit(&zpd->zpers_zfs_lock); return (0); } static void zfs_zone_delay_inc(zone_zfs_io_t *zpd) { ASSERT3P(zpd, !=, NULL); if (zpd->zpers_io_delay < zfs_zone_delay_ceiling) zpd->zpers_io_delay += zfs_zone_delay_step; } static void zfs_zone_delay_dec(zone_zfs_io_t *zpd) { ASSERT3P(zpd, !=, NULL); if (zpd->zpers_io_delay > 0) zpd->zpers_io_delay -= zfs_zone_delay_step; } /* * For all zones "far enough" away from the average utilization, increase that * zone's delay. Otherwise, reduce its delay. */ static int zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) { zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; zone_zfs_io_t *iop = zpd->zpers_zfsp; zoneio_stats_t *sp = arg; uint8_t delay; uint_t fairutil = 0; ASSERT3P(iop, !=, NULL); mutex_enter(&zpd->zpers_zfs_lock); delay = iop->zpers_io_delay; iop->zpers_io_util_above_avg = 0; /* * Given the calculated total utilization for all zones, calculate the * fair share of I/O for this zone. */ if (zfs_zone_priority_enable && sp->zi_totpri > 0) { fairutil = (sp->zi_totutil * iop->zpers_zfs_io_pri) / sp->zi_totpri; } else if (sp->zi_active > 0) { fairutil = sp->zi_totutil / sp->zi_active; } /* * Adjust each IO's delay. If the overall delay becomes too high, avoid * increasing beyond the ceiling value. */ if (iop->zpers_io_util > fairutil && sp->zi_overutil) { iop->zpers_io_util_above_avg = 1; if (sp->zi_active > 1) zfs_zone_delay_inc(iop); } else if (iop->zpers_io_util < fairutil || sp->zi_underutil || sp->zi_active <= 1) { zfs_zone_delay_dec(iop); } /* * sdt:::zfs-zone-throttle * * arg0: zone ID * arg1: old delay for this zone * arg2: new delay for this zone * arg3: calculated fair I/O utilization * arg4: actual I/O utilization */ DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id, uintptr_t, delay, uintptr_t, iop->zpers_io_delay, uintptr_t, fairutil, uintptr_t, iop->zpers_io_util); mutex_exit(&zpd->zpers_zfs_lock); return (0); } /* * Examine the utilization between different zones, and adjust the delay for * each zone appropriately.
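 *
 * To make the fair-share arithmetic in zfs_zone_wait_adjust_delay_cb (above)
 * concrete, with invented numbers: if zi_totutil is 300,000 and zone A has
 * I/O priority 100 while zone B has priority 50 (zi_totpri == 150), then A's
 * fair share is 300,000 * 100 / 150 = 200,000 and B's is 100,000. If B's
 * utilization is 150,000 and the system is overutilized, B's delay grows by
 * one step; A at the same 150,000 is under its share, so A's delay shrinks.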
*/ static void zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked) { zoneio_stats_t stats; hrtime_t laggard_udelta = 0; (void) bzero(&stats, sizeof (stats)); stats.zi_now = unow; get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) return; /* * Calculate disk utilization for the most recent period. */ if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) { stats.zi_diskutil = 0; } else { stats.zi_diskutil = ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / ((unow - last_checked) * 1000); } zfs_disk_last_rtime = zfs_disk_rtime; if (unow > zfs_disk_last_laggard) laggard_udelta = unow - zfs_disk_last_laggard; /* * To minimize porpoising, we have three separate states for our * assessment of I/O performance: overutilized, underutilized, and * neither overutilized nor underutilized. We will increment the * throttle if a zone is using more than its fair share _and_ I/O * is overutilized; we will decrement the throttle if a zone is using * less than its fair share _or_ I/O is underutilized. */ stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold || laggard_udelta > zfs_zone_laggard_ancient; stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold && laggard_udelta < zfs_zone_laggard_recent; /* * sdt:::zfs-zone-stats * * Statistics observed over the last period: * * arg0: average system read latency * arg1: average system write latency * arg2: number of active zones * arg3: total I/O 'utilization' for all zones * arg4: total I/O priority of all active zones * arg5: calculated disk utilization */ DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat, uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active, uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri, uintptr_t, stats.zi_diskutil); (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); } /* * Callback used to calculate a zone's IO schedule priority. * * We scan the zones looking for ones with ops in the queue. Out of those, * we pick the one that calculates to the highest schedule priority. */ static int get_sched_pri_cb(zone_t *zonep, void *arg) { int pri; uint_t cnt; zone_q_bump_t *qbp = arg; zio_priority_t p = qbp->zq_queue; zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; zone_zfs_io_t *iop; mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop == NULL) { mutex_exit(&zpd->zpers_zfs_lock); return (0); } cnt = iop->zpers_zfs_queued[p]; if (cnt == 0) { iop->zpers_zfs_weight = 0; mutex_exit(&zpd->zpers_zfs_lock); return (0); } /* * On each pass, increment the zone's weight. We use this as input * to the calculation to prevent starvation. The value is reset * each time we issue an IO for this zone so zones which haven't * done any IO over several iterations will see their weight max * out. */ if (iop->zpers_zfs_weight < SCHED_WEIGHT_MAX) iop->zpers_zfs_weight++; /* * This zone's IO priority is the inverse of the number of IOs * the zone has enqueued * zone's configured priority * weight. * The queue depth has already been scaled by 10 to avoid problems * with int rounding. * * This means that zones with fewer IOs in the queue will get * preference unless another zone's assigned priority pulls it * ahead.
The weight is factored in to help ensure that zones * which haven't done IO in a while aren't getting starved. */ pri = (qbp->zq_qdepth / cnt) * iop->zpers_zfs_io_pri * iop->zpers_zfs_weight; /* * If this zone has a higher priority than what we found so far, * it becomes the new leading contender. */ if (pri > qbp->zq_priority) { qbp->zq_zoneid = zonep->zone_id; qbp->zq_priority = pri; qbp->zq_wt = iop->zpers_zfs_weight; } mutex_exit(&zpd->zpers_zfs_lock); return (0); } /* * See if we need to bump a zone's zio to the head of the queue. This is only * done on the two synchronous I/O queues (see the block comment on the * zfs_zone_schedule function). We get the correct vdev_queue_class_t and * queue depth from our caller. * * For single-threaded synchronous processes a zone cannot get more than * 1 op into the queue at a time unless the zone is running multiple processes * in parallel. This can cause an imbalance in performance if there are zones * with many parallel processes (and ops in the queue) vs. other zones which * are doing simple single-threaded processes, such as interactive tasks in the * shell. These zones can get backed up behind a deep queue and their IO * performance will appear to be very poor as a result. This can make the * zone work badly for interactive behavior. * * The scheduling algorithm kicks in once we start to get a deeper queue. * Once that occurs, we look at all of the zones to see which one calculates * to the highest priority. We bump that zone's first zio to the head of the * queue. * * We use a counter on the zone so that we can quickly find how many ops each * zone has in the queue without having to search the entire queue itself. * This scales better since the number of zones is expected to be on the * order of 10-100 whereas the queue depth can be in the range of 50-2000. * In addition, since the zio's in the queue only have the zoneid, we would * have to look up the zone for each zio enqueued and that means the overhead * for scanning the queue each time would be much higher. * * In all cases, we fall back to simply pulling the next op off the queue * if something should go wrong. */ static zio_t * get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p, avl_tree_t *tree) { zone_q_bump_t qbump; zio_t *zp = NULL, *zphead; int cnt = 0; /* To avoid problems with int rounding, scale the queue depth by 10 */ qbump.zq_qdepth = qdepth * 10; qbump.zq_priority = 0; qbump.zq_zoneid = 0; qbump.zq_queue = p; (void) zone_walk(get_sched_pri_cb, &qbump); zphead = avl_first(tree); /* Check if the scheduler didn't pick a zone for some reason!? */ if (qbump.zq_zoneid != 0) { for (zp = avl_first(tree); zp != NULL; zp = avl_walk(tree, zp, AVL_AFTER)) { if (zp->io_zoneid == qbump.zq_zoneid) break; cnt++; } } if (zp == NULL) { zp = zphead; } else if (zp != zphead) { /* * Only fire the probe if we actually picked a different zio * than the one already at the head of the queue. */ DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid, uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt); } return (zp); } /* * Add our zone ID to the zio so we can keep track of which zones are doing * what, even when the current thread processing the zio is not associated * with the zone (e.g. the kernel taskq which pushes out TX groups). */ void zfs_zone_zio_init(zio_t *zp) { zone_t *zonep = curzone; zp->io_zoneid = zonep->zone_id; } /* * Track and throttle IO operations per zone. 
Called from: - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes * go through this path) * - arc_read for read ops that miss the ARC (both dataset and zvol) * For each operation, increment that zone's counter based on the type of * operation, then delay the operation, if necessary. * * There are three basic ways that we can see write ops: * 1) An application does write syscalls. Those ops go into a TXG which * we'll count here. Sometime later a kernel taskq thread (we'll see the * vdev IO as zone 0) will perform some number of physical writes to commit * the TXG to disk. Those writes are not associated with the zone which * made the write syscalls and the number of operations is not correlated * between the taskq and the zone. We only see logical writes in this * function, we see the physical writes in the zfs_zone_zio_start and * zfs_zone_zio_done functions. * 2) An application opens a file with O_SYNC. Each write will result in * an operation which we'll see here plus a low-level vdev write from * that zone. * 3) An application does write syscalls followed by an fsync(). We'll * count the writes going into a TXG here. We'll also see some number * (usually much smaller, maybe only 1) of low-level vdev writes from this * zone when the fsync is performed, plus some other low-level vdev writes * from the taskq in zone 0 (are these metadata writes?). * * In addition to the above, there are misc. system-level writes, such as * writing out dirty pages to swap, or sync(2) calls, which will be handled * by the global zone and which we count but don't generally worry about. * * Because of the above, we can see writes twice; first because this function * is always called by a zone thread for logical writes, but then we also will * count the physical writes that are performed at a low level via * zfs_zone_zio_start. Without this, it can look like a non-global zone never * writes (case 1). Depending on when the TXG is synced, the counts may be in * the same sample bucket or in a different one. * * Tracking read operations is simpler due to their synchronous semantics. The * zfs_read function -- called as a result of a read(2) syscall -- will always * retrieve the data to be read through arc_read and we only come into this * function when we have an arc miss. */ void zfs_zone_io_throttle(zfs_zone_iop_type_t type) { zoneid_t zid = curzone->zone_id; zone_persist_t *zpd = &zone_pdata[zid]; zone_zfs_io_t *iop; hrtime_t unow; uint16_t wait; unow = GET_USEC_TIME; /* * Only bump the counter for logical writes here. The counters for * tracking physical IO operations are handled in zfs_zone_zio_done. */ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { add_iop(zpd, unow, type, 0); } if (!zfs_zone_delay_enable) return; mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop == NULL) { mutex_exit(&zpd->zpers_zfs_lock); return; } /* * If the zone's I/O priority is set to zero, don't throttle that zone's * operations at all.
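 *
 * (The per-zone priority checked below is assumed to come from the zone's
 * configured zfs-io-priority setting -- the zone.zfs-io-priority rctl on
 * SmartOS -- so a zone configured with a priority of 0 is effectively
 * exempt from the throttle.)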
*/ if (iop->zpers_zfs_io_pri == 0) { mutex_exit(&zpd->zpers_zfs_lock); return; } /* Handle periodically updating the per-zone I/O parameters */ if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { hrtime_t last_checked; boolean_t do_update = B_FALSE; /* Recheck under mutex */ mutex_enter(&zfs_last_check_lock); last_checked = zfs_zone_last_checked; if ((unow - last_checked) > zfs_zone_adjust_time) { zfs_zone_last_checked = unow; do_update = B_TRUE; } mutex_exit(&zfs_last_check_lock); if (do_update) { mutex_exit(&zpd->zpers_zfs_lock); zfs_zone_wait_adjust(unow, last_checked); mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop == NULL) { mutex_exit(&zpd->zpers_zfs_lock); return; } } } wait = iop->zpers_io_delay; mutex_exit(&zpd->zpers_zfs_lock); if (wait > 0) { /* * If this is a write and we're doing above normal TXG * syncing, then throttle for longer than normal. */ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && (txg_cnt > 1 || txg_sync_rate > 1)) wait *= zfs_zone_txg_throttle_scale; /* * sdt:::zfs-zone-wait * * arg0: zone ID * arg1: type of IO operation * arg2: time to delay (in us) */ DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zid, uintptr_t, type, uintptr_t, wait); drv_usecwait(wait); if (curzone->zone_vfs_stats != NULL) { atomic_inc_64(&curzone->zone_vfs_stats-> zv_delay_cnt.value.ui64); atomic_add_64(&curzone->zone_vfs_stats-> zv_delay_time.value.ui64, wait); } } } /* * XXX Ignore the pool pointer parameter for now. * * Keep track to see if the TXG sync rate is running above the expected rate. * If so, this implies that we are filling TXG's at a high rate due to a heavy * write workload. We use this as input into the zone throttle. * * This function is called every 5 seconds (zfs_txg_timeout) under a normal * write load. In this case, the sync rate is going to be 1. When there * is a heavy write load, TXG's fill up fast and the sync thread will write * the TXG more frequently (perhaps once a second). In this case the rate * will be > 1. The sync rate is a lagging indicator since it can be up * to 5 seconds old. We use the txg_cnt to keep track of the rate in the * current 5 second interval and txg_sync_rate to keep track of the previous * 5 second interval. In that way we don't have a period (1 or more seconds) * where the txg_cnt == 0 and we cut back on throttling even though the rate * is still high. */ /*ARGSUSED*/ void zfs_zone_report_txg_sync(void *dp) { uint_t now; txg_cnt++; now = (uint_t)(gethrtime() / NANOSEC); if ((now - txg_last_check) >= zfs_txg_timeout) { txg_sync_rate = txg_cnt / 2; txg_cnt = 0; txg_last_check = now; } } hrtime_t zfs_zone_txg_delay() { zone_persist_t *zpd = &zone_pdata[curzone->zone_id]; zone_zfs_io_t *iop; uint8_t above; mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop == NULL) { mutex_exit(&zpd->zpers_zfs_lock); return (0); } above = iop->zpers_io_util_above_avg; mutex_exit(&zpd->zpers_zfs_lock); if (above) { return (zfs_zone_txg_delay_nsec); } return (MSEC2NSEC(10)); } /* * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline * and is issued. * Keep track of start time for latency calculation in zfs_zone_zio_done. */ void zfs_zone_zio_start(zio_t *zp) { zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; zone_zfs_io_t *iop; /* * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for * an actual I/O operation. Ignore those operations as they relate to * throttling and scheduling. 
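 *
 * Note that the zfs_disk_rcnt/zfs_disk_rtime/zfs_disk_rlastupdate updates
 * below accumulate aggregate busy time only while at least one I/O is
 * outstanding, in the same spirit as the kstat run-queue accounting behind
 * iostat's %b column; zfs_zone_wait_adjust later converts that busy time
 * into the zi_diskutil percentage.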
*/ if (zp->io_type == ZIO_TYPE_IOCTL) return; mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop != NULL) { if (zp->io_type == ZIO_TYPE_READ) kstat_runq_enter(&iop->zpers_zfs_rwstats); iop->zpers_zfs_weight = 0; } mutex_exit(&zpd->zpers_zfs_lock); mutex_enter(&zfs_disk_lock); zp->io_dispatched = gethrtime(); if (zfs_disk_rcnt++ != 0) zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); zfs_disk_rlastupdate = zp->io_dispatched; mutex_exit(&zfs_disk_lock); } /* * Called from vdev_disk_io_done when an IO completes. * Increment our counter for zone ops. * Calculate the IO latency avg. for this zone. */ void zfs_zone_zio_done(zio_t *zp) { zone_persist_t *zpd; zone_zfs_io_t *iop; hrtime_t now, unow, udelta; if (zp->io_type == ZIO_TYPE_IOCTL) return; if (zp->io_dispatched == 0) return; zpd = &zone_pdata[zp->io_zoneid]; now = gethrtime(); unow = NANO_TO_MICRO(now); udelta = unow - NANO_TO_MICRO(zp->io_dispatched); mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop != NULL) { /* * To calculate the wsvc_t average, keep a cumulative sum of * all the wait time before each I/O was dispatched. Since most * writes are asynchronous, only track the wait time for * read I/Os. */ if (zp->io_type == ZIO_TYPE_READ) { iop->zpers_zfs_rwstats.reads++; iop->zpers_zfs_rwstats.nread += zp->io_size; iop->zpers_zfs_rd_waittime += zp->io_dispatched - zp->io_timestamp; kstat_runq_exit(&iop->zpers_zfs_rwstats); } else { iop->zpers_zfs_rwstats.writes++; iop->zpers_zfs_rwstats.nwritten += zp->io_size; } } mutex_exit(&zpd->zpers_zfs_lock); mutex_enter(&zfs_disk_lock); zfs_disk_rcnt--; zfs_disk_rtime += (now - zfs_disk_rlastupdate); zfs_disk_rlastupdate = now; if (udelta > zfs_zone_laggard_threshold) zfs_disk_last_laggard = unow; mutex_exit(&zfs_disk_lock); if (zfs_zone_delay_enable) { add_iop(zpd, unow, zp->io_type == ZIO_TYPE_READ ? ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); } /* * sdt:::zfs-zone-latency * * arg0: zone ID * arg1: type of I/O operation * arg2: I/O latency (in us) */ DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid, uintptr_t, zp->io_type, uintptr_t, udelta); } void zfs_zone_zio_dequeue(zio_t *zp) { zio_priority_t p; zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; zone_zfs_io_t *iop; p = zp->io_priority; if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) return; /* We depend on p being defined as either 0 or 1 */ ASSERT(p < 2); mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop != NULL) { ASSERT(iop->zpers_zfs_queued[p] > 0); if (iop->zpers_zfs_queued[p] == 0) { cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); } else { iop->zpers_zfs_queued[p]--; } } mutex_exit(&zpd->zpers_zfs_lock); } void zfs_zone_zio_enqueue(zio_t *zp) { zio_priority_t p; zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; zone_zfs_io_t *iop; p = zp->io_priority; if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) return; /* We depend on p being defined as either 0 or 1 */ ASSERT(p < 2); mutex_enter(&zpd->zpers_zfs_lock); iop = zpd->zpers_zfsp; if (iop != NULL) { iop->zpers_zfs_queued[p]++; } mutex_exit(&zpd->zpers_zfs_lock); } /* * Called from vdev_queue_io_to_issue. That function is where zio's are listed * in FIFO order on one of the sync queues, then pulled off (by * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling * here to find a zone's zio deeper in the sync queue and issue that instead * of simply doing FIFO. * * We only do zone-based zio scheduling for the two synchronous I/O queues * (read & write). 
These queues are normally serviced in FIFO order but we * may decide to move a zone's zio to the head of the line. A typical I/O * load will be mostly synchronous reads and some asynchronous writes (which * are scheduled differently due to transaction groups). There will also be * some synchronous writes for those apps which want to ensure their data is on * disk. We want to make sure that a zone with a single-threaded app (e.g. the * shell) that is doing synchronous I/O (typically reads) isn't penalized by * other zones which are doing lots of synchronous I/O because they have many * running threads. * * The vq->vq_lock mutex is held when we're executing this function so we * can safely access the "last zone" variable on the queue. */ zio_t * zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx, avl_tree_t *tree) { vdev_queue_class_t *vqc = &vq->vq_class[p]; uint_t cnt; zoneid_t last_zone; zio_t *zio; ASSERT(MUTEX_HELD(&vq->vq_lock)); /* Don't change the order on the LBA ordered queues. */ if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) return (avl_nearest(tree, idx, AVL_AFTER)); /* We depend on p being defined as either 0 or 1 */ ASSERT(p < 2); cnt = avl_numnodes(tree); last_zone = vq->vq_last_zone_id; /* * If there are only a few zios in the queue then just issue the head. * If there are more than a few zios already queued up, then use * scheduling to get the next zio. */ if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) zio = avl_nearest(tree, idx, AVL_AFTER); else zio = get_next_zio(vqc, cnt, p, tree); vq->vq_last_zone_id = zio->io_zoneid; /* * Probe with 4 args; the number of IOs in the queue, the zone that * was last scheduled off this queue, the zone that was associated * with the next IO that is scheduled, and which queue (priority). */ DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone, uint_t, zio->io_zoneid, uint_t, p); return (zio); } #endif
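/*
 * Illustrative only, not part of the implementation: one way to watch the
 * throttle from the global zone is via the SDT probes documented above,
 * assuming DTrace is available and the probe names are unchanged, e.g.:
 *
 *   dtrace -n 'sdt:::zfs-zone-throttle
 *       { printf("zone %d delay %d -> %d us", (int)arg0, (int)arg1,
 *       (int)arg2); }'
 */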