Diffstat (limited to 'usr/src/uts/common/fs/zfs/zfs_zone.c')
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_zone.c | 1179
1 file changed, 1179 insertions, 0 deletions
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..08f4f38e04 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +int +zfs_zone_txg_delay() +{ + return (1); +} + +#else + +/* + * The real code. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilitzation. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint16_t zfs_zone_delay_step = 5; /* amount to change delay */ +uint16_t zfs_zone_delay_ceiling = 100; /* longest possible delay */ + +hrtime_t zfs_zone_last_checked = 0; + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the threshold + * at which the throttle will start delaying zones. The calculation should + * correspond closely with the %b column from iostat. 
+ */ +uint_t zfs_zone_util_threshold = 80; + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. + */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + int cycle_cnt; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. + */ +kmutex_t zfs_disk_lock; +uint_t zfs_disk_rcnt; +hrtime_t zfs_disk_rtime = 0; +hrtime_t zfs_disk_rlastupdate = 0; + +hrtime_t zfs_disk_last_rtime = 0; + +/* + * Data used to keep track of how often txg flush is running. + */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_flush_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the + * number of I/Os that can be pending on a device. If there are more than a + * few ops already queued up, beyond those already issued to the vdev, then + * use scheduling to get the next zio. + */ +int zfs_zone_schedule_thresh = 5; + +/* + * Tunables for delay throttling when TxG flush is occurring. + */ +int zfs_zone_txg_throttle_scale = 2; +int zfs_zone_txg_delay_ticks = 2; + +typedef struct { + int zq_qdepth; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new historical count. + * If we're still within an active cycle there is nothing to do, return false. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. 
+ */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. */ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. */ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. + */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. + */ +static void +add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops); + zonep->zone_rd_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops); + zonep->zone_wr_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_LOGICAL_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops); + zonep->zone_lwr_ops.cycle_cnt++; + break; + } +} + +/* + * Use a decaying average to keep track of the overall system latency. + * + * We want to have the recent activity heavily weighted, but if the + * activity decreases or stops, then the average should quickly decay + * down to the new value. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. + * However, since this calculation is driven by IO activity and since IO does + * not happen + * + * at fixed intervals, we use a timestamp to see when the last update was made. + * If it was more than one cycle ago, then we need to decay the average by the + * proper number of additional cycles in which no IO was performed. + * + * Return true if we actually computed a new system average. + * If we're still within an active cycle there is nothing to do, return false. + */ +static int +compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new average. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_sys_avg_cycle) + return (0); + + /* A previous cycle is past, compute a new system average. */ + + /* + * Figure out how many generations we have to decay, since multiple + * cycles may have elapsed since our last IO. + * We count on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); + + /* If more than 5 cycles since last the IO, reset average. */ + if (gen_cnt > 5) { + cp->sys_avg_lat = 0; + } else { + /* Update the average. 
*/ + int i; + + cp->sys_avg_lat = + (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->sys_avg_lat = cp->sys_avg_lat / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + cp->cycle_lat = 0; + + return (1); +} + +static void +add_sys_iop(hrtime_t unow, int op, int lat) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_new_sys_avg(unow, &rd_lat); + rd_lat.cycle_cnt++; + rd_lat.cycle_lat += lat; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_new_sys_avg(unow, &wr_lat); + wr_lat.cycle_cnt++; + wr_lat.cycle_lat += lat; + break; + } +} + +/* + * Get the zone IO counts. + */ +static uint_t +calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + uint_t cnt; + + if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + cnt = cp->zone_avg_cnt; + } else { + /* + * If we're less than half way through the cycle then use + * the current count plus half the historical count, otherwise + * just use the current count. + */ + if (delta < (zfs_zone_cycle_time / 2)) + cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); + else + cnt = cp->cycle_cnt; + } + + return (cnt); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. + */ + extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t, + uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__calc__wt__avg( + (uintptr_t)cp->sys_avg_lat, + (uintptr_t)cp->cycle_lat, + (uintptr_t)cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zonep, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); + *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); + *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + + extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id, + (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. 
+ */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. + */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat), + (uintptr_t)*wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. + */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { + zonep->zone_io_util = 0; + return (0); + } + + zonep->zone_io_util = (rops * sp->zi_avgrlat) + + (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); + sp->zi_totutil += zonep->zone_io_util; + + if (zonep->zone_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += zonep->zone_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + extern void __dtrace_probe_zfs__zone__utilization( + uint_t, uint_t, uint_t, uint_t, uint_t, uint_t); + + __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id), + (uint_t)rops, (uint_t)wops, (uint_t)lwops, + (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_t *zonep) +{ + if (zonep->zone_io_delay < zfs_zone_delay_ceiling) + zonep->zone_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_t *zonep) +{ + if (zonep->zone_io_delay > 0) + zonep->zone_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. + */ +static int +zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint16_t delay = zonep->zone_io_delay; + uint_t fairutil = 0; + + zonep->zone_io_util_above_avg = B_FALSE; + + /* + * Given the calculated total utilitzation for all zones, calculate the + * fair share of I/O for this zone. + */ + if (zfs_zone_priority_enable && sp->zi_totpri > 0) { + fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) / + sp->zi_totpri; + } else if (sp->zi_active > 0) { + fairutil = sp->zi_totutil / sp->zi_active; + } + + /* + * Adjust each IO's delay. If the overall delay becomes too high, avoid + * increasing beyond the ceiling value. 
+ */ + if (zonep->zone_io_util > fairutil && + sp->zi_diskutil > zfs_zone_util_threshold) { + zonep->zone_io_util_above_avg = B_TRUE; + + if (sp->zi_active > 1) + zfs_zone_delay_inc(zonep); + } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) { + zfs_zone_delay_dec(zonep); + } + + /* + * sdt:::zfs-zone-throttle + * + * arg0: zone ID + * arg1: old delay for this zone + * arg2: new delay for this zone + * arg3: calculated fair I/O utilization + * arg4: actual I/O utilization + */ + extern void __dtrace_probe_zfs__zone__throttle( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__throttle( + (uintptr_t)zonep->zone_id, (uintptr_t)delay, + (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil, + (uintptr_t)zonep->zone_io_util); + + return (0); +} + +/* + * Examine the utilization between different zones, and adjust the delay for + * each zone appropriately. + */ +static void +zfs_zone_wait_adjust(hrtime_t unow) +{ + zoneio_stats_t stats; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - zfs_zone_last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + extern void __dtrace_probe_zfs__zone__stats( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat), + (uintptr_t)(stats.zi_avgwlat), + (uintptr_t)(stats.zi_active), + (uintptr_t)(stats.zi_totutil), + (uintptr_t)(stats.zi_totpri), + (uintptr_t)(stats.zi_diskutil)); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + zone_q_bump_t *qbp = arg; + + extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id), + (uintptr_t)(zonep->zone_zfs_queued)); + + if (zonep->zone_zfs_queued == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. 
+ */ + if (zonep->zone_zfs_weight < 20) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * schedule it next. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. + * + * For single-threaded synchronous workloads a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple workloads + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel workloads (and ops in the queue) vs. other zones which + * are doing simple single-threaded workloads, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. + * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_t *vq, int qdepth) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(&vq->vq_deadline_tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL; + zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. 
+ */ + extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__sched__bump( + (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt), + (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt)); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out RX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track IO operations per zone. Called from dmu_tx_count_write for write ops + * and dmu_read_uio for read ops. For each operation, increment that zone's + * counter based on the type of operation. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice because this is called + * at a high level by a zone thread, but we also will count the phys. writes + * that are performed at a low level via zfs_zone_zio_start. + * + * Without this, it can look like a non-global zone never writes (case 1). + * Depending on when the TXG is flushed, the counts may be in the same sample + * bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through dmu_read_uio. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counters for logical operations here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. 
+ */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + zfs_zone_wait_adjust(unow); + zfs_zone_last_checked = unow; + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TxG + * flushing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_flush_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + extern void __dtrace_probe_zfs__zone__wait( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id), + (uintptr_t)type, (uintptr_t)wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TxG flush rate is running above the expected rate. + * If so, this implies that we are filling TxG's at a high rate due to a heavy + * write workload. We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the flush rate is going to be 1. When there + * is a heavy write load, TxG's fill up fast and the sync thread will write + * the TxG more frequently (perhaps once a second). In this case the rate + * will be > 1. The flush rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_flush_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_flush_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +int +zfs_zone_txg_delay() +{ + zone_t *zonep = curzone; + int delay = 1; + + if (zonep->zone_io_util_above_avg) + delay = zfs_zone_txg_delay_ticks; + + extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id), + (uintptr_t)delay); + + return (delay); +} + +/* + * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. 
+ */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os. + */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_start; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? + ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + extern void __dtrace_probe_zfs__zone__latency( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid), + (uintptr_t)(zp->io_type), (uintptr_t)(udelta)); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued > 0); + if (zonep->zone_zfs_queued == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. This function is where zio's are found + * at the head of the queue (by avl_first), then pulled off (by + * vdev_queue_io_remove) and issued. We do our scheduling here to find the + * next zio to issue. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. 
+ */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq) +{ + int cnt; + zoneid_t last_zone; + zio_t *zp; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + cnt = avl_numnodes(&vq->vq_deadline_tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few ops in the queue then just issue the head. + * If there are more than a few ops already queued up, then use + * scheduling to get the next zio. + */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zp = avl_first(&vq->vq_deadline_tree); + else + zp = get_next_zio(vq, cnt); + + vq->vq_last_zone_id = zp->io_zoneid; + + /* + * Probe with 3 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, and the zone that was associated + * with the next IO that is scheduled. + */ + extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sched((uintptr_t)(cnt), + (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid)); + + return (zp); +} + +#endif |
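
The sketch below is not part of the change above; it is a minimal user-land illustration of the decaying-average bookkeeping described in the comments of compute_new_sys_avg() and calc_avg_lat(): one-second cycles, folding the finished cycle into a carried average, decaying that average further when cycles pass with no IO, and forgetting it entirely after more than five idle cycles. All names ending in _demo, and the sample latencies, are invented for the example.

/*
 * Illustrative user-land sketch only; mirrors the decay logic described
 * in compute_new_sys_avg(), it is not the kernel code itself.
 */
#include <stdio.h>

typedef long long hrtime_demo_t;

#define CYCLE_USEC	1000000LL	/* matches zfs_zone_sys_avg_cycle (1 s) */

typedef struct {
	hrtime_demo_t cycle_start;	/* start of the current cycle (usecs) */
	int cycle_cnt;			/* ops seen in the current cycle */
	hrtime_demo_t cycle_lat;	/* summed latency in the current cycle */
	hrtime_demo_t avg_lat;		/* decayed average carried across cycles */
} lat_cycle_demo_t;

/* Fold a finished cycle into the carried average; halve per extra idle cycle. */
static void
roll_cycle_demo(lat_cycle_demo_t *cp, hrtime_demo_t unow)
{
	hrtime_demo_t delta = unow - cp->cycle_start;
	int gen, i;

	if (delta < CYCLE_USEC)
		return;			/* still inside the current cycle */

	gen = (int)(delta / CYCLE_USEC);
	if (gen > 5) {
		cp->avg_lat = 0;	/* long idle period: drop the history */
	} else {
		cp->avg_lat =
		    (cp->avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
		for (i = 1; i < gen; i++)
			cp->avg_lat /= 2;	/* extra idle cycles decay it */
	}

	cp->cycle_start = unow;
	cp->cycle_cnt = 0;
	cp->cycle_lat = 0;
}

int
main(void)
{
	lat_cycle_demo_t rd = { 0 };
	hrtime_demo_t now;

	/* First second: three reads totalling 1200 usecs of latency. */
	rd.cycle_cnt = 3;
	rd.cycle_lat = 1200;
	now = 1 * CYCLE_USEC;
	roll_cycle_demo(&rd, now);
	printf("avg after 1 busy cycle:  %lld us\n", rd.avg_lat);	/* 300 */

	/* Next IO arrives 3 s later: the idle cycles decay the average. */
	now = 4 * CYCLE_USEC;
	roll_cycle_demo(&rd, now);
	printf("avg after idle cycles:   %lld us\n", rd.avg_lat);	/* 75 */
	return (0);
}

A second sketch, under the same caveats: the fair-share and delay-step arithmetic described for zfs_zone_wait_adjust_delay_cb(), reduced to plain integers. The constants mirror zfs_zone_delay_step, zfs_zone_delay_ceiling and zfs_zone_util_threshold from the file; the zone priorities and utilization figures in main() are made up.

/*
 * Illustrative sketch of the per-zone delay adjustment; not the kernel code.
 */
#include <stdio.h>

#define DELAY_STEP	5	/* matches zfs_zone_delay_step */
#define DELAY_CEILING	100	/* matches zfs_zone_delay_ceiling */
#define UTIL_THRESH	80	/* matches zfs_zone_util_threshold (%b) */

/* Nudge one zone's delay up toward the ceiling or back down toward zero. */
static int
adjust_delay_demo(int delay, unsigned long util, unsigned long fairutil,
    unsigned int diskutil, int active_zones)
{
	if (util > fairutil && diskutil > UTIL_THRESH) {
		if (active_zones > 1 && delay < DELAY_CEILING)
			delay += DELAY_STEP;
	} else if (util < fairutil || active_zones <= 1) {
		if (delay > 0)
			delay -= DELAY_STEP;
	}
	return (delay);
}

int
main(void)
{
	/* Two active zones with I/O priorities 100 and 20. */
	unsigned long totutil = 60000, totpri = 120;
	unsigned long fair_hi = totutil * 100 / totpri;	/* 50000 */
	unsigned long fair_lo = totutil * 20 / totpri;	/* 10000 */

	/* Disk is 90% busy; each zone is currently using 30000 units. */
	printf("high-pri zone delay: %d us\n",
	    adjust_delay_demo(10, 30000, fair_hi, 90, 2));	/* under share: 5 */
	printf("low-pri  zone delay: %d us\n",
	    adjust_delay_demo(10, 30000, fair_lo, 90, 2));	/* over share: 15 */
	return (0);
}

Compiled and run, the first program prints 300 then 75, and the second prints 5 for the zone below its fair share and 15 for the zone above it, which is the direction of adjustment the throttle comments describe.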