/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2011, Joyent, Inc. All rights reserved.
 */

/* NB: the header names below were lost in extraction; reconstructed. */
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_zone.h>

#ifndef _KERNEL

/*
 * Stubs for when compiling for user-land.
 */

void
zfs_zone_io_throttle(zfs_zone_iop_type_t type)
{
}

void
zfs_zone_zio_init(zio_t *zp)
{
}

void
zfs_zone_zio_start(zio_t *zp)
{
}

void
zfs_zone_zio_done(zio_t *zp)
{
}

void
zfs_zone_zio_dequeue(zio_t *zp)
{
}

void
zfs_zone_zio_enqueue(zio_t *zp)
{
}

/*ARGSUSED*/
void
zfs_zone_report_txg_sync(void *dp)
{
}

int
zfs_zone_txg_delay()
{
	return (1);
}

#else

/*
 * The real code.
 */

/* NB: the header names below were lost in extraction; reconstructed. */
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/atomic.h>
#include <sys/zio.h>
#include <sys/zone.h>
#include <sys/avl.h>
#include <sys/sdt.h>
#include <sys/ddi.h>

/*
 * The zone throttle delays read and write operations from certain zones based
 * on each zone's IO utilization.  Once a cycle (defined by zfs_zone_cycle_time
 * below), the delays for each zone are recalculated based on the utilization
 * over the previous window.
 */
boolean_t	zfs_zone_delay_enable = B_TRUE;	/* enable IO throttle */
uint16_t	zfs_zone_delay_step = 5;	/* amount to change delay */
uint16_t	zfs_zone_delay_ceiling = 100;	/* longest possible delay */

hrtime_t	zfs_zone_last_checked = 0;

boolean_t	zfs_zone_priority_enable = B_TRUE;	/* enable IO priority */

/*
 * For certain workloads, one zone may be issuing primarily sequential I/O and
 * another primarily random I/O.  The sequential I/O will complete much more
 * quickly than the random I/O, driving the average system latency for those
 * operations way down.  As a result, the random I/O may be throttled back,
 * even though the sequential I/O should be throttled to allow the random I/O
 * more access to the disk.
 *
 * This tunable limits the discrepancy between the read and write system
 * latency.  If one becomes excessively high, this tunable prevents the I/O
 * throttler from exacerbating the imbalance.
 */
uint_t		zfs_zone_rw_lat_limit = 10;

/*
 * The I/O throttle will only start delaying zones when it detects disk
 * utilization has reached a certain level.  This tunable controls the
 * threshold at which the throttle will start delaying zones.  The calculation
 * should correspond closely with the %b column from iostat.
 */
uint_t		zfs_zone_util_threshold = 80;
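/*
 * For example (illustrative numbers, not from measurement): if the disk was
 * busy for roughly 850 ms out of the last one-second window, the computed
 * utilization is 85, which exceeds the default threshold of 80, so the
 * throttle begins delaying zones that are above their fair share.
 */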
/*
 * Throughout this subsystem, our timestamps are in microseconds.  Our system
 * average cycle is one second or 1 million microseconds.  Our zone counter
 * update cycle is two seconds or 2 million microseconds.  We use a longer
 * duration for that cycle because some ops can see a little over two seconds
 * of latency when they are being starved by another zone.
 */
uint_t		zfs_zone_sys_avg_cycle = 1000000;	/* 1 s */
uint_t		zfs_zone_cycle_time = 2000000;		/* 2 s */
uint_t		zfs_zone_adjust_time = 250000;		/* 250 ms */

typedef struct {
	hrtime_t	cycle_start;
	int		cycle_cnt;
	hrtime_t	cycle_lat;
	hrtime_t	sys_avg_lat;
} sys_lat_cycle_t;

typedef struct {
	hrtime_t	zi_now;
	uint_t		zi_avgrlat;
	uint_t		zi_avgwlat;
	uint64_t	zi_totpri;
	uint64_t	zi_totutil;
	int		zi_active;
	uint_t		zi_diskutil;
} zoneio_stats_t;

static sys_lat_cycle_t	rd_lat;
static sys_lat_cycle_t	wr_lat;

/*
 * Some basic disk stats to determine disk utilization.
 */
kmutex_t	zfs_disk_lock;
uint_t		zfs_disk_rcnt;
hrtime_t	zfs_disk_rtime = 0;
hrtime_t	zfs_disk_rlastupdate = 0;

hrtime_t	zfs_disk_last_rtime = 0;

/*
 * Data used to keep track of how often txg flush is running.
 */
extern int	zfs_txg_timeout;
static uint_t	txg_last_check;
static uint_t	txg_cnt;
static uint_t	txg_flush_rate;

boolean_t	zfs_zone_schedule_enable = B_TRUE;	/* enable IO sched. */

/*
 * Threshold for when zio scheduling should kick in.
 *
 * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the
 * number of I/Os that can be pending on a device.  If there are more than a
 * few ops already queued up, beyond those already issued to the vdev, then
 * use scheduling to get the next zio.
 */
int		zfs_zone_schedule_thresh = 5;

/*
 * Tunables for delay throttling when TxG flush is occurring.
 */
int		zfs_zone_txg_throttle_scale = 2;
int		zfs_zone_txg_delay_ticks = 2;

typedef struct {
	int		zq_qdepth;
	int		zq_priority;
	int		zq_wt;
	zoneid_t	zq_zoneid;
} zone_q_bump_t;

/*
 * This uses gethrtime() but returns a value in usecs.
 */
#define	GET_USEC_TIME		(gethrtime() / 1000)
#define	NANO_TO_MICRO(x)	(x / (NANOSEC / MICROSEC))

/*
 * Keep track of the zone's ZFS IOPs.
 *
 * If the number of ops is >1 then we can just use that value.  However,
 * if the number of ops is <2 then we might have a zone which is trying to do
 * IO but is not able to get any ops through the system.  We don't want to
 * lose track of this zone so we factor in its decayed count into the current
 * count.
 *
 * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
 * However, since this calculation is driven by IO activity and since IO does
 * not happen at fixed intervals, we use a timestamp to see when the last
 * update was made.  If it was more than one cycle ago, then we need to decay
 * the historical count by the proper number of additional cycles in which no
 * IO was performed.
 *
 * Return true if we actually computed a new historical count.
 * If we're still within an active cycle there is nothing to do, return false.
 */
static hrtime_t
compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
{
	hrtime_t delta;
	int	gen_cnt;

	/*
	 * Check if it's time to recompute a new zone count.
	 * If we're still collecting data for the current cycle, return false.
	 */
	delta = unow - cp->cycle_start;
	if (delta < zfs_zone_cycle_time)
		return (delta);

	/* A previous cycle is past, compute the new zone count. */

	/*
	 * Figure out how many generations we have to decay the historical
	 * count, since multiple cycles may have elapsed since our last IO.
	 * We depend on int rounding here.
	 */
	gen_cnt = (int)(delta / zfs_zone_cycle_time);

	/* If more than 5 cycles since the last IO, reset the count. */
	if (gen_cnt > 5) {
		cp->zone_avg_cnt = 0;
	} else {
		/* Update the count. */
		int	i;

		/*
		 * If the zone did more than 1 IO, just use its current count
		 * as the historical value, otherwise decay the historical
		 * count and factor that into the new historical count.  We
		 * pick a threshold > 1 so that we don't lose track of IO due
		 * to int rounding.
		 */
		if (cp->cycle_cnt > 1)
			cp->zone_avg_cnt = cp->cycle_cnt;
		else
			cp->zone_avg_cnt = cp->cycle_cnt +
			    (cp->zone_avg_cnt / 2);

		/*
		 * If more than one generation has elapsed since the last
		 * update, decay the values further.
		 */
		for (i = 1; i < gen_cnt; i++)
			cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
	}

	/* A new cycle begins. */
	cp->cycle_start = unow;
	cp->cycle_cnt = 0;

	return (0);
}
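/*
 * Illustrative walk-through of the decay above (assumed numbers): suppose a
 * zone's zone_avg_cnt is 8 and exactly one 2-second cycle has elapsed with a
 * single IO (cycle_cnt == 1).  Since cycle_cnt <= 1, the new historical count
 * is 1 + (8 / 2) = 5.  Had three cycles elapsed instead (gen_cnt == 3), the
 * result would be halved twice more, ending at 5 / 2 / 2 = 1.
 */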
/*
 * Add IO op data to the zone.
 */
static void
add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
{
	switch (op) {
	case ZFS_ZONE_IOP_READ:
		(void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
		zonep->zone_rd_ops.cycle_cnt++;
		break;
	case ZFS_ZONE_IOP_WRITE:
		(void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
		zonep->zone_wr_ops.cycle_cnt++;
		break;
	case ZFS_ZONE_IOP_LOGICAL_WRITE:
		(void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
		zonep->zone_lwr_ops.cycle_cnt++;
		break;
	}
}

/*
 * Use a decaying average to keep track of the overall system latency.
 *
 * We want to have the recent activity heavily weighted, but if the
 * activity decreases or stops, then the average should quickly decay
 * down to the new value.
 *
 * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
 * However, since this calculation is driven by IO activity and since IO does
 * not happen at fixed intervals, we use a timestamp to see when the last
 * update was made.  If it was more than one cycle ago, then we need to decay
 * the average by the proper number of additional cycles in which no IO was
 * performed.
 *
 * Return true if we actually computed a new system average.
 * If we're still within an active cycle there is nothing to do, return false.
 */
static int
compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
{
	hrtime_t delta;
	int	gen_cnt;

	/*
	 * Check if it's time to recompute a new average.
	 * If we're still collecting data for the current cycle, return false.
	 */
	delta = unow - cp->cycle_start;
	if (delta < zfs_zone_sys_avg_cycle)
		return (0);

	/* A previous cycle is past, compute a new system average. */

	/*
	 * Figure out how many generations we have to decay, since multiple
	 * cycles may have elapsed since our last IO.
	 * We count on int rounding here.
	 */
	gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);

	/* If more than 5 cycles since the last IO, reset the average. */
	if (gen_cnt > 5) {
		cp->sys_avg_lat = 0;
	} else {
		/* Update the average. */
		int	i;

		cp->sys_avg_lat =
		    (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);

		/*
		 * If more than one generation has elapsed since the last
		 * update, decay the values further.
		 */
		for (i = 1; i < gen_cnt; i++)
			cp->sys_avg_lat = cp->sys_avg_lat / 2;
	}

	/* A new cycle begins. */
	cp->cycle_start = unow;
	cp->cycle_cnt = 0;
	cp->cycle_lat = 0;

	return (1);
}

static void
add_sys_iop(hrtime_t unow, int op, int lat)
{
	switch (op) {
	case ZFS_ZONE_IOP_READ:
		(void) compute_new_sys_avg(unow, &rd_lat);
		rd_lat.cycle_cnt++;
		rd_lat.cycle_lat += lat;
		break;
	case ZFS_ZONE_IOP_WRITE:
		(void) compute_new_sys_avg(unow, &wr_lat);
		wr_lat.cycle_cnt++;
		wr_lat.cycle_lat += lat;
		break;
	}
}

/*
 * Get the zone IO counts.
 */
static uint_t
calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
{
	hrtime_t delta;
	uint_t cnt;

	if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
		/*
		 * No activity in the current cycle, we already have the
		 * historical data so we'll use that.
		 */
		cnt = cp->zone_avg_cnt;
	} else {
		/*
		 * If we're less than half way through the cycle then use
		 * the current count plus half the historical count,
		 * otherwise just use the current count.
		 */
		if (delta < (zfs_zone_cycle_time / 2))
			cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
		else
			cnt = cp->cycle_cnt;
	}

	return (cnt);
}
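/*
 * Example of calc_zone_cnt's weighting (assumed numbers): with
 * zfs_zone_cycle_time at 2 s, a zone 500 ms into a cycle with cycle_cnt == 3
 * and zone_avg_cnt == 8 reports 3 + (8 / 2) = 7; at 1.5 s into the cycle the
 * current count stands on its own and the same zone reports 3.
 */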
/*
 * Get the average read/write latency in usecs for the system.
 */
static uint_t
calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
{
	if (compute_new_sys_avg(unow, cp)) {
		/*
		 * No activity in the current cycle, we already have the
		 * historical data so we'll use that.
		 */
		return (cp->sys_avg_lat);
	} else {
		/*
		 * We're within a cycle; weight the current activity higher
		 * compared to the historical data and use that.
		 */
		extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t,
		    uintptr_t, uintptr_t);

		__dtrace_probe_zfs__zone__calc__wt__avg(
		    (uintptr_t)cp->sys_avg_lat,
		    (uintptr_t)cp->cycle_lat,
		    (uintptr_t)cp->cycle_cnt);

		return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
		    (1 + (cp->cycle_cnt * 8)));
	}
}

/*
 * Account for the current IOP on the zone and for the system as a whole.
 * The latency parameter is in usecs.
 */
static void
add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
{
	/* Add op to zone */
	add_zone_iop(zonep, unow, op);

	/* Track system latency */
	if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
		add_sys_iop(unow, op, lat);
}

/*
 * Calculate and return the total number of read ops, write ops and logical
 * write ops for the given zone.  If the zone has issued operations of any
 * type return a non-zero value, otherwise return 0.
 */
static int
get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
    uint_t *lwops)
{
	*rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
	*wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
	*lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);

	extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t, uintptr_t,
	    uintptr_t, uintptr_t);

	__dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id,
	    (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops);

	return (*rops | *wops | *lwops);
}

/*
 * Get the average read/write latency in usecs for the system.
 */
static void
get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
{
	*rlat = calc_avg_lat(unow, &rd_lat);
	*wlat = calc_avg_lat(unow, &wr_lat);

	/*
	 * In an attempt to improve the accuracy of the throttling algorithm,
	 * assume that IO operations can't have zero latency.  Instead, assume
	 * a reasonable lower bound for each operation type.  If the actual
	 * observed latencies are non-zero, use those latency values instead.
	 */
	if (*rlat == 0)
		*rlat = 1000;
	if (*wlat == 0)
		*wlat = 1000;

	extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t,
	    uintptr_t);

	__dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat),
	    (uintptr_t)*wlat);
}

/*
 * Find disk utilization for each zone and average utilization for all active
 * zones.
 */
static int
zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
{
	zoneio_stats_t *sp = arg;
	uint_t rops, wops, lwops;

	if (zonep->zone_id == GLOBAL_ZONEID ||
	    get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
		zonep->zone_io_util = 0;
		return (0);
	}

	zonep->zone_io_util = (rops * sp->zi_avgrlat) +
	    (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
	sp->zi_totutil += zonep->zone_io_util;

	if (zonep->zone_io_util > 0) {
		sp->zi_active++;
		sp->zi_totpri += zonep->zone_zfs_io_pri;
	}

	/*
	 * sdt:::zfs-zone-utilization
	 *
	 *	arg0: zone ID
	 *	arg1: read operations observed during time window
	 *	arg2: physical write operations observed during time window
	 *	arg3: logical write ops observed during time window
	 *	arg4: calculated utilization given read and write ops
	 *	arg5: I/O priority assigned to this zone
	 */
	extern void __dtrace_probe_zfs__zone__utilization(
	    uint_t, uint_t, uint_t, uint_t, uint_t, uint_t);

	__dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id),
	    (uint_t)rops, (uint_t)wops, (uint_t)lwops,
	    (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri);

	return (0);
}
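/*
 * Worked example of the utilization and fair-share math (assumed numbers):
 * with average read latency 1000 us and write latency 2000 us, a zone that
 * did 10 reads, 5 physical writes and 5 logical writes gets
 * zone_io_util = (10 * 1000) + (5 * 2000) + (5 * 2000) = 30000.  If all
 * active zones together total 120000 and this zone holds 50 of the 100
 * total priority points, its fair share is (120000 * 50) / 100 = 60000,
 * so it is under its share and will not be delayed.
 */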
static void
zfs_zone_delay_inc(zone_t *zonep)
{
	if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
		zonep->zone_io_delay += zfs_zone_delay_step;
}

static void
zfs_zone_delay_dec(zone_t *zonep)
{
	if (zonep->zone_io_delay > 0)
		zonep->zone_io_delay -= zfs_zone_delay_step;
}

/*
 * For all zones "far enough" away from the average utilization, increase that
 * zone's delay.  Otherwise, reduce its delay.
 */
static int
zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
{
	zoneio_stats_t *sp = arg;
	uint16_t delay = zonep->zone_io_delay;
	uint_t fairutil = 0;

	zonep->zone_io_util_above_avg = B_FALSE;

	/*
	 * Given the calculated total utilization for all zones, calculate the
	 * fair share of I/O for this zone.
	 */
	if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
		fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
		    sp->zi_totpri;
	} else if (sp->zi_active > 0) {
		fairutil = sp->zi_totutil / sp->zi_active;
	}

	/*
	 * Adjust each IO's delay.  If the overall delay becomes too high,
	 * avoid increasing beyond the ceiling value.
	 */
	if (zonep->zone_io_util > fairutil &&
	    sp->zi_diskutil > zfs_zone_util_threshold) {
		zonep->zone_io_util_above_avg = B_TRUE;

		if (sp->zi_active > 1)
			zfs_zone_delay_inc(zonep);
	} else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) {
		zfs_zone_delay_dec(zonep);
	}

	/*
	 * sdt:::zfs-zone-throttle
	 *
	 *	arg0: zone ID
	 *	arg1: old delay for this zone
	 *	arg2: new delay for this zone
	 *	arg3: calculated fair I/O utilization
	 *	arg4: actual I/O utilization
	 */
	extern void __dtrace_probe_zfs__zone__throttle(
	    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);

	__dtrace_probe_zfs__zone__throttle(
	    (uintptr_t)zonep->zone_id, (uintptr_t)delay,
	    (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil,
	    (uintptr_t)zonep->zone_io_util);

	return (0);
}

/*
 * Examine the utilization between different zones, and adjust the delay for
 * each zone appropriately.
 */
static void
zfs_zone_wait_adjust(hrtime_t unow)
{
	zoneio_stats_t stats;

	(void) bzero(&stats, sizeof (stats));

	stats.zi_now = unow;
	get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);

	if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
		stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
	else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
		stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;

	if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
		return;

	/*
	 * Calculate disk utilization for the most recent period.
	 */
	if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) {
		stats.zi_diskutil = 0;
	} else {
		stats.zi_diskutil =
		    ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
		    ((unow - zfs_zone_last_checked) * 1000);
	}
	zfs_disk_last_rtime = zfs_disk_rtime;

	/*
	 * sdt:::zfs-zone-stats
	 *
	 * Statistics observed over the last period:
	 *
	 *	arg0: average system read latency
	 *	arg1: average system write latency
	 *	arg2: number of active zones
	 *	arg3: total I/O 'utilization' for all zones
	 *	arg4: total I/O priority of all active zones
	 *	arg5: calculated disk utilization
	 */
	extern void __dtrace_probe_zfs__zone__stats(
	    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);

	__dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat),
	    (uintptr_t)(stats.zi_avgwlat), (uintptr_t)(stats.zi_active),
	    (uintptr_t)(stats.zi_totutil), (uintptr_t)(stats.zi_totpri),
	    (uintptr_t)(stats.zi_diskutil));

	(void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
}
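/*
 * Note on units in the zi_diskutil calculation above: zfs_disk_rtime is kept
 * in nanoseconds while unow and zfs_zone_last_checked are in microseconds,
 * hence the extra factor of 1000 in the divisor.  For example (assumed
 * numbers), 850 ms of accumulated busy time over a 1 s window yields
 * (850000000 * 100) / (1000000 * 1000) = 85, i.e. 85% utilized.
 */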
/*
 * Callback used to calculate a zone's IO schedule priority.
 *
 * We scan the zones looking for ones with ops in the queue.  Out of those,
 * we pick the one that calculates to the highest schedule priority.
 */
static int
get_sched_pri_cb(zone_t *zonep, void *arg)
{
	int pri;
	zone_q_bump_t *qbp = arg;

	extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t);
	__dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id),
	    (uintptr_t)(zonep->zone_zfs_queued));

	if (zonep->zone_zfs_queued == 0) {
		zonep->zone_zfs_weight = 0;
		return (0);
	}

	/*
	 * On each pass, increment the zone's weight.  We use this as input
	 * to the calculation to prevent starvation.  The value is reset
	 * each time we issue an IO for this zone so zones which haven't
	 * done any IO over several iterations will see their weight max
	 * out.
	 */
	if (zonep->zone_zfs_weight < 20)
		zonep->zone_zfs_weight++;

	/*
	 * This zone's IO priority is the inverse of the number of IOs
	 * the zone has enqueued * zone's configured priority * weight.
	 * The queue depth has already been scaled by 10 to avoid problems
	 * with int rounding.
	 *
	 * This means that zones with fewer IOs in the queue will get
	 * preference unless other zones' assigned priority pulls them
	 * ahead.  The weight is factored in to help ensure that zones
	 * which haven't done IO in a while aren't getting starved.
	 */
	pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) *
	    zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;

	/*
	 * If this zone has a higher priority than what we found so far,
	 * schedule it next.
	 */
	if (pri > qbp->zq_priority) {
		qbp->zq_zoneid = zonep->zone_id;
		qbp->zq_priority = pri;
		qbp->zq_wt = zonep->zone_zfs_weight;
	}
	return (0);
}

/*
 * See if we need to bump a zone's zio to the head of the queue.
 *
 * For single-threaded synchronous workloads a zone cannot get more than
 * 1 op into the queue at a time unless the zone is running multiple workloads
 * in parallel.  This can cause an imbalance in performance if there are zones
 * with many parallel workloads (and ops in the queue) vs. other zones which
 * are doing simple single-threaded workloads, such as interactive tasks in
 * the shell.  These zones can get backed up behind a deep queue and their IO
 * performance will appear to be very poor as a result.  This can make the
 * zone work badly for interactive behavior.
 *
 * The scheduling algorithm kicks in once we start to get a deeper queue.
 * Once that occurs, we look at all of the zones to see which one calculates
 * to the highest priority.  We bump that zone's first zio to the head of the
 * queue.
 *
 * We use a counter on the zone so that we can quickly find how many ops each
 * zone has in the queue without having to search the entire queue itself.
 * This scales better since the number of zones is expected to be on the
 * order of 10-100 whereas the queue depth can be in the range of 50-2000.
 * In addition, since the zio's in the queue only have the zoneid, we would
 * have to look up the zone for each zio enqueued and that means the overhead
 * for scanning the queue each time would be much higher.
 *
 * In all cases, we fall back to simply pulling the next op off the queue
 * if something should go wrong.
 */
static zio_t *
get_next_zio(vdev_queue_t *vq, int qdepth)
{
	zone_q_bump_t qbump;
	zio_t *zp = NULL, *zphead;
	int cnt = 0;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	/* To avoid problems with int rounding, scale the queue depth by 10 */
	qbump.zq_qdepth = qdepth * 10;
	qbump.zq_priority = 0;
	qbump.zq_zoneid = 0;
	(void) zone_walk(get_sched_pri_cb, &qbump);

	zphead = avl_first(&vq->vq_deadline_tree);

	/* Check if the scheduler didn't pick a zone for some reason!? */
	if (qbump.zq_zoneid != 0) {
		for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL;
		    zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) {
			if (zp->io_zoneid == qbump.zq_zoneid)
				break;
			cnt++;
		}
	}

	if (zp == NULL) {
		zp = zphead;
	} else if (zp != zphead) {
		/*
		 * Only fire the probe if we actually picked a different zio
		 * than the one already at the head of the queue.
		 */
		extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t,
		    uintptr_t, uintptr_t, uintptr_t);
		__dtrace_probe_zfs__zone__sched__bump(
		    (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt),
		    (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt));
	}

	return (zp);
}
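/*
 * Example of the schedule-priority formula (assumed numbers): with 8 zios
 * queued overall, the scaled queue depth is 80.  A zone with 1 op enqueued,
 * configured priority 100 and weight 5 computes (80 / 1) * 100 * 5 = 40000,
 * while a zone with 10 ops enqueued, priority 100 and weight 1 computes
 * (80 / 10) * 100 * 1 = 800, so the single-threaded zone is issued first.
 */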
/*
 * Add our zone ID to the zio so we can keep track of which zones are doing
 * what, even when the current thread processing the zio is not associated
 * with the zone (e.g. the kernel taskq which pushes out TX groups).
 */
void
zfs_zone_zio_init(zio_t *zp)
{
	zone_t	*zonep = curzone;

	zp->io_zoneid = zonep->zone_id;
}

/*
 * Track IO operations per zone.  Called from dmu_tx_count_write for write ops
 * and dmu_read_uio for read ops.  For each operation, increment that zone's
 * counter based on the type of operation.
 *
 * There are three basic ways that we can see write ops:
 * 1) An application does write syscalls.  Those ops go into a TXG which
 *    we'll count here.  Sometime later a kernel taskq thread (we'll see the
 *    vdev IO as zone 0) will perform some number of physical writes to commit
 *    the TXG to disk.  Those writes are not associated with the zone which
 *    made the write syscalls and the number of operations is not correlated
 *    between the taskq and the zone.
 * 2) An application opens a file with O_SYNC.  Each write will result in
 *    an operation which we'll see here plus a low-level vdev write from
 *    that zone.
 * 3) An application does write syscalls followed by an fsync().  We'll
 *    count the writes going into a TXG here.  We'll also see some number
 *    (usually much smaller, maybe only 1) of low-level vdev writes from this
 *    zone when the fsync is performed, plus some other low-level vdev writes
 *    from the taskq in zone 0 (are these metadata writes?).
 *
 * 4) In addition to the above, there are misc. system-level writes, such as
 *    writing out dirty pages to swap, or sync(2) calls, which will be handled
 *    by the global zone and which we count but don't generally worry about.
 *
 * Because of the above, we can see writes twice because this is called
 * at a high level by a zone thread, but we also will count the phys. writes
 * that are performed at a low level via zfs_zone_zio_start.
 *
 * Without this, it can look like a non-global zone never writes (case 1).
 * Depending on when the TXG is flushed, the counts may be in the same sample
 * bucket or in a different one.
 *
 * Tracking read operations is simpler due to their synchronous semantics.
 * The zfs_read function -- called as a result of a read(2) syscall -- will
 * always retrieve the data to be read through dmu_read_uio.
 */
void
zfs_zone_io_throttle(zfs_zone_iop_type_t type)
{
	zone_t	*zonep = curzone;
	hrtime_t unow;
	uint16_t wait;

	unow = GET_USEC_TIME;

	/*
	 * Only bump the counters for logical operations here.  The counters
	 * for tracking physical IO operations are handled in
	 * zfs_zone_zio_done.
	 */
	if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
		mutex_enter(&zonep->zone_stg_io_lock);
		add_iop(zonep, unow, type, 0);
		mutex_exit(&zonep->zone_stg_io_lock);
	}

	if (!zfs_zone_delay_enable)
		return;

	/*
	 * XXX There's a potential race here in that more than one thread may
	 * update the zone delays concurrently.  The worst outcome is
	 * corruption of our data to track each zone's IO, so the algorithm
	 * may make incorrect throttling decisions until the data is
	 * refreshed.
	 */
	if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
		zfs_zone_wait_adjust(unow);
		zfs_zone_last_checked = unow;
	}

	if ((wait = zonep->zone_io_delay) > 0) {
		/*
		 * If this is a write and we're doing above normal TxG
		 * flushing, then throttle for longer than normal.
		 */
		if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
		    (txg_cnt > 1 || txg_flush_rate > 1))
			wait *= zfs_zone_txg_throttle_scale;

		/*
		 * sdt:::zfs-zone-wait
		 *
		 *	arg0: zone ID
		 *	arg1: type of IO operation
		 *	arg2: time to delay (in us)
		 */
		extern void __dtrace_probe_zfs__zone__wait(
		    uintptr_t, uintptr_t, uintptr_t);

		__dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id),
		    (uintptr_t)type, (uintptr_t)wait);

		drv_usecwait(wait);

		if (zonep->zone_vfs_stats != NULL) {
			atomic_inc_64(&zonep->zone_vfs_stats->
			    zv_delay_cnt.value.ui64);
			atomic_add_64(&zonep->zone_vfs_stats->
			    zv_delay_time.value.ui64, wait);
		}
	}
}
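/*
 * Illustrative throttle arithmetic (assumed numbers): a zone with
 * zone_io_delay of 20 us that issues a logical write while the TxG flush
 * rate is elevated waits 20 * zfs_zone_txg_throttle_scale = 40 us in
 * drv_usecwait() before the operation proceeds; reads would wait the
 * unscaled 20 us.
 */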
/*
 * XXX Ignore the pool pointer parameter for now.
 *
 * Keep track to see if the TxG flush rate is running above the expected rate.
 * If so, this implies that we are filling TxG's at a high rate due to a heavy
 * write workload.  We use this as input into the zone throttle.
 *
 * This function is called every 5 seconds (zfs_txg_timeout) under a normal
 * write load.  In this case, the flush rate is going to be 1.  When there
 * is a heavy write load, TxG's fill up fast and the sync thread will write
 * the TxG more frequently (perhaps once a second).  In this case the rate
 * will be > 1.  The flush rate is a lagging indicator since it can be up
 * to 5 seconds old.  We use the txg_cnt to keep track of the rate in the
 * current 5 second interval and txg_flush_rate to keep track of the previous
 * 5 second interval.  In that way we don't have a period (1 or more seconds)
 * where the txg_cnt == 0 and we cut back on throttling even though the rate
 * is still high.
 */
/*ARGSUSED*/
void
zfs_zone_report_txg_sync(void *dp)
{
	uint_t now;

	txg_cnt++;
	now = (uint_t)(gethrtime() / NANOSEC);
	if ((now - txg_last_check) >= zfs_txg_timeout) {
		txg_flush_rate = txg_cnt / 2;
		txg_cnt = 0;
		txg_last_check = now;
	}
}

int
zfs_zone_txg_delay()
{
	zone_t	*zonep = curzone;
	int delay = 1;

	if (zonep->zone_io_util_above_avg)
		delay = zfs_zone_txg_delay_ticks;

	extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t);

	__dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id),
	    (uintptr_t)delay);

	return (delay);
}
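/*
 * For example (assumed timing): under a heavy write load where the sync
 * thread commits a TxG roughly once per second, about 5 syncs land in each
 * 5-second window, so txg_flush_rate becomes 5 / 2 = 2.  Either txg_cnt > 1
 * within the current window or txg_flush_rate > 1 from the previous window
 * is enough for zfs_zone_io_throttle() to scale up write delays.
 */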
/*
 * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline
 * and is issued.
 * Keep track of start time for latency calculation in zfs_zone_zio_done.
 */
void
zfs_zone_zio_start(zio_t *zp)
{
	zone_t	*zonep;

	/*
	 * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not
	 * for an actual I/O operation.  Ignore those operations as they
	 * relate to throttling and scheduling.
	 */
	if (zp->io_type == ZIO_TYPE_IOCTL)
		return;

	if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
		return;

	mutex_enter(&zonep->zone_zfs_lock);
	if (zp->io_type == ZIO_TYPE_READ)
		kstat_runq_enter(&zonep->zone_zfs_rwstats);
	zonep->zone_zfs_weight = 0;
	mutex_exit(&zonep->zone_zfs_lock);

	mutex_enter(&zfs_disk_lock);
	zp->io_dispatched = gethrtime();

	if (zfs_disk_rcnt++ != 0)
		zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
	zfs_disk_rlastupdate = zp->io_dispatched;
	mutex_exit(&zfs_disk_lock);

	zone_rele(zonep);
}

/*
 * Called from vdev_queue_io_done when an IO completes.
 * Increment our counter for zone ops.
 * Calculate the IO latency avg. for this zone.
 */
void
zfs_zone_zio_done(zio_t *zp)
{
	zone_t	*zonep;
	hrtime_t now, unow, udelta;

	if (zp->io_type == ZIO_TYPE_IOCTL)
		return;

	if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
		return;

	now = gethrtime();
	unow = NANO_TO_MICRO(now);
	udelta = unow - NANO_TO_MICRO(zp->io_dispatched);

	mutex_enter(&zonep->zone_zfs_lock);

	/*
	 * To calculate the wsvc_t average, keep a cumulative sum of all the
	 * wait time before each I/O was dispatched.  Since most writes are
	 * asynchronous, only track the wait time for read I/Os.
	 */
	if (zp->io_type == ZIO_TYPE_READ) {
		zonep->zone_zfs_rwstats.reads++;
		zonep->zone_zfs_rwstats.nread += zp->io_size;

		zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
		    zp->io_dispatched - zp->io_start;

		kstat_runq_exit(&zonep->zone_zfs_rwstats);
	} else {
		zonep->zone_zfs_rwstats.writes++;
		zonep->zone_zfs_rwstats.nwritten += zp->io_size;
	}

	mutex_exit(&zonep->zone_zfs_lock);

	mutex_enter(&zfs_disk_lock);
	zfs_disk_rcnt--;
	zfs_disk_rtime += (now - zfs_disk_rlastupdate);
	zfs_disk_rlastupdate = now;
	mutex_exit(&zfs_disk_lock);

	if (zfs_zone_delay_enable) {
		mutex_enter(&zonep->zone_stg_io_lock);
		add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
		    ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
		mutex_exit(&zonep->zone_stg_io_lock);
	}

	zone_rele(zonep);

	/*
	 * sdt:::zfs-zone-latency
	 *
	 *	arg0: zone ID
	 *	arg1: type of I/O operation
	 *	arg2: I/O latency (in us)
	 */
	extern void __dtrace_probe_zfs__zone__latency(
	    uintptr_t, uintptr_t, uintptr_t);

	__dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid),
	    (uintptr_t)(zp->io_type), (uintptr_t)(udelta));
}
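/*
 * Note on the zfs_disk_rtime accounting in the two functions above: rtime
 * accumulates wall time only while at least one IO is outstanding, in the
 * style of kstat run-queue accounting.  For example (assumed timing), if one
 * IO is dispatched at t=0 and a second at t=2 ms, the start path adds 2 ms
 * when the second IO is dispatched, and each completion adds the interval
 * since the previous update, so idle gaps never inflate the busy time.
 */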
void
zfs_zone_zio_dequeue(zio_t *zp)
{
	zone_t	*zonep;

	if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
		return;

	mutex_enter(&zonep->zone_stg_io_lock);
	ASSERT(zonep->zone_zfs_queued > 0);
	if (zonep->zone_zfs_queued == 0)
		cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
	else
		zonep->zone_zfs_queued--;
	mutex_exit(&zonep->zone_stg_io_lock);

	zone_rele(zonep);
}

void
zfs_zone_zio_enqueue(zio_t *zp)
{
	zone_t	*zonep;

	if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
		return;

	mutex_enter(&zonep->zone_stg_io_lock);
	zonep->zone_zfs_queued++;
	mutex_exit(&zonep->zone_stg_io_lock);

	zone_rele(zonep);
}

/*
 * Called from vdev_queue_io_to_issue.  This function is where zio's are found
 * at the head of the queue (by avl_first), then pulled off (by
 * vdev_queue_io_remove) and issued.  We do our scheduling here to find the
 * next zio to issue.
 *
 * The vq->vq_lock mutex is held when we're executing this function so we
 * can safely access the "last zone" variable on the queue.
 */
zio_t *
zfs_zone_schedule(vdev_queue_t *vq)
{
	int cnt;
	zoneid_t last_zone;
	zio_t *zp;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	cnt = avl_numnodes(&vq->vq_deadline_tree);
	last_zone = vq->vq_last_zone_id;

	/*
	 * If there are only a few ops in the queue then just issue the head.
	 * If there are more than a few ops already queued up, then use
	 * scheduling to get the next zio.
	 */
	if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
		zp = avl_first(&vq->vq_deadline_tree);
	else
		zp = get_next_zio(vq, cnt);

	vq->vq_last_zone_id = zp->io_zoneid;

	/*
	 * Probe with 3 args; the number of IOs in the queue, the zone that
	 * was last scheduled off this queue, and the zone that was associated
	 * with the next IO that is scheduled.
	 */
	extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t,
	    uintptr_t);

	__dtrace_probe_zfs__zone__sched((uintptr_t)(cnt),
	    (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid));

	return (zp);
}

#endif