author	Jerry Jelinek <jerry.jelinek@joyent.com>	2013-10-03 18:33:52 +0000
committer	Jerry Jelinek <jerry.jelinek@joyent.com>	2013-10-03 18:33:52 +0000
commit	2bd35d239981a47757afea42ebb00bbe46bee26a (patch)
tree	5fab386462f0d361ab176e838866342882a89df7
parent	c7ebd51897476aa319daf50054d21d2fd9e696e1 (diff)
download	illumos-joyent-2bd35d239981a47757afea42ebb00bbe46bee26a.tar.gz
OS-2531 zfs/zone IO throttle comment improvement and code cleanup (tag: release-20131003)
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/zio.h	5
-rw-r--r--	usr/src/uts/common/fs/zfs/zfs_zone.c	304
-rw-r--r--	usr/src/uts/common/fs/zfs/zio.c	7
-rw-r--r--	usr/src/uts/common/sys/zone.h	4
4 files changed, 187 insertions, 133 deletions
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 7b383e4a74..8e901b804b 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -407,7 +407,8 @@ struct zio {
const zio_vsd_ops_t *io_vsd_ops;
uint64_t io_offset;
- hrtime_t io_timestamp;
+ hrtime_t io_timestamp; /* time I/O entered zio pipeline */
+ hrtime_t io_dispatched; /* time I/O was dispatched to disk */
avl_node_t io_queue_node;
/* Internal pipeline state */
@@ -436,8 +437,6 @@ struct zio {
uint64_t io_ena;
zoneid_t io_zoneid; /* zone which originated this I/O */
- hrtime_t io_start; /* time I/O entered zio pipeline */
- hrtime_t io_dispatched; /* time I/O was dispatched to disk */
/* Taskq dispatching state */
taskq_ent_t io_tqent;
};
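The two timestamps consolidated above describe an I/O's life cycle: io_timestamp when it enters the pipeline, io_dispatched when it reaches the device. A minimal sketch of how accounting code can split the elapsed time (hypothetical helper; the real use is in zfs_zone_zio_done below, which adds io_dispatched - io_timestamp to the zone's wait-time kstat):

	/* Sketch only: split a zio's elapsed time into queue vs. device time. */
	static void
	zone_io_times(const zio_t *zp, hrtime_t now,
	    hrtime_t *waitp, hrtime_t *diskp)
	{
		*waitp = zp->io_dispatched - zp->io_timestamp;
		*diskp = now - zp->io_dispatched;
	}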
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
index e7b24915e1..4c2bdccd66 100644
--- a/usr/src/uts/common/fs/zfs/zfs_zone.c
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -1,27 +1,93 @@
/*
- * CDDL HEADER START
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
*
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
*/
+
/*
* Copyright 2013, Joyent, Inc. All rights reserved.
*/
+/*
+ * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
+ * ZFS I/O resources for each zone.
+ *
+ * I/O contention can be a major pain point on a multi-tenant system. A single
+ * zone can issue a stream of I/O operations, usually synchronous writes, which
+ * disrupt I/O performance for all other zones. This problem is further
+ * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
+ * a set of blocks which are atomically synced to disk. The process of
+ * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
+ * out any pending read operations.
+ *
+ * There are two facets to this capability: the throttle and the scheduler.
+ *
+ * Throttle
+ *
+ * The requirements on the throttle are:
+ *
+ * 1) Ensure consistent and predictable I/O latency across all zones.
+ * 2) Sequential and random workloads have very different characteristics,
+ * so it is a non-starter to track IOPS or throughput.
+ * 3) A zone should be able to use the full disk bandwidth if no other zone
+ * is actively using the disk.
+ *
+ * The throttle has two components: one to track and account for each zone's
+ * I/O requests, and another to throttle each zone's operations when it
+ * exceeds its fair share of disk I/O. When the throttle detects that a zone is
+ * consuming more than is appropriate, each read or write system call is
+ * delayed by up to 100 microseconds, which we've found is sufficient to allow
+ * other zones to interleave I/O requests during those delays.
+ *
+ * Note: The throttle will delay each logical I/O (as opposed to the physical
+ * I/O which will likely be issued asynchronously), so it may be easier to
+ * think of the I/O throttle delaying each read/write syscall instead of the
+ * actual I/O operation. For each zone, the throttle tracks an ongoing average
+ * of read and write operations performed, which is used to determine that
+ * zone's overall I/O utilization.
+ *
+ * The throttle calculates an I/O utilization metric for each zone using the
+ * following formula:
+ *
+ * (# of read syscalls) x (Average read latency) +
+ * (# of write syscalls) x (Average write latency)
+ *
+ * Once each zone has its utilization metric, the I/O throttle will compare I/O
+ * utilization across all zones, and if a zone has a higher-than-average I/O
+ * utilization, system calls from that zone are throttled. That is, while a
+ * zone's utilization remains above the average, its delay is increased by 5
+ * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
+ * already throttled and has a lower utilization than average, its delay will
+ * be lowered by 5 microseconds.
+ *
+ * The throttle calculation is driven by IO activity, but since IO does not
+ * happen at fixed intervals, timestamps are used to track when the last update
+ * was made and to drive recalculation.
+ *
+ * The throttle recalculates each zone's I/O usage and throttle delay (if any)
+ * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
+ * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
+ *
+ * Scheduler
+ *
+ * The I/O scheduler manages the vdev queues - the queues of pending I/Os to
+ * issue to the disks. It only makes scheduling decisions for the two
+ * synchronous I/O queues (read & write).
+ *
+ * The scheduler tracks how many I/Os in the queue are from each zone, and
+ * if one zone has a disproportionately large number of I/Os in the queue, the
+ * scheduler will allow certain I/Os from the underutilized zones to be "bumped"
+ * and pulled from the middle of the queue. This bump allows zones with a small
+ * number of I/Os (so small they may not even be taken into account by the
+ * throttle) to complete quickly instead of waiting behind dozens of I/Os from
+ * other zones.
+ */
+
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_zone.h>
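A minimal sketch of the delay adjustment described in the block comment above (hypothetical helper name and simplified logic; the real work is done by zfs_zone_wait_adjust_delay_cb later in this file):

	/*
	 * Sketch only: nudge one zone's delay toward its fair share.
	 * util is the zone's I/O utilization for this interval and
	 * fairutil is the computed per-zone fair share.
	 */
	static void
	adjust_zone_delay(zone_t *zonep, uint64_t util, uint64_t fairutil)
	{
		if (util > fairutil &&
		    zonep->zone_io_delay < zfs_zone_delay_ceiling)
			zonep->zone_io_delay += zfs_zone_delay_step;
		else if (util < fairutil &&
		    zonep->zone_io_delay >= zfs_zone_delay_step)
			zonep->zone_io_delay -= zfs_zone_delay_step;
	}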
@@ -100,10 +166,8 @@ zfs_zone_txg_delay()
* over the previous window.
*/
boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
-uint16_t zfs_zone_delay_step = 5; /* amount to change delay */
-uint16_t zfs_zone_delay_ceiling = 100; /* longest possible delay */
-
-hrtime_t zfs_zone_last_checked = 0;
+uint16_t zfs_zone_delay_step = 5; /* usec amount to change delay */
+uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */
boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
@@ -121,7 +185,6 @@ boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
*/
uint_t zfs_zone_rw_lat_limit = 10;
-
/*
* The I/O throttle will only start delaying zones when it detects disk
* utilization has reached a certain level. This tunable controls the threshold
@@ -140,6 +203,10 @@ uint_t zfs_zone_util_threshold = 80;
uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
+/*
+ * How often the I/O throttle will reevaluate each zone's utilization, in
+ * microseconds. Default is 1/4 sec.
+ */
uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
typedef struct {
@@ -163,22 +230,31 @@ static sys_lat_cycle_t rd_lat;
static sys_lat_cycle_t wr_lat;
/*
- * Some basic disk stats to determine disk utilization.
+ * Some basic disk stats to determine disk utilization. The utilization info
+ * for all disks on the system is aggregated into these values.
+ *
+ * Overall disk utilization for the current cycle is calculated as:
+ *
+ * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
+ * ----------------------------------------------
+ * ((now - zfs_zone_last_checked) * 1000)
*/
-kmutex_t zfs_disk_lock;
-uint_t zfs_disk_rcnt;
-hrtime_t zfs_disk_rtime = 0;
-hrtime_t zfs_disk_rlastupdate = 0;
+kmutex_t zfs_disk_lock; /* protects the following: */
+uint_t zfs_disk_rcnt; /* Number of outstanding IOs */
+hrtime_t zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
+hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
-hrtime_t zfs_disk_last_rtime = 0;
+hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
+/* time that we last updated per-zone throttle info */
+hrtime_t zfs_zone_last_checked = 0;
/*
- * Data used to keep track of how often txg flush is running.
+ * Data used to keep track of how often txg sync is running.
*/
extern int zfs_txg_timeout;
static uint_t txg_last_check;
static uint_t txg_cnt;
-static uint_t txg_flush_rate;
+static uint_t txg_sync_rate;
boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
/*
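To put numbers on the utilization formula above (assuming, as elsewhere in this file, run time kept in nanoseconds and the check interval in microseconds): over one 250000-usec adjustment interval, if the disks accumulated an additional 150 ms of run time, utilization is (150000000 * 100) / (250000 * 1000) = 60 percent. The factor of 1000 converts the microsecond interval to nanoseconds to match the hrtime_t run-time counters.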
@@ -192,7 +268,19 @@ boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
uint32_t zfs_zone_schedule_thresh = 10;
/*
- * Tunables for delay throttling when TxG flush is occurring.
+ * On each pass of the scheduler we increment the zone's weight (up to this
+ * maximum). The weight is used by the scheduler to prevent starvation so
+ * that zones which haven't been able to do any IO over many iterations
+ * will max out their weight at this value.
+ */
+#define SCHED_WEIGHT_MAX 20
+
+/*
+ * Tunables for delay throttling when TXG sync is occurring.
+ *
+ * If the zone is performing a write and we're doing above normal TXG syncing,
+ * then throttle for longer than normal. The zone's wait time is multiplied
+ * by the scale (zfs_zone_txg_throttle_scale).
*/
int zfs_zone_txg_throttle_scale = 2;
hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
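Condensed from zfs_zone_io_throttle later in this diff, the scale is applied like this (not a literal excerpt):

	if ((wait = zonep->zone_io_delay) > 0) {
		if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
		    (txg_cnt > 1 || txg_sync_rate > 1))
			wait *= zfs_zone_txg_throttle_scale;
		drv_usecwait(wait);
	}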
@@ -214,6 +302,9 @@ typedef struct {
/*
* Keep track of the zone's ZFS IOPs.
*
+ * See the comment on the zfs_zone_io_throttle function for a description of
+ * which IOPs are accounted for and how.
+ *
* If the number of ops is >1 then we can just use that value. However,
* if the number of ops is <2 then we might have a zone which is trying to do
* IO but is not able to get any ops through the system. We don't want to lose
@@ -226,8 +317,8 @@ typedef struct {
* historical count by the proper number of additional cycles in which no IO was
* performed.
*
- * Return true if we actually computed a new historical count.
- * If we're still within an active cycle there is nothing to do, return false.
+ * Return a time delta indicating how far into the current cycle we are, or 0
+ * if the last IO was more than a cycle ago.
*/
static hrtime_t
compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
@@ -318,16 +409,15 @@ add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
*
* Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
* However, since this calculation is driven by IO activity and since IO does
- * not happen
- *
- * at fixed intervals, we use a timestamp to see when the last update was made.
- * If it was more than one cycle ago, then we need to decay the average by the
- * proper number of additional cycles in which no IO was performed.
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
*
* Return true if we actually computed a new system average.
* If we're still within an active cycle there is nothing to do, return false.
*/
-static int
+static boolean_t
compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
{
hrtime_t delta;
@@ -339,7 +429,7 @@ compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
*/
delta = unow - cp->cycle_start;
if (delta < zfs_zone_sys_avg_cycle)
- return (0);
+ return (B_FALSE);
/* A previous cycle is past, compute a new system average. */
@@ -373,7 +463,7 @@ compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
cp->cycle_cnt = 0;
cp->cycle_lat = 0;
- return (1);
+ return (B_TRUE);
}
static void
@@ -440,13 +530,10 @@ calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
* We're within a cycle; weight the current activity higher
* compared to the historical data and use that.
*/
- extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t,
- uintptr_t, uintptr_t);
-
- __dtrace_probe_zfs__zone__calc__wt__avg(
- (uintptr_t)cp->sys_avg_lat,
- (uintptr_t)cp->cycle_lat,
- (uintptr_t)cp->cycle_cnt);
+ DTRACE_PROBE3(zfs__zone__calc__wt__avg,
+ uintptr_t, cp->sys_avg_lat,
+ uintptr_t, cp->cycle_lat,
+ uintptr_t, cp->cycle_cnt);
return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
(1 + (cp->cycle_cnt * 8)));
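As a worked example (assuming microsecond units, as used elsewhere in this file): with a historical sys_avg_lat of 200 and three I/Os totalling cycle_lat = 3000 so far in the current cycle, the weighted average is (200 + 3000 * 8) / (1 + 3 * 8) = 24200 / 25 = 968, so the in-cycle per-op latency of 1000 quickly displaces the stale average.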
@@ -481,11 +568,8 @@ get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
*wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
*lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
- extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t,
- uintptr_t, uintptr_t, uintptr_t);
-
- __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id,
- (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops);
+ DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
+ uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
return (*rops | *wops | *lwops);
}
@@ -510,11 +594,8 @@ get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
if (*wlat == 0)
*wlat = 1000;
- extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t,
- uintptr_t);
-
- __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat),
- (uintptr_t)*wlat);
+ DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
+ uintptr_t, *wlat);
}
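One practical benefit of the DTRACE_PROBEn conversion: the probes become ordinary SDT probes, enabled with the usual name mangling of double underscores to dashes, e.g. (a sketch; the provider/module fields may vary):

	# dtrace -n 'sdt:::zfs-zone-sys-avg-lat { trace(arg0); trace(arg1); }'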
/*
@@ -552,12 +633,9 @@ zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
* arg4: calculated utilization given read and write ops
* arg5: I/O priority assigned to this zone
*/
- extern void __dtrace_probe_zfs__zone__utilization(
- uint_t, uint_t, uint_t, uint_t, uint_t, uint_t);
-
- __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id),
- (uint_t)rops, (uint_t)wops, (uint_t)lwops,
- (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri);
+ DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
+ uint_t, rops, uint_t, wops, uint_t, lwops,
+ uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
return (0);
}
@@ -623,13 +701,9 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
* arg3: calculated fair I/O utilization
* arg4: actual I/O utilization
*/
- extern void __dtrace_probe_zfs__zone__throttle(
- uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
-
- __dtrace_probe_zfs__zone__throttle(
- (uintptr_t)zonep->zone_id, (uintptr_t)delay,
- (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil,
- (uintptr_t)zonep->zone_io_util);
+ DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
+ uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
+ uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
return (0);
}
@@ -639,7 +713,7 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
* each zone appropriately.
*/
static void
-zfs_zone_wait_adjust(hrtime_t unow)
+zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
{
zoneio_stats_t stats;
@@ -659,12 +733,12 @@ zfs_zone_wait_adjust(hrtime_t unow)
/*
* Calculate disk utilization for the most recent period.
*/
- if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) {
+ if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
stats.zi_diskutil = 0;
} else {
stats.zi_diskutil =
((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
- ((unow - zfs_zone_last_checked) * 1000);
+ ((unow - last_checked) * 1000);
}
zfs_disk_last_rtime = zfs_disk_rtime;
@@ -680,15 +754,10 @@ zfs_zone_wait_adjust(hrtime_t unow)
* arg4: total I/O priority of all active zones
* arg5: calculated disk utilization
*/
- extern void __dtrace_probe_zfs__zone__stats(
- uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
-
- __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat),
- (uintptr_t)(stats.zi_avgwlat),
- (uintptr_t)(stats.zi_active),
- (uintptr_t)(stats.zi_totutil),
- (uintptr_t)(stats.zi_totpri),
- (uintptr_t)(stats.zi_diskutil));
+ DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
+ uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
+ uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
+ uintptr_t, stats.zi_diskutil);
(void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
}
@@ -720,7 +789,7 @@ get_sched_pri_cb(zone_t *zonep, void *arg)
* done any IO over several iterations will see their weight max
* out.
*/
- if (zonep->zone_zfs_weight < 20)
+ if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
zonep->zone_zfs_weight++;
/*
@@ -751,7 +820,9 @@ get_sched_pri_cb(zone_t *zonep, void *arg)
/*
* See if we need to bump a zone's zio to the head of the queue. This is only
- * done on the two synchronous I/O queues.
+ * done on the two synchronous I/O queues (see the block comment on the
+ * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
+ * queue depth from our caller.
*
* For single-threaded synchronous processes a zone cannot get more than
* 1 op into the queue at a time unless the zone is running multiple processes
@@ -811,10 +882,8 @@ get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p)
* Only fire the probe if we actually picked a different zio
* than the one already at the head of the queue.
*/
- extern void __dtrace_probe_zfs__zone__sched__bump(uint_t,
- uint_t, int, int);
- __dtrace_probe_zfs__zone__sched__bump((uint_t)zp->io_zoneid,
- (uint_t)cnt, qbump.zq_priority, qbump.zq_wt);
+ DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
+ uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
}
return (zp);
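A simplified sketch of the bump walk (hypothetical; get_next_zio above is the real version, and it also tracks per-zone counts and uses the weight/priority computed by get_sched_pri_cb to pick the zone):

	/*
	 * Sketch only: scan up to qdepth queued zios for one belonging
	 * to the chosen zone, so it can be bumped to the head.
	 */
	static zio_t *
	find_zone_zio(vdev_queue_class_t *vqc, int qdepth, zoneid_t zid)
	{
		zio_t *zp;
		int i = 0;

		for (zp = avl_first(&vqc->vqc_queued_tree);
		    zp != NULL && i < qdepth;
		    zp = AVL_NEXT(&vqc->vqc_queued_tree, zp), i++) {
			if (zp->io_zoneid == zid)
				return (zp);
		}
		return (NULL);
	}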
@@ -863,7 +932,7 @@ zfs_zone_zio_init(zio_t *zp)
* that are performed at a low level via zfs_zone_zio_start.
*
* Without this, it can look like a non-global zone never writes (case 1).
- * Depending on when the TXG is flushed, the counts may be in the same sample
+ * Depending on when the TXG is synced, the counts may be in the same sample
* bucket or in a different one.
*
* Tracking read operations is simpler due to their synchronous semantics. The
@@ -874,7 +943,7 @@ void
zfs_zone_io_throttle(zfs_zone_iop_type_t type)
{
zone_t *zonep = curzone;
- hrtime_t unow;
+ hrtime_t unow, last_checked;
uint16_t wait;
unow = GET_USEC_TIME;
@@ -905,18 +974,19 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type)
* of our data to track each zone's IO, so the algorithm may make
* incorrect throttling decisions until the data is refreshed.
*/
- if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
- zfs_zone_wait_adjust(unow);
+ last_checked = zfs_zone_last_checked;
+ if ((unow - last_checked) > zfs_zone_adjust_time) {
zfs_zone_last_checked = unow;
+ zfs_zone_wait_adjust(unow, last_checked);
}
if ((wait = zonep->zone_io_delay) > 0) {
/*
- * If this is a write and we're doing above normal TxG
- * flushing, then throttle for longer than normal.
+ * If this is a write and we're doing above normal TXG
+ * syncing, then throttle for longer than normal.
*/
if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
- (txg_cnt > 1 || txg_flush_rate > 1))
+ (txg_cnt > 1 || txg_sync_rate > 1))
wait *= zfs_zone_txg_throttle_scale;
/*
@@ -926,11 +996,8 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type)
* arg1: type of IO operation
* arg2: time to delay (in us)
*/
- extern void __dtrace_probe_zfs__zone__wait(
- uintptr_t, uintptr_t, uintptr_t);
-
- __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id),
- (uintptr_t)type, (uintptr_t)wait);
+ DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
+ uintptr_t, type, uintptr_t, wait);
drv_usecwait(wait);
@@ -946,17 +1013,17 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type)
/*
* XXX Ignore the pool pointer parameter for now.
*
- * Keep track to see if the TxG flush rate is running above the expected rate.
- * If so, this implies that we are filling TxG's at a high rate due to a heavy
+ * Keep track to see if the TXG sync rate is running above the expected rate.
+ * If so, this implies that we are filling TXGs at a high rate due to a heavy
* write workload. We use this as input into the zone throttle.
*
* This function is called every 5 seconds (zfs_txg_timeout) under a normal
- * write load. In this case, the flush rate is going to be 1. When there
- * is a heavy write load, TxG's fill up fast and the sync thread will write
- * the TxG more frequently (perhaps once a second). In this case the rate
- * will be > 1. The flush rate is a lagging indicator since it can be up
+ * write load. In this case, the sync rate is going to be 1. When there
+ * is a heavy write load, TXGs fill up fast and the sync thread will write
+ * the TXG more frequently (perhaps once a second). In this case the rate
+ * will be > 1. The sync rate is a lagging indicator since it can be up
* to 5 seconds old. We use the txg_cnt to keep track of the rate in the
- * current 5 second interval and txg_flush_rate to keep track of the previous
+ * current 5 second interval and txg_sync_rate to keep track of the previous
* 5 second interval. In that way we don't have a period (1 or more seconds)
* where the txg_cnt == 0 and we cut back on throttling even though the rate
* is still high.
@@ -970,7 +1037,7 @@ zfs_zone_report_txg_sync(void *dp)
txg_cnt++;
now = (uint_t)(gethrtime() / NANOSEC);
if ((now - txg_last_check) >= zfs_txg_timeout) {
- txg_flush_rate = txg_cnt / 2;
+ txg_sync_rate = txg_cnt / 2;
txg_cnt = 0;
txg_last_check = now;
}
@@ -986,7 +1053,7 @@ zfs_zone_txg_delay()
}
/*
- * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline
+ * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
* and is issued.
* Keep track of start time for latency calculation in zfs_zone_zio_done.
*/
@@ -1024,7 +1091,7 @@ zfs_zone_zio_start(zio_t *zp)
}
/*
- * Called from vdev_queue_io_done when an IO completes.
+ * Called from vdev_disk_io_done when an IO completes.
* Increment our counter for zone ops.
* Calculate the IO latency avg. for this zone.
*/
@@ -1056,7 +1123,7 @@ zfs_zone_zio_done(zio_t *zp)
zonep->zone_zfs_rwstats.nread += zp->io_size;
zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
- zp->io_dispatched - zp->io_start;
+ zp->io_dispatched - zp->io_timestamp;
kstat_runq_exit(&zonep->zone_zfs_rwstats);
} else {
@@ -1088,11 +1155,8 @@ zfs_zone_zio_done(zio_t *zp)
* arg1: type of I/O operation
* arg2: I/O latency (in us)
*/
- extern void __dtrace_probe_zfs__zone__latency(
- uintptr_t, uintptr_t, uintptr_t);
-
- __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid),
- (uintptr_t)(zp->io_type), (uintptr_t)(udelta));
+ DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
+ uintptr_t, zp->io_type, uintptr_t, udelta);
}
void
@@ -1201,10 +1265,8 @@ zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx)
* was last scheduled off this queue, the zone that was associated
* with the next IO that is scheduled, and which queue (priority).
*/
- extern void __dtrace_probe_zfs__zone__sched(uint_t, uint_t, uint_t,
- uint_t);
- __dtrace_probe_zfs__zone__sched((uint_t)cnt, (uint_t)last_zone,
- (uint_t)zio->io_zoneid, (uint_t)p);
+ DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone,
+ uint_t, zio->io_zoneid, uint_t, p);
return (zio);
}
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index dfd8fa3f65..95fc934bac 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -542,8 +542,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
- zio->io_start = gethrtime();
-
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
@@ -978,8 +976,6 @@ zio_read_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
- zio->io_start = gethrtime();
-
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
@@ -2450,9 +2446,6 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(zio->io_error == 0);
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
- if (zio->io_type == ZIO_TYPE_WRITE)
- zio->io_start = gethrtime();
-
if (vd == NULL) {
if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 3528c89b0e..95839d8494 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -613,8 +613,8 @@ typedef struct zone {
uint16_t zone_io_delay; /* IO delay on logical r/w */
kmutex_t zone_stg_io_lock; /* protects IO window data */
sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */
- sys_zio_cntr_t zone_wr_ops; /* writes and logical writes. */
- sys_zio_cntr_t zone_lwr_ops;
+ sys_zio_cntr_t zone_wr_ops; /* writes and */
+ sys_zio_cntr_t zone_lwr_ops; /* logical writes. */
/*
* kstats and counters for VFS ops and bytes.