| author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2013-10-03 18:33:52 +0000 |
|---|---|---|
| committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2013-10-03 18:33:52 +0000 |
| commit | 2bd35d239981a47757afea42ebb00bbe46bee26a | |
| tree | 5fab386462f0d361ab176e838866342882a89df7 /usr/src | |
| parent | c7ebd51897476aa319daf50054d21d2fd9e696e1 | |
OS-2531 zfs/zone IO throttle comment improvement and code cleanup (tag: release-20131003)
Diffstat (limited to 'usr/src')
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zio.h | 5 |
|---|---|---|
| -rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_zone.c | 304 |
| -rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 7 |
| -rw-r--r-- | usr/src/uts/common/sys/zone.h | 4 |

4 files changed, 187 insertions(+), 133 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 7b383e4a74..8e901b804b 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -407,7 +407,8 @@ struct zio {
 	const zio_vsd_ops_t *io_vsd_ops;
 
 	uint64_t	io_offset;
-	hrtime_t	io_timestamp;
+	hrtime_t	io_timestamp;	/* time I/O entered zio pipeline */
+	hrtime_t	io_dispatched;	/* time I/O was dispatched to disk */
 	avl_node_t	io_queue_node;
 
 	/* Internal pipeline state */
@@ -436,8 +437,6 @@ struct zio {
 	uint64_t	io_ena;
 
 	zoneid_t	io_zoneid;	/* zone which originated this I/O */
-	hrtime_t	io_start;	/* time I/O entered zio pipeline */
-	hrtime_t	io_dispatched;	/* time I/O was dispatched to disk */
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
 };
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
index e7b24915e1..4c2bdccd66 100644
--- a/usr/src/uts/common/fs/zfs/zfs_zone.c
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -1,27 +1,93 @@
 /*
- * CDDL HEADER START
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
  *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
  */
+
 /*
  * Copyright 2013, Joyent, Inc. All rights reserved.
  */
+/*
+ * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
+ * ZFS I/O resources for each zone.
+ *
+ * I/O contention can be a major pain point on a multi-tenant system. A single
+ * zone can issue a stream of I/O operations, usually synchronous writes,
+ * which disrupt I/O performance for all other zones. This problem is further
+ * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
+ * a set of blocks which are atomically synced to disk. The process of
+ * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
+ * out any pending read operations.
+ *
+ * There are two facets to this capability: the throttle and the scheduler.
+ *
+ * Throttle
+ *
+ * The requirements on the throttle are:
+ *
+ *     1) Ensure consistent and predictable I/O latency across all zones.
+ *     2) Sequential and random workloads have very different characteristics,
+ *        so it is a non-starter to track IOPS or throughput.
+ *     3) A zone should be able to use the full disk bandwidth if no other zone
+ *        is actively using the disk.
+ *
+ * The throttle has two components: one to track and account for each zone's
+ * I/O requests, and another to throttle each zone's operations when it
+ * exceeds its fair share of disk I/O. When the throttle detects that a zone
+ * is consuming more than is appropriate, each read or write system call is
+ * delayed by up to 100 microseconds, which we've found is sufficient to
+ * allow other zones to interleave I/O requests during those delays.
+ *
+ * Note: The throttle will delay each logical I/O (as opposed to the physical
+ * I/O which will likely be issued asynchronously), so it may be easier to
+ * think of the I/O throttle delaying each read/write syscall instead of the
+ * actual I/O operation. For each zone, the throttle tracks an ongoing average
+ * of read and write operations performed to determine the overall I/O
+ * utilization for each zone.
+ *
+ * The throttle calculates an I/O utilization metric for each zone using the
+ * following formula:
+ *
+ *     (# of read syscalls) x (Average read latency) +
+ *     (# of write syscalls) x (Average write latency)
+ *
+ * Once each zone has its utilization metric, the I/O throttle will compare
+ * I/O utilization across all zones, and if a zone has a higher-than-average
+ * I/O utilization, system calls from that zone are throttled. That is, if
+ * one zone has a much higher utilization, that zone's delay is increased by
+ * 5 microseconds, up to a maximum of 100 microseconds. Conversely, if a zone
+ * is already throttled and has a lower utilization than average, its delay
+ * will be lowered by 5 microseconds.
+ *
+ * The throttle calculation is driven by IO activity, but since IO does not
+ * happen at fixed intervals, timestamps are used to track when the last
+ * update was made and to drive recalculation.
+ *
+ * The throttle recalculates each zone's I/O usage and throttle delay (if
+ * any) on the zfs_zone_adjust_time interval. Overall I/O latency is
+ * maintained as a decayed average which is updated on the
+ * zfs_zone_sys_avg_cycle interval.
+ *
+ * Scheduler
+ *
+ * The I/O scheduler manages the vdev queues, the queues of pending I/Os to
+ * issue to the disks. It only makes scheduling decisions for the two
+ * synchronous I/O queues (read & write).
+ *
+ * The scheduler maintains a count of how many I/Os in the queue are from
+ * each zone, and if one zone has a disproportionately large number of I/Os
+ * in the queue, the scheduler will allow certain I/Os from the underutilized
+ * zones to be "bumped" and pulled from the middle of the queue. This bump
+ * allows zones with a small number of I/Os (so small they may not even be
+ * taken into account by the throttle) to complete quickly instead of waiting
+ * behind dozens of I/Os from other zones.
+ */
+
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
 #include <sys/zfs_zone.h>
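The new block comment above carries the design of the whole change. As a rough userland sketch of the utilization metric and the one-step delay adjustment it describes (all names, and the `fair_util` input, are illustrative, not the kernel's own code):

```c
#include <stdint.h>

#define	DELAY_STEP	5	/* usec, mirrors zfs_zone_delay_step */
#define	DELAY_CEILING	100	/* usec, mirrors zfs_zone_delay_ceiling */

struct zone_io {
	uint64_t reads, writes;		/* syscalls in the current window */
	uint64_t avg_rlat, avg_wlat;	/* decayed average latency, usec */
	uint16_t delay;			/* current per-syscall delay, usec */
};

/* The utilization metric from the comment: ops weighted by avg latency. */
static uint64_t
zone_util(const struct zone_io *z)
{
	return (z->reads * z->avg_rlat + z->writes * z->avg_wlat);
}

/* Nudge the zone's delay up or down by one step, clamped to the ceiling. */
static void
adjust_delay(struct zone_io *z, uint64_t fair_util)
{
	if (zone_util(z) > fair_util) {
		if (z->delay + DELAY_STEP <= DELAY_CEILING)
			z->delay += DELAY_STEP;
	} else if (z->delay >= DELAY_STEP) {
		z->delay -= DELAY_STEP;
	}
}
```

With this shape, an over-consuming zone creeps toward the 100-microsecond ceiling in 5-microsecond steps and backs off the same way once its utilization drops below its fair share.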
@@ -100,10 +166,8 @@ zfs_zone_txg_delay()
  * over the previous window.
  */
 boolean_t	zfs_zone_delay_enable = B_TRUE;	/* enable IO throttle */
-uint16_t	zfs_zone_delay_step = 5;	/* amount to change delay */
-uint16_t	zfs_zone_delay_ceiling = 100;	/* longest possible delay */
-
-hrtime_t	zfs_zone_last_checked = 0;
+uint16_t	zfs_zone_delay_step = 5;	/* usec amnt to change delay */
+uint16_t	zfs_zone_delay_ceiling = 100;	/* usec delay max */
 
 boolean_t	zfs_zone_priority_enable = B_TRUE;  /* enable IO priority */
@@ -121,7 +185,6 @@ boolean_t	zfs_zone_priority_enable = B_TRUE;  /* enable IO priority */
  */
 uint_t		zfs_zone_rw_lat_limit = 10;
-
 /*
  * The I/O throttle will only start delaying zones when it detects disk
  * utilization has reached a certain level.  This tunable controls the threshold
@@ -140,6 +203,10 @@ uint_t		zfs_zone_util_threshold = 80;
 uint_t 		zfs_zone_sys_avg_cycle = 1000000;	/* 1 s */
 uint_t 		zfs_zone_cycle_time = 2000000;		/* 2 s */
 
+/*
+ * How often the I/O throttle will reevaluate each zone's utilization, in
+ * microseconds. Default is 1/4 sec.
+ */
 uint_t 		zfs_zone_adjust_time = 250000;		/* 250 ms */
 
 typedef struct {
@@ -163,22 +230,31 @@ static sys_lat_cycle_t	rd_lat;
 static sys_lat_cycle_t	wr_lat;
 
 /*
- * Some basic disk stats to determine disk utilization.
+ * Some basic disk stats to determine disk utilization. The utilization info
+ * for all disks on the system is aggregated into these values.
+ *
+ * Overall disk utilization for the current cycle is calculated as:
+ *
+ * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
+ * ----------------------------------------------
+ *    ((now - zfs_zone_last_checked) * 1000);
  */
-kmutex_t	zfs_disk_lock;
-uint_t		zfs_disk_rcnt;
-hrtime_t	zfs_disk_rtime = 0;
-hrtime_t	zfs_disk_rlastupdate = 0;
+kmutex_t	zfs_disk_lock;		/* protects the following: */
+uint_t		zfs_disk_rcnt;		/* Number of outstanding IOs */
+hrtime_t	zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
+hrtime_t	zfs_disk_rlastupdate = 0; /* time last IO dispatched */
 
-hrtime_t	zfs_disk_last_rtime = 0;
+hrtime_t	zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
 
+/* time that we last updated per-zone throttle info */
+hrtime_t	zfs_zone_last_checked = 0;
 
 /*
- * Data used to keep track of how often txg flush is running.
+ * Data used to keep track of how often txg sync is running.
  */
 extern int	zfs_txg_timeout;
 static uint_t	txg_last_check;
 static uint_t	txg_cnt;
-static uint_t	txg_flush_rate;
+static uint_t	txg_sync_rate;
 
 boolean_t	zfs_zone_schedule_enable = B_TRUE;	/* enable IO sched. */
 /*
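The units in that ratio are easy to misread: `zfs_disk_rtime` is accumulated in nanoseconds while the checking interval is measured in microseconds, which is why the numerator is scaled by 100 (for a percentage) and the denominator by 1000 (microseconds to nanoseconds). A quick sanity check with invented numbers:

```c
#include <stdio.h>
#include <inttypes.h>

int
main(void)
{
	/* Illustrative values, not from the source: one 250 ms window. */
	uint64_t rtime = 200000000;	/* disk busy 200 ms, in nsec */
	uint64_t last_rtime = 0;	/* rtime at the previous check */
	uint64_t now = 250000;		/* usec */
	uint64_t last_checked = 0;	/* usec */

	uint64_t diskutil = ((rtime - last_rtime) * 100) /
	    ((now - last_checked) * 1000);

	/*
	 * 200 ms busy out of 250 ms -> 80% utilization, which happens to
	 * be the default zfs_zone_util_threshold at which throttling
	 * starts to kick in.
	 */
	printf("diskutil = %" PRIu64 "%%\n", diskutil);
	return (0);
}
```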
@@ -192,7 +268,19 @@ boolean_t	zfs_zone_schedule_enable = B_TRUE;	/* enable IO sched. */
 uint32_t	zfs_zone_schedule_thresh = 10;
 
 /*
- * Tunables for delay throttling when TxG flush is occurring.
+ * On each pass of the scheduler we increment the zone's weight (up to this
+ * maximum). The weight is used by the scheduler to prevent starvation so
+ * that zones which haven't been able to do any IO over many iterations
+ * will max out their weight to this value.
+ */
+#define	SCHED_WEIGHT_MAX	20
+
+/*
+ * Tunables for delay throttling when TXG sync is occurring.
+ *
+ * If the zone is performing a write and we're doing above normal TXG
+ * syncing, then throttle for longer than normal. The zone's wait time is
+ * multiplied by the scale (zfs_zone_txg_throttle_scale).
+ */
 int		zfs_zone_txg_throttle_scale = 2;
 hrtime_t	zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
@@ -214,6 +302,9 @@ typedef struct {
 /*
  * Keep track of the zone's ZFS IOPs.
  *
+ * See the comment on the zfs_zone_io_throttle function for details on which
+ * IOPs are accounted for and how.
+ *
  * If the number of ops is >1 then we can just use that value.  However,
  * if the number of ops is <2 then we might have a zone which is trying to do
  * IO but is not able to get any ops through the system.  We don't want to lose
@@ -226,8 +317,8 @@ typedef struct {
  * historical count by the proper number of additional cycles in which no IO
  * was performed.
  *
- * Return true if we actually computed a new historical count.
- * If we're still within an active cycle there is nothing to do, return false.
+ * Return a time delta indicating how far into the current cycle we are or 0
+ * if the last IO was more than a cycle ago.
  */
 static hrtime_t
 compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
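Because the counters only advance when IO actually arrives, a zone that goes idle needs its history decayed retroactively when the next IO shows up. A minimal sketch of that catch-up, assuming one halving per idle cycle (the kernel's exact decay factor may differ; all names here are illustrative):

```c
#include <stdint.h>

#define	CYCLE_USEC	2000000		/* mirrors zfs_zone_cycle_time (2 s) */

struct io_cntr {
	uint64_t cycle_start;	/* start of the current cycle, usec */
	uint64_t cycle_cnt;	/* ops counted in the current cycle */
	uint64_t hist_cnt;	/* decayed count from prior cycles */
};

/*
 * If one or more whole cycles passed with no IO, roll the current count
 * into the historical one and decay it once per missed cycle, so stale
 * activity fades gradually instead of vanishing abruptly.
 */
static void
catch_up(struct io_cntr *c, uint64_t unow)
{
	uint64_t elapsed = unow - c->cycle_start;

	if (elapsed < CYCLE_USEC)
		return;			/* still inside the active cycle */

	c->hist_cnt = (c->hist_cnt + c->cycle_cnt) / 2;
	for (elapsed -= CYCLE_USEC; elapsed >= CYCLE_USEC;
	    elapsed -= CYCLE_USEC)
		c->hist_cnt /= 2;	/* one decay per fully idle cycle */

	c->cycle_cnt = 0;
	c->cycle_start = unow - (unow - c->cycle_start) % CYCLE_USEC;
}
```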
@@ -318,16 +409,15 @@ add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
  *
  * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
  * However, since this calculation is driven by IO activity and since IO does
- * not happen
- *
- * at fixed intervals, we use a timestamp to see when the last update was made.
- * If it was more than one cycle ago, then we need to decay the average by the
- * proper number of additional cycles in which no IO was performed.
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
  *
  * Return true if we actually computed a new system average.
  * If we're still within an active cycle there is nothing to do, return false.
  */
-static int
+static boolean_t
 compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
 {
 	hrtime_t delta;
@@ -339,7 +429,7 @@ compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
 	 */
 	delta = unow - cp->cycle_start;
 	if (delta < zfs_zone_sys_avg_cycle)
-		return (0);
+		return (B_FALSE);
 
 	/* A previous cycle is past, compute a new system average. */
@@ -373,7 +463,7 @@ compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
 	cp->cycle_cnt = 0;
 	cp->cycle_lat = 0;
 
-	return (1);
+	return (B_TRUE);
 }
 
 static void
@@ -440,13 +530,10 @@ calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
 		 * We're within a cycle; weight the current activity higher
 		 * compared to the historical data and use that.
 		 */
-		extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t,
-		    uintptr_t, uintptr_t);
-
-		__dtrace_probe_zfs__zone__calc__wt__avg(
-		    (uintptr_t)cp->sys_avg_lat,
-		    (uintptr_t)cp->cycle_lat,
-		    (uintptr_t)cp->cycle_cnt);
+		DTRACE_PROBE3(zfs__zone__calc__wt__avg,
+		    uintptr_t, cp->sys_avg_lat,
+		    uintptr_t, cp->cycle_lat,
+		    uintptr_t, cp->cycle_cnt);
 
 		return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
 		    (1 + (cp->cycle_cnt * 8)));
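The return expression above weights in-cycle samples eight times heavier than the decayed historical average (which counts as a single op). Plugging in sample numbers shows how quickly fresh latency data dominates:

```c
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/*
	 * Illustrative numbers: the historical average says 1000 us/op,
	 * but the in-progress cycle has 2 ops totalling only 400 us.
	 */
	uint64_t sys_avg_lat = 1000;	/* decayed per-op average, usec */
	uint64_t cycle_lat = 400;	/* total latency this cycle, usec */
	uint64_t cycle_cnt = 2;		/* ops this cycle */

	/*
	 * Historical average counts as one op; current-cycle data is
	 * weighted 8x, so two fresh fast ops pull the estimate from
	 * 1000 us down to (1000 + 3200) / (1 + 16) = 247 us.
	 */
	uint64_t avg = (sys_avg_lat + cycle_lat * 8) /
	    (1 + cycle_cnt * 8);

	printf("weighted avg = %llu us\n", (unsigned long long)avg);
	return (0);
}
```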
@@ -481,11 +568,8 @@ get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
 	*wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
 	*lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
 
-	extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t,
-	    uintptr_t, uintptr_t, uintptr_t);
-
-	__dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id,
-	    (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops);
+	DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
+	    uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
 
 	return (*rops | *wops | *lwops);
 }
@@ -510,11 +594,8 @@ get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
 	if (*wlat == 0)
 		*wlat = 1000;
 
-	extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t,
-	    uintptr_t);
-
-	__dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat),
-	    (uintptr_t)*wlat);
+	DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
+	    uintptr_t, *wlat);
 }
 
 /*
@@ -552,12 +633,9 @@ zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
 	 *	arg4: calculated utilization given read and write ops
 	 *	arg5: I/O priority assigned to this zone
 	 */
-	extern void __dtrace_probe_zfs__zone__utilization(
-	    uint_t, uint_t, uint_t, uint_t, uint_t, uint_t);
-
-	__dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id),
-	    (uint_t)rops, (uint_t)wops, (uint_t)lwops,
-	    (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri);
+	DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
+	    uint_t, rops, uint_t, wops, uint_t, lwops,
+	    uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
 
 	return (0);
 }
@@ -623,13 +701,9 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
 	 *	arg3: calculated fair I/O utilization
 	 *	arg4: actual I/O utilization
 	 */
-	extern void __dtrace_probe_zfs__zone__throttle(
-	    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
-
-	__dtrace_probe_zfs__zone__throttle(
-	    (uintptr_t)zonep->zone_id, (uintptr_t)delay,
-	    (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil,
-	    (uintptr_t)zonep->zone_io_util);
+	DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
+	    uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
+	    uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
 
 	return (0);
 }
@@ -639,7 +713,7 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
  * each zone appropriately.
  */
 static void
-zfs_zone_wait_adjust(hrtime_t unow)
+zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
 {
 	zoneio_stats_t stats;
@@ -659,12 +733,12 @@
 	/*
 	 * Calculate disk utilization for the most recent period.
 	 */
-	if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) {
+	if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
 		stats.zi_diskutil = 0;
 	} else {
 		stats.zi_diskutil =
 		    ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
-		    ((unow - zfs_zone_last_checked) * 1000);
+		    ((unow - last_checked) * 1000);
 	}
 	zfs_disk_last_rtime = zfs_disk_rtime;
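The mechanical cleanup running through this file replaces each hand-declared `extern void __dtrace_probe_*()` pair with a single `DTRACE_PROBEn()` macro from `<sys/sdt.h>`, which declares and fires the statically-defined-tracing probe in one step. Reduced to its essentials (the function below is a made-up host for the macro, not code from this commit):

```c
#include <sys/sdt.h>

static void
report_stats(uint64_t avgrlat, uint64_t avgwlat)
{
	/*
	 * Expands to an SDT probe site; no extern declaration needed.
	 * Consumers see it under the sdt provider with the double
	 * underscores rendered as dashes, e.g. sdt:::zfs-zone-stats.
	 */
	DTRACE_PROBE2(zfs__zone__stats, uintptr_t, avgrlat,
	    uintptr_t, avgwlat);
}
```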
@@ -680,15 +754,10 @@
 	 *	arg4: total I/O priority of all active zones
 	 *	arg5: calculated disk utilization
 	 */
-	extern void __dtrace_probe_zfs__zone__stats(
-	    uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
-
-	__dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat),
-	    (uintptr_t)(stats.zi_avgwlat),
-	    (uintptr_t)(stats.zi_active),
-	    (uintptr_t)(stats.zi_totutil),
-	    (uintptr_t)(stats.zi_totpri),
-	    (uintptr_t)(stats.zi_diskutil));
+	DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
+	    uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
+	    uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
+	    uintptr_t, stats.zi_diskutil);
 
 	(void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
 }
@@ -720,7 +789,7 @@ get_sched_pri_cb(zone_t *zonep, void *arg)
 	 * done any IO over several iterations will see their weight max
 	 * out.
 	 */
-	if (zonep->zone_zfs_weight < 20)
+	if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
 		zonep->zone_zfs_weight++;
 
 	/*
@@ -751,7 +820,9 @@ get_sched_pri_cb(zone_t *zonep, void *arg)
 /*
  * See if we need to bump a zone's zio to the head of the queue. This is only
- * done on the two synchronous I/O queues.
+ * done on the two synchronous I/O queues (see the block comment on the
+ * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
+ * queue depth from our caller.
  *
  * For single-threaded synchronous processes a zone cannot get more than
  * 1 op into the queue at a time unless the zone is running multiple processes
@@ -811,10 +882,8 @@ get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p)
 		 * Only fire the probe if we actually picked a different zio
 		 * than the one already at the head of the queue.
 		 */
-		extern void __dtrace_probe_zfs__zone__sched__bump(uint_t,
-		    uint_t, int, int);
-		__dtrace_probe_zfs__zone__sched__bump((uint_t)zp->io_zoneid,
-		    (uint_t)cnt, qbump.zq_priority, qbump.zq_wt);
+		DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
+		    uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
 	}
 
 	return (zp);
@@ -863,7 +932,7 @@ zfs_zone_zio_init(zio_t *zp)
  * that are performed at a low level via zfs_zone_zio_start.
  *
  * Without this, it can look like a non-global zone never writes (case 1).
- * Depending on when the TXG is flushed, the counts may be in the same sample
+ * Depending on when the TXG is synced, the counts may be in the same sample
  * bucket or in a different one.
 *
  * Tracking read operations is simpler due to their synchronous semantics.  The
@@ -874,7 +943,7 @@
 void
 zfs_zone_io_throttle(zfs_zone_iop_type_t type)
 {
 	zone_t *zonep = curzone;
-	hrtime_t unow;
+	hrtime_t unow, last_checked;
 	uint16_t wait;
 
 	unow = GET_USEC_TIME;
@@ -905,18 +974,19 @@
 	 * of our data to track each zone's IO, so the algorithm may make
 	 * incorrect throttling decisions until the data is refreshed.
 	 */
-	if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
-		zfs_zone_wait_adjust(unow);
+	last_checked = zfs_zone_last_checked;
+	if ((unow - last_checked) > zfs_zone_adjust_time) {
 		zfs_zone_last_checked = unow;
+		zfs_zone_wait_adjust(unow, last_checked);
 	}
 
 	if ((wait = zonep->zone_io_delay) > 0) {
 		/*
-		 * If this is a write and we're doing above normal TxG
-		 * flushing, then throttle for longer than normal.
+		 * If this is a write and we're doing above normal TXG
+		 * syncing, then throttle for longer than normal.
 		 */
 		if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
-		    (txg_cnt > 1 || txg_flush_rate > 1))
+		    (txg_cnt > 1 || txg_sync_rate > 1))
 			wait *= zfs_zone_txg_throttle_scale;
 
 		/*
@@ -926,11 +996,8 @@
 		 *	arg1: type of IO operation
 		 *	arg2: time to delay (in us)
 		 */
-		extern void __dtrace_probe_zfs__zone__wait(
-		    uintptr_t, uintptr_t, uintptr_t);
-
-		__dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id),
-		    (uintptr_t)type, (uintptr_t)wait);
+		DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
+		    uintptr_t, type, uintptr_t, wait);
 
 		drv_usecwait(wait);
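Note the reordering in the hunk above: the old code called `zfs_zone_wait_adjust()` before updating `zfs_zone_last_checked`, so the adjustment walk read a global that a concurrent throttled syscall could move underneath it. The new code snapshots the timestamp, publishes the new one, and then passes the snapshot down so the utilization window stays consistent. The pattern, stripped of kernel context (names abbreviated for illustration):

```c
#include <stdint.h>

#define	ADJUST_USEC	250000	/* mirrors zfs_zone_adjust_time */

static uint64_t last_checked;	/* shared across all throttled syscalls */

static void wait_adjust(uint64_t unow, uint64_t prev);	/* the recalc walk */

static void
maybe_adjust(uint64_t unow)
{
	uint64_t prev = last_checked;	/* snapshot the old stamp first */

	if (unow - prev > ADJUST_USEC) {
		last_checked = unow;	/* publish the new stamp, then walk */
		wait_adjust(unow, prev);/* window endpoints stay consistent */
	}
}
```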
@@ -946,17 +1013,17 @@
 /*
  * XXX Ignore the pool pointer parameter for now.
  *
- * Keep track to see if the TxG flush rate is running above the expected rate.
- * If so, this implies that we are filling TxG's at a high rate due to a heavy
+ * Keep track to see if the TXG sync rate is running above the expected rate.
+ * If so, this implies that we are filling TXG's at a high rate due to a heavy
  * write workload.  We use this as input into the zone throttle.
  *
  * This function is called every 5 seconds (zfs_txg_timeout) under a normal
- * write load.  In this case, the flush rate is going to be 1.  When there
- * is a heavy write load, TxG's fill up fast and the sync thread will write
- * the TxG more frequently (perhaps once a second).  In this case the rate
- * will be > 1.  The flush rate is a lagging indicator since it can be up
+ * write load.  In this case, the sync rate is going to be 1.  When there
+ * is a heavy write load, TXG's fill up fast and the sync thread will write
+ * the TXG more frequently (perhaps once a second).  In this case the rate
+ * will be > 1.  The sync rate is a lagging indicator since it can be up
  * to 5 seconds old.  We use the txg_cnt to keep track of the rate in the
- * current 5 second interval and txg_flush_rate to keep track of the previous
+ * current 5 second interval and txg_sync_rate to keep track of the previous
  * 5 second interval.  In that way we don't have a period (1 or more seconds)
  * where the txg_cnt == 0 and we cut back on throttling even though the rate
  * is still high.
@@ -970,7 +1037,7 @@ zfs_zone_report_txg_sync(void *dp)
 	txg_cnt++;
 	now = (uint_t)(gethrtime() / NANOSEC);
 	if ((now - txg_last_check) >= zfs_txg_timeout) {
-		txg_flush_rate = txg_cnt / 2;
+		txg_sync_rate = txg_cnt / 2;
 		txg_cnt = 0;
 		txg_last_check = now;
 	}
@@ -986,7 +1053,7 @@ zfs_zone_txg_delay()
 }
 
 /*
- * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline
+ * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
  * and is issued.
  * Keep track of start time for latency calculation in zfs_zone_zio_done.
  */
@@ -1024,7 +1091,7 @@ zfs_zone_zio_start(zio_t *zp)
 }
 
 /*
- * Called from vdev_queue_io_done when an IO completes.
+ * Called from vdev_disk_io_done when an IO completes.
  * Increment our counter for zone ops.
  * Calculate the IO latency avg. for this zone.
  */
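The rename from "flush" to "sync" matches the terminology used by the rest of ZFS (TXGs are synced by the sync thread, not flushed). The lagging two-window rate the comment describes can be sketched as follows (an illustrative userland reduction of `zfs_zone_report_txg_sync`, not the committed code):

```c
#include <stdint.h>

#define	TXG_TIMEOUT	5	/* seconds, mirrors zfs_txg_timeout */

static uint32_t txg_cnt;	/* syncs seen in the current window */
static uint32_t txg_sync_rate;	/* rate carried over from the last window */
static uint32_t txg_last_check;	/* window start, in seconds */

/* Called once per TXG sync. */
static void
report_txg_sync(uint32_t now)
{
	txg_cnt++;
	if (now - txg_last_check >= TXG_TIMEOUT) {
		/*
		 * A heavy write load syncs TXGs more often than once per
		 * window, pushing the previous-window rate above 1, which
		 * zfs_zone_io_throttle uses to scale up write delays.
		 * Keeping the previous window's rate avoids easing the
		 * throttle the instant txg_cnt resets to 0.
		 */
		txg_sync_rate = txg_cnt / 2;
		txg_cnt = 0;
		txg_last_check = now;
	}
}
```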
@@ -1056,7 +1123,7 @@ zfs_zone_zio_done(zio_t *zp)
 		zonep->zone_zfs_rwstats.nread += zp->io_size;
 
 		zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
-		    zp->io_dispatched - zp->io_start;
+		    zp->io_dispatched - zp->io_timestamp;
 
 		kstat_runq_exit(&zonep->zone_zfs_rwstats);
 	} else {
@@ -1088,11 +1155,8 @@ zfs_zone_zio_done(zio_t *zp)
 	 *	arg1: type of I/O operation
 	 *	arg2: I/O latency (in us)
 	 */
-	extern void __dtrace_probe_zfs__zone__latency(
-	    uintptr_t, uintptr_t, uintptr_t);
-
-	__dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid),
-	    (uintptr_t)(zp->io_type), (uintptr_t)(udelta));
+	DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
+	    uintptr_t, zp->io_type, uintptr_t, udelta);
 }
 
 void
@@ -1201,10 +1265,8 @@ zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx)
 	 * was last scheduled off this queue, the zone that was associated
 	 * with the next IO that is scheduled, and which queue (priority).
 	 */
-	extern void __dtrace_probe_zfs__zone__sched(uint_t, uint_t, uint_t,
-	    uint_t);
-	__dtrace_probe_zfs__zone__sched((uint_t)cnt, (uint_t)last_zone,
-	    (uint_t)zio->io_zoneid, (uint_t)p);
+	DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone,
+	    uint_t, zio->io_zoneid, uint_t, p);
 
 	return (zio);
 }
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index dfd8fa3f65..95fc934bac 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -542,8 +542,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 	bzero(zio, sizeof (zio_t));
 
-	zio->io_start = gethrtime();
-
 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
@@ -978,8 +976,6 @@ zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
-	zio->io_start = gethrtime();
-
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW)) {
@@ -2450,9 +2446,6 @@ zio_vdev_io_start(zio_t *zio)
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
 
-	if (zio->io_type == ZIO_TYPE_WRITE)
-		zio->io_start = gethrtime();
-
 	if (vd == NULL) {
 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 3528c89b0e..95839d8494 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -613,8 +613,8 @@ typedef struct zone {
 	uint16_t	zone_io_delay;		/* IO delay on logical r/w */
 	kmutex_t	zone_stg_io_lock;	/* protects IO window data */
 	sys_zio_cntr_t	zone_rd_ops;		/* Counters for ZFS reads, */
-	sys_zio_cntr_t	zone_wr_ops;		/* writes and logical writes. */
-	sys_zio_cntr_t	zone_lwr_ops;
+	sys_zio_cntr_t	zone_wr_ops;		/* writes and */
+	sys_zio_cntr_t	zone_lwr_ops;		/* logical writes. */
 
 	/*
 	 * kstats and counters for VFS ops and bytes.
