Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c | 344
-rw-r--r--  usr/src/uts/common/os/vm_pageout.c   |   4
-rw-r--r--  usr/src/uts/common/os/zone.c         | 230
-rw-r--r--  usr/src/uts/common/sys/zone.h        |  68
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c     |   6
5 files changed, 397 insertions, 255 deletions
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c index 4861c64f8e..59357cbee5 100644 --- a/usr/src/uts/common/fs/zfs/zfs_zone.c +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015, Joyent, Inc. All rights reserved. + * Copyright 2018, Joyent, Inc. All rights reserved. */ /* @@ -166,8 +166,8 @@ zfs_zone_txg_delay() * over the previous window. */ boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ -uint16_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ -uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */ +uint8_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ +uint8_t zfs_zone_delay_ceiling = 100; /* usec delay max */ boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ @@ -238,9 +238,9 @@ uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ typedef struct { hrtime_t cycle_start; - int cycle_cnt; hrtime_t cycle_lat; hrtime_t sys_avg_lat; + uint_t cycle_cnt; } sys_lat_cycle_t; typedef struct { @@ -275,6 +275,7 @@ hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */ hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */ /* time that we last updated per-zone throttle info */ +kmutex_t zfs_last_check_lock; /* protects zfs_zone_last_checked */ hrtime_t zfs_zone_last_checked = 0; hrtime_t zfs_disk_last_laggard = 0; @@ -412,22 +413,32 @@ compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) * Add IO op data to the zone. */ static void -add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op) +add_zone_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op) { + zone_zfs_io_t *iop; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + switch (op) { case ZFS_ZONE_IOP_READ: - (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops); - zonep->zone_rd_ops.cycle_cnt++; + (void) compute_historical_zone_cnt(unow, &iop->zpers_rd_ops); + iop->zpers_rd_ops.cycle_cnt++; break; case ZFS_ZONE_IOP_WRITE: - (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops); - zonep->zone_wr_ops.cycle_cnt++; + (void) compute_historical_zone_cnt(unow, &iop->zpers_wr_ops); + iop->zpers_wr_ops.cycle_cnt++; break; case ZFS_ZONE_IOP_LOGICAL_WRITE: - (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops); - zonep->zone_lwr_ops.cycle_cnt++; + (void) compute_historical_zone_cnt(unow, &iop->zpers_lwr_ops); + iop->zpers_lwr_ops.cycle_cnt++; break; } + mutex_exit(&zpd->zpers_zfs_lock); } /* @@ -502,13 +513,13 @@ add_sys_iop(hrtime_t unow, int op, int lat) switch (op) { case ZFS_ZONE_IOP_READ: (void) compute_new_sys_avg(unow, &rd_lat); - rd_lat.cycle_cnt++; - rd_lat.cycle_lat += lat; + atomic_inc_uint(&rd_lat.cycle_cnt); + atomic_add_64((uint64_t *)&rd_lat.cycle_lat, (int64_t)lat); break; case ZFS_ZONE_IOP_WRITE: (void) compute_new_sys_avg(unow, &wr_lat); - wr_lat.cycle_cnt++; - wr_lat.cycle_lat += lat; + atomic_inc_uint(&wr_lat.cycle_cnt); + atomic_add_64((uint64_t *)&wr_lat.cycle_lat, (int64_t)lat); break; } } @@ -575,10 +586,11 @@ calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) * The latency parameter is in usecs. 
*/ static void -add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +add_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op, + hrtime_t lat) { /* Add op to zone */ - add_zone_iop(zonep, unow, op); + add_zone_iop(zpd, unow, op); /* Track system latency */ if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) @@ -591,14 +603,16 @@ add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) * return a non-zero value, otherwise return 0. */ static int -get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, +get_zone_io_cnt(hrtime_t unow, zone_zfs_io_t *zpd, uint_t *rops, uint_t *wops, uint_t *lwops) { - *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); - *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); - *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + ASSERT3P(zpd, !=, NULL); - DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id, + *rops = calc_zone_cnt(unow, &zpd->zpers_rd_ops); + *wops = calc_zone_cnt(unow, &zpd->zpers_wr_ops); + *lwops = calc_zone_cnt(unow, &zpd->zpers_lwr_ops); + + DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zpd, uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops); return (*rops | *wops | *lwops); @@ -637,20 +651,24 @@ zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) { zoneio_stats_t *sp = arg; uint_t rops, wops, lwops; + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop = zpd->zpers_zfsp; + + ASSERT(MUTEX_HELD(&zpd->zpers_zfs_lock)); + ASSERT3P(iop, !=, NULL); if (zonep->zone_id == GLOBAL_ZONEID || - get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { - zonep->zone_io_util = 0; + get_zone_io_cnt(sp->zi_now, iop, &rops, &wops, &lwops) == 0) { return (0); } - zonep->zone_io_util = (rops * sp->zi_avgrlat) + - (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); - sp->zi_totutil += zonep->zone_io_util; + iop->zpers_io_util = (rops * sp->zi_avgrlat) + (wops * sp->zi_avgwlat) + + (lwops * sp->zi_avgwlat); + sp->zi_totutil += iop->zpers_io_util; - if (zonep->zone_io_util > 0) { + if (iop->zpers_io_util > 0) { sp->zi_active++; - sp->zi_totpri += zonep->zone_zfs_io_pri; + sp->zi_totpri += iop->zpers_zfs_io_pri; } /* @@ -665,23 +683,27 @@ zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) */ DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id, uint_t, rops, uint_t, wops, uint_t, lwops, - uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri); + uint64_t, iop->zpers_io_util, uint16_t, iop->zpers_zfs_io_pri); return (0); } static void -zfs_zone_delay_inc(zone_t *zonep) +zfs_zone_delay_inc(zone_zfs_io_t *zpd) { - if (zonep->zone_io_delay < zfs_zone_delay_ceiling) - zonep->zone_io_delay += zfs_zone_delay_step; + ASSERT3P(zpd, !=, NULL); + + if (zpd->zpers_io_delay < zfs_zone_delay_ceiling) + zpd->zpers_io_delay += zfs_zone_delay_step; } static void -zfs_zone_delay_dec(zone_t *zonep) +zfs_zone_delay_dec(zone_zfs_io_t *zpd) { - if (zonep->zone_io_delay > 0) - zonep->zone_io_delay -= zfs_zone_delay_step; + ASSERT3P(zpd, !=, NULL); + + if (zpd->zpers_io_delay > 0) + zpd->zpers_io_delay -= zfs_zone_delay_step; } /* @@ -691,18 +713,24 @@ zfs_zone_delay_dec(zone_t *zonep) static int zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) { + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop = zpd->zpers_zfsp; zoneio_stats_t *sp = arg; - uint16_t delay = zonep->zone_io_delay; + uint8_t delay; uint_t fairutil = 0; - zonep->zone_io_util_above_avg = B_FALSE; + ASSERT(MUTEX_HELD(&zpd->zpers_zfs_lock)); + ASSERT3P(iop, !=, NULL); + + 
delay = iop->zpers_io_delay; + iop->zpers_io_util_above_avg = 0; /* * Given the calculated total utilitzation for all zones, calculate the * fair share of I/O for this zone. */ if (zfs_zone_priority_enable && sp->zi_totpri > 0) { - fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) / + fairutil = (sp->zi_totutil * iop->zpers_zfs_io_pri) / sp->zi_totpri; } else if (sp->zi_active > 0) { fairutil = sp->zi_totutil / sp->zi_active; @@ -712,14 +740,14 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) * Adjust each IO's delay. If the overall delay becomes too high, avoid * increasing beyond the ceiling value. */ - if (zonep->zone_io_util > fairutil && sp->zi_overutil) { - zonep->zone_io_util_above_avg = B_TRUE; + if (iop->zpers_io_util > fairutil && sp->zi_overutil) { + iop->zpers_io_util_above_avg = 1; if (sp->zi_active > 1) - zfs_zone_delay_inc(zonep); - } else if (zonep->zone_io_util < fairutil || sp->zi_underutil || + zfs_zone_delay_inc(iop); + } else if (iop->zpers_io_util < fairutil || sp->zi_underutil || sp->zi_active <= 1) { - zfs_zone_delay_dec(zonep); + zfs_zone_delay_dec(iop); } /* @@ -732,8 +760,8 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) * arg4: actual I/O utilization */ DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id, - uintptr_t, delay, uintptr_t, zonep->zone_io_delay, - uintptr_t, fairutil, uintptr_t, zonep->zone_io_util); + uintptr_t, delay, uintptr_t, iop->zpers_io_delay, + uintptr_t, fairutil, uintptr_t, iop->zpers_io_util); return (0); } @@ -823,10 +851,20 @@ get_sched_pri_cb(zone_t *zonep, void *arg) uint_t cnt; zone_q_bump_t *qbp = arg; zio_priority_t p = qbp->zq_queue; + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop; - cnt = zonep->zone_zfs_queued[p]; + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + cnt = iop->zpers_zfs_queued[p]; if (cnt == 0) { - zonep->zone_zfs_weight = 0; + iop->zpers_zfs_weight = 0; + mutex_exit(&zpd->zpers_zfs_lock); return (0); } @@ -837,8 +875,8 @@ get_sched_pri_cb(zone_t *zonep, void *arg) * done any IO over several iterations will see their weight max * out. */ - if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX) - zonep->zone_zfs_weight++; + if (iop->zpers_zfs_weight < SCHED_WEIGHT_MAX) + iop->zpers_zfs_weight++; /* * This zone's IO priority is the inverse of the number of IOs @@ -852,7 +890,7 @@ get_sched_pri_cb(zone_t *zonep, void *arg) * which haven't done IO in a while aren't getting starved. */ pri = (qbp->zq_qdepth / cnt) * - zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + iop->zpers_zfs_io_pri * iop->zpers_zfs_weight; /* * If this zone has a higher priority than what we found so far, @@ -861,8 +899,9 @@ get_sched_pri_cb(zone_t *zonep, void *arg) if (pri > qbp->zq_priority) { qbp->zq_zoneid = zonep->zone_id; qbp->zq_priority = pri; - qbp->zq_wt = zonep->zone_zfs_weight; + qbp->zq_wt = iop->zpers_zfs_weight; } + mutex_exit(&zpd->zpers_zfs_lock); return (0); } @@ -996,8 +1035,10 @@ zfs_zone_zio_init(zio_t *zp) void zfs_zone_io_throttle(zfs_zone_iop_type_t type) { - zone_t *zonep = curzone; - hrtime_t unow, last_checked; + zoneid_t zid = curzone->zone_id; + zone_persist_t *zpd = &zone_pdata[zid]; + zone_zfs_io_t *iop; + hrtime_t unow; uint16_t wait; unow = GET_USEC_TIME; @@ -1007,34 +1048,60 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type) * tracking physical IO operations are handled in zfs_zone_zio_done. 
*/ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { - mutex_enter(&zonep->zone_stg_io_lock); - add_iop(zonep, unow, type, 0); - mutex_exit(&zonep->zone_stg_io_lock); + add_iop(zpd, unow, type, 0); } if (!zfs_zone_delay_enable) return; + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + /* * If the zone's I/O priority is set to zero, don't throttle that zone's * operations at all. */ - if (zonep->zone_zfs_io_pri == 0) + if (iop->zpers_zfs_io_pri == 0) { + mutex_exit(&zpd->zpers_zfs_lock); return; + } - /* - * XXX There's a potential race here in that more than one thread may - * update the zone delays concurrently. The worst outcome is corruption - * of our data to track each zone's IO, so the algorithm may make - * incorrect throttling decisions until the data is refreshed. - */ - last_checked = zfs_zone_last_checked; - if ((unow - last_checked) > zfs_zone_adjust_time) { - zfs_zone_last_checked = unow; - zfs_zone_wait_adjust(unow, last_checked); + /* Handle periodically updating the per-zone I/O parameters */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + hrtime_t last_checked; + boolean_t do_update = B_FALSE; + + /* Recheck under mutex */ + mutex_enter(&zfs_last_check_lock); + last_checked = zfs_zone_last_checked; + if ((unow - last_checked) > zfs_zone_adjust_time) { + zfs_zone_last_checked = unow; + do_update = B_TRUE; + } + mutex_exit(&zfs_last_check_lock); + + if (do_update) { + mutex_exit(&zpd->zpers_zfs_lock); + + zfs_zone_wait_adjust(unow, last_checked); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + } } - if ((wait = zonep->zone_io_delay) > 0) { + wait = iop->zpers_io_delay; + mutex_exit(&zpd->zpers_zfs_lock); + + if (wait > 0) { /* * If this is a write and we're doing above normal TXG * syncing, then throttle for longer than normal. 
@@ -1050,15 +1117,15 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type) * arg1: type of IO operation * arg2: time to delay (in us) */ - DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id, + DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zid, uintptr_t, type, uintptr_t, wait); drv_usecwait(wait); - if (zonep->zone_vfs_stats != NULL) { - atomic_inc_64(&zonep->zone_vfs_stats-> + if (curzone->zone_vfs_stats != NULL) { + atomic_inc_64(&curzone->zone_vfs_stats-> zv_delay_cnt.value.ui64); - atomic_add_64(&zonep->zone_vfs_stats-> + atomic_add_64(&curzone->zone_vfs_stats-> zv_delay_time.value.ui64, wait); } } @@ -1100,8 +1167,23 @@ zfs_zone_report_txg_sync(void *dp) hrtime_t zfs_zone_txg_delay() { - if (curzone->zone_io_util_above_avg) + zone_persist_t *zpd = &zone_pdata[curzone->zone_id]; + zone_zfs_io_t *iop; + uint8_t above; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + above = iop->zpers_io_util_above_avg; + mutex_exit(&zpd->zpers_zfs_lock); + + if (above) { return (zfs_zone_txg_delay_nsec); + } return (MSEC2NSEC(10)); } @@ -1114,7 +1196,8 @@ zfs_zone_txg_delay() void zfs_zone_zio_start(zio_t *zp) { - zone_t *zonep; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; /* * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for @@ -1124,14 +1207,14 @@ zfs_zone_zio_start(zio_t *zp) if (zp->io_type == ZIO_TYPE_IOCTL) return; - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) - return; - - mutex_enter(&zonep->zone_zfs_lock); - if (zp->io_type == ZIO_TYPE_READ) - kstat_runq_enter(&zonep->zone_zfs_rwstats); - zonep->zone_zfs_weight = 0; - mutex_exit(&zonep->zone_zfs_lock); + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&iop->zpers_zfs_rwstats); + iop->zpers_zfs_weight = 0; + } + mutex_exit(&zpd->zpers_zfs_lock); mutex_enter(&zfs_disk_lock); zp->io_dispatched = gethrtime(); @@ -1140,8 +1223,6 @@ zfs_zone_zio_start(zio_t *zp) zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); zfs_disk_rlastupdate = zp->io_dispatched; mutex_exit(&zfs_disk_lock); - - zone_rele(zonep); } /* @@ -1152,7 +1233,8 @@ zfs_zone_zio_start(zio_t *zp) void zfs_zone_zio_done(zio_t *zp) { - zone_t *zonep; + zone_persist_t *zpd; + zone_zfs_io_t *iop; hrtime_t now, unow, udelta; if (zp->io_type == ZIO_TYPE_IOCTL) @@ -1161,34 +1243,33 @@ zfs_zone_zio_done(zio_t *zp) if (zp->io_dispatched == 0) return; - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) - return; + zpd = &zone_pdata[zp->io_zoneid]; now = gethrtime(); unow = NANO_TO_MICRO(now); udelta = unow - NANO_TO_MICRO(zp->io_dispatched); - mutex_enter(&zonep->zone_zfs_lock); - - /* - * To calculate the wsvc_t average, keep a cumulative sum of all the - * wait time before each I/O was dispatched. Since most writes are - * asynchronous, only track the wait time for read I/Os. - */ - if (zp->io_type == ZIO_TYPE_READ) { - zonep->zone_zfs_rwstats.reads++; - zonep->zone_zfs_rwstats.nread += zp->io_size; - - zonep->zone_zfs_stats->zz_waittime.value.ui64 += - zp->io_dispatched - zp->io_timestamp; - - kstat_runq_exit(&zonep->zone_zfs_rwstats); - } else { - zonep->zone_zfs_rwstats.writes++; - zonep->zone_zfs_rwstats.nwritten += zp->io_size; + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + /* + * To calculate the wsvc_t average, keep a cumulative sum of + * all the wait time before each I/O was dispatched. 
Since most + * writes are asynchronous, only track the wait time for + * read I/Os. + */ + if (zp->io_type == ZIO_TYPE_READ) { + iop->zpers_zfs_rwstats.reads++; + iop->zpers_zfs_rwstats.nread += zp->io_size; + iop->zpers_zfs_rd_waittime += + zp->io_dispatched - zp->io_timestamp; + kstat_runq_exit(&iop->zpers_zfs_rwstats); + } else { + iop->zpers_zfs_rwstats.writes++; + iop->zpers_zfs_rwstats.nwritten += zp->io_size; + } } - - mutex_exit(&zonep->zone_zfs_lock); + mutex_exit(&zpd->zpers_zfs_lock); mutex_enter(&zfs_disk_lock); zfs_disk_rcnt--; @@ -1201,14 +1282,10 @@ zfs_zone_zio_done(zio_t *zp) mutex_exit(&zfs_disk_lock); if (zfs_zone_delay_enable) { - mutex_enter(&zonep->zone_stg_io_lock); - add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? + add_iop(zpd, unow, zp->io_type == ZIO_TYPE_READ ? ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); - mutex_exit(&zonep->zone_stg_io_lock); } - zone_rele(zonep); - /* * sdt:::zfs-zone-latency * @@ -1224,7 +1301,8 @@ void zfs_zone_zio_dequeue(zio_t *zp) { zio_priority_t p; - zone_t *zonep; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; p = zp->io_priority; if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) @@ -1233,24 +1311,25 @@ zfs_zone_zio_dequeue(zio_t *zp) /* We depend on p being defined as either 0 or 1 */ ASSERT(p < 2); - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) - return; - - mutex_enter(&zonep->zone_stg_io_lock); - ASSERT(zonep->zone_zfs_queued[p] > 0); - if (zonep->zone_zfs_queued[p] == 0) - cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); - else - zonep->zone_zfs_queued[p]--; - mutex_exit(&zonep->zone_stg_io_lock); - zone_rele(zonep); + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + ASSERT(iop->zpers_zfs_queued[p] > 0); + if (iop->zpers_zfs_queued[p] == 0) { + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + } else { + iop->zpers_zfs_queued[p]--; + } + } + mutex_exit(&zpd->zpers_zfs_lock); } void zfs_zone_zio_enqueue(zio_t *zp) { zio_priority_t p; - zone_t *zonep; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; p = zp->io_priority; if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) @@ -1259,13 +1338,12 @@ zfs_zone_zio_enqueue(zio_t *zp) /* We depend on p being defined as either 0 or 1 */ ASSERT(p < 2); - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) - return; - - mutex_enter(&zonep->zone_stg_io_lock); - zonep->zone_zfs_queued[p]++; - mutex_exit(&zonep->zone_stg_io_lock); - zone_rele(zonep); + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + iop->zpers_zfs_queued[p]++; + } + mutex_exit(&zpd->zpers_zfs_lock); } /* diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index c6e54a75c1..f5ee76a2cb 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1249,7 +1249,7 @@ checkpage(struct page *pp, int whichhand) ASSERT(pp->p_zoneid == ALL_ZONES || pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); if (pp->p_zoneid == ALL_ZONES || - zone_pcap_data[pp->p_zoneid].zpcap_over == 0) { + zone_pdata[pp->p_zoneid].zpers_over == 0) { /* * Cross-zone shared page, or zone not over it's cap. * Leave the page alone. 
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 843adc1ee0..06a8549c5b 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017, Joyent Inc. + * Copyright 2018, Joyent Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -313,6 +313,7 @@ static id_space_t *zoneid_space; * 'global_zone'. */ zone_t zone0; +zone_zfs_io_t zone0_zp_zfs; zone_t *global_zone = NULL; /* Set when the global zone is initialized */ /* @@ -429,11 +430,18 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, static const int ZONE_SYSCALL_API_VERSION = 7; /* - * "zone_pcap_data" is an array indexed by zoneid. Each member stores the zone's - * current page usage, its page limit, a flag indicating if the zone is - * over its physical memory cap and various statistics. The zpcap_over flag is - * the interface for the page scanner to use when reclaiming pages for zones - * that are over their cap. + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories; + * 1) pages and RSS data associated with processes inside a zone + * 2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. * * All zone physical memory cap data is stored in this array instead of within * the zone structure itself. This is because zone structures come and go, but @@ -448,33 +456,40 @@ static const int ZONE_SYSCALL_API_VERSION = 7; * page scanning. * * The page scanner can run when "zone_num_over_cap" is non-zero. It can - * do a direct lookup of a zoneid into the "zone_pcap_data" array to determine + * do a direct lookup of a zoneid into the "zone_pdata" array to determine * if that zone is over its cap. * * There is no locking for the page scanner to perform these two checks. * We cannot have the page scanner blocking normal paging activity for * running processes. Because the physical memory cap is a soft cap, it is * fine for the scanner to simply read the current state of the counter and - * the zone's zpcap_over entry in the array. The scanner should never modify + * the zone's zpers_over entry in the array. The scanner should never modify * either of these items. Internally the entries and the counter are managed * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We * take care to ensure that we only take the zone_physcap_lock mutex when a * zone is transitioning over/under its physical memory cap. * * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage - * the "zone_pcap_data" array and associated counter. + * the "zone_pdata" array and associated counter. * - * The zone_pcap_t structure tracks the zone's physical cap and phyiscal usage - * in terms of pages. These values are currently defined as uint32. Thus, the - * maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) since - * UINT_MAX means the zone's RSS is unlimited. 
Assuming a 4k page size, a + * The zone_persist_t structure tracks the zone's physical cap and phyiscal + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. * In the future we may need to expand these counters to 64-bit, but for now * we're using 32-bit to conserve memory, since this array is statically * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. */ uint_t zone_num_over_cap; -zone_pcap_t zone_pcap_data[MAX_ZONES]; +zone_persist_t zone_pdata[MAX_ZONES]; static kmutex_t zone_physcap_lock; /* @@ -1509,8 +1524,16 @@ static rctl_ops_t zone_cpu_burst_time_ops = { static rctl_qty_t zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) { + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + rctl_qty_t r = 0; + ASSERT(MUTEX_HELD(&p->p_lock)); - return (p->p_zone->zone_zfs_io_pri); + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; + mutex_exit(&zp->zpers_zfs_lock); + + return (r); } /*ARGSUSED*/ @@ -1519,6 +1542,7 @@ zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) { zone_t *zone = e->rcep_p.zone; + zone_persist_t *zp; ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(e->rcep_t == RCENTITY_ZONE); @@ -1529,7 +1553,11 @@ zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, /* * set priority to the new value. 
*/ - zone->zone_zfs_io_pri = nv; + zp = &zone_pdata[zone->zone_id]; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; + mutex_exit(&zp->zpers_zfs_lock); return (0); } @@ -1871,10 +1899,10 @@ static rctl_qty_t zone_phys_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; - zone_pcap_t *zp = &zone_pcap_data[p->p_zone->zone_id]; + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; ASSERT(MUTEX_HELD(&p->p_lock)); - q = ptob(zp->zpcap_pg_cnt); + q = ptob(zp->zpers_pg_cnt); return (q); } @@ -1906,7 +1934,7 @@ zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, pg_val = (uint_t)pages; } } - zone_pcap_data[zid].zpcap_pg_limit = pg_val; + zone_pdata[zid].zpers_pg_limit = pg_val; return (0); } @@ -2016,13 +2044,13 @@ zone_physmem_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_kstat_t *zk = ksp->ks_data; - zone_pcap_t *zp = &zone_pcap_data[zone->zone_id]; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; if (rw == KSTAT_WRITE) return (EACCES); - zk->zk_usage.value.ui64 = ptob(zp->zpcap_pg_cnt); - zk->zk_value.value.ui64 = ptob(zp->zpcap_pg_limit); + zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); return (0); } @@ -2170,26 +2198,42 @@ zone_zfs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_zfs_kstat_t *zzp = ksp->ks_data; - kstat_io_t *kiop = &zone->zone_zfs_rwstats; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; if (rw == KSTAT_WRITE) return (EACCES); - /* - * Extract the ZFS statistics from the kstat_io_t structure used by - * kstat_runq_enter() and related functions. Since the I/O throttle - * counters are updated directly by the ZFS layer, there's no need to - * copy those statistics here. - * - * Note that kstat_runq_enter() and the related functions use - * gethrtime_unscaled(), so scale the time here. - */ - zzp->zz_nread.value.ui64 = kiop->nread; - zzp->zz_reads.value.ui64 = kiop->reads; - zzp->zz_rtime.value.ui64 = kiop->rtime; - zzp->zz_rlentime.value.ui64 = kiop->rlentime; - zzp->zz_nwritten.value.ui64 = kiop->nwritten; - zzp->zz_writes.value.ui64 = kiop->writes; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp == NULL) { + zzp->zz_nread.value.ui64 = 0; + zzp->zz_reads.value.ui64 = 0; + zzp->zz_rtime.value.ui64 = 0; + zzp->zz_rlentime.value.ui64 = 0; + zzp->zz_nwritten.value.ui64 = 0; + zzp->zz_writes.value.ui64 = 0; + zzp->zz_waittime.value.ui64 = 0; + } else { + kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + + /* + * Extract the ZFS statistics from the kstat_io_t structure + * used by kstat_runq_enter() and related functions. Since the + * I/O throttle counters are updated directly by the ZFS layer, + * there's no need to copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + zzp->zz_waittime.value.ui64 = + zp->zpers_zfsp->zpers_zfs_rd_waittime; + } + mutex_exit(&zp->zpers_zfs_lock); scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); @@ -2240,23 +2284,23 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_mcap_kstat_t *zmp = ksp->ks_data; - zone_pcap_t *zp; + zone_persist_t *zp; if (rw == KSTAT_WRITE) return (EACCES); - zp = &zone_pcap_data[zone->zone_id]; + zp = &zone_pdata[zone->zone_id]; - zmp->zm_rss.value.ui64 = ptob(zp->zpcap_pg_cnt); - zmp->zm_phys_cap.value.ui64 = ptob(zp->zpcap_pg_limit); + zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); zmp->zm_swap.value.ui64 = zone->zone_max_swap; zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; - zmp->zm_nover.value.ui64 = zp->zpcap_nover; + zmp->zm_nover.value.ui64 = zp->zpers_nover; #ifndef DEBUG - zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_out); + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); #else - zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_fsdirty + - zp->zpcap_pg_fs + zp->zpcap_pg_anon + zp->zpcap_pg_anondirty); + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + + zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); #endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; @@ -2523,11 +2567,13 @@ zone_zsd_init(void) zone0.zone_swapresv_kstat = NULL; zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; - zone0.zone_zfs_io_pri = 1; zone0.zone_stime = 0; zone0.zone_utime = 0; zone0.zone_wtime = 0; + zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; + zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2839,7 +2885,7 @@ zone_free(zone_t *zone) cpucaps_zone_remove(zone); /* Clear physical memory capping data. */ - bzero(&zone_pcap_data[zone->zone_id], sizeof (zone_pcap_t)); + bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); ASSERT(zone->zone_cpucap == NULL); @@ -5090,7 +5136,10 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_lockedmem_kstat = NULL; zone->zone_swapresv_kstat = NULL; zone->zone_physmem_kstat = NULL; - zone->zone_zfs_io_pri = 1; + + zone_pdata[zoneid].zpers_zfsp = + kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); + zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -5101,8 +5150,8 @@ zone_create(const char *zone_name, const char *zone_root, * Ensure page count is 0 (in case zoneid has wrapped). * Initialize physical memory cap as unlimited. 
*/ - zone_pcap_data[zoneid].zpcap_pg_cnt = 0; - zone_pcap_data[zoneid].zpcap_pg_limit = UINT32_MAX; + zone_pdata[zoneid].zpers_pg_cnt = 0; + zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); @@ -5741,6 +5790,7 @@ zone_destroy(zoneid_t zoneid) zone_status_t status; clock_t wait_time; boolean_t log_refcounts; + zone_persist_t *zp; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -5774,6 +5824,12 @@ zone_destroy(zoneid_t zoneid) zone_hold(zone); mutex_exit(&zonehash_lock); + zp = &zone_pdata[zoneid]; + mutex_enter(&zp->zpers_zfs_lock); + kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); + zp->zpers_zfsp = NULL; + mutex_exit(&zp->zpers_zfs_lock); + /* * wait for zsched to exit */ @@ -8075,18 +8131,18 @@ done: static void zone_incr_capped(zoneid_t zid) { - zone_pcap_t *zp = &zone_pcap_data[zid]; + zone_persist_t *zp = &zone_pdata[zid]; /* See if over (unlimited is UINT32_MAX), or already marked that way. */ - if (zp->zpcap_pg_cnt <= zp->zpcap_pg_limit || zp->zpcap_over == 1) { + if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) { return; } mutex_enter(&zone_physcap_lock); /* Recheck setting under mutex */ - if (zp->zpcap_pg_cnt > zp->zpcap_pg_limit && zp->zpcap_over == 0) { - zp->zpcap_over = 1; - zp->zpcap_nover++; + if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) { + zp->zpers_over = 1; + zp->zpers_nover++; zone_num_over_cap++; DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); } @@ -8114,29 +8170,29 @@ zone_incr_capped(zoneid_t zid) static void zone_decr_capped(zoneid_t zid) { - zone_pcap_t *zp = &zone_pcap_data[zid]; + zone_persist_t *zp = &zone_pdata[zid]; uint32_t adjusted_limit; /* * See if under, or already marked that way. There is no need to - * check for an unlimited cap (zpcap_pg_limit == UINT32_MAX) - * since we'll never set zpcap_over in zone_incr_capped(). + * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) + * since we'll never set zpers_over in zone_incr_capped(). */ - if (zp->zpcap_over == 0 || zp->zpcap_pg_cnt >= zp->zpcap_pg_limit) { + if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { return; } - adjusted_limit = zp->zpcap_pg_limit - (zp->zpcap_pg_limit >> 7); + adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); /* Recheck, accounting for our hysteresis. */ - if (zp->zpcap_pg_cnt >= adjusted_limit) { + if (zp->zpers_pg_cnt >= adjusted_limit) { return; } mutex_enter(&zone_physcap_lock); /* Recheck under mutex. */ - if (zp->zpcap_pg_cnt < adjusted_limit && zp->zpcap_over == 1) { - zp->zpcap_over = 0; + if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { + zp->zpers_over = 0; ASSERT(zone_num_over_cap > 0); zone_num_over_cap--; DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); @@ -8154,7 +8210,7 @@ void zone_add_page(page_t *pp) { uint_t pcnt; - zone_pcap_t *zp; + zone_persist_t *zp; zoneid_t zid; /* Skip pages in segkmem, etc. (KV_KVP, ...) */ @@ -8179,9 +8235,9 @@ zone_add_page(page_t *pp) if (pp->p_share == 0) { /* First mapping to this page. 
*/ pp->p_zoneid = zid; - zp = &zone_pcap_data[zid]; - ASSERT(zp->zpcap_pg_cnt + pcnt < UINT32_MAX); - atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, pcnt); + zp = &zone_pdata[zid]; + ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); zone_incr_capped(zid); return; } @@ -8194,10 +8250,10 @@ zone_add_page(page_t *pp) zid = pp->p_zoneid; pp->p_zoneid = ALL_ZONES; ASSERT(zid >= 0 && zid <= MAX_ZONEID); - zp = &zone_pcap_data[zid]; + zp = &zone_pdata[zid]; - if (zp->zpcap_pg_cnt > 0) { - atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt); + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); } zone_decr_capped(zid); } @@ -8207,7 +8263,7 @@ void zone_rm_page(page_t *pp) { uint_t pcnt; - zone_pcap_t *zp; + zone_persist_t *zp; zoneid_t zid; /* Skip pages in segkmem, etc. (KV_KVP, ...) */ @@ -8227,9 +8283,9 @@ zone_rm_page(page_t *pp) } ASSERT(zid >= 0 && zid <= MAX_ZONEID); - zp = &zone_pcap_data[zid]; - if (zp->zpcap_pg_cnt > 0) { - atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt); + zp = &zone_pdata[zid]; + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); } zone_decr_capped(zid); pp->p_zoneid = ALL_ZONES; @@ -8238,29 +8294,29 @@ zone_rm_page(page_t *pp) void zone_pageout_stat(int zid, zone_pageout_op_t op) { - zone_pcap_t *zp; + zone_persist_t *zp; if (zid == ALL_ZONES) return; ASSERT(zid >= 0 && zid <= MAX_ZONEID); - zp = &zone_pcap_data[zid]; + zp = &zone_pdata[zid]; #ifndef DEBUG - atomic_add_64(&zp->zpcap_pg_out, 1); + atomic_add_64(&zp->zpers_pg_out, 1); #else switch (op) { case ZPO_DIRTY: - atomic_add_64(&zp->zpcap_pg_fsdirty, 1); + atomic_add_64(&zp->zpers_pg_fsdirty, 1); break; case ZPO_FS: - atomic_add_64(&zp->zpcap_pg_fs, 1); + atomic_add_64(&zp->zpers_pg_fs, 1); break; case ZPO_ANON: - atomic_add_64(&zp->zpcap_pg_anon, 1); + atomic_add_64(&zp->zpers_pg_anon, 1); break; case ZPO_ANONDIRTY: - atomic_add_64(&zp->zpcap_pg_anondirty, 1); + atomic_add_64(&zp->zpers_pg_anondirty, 1); break; default: cmn_err(CE_PANIC, "Invalid pageout operator %d", op); @@ -8275,23 +8331,23 @@ zone_pageout_stat(int zid, zone_pageout_op_t op) void zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) { - zone_pcap_t *zp; + zone_persist_t *zp; ASSERT(zid >= 0 && zid <= MAX_ZONEID); - zp = &zone_pcap_data[zid]; + zp = &zone_pdata[zid]; /* * If memory or swap limits are set on the zone, use those, otherwise * use the system values. physmem and freemem are also in pages. */ - if (zp->zpcap_pg_limit == UINT32_MAX) { + if (zp->zpers_pg_limit == UINT32_MAX) { *memcap = physmem; *free = freemem; } else { int64_t freemem; - *memcap = (pgcnt_t)zp->zpcap_pg_limit; - freemem = zp->zpcap_pg_limit - zp->zpcap_pg_cnt; + *memcap = (pgcnt_t)zp->zpers_pg_limit; + freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; if (freemem > 0) { *free = (pgcnt_t)freemem; } else { diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 1cca1e7555..87253134fd 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -22,7 +22,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. - * Copyright 2017, Joyent, Inc. + * Copyright 2018, Joyent, Inc. */ #ifndef _SYS_ZONE_H @@ -624,20 +624,6 @@ typedef struct zone { struct cpucap *zone_cpucap; /* CPU caps data */ /* - * Data and counters used for ZFS fair-share disk IO. 
- */ - rctl_qty_t zone_zfs_io_pri; /* ZFS IO priority */ - uint_t zone_zfs_queued[2]; /* sync I/O enqueued count */ - uint64_t zone_zfs_weight; /* used to prevent starvation */ - uint64_t zone_io_util; /* IO utilization metric */ - boolean_t zone_io_util_above_avg; /* IO util percent > avg. */ - uint16_t zone_io_delay; /* IO delay on logical r/w */ - kmutex_t zone_stg_io_lock; /* protects IO window data */ - sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */ - sys_zio_cntr_t zone_wr_ops; /* writes and */ - sys_zio_cntr_t zone_lwr_ops; /* logical writes. */ - - /* * kstats and counters for VFS ops and bytes. */ kmutex_t zone_vfs_lock; /* protects VFS statistics */ @@ -650,7 +636,6 @@ typedef struct zone { */ kmutex_t zone_zfs_lock; /* protects ZFS statistics */ kstat_t *zone_zfs_ksp; - kstat_io_t zone_zfs_rwstats; zone_zfs_kstat_t *zone_zfs_stats; /* @@ -738,25 +723,48 @@ typedef struct zone { kmutex_t zone_mount_lock; } zone_t; -/* zpcap_over is treated as a boolean but is 32 bits for alignment. */ -typedef struct zone_pcap { - uint32_t zpcap_over; /* currently over cap */ - uint32_t zpcap_pg_cnt; /* current RSS in pages */ - uint32_t zpcap_pg_limit; /* current RRS limit in pages */ - uint32_t zpcap_nover; /* # of times over phys. cap */ +/* + * Data and counters used for ZFS fair-share disk IO. + */ +typedef struct zone_zfs_io { + uint16_t zpers_zfs_io_pri; /* ZFS IO priority - 16k max */ + uint_t zpers_zfs_queued[2]; /* sync I/O enqueued count */ + sys_zio_cntr_t zpers_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zpers_wr_ops; /* writes, and */ + sys_zio_cntr_t zpers_lwr_ops; /* logical writes. */ + kstat_io_t zpers_zfs_rwstats; + uint64_t zpers_io_util; /* IO utilization metric */ + uint64_t zpers_zfs_rd_waittime; + uint8_t zpers_io_delay; /* IO delay on logical r/w */ + uint8_t zpers_zfs_weight; /* used to prevent starvation */ + uint8_t zpers_io_util_above_avg; /* IO util percent > avg. */ +} zone_zfs_io_t; + +/* + * "Persistent" zone data which can be accessed idependently of the zone_t. + */ +typedef struct zone_persist { + kmutex_t zpers_zfs_lock; /* Protects zpers_zfsp references */ + zone_zfs_io_t *zpers_zfsp; /* ZFS fair-share IO data */ + uint8_t zpers_over; /* currently over cap */ + uint32_t zpers_pg_cnt; /* current RSS in pages */ + uint32_t zpers_pg_limit; /* current RRS limit in pages */ + uint32_t zpers_nover; /* # of times over phys. cap */ #ifndef DEBUG - uint64_t zpcap_pg_out; /* # pages flushed */ + uint64_t zpers_pg_out; /* # pages flushed */ #else /* - * To conserve memory, detailed pageout stats are only kept for DEBUG + * To conserve memory, some detailed kstats are only kept for DEBUG * builds. 
*/ - uint64_t zpcap_pg_anon; /* # clean anon pages flushed */ - uint64_t zpcap_pg_anondirty; /* # dirty anon pages flushed */ - uint64_t zpcap_pg_fs; /* # clean fs pages flushed */ - uint64_t zpcap_pg_fsdirty; /* # dirty fs pages flushed */ + uint64_t zpers_zfs_rd_waittime; + + uint64_t zpers_pg_anon; /* # clean anon pages flushed */ + uint64_t zpers_pg_anondirty; /* # dirty anon pages flushed */ + uint64_t zpers_pg_fs; /* # clean fs pages flushed */ + uint64_t zpers_pg_fsdirty; /* # dirty fs pages flushed */ #endif -} zone_pcap_t; +} zone_persist_t; typedef enum zone_pageout_op { ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY @@ -994,7 +1002,7 @@ extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *); /* Interfaces for page scanning */ extern uint_t zone_num_over_cap; -extern zone_pcap_t zone_pcap_data[MAX_ZONES]; +extern zone_persist_t zone_pdata[MAX_ZONES]; extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 10017d27ef..01c2666e91 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,7 +25,7 @@ */ /* - * Copyright 2017, Joyent, Inc. + * Copyright 2018, Joyent, Inc. */ /* @@ -1691,7 +1691,7 @@ vmu_get_zone_rss(zoneid_t zid) } ASSERT(zid >= 0 && zid <= MAX_ZONEID); - pgcnt = zone_pcap_data[zid].zpcap_pg_cnt; + pgcnt = zone_pdata[zid].zpers_pg_cnt; zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt); zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap; @@ -1728,7 +1728,7 @@ vmu_calculate() int i; for (i = 0; i <= MAX_ZONEID; i++) { - if (zone_pcap_data[i].zpcap_pg_cnt > 0) { + if (zone_pdata[i].zpers_pg_cnt > 0) { vmu_get_zone_rss(i); } } |