Diffstat (limited to 'usr/src/uts/common/fs/zfs')
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c              |   23
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c             |    3
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c              |    4
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c           |    4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c      |   14
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c          |    4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c         |    7
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_disk.h    |   13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h    |    1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_raidz.h   |   49
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_zone.h     |   62
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h          |    6
-rw-r--r--  usr/src/uts/common/fs/zfs/txg.c              |    3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c        |   22
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c       |   10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c       |  131
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c        |   58
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c        |   31
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c         | 1179
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c              |   11
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_checksum.c     |    3
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c             |   62
22 files changed, 1615 insertions(+), 85 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index e039b9cac4..98aad58025 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/*
@@ -125,6 +126,7 @@
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -2017,6 +2019,16 @@ arc_reclaim_needed(void)
if (availrmem < swapfs_minfree + swapfs_reserve + extra)
return (1);
+ /*
+ * Check that we have enough availrmem that memory locking (e.g., via
+ * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
+ * stores the number of pages that cannot be locked; when availrmem
+ * drops below pages_pp_maximum, page locking mechanisms such as
+ * page_pp_lock() will fail.)
+ */
+ if (availrmem <= pages_pp_maximum)
+ return (1);
+
#if defined(__i386)
/*
* If we're on an i386 platform, it's possible that we'll exhaust the
@@ -2917,6 +2929,14 @@ top:
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
+ /*
+ * At this point, this read I/O has already missed in the ARC
+ * and will be going through to the disk. The I/O throttle
+ * should delay this I/O if this zone is using more than its I/O
+ * priority allows.
+ */
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_READ);
+
if (*arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -3484,9 +3504,6 @@ arc_init(void)
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
arc_meta_limit = zfs_arc_meta_limit;
- if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
- arc_c_min = arc_meta_limit / 2;
-
if (zfs_arc_grow_retry > 0)
arc_grow_retry = zfs_arc_grow_retry;
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 7a0abd22b5..16e42b951a 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -2703,7 +2703,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dt.dl.dr_copies);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
- ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, NULL, db->db.db_size, &zp,
dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 39234eba53..743f5c4656 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/dmu.h>
@@ -950,6 +951,7 @@ xuio_stat_wbuf_nocopy()
}
#ifdef _KERNEL
+
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
@@ -1562,7 +1564,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (wp & WP_NOFILL) {
ASSERT(!ismd && level == 0);
- checksum = ZIO_CHECKSUM_OFF;
+ checksum = ZIO_CHECKSUM_NOPARITY;
compress = ZIO_COMPRESS_OFF;
dedup = B_FALSE;
}
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index b4579e278c..2301942907 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -39,11 +39,11 @@
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>
+#include <sys/zfs_zone.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
-
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
@@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
if (len == 0)
return;
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE);
+
min_bs = SPA_MINBLOCKSHIFT;
max_bs = SPA_MAXBLOCKSHIFT;
min_ibs = DN_MIN_INDBLKSHIFT;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index 5ef7f54af1..aeeefd178e 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/dmu_objset.h>
@@ -4133,9 +4134,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
dsl_dataset_t *snap;
uint64_t used, comp, uncomp;
- err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
- if (err != 0)
- break;
+ if (snapobj == new->ds_object) {
+ snap = new;
+ } else {
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+ if (err != 0)
+ break;
+ }
if (snap->ds_phys->ds_prev_snap_txg ==
oldsnap->ds_phys->ds_creation_txg) {
@@ -4164,7 +4169,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
* was not a snapshot of/before new.
*/
snapobj = snap->ds_phys->ds_prev_snap_obj;
- dsl_dataset_rele(snap, FTAG);
+ if (snap != new)
+ dsl_dataset_rele(snap, FTAG);
if (snapobj == 0) {
err = EINVAL;
break;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 1cd49c8274..b6af7598e2 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -36,6 +36,7 @@
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
+#include <sys/zfs_zone.h>
#include "zfs_namecheck.h"
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
@@ -839,7 +840,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
} else {
if (err == EAGAIN) {
- txg_delay(dd->dd_pool, tx->tx_txg, 1);
+ txg_delay(dd->dd_pool, tx->tx_txg,
+ zfs_zone_txg_delay());
err = ERESTART;
}
dsl_pool_memory_pressure(dd->dd_pool);
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 418a04c7c2..316b37cebd 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -40,6 +40,7 @@
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
+#include <sys/zfs_zone.h>
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
@@ -529,11 +530,11 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
/*
* If this transaction group is over 7/8ths capacity, delay
- * the caller 1 clock tick. This will slow down the "fill"
- * rate until the sync process can catch up with us.
+ * the caller some number of clock ticks. This will slow down the
+ * "fill" rate until the sync process can catch up with us.
*/
if (reserved && reserved > (write_limit - (write_limit >> 3)))
- txg_delay(dp, tx->tx_txg, 1);
+ txg_delay(dp, tx->tx_txg, zfs_zone_txg_delay());
return (0);
}
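
To make the capacity check above concrete, here is a minimal standalone sketch (illustrative numbers only; in the real code write_limit and reserved come from live pool state, and the delay is zfs_zone_txg_delay() ticks rather than a printf):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t write_limit = 512ULL << 20;	/* assume a 512MB limit */
		uint64_t reserved = 460ULL << 20;	/* assume 460MB reserved */

		/* write_limit - (write_limit >> 3) == 7/8 of write_limit */
		if (reserved > write_limit - (write_limit >> 3))
			printf("over 7/8 full: delay the writer\n");
		else
			printf("under 7/8 full: no delay\n");
		return (0);
	}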
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
index b748571ea0..ffca0a7dcb 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
@@ -21,13 +21,12 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_VDEV_DISK_H
#define _SYS_VDEV_DISK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/buf.h>
@@ -40,14 +39,22 @@
extern "C" {
#endif
+#ifdef _KERNEL
typedef struct vdev_disk {
ddi_devid_t vd_devid;
char *vd_minor;
ldi_handle_t vd_lh;
} vdev_disk_t;
+#endif
+extern int vdev_disk_physio(vdev_t *, caddr_t, size_t, uint64_t, int);
+
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
#ifdef _KERNEL
-extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
#endif
#ifdef __cplusplus
}
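
The split gives two entry points: vdev_disk_physio() for callers holding a vdev_t, and vdev_disk_ldi_physio() for callers that already hold a raw LDI handle (as in vdev_disk_read_rootlabel() below). A minimal sketch of the vdev_t-based call, assuming kernel context and a 512-byte sector size for illustration:

	#include <sys/vdev_disk.h>	/* vdev_disk_physio() declared above */
	#include <sys/buf.h>		/* B_READ */

	/*
	 * Sketch only: a caller holding a vdev_t no longer reaches into
	 * vdev_tsd for the LDI handle; vdev_disk_physio() does that itself.
	 */
	static int
	read_one_sector(vdev_t *vd, caddr_t buf, uint64_t off)
	{
		return (vdev_disk_physio(vd, buf, 512, off, B_READ));
	}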
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 1df61a587d..c297ae165c 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -102,6 +102,7 @@ struct vdev_queue {
avl_tree_t vq_read_tree;
avl_tree_t vq_write_tree;
avl_tree_t vq_pending_tree;
+ zoneid_t vq_last_zone_id;
kmutex_t vq_lock;
};
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h
new file mode 100644
index 0000000000..496b718bd6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define _SYS_VDEV_RAIDZ_H
+
+#include <sys/vdev.h>
+#include <sys/semaphore.h>
+#include <sys/buf.h>
+#ifdef _KERNEL
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int vdev_raidz_physio(vdev_t *,
+ caddr_t, size_t, uint64_t, uint64_t, boolean_t);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_RAIDZ_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
new file mode 100644
index 0000000000..069ec004f3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ZONE_H
+#define _SYS_FS_ZFS_ZONE_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ ZFS_ZONE_IOP_READ = 0,
+ ZFS_ZONE_IOP_WRITE,
+ ZFS_ZONE_IOP_LOGICAL_WRITE,
+} zfs_zone_iop_type_t;
+
+extern void zfs_zone_io_throttle(zfs_zone_iop_type_t);
+
+extern void zfs_zone_zio_init(zio_t *);
+extern void zfs_zone_zio_start(zio_t *);
+extern void zfs_zone_zio_done(zio_t *);
+extern void zfs_zone_zio_dequeue(zio_t *);
+extern void zfs_zone_zio_enqueue(zio_t *);
+extern void zfs_zone_report_txg_sync(void *);
+extern int zfs_zone_txg_delay();
+#ifdef _KERNEL
+extern zio_t *zfs_zone_schedule(vdev_queue_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZONE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index f6cf3f5349..032b77715f 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _ZIO_H
@@ -79,6 +80,8 @@ enum zio_checksum {
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_SHA256_MAC,
+ ZIO_CHECKSUM_NOPARITY,
ZIO_CHECKSUM_FUNCTIONS
};
@@ -421,6 +424,9 @@ struct zio {
zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
+ zoneid_t io_zoneid; /* zone which originated this I/O */
+ hrtime_t io_start; /* time I/O entered zio pipeline */
+ hrtime_t io_dispatched; /* time I/O was dispatched to disk */
/* Taskq dispatching state */
taskq_ent_t io_tqent;
};
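
Together, io_start and io_dispatched let the throttle split a zio's latency into queue time and device service time. A hedged sketch of that accounting (the helper name is illustrative, not from the patch; the real done-side bookkeeping lives in zfs_zone_zio_done() in zfs_zone.c):

	#include <sys/zio.h>

	static void
	zio_latency_split(const zio_t *zp, hrtime_t *queue_ns,
	    hrtime_t *service_ns)
	{
		hrtime_t now = gethrtime();

		*queue_ns = zp->io_dispatched - zp->io_start;	/* in vdev queue */
		*service_ns = now - zp->io_dispatched;		/* on the device */
		/* both would be charged to the zone in zp->io_zoneid */
	}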
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 55b1f3884b..2269ef271e 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -30,6 +30,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
+#include <sys/zfs_zone.h>
/*
* Pool-wide transaction groups.
@@ -411,6 +412,8 @@ txg_sync_thread(dsl_pool_t *dp)
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
+ zfs_zone_report_txg_sync(dp);
+
start = ddi_get_lbolt();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index d7417736b4..f78580d0f1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -20,9 +20,11 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
+#include <sys/zfs_zone.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
@@ -325,8 +327,18 @@ vdev_disk_close(vdev_t *vd)
}
int
-vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
- uint64_t offset, int flags)
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+ size_t size, uint64_t offset, int flags)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ ASSERT(vd->vdev_ops == &vdev_disk_ops);
+ return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+ size_t size, uint64_t offset, int flags)
{
buf_t *bp;
int error = 0;
@@ -479,6 +491,8 @@ vdev_disk_io_start(zio_t *zio)
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;
+ zfs_zone_zio_start(zio);
+
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
@@ -490,6 +504,8 @@ vdev_disk_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
+ zfs_zone_zio_done(zio);
+
/*
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
* the device has been removed. If this is the case, then we trigger an
@@ -574,7 +590,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
/* read vdev label */
offset = vdev_label_offset(size, l, 0);
- if (vdev_disk_physio(vd_lh, (caddr_t)label,
+ if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
continue;
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 5a0d3ee970..4ea958a9f6 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -21,12 +21,14 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
+#include <sys/zfs_zone.h>
/*
* These tunables are for performance analysis.
@@ -120,6 +122,8 @@ vdev_queue_init(vdev_t *vd)
avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ vq->vq_last_zone_id = 0;
}
void
@@ -139,6 +143,7 @@ static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
avl_add(&vq->vq_deadline_tree, zio);
+ zfs_zone_zio_enqueue(zio);
avl_add(zio->io_vdev_tree, zio);
}
@@ -146,6 +151,7 @@ static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
avl_remove(&vq->vq_deadline_tree, zio);
+ zfs_zone_zio_dequeue(zio);
avl_remove(zio->io_vdev_tree, zio);
}
@@ -188,7 +194,11 @@ again:
avl_numnodes(&vq->vq_deadline_tree) == 0)
return (NULL);
+#ifdef _KERNEL
+ fio = lio = zfs_zone_schedule(vq);
+#else
fio = lio = avl_first(&vq->vq_deadline_tree);
+#endif
t = fio->io_vdev_tree;
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
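
In the kernel build, zfs_zone_schedule() replaces the plain avl_first() pick. A rough sketch of its shape, condensed from get_next_zio() and the zfs_zone_schedule_* tunables in zfs_zone.c below; this is not the literal implementation, which this diff truncates:

	#include <sys/vdev_impl.h>
	#include <sys/zfs_zone.h>

	zio_t *
	zfs_zone_schedule_sketch(vdev_queue_t *vq)
	{
		int qdepth = avl_numnodes(&vq->vq_deadline_tree);

		/* shallow queue: keep strict deadline order */
		if (!zfs_zone_schedule_enable ||
		    qdepth < zfs_zone_schedule_thresh)
			return (avl_first(&vq->vq_deadline_tree));

		/* deep queue: bump the highest-priority zone's first zio */
		return (get_next_zio(vq, qdepth));
	}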
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 4b0f5602c1..6094e01876 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -21,11 +21,15 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
@@ -152,6 +156,8 @@ typedef struct raidz_map {
VDEV_RAIDZ_64MUL_2((x), mask); \
}
+#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
+
/*
* Force reconstruction to use the general purpose method.
*/
@@ -431,12 +437,12 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
};
static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
- uint64_t nparity)
+vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
+ uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
- uint64_t b = zio->io_offset >> unit_shift;
- uint64_t s = zio->io_size >> unit_shift;
+ uint64_t b = offset >> unit_shift;
+ uint64_t s = size >> unit_shift;
uint64_t f = b % dcols;
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
@@ -506,7 +512,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
for (c = 0; c < rm->rm_firstdatacol; c++)
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_data = data;
for (c = c + 1; c < acols; c++)
rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
@@ -535,7 +541,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
- if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
devidx = rm->rm_col[0].rc_devidx;
o = rm->rm_col[0].rc_offset;
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -547,8 +553,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_skipstart = 1;
}
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
return (rm);
}
@@ -1491,6 +1495,104 @@ vdev_raidz_close(vdev_t *vd)
vdev_close(vd->vdev_child[c]);
}
+/*
+ * Handle a read or write request to a RAID-Z dump device.
+ *
+ * Unlike the normal RAID-Z codepath in vdev_raidz_io_start(), reads and writes
+ * to the dump zvol are laid out across a full 128KB block. As a result, an
+ * individual I/O may not span all columns in the RAID-Z map; moreover, a small
+ * I/O may only span a single column.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe.
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+ uint64_t offset, uint64_t origoffset, boolean_t doread)
+{
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c, err = 0;
+
+ uint64_t start, end, colstart, colend;
+ uint64_t coloffset, colsize, colskip;
+
+ int flags = doread ? B_READ : B_WRITE;
+
+#ifdef _KERNEL
+
+ /*
+ * Don't write past the end of the block
+ */
+ VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this I/O operation doesn't span the full block size, treat the
+ * on-disk format as if it consists only of complete 128KB blocks.
+ */
+ start = offset;
+ end = start + size;
+
+ /*
+ * Allocate a RAID-Z map for this block. Note that this block starts
+ * from the "original" offset, that is, the offset of the extent which
+ * contains the requisite offset of the data being read or written.
+ */
+ rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+ SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
+
+ coloffset = origoffset;
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+ c++, coloffset += rc->rc_size) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Find the start and end of this column in the RAID-Z matrix,
+ * keeping in mind that the stated size and offset of the
+ * operation may not fill the entire column for this vdev.
+ *
+ * If any portion of the data being read or written spans this
+ * column, issue the appropriate operation to the child vdev.
+ */
+ if (coloffset + rc->rc_size <= start)
+ continue;
+ if (coloffset >= end)
+ continue;
+
+ colstart = MAX(coloffset, start);
+ colend = MIN(end, coloffset + rc->rc_size);
+ colsize = colend - colstart;
+ colskip = colstart - coloffset;
+
+ VERIFY3U(colsize, <=, rc->rc_size);
+ VERIFY3U(colskip, <=, rc->rc_size);
+
+ /*
+ * Note that the child vdev will have a vdev label at the start
+ * of its range of offsets, hence the need for
+ * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
+ * example of why this calculation is needed.
+ */
+ if ((err = vdev_disk_physio(cvd,
+ ((char *)rc->rc_data) + colskip, colsize,
+ VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+ flags)) != 0)
+ break;
+ }
+
+ vdev_raidz_map_free(rm);
+#endif /* _KERNEL */
+
+ return (err);
+}
+
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
@@ -1526,9 +1628,13 @@ vdev_raidz_io_start(zio_t *zio)
raidz_col_t *rc;
int c, i;
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+ tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -1659,6 +1765,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
int c, ret = 0;
raidz_col_t *rc;
+ blkptr_t *bp = zio->io_bp;
+ uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+ if (checksum == ZIO_CHECKSUM_NOPARITY)
+ return (ret);
+
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
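
The colstart/colend/colskip arithmetic in vdev_raidz_physio() clips the I/O range to each column. A standalone worked example with made-up sizes, where one column covers [0, 32K) of the block and the I/O covers [8K, 72K):

	#include <stdio.h>
	#include <stdint.h>

	#define MAX(a, b) ((a) > (b) ? (a) : (b))
	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		uint64_t coloffset = 0, rc_size = 32768;	/* column range */
		uint64_t start = 8192, end = 73728;		/* I/O range */

		uint64_t colstart = MAX(coloffset, start);		/* 8192 */
		uint64_t colend = MIN(end, coloffset + rc_size);	/* 32768 */
		uint64_t colsize = colend - colstart;			/* 24576 */
		uint64_t colskip = colstart - coloffset;		/* 8192 */

		printf("issue %llu bytes, skipping %llu into the column\n",
		    (unsigned long long)colsize, (unsigned long long)colskip);
		return (0);
	}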
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 929fc06296..baffc223a3 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -23,6 +23,7 @@
* Portions Copyright 2011 Martin Matuska
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -1757,7 +1758,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
}
static int
-zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os,
+ boolean_t cachedpropsonly)
{
int error = 0;
nvlist_t *nv;
@@ -1775,7 +1777,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
* XXX reading with out owning
*/
if (!zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZVOL) {
+ dmu_objset_type(os) == DMU_OST_ZVOL &&
+ !cachedpropsonly) {
error = zvol_get_stats(os, nv);
if (error == EIO)
return (error);
@@ -1802,13 +1805,25 @@ static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
objset_t *os = NULL;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
- if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
return (error);
- error = zfs_ioc_objset_stats_impl(zc, os);
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ return (error);
+ error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly);
dmu_objset_rele(os, FTAG);
return (error);
@@ -2022,8 +2037,21 @@ static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
objset_t *os;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
+ return (error);
+
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
top:
if (zc->zc_cookie == 0)
(void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
@@ -2072,8 +2100,10 @@ top:
objset_t *ossnap;
error = dmu_objset_from_ds(ds, &ossnap);
- if (error == 0)
- error = zfs_ioc_objset_stats_impl(zc, ossnap);
+ if (error == 0) {
+ error = zfs_ioc_objset_stats_impl(zc,
+ ossnap, cachedpropsonly);
+ }
dsl_dataset_rele(ds, FTAG);
}
} else if (error == ENOENT) {
@@ -2789,6 +2819,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error;
ASSERT(zplprops != NULL);
@@ -2832,8 +2863,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
- if (norm == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
+ if (norm == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
@@ -2842,13 +2874,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
*/
if (norm)
u8 = 1;
- if (u8 == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
+ if (u8 == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
- if (sense == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
+ if (sense == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
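
A userland consumer would request the cached-properties-only behavior by packing a boolean into the nvlist passed through zc_nvlist_src. A hedged sketch using libnvpair (the consumer itself is not part of this diff):

	#include <libnvpair.h>

	static int
	pack_cachedpropsonly(char **bufp, size_t *lenp)
	{
		nvlist_t *nvl;

		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
			return (-1);
		if (nvlist_add_boolean_value(nvl, "cachedpropsonly",
		    B_TRUE) != 0 ||
		    nvlist_pack(nvl, bufp, lenp, NV_ENCODE_NATIVE, 0) != 0) {
			nvlist_free(nvl);
			return (-1);
		}
		nvlist_free(nvl);
		/* *bufp and *lenp then go into zc_nvlist_src{,_size} */
		return (0);
	}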
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 0c39274caf..9fae31fa6b 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -25,6 +25,10 @@
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -4145,6 +4149,8 @@ top:
&zp->z_pflags, 8);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
}
dmu_tx_commit(tx);
@@ -4655,27 +4661,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
return (0);
}
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file. Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics. If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed. The problem occurs when the msync() call is omitted,
- * which by far the most common case:
- *
- * open()
- * mmap()
- * <modify memory>
- * munmap()
- * close()
- * <time lapse>
- * putpage() via fsflush
- *
- * If we wait until fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future. In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
/* ARGSUSED */
static int
zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
@@ -4687,10 +4672,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
- if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
- vn_has_cached_data(vp))
- (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
new file mode 100644
index 0000000000..08f4f38e04
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -0,0 +1,1179 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
+
+#ifndef _KERNEL
+
+/*
+ * Stubs for when compiling for user-land.
+ */
+
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+}
+
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+}
+
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+}
+
+int
+zfs_zone_txg_delay()
+{
+ return (1);
+}
+
+#else
+
+/*
+ * The real code.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/zio.h>
+#include <sys/zone.h>
+#include <sys/avl.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+
+/*
+ * The zone throttle delays read and write operations from certain zones based
+ * on each zone's IO utilization. Once a cycle (defined by zfs_zone_cycle_time
+ * below), the delays for each zone are recalculated based on the utilization
+ * over the previous window.
+ */
+boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
+uint16_t zfs_zone_delay_step = 5; /* amount to change delay */
+uint16_t zfs_zone_delay_ceiling = 100; /* longest possible delay */
+
+hrtime_t zfs_zone_last_checked = 0;
+
+boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
+
+/*
+ * For certain workloads, one zone may be issuing primarily sequential I/O and
+ * another primarily random I/O. The sequential I/O will complete much more
+ * quickly than the random I/O, driving the average system latency for those
+ * operations way down. As a result, the random I/O may be throttled back, even
+ * though the sequential I/O should be throttled to allow the random I/O more
+ * access to the disk.
+ *
+ * This tunable limits the discrepancy between the read and write system
+ * latency. If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ */
+uint_t zfs_zone_rw_lat_limit = 10;
+
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level. This tunable controls the threshold
+ * at which the throttle will start delaying zones. The calculation should
+ * correspond closely with the %b column from iostat.
+ */
+uint_t zfs_zone_util_threshold = 80;
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds. Our system
+ * average cycle is one second or 1 million microseconds. Our zone counter
+ * update cycle is two seconds or 2 million microseconds. We use a longer
+ * duration for that cycle because some ops can see a little over two seconds of
+ * latency when they are being starved by another zone.
+ */
+uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
+uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
+
+uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
+
+typedef struct {
+ hrtime_t cycle_start;
+ int cycle_cnt;
+ hrtime_t cycle_lat;
+ hrtime_t sys_avg_lat;
+} sys_lat_cycle_t;
+
+typedef struct {
+ hrtime_t zi_now;
+ uint_t zi_avgrlat;
+ uint_t zi_avgwlat;
+ uint64_t zi_totpri;
+ uint64_t zi_totutil;
+ int zi_active;
+ uint_t zi_diskutil;
+} zoneio_stats_t;
+
+static sys_lat_cycle_t rd_lat;
+static sys_lat_cycle_t wr_lat;
+
+/*
+ * Some basic disk stats to determine disk utilization.
+ */
+kmutex_t zfs_disk_lock;
+uint_t zfs_disk_rcnt;
+hrtime_t zfs_disk_rtime = 0;
+hrtime_t zfs_disk_rlastupdate = 0;
+
+hrtime_t zfs_disk_last_rtime = 0;
+
+/*
+ * Data used to keep track of how often txg flush is running.
+ */
+extern int zfs_txg_timeout;
+static uint_t txg_last_check;
+static uint_t txg_cnt;
+static uint_t txg_flush_rate;
+
+boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
+/*
+ * Threshold for when zio scheduling should kick in.
+ *
+ * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the
+ * number of I/Os that can be pending on a device. If there are more than a
+ * few ops already queued up, beyond those already issued to the vdev, then
+ * use scheduling to get the next zio.
+ */
+int zfs_zone_schedule_thresh = 5;
+
+/*
+ * Tunables for delay throttling when TxG flush is occurring.
+ */
+int zfs_zone_txg_throttle_scale = 2;
+int zfs_zone_txg_delay_ticks = 2;
+
+typedef struct {
+ int zq_qdepth;
+ int zq_priority;
+ int zq_wt;
+ zoneid_t zq_zoneid;
+} zone_q_bump_t;
+
+/*
+ * This uses gethrtime() but returns a value in usecs.
+ */
+#define GET_USEC_TIME (gethrtime() / 1000)
+#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC))
+
+/*
+ * Keep track of the zone's ZFS IOPs.
+ *
+ * If the number of ops is >1 then we can just use that value. However,
+ * if the number of ops is <2 then we might have a zone which is trying to do
+ * IO but is not able to get any ops through the system. We don't want to lose
+ * track of this zone so we factor in its decayed count into the current count.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last update
+ * was made. If it was more than one cycle ago, then we need to decay the
+ * historical count by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return 0 once a new historical count has been computed and a new cycle
+ * begun; if we're still within an active cycle there is nothing to do, and
+ * the time delta into the current cycle is returned instead.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+	 * Check if it's time to recompute a new zone count.
+	 * If we're still collecting data for the current cycle, return the
+	 * elapsed delta.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_cycle_time)
+ return (delta);
+
+ /* A previous cycle is past, compute the new zone count. */
+
+ /*
+ * Figure out how many generations we have to decay the historical
+ * count, since multiple cycles may have elapsed since our last IO.
+ * We depend on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_cycle_time);
+
+	/* If more than 5 cycles have passed since the last IO, reset count. */
+ if (gen_cnt > 5) {
+ cp->zone_avg_cnt = 0;
+ } else {
+ /* Update the count. */
+ int i;
+
+ /*
+ * If the zone did more than 1 IO, just use its current count
+ * as the historical value, otherwise decay the historical
+ * count and factor that into the new historical count. We
+ * pick a threshold > 1 so that we don't lose track of IO due
+ * to int rounding.
+ */
+ if (cp->cycle_cnt > 1)
+ cp->zone_avg_cnt = cp->cycle_cnt;
+ else
+ cp->zone_avg_cnt = cp->cycle_cnt +
+ (cp->zone_avg_cnt / 2);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+
+ return (0);
+}
+
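+/*
+ * Worked example of the decay above, with made-up numbers: suppose
+ * zone_avg_cnt is 8, the zone then does a single IO (cycle_cnt == 1), and
+ * three cycles elapse before the next update (gen_cnt == 3). The new
+ * zone_avg_cnt is 1 + (8 / 2) == 5, which is then halved once for each of
+ * the two idle generations: 5 / 2 == 2, then 2 / 2 == 1 (integer division
+ * throughout).
+ */
+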
+/*
+ * Add IO op data to the zone.
+ */
+static void
+add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
+ zonep->zone_rd_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
+ zonep->zone_wr_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_LOGICAL_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
+ zonep->zone_lwr_ops.cycle_cnt++;
+ break;
+ }
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made.
+ * If it was more than one cycle ago, then we need to decay the average by the
+ * proper number of additional cycles in which no IO was performed.
+ *
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static int
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+	 * Check if it's time to recompute a new average.
+ * If we're still collecting data for the current cycle, return false.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_sys_avg_cycle)
+ return (0);
+
+ /* A previous cycle is past, compute a new system average. */
+
+ /*
+ * Figure out how many generations we have to decay, since multiple
+ * cycles may have elapsed since our last IO.
+ * We count on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
+
+	/* If more than 5 cycles have passed since the last IO, reset average. */
+ if (gen_cnt > 5) {
+ cp->sys_avg_lat = 0;
+ } else {
+ /* Update the average. */
+ int i;
+
+ cp->sys_avg_lat =
+ (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->sys_avg_lat = cp->sys_avg_lat / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+ cp->cycle_lat = 0;
+
+ return (1);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_new_sys_avg(unow, &rd_lat);
+ rd_lat.cycle_cnt++;
+ rd_lat.cycle_lat += lat;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_new_sys_avg(unow, &wr_lat);
+ wr_lat.cycle_cnt++;
+ wr_lat.cycle_lat += lat;
+ break;
+ }
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ uint_t cnt;
+
+ if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ cnt = cp->zone_avg_cnt;
+ } else {
+ /*
+ * If we're less than half way through the cycle then use
+ * the current count plus half the historical count, otherwise
+ * just use the current count.
+ */
+ if (delta < (zfs_zone_cycle_time / 2))
+ cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
+ else
+ cnt = cp->cycle_cnt;
+ }
+
+ return (cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static uint_t
+calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ if (compute_new_sys_avg(unow, cp)) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ return (cp->sys_avg_lat);
+ } else {
+ /*
+ * We're within a cycle; weight the current activity higher
+ * compared to the historical data and use that.
+ */
+ extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t,
+ uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__calc__wt__avg(
+ (uintptr_t)cp->sys_avg_lat,
+ (uintptr_t)cp->cycle_lat,
+ (uintptr_t)cp->cycle_cnt);
+
+ return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
+ (1 + (cp->cycle_cnt * 8)));
+ }
+}
+
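+/*
+ * Worked example of the weighting above, with made-up numbers: if the
+ * historical sys_avg_lat is 1000 usecs and the current cycle holds one op
+ * of 2000 usecs, the weighted average is (1000 + 2000 * 8) / (1 + 1 * 8)
+ * == 17000 / 9 == 1888 usecs, pulled strongly toward the recent sample.
+ */
+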
+/*
+ * Account for the current IOP on the zone and for the system as a whole.
+ * The latency parameter is in usecs.
+ */
+static void
+add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
+{
+ /* Add op to zone */
+ add_zone_iop(zonep, unow, op);
+
+ /* Track system latency */
+ if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
+ add_sys_iop(unow, op, lat);
+}
+
+/*
+ * Calculate the total number of read ops, write ops and logical write ops
+ * issued by the given zone, returning each count through the supplied
+ * pointers. Return a non-zero value if the zone issued any operations,
+ * otherwise return 0.
+ */
+static int
+get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
+ uint_t *lwops)
+{
+ *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
+ *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
+ *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
+
+ extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t,
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id,
+ (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops);
+
+ return (*rops | *wops | *lwops);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static void
+get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
+{
+ *rlat = calc_avg_lat(unow, &rd_lat);
+ *wlat = calc_avg_lat(unow, &wr_lat);
+
+ /*
+ * In an attempt to improve the accuracy of the throttling algorithm,
+ * assume that IO operations can't have zero latency. Instead, assume
+ * a reasonable lower bound for each operation type. If the actual
+ * observed latencies are non-zero, use those latency values instead.
+ */
+ if (*rlat == 0)
+ *rlat = 1000;
+ if (*wlat == 0)
+ *wlat = 1000;
+
+ extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t,
+ uintptr_t);
+
+ __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat),
+ (uintptr_t)*wlat);
+}
+
+/*
+ * Find disk utilization for each zone and average utilization for all active
+ * zones.
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint_t rops, wops, lwops;
+
+ if (zonep->zone_id == GLOBAL_ZONEID ||
+ get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
+ zonep->zone_io_util = 0;
+ return (0);
+ }
+
+ zonep->zone_io_util = (rops * sp->zi_avgrlat) +
+ (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
+ sp->zi_totutil += zonep->zone_io_util;
+
+ if (zonep->zone_io_util > 0) {
+ sp->zi_active++;
+ sp->zi_totpri += zonep->zone_zfs_io_pri;
+ }
+
+ /*
+ * sdt:::zfs-zone-utilization
+ *
+ * arg0: zone ID
+ * arg1: read operations observed during time window
+ * arg2: physical write operations observed during time window
+ * arg3: logical write ops observed during time window
+ * arg4: calculated utilization given read and write ops
+ * arg5: I/O priority assigned to this zone
+ */
+ extern void __dtrace_probe_zfs__zone__utilization(
+ uint_t, uint_t, uint_t, uint_t, uint_t, uint_t);
+
+ __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id),
+ (uint_t)rops, (uint_t)wops, (uint_t)lwops,
+ (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri);
+
+ return (0);
+}
+
+static void
+zfs_zone_delay_inc(zone_t *zonep)
+{
+ if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
+ zonep->zone_io_delay += zfs_zone_delay_step;
+}
+
+static void
+zfs_zone_delay_dec(zone_t *zonep)
+{
+ if (zonep->zone_io_delay > 0)
+ zonep->zone_io_delay -= zfs_zone_delay_step;
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zones delay. Otherwise, reduce its delay.
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint16_t delay = zonep->zone_io_delay;
+ uint_t fairutil = 0;
+
+ zonep->zone_io_util_above_avg = B_FALSE;
+
+ /*
+	 * Given the calculated total utilization for all zones, calculate the
+ * fair share of I/O for this zone.
+ */
+ if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
+ fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
+ sp->zi_totpri;
+ } else if (sp->zi_active > 0) {
+ fairutil = sp->zi_totutil / sp->zi_active;
+ }
+
+ /*
+	 * Adjust the zone's delay. If the overall delay becomes too high, avoid
+ * increasing beyond the ceiling value.
+ */
+ if (zonep->zone_io_util > fairutil &&
+ sp->zi_diskutil > zfs_zone_util_threshold) {
+ zonep->zone_io_util_above_avg = B_TRUE;
+
+ if (sp->zi_active > 1)
+ zfs_zone_delay_inc(zonep);
+ } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) {
+ zfs_zone_delay_dec(zonep);
+ }
+
+ /*
+ * sdt:::zfs-zone-throttle
+ *
+ * arg0: zone ID
+ * arg1: old delay for this zone
+ * arg2: new delay for this zone
+ * arg3: calculated fair I/O utilization
+ * arg4: actual I/O utilization
+ */
+ extern void __dtrace_probe_zfs__zone__throttle(
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__throttle(
+ (uintptr_t)zonep->zone_id, (uintptr_t)delay,
+ (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil,
+ (uintptr_t)zonep->zone_io_util);
+
+ return (0);
+}
+
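+/*
+ * Worked example of the fair-share math above: with two active zones of
+ * priority 100 and 50 and zi_totutil == 900, the shares are
+ * 900 * 100 / 150 == 600 and 900 * 50 / 150 == 300. A zone's delay is only
+ * increased when its own utilization exceeds its share while zi_diskutil
+ * is above zfs_zone_util_threshold.
+ */
+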
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */
+static void
+zfs_zone_wait_adjust(hrtime_t unow)
+{
+ zoneio_stats_t stats;
+
+ (void) bzero(&stats, sizeof (stats));
+
+ stats.zi_now = unow;
+ get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
+
+ if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
+ stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
+ else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
+ stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
+
+ if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
+ return;
+
+ /*
+ * Calculate disk utilization for the most recent period.
+ */
+ if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) {
+ stats.zi_diskutil = 0;
+ } else {
+ stats.zi_diskutil =
+ ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+ ((unow - zfs_zone_last_checked) * 1000);
+ }
+ zfs_disk_last_rtime = zfs_disk_rtime;
+
+ /*
+ * sdt:::zfs-zone-stats
+ *
+ * Statistics observed over the last period:
+ *
+ * arg0: average system read latency
+ * arg1: average system write latency
+ * arg2: number of active zones
+ * arg3: total I/O 'utilization' for all zones
+ * arg4: total I/O priority of all active zones
+ * arg5: calculated disk utilization
+ */
+ extern void __dtrace_probe_zfs__zone__stats(
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat),
+ (uintptr_t)(stats.zi_avgwlat),
+ (uintptr_t)(stats.zi_active),
+ (uintptr_t)(stats.zi_totutil),
+ (uintptr_t)(stats.zi_totpri),
+ (uintptr_t)(stats.zi_diskutil));
+
+ (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
+}
+
+/*
+ * Callback used to calculate a zone's IO schedule priority.
+ *
+ * We scan the zones looking for ones with ops in the queue. Out of those,
+ * we pick the one that calculates to the highest schedule priority.
+ */
+static int
+get_sched_pri_cb(zone_t *zonep, void *arg)
+{
+ int pri;
+ zone_q_bump_t *qbp = arg;
+
+ extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t);
+ __dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id),
+ (uintptr_t)(zonep->zone_zfs_queued));
+
+ if (zonep->zone_zfs_queued == 0) {
+ zonep->zone_zfs_weight = 0;
+ return (0);
+ }
+
+ /*
+ * On each pass, increment the zone's weight. We use this as input
+ * to the calculation to prevent starvation. The value is reset
+ * each time we issue an IO for this zone so zones which haven't
+ * done any IO over several iterations will see their weight max
+ * out.
+ */
+ if (zonep->zone_zfs_weight < 20)
+ zonep->zone_zfs_weight++;
+
+ /*
+	 * This zone's IO priority is inversely proportional to the number of
+	 * IOs the zone has enqueued, multiplied by the zone's configured
+	 * priority and its current weight.
+ * The queue depth has already been scaled by 10 to avoid problems
+ * with int rounding.
+ *
+ * This means that zones with fewer IOs in the queue will get
+	 * preference unless other zones' assigned priorities pull them
+ * ahead. The weight is factored in to help ensure that zones
+ * which haven't done IO in a while aren't getting starved.
+ */
+ pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) *
+ zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
+
+ /*
+ * If this zone has a higher priority than what we found so far,
+ * schedule it next.
+ */
+ if (pri > qbp->zq_priority) {
+ qbp->zq_zoneid = zonep->zone_id;
+ qbp->zq_priority = pri;
+ qbp->zq_wt = zonep->zone_zfs_weight;
+ }
+ return (0);
+}
+
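+/*
+ * Worked example of the formula above: with a queue depth of 12
+ * (zq_qdepth == 120 after scaling), a zone with 2 ops queued, priority 100
+ * and weight 3 computes (120 / 2) * 100 * 3 == 18000, while a zone with
+ * 10 ops queued, priority 100 and weight 1 computes
+ * (120 / 10) * 100 * 1 == 1200, so the less-queued zone is picked.
+ */
+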
+/*
+ * See if we need to bump a zone's zio to the head of the queue.
+ *
+ * For single-threaded synchronous workloads a zone cannot get more than
+ * 1 op into the queue at a time unless the zone is running multiple workloads
+ * in parallel. This can cause an imbalance in performance if there are zones
+ * with many parallel workloads (and ops in the queue) vs. other zones which
+ * are doing simple single-threaded workloads, such as interactive tasks in the
+ * shell. These zones can get backed up behind a deep queue and their IO
+ * performance will appear to be very poor as a result. This can make the
+ * zone feel sluggish for interactive workloads.
+ *
+ * The scheduling algorithm kicks in once we start to get a deeper queue.
+ * Once that occurs, we look at all of the zones to see which one calculates
+ * to the highest priority. We bump that zone's first zio to the head of the
+ * queue.
+ *
+ * We use a counter on the zone so that we can quickly find how many ops each
+ * zone has in the queue without having to search the entire queue itself.
+ * This scales better since the number of zones is expected to be on the
+ * order of 10-100 whereas the queue depth can be in the range of 50-2000.
+ * In addition, since the zio's in the queue only have the zoneid, we would
+ * have to look up the zone for each zio enqueued and that means the overhead
+ * for scanning the queue each time would be much higher.
+ *
+ * In all cases, we fall back to simply pulling the next op off the queue
+ * if something should go wrong.
+ */
+static zio_t *
+get_next_zio(vdev_queue_t *vq, int qdepth)
+{
+ zone_q_bump_t qbump;
+ zio_t *zp = NULL, *zphead;
+ int cnt = 0;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ /* To avoid problems with int rounding, scale the queue depth by 10 */
+ qbump.zq_qdepth = qdepth * 10;
+ qbump.zq_priority = 0;
+ qbump.zq_zoneid = 0;
+ (void) zone_walk(get_sched_pri_cb, &qbump);
+
+ zphead = avl_first(&vq->vq_deadline_tree);
+
+	/* If the scheduler picked a zone, find its first zio in the queue. */
+ if (qbump.zq_zoneid != 0) {
+ for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL;
+ zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) {
+ if (zp->io_zoneid == qbump.zq_zoneid)
+ break;
+ cnt++;
+ }
+ }
+
+ if (zp == NULL) {
+ zp = zphead;
+ } else if (zp != zphead) {
+ /*
+ * Only fire the probe if we actually picked a different zio
+ * than the one already at the head of the queue.
+ */
+ extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t,
+ uintptr_t, uintptr_t, uintptr_t);
+ __dtrace_probe_zfs__zone__sched__bump(
+ (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt),
+ (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt));
+ }
+
+ return (zp);
+}
+
+/*
+ * Add our zone ID to the zio so we can keep track of which zones are doing
+ * what, even when the current thread processing the zio is not associated
+ * with the zone (e.g. the kernel taskq which pushes out TX groups).
+ */
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+ zone_t *zonep = curzone;
+
+ zp->io_zoneid = zonep->zone_id;
+}
+
+/*
+ * Track IO operations per zone. Called from dmu_tx_count_write for write ops
+ * and dmu_read_uio for read ops. For each operation, increment that zone's
+ * counter based on the type of operation.
+ *
+ * There are three basic ways that we can see write ops:
+ * 1) An application does write syscalls. Those ops go into a TXG which
+ * we'll count here. Sometime later a kernel taskq thread (we'll see the
+ * vdev IO as zone 0) will perform some number of physical writes to commit
+ * the TXG to disk. Those writes are not associated with the zone which
+ * made the write syscalls and the number of operations is not correlated
+ * between the taskq and the zone.
+ * 2) An application opens a file with O_SYNC. Each write will result in
+ * an operation which we'll see here plus a low-level vdev write from
+ * that zone.
+ * 3) An application does write syscalls followed by an fsync(). We'll
+ * count the writes going into a TXG here. We'll also see some number
+ * (usually much smaller, maybe only 1) of low-level vdev writes from this
+ * zone when the fsync is performed, plus some other low-level vdev writes
+ * from the taskq in zone 0 (are these metadata writes?).
+ *
+ * In addition to the above, there are misc. system-level writes, such as
+ * writing out dirty pages to swap, or sync(2) calls, which will be handled
+ * by the global zone and which we count but don't generally worry about.
+ *
+ * Because of the above, we can see writes twice because this is called
+ * at a high level by a zone thread, but we also will count the phys. writes
+ * that are performed at a low level via zfs_zone_zio_start.
+ *
+ * Without this, it can look like a non-global zone never writes (case 1).
+ * Depending on when the TXG is flushed, the counts may be in the same sample
+ * bucket or in a different one.
+ *
+ * Tracking read operations is simpler due to their synchronous semantics. The
+ * zfs_read function -- called as a result of a read(2) syscall -- will always
+ * retrieve the data to be read through dmu_read_uio.
+ */
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+ zone_t *zonep = curzone;
+ hrtime_t unow;
+ uint16_t wait;
+
+ unow = GET_USEC_TIME;
+
+ /*
+ * Only bump the counters for logical operations here. The counters for
+ * tracking physical IO operations are handled in zfs_zone_zio_done.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
+ mutex_enter(&zonep->zone_stg_io_lock);
+ add_iop(zonep, unow, type, 0);
+ mutex_exit(&zonep->zone_stg_io_lock);
+ }
+
+ if (!zfs_zone_delay_enable)
+ return;
+
+ /*
+ * XXX There's a potential race here in that more than one thread may
+	 * update the zone delays concurrently. The worst outcome is corruption
+	 * of the data we use to track each zone's IO, so the algorithm may make
+	 * incorrect throttling decisions until the data is refreshed.
+ */
+ if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
+ zfs_zone_wait_adjust(unow);
+ zfs_zone_last_checked = unow;
+ }
+
+ if ((wait = zonep->zone_io_delay) > 0) {
+ /*
+ * If this is a write and we're doing above normal TxG
+ * flushing, then throttle for longer than normal.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
+ (txg_cnt > 1 || txg_flush_rate > 1))
+ wait *= zfs_zone_txg_throttle_scale;
+
+ /*
+ * sdt:::zfs-zone-wait
+ *
+ * arg0: zone ID
+ * arg1: type of IO operation
+ * arg2: time to delay (in us)
+ */
+ extern void __dtrace_probe_zfs__zone__wait(
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id),
+ (uintptr_t)type, (uintptr_t)wait);
+
+ drv_usecwait(wait);
+
+ if (zonep->zone_vfs_stats != NULL) {
+ atomic_inc_64(&zonep->zone_vfs_stats->
+ zv_delay_cnt.value.ui64);
+ atomic_add_64(&zonep->zone_vfs_stats->
+ zv_delay_time.value.ui64, wait);
+ }
+ }
+}
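
The sdt:::zfs-zone-wait probe documented above makes the imposed throttle
delays observable per zone. A minimal D sketch (arg0 is the zone ID, arg2
the delay in microseconds):

	# dtrace -n 'sdt:::zfs-zone-wait { @wait[arg0] = quantize(arg2); }'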
+
+/*
+ * XXX Ignore the pool pointer parameter for now.
+ *
+ * Keep track of whether the TxG flush rate is running above the expected
+ * rate.
+ * If so, this implies that we are filling TxG's at a high rate due to a heavy
+ * write workload. We use this as input into the zone throttle.
+ *
+ * This function is called every 5 seconds (zfs_txg_timeout) under a normal
+ * write load. In this case, the flush rate is going to be 1. When there
+ * is a heavy write load, TxG's fill up fast and the sync thread will write
+ * the TxG more frequently (perhaps once a second). In this case the rate
+ * will be > 1. The flush rate is a lagging indicator since it can be up
+ * to 5 seconds old. We use the txg_cnt to keep track of the rate in the
+ * current 5 second interval and txg_flush_rate to keep track of the previous
+ * 5 second interval. In that way we don't have a period (1 or more seconds)
+ * where the txg_cnt == 0 and we cut back on throttling even though the rate
+ * is still high.
+ */
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+ uint_t now;
+
+ txg_cnt++;
+ now = (uint_t)(gethrtime() / NANOSEC);
+ if ((now - txg_last_check) >= zfs_txg_timeout) {
+ txg_flush_rate = txg_cnt / 2;
+ txg_cnt = 0;
+ txg_last_check = now;
+ }
+}
+
+int
+zfs_zone_txg_delay()
+{
+ zone_t *zonep = curzone;
+ int delay = 1;
+
+ if (zonep->zone_io_util_above_avg)
+ delay = zfs_zone_txg_delay_ticks;
+
+ extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id),
+ (uintptr_t)delay);
+
+ return (delay);
+}
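
A minimal D sketch for the zfs-zone-txg-delay probe above (arg0 is the zone
ID, arg1 the delay in ticks):

	# dtrace -n 'sdt:::zfs-zone-txg-delay { @delay[arg0] = quantize(arg1); }'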
+
+/*
+ * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline
+ * and is issued. Record the dispatch time for the latency calculation in
+ * zfs_zone_zio_done.
+ */
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+ zone_t *zonep;
+
+ /*
+	 * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not
+	 * for an actual I/O operation, so ignore them for the purposes of
+	 * throttling and scheduling.
+ */
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_zfs_lock);
+ if (zp->io_type == ZIO_TYPE_READ)
+ kstat_runq_enter(&zonep->zone_zfs_rwstats);
+ zonep->zone_zfs_weight = 0;
+ mutex_exit(&zonep->zone_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zp->io_dispatched = gethrtime();
+
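+	/*
+	 * Accumulate cumulative "disk busy" time in the style of kstat
+	 * run-queue accounting: if at least one I/O was already outstanding,
+	 * add the time elapsed since the last update.
+	 */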
+ if (zfs_disk_rcnt++ != 0)
+ zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = zp->io_dispatched;
+ mutex_exit(&zfs_disk_lock);
+
+ zone_rele(zonep);
+}
+
+/*
+ * Called from vdev_queue_io_done when an IO completes.
+ * Increment our counter for zone ops and calculate the average IO latency
+ * for this zone.
+ */
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+ zone_t *zonep;
+ hrtime_t now, unow, udelta;
+
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ now = gethrtime();
+ unow = NANO_TO_MICRO(now);
+ udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
+
+ mutex_enter(&zonep->zone_zfs_lock);
+
+ /*
+ * To calculate the wsvc_t average, keep a cumulative sum of all the
+ * wait time before each I/O was dispatched. Since most writes are
+ * asynchronous, only track the wait time for read I/Os.
+ */
+ if (zp->io_type == ZIO_TYPE_READ) {
+ zonep->zone_zfs_rwstats.reads++;
+ zonep->zone_zfs_rwstats.nread += zp->io_size;
+
+ zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
+ zp->io_dispatched - zp->io_start;
+
+ kstat_runq_exit(&zonep->zone_zfs_rwstats);
+ } else {
+ zonep->zone_zfs_rwstats.writes++;
+ zonep->zone_zfs_rwstats.nwritten += zp->io_size;
+ }
+
+ mutex_exit(&zonep->zone_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zfs_disk_rcnt--;
+ zfs_disk_rtime += (now - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = now;
+ mutex_exit(&zfs_disk_lock);
+
+ if (zfs_zone_delay_enable) {
+ mutex_enter(&zonep->zone_stg_io_lock);
+ add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
+ mutex_exit(&zonep->zone_stg_io_lock);
+ }
+
+ zone_rele(zonep);
+
+ /*
+ * sdt:::zfs-zone-latency
+ *
+ * arg0: zone ID
+ * arg1: type of I/O operation
+ * arg2: I/O latency (in us)
+ */
+ extern void __dtrace_probe_zfs__zone__latency(
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid),
+ (uintptr_t)(zp->io_type), (uintptr_t)(udelta));
+}
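
The sdt:::zfs-zone-latency probe documented above yields per-zone,
per-operation-type latency. A minimal D sketch (arg1 is the zio type from
the zio_type enum, e.g. 1 for read, 2 for write):

	# dtrace -n 'sdt:::zfs-zone-latency { @lat[arg0, arg1] = quantize(arg2); }'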
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+ zone_t *zonep;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_stg_io_lock);
+ ASSERT(zonep->zone_zfs_queued > 0);
+ if (zonep->zone_zfs_queued == 0)
+ cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
+ else
+ zonep->zone_zfs_queued--;
+ mutex_exit(&zonep->zone_stg_io_lock);
+ zone_rele(zonep);
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+ zone_t *zonep;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_stg_io_lock);
+ zonep->zone_zfs_queued++;
+ mutex_exit(&zonep->zone_stg_io_lock);
+ zone_rele(zonep);
+}
+
+/*
+ * Called from vdev_queue_io_to_issue. This function is where zios are found
+ * at the head of the queue (by avl_first), then pulled off (by
+ * vdev_queue_io_remove) and issued. We do our scheduling here to find the
+ * next zio to issue.
+ *
+ * The vq->vq_lock mutex is held when we're executing this function so we
+ * can safely access the "last zone" variable on the queue.
+ */
+zio_t *
+zfs_zone_schedule(vdev_queue_t *vq)
+{
+ int cnt;
+ zoneid_t last_zone;
+ zio_t *zp;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ cnt = avl_numnodes(&vq->vq_deadline_tree);
+ last_zone = vq->vq_last_zone_id;
+
+ /*
+ * If there are only a few ops in the queue then just issue the head.
+ * If there are more than a few ops already queued up, then use
+ * scheduling to get the next zio.
+ */
+ if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
+ zp = avl_first(&vq->vq_deadline_tree);
+ else
+ zp = get_next_zio(vq, cnt);
+
+ vq->vq_last_zone_id = zp->io_zoneid;
+
+ /*
+	 * Probe with 3 args: the number of IOs in the queue, the zone that
+ * was last scheduled off this queue, and the zone that was associated
+ * with the next IO that is scheduled.
+ */
+ extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t,
+ uintptr_t);
+
+ __dtrace_probe_zfs__zone__sched((uintptr_t)(cnt),
+ (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid));
+
+ return (zp);
+}
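
A minimal D sketch for the zfs-zone-sched probe above, counting how often
the scheduler picks a different zone than the one scheduled last:

	# dtrace -n 'sdt:::zfs-zone-sched /arg1 != arg2/
	    { @handoff[arg1, arg2] = count(); }'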
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 64e9acbae1..89c88bc181 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -36,6 +36,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/zfs_zone.h>
/*
* ==========================================================================
@@ -501,6 +502,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
+ zio->io_start = gethrtime();
+
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
@@ -552,11 +555,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
+ zio->io_zoneid = pio->io_zoneid;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
zio_add_child(pio, zio);
+ } else {
+ zfs_zone_zio_init(zio);
}
return (zio);
@@ -894,6 +900,8 @@ zio_read_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
+ zio->io_start = gethrtime();
+
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
@@ -2279,6 +2287,9 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(zio->io_error == 0);
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ zio->io_start = gethrtime();
+
if (vd == NULL) {
if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
index c8fe20f2eb..c7dd90c45d 100644
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -77,6 +78,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256_mac"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"},
};
enum zio_checksum
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index df9a16bccb..4dc63888fd 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -76,9 +76,11 @@
#include <sys/zfs_rlock.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/dumphdr.h>
#include <sys/zil_impl.h>
+#include <sys/sdt.h>
#include "zfs_namecheck.h"
@@ -1059,27 +1061,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
}
static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
- boolean_t doread, boolean_t isdump)
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+ uint64_t size, boolean_t doread, boolean_t isdump)
{
vdev_disk_t *dvd;
int c;
int numerrors = 0;
- for (c = 0; c < vd->vdev_children; c++) {
- ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
- vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- int err = zvol_dumpio_vdev(vd->vdev_child[c],
- addr, offset, size, doread, isdump);
- if (err != 0) {
- numerrors++;
- } else if (doread) {
- break;
+ if (vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ int err = zvol_dumpio_vdev(vd->vdev_child[c],
+ addr, offset, origoffset, size, doread, isdump);
+ if (err != 0) {
+ numerrors++;
+ } else if (doread) {
+ break;
+ }
}
}
- if (!vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
return (numerrors < vd->vdev_children ? 0 : EIO);
if (doread && !vdev_readable(vd))
@@ -1087,19 +1090,27 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
else if (!doread && !vdev_writeable(vd))
return (EIO);
- dvd = vd->vdev_tsd;
- ASSERT3P(dvd, !=, NULL);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ return (vdev_raidz_physio(vd,
+ addr, size, offset, origoffset, doread));
+ }
+
offset += VDEV_LABEL_START_SIZE;
if (ddi_in_panic() || isdump) {
ASSERT(!doread);
if (doread)
return (EIO);
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
lbtodb(size)));
} else {
- return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
- doread ? B_READ : B_WRITE));
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+
+ return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+ offset, doread ? B_READ : B_WRITE));
}
}
@@ -1131,7 +1142,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
offset += DVA_GET_OFFSET(&ze->ze_dva);
- error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+ error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+ size, doread, isdump);
if (!ddi_in_panic())
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -1322,6 +1334,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
+
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1340,6 +1354,10 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
}
}
zfs_range_unlock(rl);
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
+ error);
+
return (error);
}
@@ -1369,6 +1387,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
+
sync = !(zv->zv_flags & ZVOL_WCE) ||
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
@@ -1399,6 +1419,10 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
zfs_range_unlock(rl);
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
+ error);
+
return (error);
}
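
The zvol-uio-start/zvol-uio-done probe pairs added above can be used to
measure per-request zvol latency. A minimal D sketch (assuming start and
done fire on the same thread, as they do in these synchronous read/write
paths; arg2 is the read/write flag passed to the probes):

	sdt:::zvol-uio-start
	{
		self->ts = timestamp;
	}

	sdt:::zvol-uio-done
	/self->ts/
	{
		@lat[arg2 ? "write" : "read"] = quantize(timestamp - self->ts);
		self->ts = 0;
	}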
@@ -1852,7 +1876,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
ZIO_COMPRESS_OFF) == 0);
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- ZIO_CHECKSUM_OFF) == 0);
+ ZIO_CHECKSUM_NOPARITY) == 0);
if (version >= SPA_VERSION_DEDUP) {
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_DEDUP),