author     eschrock <none@none>  2006-03-03 20:08:16 -0800
committer  eschrock <none@none>  2006-03-03 20:08:16 -0800
commit     ea8dc4b6d2251b437950c0056bc626b311c73c27 (patch)
tree       69cc1808568f2ef8fd1e21c61e186ba452ea64da /usr/src
parent     5c18afbc96a46bc3a9e6f3667512daa374d6cd79 (diff)
download   illumos-gate-ea8dc4b6d2251b437950c0056bc626b311c73c27.tar.gz
PSARC 2006/077 zpool clear
PSARC 2006/139 FMA for ZFS
6284889 arc should replace the znode cache
6333006 DMU & DSL should not panic upon I/O error
6333092 concurrent reads to a file not scaling with number of readers
6338081 ZFS/FMA phase 1
6338386 need persistent error log
6341326 i/o error causes arc buf hash table corruption
6341639 zfs backup/restore should compute/verify checksum of backup stream
6348002 out of space due to changing properties
6354724 inaccurate error message from zfs restore
6354872 dmu_sync() blows predictive accounting
6355416 zpool scrubbing consumes all memory, system hung
6363995 df should only load libzfs when it encounters a ZFS filesystem
6366320 zfs backup/restore doesn't like signals
6368892 mount -m support needed for legacy mounts
6368902 boot archive fstat support needed for ZFS Mountroot
6369424 BFU complains when bfu'ing a ZFS root filesystem
6374062 mountroot support needed for ZFS
6376356 dirtying dbuf obj=43 lvl=0 blkid=0 but not tx_held
6378391 unused members of dmu_objset_stats_t
6378392 clean up zfs_cmd_t structure
6378685 buf_init should allocate its hash table more carefully
6378976 ziltest should be a first class citizen
6381086 zdb segfaults if there is a spa deferred-free bplist
6381203 deadlock due to i/o while assigning (tc_lock held)
6381209 freed space is not immediately available
6381344 'zpool clear'
6381345 FAULTED devices should really be UNAVAIL
6381346 import should mark devices as persistently unavailable
6383272 recursive mutex_enter() during log replay with zfs root
6386326 origin property is not displayed
6386354 libzfs does too much in its _init section, calls exit(1)
6386624 zpool should not complain about non-existent devices from libdiskmgt
6386910 spa needs to be i/o error hardened
6387735 need a mechanism to inject faults into ZFS
6387736 internal ZFS utilities should be placed in an ON-private package
6389928 libzfs should ship a lint library
6390609 malformed vdev config panics on zpool_create()
6390677 version number checking makes upgrades challenging
6390713 ztest hangs in zil_suspend()
6391873 metadata compression should be turned back on
6392113 ztest sometimes reports leaked blocks because ZIL isn't resilvered
6393004 minor memory leak in unique_insert()
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/Makefile.lint | 1
-rw-r--r--  usr/src/cmd/Makefile | 6
-rw-r--r--  usr/src/cmd/fm/dicts/ZFS.dict | 11
-rw-r--r--  usr/src/cmd/fm/dicts/ZFS.po | 39
-rw-r--r--  usr/src/cmd/fm/modules/common/Makefile | 2
-rw-r--r--  usr/src/cmd/fm/modules/common/zfs-diagnosis/Makefile | 33
-rw-r--r--  usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs-diagnosis.conf | 32
-rw-r--r--  usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c | 423
-rw-r--r--  usr/src/cmd/fm/schemes/Makefile | 8
-rw-r--r--  usr/src/cmd/fm/schemes/zfs/Makefile | 32
-rw-r--r--  usr/src/cmd/fm/schemes/zfs/amd64/Makefile | 33
-rw-r--r--  usr/src/cmd/fm/schemes/zfs/i386/Makefile | 32
-rw-r--r--  usr/src/cmd/fm/schemes/zfs/scheme.c | 191
-rw-r--r--  usr/src/cmd/fm/schemes/zfs/sparc/Makefile | 32
-rw-r--r--  usr/src/cmd/fm/schemes/zfs/sparcv9/Makefile | 33
-rw-r--r--  usr/src/cmd/fs.d/df.c | 57
-rw-r--r--  usr/src/cmd/mdb/common/modules/zfs/zfs.c | 9
-rw-r--r--  usr/src/cmd/truss/codes.c | 17
-rw-r--r--  usr/src/cmd/zdb/zdb.c | 128
-rw-r--r--  usr/src/cmd/zdb/zdb_il.c | 19
-rw-r--r--  usr/src/cmd/zfs/zfs_main.c | 15
-rw-r--r--  usr/src/cmd/zinject/Makefile | 54
-rw-r--r--  usr/src/cmd/zinject/Makefile.com | 55
-rw-r--r--  usr/src/cmd/zinject/amd64/Makefile | 31
-rw-r--r--  usr/src/cmd/zinject/i386/Makefile | 30
-rw-r--r--  usr/src/cmd/zinject/sparcv9/Makefile | 31
-rw-r--r--  usr/src/cmd/zinject/translate.c | 458
-rw-r--r--  usr/src/cmd/zinject/zinject.c | 739
-rw-r--r--  usr/src/cmd/zinject/zinject.h | 64
-rw-r--r--  usr/src/cmd/zoneadmd/vplat.c | 14
-rw-r--r--  usr/src/cmd/zpool/zpool_main.c | 189
-rw-r--r--  usr/src/cmd/zpool/zpool_vdev.c | 12
-rw-r--r--  usr/src/cmd/ztest/Makefile | 13
-rw-r--r--  usr/src/cmd/ztest/ztest.c | 352
-rw-r--r--  usr/src/common/fs/hsfs.c | 51
-rw-r--r--  usr/src/common/fs/ufsops.c | 48
-rw-r--r--  usr/src/lib/libzfs/common/libzfs.h | 15
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_config.c | 40
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_dataset.c | 163
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_graph.c | 17
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_impl.h | 25
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_import.c | 23
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_mount.c | 31
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_pool.c | 420
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_status.c | 80
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_util.c | 61
-rw-r--r--  usr/src/lib/libzfs/spec/libzfs.spec | 26
-rw-r--r--  usr/src/lib/libzpool/common/kernel.c | 59
-rw-r--r--  usr/src/lib/libzpool/common/sys/zfs_context.h | 24
-rw-r--r--  usr/src/lib/libzpool/common/util.c | 14
-rw-r--r--  usr/src/pkgdefs/Makefile | 1
-rw-r--r--  usr/src/pkgdefs/SUNWfmd/prototype_com | 3
-rw-r--r--  usr/src/pkgdefs/SUNWfmd/prototype_i386 | 1
-rw-r--r--  usr/src/pkgdefs/SUNWfmd/prototype_sparc | 1
-rw-r--r--  usr/src/pkgdefs/SUNWhea/prototype_com | 4
-rw-r--r--  usr/src/pkgdefs/SUNWonzfs/Makefile | 37
-rw-r--r--  usr/src/pkgdefs/SUNWonzfs/pkginfo.tmpl | 45
-rw-r--r--  usr/src/pkgdefs/SUNWonzfs/prototype_com | 36
-rw-r--r--  usr/src/pkgdefs/SUNWonzfs/prototype_i386 | 35
-rw-r--r--  usr/src/pkgdefs/SUNWonzfs/prototype_sparc | 31
-rw-r--r--  usr/src/pkgdefs/SUNWzfsr/prototype_com | 2
-rw-r--r--  usr/src/pkgdefs/SUNWzfsr/prototype_i386 | 1
-rw-r--r--  usr/src/pkgdefs/SUNWzfsr/prototype_sparc | 1
-rw-r--r--  usr/src/pkgdefs/SUNWzfsu/prototype_com | 4
-rw-r--r--  usr/src/pkgdefs/SUNWzfsu/prototype_i386 | 3
-rw-r--r--  usr/src/pkgdefs/SUNWzfsu/prototype_sparc | 3
-rw-r--r--  usr/src/pkgdefs/etc/exception_list_i386 | 11
-rw-r--r--  usr/src/pkgdefs/etc/exception_list_sparc | 22
-rw-r--r--  usr/src/uts/common/Makefile.files | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c | 1130
-rw-r--r--  usr/src/uts/common/fs/zfs/bplist.c | 106
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c | 897
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c | 619
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_object.c | 47
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c | 154
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_traverse.c | 73
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c | 251
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode.c | 122
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode_sync.c | 91
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c | 308
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c | 164
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c | 69
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_prop.c | 37
-rw-r--r--  usr/src/uts/common/fs/zfs/fletcher.c | 53
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c | 815
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_config.c | 58
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_errlog.c | 436
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c | 116
-rw-r--r--  usr/src/uts/common/fs/zfs/space_map.c | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h | 24
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/bplist.h | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dbuf.h | 37
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h | 75
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_objset.h | 19
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_traverse.h | 18
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_tx.h | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dnode.h | 34
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dataset.h | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dir.h | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_pool.h | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/refcount.h | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h | 48
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa_impl.h | 40
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev.h | 18
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zap_impl.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_acl.h | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 31
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_znode.h | 17
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h | 94
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_checksum.h | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_impl.h | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/uberblock.c | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c | 392
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_cache.c | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_file.c | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_label.c | 37
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_mirror.c | 17
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c | 45
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c | 54
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_root.c | 14
-rw-r--r--  usr/src/uts/common/fs/zfs/zap.c | 324
-rw-r--r--  usr/src/uts/common/fs/zfs/zap_micro.c | 19
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_acl.c | 52
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_dir.c | 25
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_fm.c | 316
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c | 173
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c | 669
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c | 140
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_znode.c | 366
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c | 39
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c | 233
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_checksum.c | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_inject.c | 315
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c | 15
-rw-r--r--  usr/src/uts/common/krtld/kobj.c | 63
-rw-r--r--  usr/src/uts/common/krtld/kobj_stubs.c | 12
-rw-r--r--  usr/src/uts/common/krtld/mapfile | 12
-rw-r--r--  usr/src/uts/common/os/fm.c | 36
-rw-r--r--  usr/src/uts/common/os/modsysfile.c | 16
-rw-r--r--  usr/src/uts/common/os/policy.c | 39
-rw-r--r--  usr/src/uts/common/sys/Makefile | 8
-rw-r--r--  usr/src/uts/common/sys/Makefile.syshdrs | 11
-rw-r--r--  usr/src/uts/common/sys/fm/fs/zfs.h | 75
-rw-r--r--  usr/src/uts/common/sys/fm/protocol.h | 13
-rw-r--r--  usr/src/uts/common/sys/fs/zfs.h | 20
-rw-r--r--  usr/src/uts/common/sys/kobj.h | 7
-rw-r--r--  usr/src/uts/common/sys/policy.h | 6
-rw-r--r--  usr/src/uts/common/sys/sysconf.h | 8
-rw-r--r--  usr/src/uts/intel/sys/bootconf.h | 8
-rw-r--r--  usr/src/uts/intel/sys/bootvfs.h | 8
-rw-r--r--  usr/src/uts/sparc/krtld/mapfile | 8
-rw-r--r--  usr/src/uts/sun/sys/bootconf.h | 8
155 files changed, 10346 insertions, 3982 deletions
diff --git a/usr/src/Makefile.lint b/usr/src/Makefile.lint
index 96420a1f58..7bfc2e94e5 100644
--- a/usr/src/Makefile.lint
+++ b/usr/src/Makefile.lint
@@ -273,6 +273,7 @@ COMMON_SUBDIRS = \
cmd/zdb \
cmd/zdump \
cmd/zfs \
+ cmd/zinject \
cmd/zlogin \
cmd/zoneadm \
cmd/zoneadmd \
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index c8b16c991d..e8ec1adb3b 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -408,6 +407,7 @@ COMMON_SUBDIRS= \
zdump \
zfs \
zic \
+ zinject \
zlogin \
zoneadm \
zoneadmd \
diff --git a/usr/src/cmd/fm/dicts/ZFS.dict b/usr/src/cmd/fm/dicts/ZFS.dict
index 0166183535..89b10434f5 100644
--- a/usr/src/cmd/fm/dicts/ZFS.dict
+++ b/usr/src/cmd/fm/dicts/ZFS.dict
@@ -1,13 +1,12 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -27,7 +26,7 @@
# DO NOT EDIT -- this file is generated by the Event Registry.
#
-FMDICT: name=ZFS version=1 maxkey=1
+FMDICT: name=ZFS version=1 maxkey=1 dictid=0x5a46
ereport.fs.zfs.pool.corrupt_cache=1
ereport.fs.zfs.device.missing_r=2
@@ -39,3 +38,5 @@ ereport.fs.zfs.pool.corrupt_pool=7
ereport.fs.zfs.object.corrupt_data=8
ereport.fs.zfs.device.failing=9
ereport.fs.zfs.device.version_mismatch=10
+fault.fs.zfs.pool=11
+fault.fs.zfs.device=12
diff --git a/usr/src/cmd/fm/dicts/ZFS.po b/usr/src/cmd/fm/dicts/ZFS.po
index ea5a9c6195..a1d26715be 100644
--- a/usr/src/cmd/fm/dicts/ZFS.po
+++ b/usr/src/cmd/fm/dicts/ZFS.po
@@ -1,13 +1,12 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -186,3 +185,35 @@ msgid "ZFS-8000-A5.impact"
msgstr "The pool is unavailable"
msgid "ZFS-8000-A5.action"
msgstr "\nIf this error is seen during 'zpool import', see the section below. Otherwise,\nrun 'zpool status -x' to determine which pool is faulted:\n\n\n# zpool status -x\n pool: test\n state: FAULTED\nstatus: The ZFS version for the pool is incompatible with the software running\n on this system.\naction: Destroy and re-create the pool.\n scrub: none requested\nconfig:\n\n NAME STATE READ WRITE CKSUM\n test FAULTED 0 0 0 incompatible version\n mirror ONLINE 0 0 0\n c0t0d0 ONLINE 0 0 0\n c0t0d1 ONLINE 0 0 0\n\n\nThe pool cannot be used on this system. Either move the disks to the system\nwhere they were originally created, or destroy the pool and re-create it from\nbackup.\n\n\nIf this error is seen during import, the pool cannot be imported on the current\nsystem. The disks must be attached to the system which originally created the\npool, and imported there.\n "
+#
+# code: ZFS-8000-CS
+# keys: fault.fs.zfs.pool
+#
+msgid "ZFS-8000-CS.type"
+msgstr "Fault"
+msgid "ZFS-8000-CS.severity"
+msgstr "Major"
+msgid "ZFS-8000-CS.description"
+msgstr "A ZFS pool failed to open. Refer to %s for more information."
+msgid "ZFS-8000-CS.response"
+msgstr "No automated response will occur."
+msgid "ZFS-8000-CS.impact"
+msgstr "The pool data is unavailable"
+msgid "ZFS-8000-CS.action"
+msgstr "Run 'zpool status -x' and either attach the missing device or\n restore from backup."
+#
+# code: ZFS-8000-D3
+# keys: fault.fs.zfs.device
+#
+msgid "ZFS-8000-D3.type"
+msgstr "Fault"
+msgid "ZFS-8000-D3.severity"
+msgstr "Major"
+msgid "ZFS-8000-D3.description"
+msgstr "A ZFS device failed. Refer to %s for more information."
+msgid "ZFS-8000-D3.response"
+msgstr "No automated response will occur."
+msgid "ZFS-8000-D3.impact"
+msgstr "Fault tolerance of the pool may be compromised."
+msgid "ZFS-8000-D3.action"
+msgstr "Run 'zpool status -x' and replace the bad device."
diff --git a/usr/src/cmd/fm/modules/common/Makefile b/usr/src/cmd/fm/modules/common/Makefile
index 75dd15ef9e..868a66df08 100644
--- a/usr/src/cmd/fm/modules/common/Makefile
+++ b/usr/src/cmd/fm/modules/common/Makefile
@@ -27,6 +27,6 @@
#
SUBDIRS = cpumem-retire eversholt io-retire ip-transport snmp-trapgen \
- syslog-msgs
+ syslog-msgs zfs-diagnosis
include ../../Makefile.subdirs
diff --git a/usr/src/cmd/fm/modules/common/zfs-diagnosis/Makefile b/usr/src/cmd/fm/modules/common/zfs-diagnosis/Makefile
new file mode 100644
index 0000000000..03a7a0dda4
--- /dev/null
+++ b/usr/src/cmd/fm/modules/common/zfs-diagnosis/Makefile
@@ -0,0 +1,33 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+MODULE = zfs-diagnosis
+CLASS = common
+SRCS = zfs_de.c
+
+include ../../Makefile.plugin
+
+LDLIBS += -luutil
diff --git a/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs-diagnosis.conf b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs-diagnosis.conf
new file mode 100644
index 0000000000..cd493d69bc
--- /dev/null
+++ b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs-diagnosis.conf
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# fmd configuration file for the zfs.so diagnosis engine.
+#
+subscribe ereport.fs.zfs.*
+subscribe resource.fs.zfs.*
+subscribe fault.fs.zfs.*
+dictionary ZFS
diff --git a/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c
new file mode 100644
index 0000000000..02c1a31e2c
--- /dev/null
+++ b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c
@@ -0,0 +1,423 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <assert.h>
+#include <stddef.h>
+#include <strings.h>
+#include <libuutil.h>
+#include <fm/fmd_api.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+
+typedef struct zfs_case_data {
+ uint64_t zc_version;
+ uint64_t zc_ena;
+ uint64_t zc_pool_guid;
+ uint64_t zc_vdev_guid;
+ int zc_has_timer;
+ int zc_pool_state;
+} zfs_case_data_t;
+
+typedef struct zfs_case {
+ int zc_version;
+ zfs_case_data_t zc_data;
+ fmd_case_t *zc_case;
+ uu_list_node_t zc_node;
+ id_t zc_timer;
+} zfs_case_t;
+
+#define CASE_DATA "data"
+#define CASE_DATA_VERSION 1
+
+static int zfs_case_timeout;
+
+uu_list_pool_t *zfs_case_pool;
+uu_list_t *zfs_cases;
+
+static void
+zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+ fmd_buf_write(hdl, zcp->zc_case, CASE_DATA, &zcp->zc_data,
+ sizeof (zcp->zc_data));
+}
+
+static zfs_case_t *
+zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ zfs_case_t *zcp;
+
+ zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
+ zcp->zc_case = cp;
+
+ fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
+ sizeof (zcp->zc_data));
+
+ if (zcp->zc_data.zc_version != CASE_DATA_VERSION) {
+ fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+ return (NULL);
+ }
+
+ if (zcp->zc_data.zc_has_timer)
+ zcp->zc_timer = fmd_timer_install(hdl, zcp,
+ NULL, zfs_case_timeout);
+
+ (void) uu_list_insert_before(zfs_cases, NULL, zcp);
+
+ fmd_case_setspecific(hdl, cp, zcp);
+
+ return (zcp);
+}
+
+/*ARGSUSED*/
+static void
+zfs_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
+{
+ zfs_case_t *zcp;
+ int32_t pool_state;
+ uint64_t ena, pool_guid, vdev_guid;
+ nvlist_t *detector;
+ boolean_t isresource;
+
+ isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");
+
+ if (isresource) {
+ /*
+ * For our faked-up 'ok' resource (see below), we have no normal
+ * payload members.
+ */
+ if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ &vdev_guid) != 0)
+ pool_state = SPA_LOAD_OPEN;
+ else
+ pool_state = SPA_LOAD_NONE;
+ detector = NULL;
+ } else {
+ (void) nvlist_lookup_nvlist(nvl,
+ FM_EREPORT_DETECTOR, &detector);
+ (void) nvlist_lookup_int32(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
+ }
+
+ /*
+ * Without a retire agent, we subscribe to our own faults and just
+ * discard them.
+ */
+ if (fmd_nvl_class_match(hdl, nvl, "fault.fs.zfs.*"))
+ return;
+
+ /*
+ * Ignore all block level (.io and .checksum) errors not associated with
+ * a pool open. We should really update a bean counter, and eventually
+ * do some real predictive analysis based on these faults.
+ */
+ if ((fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.io") ||
+ fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.checksum")) &&
+ pool_state == SPA_LOAD_NONE)
+ return;
+
+ /*
+ * We also ignore all ereports generated during an import of a pool,
+ * since the only possible fault (.pool) would result in import failure,
+ * and hence no persistent fault. Some day we may want to do something
+ * with these ereports, so we continue generating them internally.
+ */
+ if (pool_state == SPA_LOAD_IMPORT)
+ return;
+
+ /*
+ * Determine if this ereport corresponds to an open case. Cases are
+ * indexed by ENA, since ZFS does all the work of chaining together
+ * related ereports.
+ *
+ * We also detect if an ereport corresponds to an open case by context,
+ * such as:
+ *
+ * - An error occurred during an open of a pool with an existing
+ * case.
+ *
+ * - An error occurred for a device which already has an open
+ * case.
+ */
+ if (!isresource) {
+ (void) nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena);
+ (void) nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
+ if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*"))
+ (void) nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid);
+ else
+ vdev_guid = 0;
+ } else {
+ (void) nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
+ if (nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
+ vdev_guid = 0;
+ }
+
+ for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+ zcp = uu_list_next(zfs_cases, zcp)) {
+ /*
+ * Matches a known ENA.
+ */
+ if (zcp->zc_data.zc_ena == ena)
+ break;
+
+ /*
+ * Matches a case involving load errors for this same pool.
+ */
+ if (zcp->zc_data.zc_pool_guid == pool_guid &&
+ zcp->zc_data.zc_pool_state == SPA_LOAD_OPEN &&
+ pool_state == SPA_LOAD_OPEN)
+ break;
+
+ /*
+ * Device errors for the same device.
+ */
+ if (vdev_guid != 0 && zcp->zc_data.zc_vdev_guid == vdev_guid)
+ break;
+ }
+
+ if (zcp == NULL) {
+ fmd_case_t *cs;
+ zfs_case_data_t data;
+
+ /*
+ * If this is one of our 'fake' resource ereports, and there is
+ * no case open, simply discard it.
+ */
+ if (isresource)
+ return;
+
+ /*
+ * Open a new case.
+ */
+ cs = fmd_case_open(hdl, NULL);
+
+ /*
+ * Initialize the case buffer. To commonize code, we actually
+ * create the buffer with existing data, and then call
+ * zfs_case_unserialize() to instantiate the in-core structure.
+ */
+ fmd_buf_create(hdl, cs, CASE_DATA,
+ sizeof (zfs_case_data_t));
+
+ data.zc_version = CASE_DATA_VERSION;
+ data.zc_ena = ena;
+ data.zc_pool_guid = pool_guid;
+ data.zc_vdev_guid = vdev_guid;
+ data.zc_has_timer = 0;
+ data.zc_pool_state = (int)pool_state;
+
+ fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
+
+ zcp = zfs_case_unserialize(hdl, cs);
+ assert(zcp != NULL);
+ }
+
+ /*
+ * The 'resource.fs.zfs.ok' event is a special internal-only event that
+ * signifies that a pool or device that was previously faulted has now
+ * come online (as detected by ZFS). This allows us to close the
+ * associated case.
+ */
+ if (isresource) {
+ fmd_case_close(hdl, zcp->zc_case);
+ return;
+ }
+
+ /*
+ * Associate the ereport with this case.
+ */
+ fmd_case_add_ereport(hdl, zcp->zc_case, ep);
+
+ /*
+ * Don't do anything else if this case is already solved.
+ */
+ if (fmd_case_solved(hdl, zcp->zc_case))
+ return;
+
+ /*
+ * Determine if we should solve the case and generate a fault. We solve
+ * a case if:
+ *
+ * a. A pool failed to open (ereport.fs.zfs.pool)
+ * b. A device failed to open (ereport.fs.zfs.pool) while a pool
+ * was up and running.
+ *
+ * We may see a series of ereports associated with a pool open, all
+ * chained together by the same ENA. If the pool open succeeds, then
+ * we'll see no further ereports. To detect when a pool open has
+ * succeeded, we associate a timer with the event. When it expires, we
+ * close the case.
+ */
+ if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.zpool")) {
+ /*
+ * Pool level fault.
+ */
+ nvlist_t *fault;
+
+ fault = fmd_nvl_create_fault(hdl, "fault.fs.zfs.pool",
+ 100, detector, NULL, detector);
+ fmd_case_add_suspect(hdl, zcp->zc_case, fault);
+ fmd_case_solve(hdl, zcp->zc_case);
+
+ if (zcp->zc_data.zc_has_timer) {
+ fmd_timer_remove(hdl, zcp->zc_timer);
+ zcp->zc_data.zc_has_timer = 0;
+ zfs_case_serialize(hdl, zcp);
+ }
+
+ } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*") &&
+ pool_state == SPA_LOAD_NONE) {
+ /*
+ * Device fault.
+ */
+ nvlist_t *fault;
+
+ fault = fmd_nvl_create_fault(hdl, "fault.fs.zfs.device",
+ 100, detector, NULL, detector);
+ fmd_case_add_suspect(hdl, zcp->zc_case, fault);
+ fmd_case_solve(hdl, zcp->zc_case);
+
+ if (zcp->zc_data.zc_has_timer) {
+ fmd_timer_remove(hdl, zcp->zc_timer);
+ zcp->zc_data.zc_has_timer = 0;
+ zfs_case_serialize(hdl, zcp);
+ }
+
+ } else if (pool_state == SPA_LOAD_OPEN) {
+ /*
+ * Error incurred during a pool open. Reset the timer
+ * associated with this case.
+ */
+ if (zcp->zc_data.zc_has_timer)
+ fmd_timer_remove(hdl, zcp->zc_timer);
+ zcp->zc_timer = fmd_timer_install(hdl, zcp, NULL,
+ zfs_case_timeout);
+ if (!zcp->zc_data.zc_has_timer) {
+ zcp->zc_data.zc_has_timer = 1;
+ zfs_case_serialize(hdl, zcp);
+ }
+ }
+}
+
+/*
+ * Timeout - indicates that a pool had faults, but was eventually opened
+ * successfully.
+ */
+/* ARGSUSED */
+static void
+zfs_timeout(fmd_hdl_t *hdl, id_t id, void *data)
+{
+ zfs_case_t *zcp = data;
+
+ zcp->zc_data.zc_has_timer = 0;
+
+ fmd_case_close(hdl, zcp->zc_case);
+}
+
+static void
+zfs_close(fmd_hdl_t *hdl, fmd_case_t *cs)
+{
+ zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);
+
+ if (zcp->zc_data.zc_has_timer)
+ fmd_timer_remove(hdl, zcp->zc_timer);
+ uu_list_remove(zfs_cases, zcp);
+ fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+}
+
+static const fmd_hdl_ops_t fmd_ops = {
+ zfs_recv, /* fmdo_recv */
+ zfs_timeout, /* fmdo_timeout */
+ zfs_close, /* fmdo_close */
+ NULL, /* fmdo_stats */
+ NULL, /* fmdo_gc */
+};
+
+static const fmd_prop_t fmd_props[] = {
+ { "case_timeout", FMD_TYPE_UINT32, "5" },
+ { NULL, 0, NULL }
+};
+
+static const fmd_hdl_info_t fmd_info = {
+ "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
+};
+
+void
+_fmd_init(fmd_hdl_t *hdl)
+{
+ fmd_case_t *cp;
+
+ if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
+ sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
+ NULL, 0)) == NULL)
+ return;
+
+ if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, 0)) == NULL) {
+ uu_list_pool_destroy(zfs_case_pool);
+ return;
+ }
+
+ if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
+ uu_list_destroy(zfs_cases);
+ uu_list_pool_destroy(zfs_case_pool);
+ return;
+ }
+
+ /*
+ * Iterate over all active cases and unserialize the associated buffers,
+ * adding them to our list of open cases.
+ */
+ for (cp = fmd_case_next(hdl, NULL);
+ cp != NULL; cp = fmd_case_next(hdl, cp))
+ (void) zfs_case_unserialize(hdl, cp);
+
+ zfs_case_timeout = fmd_prop_get_int32(hdl, "case_timeout") * NANOSEC;
+}
+
+void
+_fmd_fini(fmd_hdl_t *hdl)
+{
+ zfs_case_t *zcp;
+ uu_list_walk_t *walk;
+
+ /*
+ * Remove all active cases.
+ */
+ walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
+ while ((zcp = uu_list_walk_next(walk)) != NULL) {
+ uu_list_remove(zfs_cases, zcp);
+ fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+ }
+ uu_list_walk_end(walk);
+
+ uu_list_destroy(zfs_cases);
+ uu_list_pool_destroy(zfs_case_pool);
+}
diff --git a/usr/src/cmd/fm/schemes/Makefile b/usr/src/cmd/fm/schemes/Makefile
index 4f1dd443df..8dfc6ff36d 100644
--- a/usr/src/cmd/fm/schemes/Makefile
+++ b/usr/src/cmd/fm/schemes/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -33,6 +32,7 @@ SUBDIRS = \
legacy-hc \
mem \
mod \
- pkg
+ pkg \
+ zfs
include ../Makefile.subdirs
diff --git a/usr/src/cmd/fm/schemes/zfs/Makefile b/usr/src/cmd/fm/schemes/zfs/Makefile
new file mode 100644
index 0000000000..0c82190bb3
--- /dev/null
+++ b/usr/src/cmd/fm/schemes/zfs/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+include ../../../Makefile.cmd
+
+SUBDIRS = $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+include ../../Makefile.subdirs
diff --git a/usr/src/cmd/fm/schemes/zfs/amd64/Makefile b/usr/src/cmd/fm/schemes/zfs/amd64/Makefile
new file mode 100644
index 0000000000..b3e2565271
--- /dev/null
+++ b/usr/src/cmd/fm/schemes/zfs/amd64/Makefile
@@ -0,0 +1,33 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+include ../../Makefile.com
+include $(SRC)/Makefile.master.64
+include ../../Makefile.targ
+
+LDLIBS += -lzfs
+
+install: all $(ROOTPROG64)
diff --git a/usr/src/cmd/fm/schemes/zfs/i386/Makefile b/usr/src/cmd/fm/schemes/zfs/i386/Makefile
new file mode 100644
index 0000000000..11a1534892
--- /dev/null
+++ b/usr/src/cmd/fm/schemes/zfs/i386/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+include ../../Makefile.com
+include ../../Makefile.targ
+
+LDLIBS += -lzfs
+
+install: all $(ROOTPROG)
diff --git a/usr/src/cmd/fm/schemes/zfs/scheme.c b/usr/src/cmd/fm/schemes/zfs/scheme.c
new file mode 100644
index 0000000000..7f2532a637
--- /dev/null
+++ b/usr/src/cmd/fm/schemes/zfs/scheme.c
@@ -0,0 +1,191 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <fm/fmd_fmri.h>
+#include <strings.h>
+#include <libzfs.h>
+
+typedef struct cbdata {
+ uint64_t cb_guid;
+ zpool_handle_t *cb_pool;
+} cbdata_t;
+
+static int
+find_pool(zpool_handle_t *zhp, void *data)
+{
+ cbdata_t *cbp = data;
+
+ if (zpool_get_guid(zhp) == cbp->cb_guid) {
+ cbp->cb_pool = zhp;
+ return (1);
+ }
+
+ zpool_close(zhp);
+
+ return (0);
+}
+
+ssize_t
+fmd_fmri_nvl2str(nvlist_t *nvl, char *buf, size_t buflen)
+{
+ uint64_t pool_guid, vdev_guid;
+ cbdata_t cb;
+ ssize_t len;
+ const char *name;
+ char guidbuf[64];
+
+ (void) nvlist_lookup_uint64(nvl, FM_FMRI_ZFS_POOL, &pool_guid);
+
+ /*
+ * Attempt to convert the pool guid to a name.
+ */
+ cb.cb_guid = pool_guid;
+ cb.cb_pool = NULL;
+
+ if (zpool_iter(find_pool, &cb) == 1) {
+ name = zpool_get_name(cb.cb_pool);
+ } else {
+ (void) snprintf(guidbuf, sizeof (guidbuf), "%llx", pool_guid);
+ name = guidbuf;
+ }
+
+ if (nvlist_lookup_uint64(nvl, FM_FMRI_ZFS_VDEV, &vdev_guid) == 0)
+ len = snprintf(buf, buflen, "%s://pool=%s/vdev=%llx",
+ FM_FMRI_SCHEME_ZFS, name, vdev_guid);
+ else
+ len = snprintf(buf, buflen, "%s://pool=%s",
+ FM_FMRI_SCHEME_ZFS, name);
+
+ if (cb.cb_pool)
+ zpool_close(cb.cb_pool);
+
+ return (len);
+}
+
+static nvlist_t *
+find_vdev_iter(nvlist_t *nv, uint64_t search)
+{
+ uint_t c, children;
+ nvlist_t **child;
+ uint64_t guid;
+ nvlist_t *ret;
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid);
+
+ if (search == guid)
+ return (nv);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return (0);
+
+ for (c = 0; c < children; c++)
+ if ((ret = find_vdev_iter(child[c], search)) != 0)
+ return (ret);
+
+ return (NULL);
+}
+
+static nvlist_t *
+find_vdev(zpool_handle_t *zhp, uint64_t guid)
+{
+ nvlist_t *config;
+ nvlist_t *nvroot;
+
+ config = zpool_get_config(zhp, NULL);
+
+ (void) nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
+
+ return (find_vdev_iter(nvroot, guid));
+}
+
+int
+fmd_fmri_present(nvlist_t *nvl)
+{
+ uint64_t pool_guid, vdev_guid;
+ cbdata_t cb;
+ int ret;
+
+ (void) nvlist_lookup_uint64(nvl, FM_FMRI_ZFS_POOL, &pool_guid);
+
+ cb.cb_guid = pool_guid;
+ cb.cb_pool = NULL;
+
+ if (zpool_iter(find_pool, &cb) != 1)
+ return (0);
+
+ if (nvlist_lookup_uint64(nvl, FM_FMRI_ZFS_VDEV, &vdev_guid) != 0) {
+ zpool_close(cb.cb_pool);
+ return (1);
+ }
+
+ ret = (find_vdev(cb.cb_pool, vdev_guid) != NULL);
+
+ zpool_close(cb.cb_pool);
+
+ return (ret);
+}
+
+int
+fmd_fmri_unusable(nvlist_t *nvl)
+{
+ uint64_t pool_guid, vdev_guid;
+ cbdata_t cb;
+ nvlist_t *vd;
+ int ret;
+
+ (void) nvlist_lookup_uint64(nvl, FM_FMRI_ZFS_POOL, &pool_guid);
+
+ cb.cb_guid = pool_guid;
+ cb.cb_pool = NULL;
+
+ if (zpool_iter(find_pool, &cb) != 1)
+ return (1);
+
+ if (nvlist_lookup_uint64(nvl, FM_FMRI_ZFS_VDEV, &vdev_guid) != 0) {
+ ret = (zpool_get_state(cb.cb_pool) == POOL_STATE_UNAVAIL);
+ zpool_close(cb.cb_pool);
+ return (ret);
+ }
+
+ vd = find_vdev(cb.cb_pool, vdev_guid);
+ if (vd == NULL) {
+ ret = 1;
+ } else {
+ vdev_stat_t *vs;
+ uint_t c;
+
+ (void) nvlist_lookup_uint64_array(vd, ZPOOL_CONFIG_STATS,
+ (uint64_t **)&vs, &c);
+
+ ret = (vs->vs_state < VDEV_STATE_DEGRADED);
+ }
+
+ zpool_close(cb.cb_pool);
+
+ return (ret);
+}
diff --git a/usr/src/cmd/fm/schemes/zfs/sparc/Makefile b/usr/src/cmd/fm/schemes/zfs/sparc/Makefile
new file mode 100644
index 0000000000..11a1534892
--- /dev/null
+++ b/usr/src/cmd/fm/schemes/zfs/sparc/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+include ../../Makefile.com
+include ../../Makefile.targ
+
+LDLIBS += -lzfs
+
+install: all $(ROOTPROG)
diff --git a/usr/src/cmd/fm/schemes/zfs/sparcv9/Makefile b/usr/src/cmd/fm/schemes/zfs/sparcv9/Makefile
new file mode 100644
index 0000000000..b3e2565271
--- /dev/null
+++ b/usr/src/cmd/fm/schemes/zfs/sparcv9/Makefile
@@ -0,0 +1,33 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+include ../../Makefile.com
+include $(SRC)/Makefile.master.64
+include ../../Makefile.targ
+
+LDLIBS += -lzfs
+
+install: all $(ROOTPROG64)
diff --git a/usr/src/cmd/fs.d/df.c b/usr/src/cmd/fs.d/df.c
index 2650f41811..0a38f44b1a 100644
--- a/usr/src/cmd/fs.d/df.c
+++ b/usr/src/cmd/fs.d/df.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -24,7 +23,7 @@
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -251,29 +250,19 @@ static void (*_zfs_close)(zfs_handle_t *);
static uint64_t (*_zfs_prop_get_int)(zfs_handle_t *, zfs_prop_t);
static void (*_zfs_set_error_handler)(void (*)(const char *, va_list));
-int
-main(int argc, char *argv[])
+/*
+ * Dynamically check for libzfs, in case the user hasn't installed the SUNWzfs
+ * packages. A basic utility such as df shouldn't depend on optional
+ * filesystems.
+ */
+static int
+load_libzfs(void)
{
void *hdl;
- (void) setlocale(LC_ALL, "");
-
-#if !defined(TEXT_DOMAIN) /* Should be defined by cc -D */
-#define TEXT_DOMAIN "SYS_TEST"
-#endif
- (void) textdomain(TEXT_DOMAIN);
-
- program_name = basename(argv[0]);
-
-#ifdef _iBCS2
- sysv3_set = getenv("SYSV3");
-#endif /* _iBCS2 */
+ if (_zfs_open != NULL)
+ return (1);
- /*
- * Dynamically check for libzfs, in case the user hasn't installed the
- * SUNWzfs packages. A basic utility such as df shouldn't depend on
- * optional filesystems.
- */
if ((hdl = dlopen("libzfs.so", RTLD_LAZY)) != NULL) {
_zfs_set_error_handler = (void (*)())
dlsym(hdl, "zfs_set_error_handler");
@@ -292,9 +281,29 @@ main(int argc, char *argv[])
* like "can't open ..." under race conditions.
*/
_zfs_set_error_handler(dummy_error_handler);
+ return (1);
}
}
+ return (0);
+}
+
+int
+main(int argc, char *argv[])
+{
+ (void) setlocale(LC_ALL, "");
+
+#if !defined(TEXT_DOMAIN) /* Should be defined by cc -D */
+#define TEXT_DOMAIN "SYS_TEST"
+#endif
+ (void) textdomain(TEXT_DOMAIN);
+
+ program_name = basename(argv[0]);
+
+#ifdef _iBCS2
+ sysv3_set = getenv("SYSV3");
+#endif /* _iBCS2 */
+
if (EQ(program_name, DEVNM_CMD))
do_devnm(argc, argv);
@@ -1231,7 +1240,7 @@ adjust_total_blocks(struct df_request *dfrp, fsblkcnt64_t *total,
uint64_t quota;
if (strcmp(DFR_FSTYPE(dfrp), MNTTYPE_ZFS) != 0 ||
- _zfs_open == NULL)
+ !load_libzfs())
return;
/*
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index 27b0630c72..da7e87dcd2 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -831,7 +830,7 @@ spa_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
}
if (spa.spa_state < 0 || spa.spa_state > POOL_STATE_UNAVAIL)
- state = "UKNNOWN";
+ state = "UNKNOWN";
else
state = statetab[spa.spa_state];
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c
index 377430909f..3fdbace8b7 100644
--- a/usr/src/cmd/truss/codes.c
+++ b/usr/src/cmd/truss/codes.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -926,6 +925,18 @@ const struct ioc {
"zfs_cmd_t" },
{ (uint_t)ZFS_IOC_SENDBACKUP, "ZFS_IOC_SENDBACKUP",
"zfs_cmd_t" },
+ { (uint_t)ZFS_IOC_INJECT_FAULT, "ZFS_IOC_INJECT_FAULT",
+ "zfs_cmd_t" },
+ { (uint_t)ZFS_IOC_CLEAR_FAULT, "ZFS_IOC_CLEAR_FAULT",
+ "zfs_cmd_t" },
+ { (uint_t)ZFS_IOC_INJECT_LIST_NEXT, "ZFS_IOC_INJECT_LIST_NEXT",
+ "zfs_cmd_t" },
+ { (uint_t)ZFS_IOC_ERROR_LOG, "ZFS_IOC_ERROR_LOG",
+ "zfs_cmd_t" },
+ { (uint_t)ZFS_IOC_CLEAR, "ZFS_IOC_CLEAR",
+ "zfs_cmd_t" },
+ { (uint_t)ZFS_IOC_BOOKMARK_NAME, "ZFS_IOC_BOOKMARK_NAME",
+ "zfs_cmd_t" },
/* kssl ioctls */
{ (uint_t)KSSL_ADD_ENTRY, "KSSL_ADD_ENTRY",
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 545f1d0df3..990e215c46 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -193,7 +192,7 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
size_t nvsize = *(uint64_t *)data;
char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
- dmu_read(os, object, 0, nvsize, packed);
+ VERIFY(0 == dmu_read(os, object, 0, nvsize, packed));
VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
@@ -365,7 +364,8 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
*/
alloc = 0;
for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
- dmu_read(os, smo->smo_object, offset, sizeof (entry), &entry);
+ VERIFY(0 == dmu_read(os, smo->smo_object, offset,
+ sizeof (entry), &entry));
if (SM_DEBUG_DECODE(entry)) {
(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
(u_longlong_t)(offset / sizeof (entry)),
@@ -434,10 +434,10 @@ dump_metaslabs(spa_t *spa)
for (c = 0; c < rvd->vdev_children; c++) {
vd = rvd->vdev_child[c];
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
(void) printf("\n vdev %llu = %s\n\n",
(u_longlong_t)vd->vdev_id, vdev_description(vd));
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
if (dump_opt['d'] <= 5) {
(void) printf("\t%10s %10s %5s\n",
@@ -463,9 +463,9 @@ dump_dtl(vdev_t *vd, int indent)
if (indent == 0)
(void) printf("\nDirty time logs:\n\n");
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
(void) printf("\t%*s%s\n", indent, "", vdev_description(vd));
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
/*
@@ -523,11 +523,11 @@ zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
if (bc->bc_errno) {
(void) sprintf(buffer,
- "Error %d reading <%llu, %llu, %d, %llu>: ",
+ "Error %d reading <%llu, %llu, %lld, %llu>: ",
bc->bc_errno,
(u_longlong_t)zb->zb_objset,
(u_longlong_t)zb->zb_object,
- zb->zb_level,
+ (u_longlong_t)zb->zb_level,
(u_longlong_t)zb->zb_blkid);
goto out;
}
@@ -547,7 +547,6 @@ zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
bpx < bpend; bpx++) {
if (bpx->blk_birth != 0) {
- ASSERT(bpx->blk_fill > 0);
fill += bpx->blk_fill;
} else {
ASSERT(bpx->blk_fill == 0);
@@ -575,8 +574,8 @@ zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
if (l == zb->zb_level) {
- (void) sprintf(buffer + strlen(buffer), "L%x",
- zb->zb_level);
+ (void) sprintf(buffer + strlen(buffer), "L%llx",
+ (u_longlong_t)zb->zb_level);
} else {
(void) sprintf(buffer + strlen(buffer), " ");
}
@@ -730,7 +729,7 @@ dump_bplist(objset_t *mos, uint64_t object, char *name)
if (dump_opt['d'] < 3)
return;
- bplist_open(&bpl, mos, object);
+ VERIFY(0 == bplist_open(&bpl, mos, object));
if (bplist_empty(&bpl)) {
bplist_close(&bpl);
return;
@@ -776,20 +775,20 @@ znode_path(objset_t *os, uint64_t object, char *pathbuf, size_t size)
size_t complen;
char component[MAXNAMELEN + 1];
char *path;
+ int error;
path = pathbuf + size;
*--path = '\0';
for (;;) {
- db = dmu_bonus_hold(os, object);
- if (db == NULL)
+ error = dmu_bonus_hold(os, object, FTAG, &db);
+ if (error)
break;
- dmu_buf_read(db);
dmu_object_info_from_db(db, &doi);
zp = db->db_data;
parent = zp->zp_parent;
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
if (doi.doi_bonus_type != DMU_OT_ZNODE)
break;
@@ -881,7 +880,7 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
dump_none, /* ZIL intent log */
dump_dnode, /* DMU dnode */
dump_dmu_objset, /* DMU objset */
- dump_dsl_dir, /* DSL directory */
+ dump_dsl_dir, /* DSL directory */
dump_zap, /* DSL directory child map */
dump_zap, /* DSL dataset snap map */
dump_zap, /* DSL props */
@@ -897,6 +896,7 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
dump_uint8, /* other uint8[] */
dump_uint64, /* other uint64[] */
dump_zap, /* other ZAP */
+ dump_zap, /* persistent error log */
};
static void
@@ -920,10 +920,10 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
if (object == 0) {
dn = os->os->os_meta_dnode;
} else {
- db = dmu_bonus_hold(os, object);
- if (db == NULL)
- fatal("dmu_bonus_hold(%llu) failed", object);
- dmu_buf_read(db);
+ error = dmu_bonus_hold(os, object, FTAG, &db);
+ if (error)
+ fatal("dmu_bonus_hold(%llu) failed, errno %u",
+ object, error);
bonus = db->db_data;
bsize = db->db_size;
dn = ((dmu_buf_impl_t *)db)->db_dnode;
@@ -999,7 +999,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
}
if (db != NULL)
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
static char *objset_types[DMU_OST_NUMTYPES] = {
@@ -1214,7 +1214,7 @@ zdb_space_map_load(spa_t *spa)
}
static int
-zdb_space_map_claim(spa_t *spa, blkptr_t *bp)
+zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb)
{
dva_t *dva = &bp->blk_dva[0];
uint64_t vdev = DVA_GET_VDEV(dva);
@@ -1248,7 +1248,7 @@ zdb_space_map_claim(spa_t *spa, blkptr_t *bp)
error = zio_wait(zio_read(NULL, spa, &blk,
&gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD));
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD, zb));
if (error)
return (error);
if (BP_SHOULD_BYTESWAP(&blk))
@@ -1256,7 +1256,7 @@ zdb_space_map_claim(spa_t *spa, blkptr_t *bp)
for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
if (gbh.zg_blkptr[g].blk_birth == 0)
break;
- error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g]);
+ error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g], zb);
if (error)
return (error);
}
@@ -1327,11 +1327,6 @@ zdb_refresh_ubsync(spa_t *spa)
zio_t *zio;
/*
- * Reopen all devices to purge zdb's vdev caches.
- */
- vdev_reopen(rvd, NULL);
-
- /*
* Reload the uberblock.
*/
zio = zio_root(spa, NULL, NULL,
@@ -1367,8 +1362,6 @@ typedef struct zdb_cb {
int zcb_haderrors;
} zdb_cb_t;
-static blkptr_cb_t zdb_blkptr_cb;
-
static void
zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
{
@@ -1388,7 +1381,7 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
if (dump_opt['L'])
return;
- error = zdb_space_map_claim(spa, bp);
+ error = zdb_space_map_claim(spa, bp, &zcb->zcb_cache->bc_bookmark);
if (error == 0)
return;
@@ -1402,22 +1395,6 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
(void) fatal("fatal error %d in bp %p", error, bp);
}
-static void
-zdb_log_block_cb(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t first_txg)
-{
- if (bp->blk_birth < first_txg) {
- zdb_cb_t *zcb = arg;
- traverse_blk_cache_t bc = *zcb->zcb_cache;
- zbookmark_t *zb = &bc.bc_bookmark;
-
- zb->zb_objset = bp->blk_cksum.zc_word[2];
- zb->zb_blkid = bp->blk_cksum.zc_word[3];
- bc.bc_blkptr = *bp;
-
- (void) zdb_blkptr_cb(&bc, zilog->zl_spa, arg);
- }
-}
-
static int
zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
@@ -1444,11 +1421,11 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
blkbuf[0] = '\0';
(void) printf("zdb_blkptr_cb: Got error %d reading "
- "<%llu, %llu, %d, %llx> %s -- %s\n",
+ "<%llu, %llu, %lld, %llx> %s -- %s\n",
bc->bc_errno,
(u_longlong_t)zb->zb_objset,
(u_longlong_t)zb->zb_object,
- zb->zb_level,
+ (u_longlong_t)zb->zb_level,
(u_longlong_t)zb->zb_blkid,
blkbuf,
error == EAGAIN ? "retrying" : "skipping");
@@ -1472,18 +1449,6 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
blkbuf);
}
- if (type == DMU_OT_OBJSET) {
- objset_phys_t *osphys = bc->bc_data;
- zilog_t zilog = { 0 };
- zilog.zl_header = &osphys->os_zil_header;
- zilog.zl_spa = spa;
-
- zcb->zcb_cache = bc;
-
- zil_parse(&zilog, zdb_log_block_cb, NULL, zcb,
- spa_first_txg(spa));
- }
-
return (0);
}
@@ -1492,6 +1457,7 @@ dump_block_stats(spa_t *spa)
{
traverse_handle_t *th;
zdb_cb_t zcb = { 0 };
+ traverse_blk_cache_t dummy_cache = { 0 };
zdb_blkstats_t *zb, *tzb;
uint64_t alloc, space;
int leaks = 0;
@@ -1499,10 +1465,12 @@ dump_block_stats(spa_t *spa)
int flags;
int e;
+ zcb.zcb_cache = &dummy_cache;
+
if (dump_opt['c'])
advance |= ADVANCE_DATA;
- advance |= ADVANCE_PRUNE;
+ advance |= ADVANCE_PRUNE | ADVANCE_ZIL;
(void) printf("\nTraversing all blocks to %sverify"
" nothing leaked ...\n",
@@ -1526,8 +1494,8 @@ dump_block_stats(spa_t *spa)
blkptr_t blk;
uint64_t itor = 0;
- bplist_open(bpl, spa->spa_meta_objset,
- spa->spa_sync_bplist_obj);
+ VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
+ spa->spa_sync_bplist_obj));
while (bplist_iterate(bpl, &itor, &blk) == 0) {
zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
@@ -1543,8 +1511,8 @@ dump_block_stats(spa_t *spa)
}
/*
- * Now traverse the pool. If we're read all data to verify checksums,
- * do a scrubbing read so that we validate all copies.
+ * Now traverse the pool. If we're reading all data to verify
+ * checksums, do a scrubbing read so that we validate all copies.
*/
flags = ZIO_FLAG_CANFAIL;
if (advance & ADVANCE_DATA)
@@ -1552,7 +1520,7 @@ dump_block_stats(spa_t *spa)
th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags);
th->th_noread = zdb_noread;
- traverse_add_pool(th, 0, -1ULL);
+ traverse_add_pool(th, 0, spa_first_txg(spa));
while (traverse_more(th) == EAGAIN)
continue;
@@ -1734,6 +1702,7 @@ main(int argc, char **argv)
int verbose = 0;
int error;
int flag, set;
+ vdev_knob_t *vk;
(void) setrlimit(RLIMIT_NOFILE, &rl);
@@ -1789,10 +1758,10 @@ main(int argc, char **argv)
zdb_noread.zb_level = strtol(endstr + 1, &endstr, 0);
zdb_noread.zb_blkid = strtoull(endstr + 1, &endstr, 16);
(void) printf("simulating bad block "
- "<%llu, %llu, %d, %llx>\n",
+ "<%llu, %llu, %lld, %llx>\n",
(u_longlong_t)zdb_noread.zb_objset,
(u_longlong_t)zdb_noread.zb_object,
- zdb_noread.zb_level,
+ (u_longlong_t)zdb_noread.zb_level,
(u_longlong_t)zdb_noread.zb_blkid);
break;
case 'v':
@@ -1809,6 +1778,15 @@ main(int argc, char **argv)
kernel_init(FREAD);
+ /*
+ * Disable vdev caching. If we don't do this, live pool traversal
+ * won't make progress because it will never see disk updates.
+ */
+ for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
+ if (strcmp(vk->vk_name, "cache_size") == 0)
+ vk->vk_default = 0;
+ }
+
for (c = 0; c < 256; c++) {
if (dump_all && c != 'L' && c != 'l')
dump_opt[c] = 1;
diff --git a/usr/src/cmd/zdb/zdb_il.c b/usr/src/cmd/zdb/zdb_il.c
index 1006115709..ffa2471cda 100644
--- a/usr/src/cmd/zdb/zdb_il.c
+++ b/usr/src/cmd/zdb/zdb_il.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -129,9 +128,19 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (bp->blk_birth == 0) {
bzero(buf, sizeof (buf));
} else {
+ zbookmark_t zb;
+
+ ASSERT3U(bp->blk_cksum.zc_word[2], ==,
+ dmu_objset_id(zilog->zl_os));
+
+ zb.zb_objset = bp->blk_cksum.zc_word[2];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = bp->blk_cksum.zc_word[3];
+
error = zio_wait(zio_read(NULL, zilog->zl_spa,
bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL));
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
if (error)
return;
}
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c
index a92b012744..b58a29cfa4 100644
--- a/usr/src/cmd/zfs/zfs_main.c
+++ b/usr/src/cmd/zfs/zfs_main.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -111,9 +110,8 @@ typedef struct zfs_command {
/*
* Master command table. Each ZFS command has a name, associated function, and
- * usage message. Unfortunately, the usage messages need to be
- * iternationalized, so we have to have a function to return the usage message
- * based on a command index.
+ * usage message. The usage messages need to be internationalized, so we have
+ * to have a function to return the usage message based on a command index.
*
* These commands are organized according to how they are displayed in the usage
* message. An empty command (one with a NULL name) indicates an empty line in
@@ -2569,7 +2567,7 @@ manual_mount(int argc, char **argv)
char *dataset, *path;
/* check options */
- while ((c = getopt(argc, argv, ":o:O")) != -1) {
+ while ((c = getopt(argc, argv, ":mo:O")) != -1) {
switch (c) {
case 'o':
(void) strlcpy(mntopts, optarg, sizeof (mntopts));
@@ -2577,6 +2575,9 @@ manual_mount(int argc, char **argv)
case 'O':
flags |= MS_OVERLAY;
break;
+ case 'm':
+ flags |= MS_NOMNTTAB;
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
diff --git a/usr/src/cmd/zinject/Makefile b/usr/src/cmd/zinject/Makefile
new file mode 100644
index 0000000000..f646689967
--- /dev/null
+++ b/usr/src/cmd/zinject/Makefile
@@ -0,0 +1,54 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+PROG:sh= basename `pwd`
+
+include ../Makefile.cmd
+
+$(INTEL_BLD)SUBDIRS = $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all := TARGET = all
+install := TARGET = install
+clean := TARGET = clean
+clobber := TARGET = clobber
+lint := TARGET = lint
+
+.KEEP_STATE:
+
+all clean clobber lint: $(SUBDIRS)
+
+install: $(SUBDIRS)
+ -$(RM) $(ROOTUSRSBINPROG)
+ -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG)
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/zinject/Makefile.com b/usr/src/cmd/zinject/Makefile.com
new file mode 100644
index 0000000000..40f1914729
--- /dev/null
+++ b/usr/src/cmd/zinject/Makefile.com
@@ -0,0 +1,55 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+PROG:sh= cd ..; basename `pwd`
+SRCS= ../$(PROG).c ../translate.c
+
+include ../../Makefile.cmd
+
+INCS += -I../../../lib/libzpool/common
+INCS += -I../../../uts/common/fs/zfs
+
+LDLIBS += -lzpool -lzfs
+
+C99MODE= -xc99=%all
+C99LMODE= -Xc99=%all
+
+CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT $(INCS)
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+$(PROG): $(SRCS)
+ $(LINK.c) -o $(PROG) $(SRCS) $(LDLIBS)
+ $(POST_PROCESS)
+
+clean:
+
+lint: lint_SRCS
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/zinject/amd64/Makefile b/usr/src/cmd/zinject/amd64/Makefile
new file mode 100644
index 0000000000..8740a9f3ac
--- /dev/null
+++ b/usr/src/cmd/zinject/amd64/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+install: all $(ROOTUSRSBINPROG64)
diff --git a/usr/src/cmd/zinject/i386/Makefile b/usr/src/cmd/zinject/i386/Makefile
new file mode 100644
index 0000000000..d2cb13dcd1
--- /dev/null
+++ b/usr/src/cmd/zinject/i386/Makefile
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.com
+
+install: all $(ROOTUSRSBINPROG32)
diff --git a/usr/src/cmd/zinject/sparcv9/Makefile b/usr/src/cmd/zinject/sparcv9/Makefile
new file mode 100644
index 0000000000..8740a9f3ac
--- /dev/null
+++ b/usr/src/cmd/zinject/sparcv9/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+install: all $(ROOTUSRSBINPROG64)
diff --git a/usr/src/cmd/zinject/translate.c b/usr/src/cmd/zinject/translate.c
new file mode 100644
index 0000000000..882b230930
--- /dev/null
+++ b/usr/src/cmd/zinject/translate.c
@@ -0,0 +1,458 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <libzfs.h>
+
+#undef verify /* both libzfs.h and zfs_context.h want to define this */
+
+#include <sys/zfs_context.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/file.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+
+#include <sys/mkdev.h>
+
+#include "zinject.h"
+
+extern void kernel_init(int);
+extern void kernel_fini(void);
+
+static int debug;
+
+static void
+ziprintf(const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!debug)
+ return;
+
+ va_start(ap, fmt);
+ (void) vprintf(fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Given a full path to a file, translate into a dataset name and a relative
+ * path within the dataset. 'dataset' must be at least MAXNAMELEN characters,
+ * and 'relpath' must be at least MAXPATHLEN characters. We also pass a stat64
+ * buffer, which we need later to get the object ID.
+ */
+static int
+parse_pathname(const char *fullpath, char *dataset, char *relpath,
+ struct stat64 *statbuf)
+{
+ struct extmnttab mp;
+ FILE *fp;
+ int match;
+ const char *rel;
+
+ if (fullpath[0] != '/') {
+ (void) fprintf(stderr, "invalid object '%s': must be full "
+ "path\n", fullpath);
+ usage();
+ return (-1);
+ }
+
+ if (strlen(fullpath) >= MAXPATHLEN) {
+ (void) fprintf(stderr, "invalid object; pathname too long\n");
+ return (-1);
+ }
+
+ if (stat64(fullpath, statbuf) != 0) {
+ (void) fprintf(stderr, "cannot open '%s': %s\n",
+ fullpath, strerror(errno));
+ return (-1);
+ }
+
+ if ((fp = fopen(MNTTAB, "r")) == NULL) {
+ (void) fprintf(stderr, "cannot open /etc/mnttab\n");
+ return (-1);
+ }
+
+ match = 0;
+ while (getextmntent(fp, &mp, sizeof (mp)) == 0) {
+ if (makedev(mp.mnt_major, mp.mnt_minor) == statbuf->st_dev) {
+ match = 1;
+ break;
+ }
+ }
+
+ if (!match) {
+ (void) fprintf(stderr, "cannot find mountpoint for '%s'\n",
+ fullpath);
+ return (-1);
+ }
+
+ if (strcmp(mp.mnt_fstype, MNTTYPE_ZFS) != 0) {
+ (void) fprintf(stderr, "invalid path '%s': not a ZFS "
+ "filesystem\n", fullpath);
+ return (-1);
+ }
+
+ if (strncmp(fullpath, mp.mnt_mountp, strlen(mp.mnt_mountp)) != 0) {
+ (void) fprintf(stderr, "invalid path '%s': mountpoint "
+ "doesn't match path\n", fullpath);
+ return (-1);
+ }
+
+ (void) strcpy(dataset, mp.mnt_special);
+
+ rel = fullpath + strlen(mp.mnt_mountp);
+ if (rel[0] == '/')
+ rel++;
+ (void) strcpy(relpath, rel);
+
+ return (0);
+}
+
+/*
+ * Convert from a (dataset, path) pair into an (objset, object) pair. Note that
+ * we grab the object number from the inode number, since looking this up via
+ * libzpool is a real pain.
+ */
+/* ARGSUSED */
+static int
+object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
+ zinject_record_t *record)
+{
+ objset_t *os;
+ int err;
+
+ /*
+ * Before doing any libzpool operations, call sync() to ensure that the
+ * on-disk state is consistent with the in-core state.
+ */
+ sync();
+
+ if ((err = dmu_objset_open(dataset, DMU_OST_ZFS,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os)) != 0) {
+ (void) fprintf(stderr, "cannot open dataset '%s': %s\n",
+ dataset, strerror(err));
+ return (-1);
+ }
+
+ record->zi_objset = dmu_objset_id(os);
+ record->zi_object = statbuf->st_ino;
+
+ dmu_objset_close(os);
+
+ return (0);
+}
+
+/*
+ * Calculate the real range based on the type, level, and range given.
+ */
+static int
+calculate_range(const char *dataset, err_type_t type, int level, char *range,
+ zinject_record_t *record)
+{
+ objset_t *os = NULL;
+ dnode_t *dn = NULL;
+ int err;
+ int ret = -1;
+
+ /*
+ * Determine the numeric range from the string.
+ */
+ if (range == NULL) {
+ /*
+ * If range is unspecified, set the range to [0,-1], which
+ * indicates that the whole object should be treated as an
+ * error.
+ */
+ record->zi_start = 0;
+ record->zi_end = -1ULL;
+ } else {
+ char *end;
+
+ /* XXX add support for suffixes */
+ record->zi_start = strtoull(range, &end, 10);
+
+
+ if (*end == '\0')
+ record->zi_end = record->zi_start + 1;
+ else if (*end == ',')
+ record->zi_end = strtoull(end + 1, &end, 10);
+
+ if (*end != '\0') {
+ (void) fprintf(stderr, "invalid range '%s': must be "
+ "a numeric range of the form 'start[,end]'\n",
+ range);
+ goto out;
+ }
+ }
+
+ switch (type) {
+ case TYPE_DATA:
+ break;
+
+ case TYPE_DNODE:
+ /*
+ * If this is a request to inject faults into the dnode, then we
+ * must translate the current (objset,object) pair into an
+ * offset within the metadnode for the objset. Specifying any
+ * kind of range with type 'dnode' is illegal.
+ */
+ if (range != NULL) {
+ (void) fprintf(stderr, "range cannot be specified when "
+ "type is 'dnode'\n");
+ goto out;
+ }
+
+ record->zi_start = record->zi_object * sizeof (dnode_phys_t);
+ record->zi_end = record->zi_start + sizeof (dnode_phys_t);
+ record->zi_object = 0;
+ break;
+ }
+
+ /*
+ * Get the dnode associated with object, so we can calculate the block
+ * size.
+ */
+ if ((err = dmu_objset_open(dataset, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os)) != 0) {
+ (void) fprintf(stderr, "cannot open dataset '%s': %s\n",
+ dataset, strerror(err));
+ goto out;
+ }
+
+ if (record->zi_object == 0) {
+ dn = os->os->os_meta_dnode;
+ } else {
+ err = dnode_hold(os->os, record->zi_object, FTAG, &dn);
+ if (err != 0) {
+ (void) fprintf(stderr, "failed to hold dnode "
+ "for object %llu\n",
+ (u_longlong_t)record->zi_object);
+ goto out;
+ }
+ }
+
+
+ ziprintf("data shift: %d\n", (int)dn->dn_datablkshift);
+ ziprintf(" ind shift: %d\n", (int)dn->dn_indblkshift);
+
+ /*
+ * Translate range into block IDs.
+ */
+ if (record->zi_start != 0 || record->zi_end != -1ULL) {
+ record->zi_start >>= dn->dn_datablkshift;
+ record->zi_end >>= dn->dn_datablkshift;
+ }
+
+ /*
+ * Check level, and then translate level 0 blkids into ranges
+ * appropriate for level of indirection.
+ */
+ record->zi_level = level;
+ if (level > 0) {
+ ziprintf("level 0 blkid range: [%llu, %llu]\n",
+ record->zi_start, record->zi_end);
+
+ if (level >= dn->dn_nlevels) {
+ (void) fprintf(stderr, "level %d exceeds max level "
+ "of object (%d)\n", level, dn->dn_nlevels - 1);
+ goto out;
+ }
+
+ if (record->zi_start != 0 || record->zi_end != 0) {
+ int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ for (; level > 0; level--) {
+ record->zi_start >>= shift;
+ record->zi_end >>= shift;
+ }
+ }
+ }
+
+ ret = 0;
+out:
+ if (dn) {
+ if (dn != os->os->os_meta_dnode)
+ dnode_rele(dn, FTAG);
+ }
+ if (os)
+ dmu_objset_close(os);
+
+ return (ret);
+}
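
To make the range-to-blkid arithmetic in calculate_range() concrete, here is a small standalone sketch that mirrors the same shifts. All block-size values below are hypothetical examples chosen for illustration; the real ones come from the dnode (dn_datablkshift, dn_indblkshift) and the block-pointer size constant (SPA_BLKPTRSHIFT).

    #include <stdio.h>
    #include <inttypes.h>

    int
    main(void)
    {
            /* Hypothetical example values; real ones come from the dnode. */
            int datablkshift = 17;                  /* 128K data blocks */
            int indblkshift = 14;                   /* 16K indirect blocks */
            int blkptrshift = 7;                    /* 128-byte block pointers */
            int level = 1;                          /* inject at the first indirect level */
            uint64_t start = 0, end = 1ULL << 30;   /* byte range [0, 1G) */

            /* Byte offsets -> level-0 block IDs, as in calculate_range(). */
            start >>= datablkshift;
            end >>= datablkshift;
            (void) printf("level 0 blkid range: [%" PRIu64 ", %" PRIu64 "]\n",
                start, end);

            /* Each indirect block covers 2^(indblkshift - blkptrshift) children. */
            for (; level > 0; level--) {
                    start >>= (indblkshift - blkptrshift);
                    end >>= (indblkshift - blkptrshift);
            }
            (void) printf("level 1 blkid range: [%" PRIu64 ", %" PRIu64 "]\n",
                start, end);
            return (0);
    }

With these example shifts, a 1GB range maps to level-0 blkids [0, 8192] and then to level-1 blkids [0, 64].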
+
+int
+translate_record(err_type_t type, const char *object, const char *range,
+ int level, zinject_record_t *record, char *poolname, char *dataset)
+{
+ char path[MAXPATHLEN];
+ char *slash;
+ struct stat64 statbuf;
+ int ret = -1;
+
+ kernel_init(FREAD);
+
+ debug = (getenv("ZINJECT_DEBUG") != NULL);
+
+ ziprintf("translating: %s\n", object);
+
+ if (MOS_TYPE(type)) {
+ /*
+ * MOS objects are treated specially.
+ */
+ switch (type) {
+ case TYPE_MOS:
+ record->zi_type = 0;
+ break;
+ case TYPE_MOSDIR:
+ record->zi_type = DMU_OT_OBJECT_DIRECTORY;
+ break;
+ case TYPE_METASLAB:
+ record->zi_type = DMU_OT_OBJECT_ARRAY;
+ break;
+ case TYPE_CONFIG:
+ record->zi_type = DMU_OT_PACKED_NVLIST;
+ break;
+ case TYPE_BPLIST:
+ record->zi_type = DMU_OT_BPLIST;
+ break;
+ case TYPE_SPACEMAP:
+ record->zi_type = DMU_OT_SPACE_MAP;
+ break;
+ case TYPE_ERRLOG:
+ record->zi_type = DMU_OT_ERROR_LOG;
+ break;
+ }
+
+ dataset[0] = '\0';
+ (void) strcpy(poolname, object);
+ return (0);
+ }
+
+ /*
+ * Convert a full path into a (dataset, file) pair.
+ */
+ if (parse_pathname(object, dataset, path, &statbuf) != 0)
+ goto err;
+
+ ziprintf(" dataset: %s\n", dataset);
+ ziprintf(" path: %s\n", path);
+
+ /*
+ * Convert (dataset, file) into (objset, object)
+ */
+ if (object_from_path(dataset, path, &statbuf, record) != 0)
+ goto err;
+
+ ziprintf("raw objset: %llu\n", record->zi_objset);
+ ziprintf("raw object: %llu\n", record->zi_object);
+
+ /*
+ * For the given object, calculate the real (type, level, range)
+ */
+ if (calculate_range(dataset, type, level, (char *)range, record) != 0)
+ goto err;
+
+ ziprintf(" objset: %llu\n", record->zi_objset);
+ ziprintf(" object: %llu\n", record->zi_object);
+ if (record->zi_start == 0 &&
+ record->zi_end == -1ULL)
+ ziprintf(" range: all\n");
+ else
+ ziprintf(" range: [%llu, %llu]\n", record->zi_start,
+ record->zi_end);
+
+ /*
+ * Copy the pool name
+ */
+ (void) strcpy(poolname, dataset);
+ if ((slash = strchr(poolname, '/')) != NULL)
+ *slash = '\0';
+
+ ret = 0;
+
+err:
+ kernel_fini();
+ return (ret);
+}
+
+int
+translate_raw(const char *str, zinject_record_t *record)
+{
+ /*
+ * A raw bookmark of the form objset:object:level:blkid, where each
+ * number is a hexadecimal value.
+ */
+ if (sscanf(str, "%llx:%llx:%x:%llx", (u_longlong_t *)&record->zi_objset,
+ (u_longlong_t *)&record->zi_object, &record->zi_level,
+ (u_longlong_t *)&record->zi_start) != 4) {
+ (void) fprintf(stderr, "bad raw spec '%s': must be of the form "
+ "'objset:object:level:blkid'\n", str);
+ return (-1);
+ }
+
+ record->zi_end = record->zi_start;
+
+ return (0);
+}
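
For illustration, the following standalone snippet parses a raw bookmark spec the same way translate_raw() does; the spec string is an invented example, not a value from this change.

    #include <stdio.h>

    int
    main(void)
    {
            const char *spec = "21:4:0:1a00"; /* hypothetical objset:object:level:blkid */
            unsigned long long objset, object, blkid;
            unsigned int level;

            /* All four fields are hexadecimal, matching translate_raw(). */
            if (sscanf(spec, "%llx:%llx:%x:%llx",
                &objset, &object, &level, &blkid) != 4) {
                    (void) fprintf(stderr, "bad raw spec '%s'\n", spec);
                    return (1);
            }

            (void) printf("objset=%llu object=%llu level=%u blkid=%llu\n",
                objset, object, level, blkid);
            return (0);
    }

Running it prints objset=33 object=4 level=0 blkid=6656, the decimal view of the hexadecimal spec.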
+
+int
+translate_device(const char *pool, const char *device, zinject_record_t *record)
+{
+ char *end;
+ zpool_handle_t *zhp;
+
+ /*
+ * Given a device name or GUID, create an appropriate injection record
+ * with zi_guid set.
+ */
+ if ((zhp = zpool_open(pool)) == NULL)
+ return (-1);
+
+ record->zi_guid = strtoull(device, &end, 16);
+ if (record->zi_guid == 0 || *end != '\0')
+ record->zi_guid = zpool_vdev_to_guid(zhp, device);
+
+ if (record->zi_guid == 0) {
+ (void) fprintf(stderr, "cannot find device '%s' in pool '%s'\n",
+ device, pool);
+ return (-1);
+ }
+
+ return (0);
+}
diff --git a/usr/src/cmd/zinject/zinject.c b/usr/src/cmd/zinject/zinject.c
new file mode 100644
index 0000000000..b584fb0de5
--- /dev/null
+++ b/usr/src/cmd/zinject/zinject.c
@@ -0,0 +1,739 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS Fault Injector
+ *
+ * This userland component takes a set of options and uses libzpool to translate
+ * from a user-visible object type and name to an internal representation.
+ * There are two basic types of faults: device faults and data faults.
+ *
+ *
+ * DEVICE FAULTS
+ *
+ * Errors can be injected into a particular vdev using the '-d' option. This
+ * option takes a path or vdev GUID to uniquely identify the device within a
+ * pool. There are two types of errors that can be injected, EIO and ENXIO,
+ * that can be controlled through the '-t' option. The default is ENXIO. For
+ * EIO failures, any attempt to read data from the device will return EIO, but
+ * subsequent attempt to reopen the device will succeed. For ENXIO failures,
+ * any attempt to read from the device will return EIO, but any attempt to
+ * reopen the device will also return ENXIO.
+ *
+ * This form of the command looks like:
+ *
+ * zinject -d device [-e errno] pool
+ *
+ *
+ * DATA FAULTS
+ *
+ * We begin with a tuple of the form:
+ *
+ * <type,level,range,object>
+ *
+ * type A string describing the type of data to target. Each type
+ * implicitly describes how to interpret 'object'. Currently,
+ * the following values are supported:
+ *
+ * data User data for a file
+ * dnode Dnode for a file or directory
+ *
+ * The following MOS objects are special. Instead of injecting
+ * errors on a particular object or blkid, we inject errors across
+ * all objects of the given type.
+ *
+ * mos Any data in the MOS
+ * mosdir object directory
+ * config pool configuration
+ * bplist blkptr list
+ * spacemap spacemap
+ * metaslab metaslab
+ * errlog persistent error log
+ *
+ * level Object level. Defaults to '0', not applicable to all types. If
+ * a range is given, this identifies the indirect block level
+ * covering the specified range.
+ *
+ * range A numerical range [start,end) within the object. Defaults to
+ * the full size of the file.
+ *
+ * object A string describing the logical location of the object. For
+ * files and directories (currently the only supported types),
+ * this is the path of the object on disk.
+ *
+ * This is translated, via libzpool, into the following internal representation:
+ *
+ * <type,objset,object,level,range>
+ *
+ * These types should be self-explanatory. This tuple is then passed to the
+ * kernel via a special ioctl() to initiate fault injection for the given
+ * object. Note that 'type' is not strictly necessary for fault injection, but
+ * is used when translating existing faults into a human-readable string.
+ *
+ *
+ * The command itself takes one of the forms:
+ *
+ * zinject
+ * zinject <-a | -u pool>
+ * zinject -c <id|all>
+ * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
+ * [-r range] <object>
+ * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:blkid pool
+ *
+ * With no arguments, the command prints all currently registered injection
+ * handlers, with their numeric identifiers.
+ *
+ * The '-c' option will clear the given handler, or all handlers if 'all' is
+ * specified.
+ *
+ * The '-e' option takes a string describing the errno to simulate. For data
+ * faults this must be either 'io' or 'checksum'; for device faults it must be
+ * 'io' or 'nxio'. In most cases 'io' and 'checksum' result in the same
+ * behavior, but RAID-Z will produce a different set of ereports for this
+ * situation.
+ *
+ * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is
+ * specified, then the ARC cache is flushed appropriately. If '-u' is
+ * specified, then the underlying SPA is unloaded. Either of these flags can be
+ * specified independently of any other handlers. The '-m' flag automatically
+ * does an unmount and remount of the underlying dataset to aid in flushing the
+ * cache.
+ *
+ * The '-f' flag controls the frequency of errors injected, expressed as an
+ * integer percentage between 1 and 100. The default is 100.
+ *
+ * This form is responsible for actually injecting the handler into the
+ * framework. It takes the arguments described above, translates them to the
+ * internal tuple using libzpool, and then issues an ioctl() to register the
+ * handler.
+ *
+ * The final form can target a specific bookmark, regardless of whether a
+ * human-readable interface has been designed. It allows developers to specify
+ * a particular block by number.
+ */
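
A minimal, self-contained sketch of the user-visible tuple described above and the kind of internal tuple it is translated into. The struct, field names, and numbers below are illustrative stand-ins only; the real record is a zinject_record_t (sys/zfs_ioctl.h) filled in by translate_record() in translate.c.

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative stand-in for the internal <type,objset,object,level,range> tuple. */
    typedef struct fault_tuple {
            const char *ft_type;    /* "data", "dnode", "mos", ... */
            uint64_t ft_objset;     /* objset ID of the containing dataset */
            uint64_t ft_object;     /* object number (the file's inode number) */
            int ft_level;           /* indirect block level */
            uint64_t ft_start;      /* first blkid in the range */
            uint64_t ft_end;        /* last blkid in the range */
    } fault_tuple_t;

    int
    main(void)
    {
            /*
             * Hypothetical result of translating something like
             *   zinject -t data -e checksum -r 0,1048576 /tank/fs/file
             * The real numbers come from libzpool via translate_record().
             */
            fault_tuple_t ft = { "data", 21, 8, 0, 0, 8 };

            (void) printf("inject %s error at <%llu, %llu, %d, [%llu, %llu]>\n",
                ft.ft_type, (unsigned long long)ft.ft_objset,
                (unsigned long long)ft.ft_object, ft.ft_level,
                (unsigned long long)ft.ft_start, (unsigned long long)ft.ft_end);
            return (0);
    }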
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/fs/zfs.h>
+#include <sys/mount.h>
+
+#include <libzfs.h>
+
+#undef verify /* both libzfs.h and zfs_context.h want to define this */
+
+#include "zinject.h"
+
+int zfs_fd;
+
+#define ECKSUM EBADE
+
+static const char *errtable[TYPE_INVAL] = {
+ "data",
+ "dnode",
+ "mos",
+ "mosdir",
+ "metaslab",
+ "config",
+ "bplist",
+ "spacemap",
+ "errlog"
+};
+
+static err_type_t
+name_to_type(const char *arg)
+{
+ int i;
+ for (i = 0; i < TYPE_INVAL; i++)
+ if (strcmp(errtable[i], arg) == 0)
+ return (i);
+
+ return (TYPE_INVAL);
+}
+
+static const char *
+type_to_name(uint64_t type)
+{
+ switch (type) {
+ case DMU_OT_OBJECT_DIRECTORY:
+ return ("mosdir");
+ case DMU_OT_OBJECT_ARRAY:
+ return ("metaslab");
+ case DMU_OT_PACKED_NVLIST:
+ return ("config");
+ case DMU_OT_BPLIST:
+ return ("bplist");
+ case DMU_OT_SPACE_MAP:
+ return ("spacemap");
+ case DMU_OT_ERROR_LOG:
+ return ("errlog");
+ default:
+ return ("-");
+ }
+}
+
+
+/*
+ * Print usage message.
+ */
+void
+usage(void)
+{
+ (void) printf(
+ "usage:\n"
+ "\n"
+ "\tzinject\n"
+ "\n"
+ "\t\tList all active injection records.\n"
+ "\n"
+ "\tzinject -c <id|all>\n"
+ "\n"
+ "\t\tClear the particular record (if given a numeric ID), or\n"
+ "\t\tall records if 'all' is specificed.\n"
+ "\n"
+ "\tzinject -d device [-e errno] pool\n"
+ "\t\tInject a fault into a particular device. 'errno' can either\n"
+ "\t\tbe 'nxio' (the default) or 'io'.\n"
+ "\n"
+ "\tzinject -b objset:object:level:blkid pool\n"
+ "\n"
+ "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
+ "\t\tspecified by the remaining tuple. Each number is in\n"
+ "\t\thexidecimal, and only one block can be specified.\n"
+ "\n"
+ "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n"
+ "\t [-a] [-m] [-u] [-f freq] <object>\n"
+ "\n"
+ "\t\tInject an error into the object specified by the '-t' option\n"
+ "\t\tand the object descriptor. The 'object' parameter is\n"
+ "\t\tinterperted depending on the '-t' option.\n"
+ "\n"
+ "\t\t-q\tQuiet mode. Only print out the handler number added.\n"
+ "\t\t-e\tInject a specific error. Must be either 'io' or\n"
+ "\t\t\t'checksum'. Default is 'io'.\n"
+ "\t\t-l\tInject error at a particular block level. Default is "
+ "0.\n"
+ "\t\t-m\tAutomatically remount underlying filesystem.\n"
+ "\t\t-r\tInject error over a particular logical range of an\n"
+ "\t\t\tobject. Will be translated to the appropriate blkid\n"
+ "\t\t\trange according to the object's properties.\n"
+ "\t\t-a\tFlush the ARC cache. Can be specified without any\n"
+ "\t\t\tassociated object.\n"
+ "\t\t-u\tUnload the associated pool. Can be specified with only\n"
+ "\t\t\ta pool object.\n"
+ "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n"
+ "\t\t\ta percentage between 1 and 100.\n"
+ "\n"
+ "\t-t data\t\tInject an error into the plain file contents of a\n"
+ "\t\t\tfile. The object must be specified as a complete path\n"
+ "\t\t\tto a file on a ZFS filesystem.\n"
+ "\n"
+ "\t-t dnode\tInject an error into the metadnode in the block\n"
+ "\t\t\tcorresponding to the dnode for a file or directory. The\n"
+ "\t\t\t'-r' option is incompatible with this mode. The object\n"
+ "\t\t\tis specified as a complete path to a file or directory\n"
+ "\t\t\ton a ZFS filesystem.\n"
+ "\n"
+ "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
+ "\t\t\ttype. Valid types are: mos, mosdir, config, bplist,\n"
+ "\t\t\tspacemap, metaslab, errlog\n");
+}
+
+static int
+iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
+ void *data)
+{
+ zfs_cmd_t zc;
+ int ret;
+
+ zc.zc_guid = 0;
+
+ while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
+ if ((ret = func((int)zc.zc_guid, zc.zc_name,
+ &zc.zc_inject_record, data)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+static int
+print_data_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ int *count = data;
+
+ if (record->zi_guid != 0)
+ return (0);
+
+ if (*count == 0) {
+ (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n",
+ "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE");
+ (void) printf("--- --------------- ------ "
+ "------ -------- --- ---------------\n");
+ }
+
+ *count += 1;
+
+ (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool,
+ (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object,
+ type_to_name(record->zi_type), record->zi_level);
+
+ if (record->zi_start == 0 &&
+ record->zi_end == -1ULL)
+ (void) printf("all\n");
+ else
+ (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
+ (u_longlong_t)record->zi_end);
+
+ return (0);
+}
+
+static int
+print_device_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ int *count = data;
+
+ if (record->zi_guid == 0)
+ return (0);
+
+ if (*count == 0) {
+ (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID");
+ (void) printf("--- --------------- ----------------\n");
+ }
+
+ *count += 1;
+
+ (void) printf("%3d %-15s %llx\n", id, pool,
+ (u_longlong_t)record->zi_guid);
+
+ return (0);
+}
+
+/*
+ * Print all registered error handlers. Returns the number of handlers
+ * registered.
+ */
+static int
+print_all_handlers(void)
+{
+ int count = 0;
+
+ (void) iter_handlers(print_device_handler, &count);
+ (void) printf("\n");
+ count = 0;
+ (void) iter_handlers(print_data_handler, &count);
+
+ return (count);
+}
+
+/* ARGSUSED */
+static int
+cancel_one_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ zfs_cmd_t zc;
+
+ zc.zc_guid = (uint64_t)id;
+
+ if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
+ (void) fprintf(stderr, "failed to remove handler %d: %s\n",
+ id, strerror(errno));
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Remove all fault injection handlers.
+ */
+static int
+cancel_all_handlers(void)
+{
+ int ret = iter_handlers(cancel_one_handler, NULL);
+
+ (void) printf("removed all registered handlers\n");
+
+ return (ret);
+}
+
+/*
+ * Remove a specific fault injection handler.
+ */
+static int
+cancel_handler(int id)
+{
+ zfs_cmd_t zc;
+
+ zc.zc_guid = (uint64_t)id;
+
+ if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
+ (void) fprintf(stderr, "failed to remove handler %d: %s\n",
+ id, strerror(errno));
+ return (1);
+ }
+
+ (void) printf("removed handler %d\n", id);
+
+ return (0);
+}
+
+/*
+ * Register a new fault injection handler.
+ */
+static int
+register_handler(const char *pool, int flags, zinject_record_t *record,
+ int quiet)
+{
+ zfs_cmd_t zc;
+
+ (void) strcpy(zc.zc_name, pool);
+ zc.zc_inject_record = *record;
+ zc.zc_guid = flags;
+
+ if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
+ (void) fprintf(stderr, "failed to add handler: %s\n",
+ strerror(errno));
+ return (1);
+ }
+
+ if (flags & ZINJECT_NULL)
+ return (0);
+
+ if (quiet) {
+ (void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
+ } else {
+ (void) printf("Added handler %llu with the following "
+ "properties:\n", (u_longlong_t)zc.zc_guid);
+ (void) printf(" pool: %s\n", pool);
+ if (record->zi_guid) {
+ (void) printf(" vdev: %llx\n",
+ (u_longlong_t)record->zi_guid);
+ } else {
+ (void) printf("objset: %llu\n",
+ (u_longlong_t)record->zi_objset);
+ (void) printf("object: %llu\n",
+ (u_longlong_t)record->zi_object);
+ (void) printf(" type: %llu\n",
+ (u_longlong_t)record->zi_type);
+ (void) printf(" level: %d\n", record->zi_level);
+ if (record->zi_start == 0 &&
+ record->zi_end == -1ULL)
+ (void) printf(" range: all\n");
+ else
+ (void) printf(" range: [%llu, %llu)\n",
+ (u_longlong_t)record->zi_start,
+ (u_longlong_t)record->zi_end);
+ }
+ }
+
+ return (0);
+}
+
+int
+main(int argc, char **argv)
+{
+ int c;
+ char *range = NULL;
+ char *cancel = NULL;
+ char *end;
+ char *raw = NULL;
+ char *device = NULL;
+ int level = 0;
+ int quiet = 0;
+ int error = 0;
+ int domount = 0;
+ err_type_t type = TYPE_INVAL;
+ zinject_record_t record = { 0 };
+ char pool[MAXNAMELEN];
+ char dataset[MAXNAMELEN];
+ zfs_handle_t *zhp;
+ int ret;
+ int flags = 0;
+
+ if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
+ (void) fprintf(stderr, "failed to open ZFS device\n");
+ return (1);
+ }
+
+ if (argc == 1) {
+ /*
+ * No arguments. Print the available handlers. If there are no
+ * available handlers, direct the user to '-h' for help
+ * information.
+ */
+ if (print_all_handlers() == 0) {
+ (void) printf("No handlers registered.\n");
+ (void) printf("Run 'zinject -h' for usage "
+ "information.\n");
+ }
+
+ return (0);
+ }
+
+ while ((c = getopt(argc, argv, ":ab:d:f:qhc:t:l:mr:e:u")) != -1) {
+ switch (c) {
+ case 'a':
+ flags |= ZINJECT_FLUSH_ARC;
+ break;
+ case 'b':
+ raw = optarg;
+ break;
+ case 'c':
+ cancel = optarg;
+ break;
+ case 'd':
+ device = optarg;
+ break;
+ case 'e':
+ if (strcasecmp(optarg, "io") == 0) {
+ error = EIO;
+ } else if (strcasecmp(optarg, "checksum") == 0) {
+ error = ECKSUM;
+ } else if (strcasecmp(optarg, "nxio") == 0) {
+ error = ENXIO;
+ } else {
+ (void) fprintf(stderr, "invalid error type "
+ "'%s': must be 'io', 'checksum' or "
+ "'nxio'\n", optarg);
+ usage();
+ return (1);
+ }
+ break;
+ case 'f':
+ record.zi_freq = atoi(optarg);
+ if (record.zi_freq < 1 || record.zi_freq > 100) {
+ (void) fprintf(stderr, "frequency range must "
+ "be in the range (0, 100]\n");
+ return (1);
+ }
+ break;
+ case 'h':
+ usage();
+ return (0);
+ case 'l':
+ level = (int)strtol(optarg, &end, 10);
+ if (*end != '\0') {
+ (void) fprintf(stderr, "invalid level '%s': "
+ "must be an integer\n", optarg);
+ usage();
+ return (1);
+ }
+ break;
+ case 'm':
+ domount = 1;
+ break;
+ case 'q':
+ quiet = 1;
+ break;
+ case 'r':
+ range = optarg;
+ break;
+ case 't':
+ if ((type = name_to_type(optarg)) == TYPE_INVAL) {
+ (void) fprintf(stderr, "invalid type '%s'\n",
+ optarg);
+ usage();
+ return (1);
+ }
+ break;
+ case 'u':
+ flags |= ZINJECT_UNLOAD_SPA;
+ break;
+ case ':':
+ (void) fprintf(stderr, "option -%c requires an "
+ "operand\n", optopt);
+ usage();
+ return (1);
+ case '?':
+ (void) fprintf(stderr, "invalid option '%c'\n",
+ optopt);
+ usage();
+ return (2);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (cancel != NULL) {
+ /*
+ * '-c' is invalid with any other options.
+ */
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0) {
+ (void) fprintf(stderr, "cancel (-c) incompatible with "
+ "any other options\n");
+ usage();
+ return (2);
+ }
+ if (argc != 0) {
+ (void) fprintf(stderr, "extraneous argument to '-c'\n");
+ usage();
+ return (2);
+ }
+
+ if (strcmp(cancel, "all") == 0) {
+ return (cancel_all_handlers());
+ } else {
+ int id = (int)strtol(cancel, &end, 10);
+ if (*end != '\0') {
+ (void) fprintf(stderr, "invalid handle id '%s':"
+ " must be an integer or 'all'\n", cancel);
+ usage();
+ return (1);
+ }
+ return (cancel_handler(id));
+ }
+ }
+
+ if (device != NULL) {
+ /*
+ * Device (-d) injection uses a completely different mechanism
+ * for doing injection, so handle it separately here.
+ */
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0) {
+ (void) fprintf(stderr, "device (-d) incompatible with "
+ "data error injection\n");
+ usage();
+ return (2);
+ }
+
+ if (argc != 1) {
+ (void) fprintf(stderr, "device (-d) injection requires "
+ "a single pool name\n");
+ usage();
+ return (2);
+ }
+
+ (void) strcpy(pool, argv[0]);
+ dataset[0] = '\0';
+
+ if (error == ECKSUM) {
+ (void) fprintf(stderr, "device error type must be "
+ "'io' or 'nxio'\n");
+ return (1);
+ }
+
+ if (translate_device(pool, device, &record) != 0)
+ return (1);
+ if (!error)
+ error = ENXIO;
+ } else if (raw != NULL) {
+ if (range != NULL || type != TYPE_INVAL || level != 0) {
+ (void) fprintf(stderr, "raw (-b) format with "
+ "any other options\n");
+ usage();
+ return (2);
+ }
+
+ if (argc != 1) {
+ (void) fprintf(stderr, "raw (-b) format expects a "
+ "single pool name\n");
+ usage();
+ return (2);
+ }
+
+ (void) strcpy(pool, argv[0]);
+ dataset[0] = '\0';
+
+ if (error == ENXIO) {
+ (void) fprintf(stderr, "data error type must be "
+ "'checksum' or 'io'\n");
+ return (1);
+ }
+
+ if (translate_raw(raw, &record) != 0)
+ return (1);
+ if (!error)
+ error = EIO;
+ } else if (type == TYPE_INVAL) {
+ if (flags == 0) {
+ (void) fprintf(stderr, "at least one of '-b', '-d', "
+ "'-t', '-a', or '-u' must be specified\n");
+ usage();
+ return (2);
+ }
+
+ if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
+ (void) strcpy(pool, argv[0]);
+ dataset[0] = '\0';
+ } else if (argc != 0) {
+ (void) fprintf(stderr, "extraneous argument for "
+ "'-f'\n");
+ usage();
+ return (2);
+ }
+
+ flags |= ZINJECT_NULL;
+ } else {
+ if (argc != 1) {
+ (void) fprintf(stderr, "missing object\n");
+ usage();
+ return (2);
+ }
+
+ if (error == ENXIO) {
+ (void) fprintf(stderr, "data error type must be "
+ "'checksum' or 'io'\n");
+ return (1);
+ }
+
+ if (translate_record(type, argv[0], range, level, &record, pool,
+ dataset) != 0)
+ return (1);
+ if (!error)
+ error = EIO;
+ }
+
+ /*
+ * If this is pool-wide metadata, unmount everything. The ioctl() will
+ * unload the pool, so that we trigger spa-wide reopen of metadata next
+ * time we access the pool.
+ */
+ if (dataset[0] != '\0' && domount) {
+ if ((zhp = zfs_open(dataset, ZFS_TYPE_ANY)) == NULL)
+ return (1);
+
+ if (zfs_unmount(zhp, NULL, 0) != 0)
+ return (1);
+ }
+
+ record.zi_error = error;
+
+ ret = register_handler(pool, flags, &record, quiet);
+
+ if (dataset[0] != '\0' && domount)
+ ret = (zfs_mount(zhp, NULL, 0) != 0);
+
+ return (ret);
+}
diff --git a/usr/src/cmd/zinject/zinject.h b/usr/src/cmd/zinject/zinject.h
new file mode 100644
index 0000000000..bdbc2454c4
--- /dev/null
+++ b/usr/src/cmd/zinject/zinject.h
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZINJECT_H
+#define _ZINJECT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_ioctl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ TYPE_DATA, /* plain file contents */
+ TYPE_DNODE, /* metadnode contents */
+ TYPE_MOS, /* all MOS data */
+ TYPE_MOSDIR, /* MOS object directory */
+ TYPE_METASLAB, /* metaslab objects */
+ TYPE_CONFIG, /* MOS config */
+ TYPE_BPLIST, /* block pointer list */
+ TYPE_SPACEMAP, /* space map objects */
+ TYPE_ERRLOG, /* persistent error log */
+ TYPE_INVAL
+} err_type_t;
+
+#define MOS_TYPE(t) \
+ ((t) >= TYPE_MOS && (t) < TYPE_INVAL)
+
+int translate_record(err_type_t type, const char *object, const char *range,
+ int level, zinject_record_t *record, char *poolname, char *dataset);
+int translate_raw(const char *raw, zinject_record_t *record);
+int translate_device(const char *pool, const char *device,
+ zinject_record_t *record);
+void usage(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZINJECT_H */
diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c
index 98dd9e67bc..deb04138be 100644
--- a/usr/src/cmd/zoneadmd/vplat.c
+++ b/usr/src/cmd/zoneadmd/vplat.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -2552,13 +2551,6 @@ validate_datasets(zlog_t *zlogp)
zfs_set_error_handler(zfs_error_handler);
- /*
- * libzfs opens /dev/zfs during its .init routine.
- * zoneadmd automatically closes these files when it daemonizes,
- * so we cheat by re-calling the init routine.
- */
- zfs_init();
-
while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
if ((zhp = zfs_open(dstab.zone_dataset_name,
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index 95857402cb..1b86fe538c 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -58,6 +58,7 @@ static int zpool_do_status(int, char **);
static int zpool_do_online(int, char **);
static int zpool_do_offline(int, char **);
+static int zpool_do_clear(int, char **);
static int zpool_do_attach(int, char **);
static int zpool_do_detach(int, char **);
@@ -87,6 +88,7 @@ _umem_logging_init(void)
typedef enum {
HELP_ADD,
HELP_ATTACH,
+ HELP_CLEAR,
HELP_CREATE,
HELP_DESTROY,
HELP_DETACH,
@@ -110,9 +112,8 @@ typedef struct zpool_command {
/*
* Master command table. Each ZFS command has a name, associated function, and
- * usage message. Unfortunately, the usage messages need to be
- * iternationalized, so we have to have a function to return the usage message
- * based on a command index.
+ * usage message. The usage messages need to be internationalized, so we have
+ * to have a function to return the usage message based on a command index.
*
* These commands are organized according to how they are displayed in the usage
* message. An empty command (one with a NULL name) indicates an empty line in
@@ -130,6 +131,7 @@ static zpool_command_t command_table[] = {
{ NULL },
{ "online", zpool_do_online, HELP_ONLINE },
{ "offline", zpool_do_offline, HELP_OFFLINE },
+ { "clear", zpool_do_clear, HELP_CLEAR },
{ NULL },
{ "attach", zpool_do_attach, HELP_ATTACH },
{ "detach", zpool_do_detach, HELP_DETACH },
@@ -153,6 +155,8 @@ get_usage(zpool_help_t idx) {
case HELP_ATTACH:
return (gettext("\tattach [-f] <pool> <device> "
"<new_device>\n"));
+ case HELP_CLEAR:
+ return (gettext("\tclear <pool> [device]\n"));
case HELP_CREATE:
return (gettext("\tcreate [-fn] [-R root] [-m mountpoint] "
"<pool> <vdev> ...\n"));
@@ -277,12 +281,15 @@ usage(int requested)
}
const char *
-state_to_name(int state)
+state_to_name(vdev_stat_t *vs)
{
- switch (state) {
+ switch (vs->vs_state) {
case VDEV_STATE_CLOSED:
case VDEV_STATE_CANT_OPEN:
- return (gettext("FAULTED"));
+ if (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
+ return (gettext("FAULTED"));
+ else
+ return (gettext("UNAVAIL"));
case VDEV_STATE_OFFLINE:
return (gettext("OFFLINE"));
case VDEV_STATE_DEGRADED:
@@ -771,7 +778,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
(void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
if (vs->vs_aux != 0) {
- (void) printf(" %-8s ", state_to_name(vs->vs_state));
+ (void) printf(" %-8s ", state_to_name(vs));
switch (vs->vs_aux) {
case VDEV_AUX_OPEN_FAILED:
@@ -791,7 +798,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
break;
}
} else {
- (void) printf(" %s", state_to_name(vs->vs_state));
+ (void) printf(" %s", state_to_name(vs));
}
(void) printf("\n");
@@ -867,6 +874,11 @@ show_import(nvlist_t *config)
"are offlined.\n"));
break;
+ case ZPOOL_STATUS_CORRUPT_POOL:
+ (void) printf(gettext("status: The pool metadata is "
+ "corrupted.\n"));
+ break;
+
default:
/*
* No other status can be seen when importing pools.
@@ -1671,7 +1683,7 @@ list_callback(zpool_handle_t *zhp, void *data)
verify(nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_STATS, (uint64_t **)&vs,
&vsc) == 0);
- (void) strlcpy(buf, state_to_name(vs->vs_state),
+ (void) strlcpy(buf, state_to_name(vs),
sizeof (buf));
}
break;
@@ -2081,6 +2093,42 @@ zpool_do_offline(int argc, char **argv)
return (ret);
}
+/*
+ * zpool clear <pool> [device]
+ *
+ * Clear all errors associated with a pool or a particular device.
+ */
+int
+zpool_do_clear(int argc, char **argv)
+{
+ int ret = 0;
+ zpool_handle_t *zhp;
+ char *pool, *device;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing pool name\n"));
+ usage(FALSE);
+ }
+
+ if (argc > 3) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(FALSE);
+ }
+
+ pool = argv[1];
+ device = argc == 3 ? argv[2] : NULL;
+
+ if ((zhp = zpool_open(pool)) == NULL)
+ return (1);
+
+ if (zpool_clear(zhp, device) != 0)
+ ret = 1;
+
+ zpool_close(zhp);
+
+ return (ret);
+}
+
typedef struct scrub_cbdata {
int cb_type;
} scrub_cbdata_t;
@@ -2090,6 +2138,15 @@ scrub_callback(zpool_handle_t *zhp, void *data)
{
scrub_cbdata_t *cb = data;
+ /*
+ * Ignore faulted pools.
+ */
+ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+ (void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
+ "currently unavailable\n"), zpool_get_name(zhp));
+ return (1);
+ }
+
return (zpool_scrub(zhp, cb->cb_type) != 0);
}
@@ -2201,8 +2258,9 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
nvlist_t **child;
uint_t c, children;
vdev_stat_t *vs;
- char rbuf[6], wbuf[6], cbuf[6], repaired[6];
+ char rbuf[6], wbuf[6], cbuf[6], repaired[7];
char *vname;
+ uint64_t notpresent;
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
(uint64_t **)&vs, &c) == 0);
@@ -2212,14 +2270,19 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
children = 0;
(void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth,
- name, state_to_name(vs->vs_state));
+ name, state_to_name(vs));
zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
- if (vs->vs_aux != 0) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &notpresent) == 0) {
+ char *path;
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+ (void) printf(" was %s\n", path);
+ } else if (vs->vs_aux != 0) {
(void) printf(" ");
switch (vs->vs_aux) {
@@ -2259,6 +2322,60 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
}
}
+static void
+print_error_log(zpool_handle_t *zhp)
+{
+ nvlist_t **log;
+ size_t nelem;
+ size_t maxdsname = sizeof ("DATASET") - 1;
+ size_t maxobjname = sizeof ("OBJECT") - 1;
+ int i;
+ nvlist_t *nv;
+ size_t len;
+ char *dsname, *objname, *range;
+
+ if (zpool_get_errlog(zhp, &log, &nelem) != 0) {
+ (void) printf("errors: List of errors unavailable "
+ "(insufficient privileges)\n");
+ return;
+ }
+
+ for (i = 0; i < nelem; i++) {
+ nv = log[i];
+
+ verify(nvlist_lookup_string(nv, ZPOOL_ERR_DATASET,
+ &dsname) == 0);
+ len = strlen(dsname);
+ if (len > maxdsname)
+ maxdsname = len;
+
+ verify(nvlist_lookup_string(nv, ZPOOL_ERR_OBJECT,
+ &objname) == 0);
+ len = strlen(objname);
+ if (len > maxobjname)
+ maxobjname = len;
+ }
+
+ (void) printf("errors: The following persistent errors have been "
+ "detected:\n\n");
+ (void) printf("%8s %-*s %-*s %s\n", "", maxdsname, "DATASET",
+ maxobjname, "OBJECT", "RANGE");
+
+ for (i = 0; i < nelem; i++) {
+ nv = log[i];
+
+ verify(nvlist_lookup_string(nv, ZPOOL_ERR_DATASET,
+ &dsname) == 0);
+ verify(nvlist_lookup_string(nv, ZPOOL_ERR_OBJECT,
+ &objname) == 0);
+ verify(nvlist_lookup_string(nv, ZPOOL_ERR_RANGE,
+ &range) == 0);
+
+ (void) printf("%8s %-*s %-*s %s\n", "", maxdsname,
+ dsname, maxobjname, objname, range);
+ }
+}
+
/*
* Display a summary of pool status. Displays a summary such as:
*
@@ -2269,7 +2386,7 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
* config:
* mirror DEGRADED
* c1t0d0 OK
- * c2t0d0 FAULTED
+ * c2t0d0 UNAVAIL
*
* When given the '-v' option, we print out the complete config. If the '-e'
* option is specified, then we print out error rate information as well.
@@ -2348,7 +2465,7 @@ status_callback(zpool_handle_t *zhp, void *data)
"unaffected.\n"));
(void) printf(gettext("action: Determine if the device needs "
"to be replaced, and clear the errors\n\tusing "
- "'zpool online' or replace the device with 'zpool "
+ "'zpool clear' or replace the device with 'zpool "
"replace'.\n"));
break;
@@ -2370,6 +2487,22 @@ status_callback(zpool_handle_t *zhp, void *data)
"complete.\n"));
break;
+ case ZPOOL_STATUS_CORRUPT_DATA:
+ (void) printf(gettext("status: One or more devices has "
+ "experienced an error resulting in data\n\tcorruption. "
+ "Applications may be affected.\n"));
+ (void) printf(gettext("action: Restore the file in question "
+ "if possible. Otherwise restore the\n\tentire pool from "
+ "backup.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_POOL:
+ (void) printf(gettext("status: The pool metadata is corrupted "
+ "and the pool cannot be opened.\n"));
+ (void) printf(gettext("action: Destroy and re-create the pool "
+ "from a backup source.\n"));
+ break;
+
default:
/*
* The remaining errors can't actually be generated, yet.
@@ -2383,6 +2516,8 @@ status_callback(zpool_handle_t *zhp, void *data)
if (config != NULL) {
int namewidth;
+ uint64_t nerr;
+ size_t realerr;
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
@@ -2399,6 +2534,28 @@ status_callback(zpool_handle_t *zhp, void *data)
"NAME", "STATE", "READ", "WRITE", "CKSUM");
print_status_config(zhp, zpool_get_name(zhp), nvroot,
namewidth, 0);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
+ &nerr) == 0) {
+ /*
+ * If the approximate error count is small, get a
+ * precise count by fetching the entire log and
+ * uniquifying the results.
+ */
+ if (nerr < 100 && !cbp->cb_verbose &&
+ zpool_get_errlog(zhp, NULL, &realerr) == 0)
+ nerr = realerr;
+
+ (void) printf("\n");
+ if (nerr == 0)
+ (void) printf(gettext("errors: No known data "
+ "errors\n"));
+ else if (!cbp->cb_verbose)
+ (void) printf(gettext("errors: %d data errors, "
+ "use '-v' for a list\n"), nerr);
+ else
+ print_error_log(zhp);
+ }
} else {
(void) printf(gettext("config: The configuration cannot be "
"determined.\n"));
@@ -2507,8 +2664,8 @@ main(int argc, char **argv)
* 'freeze' is a vile debugging abomination, so we treat it as such.
*/
if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
- char buf[8192];
- int fd = open("/dev/zpoolctl", O_RDWR);
+ char buf[16384];
+ int fd = open(ZFS_DEV, O_RDWR);
(void) strcpy((void *)buf, argv[2]);
return (!!ioctl(fd, ZFS_IOC_POOL_FREEZE, buf));
}
diff --git a/usr/src/cmd/zpool/zpool_vdev.c b/usr/src/cmd/zpool/zpool_vdev.c
index 2dd85062be..6fba820d10 100644
--- a/usr/src/cmd/zpool/zpool_vdev.c
+++ b/usr/src/cmd/zpool/zpool_vdev.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -112,6 +111,13 @@ vdev_error(const char *fmt, ...)
static void
libdiskmgt_error(int error)
{
+ /*
+ * ENXIO is a valid error if the device doesn't live in
+ * /dev/dsk. Don't bother printing an error message in this case.
+ */
+ if (error == ENXIO)
+ return;
+
(void) fprintf(stderr, gettext("warning: device in use checking "
"failed: %s\n"), strerror(error));
}
diff --git a/usr/src/cmd/ztest/Makefile b/usr/src/cmd/ztest/Makefile
index 52e17eb413..1a34525b2d 100644
--- a/usr/src/cmd/ztest/Makefile
+++ b/usr/src/cmd/ztest/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -43,13 +42,9 @@ lint := TARGET = lint
all clean clobber lint: $(SUBDIRS)
-#
-# This should really be $(LN), but protocmp detects link inconsistencies
-# between isaexec (which we ship) and ztest (which we do not ship).
-#
install: $(SUBDIRS)
-$(RM) $(ROOTPROG)
- -$(CP) $(ISAEXEC) $(ROOTPROG)
+ -$(LN) $(ISAEXEC) $(ROOTPROG)
$(SUBDIRS): FRC
@cd $@; pwd; $(MAKE) $(TARGET)
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 07cda80045..13d8b81f36 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -132,6 +132,7 @@ typedef struct ztest_args {
uint64_t za_random;
uint64_t za_diroff;
uint64_t za_diroff_shared;
+ uint64_t za_zil_seq;
hrtime_t za_start;
hrtime_t za_stop;
hrtime_t za_kill;
@@ -183,7 +184,7 @@ ztest_info_t ztest_info[] = {
{ ztest_traverse, &zopt_often },
{ ztest_dsl_prop_get_set, &zopt_sometimes },
{ ztest_dmu_objset_create_destroy, &zopt_sometimes },
- { ztest_dmu_snapshot_create_destroy, &zopt_sometimes },
+ { ztest_dmu_snapshot_create_destroy, &zopt_rarely },
{ ztest_spa_create_destroy, &zopt_sometimes },
{ ztest_fault_inject, &zopt_sometimes },
{ ztest_spa_rename, &zopt_rarely },
@@ -777,12 +778,12 @@ ztest_vdev_add_remove(ztest_args_t *za)
(void) mutex_lock(&ztest_shared->zs_vdev_lock);
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
ztest_shared->zs_vdev_primaries =
spa->spa_root_vdev->vdev_children * leaves;
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
nvroot = make_vdev_root(zopt_vdev_size, zopt_raidz, zopt_mirrors, 1);
error = spa_vdev_add(spa, nvroot);
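
spa_config_enter() and spa_config_exit() now carry a caller tag (FTAG throughout this patch), presumably so holders of the pool configuration lock can be tracked; whatever tag is passed on enter must be passed again on exit. The shape of every converted call site, in sketch form:

        spa_config_enter(spa, RW_READER, FTAG); /* take the config lock as reader, tagged */
        /* ... examine pool/vdev state while the configuration is stable ... */
        spa_config_exit(spa, FTAG);             /* drop it with the matching tag */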
@@ -799,6 +800,35 @@ ztest_vdev_add_remove(ztest_args_t *za)
(void) printf("spa_vdev_add = %d, as expected\n", error);
}
+static vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+ int c;
+ vdev_t *mvd;
+
+ if (vd->vdev_path != NULL) {
+ if (vd->vdev_wholedisk == 1) {
+ /*
+ * For whole disks, the internal path has 's0', but the
+ * path passed in by the user doesn't.
+ */
+ if (strlen(path) == strlen(vd->vdev_path) - 2 &&
+ strncmp(path, vd->vdev_path, strlen(path)) == 0)
+ return (vd);
+ } else if (strcmp(path, vd->vdev_path) == 0) {
+ return (vd);
+ }
+ }
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+ NULL)
+ return (mvd);
+
+ return (NULL);
+}
+
+
/*
* Verify that we can attach and detach devices.
*/
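
The whole-disk branch of vdev_lookup_by_path() above works because ZFS records the slice-qualified device name internally while callers pass the bare disk name, so the match tolerates a two-character 's0' suffix on the stored path. With illustrative device names:

        path passed in:        /dev/dsk/c0t0d0
        vd->vdev_path stored:  /dev/dsk/c0t0d0s0

        strlen(path) == strlen(vd->vdev_path) - 2, and the first strlen(path)
        characters agree, so this vdev is returned.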
@@ -807,19 +837,19 @@ ztest_vdev_attach_detach(ztest_args_t *za)
{
spa_t *spa = dmu_objset_spa(za->za_os);
vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd0, *vd1, *pvd;
+ vdev_t *oldvd, *newvd, *pvd;
nvlist_t *root, *file;
uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
uint64_t leaf, top;
- size_t size0, size1;
- char path0[MAXPATHLEN], path1[MAXPATHLEN];
+ size_t oldsize, newsize;
+ char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
int replacing;
int error, expected_error;
int fd;
(void) mutex_lock(&ztest_shared->zs_vdev_lock);
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
/*
* Decide whether to do an attach or a replace.
@@ -840,84 +870,83 @@ ztest_vdev_attach_detach(ztest_args_t *za)
* Generate the path to this leaf. The filename will end with 'a'.
* We'll alternate replacements with a filename that ends with 'b'.
*/
- (void) snprintf(path0, sizeof (path0),
+ (void) snprintf(oldpath, sizeof (oldpath),
ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
- bcopy(path0, path1, MAXPATHLEN);
+ bcopy(oldpath, newpath, MAXPATHLEN);
/*
* If the 'a' file isn't part of the pool, the 'b' file must be.
*/
- if (vdev_lookup_by_path(rvd, path0) == NULL)
- path0[strlen(path0) - 1] = 'b';
+ if (vdev_lookup_by_path(rvd, oldpath) == NULL)
+ oldpath[strlen(oldpath) - 1] = 'b';
else
- path1[strlen(path1) - 1] = 'b';
+ newpath[strlen(newpath) - 1] = 'b';
/*
- * Now path0 represents something that's already in the pool,
- * and path1 is the thing we'll try to attach.
+ * Now oldpath represents something that's already in the pool,
+ * and newpath is the thing we'll try to attach.
*/
- vd0 = vdev_lookup_by_path(rvd, path0);
- vd1 = vdev_lookup_by_path(rvd, path1);
- ASSERT(vd0 != NULL);
- pvd = vd0->vdev_parent;
-
+ oldvd = vdev_lookup_by_path(rvd, oldpath);
+ newvd = vdev_lookup_by_path(rvd, newpath);
+ ASSERT(oldvd != NULL);
+ pvd = oldvd->vdev_parent;
/*
- * Make size1 a little bigger or smaller than size0.
+ * Make newsize a little bigger or smaller than oldsize.
* If it's smaller, the attach should fail.
* If it's larger, and we're doing a replace,
* we should get dynamic LUN growth when we're done.
*/
- size0 = vd0->vdev_psize;
- size1 = 10 * size0 / (9 + ztest_random(3));
+ oldsize = vdev_get_rsize(oldvd);
+ newsize = 10 * oldsize / (9 + ztest_random(3));
/*
* If pvd is not a mirror or root, the attach should fail with ENOTSUP,
* unless it's a replace; in that case any non-replacing parent is OK.
*
- * If vd1 is already part of the pool, it should fail with EBUSY.
+ * If newvd is already part of the pool, it should fail with EBUSY.
*
- * If vd1 is too small, it should fail with EOVERFLOW.
+ * If newvd is too small, it should fail with EOVERFLOW.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops &&
(!replacing || pvd->vdev_ops == &vdev_replacing_ops))
expected_error = ENOTSUP;
- else if (vd1 != NULL)
+ else if (newvd != NULL)
expected_error = EBUSY;
- else if (size1 < size0)
+ else if (newsize < oldsize)
expected_error = EOVERFLOW;
else
expected_error = 0;
/*
- * If vd1 isn't already part of the pool, create it.
+ * If newvd isn't already part of the pool, create it.
*/
- if (vd1 == NULL) {
- fd = open(path1, O_RDWR | O_CREAT | O_TRUNC, 0666);
+ if (newvd == NULL) {
+ fd = open(newpath, O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd == -1)
- fatal(1, "can't open %s", path1);
- if (ftruncate(fd, size1) != 0)
- fatal(1, "can't ftruncate %s", path1);
+ fatal(1, "can't open %s", newpath);
+ if (ftruncate(fd, newsize) != 0)
+ fatal(1, "can't ftruncate %s", newpath);
(void) close(fd);
}
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
/*
- * Build the nvlist describing path1.
+ * Build the nvlist describing newpath.
*/
VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
- VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path1) == 0);
+ VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, newpath) == 0);
VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
&file, 1) == 0);
- error = spa_vdev_attach(spa, path0, root, replacing);
+ error = spa_vdev_attach(spa, oldvd->vdev_guid, root, replacing);
nvlist_free(file);
nvlist_free(root);
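
The size perturbation above is worth spelling out: ztest_random(3) yields 0, 1 or 2, so newsize = 10 * oldsize / (9 + r) lands a little above, exactly at, or a little below oldsize, which is what drives the expected_error choices earlier in the function:

        r = 0:  newsize = 10 * oldsize / 9     about 11% larger; attach/replace should succeed
                                               (and a replace should trigger dynamic LUN growth)
        r = 1:  newsize = 10 * oldsize / 10    equal; attach should succeed
        r = 2:  newsize = 10 * oldsize / 11    about 9% smaller; expect EOVERFLOW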
@@ -939,7 +968,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
if (error != expected_error) {
fatal(0, "attach (%s, %s, %d) returned %d, expected %d",
- path0, path1, replacing, error, expected_error);
+ oldpath, newpath, replacing, error, expected_error);
}
(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
@@ -964,9 +993,9 @@ ztest_vdev_LUN_growth(ztest_args_t *za)
/*
* Pick a random leaf vdev.
*/
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
@@ -1219,6 +1248,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
dmu_objset_close(os2);
}
+ txg_wait_synced(dmu_objset_pool(os), 0);
zil_close(zilog);
dmu_objset_close(os);
@@ -1268,6 +1298,26 @@ ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
traverse_handle_t *th = za->za_th;
uint64_t size = BP_GET_LSIZE(bp);
+ /*
+ * Level -1 indicates the objset_phys_t or something in its intent log.
+ */
+ if (zb->zb_level == -1) {
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ ASSERT3U(zb->zb_object, ==, 0);
+ ASSERT3U(zb->zb_blkid, ==, 0);
+ ASSERT3U(size, ==, sizeof (objset_phys_t));
+ za->za_zil_seq = 0;
+ } else if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
+ ASSERT3U(zb->zb_object, ==, 0);
+ ASSERT3U(zb->zb_blkid, >, za->za_zil_seq);
+ za->za_zil_seq = zb->zb_blkid;
+ } else {
+ ASSERT3U(zb->zb_object, !=, 0); /* lr_write_t */
+ }
+
+ return (0);
+ }
+
ASSERT(dnp != NULL);
if (bc->bc_errno)
@@ -1309,11 +1359,6 @@ ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
return (0);
}
- if (zb->zb_level == -1) {
- ASSERT3U(size, ==, sizeof (objset_phys_t));
- return (0);
- }
-
ASSERT(zb->zb_level == 0);
ASSERT3U(size, ==, dnp->dn_datablkszsec << DEV_BSHIFT);
@@ -1346,6 +1391,9 @@ ztest_traverse(ztest_args_t *za)
if (ztest_random(2) == 0)
advance |= ADVANCE_HOLES;
+ if (ztest_random(2) == 0)
+ advance |= ADVANCE_ZIL;
+
th = za->za_th = traverse_init(spa, ztest_blk_cb, za, advance,
ZIO_FLAG_CANFAIL);
@@ -1361,7 +1409,7 @@ ztest_traverse(ztest_args_t *za)
if (zopt_verbose >= 5)
(void) printf("traverse %s%s%s%s %llu blocks to "
- "<%llu, %llu, %d, %llx>%s\n",
+ "<%llu, %llu, %lld, %llx>%s\n",
(advance & ADVANCE_PRE) ? "pre" : "post",
(advance & ADVANCE_PRUNE) ? "|prune" : "",
(advance & ADVANCE_DATA) ? "|data" : "",
@@ -1369,7 +1417,7 @@ ztest_traverse(ztest_args_t *za)
(u_longlong_t)(th->th_callbacks - cbstart),
(u_longlong_t)th->th_lastcb.zb_objset,
(u_longlong_t)th->th_lastcb.zb_object,
- th->th_lastcb.zb_level,
+ (u_longlong_t)th->th_lastcb.zb_level,
(u_longlong_t)th->th_lastcb.zb_blkid,
rc == 0 ? " [done]" :
rc == EINTR ? " [aborted]" :
@@ -1406,7 +1454,8 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
/*
* Create a batch object if necessary, and record it in the directory.
*/
- dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), &batchobj);
+ VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
+ sizeof (uint64_t), &batchobj));
if (batchobj == 0) {
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
@@ -1430,23 +1479,21 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
* Destroy the previous batch of objects.
*/
for (b = 0; b < batchsize; b++) {
- dmu_read(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object);
+ VERIFY(0 == dmu_read(os, batchobj, b * sizeof (uint64_t),
+ sizeof (uint64_t), &object));
if (object == 0)
continue;
/*
* Read and validate contents.
* We expect the nth byte of the bonus buffer to be n.
*/
- db = dmu_bonus_hold(os, object);
+ VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
dmu_object_info_from_db(db, &doi);
ASSERT(doi.doi_type == DMU_OT_UINT64_OTHER);
ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
ASSERT3S(doi.doi_physical_blks, >=, 0);
- dmu_buf_read(db);
-
bonuslen = db->db_size;
for (c = 0; c < bonuslen; c++) {
@@ -1460,12 +1507,13 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
}
}
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
/*
* We expect the word at endoff to be our object number.
*/
- dmu_read(os, object, endoff, sizeof (uint64_t), &temp);
+ VERIFY(0 == dmu_read(os, object, endoff,
+ sizeof (uint64_t), &temp));
if (temp != object) {
fatal(0, "bad data in %s, got %llu, expected %llu",
@@ -1564,7 +1612,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
/*
* Write to both the bonus buffer and the regular data.
*/
- db = dmu_bonus_hold(os, object);
+ VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
ASSERT3U(bonuslen, ==, db->db_size);
dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
@@ -1579,7 +1627,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
for (c = 0; c < db->db_size; c++)
((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
/*
* Write to a large offset to increase indirection.
@@ -1647,7 +1695,8 @@ ztest_dmu_read_write(ztest_args_t *za)
/*
* Read the directory info. If it's the first time, set things up.
*/
- dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd);
+ VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
+ sizeof (dd), &dd));
if (dd.dd_chunk == 0) {
ASSERT(dd.dd_packobj == 0);
ASSERT(dd.dd_bigobj == 0);
@@ -1709,8 +1758,10 @@ ztest_dmu_read_write(ztest_args_t *za)
/*
* Read the current contents of our objects.
*/
- dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf);
- dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf);
+ error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf);
+ ASSERT3U(error, ==, 0);
+ error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf);
+ ASSERT3U(error, ==, 0);
/*
* Get a tx for the mods to both packobj and bigobj.
@@ -1792,7 +1843,8 @@ ztest_dmu_read_write(ztest_args_t *za)
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
- dmu_free_range(os, dd.dd_bigobj, bigoff, bigsize, tx);
+ VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
+ bigsize, tx));
} else {
if (zopt_verbose >= 6) {
(void) printf("writing offset %llx size %llx"
@@ -1813,8 +1865,10 @@ ztest_dmu_read_write(ztest_args_t *za)
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
- dmu_read(os, dd.dd_packobj, packoff, packsize, packcheck);
- dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigcheck);
+ VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+ packsize, packcheck));
+ VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+ bigsize, bigcheck));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
@@ -1890,7 +1944,8 @@ ztest_dmu_write_parallel(ztest_args_t *za)
if (do_free) {
(void) mutex_lock(lp);
- dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx);
+ VERIFY(0 == dmu_free_range(os, ZTEST_DIROBJ, off,
+ bs, tx));
(void) mutex_unlock(lp);
dmu_tx_commit(tx);
continue;
@@ -1904,9 +1959,9 @@ ztest_dmu_write_parallel(ztest_args_t *za)
if (off == -1ULL) {
wbt.bt_seq = 0;
- db = dmu_bonus_hold(os, ZTEST_DIROBJ);
+ VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ,
+ FTAG, &db));
ASSERT3U(db->db_size, ==, sizeof (wbt));
- dmu_buf_read(db);
bcopy(db->db_data, &rbt, db->db_size);
if (rbt.bt_objset != 0) {
ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
@@ -1916,7 +1971,7 @@ ztest_dmu_write_parallel(ztest_args_t *za)
}
dmu_buf_will_dirty(db, tx);
bcopy(&wbt, db->db_data, db->db_size);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
continue;
}
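
dmu_bonus_hold() now takes an owner tag and returns the buffer through an out parameter, and dmu_buf_rele() releases it with the same tag; the separate dmu_buf_read() step disappears, apparently because the hold now hands back a buffer that is already readable. The recurring pattern in this file, in sketch form:

        dmu_buf_t *db;

        /* hold the bonus buffer for 'object', naming this caller (FTAG) as the owner */
        VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));

        /* ... read or dirty db->db_data, up to db->db_size bytes ... */

        dmu_buf_rele(db, FTAG);         /* release with the matching tag */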
@@ -1940,6 +1995,7 @@ ztest_dmu_write_parallel(ztest_args_t *za)
if (ztest_random(2) == 0) {
blkptr_t blk = { 0 };
uint64_t blkoff;
+ zbookmark_t zb;
txg_suspend(dmu_objset_pool(os));
(void) mutex_lock(lp);
@@ -1969,9 +2025,13 @@ ztest_dmu_write_parallel(ztest_args_t *za)
* We do this while still txg_suspend()ed to ensure
* that the block can't be reused before we read it.
*/
+ zb.zb_objset = dmu_objset_id(os);
+ zb.zb_object = ZTEST_DIROBJ;
+ zb.zb_level = 0;
+ zb.zb_blkid = off / bs;
error = zio_wait(zio_read(NULL, dmu_objset_spa(os),
&blk, iobuf, bs, NULL, NULL,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED));
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
ASSERT(error == 0);
txg_resume(dmu_objset_pool(os));
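
zio_read() now takes a zbookmark_t as its final argument, presumably so a failed I/O can be attributed to the logical block it belongs to: the <objset, object, level, blkid> tuple the traversal code prints elsewhere in this file. The fields filled in above, annotated:

        zbookmark_t zb;

        zb.zb_objset = dmu_objset_id(os);  /* which objset the block lives in */
        zb.zb_object = ZTEST_DIROBJ;       /* which object within that objset */
        zb.zb_level = 0;                   /* level 0 = data block, not indirect */
        zb.zb_blkid = off / bs;            /* block index within the object */

        error = zio_wait(zio_read(NULL, dmu_objset_spa(os), &blk, iobuf, bs,
            NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));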
@@ -2025,13 +2085,14 @@ ztest_zap(ztest_args_t *za)
/*
* Create a new object if necessary, and record it in the directory.
*/
- dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), &object);
+ VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
+ sizeof (uint64_t), &object));
if (object == 0) {
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
sizeof (uint64_t));
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 2);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
ztest_record_enospc("create zap test obj");
@@ -2123,7 +2184,7 @@ ztest_zap(ztest_args_t *za)
* should be txg + object + n.
*/
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, 2);
+ dmu_tx_hold_zap(tx, object, TRUE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
ztest_record_enospc("create zap entry");
@@ -2168,7 +2229,7 @@ ztest_zap(ztest_args_t *za)
ASSERT3U(error, ==, 0);
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, 2);
+ dmu_tx_hold_zap(tx, object, TRUE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
ztest_record_enospc("remove zap entry");
@@ -2265,7 +2326,7 @@ ztest_zap_parallel(ztest_args_t *za)
if (i >= 2) {
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, 1);
+ dmu_tx_hold_zap(tx, object, TRUE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
ztest_record_enospc("zap parallel");
@@ -2334,6 +2395,7 @@ ztest_dsl_prop_get_set(ztest_args_t *za)
const char *prop, *valname;
char setpoint[MAXPATHLEN];
char osname[MAXNAMELEN];
+ int error;
(void) rw_rdlock(&ztest_shared->zs_name_lock);
@@ -2350,8 +2412,15 @@ ztest_dsl_prop_get_set(ztest_args_t *za)
inherit = (value == ZIO_COMPRESS_INHERIT);
}
- VERIFY3U(dsl_prop_set(osname, prop, sizeof (value),
- !inherit, &value), ==, 0);
+ error = dsl_prop_set(osname, prop, sizeof (value),
+ !inherit, &value);
+
+ if (error == ENOSPC) {
+ ztest_record_enospc("dsl_prop_set");
+ break;
+ }
+
+ ASSERT3U(error, ==, 0);
VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
1, &value, setpoint), ==, 0);
@@ -2370,6 +2439,21 @@ ztest_dsl_prop_get_set(ztest_args_t *za)
(void) rw_unlock(&ztest_shared->zs_name_lock);
}
+static void
+ztest_error_setup(vdev_t *vd, int mode, int mask, uint64_t arg)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ ztest_error_setup(vd->vdev_child[c], mode, mask, arg);
+
+ if (vd->vdev_path != NULL) {
+ vd->vdev_fault_mode = mode;
+ vd->vdev_fault_mask = mask;
+ vd->vdev_fault_arg = arg;
+ }
+}
+
/*
* Inject random faults into the on-disk data.
*/
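
The new ztest_error_setup() above arms the per-vdev fault fields directly on an in-core vdev subtree, replacing the older helper (removed further down) that went through vdev_error_setup() by path. The fault-injection code below uses it to make every leaf under vd0 see transient read and write errors:

        ztest_error_setup(vd0, VDEV_FAULT_COUNT,
            (1U << ZIO_TYPE_READ) | (1U << ZIO_TYPE_WRITE), 100);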
@@ -2382,20 +2466,28 @@ ztest_fault_inject(ztest_args_t *za)
uint64_t bad = 0x1990c0ffeedecade;
uint64_t top, leaf;
char path0[MAXPATHLEN];
- char path1[MAXPATHLEN];
char pathrand[MAXPATHLEN];
size_t fsize;
spa_t *spa = dmu_objset_spa(za->za_os);
int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int iters = 1000;
- int ftype;
+ vdev_t *vd0;
+ uint64_t guid0 = 0;
+
+ /*
+ * We can't inject faults when we have no fault tolerance.
+ */
+ if (zopt_maxfaults == 0)
+ return;
+
+ ASSERT(leaves >= 2);
/*
* Pick a random top-level vdev.
*/
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
top = ztest_random(spa->spa_root_vdev->vdev_children);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
/*
* Pick a random leaf.
@@ -2403,73 +2495,45 @@ ztest_fault_inject(ztest_args_t *za)
leaf = ztest_random(leaves);
/*
- * Generate paths to the first to leaves in this top-level vdev,
+ * Generate paths to the first two leaves in this top-level vdev,
* and to the random leaf we selected. We'll induce transient
- * faults on leaves 0 and 1, we'll online/offline leaf 1,
+ * I/O errors and random online/offline activity on leaf 0,
* and we'll write random garbage to the randomly chosen leaf.
*/
(void) snprintf(path0, sizeof (path0),
ztest_dev_template, zopt_dir, zopt_pool, top * leaves + 0);
- (void) snprintf(path1, sizeof (path1),
- ztest_dev_template, zopt_dir, zopt_pool, top * leaves + 1);
(void) snprintf(pathrand, sizeof (pathrand),
ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
- if (leaves < 2) /* there is no second leaf */
- path1[0] = '\0';
+ dprintf("damaging %s and %s\n", path0, pathrand);
- dprintf("damaging %s, %s, and %s\n", path0, path1, pathrand);
+ spa_config_enter(spa, RW_READER, FTAG);
/*
- * If we have exactly one-fault tolerance, just randomly offline
- * and online one device.
+ * If we can tolerate two or more faults, make vd0 fail randomly.
*/
- if (zopt_maxfaults == 1 && path1[0] != '\0') {
- if (ztest_random(10) < 6)
- (void) vdev_offline(spa, path1, B_TRUE);
- else
- (void) vdev_online(spa, path1);
- return;
+ vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+ if (vd0 != NULL && zopt_maxfaults >= 2) {
+ guid0 = vd0->vdev_guid;
+ ztest_error_setup(vd0, VDEV_FAULT_COUNT,
+ (1U << ZIO_TYPE_READ) | (1U << ZIO_TYPE_WRITE), 100);
}
- /*
- * Always inject a little random device failure, regardless of
- * the replication level. The I/Os should be retried successfully.
- * If we only have single-fault tolerance, don't inject write
- * faults, because then we'll be doing partial writes and won't
- * be able to recover when we inject data corruption.
- */
- if (zopt_maxfaults <= 1)
- ftype = (1U << ZIO_TYPE_READ);
- else
- ftype = (1U << ZIO_TYPE_READ) | (1U << ZIO_TYPE_WRITE);
-
- (void) vdev_error_setup(spa, path0, VDEV_FAULT_COUNT, ftype, 10);
+ spa_config_exit(spa, FTAG);
/*
- * If we can tolerate three or more faults, make one of the
- * devices fail quite a lot.
+ * If we can tolerate two or more faults, randomly online/offline vd0.
*/
- if (zopt_maxfaults >= 3 && path1[0] != '\0')
- (void) vdev_error_setup(spa, path1, VDEV_FAULT_COUNT,
- ftype, 100);
-
- /*
- * If we can tolerate four or more faults, offline one of the devices.
- */
- if (zopt_maxfaults >= 4 && path1[0] != '\0') {
+ if (zopt_maxfaults >= 2 && guid0 != 0) {
if (ztest_random(10) < 6)
- (void) vdev_offline(spa, path1, B_TRUE);
+ (void) vdev_offline(spa, guid0, B_TRUE);
else
- (void) vdev_online(spa, path1);
+ (void) vdev_online(spa, guid0);
}
/*
- * If we have at least single-fault tolerance, inject data corruption.
+ * We have at least single-fault tolerance, so inject data corruption.
*/
- if (zopt_maxfaults < 1)
- return;
-
fd = open(pathrand, O_RDWR);
if (fd == -1) /* we hit a gap in the device namespace */
@@ -2497,19 +2561,6 @@ ztest_fault_inject(ztest_args_t *za)
(void) close(fd);
}
-static void
-ztest_error_setup(vdev_t *vd, int mode, int mask, uint64_t arg)
-{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
- ztest_error_setup(vd->vdev_child[c], mode, mask, arg);
-
- if (vd->vdev_path != NULL)
- (void) vdev_error_setup(vd->vdev_spa, vd->vdev_path,
- mode, mask, arg);
-}
-
/*
* Scrub the pool.
*/
@@ -2634,6 +2685,8 @@ ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
char dev_name[MAXPATHLEN];
nvlist_t *file, *root;
int error;
+ uint64_t guid;
+ vdev_t *vd;
(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
@@ -2649,7 +2702,13 @@ ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
&file, 1) == 0);
- error = spa_vdev_attach(spa, dev_name, root, B_TRUE);
+ spa_config_enter(spa, RW_READER, FTAG);
+ if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
+ guid = 0;
+ else
+ guid = vd->vdev_guid;
+ spa_config_exit(spa, FTAG);
+ error = spa_vdev_attach(spa, guid, root, B_TRUE);
if (error != 0 && error != EBUSY && error != ENOTSUP && error != ENODEV)
fatal(0, "spa_vdev_attach(in-place) = %d", error);
@@ -2943,7 +3002,8 @@ ztest_run(char *pool)
for (d = -5; d <= 5; d++) {
error = dmu_object_info(spa->spa_meta_objset,
(1ULL << t) + d, NULL);
- ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error == 0 || error == ENOENT ||
+ error == EINVAL);
}
}
@@ -3016,6 +3076,7 @@ ztest_run(char *pool)
if (za[t].za_th)
traverse_fini(za[t].za_th);
if (t < zopt_dirs) {
+ txg_wait_synced(spa_get_dsl(spa), 0);
zil_close(za[t].za_zilog);
dmu_objset_close(za[t].za_os);
}
@@ -3046,11 +3107,7 @@ ztest_run(char *pool)
(void) rw_unlock(&ztest_shared->zs_name_lock);
}
- /*
- * Prepare every leaf device to inject a few random read faults.
- */
- ztest_error_setup(spa->spa_root_vdev, VDEV_FAULT_COUNT,
- (1U << ZIO_TYPE_READ), 10);
+ txg_wait_synced(spa_get_dsl(spa), 0);
/*
* Right before closing the pool, kick off a bunch of async I/O;
@@ -3141,11 +3198,6 @@ main(int argc, char **argv)
/* Override location of zpool.cache */
spa_config_dir = "/tmp";
- /*
- * Blow away any existing copy of zpool.cache
- */
- (void) remove("/tmp/zpool.cache");
-
ztest_random_fd = open("/dev/urandom", O_RDONLY);
process_options(argc, argv);
@@ -3155,6 +3207,12 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
+ /*
+ * Blow away any existing copy of zpool.cache
+ */
+ if (zopt_init != 0)
+ (void) remove("/tmp/zpool.cache");
+
zs = ztest_shared = (void *)mmap(0,
P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
diff --git a/usr/src/common/fs/hsfs.c b/usr/src/common/fs/hsfs.c
index 5d3a5db5db..b625c0aba9 100644
--- a/usr/src/common/fs/hsfs.c
+++ b/usr/src/common/fs/hsfs.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -136,6 +135,7 @@ static int bhsfs_close(int fd);
static void bhsfs_closeall(void);
static ssize_t bhsfs_read(int fdesc, char *buf, size_t count);
static off_t bhsfs_lseek(int fdesc, off_t addr, int whence);
+static int bhsfs_fstat(int fdesc, struct bootstat *stp);
static fileid_t *
find_fp(int fd)
@@ -563,6 +563,48 @@ bhsfs_lseek(int fd, off_t addr, int whence)
return (0);
}
+static int
+bhsfs_fstat(int fd, struct bootstat *stp)
+{
+ fileid_t *filep;
+ struct inode *ip;
+
+ if (!(filep = find_fp(fd)))
+ return (-1);
+
+ ip = filep->fi_inode;
+
+ stp->st_mode = 0;
+ stp->st_size = 0;
+
+ if (ip == NULL)
+ return (0);
+
+ switch (ip->i_smode & IFMT) {
+ case IFDIR:
+ stp->st_mode = S_IFDIR;
+ break;
+ case IFREG:
+ stp->st_mode = S_IFREG;
+ break;
+ default:
+ break;
+ }
+ stp->st_size = ip->i_size;
+
+ /* file times */
+ stp->st_atim.tv_sec = ip->i_atime.tv_sec;
+ stp->st_atim.tv_nsec = ip->i_atime.tv_usec * 1000;
+ stp->st_mtim.tv_sec = ip->i_mtime.tv_sec;
+ stp->st_mtim.tv_nsec = ip->i_mtime.tv_usec * 1000;
+ stp->st_ctim.tv_sec = ip->i_ctime.tv_sec;
+ stp->st_ctim.tv_nsec = ip->i_ctime.tv_usec * 1000;
+
+ return (0);
+
+}
+
+
/*
* Parse a directory entry.
*
@@ -757,5 +799,6 @@ struct boot_fs_ops bhsfs_ops = {
bhsfs_close,
bhsfs_read,
bhsfs_lseek,
+ bhsfs_fstat,
NULL
};
diff --git a/usr/src/common/fs/ufsops.c b/usr/src/common/fs/ufsops.c
index f440e85b1b..054d8a76e0 100644
--- a/usr/src/common/fs/ufsops.c
+++ b/usr/src/common/fs/ufsops.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -628,6 +627,46 @@ bufs_lseek(int fd, off_t addr, int whence)
return (0);
}
+
+int
+bufs_fstat(int fd, struct bootstat *stp)
+{
+ fileid_t *filep;
+ struct inode *ip;
+
+ if (!(filep = find_fp(fd)))
+ return (-1);
+
+ ip = filep->fi_inode;
+
+ stp->st_mode = 0;
+ stp->st_size = 0;
+
+ if (ip == NULL)
+ return (0);
+
+ switch (ip->i_smode & IFMT) {
+ case IFLNK:
+ stp->st_mode = S_IFLNK;
+ break;
+ case IFREG:
+ stp->st_mode = S_IFREG;
+ break;
+ default:
+ break;
+ }
+ stp->st_size = ip->i_size;
+ stp->st_atim.tv_sec = ip->i_atime.tv_sec;
+ stp->st_atim.tv_nsec = ip->i_atime.tv_usec * 1000;
+ stp->st_mtim.tv_sec = ip->i_mtime.tv_sec;
+ stp->st_mtim.tv_nsec = ip->i_mtime.tv_usec * 1000;
+ stp->st_ctim.tv_sec = ip->i_ctime.tv_sec;
+ stp->st_ctim.tv_nsec = ip->i_ctime.tv_usec * 1000;
+
+ return (0);
+}
+
+
static int
bufs_close(int fd)
{
@@ -733,5 +772,6 @@ struct boot_fs_ops bufs_ops = {
bufs_close,
bufs_read,
bufs_lseek,
+ bufs_fstat,
NULL
};
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index c7459797ea..820eb059bf 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -88,6 +88,8 @@ extern int zpool_vdev_offline(zpool_handle_t *, const char *, int);
extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *,
nvlist_t *, int);
extern int zpool_vdev_detach(zpool_handle_t *, const char *);
+extern int zpool_clear(zpool_handle_t *, const char *);
+extern uint64_t zpool_vdev_to_guid(zpool_handle_t *, const char *);
/*
* Pool health statistics.
@@ -122,14 +124,19 @@ typedef enum {
ZPOOL_STATUS_OK
} zpool_status_t;
-extern zpool_status_t zpool_get_status(zpool_handle_t *, char **msgid);
-extern zpool_status_t zpool_import_status(nvlist_t *, char **msgid);
+extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
+extern zpool_status_t zpool_import_status(nvlist_t *, char **);
/*
* Statistics and configuration functions.
*/
-extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **oldconfig);
+extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
extern int zpool_refresh_stats(zpool_handle_t *);
+extern int zpool_get_errlog(zpool_handle_t *, nvlist_t ***, size_t *);
+
+#define ZPOOL_ERR_DATASET "dataset"
+#define ZPOOL_ERR_OBJECT "object"
+#define ZPOOL_ERR_RANGE "range"
/*
* Import and export functions
@@ -140,7 +147,7 @@ extern int zpool_import(nvlist_t *, const char *, const char *);
/*
* Search for pools to import
*/
-extern nvlist_t *zpool_find_import(int argc, char **argv);
+extern nvlist_t *zpool_find_import(int, char **);
/*
* Miscellaneous pool functions
diff --git a/usr/src/lib/libzfs/common/libzfs_config.c b/usr/src/lib/libzfs/common/libzfs_config.c
index 73f090dc98..71801d5cba 100644
--- a/usr/src/lib/libzfs/common/libzfs_config.c
+++ b/usr/src/lib/libzfs/common/libzfs_config.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -119,7 +118,7 @@ namespace_reload()
zfs_malloc(zc.zc_config_dst_size);
for (;;) {
zc.zc_cookie = namespace_generation;
- if (ioctl(zfs_fd, ZFS_IOC_POOL_CONFIGS, &zc) != 0) {
+ if (zfs_ioctl(ZFS_IOC_POOL_CONFIGS, &zc) != 0) {
switch (errno) {
case EEXIST:
/*
@@ -213,32 +212,23 @@ zpool_refresh_stats(zpool_handle_t *zhp)
zc.zc_config_dst = (uint64_t)(uintptr_t)
zfs_malloc(zc.zc_config_dst_size);
- while ((error = ioctl(zfs_fd, ZFS_IOC_POOL_STATS, &zc)) != 0) {
- error = errno;
-
- if (error == ENXIO) {
+ for (;;) {
+ if (zfs_ioctl(ZFS_IOC_POOL_STATS, &zc) == 0) {
/*
- * We can't open one or more top-level vdevs,
- * but we have the config.
+ * The real error is returned in the zc_cookie field.
*/
+ error = zc.zc_cookie;
break;
}
- free((void *)(uintptr_t)zc.zc_config_dst);
-
- if (error == ENOENT || error == EINVAL) {
- /*
- * There's no such pool (ENOENT)
- * or the config is bogus (EINVAL).
- */
- return (error);
+ if (errno == ENOMEM) {
+ free((void *)(uintptr_t)zc.zc_config_dst);
+ zc.zc_config_dst = (uint64_t)(uintptr_t)
+ zfs_malloc(zc.zc_config_dst_size);
+ } else {
+ free((void *)(uintptr_t)zc.zc_config_dst);
+ return (errno);
}
-
- if (error != ENOMEM)
- zfs_baderror(error);
-
- zc.zc_config_dst =
- (uint64_t)(uintptr_t)zfs_malloc(zc.zc_config_dst_size);
}
verify(nvlist_unpack((void *)(uintptr_t)zc.zc_config_dst,
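
The rewritten loop above separates transport failures from pool errors: ZFS_IOC_POOL_STATS now succeeds whenever a configuration could be returned and reports the underlying pool error in zc_cookie, while ENOMEM means the destination buffer was too small and the call should be retried after reallocating it (zc_config_dst_size presumably having been updated to the required size); any other errno is a genuine failure. The same grow-and-retry shape, condensed into a sketch:

        for (;;) {
                if (zfs_ioctl(ZFS_IOC_POOL_STATS, &zc) == 0) {
                        error = zc.zc_cookie;   /* real pool error, if any */
                        break;
                }
                if (errno != ENOMEM) {
                        free((void *)(uintptr_t)zc.zc_config_dst);
                        return (errno);         /* genuine ioctl failure */
                }
                /* buffer too small: retry with the size the kernel asked for */
                free((void *)(uintptr_t)zc.zc_config_dst);
                zc.zc_config_dst = (uint64_t)(uintptr_t)
                    zfs_malloc(zc.zc_config_dst_size);
        }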
diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c
index f7c674006d..1e6e1d79ab 100644
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -193,7 +192,7 @@ get_stats(zfs_handle_t *zhp)
zc.zc_config_src = (uint64_t)(uintptr_t)zfs_malloc(1024);
zc.zc_config_src_size = 1024;
- while (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
+ while (zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc) != 0) {
if (errno == ENOMEM) {
zc.zc_config_src = (uint64_t)(uintptr_t)
zfs_malloc(zc.zc_config_src_size);
@@ -206,6 +205,8 @@ get_stats(zfs_handle_t *zhp)
bcopy(&zc.zc_objset_stats, &zhp->zfs_dmustats,
sizeof (zc.zc_objset_stats));
+ (void) strcpy(zhp->zfs_root, zc.zc_root);
+
verify(nvlist_unpack((void *)(uintptr_t)zc.zc_config_src,
zc.zc_config_src_size, &zhp->zfs_props, 0) == 0);
@@ -303,6 +304,16 @@ zfs_open(const char *path, int types)
path_to_str(path, types));
break;
+ case ENXIO:
+ case EIO:
+ /*
+ * I/O error from the underlying pool.
+ */
+ zfs_error(dgettext(TEXT_DOMAIN,
+ "cannot open '%s': I/O error"), path,
+ path_to_str(path, types));
+ break;
+
default:
zfs_baderror(errno);
@@ -800,11 +811,11 @@ zfs_prop_set(zfs_handle_t *zhp, zfs_prop_t prop, const char *propval)
switch (prop) {
case ZFS_PROP_QUOTA:
zc.zc_cookie = number;
- ret = ioctl(zfs_fd, ZFS_IOC_SET_QUOTA, &zc);
+ ret = zfs_ioctl(ZFS_IOC_SET_QUOTA, &zc);
break;
case ZFS_PROP_RESERVATION:
zc.zc_cookie = number;
- ret = ioctl(zfs_fd, ZFS_IOC_SET_RESERVATION, &zc);
+ ret = zfs_ioctl(ZFS_IOC_SET_RESERVATION, &zc);
break;
case ZFS_PROP_MOUNTPOINT:
case ZFS_PROP_SHARENFS:
@@ -817,15 +828,15 @@ zfs_prop_set(zfs_handle_t *zhp, zfs_prop_t prop, const char *propval)
sizeof (zc.zc_prop_value));
zc.zc_intsz = 1;
zc.zc_numints = strlen(propval) + 1;
- ret = ioctl(zfs_fd, ZFS_IOC_SET_PROP, &zc);
+ ret = zfs_ioctl(ZFS_IOC_SET_PROP, &zc);
break;
case ZFS_PROP_VOLSIZE:
zc.zc_volsize = number;
- ret = ioctl(zfs_fd, ZFS_IOC_SET_VOLSIZE, &zc);
+ ret = zfs_ioctl(ZFS_IOC_SET_VOLSIZE, &zc);
break;
case ZFS_PROP_VOLBLOCKSIZE:
zc.zc_volblocksize = number;
- ret = ioctl(zfs_fd, ZFS_IOC_SET_VOLBLOCKSIZE, &zc);
+ ret = zfs_ioctl(ZFS_IOC_SET_VOLBLOCKSIZE, &zc);
break;
default:
(void) strlcpy(zc.zc_prop_name, propname,
@@ -834,7 +845,7 @@ zfs_prop_set(zfs_handle_t *zhp, zfs_prop_t prop, const char *propval)
*(uint64_t *)zc.zc_prop_value = number;
zc.zc_intsz = 8;
zc.zc_numints = 1;
- ret = ioctl(zfs_fd, ZFS_IOC_SET_PROP, &zc);
+ ret = zfs_ioctl(ZFS_IOC_SET_PROP, &zc);
break;
}
@@ -1001,7 +1012,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, zfs_prop_t prop)
zc.zc_numints = 0;
- if ((ret = ioctl(zfs_fd, ZFS_IOC_SET_PROP, &zc)) != 0) {
+ if ((ret = zfs_ioctl(ZFS_IOC_SET_PROP, &zc)) != 0) {
switch (errno) {
case EPERM:
zfs_error(dgettext(TEXT_DOMAIN,
@@ -1178,6 +1189,9 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zfs_source_t *src,
}
return (val);
+ case ZFS_PROP_CREATION:
+ return (zhp->zfs_dmustats.dds_creation_time);
+
case ZFS_PROP_QUOTA:
if (zhp->zfs_dmustats.dds_quota == 0)
*source = ""; /* default */
@@ -1250,9 +1264,9 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zfs_source_t *src,
search.mnt_special = (char *)zhp->zfs_name;
search.mnt_fstype = MNTTYPE_ZFS;
- rewind(mnttab_file);
+ rewind(zfs_mnttab());
- if (getmntany(mnttab_file, &entry, &search) == 0)
+ if (getmntany(zfs_mnttab(), &entry, &search) == 0)
zhp->zfs_mntopts =
zfs_strdup(entry.mnt_mntopts);
}
@@ -1431,7 +1445,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
* If the pool has an alternate root, we want to prepend that
* root to any values we return.
*/
- root = zhp->zfs_dmustats.dds_altroot;
+ root = zhp->zfs_root;
str = getprop_string(zhp, prop, &source);
if (str[0] == '\0') {
@@ -1465,7 +1479,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
break;
case ZFS_PROP_ORIGIN:
- (void) strlcpy(propbuf, getprop_string(zhp, prop, &source),
+ (void) strlcpy(propbuf, zhp->zfs_dmustats.dds_clone_of,
proplen);
/*
* If there is no parent at all, return failure to indicate that
@@ -1620,7 +1634,7 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
int ret;
for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- ioctl(zfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0;
+ zfs_ioctl(ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) {
/*
* Ignore private dataset names.
@@ -1661,7 +1675,7 @@ zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data)
int ret;
for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
- ioctl(zfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0;
+ zfs_ioctl(ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0;
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) {
if ((nzhp = make_dataset_handle(zc.zc_name)) == NULL)
@@ -1740,7 +1754,7 @@ check_parents(const char *path, zfs_type_t type)
slash = parent + strlen(parent);
(void) strncpy(zc.zc_name, parent, slash - parent);
zc.zc_name[slash - parent] = '\0';
- if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
+ if (zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
errno == ENOENT) {
zfs_error(dgettext(TEXT_DOMAIN,
"cannot create '%s': no such pool '%s'"), path, zc.zc_name);
@@ -1835,7 +1849,7 @@ zfs_create(const char *path, zfs_type_t type,
* first try to see if the dataset exists.
*/
(void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name));
- if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0) {
+ if (zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc) == 0) {
zfs_error(dgettext(TEXT_DOMAIN,
"cannot create '%s': dataset exists"), path);
return (-1);
@@ -1886,7 +1900,7 @@ zfs_create(const char *path, zfs_type_t type,
}
/* create the dataset */
- ret = ioctl(zfs_fd, ZFS_IOC_CREATE, &zc);
+ ret = zfs_ioctl(ZFS_IOC_CREATE, &zc);
if (ret == 0 && type == ZFS_TYPE_VOLUME)
ret = zvol_create_link(path);
@@ -2003,7 +2017,7 @@ zfs_destroy(zfs_handle_t *zhp)
zc.zc_objset_type = DMU_OST_ZFS;
}
- ret = ioctl(zfs_fd, ZFS_IOC_DESTROY, &zc);
+ ret = zfs_ioctl(ZFS_IOC_DESTROY, &zc);
if (ret != 0) {
switch (errno) {
@@ -2093,7 +2107,7 @@ zfs_clone(zfs_handle_t *zhp, const char *target)
(void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_filename, zhp->zfs_name, sizeof (zc.zc_filename));
- ret = ioctl(zfs_fd, ZFS_IOC_CREATE, &zc);
+ ret = zfs_ioctl(ZFS_IOC_CREATE, &zc);
if (ret != 0) {
switch (errno) {
@@ -2207,12 +2221,12 @@ zfs_snapshot(const char *path)
else
zc.zc_objset_type = DMU_OST_ZFS;
- ret = ioctl(zfs_fd, ZFS_IOC_CREATE, &zc);
+ ret = zfs_ioctl(ZFS_IOC_CREATE, &zc);
if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) {
ret = zvol_create_link(path);
if (ret != 0)
- (void) ioctl(zfs_fd, ZFS_IOC_DESTROY, &zc);
+ (void) zfs_ioctl(ZFS_IOC_DESTROY, &zc);
}
if (ret != 0) {
@@ -2283,7 +2297,7 @@ zfs_backup(zfs_handle_t *zhp_to, zfs_handle_t *zhp_from)
}
zc.zc_cookie = STDOUT_FILENO;
- ret = ioctl(zfs_fd, ZFS_IOC_SENDBACKUP, &zc);
+ ret = zfs_ioctl(ZFS_IOC_SENDBACKUP, &zc);
if (ret != 0) {
switch (errno) {
case EPERM:
@@ -2402,7 +2416,7 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
/*
* Determine name of destination snapshot.
*/
- (void) strcpy(drrb->drr_toname, tosnap);
+ (void) strcpy(zc.zc_filename, tosnap);
if (isprefix) {
if (strchr(tosnap, '@') != NULL) {
zfs_error(dgettext(TEXT_DOMAIN,
@@ -2417,8 +2431,8 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
else
cp++;
- (void) strcat(drrb->drr_toname, "/");
- (void) strcat(drrb->drr_toname, cp);
+ (void) strcat(zc.zc_filename, "/");
+ (void) strcat(zc.zc_filename, cp);
} else if (strchr(tosnap, '@') == NULL) {
/*
* they specified just a filesystem; tack on the
@@ -2427,11 +2441,10 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
cp = strchr(drr.drr_u.drr_begin.drr_toname, '@');
if (cp == NULL || strlen(tosnap) + strlen(cp) >= MAXNAMELEN) {
zfs_error(dgettext(TEXT_DOMAIN,
- "cannot restore: invalid backup stream "
- "(invalid snapshot name)"));
+ "cannot restore: invalid snapshot name"));
return (-1);
}
- (void) strcat(drrb->drr_toname, cp);
+ (void) strcat(zc.zc_filename, cp);
}
if (drrb->drr_fromguid) {
@@ -2439,7 +2452,7 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
/* incremental backup stream */
/* do the ioctl to the containing fs */
- (void) strcpy(zc.zc_name, drrb->drr_toname);
+ (void) strcpy(zc.zc_name, zc.zc_filename);
cp = strchr(zc.zc_name, '@');
*cp = '\0';
@@ -2464,7 +2477,7 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
} else {
/* full backup stream */
- (void) strcpy(zc.zc_name, drrb->drr_toname);
+ (void) strcpy(zc.zc_name, zc.zc_filename);
/* make sure they aren't trying to restore into the root */
if (strchr(zc.zc_name, '/') == NULL) {
@@ -2490,33 +2503,58 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
tosnap);
return (-1);
}
+ zfs_close(h);
/* create any necessary ancestors up to prefix */
zc.zc_objset_type = DMU_OST_ZFS;
+
/*
* zc.zc_name is now the full name of the snap
- * we're restoring into
+ * we're restoring into. Attempt to create,
+ * mount, and share any ancestor filesystems, up
+ * to the one that was named.
*/
- cp = zc.zc_name + strlen(tosnap) + 1;
- while (cp = strchr(cp, '/')) {
+ for (cp = zc.zc_name + strlen(tosnap) + 1;
+ cp = strchr(cp, '/'); *cp = '/', cp++) {
+ const char *opname;
*cp = '\0';
- err = ioctl(zfs_fd, ZFS_IOC_CREATE, &zc);
- if (err && errno != ENOENT && errno != EEXIST) {
- zfs_error(dgettext(TEXT_DOMAIN,
- "cannot restore: "
- "couldn't create ancestor %s"),
- zc.zc_name);
- return (-1);
+
+ opname = "create";
+ if (zfs_create(zc.zc_name, ZFS_TYPE_FILESYSTEM,
+ NULL, NULL) != 0) {
+ if (errno == EEXIST)
+ continue;
+ goto ancestorerr;
}
- *cp = '/';
- cp++;
+
+ opname = "open";
+ h = zfs_open(zc.zc_name, ZFS_TYPE_FILESYSTEM);
+ if (h == NULL)
+ goto ancestorerr;
+
+ opname = "mount";
+ if (zfs_mount(h, NULL, 0) != 0)
+ goto ancestorerr;
+
+ opname = "share";
+ if (zfs_share(h) != 0)
+ goto ancestorerr;
+
+ zfs_close(h);
+
+ continue;
+ancestorerr:
+ zfs_error(dgettext(TEXT_DOMAIN,
+ "cannot restore: couldn't %s ancestor %s"),
+ opname, zc.zc_name);
+ return (-1);
}
}
/* Make sure destination fs does not exist */
cp = strchr(zc.zc_name, '@');
*cp = '\0';
- if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0) {
+ if (zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc) == 0) {
zfs_error(dgettext(TEXT_DOMAIN,
"cannot restore full backup: "
"destination filesystem %s already exists"),
@@ -2537,12 +2575,12 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
dryrun ? "would restore" : "restoring",
drrb->drr_fromguid ? "incremental" : "full",
drr.drr_u.drr_begin.drr_toname,
- zc.zc_begin_record.drr_toname);
+ zc.zc_filename);
(void) fflush(stdout);
}
if (dryrun)
return (0);
- err = ioctl_err = ioctl(zfs_fd, ZFS_IOC_RECVBACKUP, &zc);
+ err = ioctl_err = zfs_ioctl(ZFS_IOC_RECVBACKUP, &zc);
if (ioctl_err != 0) {
switch (errno) {
case ENODEV:
@@ -2561,12 +2599,12 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
case EEXIST:
if (drrb->drr_fromguid == 0) {
/* it's the containing fs that exists */
- cp = strchr(drrb->drr_toname, '@');
+ cp = strchr(zc.zc_filename, '@');
*cp = '\0';
}
zfs_error(dgettext(TEXT_DOMAIN,
"cannot restore to %s: destination already exists"),
- drrb->drr_toname);
+ zc.zc_filename);
break;
case ENOENT:
zfs_error(dgettext(TEXT_DOMAIN,
@@ -2592,6 +2630,11 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
zfs_error(dgettext(TEXT_DOMAIN,
"cannot restore: invalid backup stream"));
break;
+ case ECKSUM:
+ zfs_error(dgettext(TEXT_DOMAIN,
+ "cannot restore: invalid backup stream "
+ "(checksum mismatch)"));
+ break;
case EPERM:
zfs_error(dgettext(TEXT_DOMAIN,
"cannot restore: permission denied"));
@@ -2607,12 +2650,12 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
* restore), and the /dev links for the new snapshot (if
* created).
*/
- cp = strchr(drrb->drr_toname, '@');
+ cp = strchr(zc.zc_filename, '@');
if (cp && (ioctl_err == 0 || drrb->drr_fromguid)) {
zfs_handle_t *h;
*cp = '\0';
- h = zfs_open(drrb->drr_toname,
+ h = zfs_open(zc.zc_filename,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
*cp = '@';
if (h) {
@@ -2620,10 +2663,8 @@ zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
err = zfs_mount(h, NULL, 0);
} else {
err = zvol_create_link(h->zfs_name);
- if (err == 0 && ioctl_err == 0) {
- err =
- zvol_create_link(drrb->drr_toname);
- }
+ if (err == 0 && ioctl_err == 0)
+ err = zvol_create_link(zc.zc_filename);
}
zfs_close(h);
}
@@ -2723,7 +2764,7 @@ do_rollback(zfs_handle_t *zhp)
* condition where the user has taken a snapshot since we verified that
* this was the most recent.
*/
- if ((ret = ioctl(zfs_fd, ZFS_IOC_ROLLBACK, &zc)) != 0) {
+ if ((ret = zfs_ioctl(ZFS_IOC_ROLLBACK, &zc)) != 0) {
switch (errno) {
case EPERM:
/*
@@ -2966,7 +3007,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target)
else
zc.zc_objset_type = DMU_OST_ZFS;
- if ((ret = ioctl(zfs_fd, ZFS_IOC_RENAME, &zc)) != 0) {
+ if ((ret = zfs_ioctl(ZFS_IOC_RENAME, &zc)) != 0) {
switch (errno) {
case EPERM:
/*
@@ -3051,7 +3092,7 @@ zvol_create_link(const char *dataset)
/*
* Issue the appropriate ioctl.
*/
- if (ioctl(zfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) {
+ if (zfs_ioctl(ZFS_IOC_CREATE_MINOR, &zc) != 0) {
switch (errno) {
case EPERM:
zfs_error(dgettext(TEXT_DOMAIN, "cannot create "
@@ -3080,7 +3121,7 @@ zvol_create_link(const char *dataset)
if ((hdl = di_devlink_init(ZFS_DRIVER, DI_MAKE_LINK)) == NULL) {
zfs_error(dgettext(TEXT_DOMAIN,
"cannot create device links for '%s'"), dataset);
- (void) ioctl(zfs_fd, ZFS_IOC_REMOVE_MINOR, &zc);
+ (void) zfs_ioctl(ZFS_IOC_REMOVE_MINOR, &zc);
return (-1);
} else {
(void) di_devlink_fini(&hdl);
@@ -3099,7 +3140,7 @@ zvol_remove_link(const char *dataset)
(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
- if (ioctl(zfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) {
+ if (zfs_ioctl(ZFS_IOC_REMOVE_MINOR, &zc) != 0) {
switch (errno) {
case EPERM:
zfs_error(dgettext(TEXT_DOMAIN, "cannot remove "
diff --git a/usr/src/lib/libzfs/common/libzfs_graph.c b/usr/src/lib/libzfs/common/libzfs_graph.c
index 65b115879b..4c7bb547ee 100644
--- a/usr/src/lib/libzfs/common/libzfs_graph.c
+++ b/usr/src/lib/libzfs/common/libzfs_graph.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -345,7 +344,7 @@ iterate_children(zfs_graph_t *zgp, const char *dataset)
return (0);
for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
- ioctl(zfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0;
+ zfs_ioctl(ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0;
(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) {
/*
@@ -359,7 +358,7 @@ iterate_children(zfs_graph_t *zgp, const char *dataset)
* dataset and clone statistics. If this fails, the dataset has
* since been removed, and we're pretty much screwed anyway.
*/
- if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0)
+ if (zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc) != 0)
continue;
/*
@@ -393,7 +392,7 @@ iterate_children(zfs_graph_t *zgp, const char *dataset)
bzero(&zc, sizeof (zc));
for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
- ioctl(zfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0;
+ zfs_ioctl(ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0;
(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) {
/*
@@ -401,7 +400,7 @@ iterate_children(zfs_graph_t *zgp, const char *dataset)
* dataset and clone statistics. If this fails, the dataset has
* since been removed, and we're pretty much screwed anyway.
*/
- if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0)
+ if (zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc) != 0)
continue;
/*
@@ -439,7 +438,7 @@ construct_graph(const char *dataset)
* since iterate_children() only checks the children.
*/
(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
- (void) ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc);
+ (void) zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc);
if (zc.zc_objset_stats.dds_num_clones != 0 ||
iterate_children(zgp, dataset) != 0) {
diff --git a/usr/src/lib/libzfs/common/libzfs_impl.h b/usr/src/lib/libzfs/common/libzfs_impl.h
index 27571ba582..76bca21242 100644
--- a/usr/src/lib/libzfs/common/libzfs_impl.h
+++ b/usr/src/lib/libzfs/common/libzfs_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -49,6 +48,7 @@ struct zfs_handle {
uint64_t zfs_volsize;
uint64_t zfs_volblocksize;
char *zfs_mntopts;
+ char zfs_root[MAXPATHLEN];
};
struct zpool_handle {
@@ -57,6 +57,8 @@ struct zpool_handle {
size_t zpool_config_size;
nvlist_t *zpool_config;
nvlist_t *zpool_old_config;
+ nvlist_t **zpool_error_log;
+ size_t zpool_error_count;
};
void zfs_error(const char *, ...);
@@ -70,13 +72,8 @@ void no_memory(void);
"internal error: unexpected error %d at line %d of %s"), \
(err), (__LINE__), (__FILE__)))
-int zfs_fd;
-
char **get_dependents(const char *, size_t *);
-FILE *mnttab_file;
-FILE *sharetab_file;
-
typedef struct prop_changelist prop_changelist_t;
int changelist_prefix(prop_changelist_t *);
@@ -91,12 +88,16 @@ int changelist_haszonedchild(prop_changelist_t *);
void remove_mountpoint(zfs_handle_t *);
zfs_handle_t *make_dataset_handle(const char *);
-void set_pool_health(nvlist_t *config);
+void set_pool_health(nvlist_t *);
+
+zpool_handle_t *zpool_open_silent(const char *);
-zpool_handle_t *zpool_open_silent(const char *pool);
+int zvol_create_link(const char *);
+int zvol_remove_link(const char *);
-int zvol_create_link(const char *dataset);
-int zvol_remove_link(const char *dataset);
+int zfs_ioctl(int, zfs_cmd_t *);
+FILE *zfs_mnttab(void);
+FILE *zfs_sharetab(void);
#ifdef __cplusplus
}
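
The header change above drops the library-private globals (zfs_fd, mnttab_file, sharetab_file) in favor of accessor functions, so code inside libzfs now reaches the ZFS control device and the mount/share tables only through zfs_ioctl(), zfs_mnttab() and zfs_sharetab(), presumably so those resources can be opened on demand rather than unconditionally. A condensed, illustrative composite of the converted call sites elsewhere in this patch (the 'dataset' variable is schematic):

        /* ioctl path: */
        zfs_cmd_t zc = { 0 };

        (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
        if (zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc) != 0)    /* was ioctl(zfs_fd, ...) */
                return (-1);

        /* mnttab path: */
        struct mnttab entry, search = { 0 };

        search.mnt_special = (char *)dataset;
        search.mnt_fstype = MNTTYPE_ZFS;
        rewind(zfs_mnttab());                             /* was rewind(mnttab_file) */
        if (getmntany(zfs_mnttab(), &entry, &search) != 0)
                return (FALSE);         /* not mounted */
        return (TRUE);                  /* mounted at entry.mnt_mountp */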
diff --git a/usr/src/lib/libzfs/common/libzfs_import.c b/usr/src/lib/libzfs/common/libzfs_import.c
index cc86292a50..04e3c0983f 100644
--- a/usr/src/lib/libzfs/common/libzfs_import.c
+++ b/usr/src/lib/libzfs/common/libzfs_import.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -483,8 +482,8 @@ get_configs(pool_list_t *pl)
&guid) == 0);
(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
- if (ioctl(zfs_fd, ZFS_IOC_POOL_GUID, &zc) == 0 &&
- guid == zc.zc_pool_guid) {
+ if (zfs_ioctl(ZFS_IOC_POOL_GUID, &zc) == 0 &&
+ guid == zc.zc_guid) {
nvlist_free(config);
continue;
}
@@ -511,7 +510,7 @@ get_configs(pool_list_t *pl)
zc.zc_config_dst = (uint64_t)(uintptr_t)
zfs_malloc(zc.zc_config_dst_size);
- while ((err = ioctl(zfs_fd, ZFS_IOC_POOL_TRYIMPORT,
+ while ((err = zfs_ioctl(ZFS_IOC_POOL_TRYIMPORT,
&zc)) != 0 && errno == ENOMEM) {
free((void *)(uintptr_t)zc.zc_config_dst);
zc.zc_config_dst = (uint64_t)(uintptr_t)
@@ -562,7 +561,7 @@ zpool_read_label(int fd)
int l;
vdev_label_t *label;
nvlist_t *config;
- uint64_t version, state, txg;
+ uint64_t state, txg;
if (fstat64(fd, &statbuf) == -1)
return (NULL);
@@ -578,12 +577,6 @@ zpool_read_label(int fd)
sizeof (label->vl_vdev_phys.vp_nvlist), &config, 0) != 0)
continue;
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &version) != 0 || version != UBERBLOCK_VERSION) {
- nvlist_free(config);
- continue;
- }
-
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
&state) != 0 || state > POOL_STATE_EXPORTED) {
nvlist_free(config);
@@ -747,8 +740,8 @@ zpool_in_use(int fd, pool_state_t *state, char **namestr)
* message if the pool cannot be opened.
*/
(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
- if (ioctl(zfs_fd, ZFS_IOC_POOL_GUID, &zc) == 0 &&
- guid == zc.zc_pool_guid) {
+ if (zfs_ioctl(ZFS_IOC_POOL_GUID, &zc) == 0 &&
+ guid == zc.zc_guid) {
/*
* Because the device may have been removed while
* offlined, we only report it as active if the vdev is
diff --git a/usr/src/lib/libzfs/common/libzfs_mount.c b/usr/src/lib/libzfs/common/libzfs_mount.c
index b280fe1e8a..ae4a9937a8 100644
--- a/usr/src/lib/libzfs/common/libzfs_mount.c
+++ b/usr/src/lib/libzfs/common/libzfs_mount.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -63,14 +62,6 @@
#include "libzfs_impl.h"
-
-/*
- * The following two files are opened as part of zfs_init(). It's OK to for
- * the sharetab to be NULL, but mnttab must always be non-NULL;
- */
-FILE *mnttab_file;
-FILE *sharetab_file;
-
/*
* Search the sharetab for the given mountpoint, returning TRUE if it is found.
*/
@@ -79,12 +70,12 @@ is_shared(const char *mountpoint)
{
char buf[MAXPATHLEN], *tab;
- if (sharetab_file == NULL)
+ if (zfs_sharetab() == NULL)
return (0);
- (void) fseek(sharetab_file, 0, SEEK_SET);
+ (void) fseek(zfs_sharetab(), 0, SEEK_SET);
- while (fgets(buf, sizeof (buf), sharetab_file) != NULL) {
+ while (fgets(buf, sizeof (buf), zfs_sharetab()) != NULL) {
/* the mountpoint is the first entry on each line */
if ((tab = strchr(buf, '\t')) != NULL) {
@@ -143,8 +134,8 @@ zfs_is_mounted(zfs_handle_t *zhp, char **where)
search.mnt_special = (char *)zfs_get_name(zhp);
search.mnt_fstype = MNTTYPE_ZFS;
- rewind(mnttab_file);
- if (getmntany(mnttab_file, &entry, &search) != 0)
+ rewind(zfs_mnttab());
+ if (getmntany(zfs_mnttab(), &entry, &search) != 0)
return (FALSE);
if (where != NULL)
@@ -262,9 +253,9 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
/* check to see if need to unmount the filesystem */
search.mnt_special = (char *)zfs_get_name(zhp);
search.mnt_fstype = MNTTYPE_ZFS;
- rewind(mnttab_file);
+ rewind(zfs_mnttab());
if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
- getmntany(mnttab_file, &entry, &search) == 0)) {
+ getmntany(zfs_mnttab(), &entry, &search) == 0)) {
if (mountpoint == NULL)
mountpoint = entry.mnt_mountp;
@@ -442,9 +433,9 @@ zfs_unshare(zfs_handle_t *zhp, const char *mountpoint)
/* check to see if need to unmount the filesystem */
search.mnt_special = (char *)zfs_get_name(zhp);
search.mnt_fstype = MNTTYPE_ZFS;
- rewind(mnttab_file);
+ rewind(zfs_mnttab());
if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
- getmntany(mnttab_file, &entry, &search) == 0)) {
+ getmntany(zfs_mnttab(), &entry, &search) == 0)) {
if (mountpoint == NULL)
mountpoint = entry.mnt_mountp;
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index 949b02adfd..3ffe2465b5 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -36,6 +36,7 @@
#include <string.h>
#include <unistd.h>
#include <sys/zfs_ioctl.h>
+#include <sys/zio.h>
#include "zfs_namecheck.h"
#include "libzfs_impl.h"
@@ -208,10 +209,10 @@ zpool_open(const char *pool)
return (NULL);
if (zhp->zpool_state == POOL_STATE_UNAVAIL) {
- zfs_error(dgettext(TEXT_DOMAIN, "cannot open ' %s': pool is "
- "currently unavailable\n"), zhp->zpool_name);
- zfs_error(dgettext(TEXT_DOMAIN, "run 'zpool status -v %s' for "
- "detailed information\n"), zhp->zpool_name);
+ zfs_error(dgettext(TEXT_DOMAIN, "cannot open '%s': pool is "
+ "currently unavailable"), zhp->zpool_name);
+ zfs_error(dgettext(TEXT_DOMAIN, "run 'zpool status %s' for "
+ "detailed information"), zhp->zpool_name);
zpool_close(zhp);
return (NULL);
}
@@ -229,6 +230,12 @@ zpool_close(zpool_handle_t *zhp)
nvlist_free(zhp->zpool_config);
if (zhp->zpool_old_config)
nvlist_free(zhp->zpool_old_config);
+ if (zhp->zpool_error_log) {
+ int i;
+ for (i = 0; i < zhp->zpool_error_count; i++)
+ free(zhp->zpool_error_log[i]);
+ free(zhp->zpool_error_log);
+ }
free(zhp);
}
@@ -299,11 +306,11 @@ zpool_get_root(zpool_handle_t *zhp, char *buf, size_t buflen)
zfs_cmd_t zc = { 0 };
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 ||
- zc.zc_objset_stats.dds_altroot[0] == '\0')
+ if (zfs_ioctl(ZFS_IOC_OBJSET_STATS, &zc) != 0 ||
+ zc.zc_root[0] == '\0')
return (-1);
- (void) strlcpy(buf, zc.zc_objset_stats.dds_altroot, buflen);
+ (void) strlcpy(buf, zc.zc_root, buflen);
return (0);
}
@@ -359,7 +366,7 @@ zpool_create(const char *pool, nvlist_t *nvroot, const char *altroot)
if (altroot != NULL)
(void) strlcpy(zc.zc_root, altroot, sizeof (zc.zc_root));
- if (ioctl(zfs_fd, ZFS_IOC_POOL_CREATE, &zc) != 0) {
+ if (zfs_ioctl(ZFS_IOC_POOL_CREATE, &zc) != 0) {
switch (errno) {
case EEXIST:
zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
@@ -481,7 +488,7 @@ zpool_destroy(zpool_handle_t *zhp)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if (ioctl(zfs_fd, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
+ if (zfs_ioctl(ZFS_IOC_POOL_DESTROY, &zc) != 0) {
switch (errno) {
case EPERM:
zfs_error(dgettext(TEXT_DOMAIN,
@@ -546,7 +553,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
zc.zc_config_src = (uint64_t)(uintptr_t)packed;
zc.zc_config_src_size = len;
- if (ioctl(zfs_fd, ZFS_IOC_VDEV_ADD, &zc) != 0) {
+ if (zfs_ioctl(ZFS_IOC_VDEV_ADD, &zc) != 0) {
switch (errno) {
case EPERM:
zfs_error(dgettext(TEXT_DOMAIN, "cannot add to '%s': "
@@ -631,7 +638,7 @@ zpool_export(zpool_handle_t *zhp)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if (ioctl(zfs_fd, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
+ if (zfs_ioctl(ZFS_IOC_POOL_EXPORT, &zc) != 0) {
switch (errno) {
case EPERM:
zfs_error(dgettext(TEXT_DOMAIN,
@@ -706,7 +713,7 @@ zpool_import(nvlist_t *config, const char *newname, const char *altroot)
zc.zc_root[0] = '\0';
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- &zc.zc_pool_guid) == 0);
+ &zc.zc_guid) == 0);
verify(nvlist_size(config, &len, NV_ENCODE_NATIVE) == 0);
@@ -718,7 +725,7 @@ zpool_import(nvlist_t *config, const char *newname, const char *altroot)
zc.zc_config_src_size = len;
ret = 0;
- if (ioctl(zfs_fd, ZFS_IOC_POOL_IMPORT, &zc) != 0) {
+ if (zfs_ioctl(ZFS_IOC_POOL_IMPORT, &zc) != 0) {
char desc[1024];
if (newname == NULL)
(void) snprintf(desc, sizeof (desc),
@@ -756,6 +763,14 @@ zpool_import(nvlist_t *config, const char *newname, const char *altroot)
desc);
break;
+ case ENOTSUP:
+ /*
+ * Unsupported version.
+ */
+ zfs_error(dgettext(TEXT_DOMAIN,
+ "%s: unsupported version"), desc);
+ break;
+
default:
zfs_baderror(errno);
}
@@ -788,7 +803,7 @@ zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_cookie = type;
- if (ioctl(zfs_fd, ZFS_IOC_POOL_SCRUB, &zc) == 0)
+ if (zfs_ioctl(ZFS_IOC_POOL_SCRUB, &zc) == 0)
return (0);
(void) snprintf(msg, sizeof (msg),
@@ -816,6 +831,81 @@ zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
return (-1);
}
+static uint64_t
+vdev_to_guid(nvlist_t *nv, const char *search, uint64_t guid)
+{
+ uint_t c, children;
+ nvlist_t **child;
+ uint64_t ret, present;
+ char *path;
+ uint64_t wholedisk = 0;
+
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &ret) == 0);
+
+ if (search == NULL &&
+ nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &present) == 0) {
+ /*
+ * If the device has never been present since import, the only
+ * reliable way to match the vdev is by GUID.
+ */
+ if (ret == guid)
+ return (ret);
+ } else if (search != NULL &&
+ nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &wholedisk);
+ if (wholedisk) {
+ /*
+ * For whole disks, the internal path has 's0', but the
+ * path passed in by the user doesn't.
+ */
+ if (strlen(search) == strlen(path) - 2 &&
+ strncmp(search, path, strlen(search)) == 0)
+ return (ret);
+ } else if (strcmp(search, path) == 0) {
+ return (ret);
+ }
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return (0);
+
+ for (c = 0; c < children; c++)
+ if ((ret = vdev_to_guid(child[c], search, guid)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * Given a string describing a vdev, returns the matching GUID, or 0 if none.
+ */
+uint64_t
+zpool_vdev_to_guid(zpool_handle_t *zhp, const char *path)
+{
+ char buf[MAXPATHLEN];
+ const char *search;
+ char *end;
+ nvlist_t *nvroot;
+ uint64_t guid;
+
+ guid = strtoull(path, &end, 16);
+ if (guid != 0 && *end == '\0') {
+ search = NULL;
+ } else if (path[0] != '/') {
+ (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path);
+ search = buf;
+ } else {
+ search = path;
+ }
+
+ verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ return (vdev_to_guid(nvroot, search, guid));
+}
+
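A minimal usage sketch for the new lookup (hypothetical caller, not part of this patch; the helper name is invented): zpool_vdev_to_guid() accepts a hex GUID string, a short device name such as c0t0d0, or a full /dev/dsk path, and returns 0 when no vdev matches.

static int
lookup_vdev_guid(zpool_handle_t *zhp, const char *path, uint64_t *guidp)
{
	uint64_t guid;

	/* 0 means "no such device in pool"; the caller reports the error */
	if ((guid = zpool_vdev_to_guid(zhp, path)) == 0)
		return (-1);

	*guidp = guid;
	return (0);
}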
/*
* Bring the specified vdev online
*/
@@ -825,16 +915,19 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path)
zfs_cmd_t zc = { 0 };
char msg[1024];
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot online %s"), path);
+
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- (void) snprintf(zc.zc_prop_value, sizeof (zc.zc_prop_value),
- "%s%s", path[0] == '/' ? "" : "/dev/dsk/", path);
+ if ((zc.zc_guid = zpool_vdev_to_guid(zhp, path)) == 0) {
+ zfs_error(dgettext(TEXT_DOMAIN, "%s: no such device in pool"),
+ msg);
+ return (-1);
+ }
- if (ioctl(zfs_fd, ZFS_IOC_VDEV_ONLINE, &zc) == 0)
+ if (zfs_ioctl(ZFS_IOC_VDEV_ONLINE, &zc) == 0)
return (0);
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot online %s"), zc.zc_prop_value);
-
switch (errno) {
case ENODEV:
/*
@@ -865,18 +958,21 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, int istmp)
zfs_cmd_t zc = { 0 };
char msg[1024];
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot offline %s"), path);
+
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- (void) snprintf(zc.zc_prop_value, sizeof (zc.zc_prop_value),
- "%s%s", path[0] == '/' ? "" : "/dev/dsk/", path);
+ if ((zc.zc_guid = zpool_vdev_to_guid(zhp, path)) == 0) {
+ zfs_error(dgettext(TEXT_DOMAIN, "%s: no such device in pool"),
+ msg);
+ return (-1);
+ }
zc.zc_cookie = istmp;
- if (ioctl(zfs_fd, ZFS_IOC_VDEV_OFFLINE, &zc) == 0)
+ if (zfs_ioctl(ZFS_IOC_VDEV_OFFLINE, &zc) == 0)
return (0);
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot offline %s"), zc.zc_prop_value);
-
switch (errno) {
case ENODEV:
/*
@@ -919,9 +1015,19 @@ zpool_vdev_attach(zpool_handle_t *zhp,
int ret;
size_t len;
+ if (replacing)
+ (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+ "cannot replace %s with %s"), old_disk, new_disk);
+ else
+ (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+ "cannot attach %s to %s"), new_disk, old_disk);
+
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- (void) snprintf(zc.zc_prop_value, sizeof (zc.zc_prop_value),
- "%s%s", old_disk[0] == '/' ? "" : "/dev/dsk/", old_disk);
+ if ((zc.zc_guid = zpool_vdev_to_guid(zhp, old_disk)) == 0) {
+ zfs_error(dgettext(TEXT_DOMAIN, "%s: no such device in pool"),
+ msg);
+ return (-1);
+ }
zc.zc_cookie = replacing;
verify(nvlist_size(nvroot, &len, NV_ENCODE_NATIVE) == 0);
@@ -933,29 +1039,22 @@ zpool_vdev_attach(zpool_handle_t *zhp,
zc.zc_config_src = (uint64_t)(uintptr_t)packed;
zc.zc_config_src_size = len;
- ret = ioctl(zfs_fd, ZFS_IOC_VDEV_ATTACH, &zc);
+ ret = zfs_ioctl(ZFS_IOC_VDEV_ATTACH, &zc);
free(packed);
if (ret == 0)
return (0);
- if (replacing)
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot replace %s with %s"), old_disk, new_disk);
- else
- (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
- "cannot attach %s to %s"), new_disk, old_disk);
-
switch (errno) {
- case EPERM:
+ case EPERM:
/*
* No permission to mess with the config.
*/
zfs_error(dgettext(TEXT_DOMAIN, "%s: permission denied"), msg);
break;
- case ENODEV:
+ case ENODEV:
/*
* Device doesn't exist.
*/
@@ -963,7 +1062,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
msg, old_disk);
break;
- case ENOTSUP:
+ case ENOTSUP:
/*
* Can't attach to or replace this type of vdev.
*/
@@ -975,7 +1074,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
"%s: attach is only applicable to mirrors"), msg);
break;
- case EINVAL:
+ case EINVAL:
/*
* The new device must be a single disk.
*/
@@ -983,7 +1082,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
"%s: <new_device> must be a single disk"), msg);
break;
- case ENXIO:
+ case ENXIO:
/*
* This is unlikely to happen since we've verified that
* all the devices can be opened from userland, but it's
@@ -993,14 +1092,14 @@ zpool_vdev_attach(zpool_handle_t *zhp,
msg, new_disk);
break;
- case EBUSY:
+ case EBUSY:
/*
 * The new device is in use.
*/
zfs_error(dgettext(TEXT_DOMAIN, "%s: %s busy"), msg, new_disk);
break;
- case EOVERFLOW:
+ case EOVERFLOW:
/*
* The new device is too small.
*/
@@ -1008,7 +1107,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
msg, new_disk);
break;
- case EDOM:
+ case EDOM:
/*
* The new device has a different alignment requirement.
*/
@@ -1016,7 +1115,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
"%s: devices have different sector alignment"), msg);
break;
- case ENAMETOOLONG:
+ case ENAMETOOLONG:
/*
* The resulting top-level vdev spec won't fit in the label.
*/
@@ -1024,7 +1123,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
"%s: too many devices in a single vdev"), msg);
break;
- default:
+ default:
zfs_baderror(errno);
}
@@ -1040,32 +1139,34 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
zfs_cmd_t zc = { 0 };
char msg[1024];
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot detach %s"), path);
+
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- (void) snprintf(zc.zc_prop_value, sizeof (zc.zc_prop_value),
- "%s%s", path[0] == '/' ? "" : "/dev/dsk/", path);
+ if ((zc.zc_guid = zpool_vdev_to_guid(zhp, path)) == 0) {
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: no such device in pool"),
+		    msg);
+ return (-1);
+ }
- if (ioctl(zfs_fd, ZFS_IOC_VDEV_DETACH, &zc) == 0)
+ if (zfs_ioctl(ZFS_IOC_VDEV_DETACH, &zc) == 0)
return (0);
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot detach %s"), zc.zc_prop_value);
-
switch (errno) {
- case EPERM:
+ case EPERM:
/*
* No permission to mess with the config.
*/
zfs_error(dgettext(TEXT_DOMAIN, "%s: permission denied"), msg);
break;
- case ENODEV:
+ case ENODEV:
/*
* Device doesn't exist.
*/
zfs_error(dgettext(TEXT_DOMAIN, "%s: device not in pool"), msg);
break;
- case ENOTSUP:
+ case ENOTSUP:
/*
* Can't detach from this type of vdev.
*/
@@ -1073,14 +1174,64 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
"%s: only applicable to mirror and replacing vdevs"), msg);
break;
- case EBUSY:
+ case EBUSY:
/*
* There are no other replicas of this device.
*/
zfs_error(dgettext(TEXT_DOMAIN, "%s: no valid replicas"), msg);
break;
- default:
+ default:
+ zfs_baderror(errno);
+ }
+
+ return (1);
+}
+
+/*
+ * Clear the errors for the pool, or the particular device if specified.
+ */
+int
+zpool_clear(zpool_handle_t *zhp, const char *path)
+{
+ zfs_cmd_t zc = { 0 };
+ char msg[1024];
+
+ if (path)
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
+		    path);
+ else
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
+ zhp->zpool_name);
+
+ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+ if (path && (zc.zc_guid = zpool_vdev_to_guid(zhp, path)) == 0) {
+ zfs_error(dgettext(TEXT_DOMAIN, "%s: no such device in pool"),
+ msg);
+ return (-1);
+ }
+
+ if (zfs_ioctl(ZFS_IOC_CLEAR, &zc) == 0)
+ return (0);
+
+ switch (errno) {
+ case EPERM:
+ /*
+ * No permission to mess with the config.
+ */
+ zfs_error(dgettext(TEXT_DOMAIN, "%s: permission denied"), msg);
+ break;
+
+ case ENODEV:
+ /*
+ * Device doesn't exist.
+ */
+ zfs_error(dgettext(TEXT_DOMAIN, "%s: device not in pool"), msg);
+ break;
+
+ default:
zfs_baderror(errno);
}
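A hedged sketch of how a front end might drive the new zpool_clear() entry point (the wrapper below is illustrative and not taken from this change): a NULL device clears the error counts for every vdev in the pool, while a device name or GUID clears only that vdev.

static int
do_clear(zpool_handle_t *zhp, const char *device)
{
	/* 'device' may be NULL to clear the whole pool */
	if (zpool_clear(zhp, device) != 0)
		return (1);	/* libzfs has already printed a message */

	return (0);
}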
@@ -1221,9 +1372,9 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
(void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
(void) strncpy(zc.zc_prop_value, path, sizeof (zc.zc_prop_value));
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
- &zc.zc_pool_guid) == 0);
+ &zc.zc_guid) == 0);
- (void) ioctl(zfs_fd, ZFS_IOC_VDEV_SETPATH, &zc);
+ (void) zfs_ioctl(ZFS_IOC_VDEV_SETPATH, &zc);
}
/*
@@ -1245,9 +1396,16 @@ char *
zpool_vdev_name(zpool_handle_t *zhp, nvlist_t *nv)
{
char *path, *devid;
- uint64_t wholedisk;
+ uint64_t value;
+ char buf[64];
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &value) == 0) {
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+ &value) == 0);
+ (void) snprintf(buf, sizeof (buf), "%llx", value);
+ path = buf;
+ } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
if (zhp != NULL &&
nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) {
@@ -1282,7 +1440,7 @@ zpool_vdev_name(zpool_handle_t *zhp, nvlist_t *nv)
path += 9;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- &wholedisk) == 0 && wholedisk) {
+ &value) == 0 && value) {
char *tmp = zfs_strdup(path);
tmp[strlen(path) - 2] = '\0';
return (tmp);
@@ -1293,3 +1451,141 @@ zpool_vdev_name(zpool_handle_t *zhp, nvlist_t *nv)
return (zfs_strdup(path));
}
+
+static int
+zbookmark_compare(const void *a, const void *b)
+{
+ return (memcmp(a, b, sizeof (zbookmark_t)));
+}
+
+/*
+ * Retrieve the persistent error log, uniquify the members, and return to the
+ * caller.
+ */
+int
+zpool_get_errlog(zpool_handle_t *zhp, nvlist_t ***list, size_t *nelem)
+{
+ zfs_cmd_t zc = { 0 };
+ uint64_t count;
+ zbookmark_t *zb;
+ int i, j;
+
+ if (zhp->zpool_error_log != NULL) {
+ *list = zhp->zpool_error_log;
+ *nelem = zhp->zpool_error_count;
+ return (0);
+ }
+
+ /*
+ * Retrieve the raw error list from the kernel. If the number of errors
+ * has increased, allocate more space and continue until we get the
+ * entire list.
+ */
+ verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
+ &count) == 0);
+ zc.zc_config_dst = (uintptr_t)zfs_malloc(count * sizeof (zbookmark_t));
+ zc.zc_config_dst_size = count;
+ (void) strcpy(zc.zc_name, zhp->zpool_name);
+ for (;;) {
+ if (zfs_ioctl(ZFS_IOC_ERROR_LOG, &zc) != 0) {
+ if (errno == ENOMEM) {
+ free((void *)(uintptr_t)zc.zc_config_dst);
+ zc.zc_config_dst = (uintptr_t)
+ zfs_malloc(zc.zc_config_dst_size);
+ } else {
+ return (-1);
+ }
+ } else {
+ break;
+ }
+ }
+
+ /*
+ * Sort the resulting bookmarks. This is a little confusing due to the
+ * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last
+ * to first, and 'zc_config_dst_size' indicates the number of bookmarks
+ * _not_ copied as part of the process. So we point the start of our
+ * array appropriately and decrement the total number of elements.
+ */
+ zb = ((zbookmark_t *)(uintptr_t)zc.zc_config_dst) +
+ zc.zc_config_dst_size;
+ count -= zc.zc_config_dst_size;
+
+ qsort(zb, count, sizeof (zbookmark_t), zbookmark_compare);
+
+ /*
+ * Count the number of unique elements
+ */
+ j = 0;
+ for (i = 0; i < count; i++) {
+ if (i > 0 && memcmp(&zb[i - 1], &zb[i],
+ sizeof (zbookmark_t)) == 0)
+ continue;
+ j++;
+ }
+
+ /*
+ * If the user has only requested the number of items, return it now
+ * without bothering with the extra work.
+ */
+ if (list == NULL) {
+ *nelem = j;
+ return (0);
+ }
+
+ zhp->zpool_error_count = j;
+
+ /*
+ * Allocate an array of nvlists to hold the results
+ */
+ zhp->zpool_error_log = zfs_malloc(j * sizeof (nvlist_t *));
+
+ /*
+ * Fill in the results with names from the kernel.
+ */
+ j = 0;
+ for (i = 0; i < count; i++) {
+ char buf[64];
+ nvlist_t *nv;
+
+ if (i > 0 && memcmp(&zb[i - 1], &zb[i],
+ sizeof (zbookmark_t)) == 0)
+ continue;
+
+ verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
+ 0) == 0);
+ zhp->zpool_error_log[j] = nv;
+
+ zc.zc_bookmark = zb[i];
+ if (zfs_ioctl(ZFS_IOC_BOOKMARK_NAME, &zc) == 0) {
+ verify(nvlist_add_string(nv, ZPOOL_ERR_DATASET,
+ zc.zc_prop_name) == 0);
+ verify(nvlist_add_string(nv, ZPOOL_ERR_OBJECT,
+ zc.zc_prop_value) == 0);
+ verify(nvlist_add_string(nv, ZPOOL_ERR_RANGE,
+ zc.zc_filename) == 0);
+ } else {
+ (void) snprintf(buf, sizeof (buf), "%llx",
+ zb[i].zb_objset);
+ verify(nvlist_add_string(nv,
+ ZPOOL_ERR_DATASET, buf) == 0);
+ (void) snprintf(buf, sizeof (buf), "%llx",
+ zb[i].zb_object);
+ verify(nvlist_add_string(nv, ZPOOL_ERR_OBJECT,
+ buf) == 0);
+ (void) snprintf(buf, sizeof (buf), "lvl=%u blkid=%llu",
+ (int)zb[i].zb_level, (long long)zb[i].zb_blkid);
+ verify(nvlist_add_string(nv, ZPOOL_ERR_RANGE,
+ buf) == 0);
+ }
+
+ j++;
+ }
+
+ *list = zhp->zpool_error_log;
+ *nelem = zhp->zpool_error_count;
+
+ free((void *)(uintptr_t)zc.zc_config_dst);
+
+ return (0);
+}
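A minimal consumer sketch for zpool_get_errlog(), assuming only the nvpair names used above (ZPOOL_ERR_DATASET, ZPOOL_ERR_OBJECT, ZPOOL_ERR_RANGE); the returned array and its nvlists are cached on the handle and released by zpool_close(), so the caller does not free them. The printing function itself is hypothetical.

static void
print_errlog(zpool_handle_t *zhp)
{
	nvlist_t **log;
	size_t nelem, i;
	char *ds, *obj, *range;

	if (zpool_get_errlog(zhp, &log, &nelem) != 0)
		return;

	for (i = 0; i < nelem; i++) {
		verify(nvlist_lookup_string(log[i],
		    ZPOOL_ERR_DATASET, &ds) == 0);
		verify(nvlist_lookup_string(log[i],
		    ZPOOL_ERR_OBJECT, &obj) == 0);
		verify(nvlist_lookup_string(log[i],
		    ZPOOL_ERR_RANGE, &range) == 0);
		(void) printf("%s %s %s\n", ds, obj, range);
	}
}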
diff --git a/usr/src/lib/libzfs/common/libzfs_status.c b/usr/src/lib/libzfs/common/libzfs_status.c
index 27a86d0c3c..314e452076 100644
--- a/usr/src/lib/libzfs/common/libzfs_status.c
+++ b/usr/src/lib/libzfs/common/libzfs_status.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -64,6 +63,25 @@ static char *msgid_table[] = {
"ZFS-8000-A5"
};
+/*
+ * If the pool is active, a certain class of static errors is overridden by the
+ * faults as analyzed by FMA. These faults have separate knowledge articles,
+ * and the article referred to by 'zpool status' must match that indicated by
+ * the syslog error message. We override missing data as well as corrupt pool.
+ */
+static char *msgid_table_active[] = {
+ "ZFS-8000-14",
+ "ZFS-8000-D3", /* overridden */
+ "ZFS-8000-D3", /* overridden */
+ "ZFS-8000-4J",
+ "ZFS-8000-5E",
+ "ZFS-8000-6X",
+ "ZFS-8000-CS", /* overridden */
+ "ZFS-8000-8A",
+ "ZFS-8000-9P",
+ "ZFS-8000-CS", /* overridden */
+};
+
#define NMSGID (sizeof (msgid_table) / sizeof (msgid_table[0]))
/* ARGSUSED */
@@ -143,9 +161,10 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
* following:
*
* - Check for a complete and valid configuration
- * - Look for any missing devices
- * - Look for any devices showing errors
+ * - Look for any missing devices in a non-replicated config
* - Check for any data errors
+ * - Check for any missing devices in a replicated config
+ * - Look for any devices showing errors
* - Check for any resilvering devices
*
* There can obviously be multiple errors within a single pool, so this routine
@@ -157,6 +176,7 @@ check_status(nvlist_t *config, int isimport)
nvlist_t *nvroot;
vdev_stat_t *vs;
uint_t vsc;
+ uint64_t nerr;
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
@@ -167,31 +187,45 @@ check_status(nvlist_t *config, int isimport)
* Check that the config is complete.
*/
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
- vs->vs_aux == VDEV_AUX_BAD_GUID_SUM) {
+ vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
return (ZPOOL_STATUS_BAD_GUID_SUM);
- }
/*
- * Missing devices
+ * Missing devices in non-replicated config.
*/
- if (find_vdev_problem(nvroot, vdev_missing)) {
- if (vs->vs_state == VDEV_STATE_CANT_OPEN)
- return (ZPOOL_STATUS_MISSING_DEV_NR);
- else
- return (ZPOOL_STATUS_MISSING_DEV_R);
- }
+ if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+ find_vdev_problem(nvroot, vdev_missing))
+ return (ZPOOL_STATUS_MISSING_DEV_NR);
+
+ if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+ find_vdev_problem(nvroot, vdev_broken))
+ return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
+
+ /*
+ * Corrupted pool metadata
+ */
+ if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+ vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
+ return (ZPOOL_STATUS_CORRUPT_POOL);
/*
- * Devices with corrupted labels.
+ * Persistent data errors.
*/
- if (find_vdev_problem(nvroot, vdev_broken)) {
- if (vs->vs_state == VDEV_STATE_CANT_OPEN)
- return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
- else
- return (ZPOOL_STATUS_CORRUPT_LABEL_R);
+ if (!isimport) {
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
+ &nerr) == 0 && nerr != 0)
+ return (ZPOOL_STATUS_CORRUPT_DATA);
}
/*
+ * Missing devices in a replicated config.
+ */
+ if (find_vdev_problem(nvroot, vdev_missing))
+ return (ZPOOL_STATUS_MISSING_DEV_R);
+ if (find_vdev_problem(nvroot, vdev_broken))
+ return (ZPOOL_STATUS_CORRUPT_LABEL_R);
+
+ /*
* Devices with errors
*/
if (!isimport && find_vdev_problem(nvroot, vdev_errors))
@@ -214,8 +248,6 @@ check_status(nvlist_t *config, int isimport)
*
* CORRUPT_CACHE
* VERSION_MISMATCH
- * CORRUPT_POOL
- * CORRUPT_DATA
*/
return (ZPOOL_STATUS_OK);
@@ -229,7 +261,7 @@ zpool_get_status(zpool_handle_t *zhp, char **msgid)
if (ret >= NMSGID)
*msgid = NULL;
else
- *msgid = msgid_table[ret];
+ *msgid = msgid_table_active[ret];
return (ret);
}
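A short, hedged example of consuming zpool_get_status() and the message ID it hands back (the reporting function and URL prefix are illustrative assumptions, not part of this patch):

static void
show_status(zpool_handle_t *zhp)
{
	char *msgid;
	int reason;

	if ((reason = zpool_get_status(zhp, &msgid)) == ZPOOL_STATUS_OK)
		return;

	if (msgid != NULL)
		(void) printf("status %d, see: http://www.sun.com/msg/%s\n",
		    reason, msgid);
}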
diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c
index d164c4585d..c7f7528491 100644
--- a/usr/src/lib/libzfs/common/libzfs_util.c
+++ b/usr/src/lib/libzfs/common/libzfs_util.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,7 +43,10 @@
#include "libzfs_impl.h"
-int zfs_fd;
+static int zfs_fd = -1;
+static FILE *mnttab_file;
+static FILE *sharetab_file;
+static int sharetab_opened;
void (*error_func)(const char *, va_list);
@@ -144,34 +146,55 @@ zfs_strdup(const char *str)
}
/*
- * Initialize the library. Sets the command name used when reporting errors.
- * This command name is used to prefix all error messages appropriately.
- * Also opens /dev/zfs and dies if it cannot be opened.
+ * Utility functions around commonly used files - /dev/zfs, /etc/mnttab, and
+ * /etc/dfs/sharetab.
*/
-#pragma init(zfs_init)
-void
-zfs_init(void)
+int
+zfs_ioctl(int cmd, zfs_cmd_t *zc)
{
- if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0)
- zfs_fatal(dgettext(TEXT_DOMAIN,
- "internal error: cannot open zfs device"));
+ if (zfs_fd == -1 &&
+ (zfs_fd = open(ZFS_DEV, O_RDWR)) < 0)
+ zfs_fatal(dgettext(TEXT_DOMAIN, "internal error: unable to "
+		    "open ZFS device\n"));
- if ((mnttab_file = fopen(MNTTAB, "r")) == NULL)
+ return (ioctl(zfs_fd, cmd, zc));
+}
+
+FILE *
+zfs_mnttab(void)
+{
+ if (mnttab_file == NULL &&
+ (mnttab_file = fopen(MNTTAB, "r")) == NULL)
zfs_fatal(dgettext(TEXT_DOMAIN, "internal error: unable to "
"open %s\n"), MNTTAB);
- sharetab_file = fopen("/etc/dfs/sharetab", "r");
+ return (mnttab_file);
+}
+
+FILE *
+zfs_sharetab(void)
+{
+ if (sharetab_opened)
+ return (sharetab_file);
+
+ sharetab_opened = TRUE;
+ return (sharetab_file = fopen("/etc/dfs/sharetab", "r"));
}
/*
- * Cleanup function for library. Simply close the file descriptors that we
- * opened as part of libzfs_init().
+ * Cleanup function for library. Close any file descriptors that were
+ * opened as part of the above functions.
*/
#pragma fini(zfs_fini)
void
zfs_fini(void)
{
- (void) close(zfs_fd);
+ if (zfs_fd != -1)
+ (void) close(zfs_fd);
+ if (sharetab_file)
+ (void) fclose(sharetab_file);
+ if (mnttab_file)
+ (void) fclose(mnttab_file);
}
/*
diff --git a/usr/src/lib/libzfs/spec/libzfs.spec b/usr/src/lib/libzfs/spec/libzfs.spec
index 1a180cd63d..a611d5a1e5 100644
--- a/usr/src/lib/libzfs/spec/libzfs.spec
+++ b/usr/src/lib/libzfs/spec/libzfs.spec
@@ -1,13 +1,9 @@
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -23,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#ident "%Z%%M% %I% %E% SMI"
@@ -60,10 +56,6 @@ function zfs_get_type
version SUNWprivate_1.1
end
-function zfs_init
-version SUNWprivate_1.1
-end
-
function zfs_is_mounted
version SUNWprivate_1.1
end
@@ -232,6 +224,10 @@ function zpool_add
version SUNWprivate_1.1
end
+function zpool_clear
+version SUNWprivate_1.1
+end
+
function zpool_close
version SUNWprivate_1.1
end
@@ -260,6 +256,10 @@ function zpool_get_config
version SUNWprivate_1.1
end
+function zpool_get_errlog
+version SUNWprivate_1.1
+end
+
function zpool_get_guid
version SUNWprivate_1.1
end
@@ -347,3 +347,7 @@ end
function zpool_vdev_name
version SUNWprivate_1.1
end
+
+function zpool_vdev_to_guid
+version SUNWprivate_1.1
+end
diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c
index 83155b480f..01dafade29 100644
--- a/usr/src/lib/libzpool/common/kernel.c
+++ b/usr/src/lib/libzpool/common/kernel.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -43,6 +42,7 @@
uint64_t physmem;
vnode_t *rootdir = (vnode_t *)0xabcd1234;
+int modrootloaded = 0;
/*
* =========================================================================
@@ -562,6 +562,57 @@ cmn_err(int ce, const char *fmt, ...)
/*
* =========================================================================
+ * kobj interfaces
+ * =========================================================================
+ */
+struct _buf *
+kobj_open_file(char *name)
+{
+ struct _buf *file;
+ vnode_t *vp;
+
+ /* set vp as the _fd field of the file */
+ if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir) != 0)
+ return ((void *)-1UL);
+
+ file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
+ file->_fd = (intptr_t)vp;
+ return (file);
+}
+
+int
+kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
+{
+ ssize_t resid;
+
+ vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
+ UIO_SYSSPACE, 0, 0, 0, &resid);
+
+ return (0);
+}
+
+void
+kobj_close_file(struct _buf *file)
+{
+ vn_close((vnode_t *)file->_fd);
+ umem_free(file, sizeof (struct _buf));
+}
+
+int
+kobj_fstat(intptr_t fd, struct bootstat *bst)
+{
+ struct stat64 st;
+ vnode_t *vp = (vnode_t *)fd;
+ if (fstat64(vp->v_fd, &st) == -1) {
+ vn_close(vp);
+ return (errno);
+ }
+ bst->st_size = (uint64_t)st.st_size;
+ return (0);
+}
+
+/*
+ * =========================================================================
* misc routines
* =========================================================================
*/
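A brief sketch of the userland kobj shims above as a libzpool consumer might exercise them (the reader function is hypothetical; error handling beyond the open-failure sentinel is elided):

static int
read_prefix(char *name, char *buf, unsigned len)
{
	struct _buf *file;

	/* kobj_open_file() returns (void *)-1UL on failure */
	if ((file = kobj_open_file(name)) == (struct _buf *)-1)
		return (-1);

	(void) kobj_read_file(file, buf, len, 0);
	kobj_close_file(file);
	return (0);
}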
diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h
index 1ddfcbd8bd..e471a275da 100644
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -409,4 +408,21 @@ typedef struct callb_cpr {
}
#endif
+/* ZFS Boot Related stuff. */
+
+struct _buf {
+ intptr_t _fd;
+};
+
+struct bootstat {
+ uint64_t st_size;
+};
+
+extern struct _buf *kobj_open_file(char *name);
+extern int kobj_read_file(struct _buf *file, char *buf, unsigned size,
+ unsigned off);
+extern void kobj_close_file(struct _buf *file);
+extern int kobj_fstat(intptr_t, struct bootstat *);
+
+
#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/usr/src/lib/libzpool/common/util.c b/usr/src/lib/libzpool/common/util.c
index 28a6704702..2519539ed8 100644
--- a/usr/src/lib/libzpool/common/util.c
+++ b/usr/src/lib/libzpool/common/util.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,6 +33,7 @@
#include <stdlib.h>
#include <sys/spa.h>
#include <sys/fs/zfs.h>
+#include <sys/refcount.h>
/*
* Routines needed by more than one client of libzpool.
@@ -125,11 +125,11 @@ show_pool_stats(spa_t *spa)
nvlist_t *config = NULL;
nvlist_t *nvroot = NULL;
- spa_config_enter(spa, RW_READER);
- VERIFY(spa_get_stats(spa_name(spa), &config) == 0);
+ spa_config_enter(spa, RW_READER, FTAG);
+ VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0);
VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
show_vdev_stats(spa_name(spa), nvroot, 0);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
}
diff --git a/usr/src/pkgdefs/Makefile b/usr/src/pkgdefs/Makefile
index c6c918813f..85ea723dcc 100644
--- a/usr/src/pkgdefs/Makefile
+++ b/usr/src/pkgdefs/Makefile
@@ -233,6 +233,7 @@ COMMON_SUBDIRS= \
SUNWocfh \
SUNWocfr \
SUNWonfmes \
+ SUNWonzfs \
SUNWosdem \
SUNWypr \
SUNWypu \
diff --git a/usr/src/pkgdefs/SUNWfmd/prototype_com b/usr/src/pkgdefs/SUNWfmd/prototype_com
index 723d7f38ac..56f8277363 100644
--- a/usr/src/pkgdefs/SUNWfmd/prototype_com
+++ b/usr/src/pkgdefs/SUNWfmd/prototype_com
@@ -69,6 +69,8 @@ f none usr/lib/fm/fmd/plugins/snmp-trapgen.conf 644 root bin
f none usr/lib/fm/fmd/plugins/snmp-trapgen.so 555 root bin
f none usr/lib/fm/fmd/plugins/syslog-msgs.conf 644 root bin
f none usr/lib/fm/fmd/plugins/syslog-msgs.so 555 root bin
+f none usr/lib/fm/fmd/plugins/zfs-diagnosis.conf 644 root bin
+f none usr/lib/fm/fmd/plugins/zfs-diagnosis.so 555 root bin
d none usr/lib/fm/fmd/schemes 755 root bin
f none usr/lib/fm/fmd/schemes/cpu.so 555 root bin
f none usr/lib/fm/fmd/schemes/dev.so 555 root bin
@@ -78,6 +80,7 @@ f none usr/lib/fm/fmd/schemes/legacy-hc.so 555 root bin
f none usr/lib/fm/fmd/schemes/mem.so 555 root bin
f none usr/lib/fm/fmd/schemes/mod.so 555 root bin
f none usr/lib/fm/fmd/schemes/pkg.so 555 root bin
+f none usr/lib/fm/fmd/schemes/zfs.so 555 root bin
f none usr/lib/fm/libdiagcode.so.1 755 root bin
s none usr/lib/fm/libdiagcode.so=libdiagcode.so.1
f none usr/lib/fm/llib-ldiagcode 644 root bin
diff --git a/usr/src/pkgdefs/SUNWfmd/prototype_i386 b/usr/src/pkgdefs/SUNWfmd/prototype_i386
index c7012f3702..6af5cc1411 100644
--- a/usr/src/pkgdefs/SUNWfmd/prototype_i386
+++ b/usr/src/pkgdefs/SUNWfmd/prototype_i386
@@ -53,6 +53,7 @@ f none usr/lib/fm/fmd/schemes/amd64/legacy-hc.so 555 root bin
f none usr/lib/fm/fmd/schemes/amd64/mem.so 555 root bin
f none usr/lib/fm/fmd/schemes/amd64/mod.so 555 root bin
f none usr/lib/fm/fmd/schemes/amd64/pkg.so 555 root bin
+f none usr/lib/fm/fmd/schemes/amd64/zfs.so 555 root bin
f none usr/lib/fm/dict/AMD.dict 444 root bin
f none usr/lib/locale/C/LC_MESSAGES/AMD.mo 444 root bin
d none usr/platform 755 root sys
diff --git a/usr/src/pkgdefs/SUNWfmd/prototype_sparc b/usr/src/pkgdefs/SUNWfmd/prototype_sparc
index d094cbacea..b63ef0d4a3 100644
--- a/usr/src/pkgdefs/SUNWfmd/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWfmd/prototype_sparc
@@ -44,6 +44,7 @@ f none usr/lib/fm/fmd/schemes/sparcv9/legacy-hc.so 555 root bin
f none usr/lib/fm/fmd/schemes/sparcv9/mem.so 555 root bin
f none usr/lib/fm/fmd/schemes/sparcv9/mod.so 555 root bin
f none usr/lib/fm/fmd/schemes/sparcv9/pkg.so 555 root bin
+f none usr/lib/fm/fmd/schemes/sparcv9/zfs.so 555 root bin
d none usr/lib/fm/sparcv9 755 root bin
f none usr/lib/fm/sparcv9/libdiagcode.so.1 755 root bin
s none usr/lib/fm/sparcv9/libdiagcode.so=libdiagcode.so.1
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index 001dafc9e9..21c0e46b32 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -20,7 +20,7 @@
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -655,6 +655,8 @@ d none usr/include/sys/fm 755 root bin
d none usr/include/sys/fm/cpu 755 root bin
f none usr/include/sys/fm/protocol.h 644 root bin
f none usr/include/sys/fm/util.h 644 root bin
+d none usr/include/sys/fm/fs 755 root bin
+f none usr/include/sys/fm/fs/zfs.h 644 root bin
d none usr/include/sys/fm/io 755 root bin
f none usr/include/sys/fm/io/ddi.h 644 root bin
f none usr/include/sys/fm/io/pci.h 644 root bin
diff --git a/usr/src/pkgdefs/SUNWonzfs/Makefile b/usr/src/pkgdefs/SUNWonzfs/Makefile
new file mode 100644
index 0000000000..193c83ef69
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWonzfs/Makefile
@@ -0,0 +1,37 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+include ../Makefile.com
+
+DATAFILES += depend
+
+.KEEP_STATE:
+
+all: $(FILES)
+
+install: all pkg
+
+include ../Makefile.targ
diff --git a/usr/src/pkgdefs/SUNWonzfs/pkginfo.tmpl b/usr/src/pkgdefs/SUNWonzfs/pkginfo.tmpl
new file mode 100644
index 0000000000..31b6598557
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWonzfs/pkginfo.tmpl
@@ -0,0 +1,45 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# This required package information file describes characteristics of the
+# package, such as package abbreviation, full package name, package version,
+# and package architecture.
+#
+PKG="SUNWonzfs"
+NAME="ZFS unbundled utilities"
+ARCH="ISA"
+VERSION="ONVERS,REV=0.0.0"
+SUNW_PRODNAME="SunOS"
+SUNW_PRODVERS="RELEASE/VERSION"
+SUNW_PKGTYPE="usr"
+MAXINST="1000"
+CATEGORY="system"
+DESC="ZFS test commands"
+VENDOR="Sun Microsystems, Inc."
+HOTLINE="Please contact your local service provider"
+EMAIL=""
+CLASSES="none"
+BASEDIR=/
+SUNW_PKGVERS="1.0"
diff --git a/usr/src/pkgdefs/SUNWonzfs/prototype_com b/usr/src/pkgdefs/SUNWonzfs/prototype_com
new file mode 100644
index 0000000000..7a4bf243e9
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWonzfs/prototype_com
@@ -0,0 +1,36 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+i pkginfo
+i copyright
+i depend
+#
+# SUNWonzfs
+#
+d none usr 755 root sys
+d none usr/sbin 755 root bin
+l none usr/sbin/zinject=../../usr/lib/isaexec
+d none usr/bin 755 root bin
+l none usr/bin/ztest=../../usr/lib/isaexec
diff --git a/usr/src/pkgdefs/SUNWonzfs/prototype_i386 b/usr/src/pkgdefs/SUNWonzfs/prototype_i386
new file mode 100644
index 0000000000..d81f259bdd
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWonzfs/prototype_i386
@@ -0,0 +1,35 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+!include prototype_com
+#
+d none usr/sbin/amd64 755 root bin
+f none usr/sbin/amd64/zinject 555 root bin
+d none usr/sbin/i86 755 root bin
+f none usr/sbin/i86/zinject 555 root bin
+d none usr/bin/amd64 755 root bin
+f none usr/bin/amd64/ztest 555 root bin
+d none usr/bin/i86 755 root bin
+f none usr/bin/i86/ztest 555 root bin
diff --git a/usr/src/pkgdefs/SUNWonzfs/prototype_sparc b/usr/src/pkgdefs/SUNWonzfs/prototype_sparc
new file mode 100644
index 0000000000..dfef30ce02
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWonzfs/prototype_sparc
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+!include prototype_com
+#
+d none usr/sbin/sparcv9 755 root bin
+f none usr/sbin/sparcv9/zinject 555 root bin
+d none usr/bin/sparcv9 755 root bin
+f none usr/bin/sparcv9/ztest 555 root bin
diff --git a/usr/src/pkgdefs/SUNWzfsr/prototype_com b/usr/src/pkgdefs/SUNWzfsr/prototype_com
index 5720b47591..1adcf54648 100644
--- a/usr/src/pkgdefs/SUNWzfsr/prototype_com
+++ b/usr/src/pkgdefs/SUNWzfsr/prototype_com
@@ -43,6 +43,8 @@ d none etc/zfs 755 root sys
d none lib 755 root bin
s none lib/libzfs.so=libzfs.so.1
f none lib/libzfs.so.1 755 root bin
+f none lib/llib-lzfs.ln 644 root bin
+f none lib/llib-lzfs 644 root bin
d none sbin 755 root sys
f none sbin/zfs 555 root bin
f none sbin/zpool 555 root bin
diff --git a/usr/src/pkgdefs/SUNWzfsr/prototype_i386 b/usr/src/pkgdefs/SUNWzfsr/prototype_i386
index 3f75cdaf30..10900e272d 100644
--- a/usr/src/pkgdefs/SUNWzfsr/prototype_i386
+++ b/usr/src/pkgdefs/SUNWzfsr/prototype_i386
@@ -34,3 +34,4 @@
d none lib/amd64 755 root bin
s none lib/amd64/libzfs.so=libzfs.so.1
f none lib/amd64/libzfs.so.1 755 root bin
+f none lib/amd64/llib-lzfs.ln 644 root bin
diff --git a/usr/src/pkgdefs/SUNWzfsr/prototype_sparc b/usr/src/pkgdefs/SUNWzfsr/prototype_sparc
index b22d8ce0c9..3aa95e8747 100644
--- a/usr/src/pkgdefs/SUNWzfsr/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWzfsr/prototype_sparc
@@ -34,3 +34,4 @@
d none lib/sparcv9 755 root bin
s none lib/sparcv9/libzfs.so=libzfs.so.1
f none lib/sparcv9/libzfs.so.1 755 root bin
+f none lib/sparcv9/llib-lzfs.ln 644 root bin
diff --git a/usr/src/pkgdefs/SUNWzfsu/prototype_com b/usr/src/pkgdefs/SUNWzfsu/prototype_com
index 771903d2eb..7668414ed7 100644
--- a/usr/src/pkgdefs/SUNWzfsu/prototype_com
+++ b/usr/src/pkgdefs/SUNWzfsu/prototype_com
@@ -20,7 +20,7 @@
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -50,6 +50,8 @@ s none usr/lib/libzfs.so.1=../../lib/libzfs.so.1
s none usr/lib/libzfs.so=../../lib/libzfs.so.1
f none usr/lib/libzfs_jni.so.1 755 root bin
s none usr/lib/libzfs_jni.so=libzfs_jni.so.1
+s none usr/lib/llib-lzfs.ln=../../lib/llib-lzfs.ln
+s none usr/lib/llib-lzfs=../../lib/llib-lzfs
d none usr/lib/mdb 755 root sys
d none usr/lib/mdb/kvm 755 root sys
d none usr/lib/mdb/proc 755 root sys
diff --git a/usr/src/pkgdefs/SUNWzfsu/prototype_i386 b/usr/src/pkgdefs/SUNWzfsu/prototype_i386
index 988cb0b6fb..8b34a81045 100644
--- a/usr/src/pkgdefs/SUNWzfsu/prototype_i386
+++ b/usr/src/pkgdefs/SUNWzfsu/prototype_i386
@@ -20,7 +20,7 @@
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -38,6 +38,7 @@ f none usr/lib/amd64/libzfs_jni.so.1 755 root bin
s none usr/lib/amd64/libzfs_jni.so=libzfs_jni.so.1
f none usr/lib/amd64/libzpool.so.1 755 root bin
s none usr/lib/amd64/libzpool.so=libzpool.so.1
+s none usr/lib/amd64/llib-lzfs.ln=../../../lib/amd64/llib-lzfs.ln
f none usr/lib/libzpool.so.1 755 root bin
s none usr/lib/libzpool.so=libzpool.so.1
d none usr/lib/mdb/kvm/amd64 755 root sys
diff --git a/usr/src/pkgdefs/SUNWzfsu/prototype_sparc b/usr/src/pkgdefs/SUNWzfsu/prototype_sparc
index 57eb70b964..c757468cf9 100644
--- a/usr/src/pkgdefs/SUNWzfsu/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWzfsu/prototype_sparc
@@ -20,7 +20,7 @@
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -42,5 +42,6 @@ f none usr/lib/sparcv9/libzfs_jni.so.1 755 root bin
s none usr/lib/sparcv9/libzfs_jni.so=libzfs_jni.so.1
f none usr/lib/sparcv9/libzpool.so.1 755 root bin
s none usr/lib/sparcv9/libzpool.so=libzpool.so.1
+s none usr/lib/sparcv9/llib-lzfs.ln=../../../lib/sparcv9/llib-lzfs.ln
d none usr/sbin/sparcv9 755 root bin
f none usr/sbin/sparcv9/zdb 555 root bin
diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386
index 985e567363..5c0e1ae3f1 100644
--- a/usr/src/pkgdefs/etc/exception_list_i386
+++ b/usr/src/pkgdefs/etc/exception_list_i386
@@ -20,7 +20,7 @@
#
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -744,19 +744,10 @@ lib/amd64/libc_i18n.a i386
#
# ZFS internal tools and lint libraries
#
-lib/llib-lzfs.ln i386
-lib/llib-lzfs i386
-lib/amd64/llib-lzfs.ln i386
-usr/bin/ztest i386
-usr/bin/i86/ztest i386
-usr/bin/amd64/ztest i386
-usr/lib/llib-lzfs.ln i386
-usr/lib/llib-lzfs i386
usr/lib/llib-lzfs_jni i386
usr/lib/llib-lzfs_jni.ln i386
usr/lib/llib-lzpool i386
usr/lib/llib-lzpool.ln i386
-usr/lib/amd64/llib-lzfs.ln i386
usr/lib/amd64/llib-lzfs_jni.ln i386
usr/lib/amd64/llib-lzpool.ln i386
#
diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc
index ed42b8f184..25e896ccf0 100644
--- a/usr/src/pkgdefs/etc/exception_list_sparc
+++ b/usr/src/pkgdefs/etc/exception_list_sparc
@@ -1,13 +1,9 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -22,8 +18,14 @@
#
# CDDL HEADER END
#
+
+#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
# ident "%Z%%M% %I% %E% SMI"
#
+
# Exception List for protocmp
#
###########################################
@@ -804,17 +806,9 @@ usr/platform/SUNW,Sun-Fire-T200/lib/llib-lpcp.ln sparc
#
# ZFS internal tools and lint libraries
#
-lib/llib-lzfs sparc
-lib/llib-lzfs.ln sparc
-lib/sparcv9/llib-lzfs.ln sparc
-usr/bin/ztest sparc
-usr/bin/sparcv9/ztest sparc
-usr/lib/llib-lzfs sparc
-usr/lib/llib-lzfs.ln sparc
usr/lib/llib-lzfs_jni sparc
usr/lib/llib-lzfs_jni.ln sparc
usr/lib/llib-lzpool sparc
-usr/lib/sparcv9/llib-lzfs.ln sparc
usr/lib/sparcv9/llib-lzfs_jni.ln sparc
usr/lib/sparcv9/llib-lzpool.ln sparc
#
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index f2d155fd25..587e9e1535 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -864,6 +864,7 @@ ZFS_COMMON_OBJS += \
sha256.o \
spa.o \
spa_config.o \
+ spa_errlog.o \
spa_misc.o \
space_map.o \
txg.o \
@@ -882,10 +883,12 @@ ZFS_COMMON_OBJS += \
zap_leaf.o \
zap_micro.o \
zfs_byteswap.o \
+ zfs_fm.o \
zil.o \
zio.o \
zio_checksum.o \
- zio_compress.o
+ zio_compress.o \
+ zio_inject.o
ZFS_SHARED_OBJS += \
zfs_namecheck.o \
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index bd8a110990..904e746721 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -28,8 +28,8 @@
/*
 * DVA-based Adjustable Replacement Cache
*
- * While much of the theory of operation and algorithms used here
- * are based on the self-tuning, low overhead replacement cache
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
* presented by Megiddo and Modha at FAST 2003, there are some
* significant differences:
*
@@ -98,6 +98,15 @@
* must use: mutex_tryenter() to avoid deadlock. Also note that
* the "top" state mutex must be held before the "bot" state mutex.
*
+ * Arc buffers may have an associated eviction callback function.
+ * This function will be invoked prior to removing the buffer (e.g.
+ * in arc_do_user_evicts()). Note however that the data associated
+ * with the buffer may be evicted prior to the callback. The callback
+ * must be made with *no locks held* (to prevent deadlock). Additionally,
+ * the users of callbacks must ensure that their private data is
+ * protected from simultaneous callbacks from arc_buf_evict()
+ * and arc_do_user_evicts().
+ *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*/
@@ -136,10 +145,10 @@ static int arc_dead;
/*
* Note that buffers can be on one of 5 states:
* ARC_anon - anonymous (discussed below)
- * ARC_mru_top - recently used, currently cached
- * ARC_mru_bot - recentely used, no longer in cache
- * ARC_mfu_top - frequently used, currently cached
- * ARC_mfu_bot - frequently used, no longer in cache
+ * ARC_mru - recently used, currently cached
+ * ARC_mru_ghost - recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
* When there are no active references to the buffer, they
* are linked onto one of the lists in arc. These are the
* only buffers that can be evicted or deleted.
@@ -147,9 +156,9 @@ static int arc_dead;
* Anonymous buffers are buffers that are not associated with
* a DVA. These are buffers that hold dirty block copies
* before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru_top
+ * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed. Generally, they will acquire a DVA
- * as they are written and migrate onto the arc_mru_top list.
+ * as they are written and migrate onto the arc_mru list.
*/
typedef struct arc_state {
@@ -162,24 +171,22 @@ typedef struct arc_state {
/* The 5 states: */
static arc_state_t ARC_anon;
-static arc_state_t ARC_mru_top;
-static arc_state_t ARC_mru_bot;
-static arc_state_t ARC_mfu_top;
-static arc_state_t ARC_mfu_bot;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
static struct arc {
arc_state_t *anon;
- arc_state_t *mru_top;
- arc_state_t *mru_bot;
- arc_state_t *mfu_top;
- arc_state_t *mfu_bot;
+ arc_state_t *mru;
+ arc_state_t *mru_ghost;
+ arc_state_t *mfu;
+ arc_state_t *mfu_ghost;
uint64_t size; /* Actual total arc size */
- uint64_t p; /* Target size (in bytes) of mru_top */
+ uint64_t p; /* Target size (in bytes) of mru */
uint64_t c; /* Target size of cache (in bytes) */
uint64_t c_min; /* Minimum target cache size */
uint64_t c_max; /* Maximum target cache size */
- uint64_t incr; /* Size by which to increment arc.c */
- int64_t size_check;
/* performance stats */
uint64_t hits;
@@ -195,12 +202,6 @@ static struct arc {
int no_grow; /* Don't try to grow cache size */
} arc;
-/* Default amount to grow arc.incr */
-static int64_t arc_incr_size = 1024;
-
-/* > 0 ==> time to increment arc.c */
-static int64_t arc_size_check_default = -1000;
-
static uint64_t arc_tempreserve;
typedef struct arc_callback arc_callback_t;
@@ -227,6 +228,7 @@ struct arc_buf_hdr {
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
uint32_t b_flags;
+ uint32_t b_datacnt;
kcondvar_t b_cv;
arc_callback_t *b_acb;
@@ -242,6 +244,13 @@ struct arc_buf_hdr {
refcount_t b_refcnt;
};
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static void arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+
+#define GHOST_STATE(state) \
+ ((state) == arc.mru_ghost || (state) == arc.mfu_ghost)
+
/*
* Private ARC flags. These flags are private ARC only flags that will show up
* in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can
@@ -250,13 +259,17 @@ struct arc_buf_hdr {
* public flags, make sure not to smash the private ones.
*/
+#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
+#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
+#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
/*
* Hash table routines
@@ -353,6 +366,7 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
arc_buf_hdr_t *fbuf;
uint32_t max, i;
+ ASSERT(!HDR_IN_HASH_TABLE(buf));
fbufs_lastthread = curthread;
*lockp = hash_lock;
mutex_enter(hash_lock);
@@ -366,6 +380,7 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
buf->b_hash_next = buf_hash_table.ht_table[idx];
buf_hash_table.ht_table[idx] = buf;
+ buf->b_flags |= ARC_IN_HASH_TABLE;
/* collect some hash table performance data */
if (i > 0) {
@@ -391,6 +406,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+ ASSERT(HDR_IN_HASH_TABLE(buf));
bufp = &buf_hash_table.ht_table[idx];
while ((fbuf = *bufp) != buf) {
@@ -399,6 +415,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
}
*bufp = buf->b_hash_next;
buf->b_hash_next = NULL;
+ buf->b_flags &= ~ARC_IN_HASH_TABLE;
/* collect some hash table performance data */
atomic_add_64(&arc.hash_elements, -1);
@@ -456,6 +473,7 @@ hdr_dest(void *vbuf, void *unused)
cv_destroy(&buf->b_cv);
}
+static int arc_reclaim_needed(void);
void arc_kmem_reclaim(void);
/*
@@ -466,27 +484,33 @@ static void
hdr_recl(void *unused)
{
dprintf("hdr_recl called\n");
- arc_kmem_reclaim();
+ if (arc_reclaim_needed())
+ arc_kmem_reclaim();
}
static void
buf_init(void)
{
uint64_t *ct;
- uint64_t hsize = 1ULL << 10;
+ uint64_t hsize = 1ULL << 12;
int i, j;
/*
* The hash table is big enough to fill all of physical memory
- * with an average 4k block size. The table will take up
- * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
- * pointers).
+ * with an average 64K block size. The table will take up
+ * totalmem*sizeof(void*)/64K bytes (eg. 128KB/GB with 8-byte pointers).
*/
- while (hsize * 4096 < physmem * PAGESIZE)
+ while (hsize * 65536 < physmem * PAGESIZE)
hsize <<= 1;
-
+retry:
buf_hash_table.ht_mask = hsize - 1;
- buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+ buf_hash_table.ht_table =
+ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+ if (buf_hash_table.ht_table == NULL) {
+ ASSERT(hsize > (1ULL << 8));
+ hsize >>= 1;
+ goto retry;
+ }
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
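The buf_init() change above swaps a blocking hash-table allocation for a best-effort one. As a rough illustration of the size-then-retry pattern, here is a user-space sketch in which calloc stands in for kmem_zalloc(..., KM_NOSLEEP); the 64K average block size and the 1 << 8 floor come from the hunk, while the function name and arguments are purely illustrative:

	#include <assert.h>
	#include <stdint.h>
	#include <stdlib.h>

	/*
	 * Size a power-of-two table so that physmem / 64K average-sized
	 * buffers hash roughly one-to-one, then halve it on allocation
	 * failure rather than sleeping for memory.
	 */
	static void **
	alloc_buf_hash_sketch(uint64_t physmem_bytes, uint64_t *maskp)
	{
		uint64_t hsize = 1ULL << 12;
		void **table;

		while (hsize * 65536 < physmem_bytes)
			hsize <<= 1;
	retry:
		table = calloc(hsize, sizeof (void *));
		if (table == NULL) {
			assert(hsize > (1ULL << 8));
			hsize >>= 1;
			goto retry;
		}
		*maskp = hsize - 1;
		return (table);
	}

On a 4GB machine the loop settles at 65536 slots, i.e. 512KB of 8-byte pointers, which matches the 128KB-per-GB figure in the updated comment.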
@@ -505,8 +529,6 @@ buf_init(void)
#define ARC_MINTIME (hz>>4) /* 62 ms */
-#define ARC_TAG (void *)0x05201962
-
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
@@ -514,14 +536,21 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
(ab->b_state != arc.anon)) {
+ int delta = ab->b_size * ab->b_datacnt;
ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
mutex_enter(&ab->b_state->mtx);
- ASSERT(!refcount_is_zero(&ab->b_refcnt));
+ ASSERT(refcount_count(&ab->b_refcnt) > 0);
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(&ab->b_state->list, ab);
- ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
- ab->b_state->lsize -= ab->b_size;
+ if (GHOST_STATE(ab->b_state)) {
+ ASSERT3U(ab->b_datacnt, ==, 0);
+ ASSERT3P(ab->b_buf, ==, NULL);
+ delta = ab->b_size;
+ }
+ ASSERT(delta > 0);
+ ASSERT3U(ab->b_state->lsize, >=, delta);
+ atomic_add_64(&ab->b_state->lsize, -delta);
mutex_exit(&ab->b_state->mtx);
}
}
@@ -531,7 +560,8 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
int cnt;
- ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(ab->b_state));
if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
(ab->b_state != arc.anon)) {
@@ -540,8 +570,9 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
mutex_enter(&ab->b_state->mtx);
ASSERT(!list_link_active(&ab->b_arc_node));
list_insert_head(&ab->b_state->list, ab);
- ASSERT(ab->b_buf != NULL);
- ab->b_state->lsize += ab->b_size;
+ ASSERT(ab->b_datacnt > 0);
+ atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt);
+ ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize);
mutex_exit(&ab->b_state->mtx);
}
return (cnt);
@@ -552,49 +583,70 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
* for the buffer must be held by the caller.
*/
static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
- kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
- arc_buf_t *buf;
+ arc_state_t *old_state = ab->b_state;
+ int refcnt = refcount_count(&ab->b_refcnt);
+ int from_delta, to_delta;
ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(new_state != old_state);
+ ASSERT(refcnt == 0 || ab->b_datacnt > 0);
+ ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+
+ from_delta = to_delta = ab->b_datacnt * ab->b_size;
/*
* If this buffer is evictable, transfer it from the
* old state list to the new state list.
*/
- if (refcount_is_zero(&ab->b_refcnt)) {
- if (ab->b_state != arc.anon) {
- int drop_mutex = FALSE;
+ if (refcnt == 0) {
+ if (old_state != arc.anon) {
+ int use_mutex = !MUTEX_HELD(&old_state->mtx);
+
+ if (use_mutex)
+ mutex_enter(&old_state->mtx);
- if (!MUTEX_HELD(&ab->b_state->mtx)) {
- mutex_enter(&ab->b_state->mtx);
- drop_mutex = TRUE;
- }
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&ab->b_state->list, ab);
- ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
- ab->b_state->lsize -= ab->b_size;
- if (drop_mutex)
- mutex_exit(&ab->b_state->mtx);
+ list_remove(&old_state->list, ab);
+
+ /* ghost elements have a ghost size */
+ if (GHOST_STATE(old_state)) {
+ ASSERT(ab->b_datacnt == 0);
+ ASSERT(ab->b_buf == NULL);
+ from_delta = ab->b_size;
+ }
+ ASSERT3U(old_state->lsize, >=, from_delta);
+ atomic_add_64(&old_state->lsize, -from_delta);
+
+ if (use_mutex)
+ mutex_exit(&old_state->mtx);
}
if (new_state != arc.anon) {
- int drop_mutex = FALSE;
+ int use_mutex = !MUTEX_HELD(&new_state->mtx);
- if (!MUTEX_HELD(&new_state->mtx)) {
+ if (use_mutex)
mutex_enter(&new_state->mtx);
- drop_mutex = TRUE;
- }
+
list_insert_head(&new_state->list, ab);
- ASSERT(ab->b_buf != NULL);
- new_state->lsize += ab->b_size;
- if (drop_mutex)
+
+ /* ghost elements have a ghost size */
+ if (GHOST_STATE(new_state)) {
+ ASSERT(ab->b_datacnt == 0);
+ ASSERT(ab->b_buf == NULL);
+ to_delta = ab->b_size;
+ }
+ atomic_add_64(&new_state->lsize, to_delta);
+ ASSERT3U(new_state->size + to_delta, >=,
+ new_state->lsize);
+
+ if (use_mutex)
mutex_exit(&new_state->mtx);
}
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc.anon && ab->b_state != arc.anon) {
+ if (new_state == arc.anon && old_state != arc.anon) {
buf_hash_remove(ab);
}
@@ -602,22 +654,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
* If this buffer isn't being transferred to the MRU-top
* state, it's safe to clear its prefetch flag
*/
- if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
+ if ((new_state != arc.mru) && (new_state != arc.mru_ghost)) {
ab->b_flags &= ~ARC_PREFETCH;
}
- buf = ab->b_buf;
- if (buf == NULL) {
- ASSERT3U(ab->b_state->size, >=, ab->b_size);
- atomic_add_64(&ab->b_state->size, -ab->b_size);
- /* we should only be here if we are deleting state */
- ASSERT(new_state == arc.anon &&
- (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
- } else while (buf) {
- ASSERT3U(ab->b_state->size, >=, ab->b_size);
- atomic_add_64(&ab->b_state->size, -ab->b_size);
- atomic_add_64(&new_state->size, ab->b_size);
- buf = buf->b_next;
+ /* adjust state sizes */
+ if (to_delta)
+ atomic_add_64(&new_state->size, to_delta);
+ if (from_delta) {
+ ASSERT3U(old_state->size, >=, from_delta);
+ atomic_add_64(&old_state->size, -from_delta);
}
ab->b_state = new_state;
}
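The from_delta/to_delta handling above is the core of the ghost-list accounting: ghost states (arc.mru_ghost, arc.mfu_ghost) keep headers whose data is gone, so they are charged b_size exactly once, while resident states are charged b_size for every attached data copy (b_datacnt). A toy model of that bookkeeping, with illustrative names and the anon special case left out:

	#include <stdint.h>

	typedef struct state_sketch {
		uint64_t size;		/* all bytes charged to this state */
		uint64_t lsize;		/* evictable (unreferenced) bytes */
		int	 ghost;		/* headers only, no data */
	} state_sketch_t;

	static uint64_t
	hdr_charge(const state_sketch_t *s, uint64_t b_size, uint32_t b_datacnt)
	{
		return (s->ghost ? b_size : b_size * (uint64_t)b_datacnt);
	}

	/* refcnt == 0 means the header sits on the (evictable) state list. */
	static void
	change_state_sketch(state_sketch_t *from, state_sketch_t *to,
	    uint64_t b_size, uint32_t b_datacnt, int refcnt)
	{
		uint64_t from_delta = hdr_charge(from, b_size, b_datacnt);
		uint64_t to_delta = hdr_charge(to, b_size, b_datacnt);

		if (refcnt == 0) {
			from->lsize -= from_delta;
			to->lsize += to_delta;
		}
		from->size -= from_delta;
		to->size += to_delta;
	}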
@@ -637,9 +683,12 @@ arc_buf_alloc(spa_t *spa, int size, void *tag)
hdr->b_arc_access = 0;
buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
buf->b_hdr = hdr;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
buf->b_next = NULL;
buf->b_data = zio_buf_alloc(size);
hdr->b_buf = buf;
+ hdr->b_datacnt = 1;
hdr->b_flags = 0;
ASSERT(refcount_is_zero(&hdr->b_refcnt));
(void) refcount_add(&hdr->b_refcnt, tag);
@@ -650,35 +699,124 @@ arc_buf_alloc(spa_t *spa, int size, void *tag)
return (buf);
}
+static void *
+arc_data_copy(arc_buf_hdr_t *hdr, void *old_data)
+{
+ void *new_data = zio_buf_alloc(hdr->b_size);
+
+ atomic_add_64(&arc.size, hdr->b_size);
+ bcopy(old_data, new_data, hdr->b_size);
+ atomic_add_64(&hdr->b_state->size, hdr->b_size);
+ if (list_link_active(&hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ atomic_add_64(&hdr->b_state->lsize, hdr->b_size);
+ }
+ return (new_data);
+}
+
+void
+arc_buf_add_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (buf->b_data == NULL) {
+ /*
+ * This buffer is evicted.
+ */
+ mutex_exit(&arc_eviction_mtx);
+ return;
+ } else {
+ /*
+ * Prevent this buffer from being evicted
+ * while we add a reference.
+ */
+ buf->b_hdr = NULL;
+ }
+ mutex_exit(&arc_eviction_mtx);
+
+ ASSERT(hdr->b_state != arc.anon);
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ ASSERT(!GHOST_STATE(hdr->b_state));
+ buf->b_hdr = hdr;
+ add_reference(hdr, hash_lock, tag);
+ arc_access_and_exit(hdr, hash_lock);
+ atomic_add_64(&arc.hits, 1);
+}
+
+static void
+arc_buf_destroy(arc_buf_t *buf, boolean_t all)
+{
+ arc_buf_t **bufp;
+
+ /* free up data associated with the buf */
+ if (buf->b_data) {
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+
+ zio_buf_free(buf->b_data, size);
+ atomic_add_64(&arc.size, -size);
+ if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
+ ASSERT(state != arc.anon);
+ ASSERT3U(state->lsize, >=, size);
+ atomic_add_64(&state->lsize, -size);
+ }
+ ASSERT3U(state->size, >=, size);
+ atomic_add_64(&state->size, -size);
+ buf->b_data = NULL;
+ ASSERT(buf->b_hdr->b_datacnt > 0);
+ buf->b_hdr->b_datacnt -= 1;
+ }
+
+ /* only remove the buf if requested */
+ if (!all)
+ return;
+
+ /* remove the buf from the hdr list */
+ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+ continue;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_efunc == NULL);
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
static void
-arc_hdr_free(arc_buf_hdr_t *hdr)
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc.anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
if (!BUF_EMPTY(hdr)) {
- /*
- * We can be called with an arc state lock held,
- * so we can't hold a hash lock here.
- * ASSERT(not in hash table)
- */
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
}
- if (hdr->b_buf) {
+ while (hdr->b_buf) {
arc_buf_t *buf = hdr->b_buf;
- ASSERT3U(hdr->b_size, >, 0);
- zio_buf_free(buf->b_data, hdr->b_size);
- atomic_add_64(&arc.size, -hdr->b_size);
- ASSERT3U(arc.anon->size, >=, hdr->b_size);
- atomic_add_64(&arc.anon->size, -hdr->b_size);
- ASSERT3P(buf->b_next, ==, NULL);
- kmem_cache_free(buf_cache, buf);
- hdr->b_buf = NULL;
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ ASSERT(buf->b_hdr != NULL);
+ arc_buf_destroy(hdr->b_buf, FALSE);
+ hdr->b_buf = buf->b_next;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(hdr->b_buf, TRUE);
+ }
}
+
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
ASSERT3P(hdr->b_acb, ==, NULL);
@@ -689,36 +827,73 @@ void
arc_buf_free(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- int freeable;
+ int hashed = hdr->b_state != arc.anon;
- mutex_enter(hash_lock);
- if (remove_reference(hdr, hash_lock, tag) > 0) {
- arc_buf_t **bufp = &hdr->b_buf;
- arc_state_t *state = hdr->b_state;
- uint64_t size = hdr->b_size;
-
- ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
- while (*bufp != buf) {
- ASSERT(*bufp);
- bufp = &(*bufp)->b_next;
- }
- *bufp = buf->b_next;
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(buf->b_data != NULL);
+
+ if (hashed) {
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ mutex_enter(hash_lock);
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1)
+ arc_buf_destroy(buf, TRUE);
+ else
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
mutex_exit(hash_lock);
- zio_buf_free(buf->b_data, size);
- atomic_add_64(&arc.size, -size);
- kmem_cache_free(buf_cache, buf);
- ASSERT3U(state->size, >=, size);
- atomic_add_64(&state->size, -size);
- return;
+ } else if (HDR_IO_IN_PROGRESS(hdr)) {
+ int destroy_hdr;
+ /*
+ * We are in the middle of an async write. Don't destroy
+ * this buffer unless the write completes before we finish
+ * decrementing the reference count.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ (void) remove_reference(hdr, NULL, tag);
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ if (remove_reference(hdr, NULL, tag) > 0) {
+ ASSERT(HDR_IO_ERROR(hdr));
+ arc_buf_destroy(buf, TRUE);
+ } else {
+ arc_hdr_destroy(hdr);
+ }
}
+}
- /* don't free buffers that are in the middle of an async write */
- freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
- mutex_exit(hash_lock);
+int
+arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ int no_callback = (buf->b_efunc == NULL);
- if (freeable)
- arc_hdr_free(hdr);
+ if (hdr->b_state == arc.anon) {
+ arc_buf_free(buf, tag);
+ return (no_callback);
+ }
+
+ mutex_enter(hash_lock);
+ ASSERT(hdr->b_state != arc.anon);
+ ASSERT(buf->b_data != NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1) {
+ if (no_callback)
+ arc_buf_destroy(buf, TRUE);
+ } else if (no_callback) {
+ ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
+ ASSERT(no_callback || hdr->b_datacnt > 1 ||
+ refcount_is_zero(&hdr->b_refcnt));
+ mutex_exit(hash_lock);
+ return (no_callback);
}
int
@@ -732,19 +907,16 @@ arc_buf_size(arc_buf_t *buf)
* bytes. Move the removed buffers to the appropriate evict state.
*/
static uint64_t
-arc_evict_state(arc_state_t *state, int64_t bytes)
+arc_evict(arc_state_t *state, int64_t bytes)
{
arc_state_t *evicted_state;
- uint64_t bytes_evicted = 0;
+ uint64_t bytes_evicted = 0, skipped = 0;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
- ASSERT(state == arc.mru_top || state == arc.mfu_top);
+ ASSERT(state == arc.mru || state == arc.mfu);
- if (state == arc.mru_top)
- evicted_state = arc.mru_bot;
- else
- evicted_state = arc.mfu_bot;
+ evicted_state = (state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
mutex_enter(&state->mtx);
mutex_enter(&evicted_state->mtx);
@@ -754,19 +926,42 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
hash_lock = HDR_LOCK(ab);
if (mutex_tryenter(hash_lock)) {
ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ ASSERT(ab->b_datacnt > 0);
+ while (ab->b_buf) {
+ arc_buf_t *buf = ab->b_buf;
+ if (buf->b_data)
+ bytes_evicted += ab->b_size;
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ /*
+ * arc_buf_add_ref() could derail
+ * this eviction.
+ */
+ if (buf->b_hdr == NULL) {
+ mutex_exit(&arc_eviction_mtx);
+ mutex_exit(hash_lock);
+ goto skip;
+ }
+ arc_buf_destroy(buf, FALSE);
+ ab->b_buf = buf->b_next;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(buf, TRUE);
+ }
+ }
+ ASSERT(ab->b_datacnt == 0);
arc_change_state(evicted_state, ab, hash_lock);
- zio_buf_free(ab->b_buf->b_data, ab->b_size);
- atomic_add_64(&arc.size, -ab->b_size);
- ASSERT3P(ab->b_buf->b_next, ==, NULL);
- kmem_cache_free(buf_cache, ab->b_buf);
- ab->b_buf = NULL;
+ ASSERT(HDR_IN_HASH_TABLE(ab));
+ ab->b_flags = ARC_IN_HASH_TABLE;
DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
- bytes_evicted += ab->b_size;
mutex_exit(hash_lock);
- if (bytes_evicted >= bytes)
+ if (bytes >= 0 && bytes_evicted >= bytes)
break;
} else {
- atomic_add_64(&arc.skipped, 1);
+skip:
+ skipped += 1;
}
}
mutex_exit(&evicted_state->mtx);
@@ -776,6 +971,9 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
dprintf("only evicted %lld bytes from %x",
(longlong_t)bytes_evicted, state);
+ atomic_add_64(&arc.skipped, skipped);
+ if (bytes < 0)
+ return (skipped);
return (bytes_evicted);
}
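Note the changed contract encoded in the two new return paths above: a negative byte count now means "evict everything evictable", and in that case the function returns the number of buffers it had to skip so the caller can retry. A compact sketch of that convention, where the locked flag stands in for a failed mutex_tryenter() or an eviction derailed by arc_buf_add_ref(), and all names are illustrative:

	#include <stddef.h>
	#include <stdint.h>

	typedef struct node {
		struct node *prev;
		uint64_t size;
		int locked;
	} node_t;

	/* Walk from the LRU tail; evict until the byte target is met. */
	static uint64_t
	evict_sketch(node_t *tail, int64_t bytes)
	{
		uint64_t evicted = 0, skipped = 0;
		node_t *n, *prev;

		for (n = tail; n != NULL; n = prev) {
			prev = n->prev;
			if (n->locked) {
				skipped++;
				continue;
			}
			evicted += n->size;	/* data freed, header -> ghost */
			if (bytes >= 0 && evicted >= (uint64_t)bytes)
				break;
		}
		return (bytes < 0 ? skipped : evicted);
	}

arc_flush() below relies on exactly this: it loops while (arc_evict(arc.mru, -1)); until no buffer is left behind.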
@@ -784,25 +982,27 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
* bytes. Destroy the buffers that are removed.
*/
static void
-arc_delete_state(arc_state_t *state, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, int64_t bytes)
{
- uint_t bufs_skipped = 0;
- uint64_t bytes_deleted = 0;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
+ uint64_t bytes_deleted = 0;
+ uint_t bufs_skipped = 0;
+ ASSERT(GHOST_STATE(state));
top:
mutex_enter(&state->mtx);
for (ab = list_tail(&state->list); ab; ab = ab_prev) {
ab_prev = list_prev(&state->list, ab);
hash_lock = HDR_LOCK(ab);
if (mutex_tryenter(hash_lock)) {
+ ASSERT(ab->b_buf == NULL);
arc_change_state(arc.anon, ab, hash_lock);
mutex_exit(hash_lock);
atomic_add_64(&arc.deleted, 1);
- DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
bytes_deleted += ab->b_size;
- arc_hdr_free(ab);
+ arc_hdr_destroy(ab);
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
} else {
@@ -832,41 +1032,62 @@ arc_adjust(void)
{
int64_t top_sz, mru_over, arc_over;
- top_sz = arc.anon->size + arc.mru_top->size;
+ top_sz = arc.anon->size + arc.mru->size;
- if (top_sz > arc.p && arc.mru_top->lsize > 0) {
- int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
- (void) arc_evict_state(arc.mru_top, toevict);
- top_sz = arc.anon->size + arc.mru_top->size;
+ if (top_sz > arc.p && arc.mru->lsize > 0) {
+ int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
+ (void) arc_evict(arc.mru, toevict);
+ top_sz = arc.anon->size + arc.mru->size;
}
- mru_over = top_sz + arc.mru_bot->size - arc.c;
+ mru_over = top_sz + arc.mru_ghost->size - arc.c;
if (mru_over > 0) {
- if (arc.mru_bot->lsize > 0) {
- int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
- arc_delete_state(arc.mru_bot, todelete);
+ if (arc.mru_ghost->lsize > 0) {
+ int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over);
+ arc_evict_ghost(arc.mru_ghost, todelete);
}
}
if ((arc_over = arc.size - arc.c) > 0) {
- int64_t table_over;
+ int64_t tbl_over;
- if (arc.mfu_top->lsize > 0) {
- int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
- (void) arc_evict_state(arc.mfu_top, toevict);
+ if (arc.mfu->lsize > 0) {
+ int64_t toevict = MIN(arc.mfu->lsize, arc_over);
+ (void) arc_evict(arc.mfu, toevict);
}
- table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
- - arc.c*2;
+ tbl_over = arc.size + arc.mru_ghost->lsize +
+ arc.mfu_ghost->lsize - arc.c*2;
- if (table_over > 0 && arc.mfu_bot->lsize > 0) {
- int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
- arc_delete_state(arc.mfu_bot, todelete);
+ if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) {
+ int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over);
+ arc_evict_ghost(arc.mfu_ghost, todelete);
}
}
}
+static void
+arc_do_user_evicts(void)
+{
+ mutex_enter(&arc_eviction_mtx);
+ while (arc_eviction_list != NULL) {
+ arc_buf_t *buf = arc_eviction_list;
+ arc_eviction_list = buf->b_next;
+ buf->b_hdr = NULL;
+ mutex_exit(&arc_eviction_mtx);
+
+ ASSERT(buf->b_efunc != NULL);
+ VERIFY(buf->b_efunc(buf) == 0);
+
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ kmem_cache_free(buf_cache, buf);
+ mutex_enter(&arc_eviction_mtx);
+ }
+ mutex_exit(&arc_eviction_mtx);
+}
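Because b_efunc callbacks must not run while ARC locks are held, buffers that carry one are parked on arc_eviction_list during eviction and drained here (and from arc_flush and the reclaim thread). A self-contained sketch of that hand-off, with a pthread mutex standing in for kmutex_t and all names illustrative:

	#include <assert.h>
	#include <pthread.h>
	#include <stdlib.h>

	typedef struct ebuf {
		struct ebuf *next;
		int (*efunc)(struct ebuf *);	/* user eviction callback */
	} ebuf_t;

	static ebuf_t *eviction_list;
	static pthread_mutex_t eviction_mtx = PTHREAD_MUTEX_INITIALIZER;

	/* Called with cache locks held: just park the buffer. */
	static void
	defer_evict(ebuf_t *b)
	{
		pthread_mutex_lock(&eviction_mtx);
		b->next = eviction_list;
		eviction_list = b;
		pthread_mutex_unlock(&eviction_mtx);
	}

	/* Called with no cache locks held: drain the list, run callbacks. */
	static void
	do_user_evicts_sketch(void)
	{
		pthread_mutex_lock(&eviction_mtx);
		while (eviction_list != NULL) {
			ebuf_t *b = eviction_list;
			eviction_list = b->next;
			pthread_mutex_unlock(&eviction_mtx);

			assert(b->efunc != NULL);
			(void) b->efunc(b);
			free(b);

			pthread_mutex_lock(&eviction_mtx);
		}
		pthread_mutex_unlock(&eviction_mtx);
	}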
+
/*
* Flush all *evictable* data from the cache.
* NOTE: this will not touch "active" (i.e. referenced) data.
@@ -874,17 +1095,22 @@ arc_adjust(void)
void
arc_flush(void)
{
- arc_delete_state(arc.mru_top, -1);
- arc_delete_state(arc.mfu_top, -1);
+ while (arc_evict(arc.mru, -1));
+ while (arc_evict(arc.mfu, -1));
- arc_delete_state(arc.mru_bot, -1);
- arc_delete_state(arc.mfu_bot, -1);
+ arc_evict_ghost(arc.mru_ghost, -1);
+ arc_evict_ghost(arc.mfu_ghost, -1);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_do_user_evicts();
+ mutex_exit(&arc_reclaim_thr_lock);
+ ASSERT(arc_eviction_list == NULL);
}
void
arc_kmem_reclaim(void)
{
- /* Remove 6.25% */
+ /* Remove 12.5% */
/*
* We need arc_reclaim_lock because we don't want multiple
* threads trying to reclaim concurrently.
@@ -898,19 +1124,23 @@ arc_kmem_reclaim(void)
if (arc_dead)
return;
+ if (arc.c <= arc.c_min)
+ return;
+
mutex_enter(&arc_reclaim_lock);
- atomic_add_64(&arc.c, -(arc.c >> 4));
+ atomic_add_64(&arc.c, -(arc.c >> 3));
+ atomic_add_64(&arc.p, -(arc.p >> 3));
+ if (arc.c > arc.size)
+ arc.c = arc.size;
if (arc.c < arc.c_min)
arc.c = arc.c_min;
- atomic_add_64(&arc.p, -(arc.p >> 4));
+ if (arc.p > arc.c)
+ arc.p = (arc.c >> 1);
+ ASSERT((int64_t)arc.p >= 0);
arc_adjust();
- /* Cool it for a while */
- arc.incr = 0;
- arc.size_check = arc_size_check_default << 3;
-
mutex_exit(&arc_reclaim_lock);
}
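The reclaim now trims 12.5% per call (up from 6.25%) and clamps both targets instead of using the old incr/size_check cool-down. A small standalone sketch of just the clamping arithmetic, without the locking and with illustrative names:

	#include <stdint.h>

	/* Shrink the target c (and p) by 1/8, clamped to [c_min, size]. */
	static void
	kmem_reclaim_sketch(uint64_t *c, uint64_t *p, uint64_t size,
	    uint64_t c_min)
	{
		if (*c <= c_min)
			return;
		*c -= *c >> 3;
		*p -= *p >> 3;
		if (*c > size)
			*c = size;
		if (*c < c_min)
			*c = c_min;
		if (*p > *c)
			*p = *c >> 1;
	}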
@@ -985,16 +1215,11 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
/*
- * an agressive reclamation will shrink the cache size as well as reap
- * free kmem buffers. The arc_kmem_reclaim function is called when the
- * header-cache is reaped, so we only reap the header cache if we're
- * performing an agressive reclaim. If we're not, just clean the kmem
- * buffer caches.
+ * An aggressive reclamation will shrink the cache size as well as
+ * reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
- kmem_cache_reap_now(hdr_cache);
-
- kmem_cache_reap_now(buf_cache);
+ arc_kmem_reclaim();
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
@@ -1002,6 +1227,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
kmem_cache_reap_now(zio_buf_cache[i]);
}
}
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_cache);
}
static void
@@ -1038,6 +1265,9 @@ arc_reclaim_thread(void)
arc.no_grow = FALSE;
}
+ if (arc_eviction_list != NULL)
+ arc_do_user_evicts();
+
/* block until needed, or one second, whichever is shorter */
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&arc_reclaim_thr_cv,
@@ -1051,14 +1281,37 @@ arc_reclaim_thread(void)
thread_exit();
}
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
static void
-arc_try_grow(int64_t bytes)
+arc_adapt(int bytes, arc_state_t *state)
{
+ int mult;
+
+ ASSERT(bytes > 0);
/*
- * If we're within (2 * maxblocksize) bytes of the target
- * cache size, increment the target cache size
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
*/
- atomic_add_64((uint64_t *)&arc.size_check, 1);
+ if (state == arc.mru_ghost) {
+ mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ?
+ 1 : (arc.mfu_ghost->size/arc.mru_ghost->size));
+
+ arc.p = MIN(arc.c, arc.p + bytes * mult);
+ } else if (state == arc.mfu_ghost) {
+ mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ?
+ 1 : (arc.mru_ghost->size/arc.mfu_ghost->size));
+
+ arc.p = MAX(0, (int64_t)arc.p - bytes * mult);
+ }
+ ASSERT((int64_t)arc.p >= 0);
if (arc_reclaim_needed()) {
cv_signal(&arc_reclaim_thr_cv);
@@ -1068,52 +1321,36 @@ arc_try_grow(int64_t bytes)
if (arc.no_grow)
return;
+ if (arc.c >= arc.c_max)
+ return;
+
/*
- * return true if we successfully grow, or if there's enough space that
- * we don't have to grow. Above, we return false if we can't grow, or
- * if we shouldn't because a reclaim is in progress.
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
*/
- if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
- if (arc.size_check > 0) {
- arc.size_check = arc_size_check_default;
- atomic_add_64(&arc.incr, arc_incr_size);
- }
- atomic_add_64(&arc.c, MIN(bytes, arc.incr));
+ if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ atomic_add_64(&arc.c, (int64_t)bytes);
if (arc.c > arc.c_max)
arc.c = arc.c_max;
- else
- atomic_add_64(&arc.p, MIN(bytes, arc.incr));
- } else if (arc.size > arc.c) {
- if (arc.size_check > 0) {
- arc.size_check = arc_size_check_default;
- atomic_add_64(&arc.incr, arc_incr_size);
- }
- atomic_add_64(&arc.c, MIN(bytes, arc.incr));
- if (arc.c > arc.c_max)
- arc.c = arc.c_max;
- else
- atomic_add_64(&arc.p, MIN(bytes, arc.incr));
+ else if (state == arc.anon)
+ atomic_add_64(&arc.p, (int64_t)bytes);
+ if (arc.p > arc.c)
+ arc.p = arc.c;
}
+ ASSERT((int64_t)arc.p >= 0);
}
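arc_adapt() carries the ARC adaptation rule that used to be spread across the access paths: a hit in one ghost list moves the target p toward that side, scaled by the ratio of the two ghost list sizes. A standalone sketch of the rule (illustrative names; the ghost list that was hit is non-empty by construction, so the division is safe):

	#include <stdint.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	/* ghost_hit > 0: found in MRU ghost (grow p); else MFU ghost. */
	static uint64_t
	adapt_p_sketch(uint64_t p, uint64_t c, uint64_t bytes, int ghost_hit,
	    uint64_t mru_ghost_size, uint64_t mfu_ghost_size)
	{
		uint64_t mult;

		if (ghost_hit > 0) {
			mult = (mru_ghost_size >= mfu_ghost_size) ?
			    1 : mfu_ghost_size / mru_ghost_size;
			p = MIN(c, p + bytes * mult);
		} else {
			mult = (mfu_ghost_size >= mru_ghost_size) ?
			    1 : mru_ghost_size / mfu_ghost_size;
			p = (p > bytes * mult) ? p - bytes * mult : 0;
		}
		return (p);
	}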
/*
- * check if the cache has reached its limits and eviction is required prior to
- * insert. In this situation, we want to evict if no_grow is set Otherwise, the
- * cache is either big enough that we can insert, or a arc_try_grow will result
- * in more space being made available.
+ * Check if the cache has reached its limits and eviction is required
+ * prior to insert.
*/
-
static int
arc_evict_needed()
{
-
if (arc_reclaim_needed())
return (1);
- if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
- return (1);
-
- return (0);
+ return (arc.size > arc.c);
}
/*
@@ -1121,21 +1358,21 @@ arc_evict_needed()
* inserted on its behalf. So, determine which cache must be victimized to
* satisfy an insertion for this state. We have the following cases:
*
- * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
+ * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) ->
* In this situation if we're out of space, but the resident size of the MFU is
* under the limit, victimize the MFU cache to satisfy this insertion request.
*
- * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
+ * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) ->
* Here, we've used up all of the available space for the MRU, so we need to
* evict from our own cache instead. Evict from the set of resident MRU
* entries.
*
- * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
+ * 3. Insert for MFU (c - p) > sizeof(arc.mfu) ->
* c minus p represents the MFU space in the cache, since p is the size of the
* cache that is dedicated to the MRU. In this situation there's still space on
* the MFU side, so the MRU side needs to be victimized.
*
- * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
+ * 4. Insert for MFU (c - p) < sizeof(arc.mfu) ->
* MFU's resident set is consuming more space than it has been allotted. In
* this situation, we must victimize our own cache, the MFU, for this insertion.
*/
@@ -1146,35 +1383,35 @@ arc_evict_for_state(arc_state_t *state, uint64_t bytes)
uint64_t mfu_space;
uint64_t evicted;
- ASSERT(state == arc.mru_top || state == arc.mfu_top);
+ ASSERT(state == arc.mru || state == arc.mfu);
- if (state == arc.mru_top) {
- mru_used = arc.anon->size + arc.mru_top->size;
+ if (state == arc.mru) {
+ mru_used = arc.anon->size + arc.mru->size;
if (arc.p > mru_used) {
/* case 1 */
- evicted = arc_evict_state(arc.mfu_top, bytes);
+ evicted = arc_evict(arc.mfu, bytes);
if (evicted < bytes) {
arc_adjust();
}
} else {
/* case 2 */
- evicted = arc_evict_state(arc.mru_top, bytes);
+ evicted = arc_evict(arc.mru, bytes);
if (evicted < bytes) {
arc_adjust();
}
}
} else {
- /* MFU_top case */
+ /* MFU case */
mfu_space = arc.c - arc.p;
- if (mfu_space > arc.mfu_top->size) {
+ if (mfu_space > arc.mfu->size) {
/* case 3 */
- evicted = arc_evict_state(arc.mru_top, bytes);
+ evicted = arc_evict(arc.mru, bytes);
if (evicted < bytes) {
arc_adjust();
}
} else {
/* case 4 */
- evicted = arc_evict_state(arc.mfu_top, bytes);
+ evicted = arc_evict(arc.mfu, bytes);
if (evicted < bytes) {
arc_adjust();
}
@@ -1184,11 +1421,13 @@ arc_evict_for_state(arc_state_t *state, uint64_t bytes)
/*
* This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
*/
static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
- int blksz, mult;
+ arc_state_t *evict_state = NULL;
+ int blksz;
ASSERT(MUTEX_HELD(hash_lock));
@@ -1201,27 +1440,16 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* to the MRU state.
*/
- arc_try_grow(blksz);
- if (arc_evict_needed()) {
- arc_evict_for_state(arc.mru_top, blksz);
- }
+ arc_adapt(blksz, arc.anon);
+ if (arc_evict_needed())
+ evict_state = arc.mru;
ASSERT(buf->b_arc_access == 0);
buf->b_arc_access = lbolt;
- DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
- buf);
- arc_change_state(arc.mru_top, buf, hash_lock);
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mru, buf, hash_lock);
- /*
- * If we are using less than 2/3 of our total target
- * cache size, bump up the target size for the MRU
- * list.
- */
- if (arc.size < arc.c*2/3) {
- arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
- }
-
- } else if (buf->b_state == arc.mru_top) {
+ } else if (buf->b_state == arc.mru) {
/*
* If this buffer is in the MRU-top state and has the prefetch
* flag, the first read was actually part of a prefetch. In
@@ -1230,7 +1458,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
*/
if ((buf->b_flags & ARC_PREFETCH) != 0) {
buf->b_flags &= ~ARC_PREFETCH;
- atomic_add_64(&arc.mru_top->hits, 1);
+ atomic_add_64(&arc.mru->hits, 1);
+ mutex_exit(hash_lock);
return;
}
@@ -1246,12 +1475,11 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* most frequently used state.
*/
buf->b_arc_access = lbolt;
- DTRACE_PROBE1(new_state__mfu_top,
- arc_buf_hdr_t *, buf);
- arc_change_state(arc.mfu_top, buf, hash_lock);
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mfu, buf, hash_lock);
}
- atomic_add_64(&arc.mru_top->hits, 1);
- } else if (buf->b_state == arc.mru_bot) {
+ atomic_add_64(&arc.mru->hits, 1);
+ } else if (buf->b_state == arc.mru_ghost) {
arc_state_t *new_state;
/*
* This buffer has been "accessed" recently, but
@@ -1260,30 +1488,23 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
*/
if (buf->b_flags & ARC_PREFETCH) {
- new_state = arc.mru_top;
- DTRACE_PROBE1(new_state__mru_top,
- arc_buf_hdr_t *, buf);
+ new_state = arc.mru;
+ buf->b_flags &= ~ARC_PREFETCH;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
} else {
- new_state = arc.mfu_top;
- DTRACE_PROBE1(new_state__mfu_top,
- arc_buf_hdr_t *, buf);
- }
-
- arc_try_grow(blksz);
- if (arc_evict_needed()) {
- arc_evict_for_state(new_state, blksz);
+ new_state = arc.mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
}
- /* Bump up the target size of the MRU list */
- mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
- 1 : (arc.mfu_bot->size/arc.mru_bot->size));
- arc.p = MIN(arc.c, arc.p + blksz * mult);
+ arc_adapt(blksz, arc.mru_ghost);
+ if (arc_evict_needed())
+ evict_state = new_state;
buf->b_arc_access = lbolt;
arc_change_state(new_state, buf, hash_lock);
- atomic_add_64(&arc.mru_bot->hits, 1);
- } else if (buf->b_state == arc.mfu_top) {
+ atomic_add_64(&arc.mru_ghost->hits, 1);
+ } else if (buf->b_state == arc.mfu) {
/*
* This buffer has been accessed more than once and is
* still in the cache. Keep it in the MFU state.
@@ -1293,34 +1514,30 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* so even if it was a prefetch, it will be put back at
* the head of the list when we remove_reference().
*/
- atomic_add_64(&arc.mfu_top->hits, 1);
- } else if (buf->b_state == arc.mfu_bot) {
+ atomic_add_64(&arc.mfu->hits, 1);
+ } else if (buf->b_state == arc.mfu_ghost) {
/*
* This buffer has been accessed more than once but has
* been evicted from the cache. Move it back to the
* MFU state.
*/
- arc_try_grow(blksz);
- if (arc_evict_needed()) {
- arc_evict_for_state(arc.mfu_top, blksz);
- }
-
- /* Bump up the target size for the MFU list */
- mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
- 1 : (arc.mru_bot->size/arc.mfu_bot->size));
- arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
+ arc_adapt(blksz, arc.mfu_ghost);
+ if (arc_evict_needed())
+ evict_state = arc.mfu;
buf->b_arc_access = lbolt;
- DTRACE_PROBE1(new_state__mfu_top,
- arc_buf_hdr_t *, buf);
- arc_change_state(arc.mfu_top, buf, hash_lock);
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mfu, buf, hash_lock);
- atomic_add_64(&arc.mfu_bot->hits, 1);
+ atomic_add_64(&arc.mfu_ghost->hits, 1);
} else {
ASSERT(!"invalid arc state");
}
+ mutex_exit(hash_lock);
+ if (evict_state)
+ arc_evict_for_state(evict_state, blksz);
}
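The _and_exit suffix is the point of this rewrite: the function now decides which state is under pressure while the hash lock is held, drops the lock, and only then calls arc_evict_for_state(), so eviction never runs inside the hash lock. A generic sketch of that decide-locked, act-unlocked shape (pthread mutex and all names are illustrative):

	#include <pthread.h>
	#include <stdint.h>

	static pthread_mutex_t hash_lock_sketch = PTHREAD_MUTEX_INITIALIZER;

	/* Heavy eviction work; must run without hash_lock_sketch held. */
	static void
	evict_for_state_sketch(int state, uint64_t bytes)
	{
		(void) state;
		(void) bytes;
	}

	static void
	access_and_exit_sketch(int over_target, int pressured_state,
	    uint64_t blksz)
	{
		int evict_state = 0;

		pthread_mutex_lock(&hash_lock_sketch);
		/* ... move the buffer between states here ... */
		if (over_target)
			evict_state = pressured_state;
		pthread_mutex_unlock(&hash_lock_sketch);

		if (evict_state != 0)
			evict_for_state_sketch(evict_state, blksz);
	}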
/* a generic arc_done_func_t which you can use */
@@ -1329,7 +1546,7 @@ void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
bcopy(buf->b_data, arg, buf->b_hdr->b_size);
- arc_buf_free(buf, arg);
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
/* a generic arc_done_func_t which you can use */
@@ -1338,7 +1555,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
if (zio && zio->io_error) {
- arc_buf_free(buf, arg);
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
*bufp = NULL;
} else {
*bufp = buf;
@@ -1387,13 +1604,13 @@ arc_read_done(zio_t *zio)
if (acb->acb_done) {
if (abuf == NULL) {
abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- abuf->b_data = zio_buf_alloc(hdr->b_size);
- atomic_add_64(&arc.size, hdr->b_size);
- bcopy(buf->b_data, abuf->b_data, hdr->b_size);
+ abuf->b_data = arc_data_copy(hdr, buf->b_data);
abuf->b_hdr = hdr;
+ abuf->b_efunc = NULL;
+ abuf->b_private = NULL;
abuf->b_next = hdr->b_buf;
hdr->b_buf = abuf;
- atomic_add_64(&hdr->b_state->size, hdr->b_size);
+ hdr->b_datacnt += 1;
}
acb->acb_buf = abuf;
abuf = NULL;
@@ -1414,6 +1631,9 @@ arc_read_done(zio_t *zio)
}
hdr->b_acb = NULL;
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ ASSERT(!HDR_BUF_AVAILABLE(hdr));
+ if (abuf == buf)
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
@@ -1421,9 +1641,21 @@ arc_read_done(zio_t *zio)
hdr->b_flags |= ARC_IO_ERROR;
if (hdr->b_state != arc.anon)
arc_change_state(arc.anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
freeable = refcount_is_zero(&hdr->b_refcnt);
+ /* translate checksum errors into IO errors */
+ if (zio->io_error == ECKSUM)
+ zio->io_error = EIO;
}
+ /*
+ * Broadcast before we drop the hash_lock. This is less efficient,
+ * but avoids the possibility that the hdr (and hence the cv) might
+ * be freed before we get to the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_cv);
+
if (!HDR_FREED_IN_READ(hdr)) {
/*
* Only call arc_access on anonymous buffers. This is because
@@ -1432,8 +1664,9 @@ arc_read_done(zio_t *zio)
* getting confused).
*/
if (zio->io_error == 0 && hdr->b_state == arc.anon)
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
+ arc_access_and_exit(hdr, hash_lock);
+ else
+ mutex_exit(hash_lock);
} else {
/*
* This block was freed while we waited for the read to
@@ -1445,8 +1678,6 @@ arc_read_done(zio_t *zio)
freeable = refcount_is_zero(&hdr->b_refcnt);
}
- cv_broadcast(&hdr->b_cv);
-
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
if (acb->acb_done)
@@ -1462,7 +1693,7 @@ arc_read_done(zio_t *zio)
}
if (freeable)
- arc_hdr_free(hdr);
+ arc_hdr_destroy(hdr);
}
/*
@@ -1486,7 +1717,7 @@ arc_read_done(zio_t *zio)
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags)
+ uint32_t arc_flags, zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
@@ -1495,15 +1726,9 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
top:
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (hdr && hdr->b_buf) {
-
- ASSERT((hdr->b_state == arc.mru_top) ||
- (hdr->b_state == arc.mfu_top) ||
- ((hdr->b_state == arc.anon) &&
- (HDR_IO_IN_PROGRESS(hdr))));
+ if (hdr && hdr->b_datacnt > 0) {
if (HDR_IO_IN_PROGRESS(hdr)) {
-
if ((arc_flags & ARC_NOWAIT) && done) {
arc_callback_t *acb = NULL;
@@ -1527,35 +1752,39 @@ top:
mutex_exit(hash_lock);
goto top;
}
-
mutex_exit(hash_lock);
return (0);
}
- /*
- * If there is already a reference on this block, create
- * a new copy of the data so that we will be guaranteed
- * that arc_release() will always succeed.
- */
+ ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
- if (done)
- add_reference(hdr, hash_lock, private);
- if (done && refcount_count(&hdr->b_refcnt) > 1) {
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- buf->b_data = zio_buf_alloc(hdr->b_size);
- ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
- atomic_add_64(&arc.size, hdr->b_size);
- bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
- buf->b_hdr = hdr;
- buf->b_next = hdr->b_buf;
- hdr->b_buf = buf;
- atomic_add_64(&hdr->b_state->size, hdr->b_size);
- } else {
+ if (done) {
+ /*
+ * If this block is already in use, create a new
+ * copy of the data so that we will be guaranteed
+ * that arc_release() will always succeed.
+ */
buf = hdr->b_buf;
+ ASSERT(buf);
+ ASSERT(buf->b_data);
+ if (!HDR_BUF_AVAILABLE(hdr)) {
+ void *data = arc_data_copy(hdr, buf->b_data);
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_data = data;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = hdr->b_buf;
+ hdr->b_buf = buf;
+ hdr->b_datacnt += 1;
+ } else {
+ ASSERT(buf->b_efunc == NULL);
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ }
+ add_reference(hdr, hash_lock, private);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
+ arc_access_and_exit(hdr, hash_lock);
atomic_add_64(&arc.hits, 1);
if (done)
done(NULL, buf, private);
@@ -1579,24 +1808,28 @@ top:
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
- arc_buf_free(buf, private);
+ (void) arc_buf_remove_ref(buf, private);
goto top; /* restart the IO request */
}
} else {
/* this block is in the ghost cache */
- ASSERT((hdr->b_state == arc.mru_bot) ||
- (hdr->b_state == arc.mfu_bot));
+ ASSERT(GHOST_STATE(hdr->b_state));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
add_reference(hdr, hash_lock, private);
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ ASSERT(hdr->b_buf == NULL);
buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- buf->b_data = zio_buf_alloc(hdr->b_size);
- atomic_add_64(&arc.size, hdr->b_size);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
buf->b_hdr = hdr;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
buf->b_next = NULL;
hdr->b_buf = buf;
+ buf->b_data = zio_buf_alloc(hdr->b_size);
+ atomic_add_64(&arc.size, hdr->b_size);
+ ASSERT(hdr->b_datacnt == 0);
+ hdr->b_datacnt = 1;
}
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -1623,18 +1856,17 @@ top:
* buffer ought to notice that it's legit but has a pending I/O.
*/
- if ((hdr->b_state == arc.mru_bot) ||
- (hdr->b_state == arc.mfu_bot))
- arc_access(hdr, hash_lock);
-
- mutex_exit(hash_lock);
+ if (GHOST_STATE(hdr->b_state))
+ arc_access_and_exit(hdr, hash_lock);
+ else
+ mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
- DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
- uint64_t, size);
+ DTRACE_PROBE2(arc__miss, blkptr_t *, bp, uint64_t, size);
atomic_add_64(&arc.misses, 1);
+
rzio = zio_read(pio, spa, bp, buf->b_data, size,
- arc_read_done, buf, priority, flags);
+ arc_read_done, buf, priority, flags, zb);
if (arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -1660,10 +1892,18 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
- if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
- bcopy(hdr->b_buf->b_data, data, hdr->b_size);
- else
+ if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ ASSERT(buf);
+ while (buf->b_data == NULL) {
+ buf = buf->b_next;
+ ASSERT(buf);
+ }
+ bcopy(buf->b_data, data, hdr->b_size);
+ } else {
rc = ENOENT;
+ }
if (hash_mtx)
mutex_exit(hash_mtx);
@@ -1671,6 +1911,104 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
return (rc);
}
+void
+arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
+{
+ ASSERT(buf->b_hdr != NULL);
+ ASSERT(buf->b_hdr->b_state != arc.anon);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ buf->b_efunc = func;
+ buf->b_private = private;
+}
+
+/*
+ * This is used by the DMU to let the ARC know that a buffer is
+ * being evicted, so the ARC should clean up. If this arc buf
+ * is not yet in the evicted state, it will be put there.
+ */
+int
+arc_buf_evict(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ arc_buf_t **bufp;
+
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts().
+ * NOTE: We can't be in arc_buf_add_ref() because
+ * that would violate the interface rules.
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&arc_eviction_mtx);
+ return (0);
+ } else if (buf->b_data == NULL) {
+ /*
+ * We are on the eviction list, pull us off.
+ */
+ bufp = &arc_eviction_list;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+ mutex_exit(&arc_eviction_mtx);
+ goto out;
+ } else {
+ /*
+ * Prevent a race with arc_evict()
+ */
+ ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+ buf->b_hdr = NULL;
+ }
+ mutex_exit(&arc_eviction_mtx);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
+
+ /*
+ * Pull this buffer off of the hdr
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_data != NULL);
+ buf->b_hdr = hdr;
+ arc_buf_destroy(buf, FALSE);
+
+ if (hdr->b_datacnt == 0) {
+ arc_state_t *old_state = hdr->b_state;
+ arc_state_t *evicted_state;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ evicted_state =
+ (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
+
+ mutex_enter(&old_state->mtx);
+ mutex_enter(&evicted_state->mtx);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags = ARC_IN_HASH_TABLE;
+
+ mutex_exit(&evicted_state->mtx);
+ mutex_exit(&old_state->mtx);
+ }
+ mutex_exit(hash_lock);
+out:
+ VERIFY(buf->b_efunc(buf) == 0);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+ return (1);
+}
+
/*
* Release this buffer from the cache. This must be done
* after a read and prior to modifying the buffer contents.
@@ -1690,30 +2028,40 @@ arc_release(arc_buf_t *buf, void *tag)
/* this buffer is already released */
ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
ASSERT(BUF_EMPTY(hdr));
+ ASSERT(buf->b_efunc == NULL);
return;
}
mutex_enter(hash_lock);
- if (refcount_count(&hdr->b_refcnt) > 1) {
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_buf != buf || buf->b_next != NULL) {
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
spa_t *spa = hdr->b_spa;
+ ASSERT(hdr->b_datacnt > 1);
/*
* Pull the data off of this buf and attach it to
* a new anonymous buf.
*/
+ (void) remove_reference(hdr, hash_lock, tag);
bufp = &hdr->b_buf;
- while (*bufp != buf) {
- ASSERT(*bufp);
+ while (*bufp != buf)
bufp = &(*bufp)->b_next;
- }
*bufp = (*bufp)->b_next;
- (void) refcount_remove(&hdr->b_refcnt, tag);
+
ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
atomic_add_64(&hdr->b_state->size, -hdr->b_size);
+ if (refcount_is_zero(&hdr->b_refcnt)) {
+ ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
+ }
+ hdr->b_datacnt -= 1;
+
mutex_exit(hash_lock);
nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
@@ -1723,6 +2071,7 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_state = arc.anon;
nhdr->b_arc_access = 0;
nhdr->b_flags = 0;
+ nhdr->b_datacnt = 1;
buf->b_hdr = nhdr;
buf->b_next = NULL;
(void) refcount_add(&nhdr->b_refcnt, tag);
@@ -1730,6 +2079,7 @@ arc_release(arc_buf_t *buf, void *tag)
hdr = nhdr;
} else {
+ ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc.anon, hdr, hash_lock);
@@ -1739,14 +2089,30 @@ arc_release(arc_buf_t *buf, void *tag)
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
}
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
}
int
arc_released(arc_buf_t *buf)
{
- return (buf->b_hdr->b_state == arc.anon);
+ return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
+}
+
+int
+arc_has_callback(arc_buf_t *buf)
+{
+ return (buf->b_efunc != NULL);
}
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ return (refcount_count(&buf->b_hdr->b_refcnt));
+}
+#endif
+
static void
arc_write_done(zio_t *zio)
{
@@ -1758,6 +2124,7 @@ arc_write_done(zio_t *zio)
hdr = buf->b_hdr;
acb = hdr->b_acb;
hdr->b_acb = NULL;
+ ASSERT(acb != NULL);
/* this buffer is on no lists and is not in the hash table */
ASSERT3P(hdr->b_state, ==, arc.anon);
@@ -1765,9 +2132,12 @@ arc_write_done(zio_t *zio)
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = zio->io_bp->blk_birth;
hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
- /* clear the "in-write" flag */
- hdr->b_hash_next = NULL;
- /* This write may be all-zero */
+ /*
+ * If the block to be written was all-zero, we may have
+ * compressed it away. In this case no write was performed
+ * so there will be no dva/birth-date/checksum. The buffer
+ * must therefore remain anonymous (and uncached).
+ */
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
@@ -1787,27 +2157,41 @@ arc_write_done(zio_t *zio)
ASSERT(refcount_is_zero(&exists->b_refcnt));
arc_change_state(arc.anon, exists, hash_lock);
mutex_exit(hash_lock);
- arc_hdr_free(exists);
+ arc_hdr_destroy(exists);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
}
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ arc_access_and_exit(hdr, hash_lock);
+ } else if (acb->acb_done == NULL) {
+ int destroy_hdr;
+ /*
+ * This is an anonymous buffer with no user callback,
+ * destroy it if there are no active references.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
- if (acb && acb->acb_done) {
+
+ if (acb->acb_done) {
ASSERT(!refcount_is_zero(&hdr->b_refcnt));
acb->acb_done(zio, buf, acb->acb_private);
}
- if (acb)
- kmem_free(acb, sizeof (arc_callback_t));
+ kmem_free(acb, sizeof (arc_callback_t));
}
int
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags)
+ uint32_t arc_flags, zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_callback_t *acb;
@@ -1822,8 +2206,9 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
acb->acb_private = private;
acb->acb_byteswap = (arc_byteswap_func_t *)-1;
hdr->b_acb = acb;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
rzio = zio_write(pio, spa, checksum, compress, txg, bp,
- buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
+ buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
if (arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -1858,16 +2243,21 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
arc_change_state(arc.anon, ab, hash_lock);
if (refcount_is_zero(&ab->b_refcnt)) {
mutex_exit(hash_lock);
- arc_hdr_free(ab);
+ arc_hdr_destroy(ab);
atomic_add_64(&arc.deleted, 1);
} else {
ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
+ ASSERT3U(ab->b_datacnt, ==, 1);
if (HDR_IO_IN_PROGRESS(ab))
ab->b_flags |= ARC_FREED_IN_READ;
+ if (HDR_IN_HASH_TABLE(ab))
+ buf_hash_remove(ab);
ab->b_arc_access = 0;
bzero(&ab->b_dva, sizeof (dva_t));
ab->b_birth = 0;
ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
mutex_exit(hash_lock);
}
}
@@ -1967,23 +2357,26 @@ arc_init(void)
arc.c = arc.c_min;
arc.anon = &ARC_anon;
- arc.mru_top = &ARC_mru_top;
- arc.mru_bot = &ARC_mru_bot;
- arc.mfu_top = &ARC_mfu_top;
- arc.mfu_bot = &ARC_mfu_bot;
+ arc.mru = &ARC_mru;
+ arc.mru_ghost = &ARC_mru_ghost;
+ arc.mfu = &ARC_mfu;
+ arc.mfu_ghost = &ARC_mfu_ghost;
+ arc.size = 0;
- list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
+ list_create(&arc.mru->list, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
+ list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
+ list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
+ list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_arc_node));
buf_init();
arc_thread_exit = 0;
+ arc_eviction_list = NULL;
+ mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
@@ -2002,14 +2395,15 @@ arc_fini(void)
arc_dead = TRUE;
+ mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_lock);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
- list_destroy(&arc.mru_top->list);
- list_destroy(&arc.mru_bot->list);
- list_destroy(&arc.mfu_top->list);
- list_destroy(&arc.mfu_bot->list);
+ list_destroy(&arc.mru->list);
+ list_destroy(&arc.mru_ghost->list);
+ list_destroy(&arc.mfu->list);
+ list_destroy(&arc.mfu_ghost->list);
buf_fini();
}
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
index 68f79ac5a2..db0d3534d6 100644
--- a/usr/src/uts/common/fs/zfs/bplist.c
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,16 +28,18 @@
#include <sys/bplist.h>
#include <sys/zfs_context.h>
-static void
+static int
bplist_hold(bplist_t *bpl)
{
ASSERT(MUTEX_HELD(&bpl->bpl_lock));
if (bpl->bpl_dbuf == NULL) {
- bpl->bpl_dbuf = dmu_bonus_hold_tag(bpl->bpl_mos,
- bpl->bpl_object, bpl);
- dmu_buf_read(bpl->bpl_dbuf);
+ int err = dmu_bonus_hold(bpl->bpl_mos,
+ bpl->bpl_object, bpl, &bpl->bpl_dbuf);
+ if (err)
+ return (err);
bpl->bpl_phys = bpl->bpl_dbuf->db_data;
}
+ return (0);
}
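This is the same hardening theme as the DMU changes: bplist_hold() stops assuming the bonus-buffer hold succeeds and instead returns an errno for callers to propagate (or VERIFY only where failure really is impossible). A minimal sketch of the refactoring shape, with hold_dbuf_sketch standing in for dmu_bonus_hold() and all names illustrative:

	#include <errno.h>
	#include <stdlib.h>

	typedef struct bpl_sketch {
		void *bpl_dbuf;		/* stands in for the held bonus dbuf */
	} bpl_sketch_t;

	/* Stand-in for dmu_bonus_hold(): an operation that may fail. */
	static int
	hold_dbuf_sketch(void **dbufp)
	{
		*dbufp = malloc(64);
		return (*dbufp == NULL ? ENOMEM : 0);
	}

	/* After the change: report errors to the caller, don't panic. */
	static int
	bplist_hold_sketch(bpl_sketch_t *bpl)
	{
		if (bpl->bpl_dbuf == NULL) {
			int err = hold_dbuf_sketch(&bpl->bpl_dbuf);
			if (err != 0)
				return (err);
		}
		return (0);
	}

	/* Callers propagate the error upward instead of asserting it away. */
	static int
	bplist_empty_sketch(bpl_sketch_t *bpl, int *emptyp)
	{
		int err = bplist_hold_sketch(bpl);
		if (err != 0)
			return (err);
		*emptyp = 1;		/* would inspect bpl_phys here */
		return (0);
	}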
uint64_t
@@ -58,12 +59,15 @@ bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
VERIFY(dmu_object_free(mos, object, tx) == 0);
}
-void
+int
bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
{
dmu_object_info_t doi;
+ int err;
- VERIFY(dmu_object_info(mos, object, &doi) == 0);
+ err = dmu_object_info(mos, object, &doi);
+ if (err)
+ return (err);
mutex_enter(&bpl->bpl_lock);
@@ -79,6 +83,7 @@ bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
mutex_exit(&bpl->bpl_lock);
+ return (0);
}
void
@@ -89,11 +94,11 @@ bplist_close(bplist_t *bpl)
ASSERT(bpl->bpl_queue == NULL);
if (bpl->bpl_cached_dbuf) {
- dmu_buf_rele(bpl->bpl_cached_dbuf);
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
bpl->bpl_cached_dbuf = NULL;
}
if (bpl->bpl_dbuf) {
- dmu_buf_rele_tag(bpl->bpl_dbuf, bpl);
+ dmu_buf_rele(bpl->bpl_dbuf, bpl);
bpl->bpl_dbuf = NULL;
bpl->bpl_phys = NULL;
}
@@ -110,22 +115,45 @@ bplist_empty(bplist_t *bpl)
return (B_TRUE);
mutex_enter(&bpl->bpl_lock);
- bplist_hold(bpl);
+ VERIFY(0 == bplist_hold(bpl)); /* XXX */
rv = (bpl->bpl_phys->bpl_entries == 0);
mutex_exit(&bpl->bpl_lock);
return (rv);
}
+static int
+bplist_cache(bplist_t *bpl, uint64_t blkid)
+{
+ int err = 0;
+
+ if (bpl->bpl_cached_dbuf == NULL ||
+ bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
+ if (bpl->bpl_cached_dbuf != NULL)
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+ err = dmu_buf_hold(bpl->bpl_mos,
+ bpl->bpl_object, blkid << bpl->bpl_blockshift,
+ bpl, &bpl->bpl_cached_dbuf);
+ ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
+ 1ULL << bpl->bpl_blockshift);
+ }
+ return (err);
+}
+
int
bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
{
uint64_t blk, off;
blkptr_t *bparray;
- dmu_buf_t *db;
+ int err;
mutex_enter(&bpl->bpl_lock);
- bplist_hold(bpl);
+
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
if (*itorp >= bpl->bpl_phys->bpl_entries) {
mutex_exit(&bpl->bpl_lock);
@@ -134,51 +162,44 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
blk = *itorp >> bpl->bpl_bpshift;
off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
- db = bpl->bpl_cached_dbuf;
- if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
- if (db != NULL)
- dmu_buf_rele(db);
- bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blk << bpl->bpl_blockshift);
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
}
- ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
-
- dmu_buf_read(db);
- bparray = db->db_data;
+ bparray = bpl->bpl_cached_dbuf->db_data;
*bp = bparray[off];
(*itorp)++;
mutex_exit(&bpl->bpl_lock);
return (0);
}
-void
+int
bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
{
uint64_t blk, off;
blkptr_t *bparray;
- dmu_buf_t *db;
+ int err;
ASSERT(!BP_IS_HOLE(bp));
mutex_enter(&bpl->bpl_lock);
- bplist_hold(bpl);
+ err = bplist_hold(bpl);
+ if (err)
+ return (err);
blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
- db = bpl->bpl_cached_dbuf;
- if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
- if (db != NULL)
- dmu_buf_rele(db);
- bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blk << bpl->bpl_blockshift);
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
}
- ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
-
- dmu_buf_will_dirty(db, tx);
- bparray = db->db_data;
+ dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
+ bparray = bpl->bpl_cached_dbuf->db_data;
bparray[off] = *bp;
/* We never need the fill count. */
@@ -191,6 +212,8 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
bpl->bpl_phys->bpl_entries++;
bpl->bpl_phys->bpl_bytes += BP_GET_ASIZE(bp);
mutex_exit(&bpl->bpl_lock);
+
+ return (0);
}
/*
@@ -218,7 +241,7 @@ bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
while ((bpq = bpl->bpl_queue) != NULL) {
bpl->bpl_queue = bpq->bpq_next;
mutex_exit(&bpl->bpl_lock);
- bplist_enqueue(bpl, &bpq->bpq_blk, tx);
+ VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
kmem_free(bpq, sizeof (*bpq));
mutex_enter(&bpl->bpl_lock);
}
@@ -230,9 +253,10 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
{
mutex_enter(&bpl->bpl_lock);
ASSERT3P(bpl->bpl_queue, ==, NULL);
- bplist_hold(bpl);
+ VERIFY(0 == bplist_hold(bpl));
dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- dmu_free_range(bpl->bpl_mos, bpl->bpl_object, 0, -1ULL, tx);
+ VERIFY(0 == dmu_free_range(bpl->bpl_mos,
+ bpl->bpl_object, 0, -1ULL, tx));
bpl->bpl_phys->bpl_entries = 0;
bpl->bpl_phys->bpl_bytes = 0;
mutex_exit(&bpl->bpl_lock);
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 6f93e86078..13f4fdb202 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -118,7 +118,7 @@ dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
mutex_enter(&db->db_mtx);
- if (!refcount_is_zero(&db->db_holds)) {
+ if (db->db_state != DB_EVICTING) {
mutex_exit(DBUF_HASH_MUTEX(h, idx));
return (db);
}
@@ -151,7 +151,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
mutex_enter(&dbf->db_mtx);
- if (!refcount_is_zero(&dbf->db_holds)) {
+ if (dbf->db_state != DB_EVICTING) {
mutex_exit(DBUF_HASH_MUTEX(h, idx));
return (dbf);
}
@@ -186,7 +186,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
* DBUF_HASH_MUTEX > db_mtx.
*/
ASSERT(refcount_is_zero(&db->db_holds));
- ASSERT(db->db_dnode != NULL);
+ ASSERT(db->db_state == DB_EVICTING);
ASSERT(!MUTEX_HELD(&db->db_mtx));
mutex_enter(DBUF_HASH_MUTEX(h, idx));
@@ -201,20 +201,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
atomic_add_64(&dbuf_hash_count, -1);
}
-static int dbuf_evictable(dmu_buf_impl_t *db);
-static void dbuf_clear(dmu_buf_impl_t *db);
-
-void
-dbuf_evict(dmu_buf_impl_t *db)
-{
- int err;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- err = dbuf_evictable(db);
- ASSERT(err == TRUE);
- dbuf_clear(db);
- dbuf_destroy(db);
-}
+static arc_evict_func_t dbuf_do_evict;
static void
dbuf_evict_user(dmu_buf_impl_t *db)
@@ -233,23 +220,47 @@ dbuf_evict_user(dmu_buf_impl_t *db)
}
void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+ int i;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL);
+
+#ifdef ZFS_DEBUG
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!list_link_active(&db->db_dirty_node[i]));
+ ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+ }
+#endif
+ dbuf_clear(db);
+ dbuf_destroy(db);
+}
+
+void
dbuf_init(void)
{
- uint64_t hsize = 1;
+ uint64_t hsize = 1ULL << 16;
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
/*
* The hash table is big enough to fill all of physical memory
- * with an average 64k block size. The table will take up
- * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte
- * pointers).
+ * with an average 4K block size. The table will take up
+ * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
*/
- while (hsize * 65536 < physmem * PAGESIZE)
+ while (hsize * 4096 < physmem * PAGESIZE)
hsize <<= 1;
+retry:
h->hash_table_mask = hsize - 1;
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
@@ -299,8 +310,9 @@ dbuf_verify(dmu_buf_impl_t *db)
} else {
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
- ASSERT(list_head(&dn->dn_dbufs));
ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ list_head(&dn->dn_dbufs));
}
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT(dn != NULL);
@@ -311,19 +323,11 @@ dbuf_verify(dmu_buf_impl_t *db)
}
if (db->db_level == 0) {
- void **udpp = db->db_d.db_user_data_ptr_ptr;
/* we can be momentarily larger in dnode_set_blksz() */
if (db->db_blkid != DB_BONUS_BLKID && dn) {
ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
}
- if (udpp) {
- ASSERT((refcount_is_zero(&db->db_holds) &&
- *udpp == NULL) ||
- (!refcount_is_zero(&db->db_holds) &&
- *udpp == db->db.db_data));
- }
-
- if (IS_DNODE_DNODE(db->db.db_object)) {
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
for (i = 0; i < TXG_SIZE; i++) {
/*
* it should only be modified in syncing
@@ -341,7 +345,7 @@ dbuf_verify(dmu_buf_impl_t *db)
if (db->db_parent == dn->dn_dbuf) {
/* db is pointed to by the dnode */
/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
- if (IS_DNODE_DNODE(db->db.db_object))
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
ASSERT(db->db_parent == NULL);
else
ASSERT(db->db_parent != NULL);
@@ -399,10 +403,19 @@ static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(buf->b_data != NULL);
+ ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
db->db_buf = buf;
- db->db.db_data = buf->b_data;
- dbuf_update_data(db);
+ if (buf != NULL) {
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+ if (!arc_released(buf))
+ arc_set_callback(buf, dbuf_do_evict, db);
+ dbuf_update_data(db);
+ } else {
+ dbuf_evict_user(db);
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
}
uint64_t
@@ -427,6 +440,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
* All reads are synchronous, so we must have a hold on the dbuf
*/
ASSERT(refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
/* we were freed in flight; disregard any error */
@@ -440,60 +454,36 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_state = DB_CACHED;
} else {
ASSERT(db->db_blkid != DB_BONUS_BLKID);
- arc_buf_free(buf, db);
- db->db_state = DB_UNCACHED;
ASSERT3P(db->db_buf, ==, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
+ dbuf_rele(db, NULL);
}
-void
+static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
- arc_buf_t *buf;
blkptr_t *bp;
+ zbookmark_t zb;
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
-
- /*
- * prefetch only data blocks (level 0) -- don't prefetch indirect
- * blocks
- */
- if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
- flags |= DB_RF_NOPREFETCH;
- }
-
- if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size);
- }
-
- if (db->db_state == DB_CACHED) {
- ASSERT(db->db.db_data != NULL);
- return;
- }
-
- mutex_enter(&db->db_mtx);
-
- if (db->db_state != DB_UNCACHED) {
- mutex_exit(&db->db_mtx);
- return;
- }
-
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- DN_MAX_BONUSLEN, db);
+ db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
if (db->db.db_size < DN_MAX_BONUSLEN)
- bzero(buf->b_data, DN_MAX_BONUSLEN);
- bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
+ bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
db->db.db_size);
- dbuf_set_data(db, buf);
+ dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
return;
@@ -522,20 +512,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
+ zb.zb_objset = db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+
+ dbuf_add_ref(db, NULL);
/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
db->db_level > 0 ? byteswap_uint64_array :
dmu_ot[db->db_dnode->dn_type].ot_byteswap,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
- ARC_NOWAIT);
+ ARC_NOWAIT, &zb);
}
-static int
-dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
- zio_t *zio;
- int err;
+ int err = 0;
+ int havepzio = (zio != NULL);
/*
* We don't have to hold the mutex to check db_state because it
@@ -545,71 +542,67 @@ dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
if (db->db_state == DB_CACHED)
return (0);
- if (db->db_state == DB_UNCACHED) {
- zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ mutex_exit(&db->db_mtx);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ } else if (db->db_state == DB_UNCACHED) {
+ if (zio == NULL) {
+ zio = zio_root(db->db_dnode->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ }
dbuf_read_impl(db, zio, flags);
+ /* dbuf_read_impl has dropped db_mtx for us */
+
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 &&
+ db->db_dnode != NULL) {
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size);
+ }
+
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&db->db_dnode->dn_struct_rwlock);
- err = zio_wait(zio);
- if (err)
- return (err);
- }
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ || db->db_state == DB_FILL) {
- ASSERT(db->db_state == DB_READ ||
- (flags & DB_RF_HAVESTRUCT) == 0);
- cv_wait(&db->db_changed, &db->db_mtx);
+ if (!havepzio)
+ err = zio_wait(zio);
+ } else {
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ }
+ mutex_exit(&db->db_mtx);
}
- ASSERT3U(db->db_state, ==, DB_CACHED);
- mutex_exit(&db->db_mtx);
-
- return (0);
-}
-
-#pragma weak dmu_buf_read = dbuf_read
-void
-dbuf_read(dmu_buf_impl_t *db)
-{
- int err;
-
- err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
- ASSERT(err == 0);
-}
-
-#pragma weak dmu_buf_read_canfail = dbuf_read_canfail
-int
-dbuf_read_canfail(dmu_buf_impl_t *db)
-{
- return (dbuf_read_generic(db, DB_RF_CANFAIL));
-}
-
-void
-dbuf_read_havestruct(dmu_buf_impl_t *db)
-{
- int err;
- ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
- err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
- ASSERT(err == 0);
+ ASSERT(err || havepzio || db->db_state == DB_CACHED);
+ return (err);
}
static void
dbuf_noread(dmu_buf_impl_t *db)
{
ASSERT(!refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
- int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
- DN_MAX_BONUSLEN : db->db.db_size;
+ ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- blksz, db));
+ db->db.db_size, db));
db->db_state = DB_FILL;
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
@@ -634,14 +627,13 @@ static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
arc_buf_t **quiescing, **syncing;
- int size = (db->db_blkid == DB_BONUS_BLKID) ?
- DN_MAX_BONUSLEN : db->db.db_size;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
- quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
- syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+ quiescing = (arc_buf_t **)&db->db_d.db_data_old[(txg-1)&TXG_MASK];
+ syncing = (arc_buf_t **)&db->db_d.db_data_old[(txg-2)&TXG_MASK];
/*
* If this buffer is referenced from the current quiescing
@@ -656,13 +648,12 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
*/
ASSERT(*syncing != db->db_buf);
if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
*quiescing = arc_buf_alloc(
db->db_dnode->dn_objset->os_spa, size, db);
bcopy(db->db.db_data, (*quiescing)->b_data, size);
} else {
- db->db.db_data = NULL;
- db->db_buf = NULL;
- db->db_state = DB_UNCACHED;
+ dbuf_set_data(db, NULL);
}
return;
}
@@ -677,22 +668,49 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
ASSERT3P(*quiescing, ==, NULL);
ASSERT3U(db->db_dirtycnt, ==, 1);
if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
/* we can't copy if we have already started a write */
ASSERT(*syncing != db->db_data_pending);
*syncing = arc_buf_alloc(
db->db_dnode->dn_objset->os_spa, size, db);
bcopy(db->db.db_data, (*syncing)->b_data, size);
} else {
- db->db.db_data = NULL;
- db->db_buf = NULL;
- db->db_state = DB_UNCACHED;
+ dbuf_set_data(db, NULL);
}
}
}
+/*
+ * This is the "bonus buffer" version of the above routine
+ */
+static void
+dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ void **quiescing, **syncing;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID);
+
+ quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
+ syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+
+ if (*quiescing == db->db.db_data) {
+ ASSERT(*syncing != db->db.db_data);
+ *quiescing = zio_buf_alloc(DN_MAX_BONUSLEN);
+ bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN);
+ } else if (*syncing == db->db.db_data) {
+ ASSERT3P(*quiescing, ==, NULL);
+ ASSERT3U(db->db_dirtycnt, ==, 1);
+ *syncing = zio_buf_alloc(DN_MAX_BONUSLEN);
+ bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN);
+ }
+}
+
void
dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
{
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
@@ -724,7 +742,8 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ if (db->db_level != 0)
continue;
dprintf_dbuf(db, "found buf %s\n", "");
if (db->db_blkid < blkid ||
@@ -736,7 +755,8 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
continue;
mutex_enter(&db->db_mtx);
- if (db->db_state == DB_UNCACHED) {
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_EVICTING) {
ASSERT(db->db.db_data == NULL);
mutex_exit(&db->db_mtx);
continue;
@@ -753,22 +773,40 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
continue;
}
+ if (refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_clear(db);
+ continue;
+ }
+ /* The dbuf is CACHED and referenced */
- /* make a copy of the data if necessary */
- dbuf_fix_old_data(db, txg);
-
- if (db->db.db_data) {
- /* fill in with appropriate data */
+ if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) {
+ /*
+ * This dbuf is not currently dirty. We will either
+			 * uncache it (if it's not referenced in the open
+ * context) or reset its contents to empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ } else if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) {
+ /*
+ * This dbuf is overridden. Clear that state.
+ */
+ dbuf_unoverride(db, txg);
+ }
+ /* fill in with appropriate data */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
arc_release(db->db_buf, db);
bzero(db->db.db_data, db->db.db_size);
}
+
mutex_exit(&db->db_mtx);
}
mutex_exit(&dn->dn_dbufs_mtx);
}
static int
-dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dbuf_new_block(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;
@@ -790,7 +828,7 @@ dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
birth_txg = db->db_blkptr->blk_birth;
if (birth_txg)
- return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
+ return (!dsl_dataset_block_freeable(ds, birth_txg));
else
return (TRUE);
}
@@ -801,6 +839,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
arc_buf_t *buf, *obuf;
int osize = db->db.db_size;
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
/* XXX does *this* func really need the lock? */
ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
@@ -814,6 +854,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
* be happening.
*/
/* Make a copy of the data if necessary */
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
dbuf_will_dirty(db, tx);
/* create the data buffer for the new block */
@@ -829,7 +873,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
dbuf_set_data(db, buf);
- arc_buf_free(obuf, db);
+ VERIFY(arc_buf_remove_ref(obuf, db) == 1);
db->db.db_size = size;
/* fix up the dirty info */
@@ -861,7 +905,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
*/
ASSERT(!(dmu_tx_is_syncing(tx) &&
!BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
- !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
dn->dn_objset->os_dsl_dataset != NULL &&
!dsl_dir_is_private(
dn->dn_objset->os_dsl_dataset->ds_dir)));
@@ -871,7 +915,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* check if we're already dirty. They are allowed to re-dirty
* in syncing context.
*/
- ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
dn->dn_dirtyctx == DN_UNDIRTIED ||
dn->dn_dirtyctx ==
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
@@ -940,22 +984,27 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_level == 0) {
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_d.db_data_old[txgoff] == NULL);
+ dbuf_fix_old_bonus_data(db, tx->tx_txg);
+ db->db_d.db_data_old[txgoff] = db->db.db_data;
+ } else if (db->db_level == 0) {
/*
* Release the data buffer from the cache so that we
* can modify it without impacting possible other users
* of this cached data block. Note that indirect blocks
* and private objects are not released until the syncing
* state (since they are only modified then).
- *
- * If this buffer is dirty in an old transaction group we need
- * to make a copy of it so that the changes we make in this
- * transaction group won't leak out when we sync the older txg.
*/
ASSERT(db->db_buf != NULL);
- ASSERT(db->db.db_data != NULL);
ASSERT(db->db_d.db_data_old[txgoff] == NULL);
- if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
+ if (db->db.db_object != DMU_META_DNODE_OBJECT) {
arc_release(db->db_buf, db);
dbuf_fix_old_data(db, tx->tx_txg);
ASSERT(db->db_buf != NULL);
@@ -978,12 +1027,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
mutex_exit(&dn->dn_mtx);
- /*
- * If writting this buffer will consume a new block on disk,
- * then update the accounting.
- */
if (db->db_blkid != DB_BONUS_BLKID) {
- if (!dbuf_new_block(db, tx) && db->db_blkptr) {
+ /*
+ * Update the accounting.
+ */
+ if (!dbuf_new_block(db) && db->db_blkptr) {
/*
* This is only a guess -- if the dbuf is dirty
* in a previous txg, we don't know how much
@@ -1028,7 +1076,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
dbuf_dirty(parent, tx);
- dbuf_remove_ref(parent, FTAG);
+ dbuf_rele(parent, FTAG);
} else {
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
@@ -1042,8 +1090,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn = db->db_dnode;
int txgoff = tx->tx_txg & TXG_MASK;
+ int64_t holds;
ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
mutex_enter(&db->db_mtx);
@@ -1080,7 +1130,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(db->db_buf != NULL);
ASSERT(db->db_d.db_data_old[txgoff] != NULL);
if (db->db_d.db_data_old[txgoff] != db->db_buf)
- arc_buf_free(db->db_d.db_data_old[txgoff], db);
+ VERIFY(arc_buf_remove_ref(
+ db->db_d.db_data_old[txgoff], db) == 1);
db->db_d.db_data_old[txgoff] = NULL;
}
@@ -1095,15 +1146,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
- if (refcount_remove(&db->db_holds,
- (void *)(uintptr_t)tx->tx_txg) == 0) {
- /* make duf_verify() happy */
- if (db->db.db_data)
- bzero(db->db.db_data, db->db.db_size);
+ if ((holds = refcount_remove(&db->db_holds,
+ (void *)(uintptr_t)tx->tx_txg)) == 0) {
+ arc_buf_t *buf = db->db_buf;
+ ASSERT(arc_released(buf));
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
dbuf_evict(db);
return (1);
}
+ ASSERT(holds > 0);
mutex_exit(&db->db_mtx);
return (0);
@@ -1120,19 +1173,21 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
- (void) dbuf_read_generic(db, rf);
+ (void) dbuf_read(db, NULL, rf);
dbuf_dirty(db, tx);
}
-#pragma weak dmu_buf_will_fill = dbuf_will_fill
void
-dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
ASSERT(tx->tx_txg != 0);
ASSERT(db->db_level == 0);
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
dmu_tx_private_ok(tx));
dbuf_noread(db);
@@ -1149,6 +1204,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
bzero(db->db.db_data, db->db.db_size);
@@ -1160,47 +1216,62 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
}
-
-static void
+/*
+ * "Clear" the contents of this dbuf. This will mark the dbuf
+ * EVICTING and clear *most* of its references. Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
+ * in this case. For callers from the DMU we will usually see:
+ * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ * DMU: dbuf_clear()->arc_buf_evict()
+ * ARC: dbuf_do_evict()->dbuf_destroy()
+ */
+void
dbuf_clear(dmu_buf_impl_t *db)
{
dnode_t *dn = db->db_dnode;
+ dmu_buf_impl_t *parent = db->db_parent;
+ int dbuf_gone = FALSE;
- ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(refcount_is_zero(&db->db_holds));
+ dbuf_evict_user(db);
+
if (db->db_state == DB_CACHED) {
- ASSERT(db->db_buf != NULL);
- arc_buf_free(db->db_buf, db);
+ ASSERT(db->db.db_data != NULL);
+ if (db->db_blkid == DB_BONUS_BLKID)
+ zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
db->db.db_data = NULL;
- db->db_buf = NULL;
db->db_state = DB_UNCACHED;
}
ASSERT3U(db->db_state, ==, DB_UNCACHED);
- ASSERT(db->db_buf == NULL);
ASSERT(db->db_data_pending == NULL);
- mutex_exit(&db->db_mtx);
+ db->db_state = DB_EVICTING;
+ db->db_blkptr = NULL;
+
+ if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ list_remove(&dn->dn_dbufs, db);
+ dnode_rele(dn, db);
+ }
+
+ if (db->db_buf)
+ dbuf_gone = arc_buf_evict(db->db_buf);
+
+ if (!dbuf_gone)
+ mutex_exit(&db->db_mtx);
/*
	 * If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
- if (db->db_parent && db->db_parent != dn->dn_dbuf)
- dbuf_remove_ref(db->db_parent, db);
-
- /* remove from dn_dbufs */
- list_remove(&dn->dn_dbufs, db);
-
- dnode_rele(dn, db);
-
- dbuf_hash_remove(db);
-
- db->db_dnode = NULL;
- db->db_parent = NULL;
- db->db_blkptr = NULL;
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_rele(parent, db);
}
static int
@@ -1209,6 +1280,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
{
int nlevels, epbs;
+ ASSERT(blkid != DB_BONUS_BLKID);
+
if (dn->dn_phys->dn_nlevels == 0)
nlevels = 1;
else
@@ -1218,12 +1291,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
ASSERT3U(level * epbs, <, 64);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- if (blkid == DB_BONUS_BLKID) {
- /* this is the bonus buffer */
- *parentp = NULL;
- *bpp = NULL;
- return (0);
- } else if (level >= nlevels ||
+ if (level >= nlevels ||
(blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
/* the buffer has no parent yet */
*parentp = NULL;
@@ -1235,10 +1303,13 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
blkid >> epbs, fail_sparse, NULL, parentp);
if (err)
return (err);
- dbuf_read_havestruct(*parentp);
- *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
- (blkid & ((1ULL << epbs) - 1));
- return (0);
+ err = dbuf_read(*parentp, NULL,
+ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err == 0) {
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ }
+ return (err);
} else {
/* the block is referenced from the dnode */
ASSERT3U(level, ==, nlevels-1);
@@ -1266,11 +1337,21 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db.db_object = dn->dn_object;
db->db_level = level;
db->db_blkid = blkid;
- db->db_state = DB_UNCACHED;
+ db->db_dirtied = 0;
+ db->db_dirtycnt = 0;
+ db->db_dnode = dn;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
- if (db->db_blkid == DB_BONUS_BLKID) {
+ bzero(&db->db_d, sizeof (db->db_d));
+
+ if (blkid == DB_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
db->db.db_size = dn->dn_bonuslen;
db->db.db_offset = DB_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ /* the bonus dbuf is not placed in the hash table */
+ return (db);
} else {
int blocksize =
db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
@@ -1278,11 +1359,6 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db.db_offset = db->db_blkid * blocksize;
}
- db->db_dirtied = 0;
- db->db_dirtycnt = 0;
-
- bzero(&db->db_d, sizeof (db->db_d));
-
/*
* Hold the dn_dbufs_mtx while we get the new dbuf
* in the hash table *and* added to the dbufs list.
@@ -1291,6 +1367,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
* dn_dbufs list.
*/
mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING;
if ((odb = dbuf_hash_insert(db)) != NULL) {
/* someone else inserted it first */
kmem_cache_free(dbuf_cache, db);
@@ -1298,50 +1375,43 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
return (odb);
}
list_insert_head(&dn->dn_dbufs, db);
+ db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db);
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ refcount_count(&dn->dn_holds) > 0);
(void) refcount_add(&dn->dn_holds, db);
- db->db_dnode = dn;
- db->db_parent = parent;
- db->db_blkptr = blkptr;
-
dprintf_dbuf(db, "db=%p\n", db);
return (db);
}
static int
-dbuf_evictable(dmu_buf_impl_t *db)
+dbuf_do_evict(void *private)
{
- int i;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- DBUF_VERIFY(db);
+ arc_buf_t *buf = private;
+ dmu_buf_impl_t *db = buf->b_private;
- if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
- return (FALSE);
+ if (!MUTEX_HELD(&db->db_mtx))
+ mutex_enter(&db->db_mtx);
- if (!refcount_is_zero(&db->db_holds))
- return (FALSE);
+ ASSERT(db->db_buf == buf);
+ ASSERT(refcount_is_zero(&db->db_holds));
-#ifdef ZFS_DEBUG
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&db->db_dirty_node[i]));
- ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+ if (db->db_state != DB_EVICTING) {
+ ASSERT(db->db_state == DB_CACHED);
+ DBUF_VERIFY(db);
+ db->db_buf = NULL;
+ dbuf_evict(db);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dbuf_destroy(db);
}
-#endif
-
- /*
- * Now we know we want to free it.
- * This call must be done last, since it has side effects -
- * calling the db_evict_func().
- */
- dbuf_evict_user(db);
- return (TRUE);
+ return (0);
}
static void
@@ -1349,9 +1419,36 @@ dbuf_destroy(dmu_buf_impl_t *db)
{
ASSERT(refcount_is_zero(&db->db_holds));
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ dnode_t *dn = db->db_dnode;
+
+ /*
+ * If this dbuf is still on the dn_dbufs list,
+ * remove it from that list.
+ */
+ if (list_link_active(&db->db_link)) {
+ int need_mutex;
+
+ ASSERT(!MUTEX_HELD(&dn->dn_dbufs_mtx));
+ need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+ if (need_mutex)
+ mutex_enter(&dn->dn_dbufs_mtx);
+
+ /* remove from dn_dbufs */
+ list_remove(&dn->dn_dbufs, db);
+
+ if (need_mutex)
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ dnode_rele(dn, db);
+ }
+ dbuf_hash_remove(db);
+ }
+ db->db_parent = NULL;
+ db->db_dnode = NULL;
+ db->db_buf = NULL;
+
ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_dnode == NULL);
- ASSERT(db->db_parent == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
@@ -1384,14 +1481,21 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
+ zbookmark_t zb;
+ zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
+ dn->dn_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = dn->dn_object;
+ zb.zb_level = 0;
+ zb.zb_blkid = blkid;
+
(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
dmu_ot[dn->dn_type].ot_byteswap,
NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- (ARC_NOWAIT | ARC_PREFETCH));
+ (ARC_NOWAIT | ARC_PREFETCH), &zb);
}
if (parent && parent != dn->dn_dbuf)
- dbuf_rele(parent);
+ dbuf_rele(parent, NULL);
}
}
@@ -1405,11 +1509,12 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
{
dmu_buf_impl_t *db, *parent = NULL;
+ ASSERT(blkid != DB_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT3U(dn->dn_nlevels, >, level);
*dbp = NULL;
-
+top:
/* dbuf_find() returns with db_mtx held */
db = dbuf_find(dn, level, blkid);
@@ -1423,13 +1528,26 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
err = ENOENT;
if (err) {
if (parent && parent != dn->dn_dbuf)
- dbuf_rele(parent);
+ dbuf_rele(parent, NULL);
return (err);
}
}
+ if (err && err != ENOENT)
+ return (err);
db = dbuf_create(dn, level, blkid, parent, bp);
}
+ if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+ arc_buf_add_ref(db->db_buf, db);
+ if (db->db_buf->b_data == NULL) {
+ dbuf_clear(db);
+ goto top;
+ }
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ }
+
+ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
/*
* If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make
@@ -1437,7 +1555,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
* again in this txg.
*/
if (db->db_level == 0 && db->db_state == DB_CACHED &&
- !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_data_pending == db->db_buf) {
int size = (db->db_blkid == DB_BONUS_BLKID) ?
DN_MAX_BONUSLEN : db->db.db_size;
@@ -1448,14 +1566,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
db->db.db_size);
}
- dbuf_add_ref(db, tag);
+ (void) refcount_add(&db->db_holds, tag);
dbuf_update_data(db);
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
/* NOTE: we can't rele the parent until after we drop the db_mtx */
if (parent && parent != dn->dn_dbuf)
- dbuf_rele(parent);
+ dbuf_rele(parent, NULL);
ASSERT3P(db->db_dnode, ==, dn);
ASSERT3U(db->db_blkid, ==, blkid);
@@ -1466,81 +1584,83 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
}
dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid)
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
- (void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
- return (db);
+ int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
}
dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
- (void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
- return (db);
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
}
dmu_buf_impl_t *
-dbuf_hold_bonus(dnode_t *dn, void *tag)
+dbuf_create_bonus(dnode_t *dn)
{
- dmu_buf_impl_t *db;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
- rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_impl_t *db = dn->dn_bonus;
+
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT(dn->dn_bonus == NULL);
+ db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
return (db);
}
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
- (void) refcount_add(&db->db_holds, tag);
- /* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
+ int64_t holds = refcount_add(&db->db_holds, tag);
+ ASSERT(holds > 1);
}
+#pragma weak dmu_buf_rele = dbuf_rele
void
-dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
int64_t holds;
- dnode_t *dn = db->db_dnode;
- int need_mutex;
-
- ASSERT(dn != NULL);
- need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
-
- if (need_mutex) {
- dnode_add_ref(dn, FTAG);
- mutex_enter(&dn->dn_dbufs_mtx);
- }
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
holds = refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_d.db_immediate_evict)
+ dbuf_evict_user(db);
if (holds == 0) {
- ASSERT3U(db->db_state, !=, DB_FILL);
- if (db->db_level == 0 &&
- db->db_d.db_user_data_ptr_ptr != NULL)
- *db->db_d.db_user_data_ptr_ptr = NULL;
- dbuf_evict(db);
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_exit(&db->db_mtx);
+ dnode_rele(db->db_dnode, db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ dbuf_evict(db);
+ } else if (arc_released(db->db_buf)) {
+ arc_buf_t *buf = db->db_buf;
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ } else {
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+ mutex_exit(&db->db_mtx);
+ }
} else {
- if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_d.db_immediate_evict)
- dbuf_evict_user(db);
mutex_exit(&db->db_mtx);
}
-
- if (need_mutex) {
- mutex_exit(&dn->dn_dbufs_mtx);
- dnode_rele(dn, FTAG);
- }
-}
-
-void
-dbuf_rele(dmu_buf_impl_t *db)
-{
- dbuf_remove_ref(db, NULL);
}
#pragma weak dmu_buf_refcount = dbuf_refcount
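With the interfaces above, every dbuf hold now carries a tag, and dbuf_hold()/dbuf_hold_level() return NULL when the underlying dbuf_hold_impl() fails, so callers pair each tagged hold with a dbuf_rele() of the same tag and check for failure. A minimal caller sketch (the function name is illustrative, not part of this change; the locking follows the pattern used in dmu_buf_hold() below):

static int
example_hold_rele(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, FTAG);	/* hold is tagged with FTAG */
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (EIO);		/* dbuf_hold_impl() reported an error */

	/* ... dbuf_read(db, NULL, DB_RF_CANFAIL), use the data, etc. ... */

	dbuf_rele(db, FTAG);		/* must release with the same tag */
	return (0);
}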
@@ -1611,6 +1731,8 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int checksum, compress;
+ zbookmark_t zb;
int blksz;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1638,8 +1760,38 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
* be modified yet.
*/
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ void **datap = &db->db_d.db_data_old[txg&TXG_MASK];
+ /*
+ * Simply copy the bonus data into the dnode. It will
+ * be written out when the dnode is synced (and it will
+ * be synced, since it must have been dirty for dbuf_sync
+ * to be called).
+ */
+ /*
+ * Use dn_phys->dn_bonuslen since db.db_size is the length
+ * of the bonus buffer in the open transaction rather than
+ * the syncing transaction.
+ */
+ ASSERT(*datap != NULL);
+ ASSERT3U(db->db_level, ==, 0);
+ ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ if (*datap != db->db.db_data)
+ zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ db->db_d.db_data_old[txg&TXG_MASK] = NULL;
+ db->db_data_pending = NULL;
+ if (db->db_dirtied == txg)
+ db->db_dirtied = 0;
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ mutex_exit(&db->db_mtx);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ return;
+ }
+
if (db->db_level == 0) {
- data = &db->db_d.db_data_old[txg&TXG_MASK];
+ data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
blksz = arc_buf_size(*data);
/*
* If this buffer is currently "in use" (i.e., there are
@@ -1651,17 +1803,15 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
* modified in the syncing context (e.g. DNONE_DNODE blocks)
* or if there is no actual write involved (bonus blocks).
*/
- if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
- db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
- db->db_blkid != DB_BONUS_BLKID) {
+ if (dn->dn_object != DMU_META_DNODE_OBJECT &&
+ db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
if (refcount_count(&db->db_holds) > 1 &&
*data == db->db_buf) {
- *data = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, blksz, db);
+ *data = arc_buf_alloc(os->os_spa, blksz, db);
bcopy(db->db.db_data, (*data)->b_data, blksz);
}
db->db_data_pending = *data;
- } else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
+ } else if (dn->dn_object == DMU_META_DNODE_OBJECT) {
/*
* Private object buffers are released here rather
* than in dbuf_dirty() since they are only modified
@@ -1683,7 +1833,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
return;
}
blksz = db->db.db_size;
@@ -1692,35 +1842,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(*data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID) {
- /*
- * Simply copy the bonus data into the dnode. It will
- * be written out when the dnode is synced (and it will
- * be synced, since it must have been dirty for dbuf_sync
- * to be called). The bonus data will be byte swapped
- * in dnode_byteswap.
- */
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
- ASSERT3U(db->db_level, ==, 0);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
- bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
- dn->dn_phys->dn_bonuslen);
- if (*data != db->db_buf)
- arc_buf_free(*data, db);
- db->db_d.db_data_old[txg&TXG_MASK] = NULL;
- db->db_data_pending = NULL;
- if (db->db_dirtied == txg)
- db->db_dirtied = 0;
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
- return;
- } else if (db->db_level > 0 && !arc_released(db->db_buf)) {
+ if (db->db_level > 0 && !arc_released(db->db_buf)) {
/*
* This indirect buffer was marked dirty, but
* never modified (if it had been modified, then
@@ -1733,7 +1855,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
return;
} else if (db->db_blkptr == NULL &&
db->db_level == dn->dn_phys->dn_nlevels-1 &&
@@ -1757,18 +1879,18 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
if (parent == NULL) {
rw_enter(&dn->dn_struct_rwlock, RW_READER);
(void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, NULL, &parent);
+ db->db_blkid >> epbs, FALSE, FTAG, &parent);
rw_exit(&dn->dn_struct_rwlock);
dbuf_add_ref(parent, db);
db->db_parent = parent;
- dbuf_rele(parent);
+ dbuf_rele(parent, FTAG);
}
- dbuf_read(parent);
+ (void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED);
} else {
mutex_exit(&db->db_mtx);
}
- ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL);
if (db->db_level > 0 &&
db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
@@ -1801,7 +1923,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
return;
}
@@ -1812,20 +1934,17 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_level == parent->db_level-1);
ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
/*
- * We may have read this block after we dirtied it,
+ * We may have read this indirect block after we dirtied it,
* so never released it from the cache.
*/
- arc_release(parent->db_buf, parent);
+ arc_release(parent->db_buf, db->db_parent);
db->db_blkptr = (blkptr_t *)parent->db.db_data +
(db->db_blkid & ((1ULL << epbs) - 1));
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
- }
- ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
-
#ifdef ZFS_DEBUG
- if (db->db_parent == dn->dn_dbuf) {
+ } else {
/*
* We don't need to dnode_setdirty(dn) because if we got
* here then the parent is already dirty.
@@ -1833,11 +1952,14 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
ASSERT3P(db->db_blkptr, ==,
&dn->dn_phys->dn_blkptr[db->db_blkid]);
- }
#endif
+ }
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
if (db->db_level == 0 &&
db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
- arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+ arc_buf_t **old =
+ (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
int old_size = BP_GET_ASIZE(db->db_blkptr);
int new_size = BP_GET_ASIZE(*bpp);
@@ -1861,7 +1983,11 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
*bpp = NULL;
if (*old != db->db_buf)
- arc_buf_free(*old, db);
+ VERIFY(arc_buf_remove_ref(*old, db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
*old = NULL;
db->db_data_pending = NULL;
@@ -1870,54 +1996,55 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
- } else {
- int checksum, compress;
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ return;
+ }
- if (db->db_level > 0) {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- checksum = ZIO_CHECKSUM_FLETCHER_4;
- /* XXX - disable compresssion for now */
- compress = ZIO_COMPRESS_OFF;
+ if (db->db_level > 0) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+ compress = ZIO_COMPRESS_LZJB;
+ } else {
+ /*
+ * Allow dnode settings to override objset settings,
+ * except for metadata checksums.
+ */
+ if (dmu_ot[dn->dn_type].ot_metadata) {
+ checksum = os->os_md_checksum;
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_md_compress);
} else {
- /*
- * Allow dnode settings to override objset settings,
- * except for metadata checksums.
- */
- if (dmu_ot[dn->dn_type].ot_metadata) {
- checksum = os->os_md_checksum;
- compress = zio_compress_select(dn->dn_compress,
- os->os_md_compress);
- } else {
- checksum = zio_checksum_select(dn->dn_checksum,
- os->os_checksum);
- compress = zio_compress_select(dn->dn_compress,
- os->os_compress);
- }
+ checksum = zio_checksum_select(dn->dn_checksum,
+ os->os_checksum);
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_compress);
}
+ }
#ifdef ZFS_DEBUG
- if (db->db_parent) {
- ASSERT(list_link_active(
- &db->db_parent->db_dirty_node[txg&TXG_MASK]));
- ASSERT(db->db_parent == dn->dn_dbuf ||
- db->db_parent->db_level > 0);
- if (dn->dn_object & DMU_PRIVATE_OBJECT ||
- db->db_level > 0)
- ASSERT(*data == db->db_buf);
- }
-#endif
- ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
- (void) arc_write(zio, os->os_spa, checksum, compress, txg,
- db->db_blkptr, *data, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
- /*
- * We can't access db after arc_write, since it could finish
- * and be freed, and we have no locks on it.
- */
+ if (db->db_parent) {
+ ASSERT(list_link_active(
+ &db->db_parent->db_dirty_node[txg&TXG_MASK]));
+ ASSERT(db->db_parent == dn->dn_dbuf ||
+ db->db_parent->db_level > 0);
+ if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
+ ASSERT(*data == db->db_buf);
}
+#endif
+ ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ (void) arc_write(zio, os->os_spa, checksum, compress, txg,
+ db->db_blkptr, *data, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
+ /*
+ * We can't access db after arc_write, since it could finish
+ * and be freed, and we have no locks on it.
+ */
}
struct dbuf_arg {
@@ -1970,12 +2097,17 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_dirtied = 0;
if (db->db_level == 0) {
- arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+ arc_buf_t **old =
+ (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
ASSERT(db->db_blkid != DB_BONUS_BLKID);
if (*old != db->db_buf)
- arc_buf_free(*old, db);
+ VERIFY(arc_buf_remove_ref(*old, db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
*old = NULL;
db->db_data_pending = NULL;
@@ -2007,6 +2139,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db.db_size);
ASSERT3U(dn->dn_phys->dn_maxblkid
>> (db->db_level * epbs), >=, db->db_blkid);
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
if (BP_IS_HOLE(bp))
@@ -2053,5 +2186,5 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
}
}
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
}
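The dbuf.c changes above also thread a zbookmark_t through every arc_read()/arc_write() so the new FMA and persistent-error-log code can name exactly which block an I/O error was observed on. The bookmark is always filled in the same way; a condensed sketch of the pattern used in dbuf_read_impl(), dbuf_prefetch() and dbuf_sync() (everything other than the bookmark fields stands in for the caller's existing arguments):

	zbookmark_t zb;

	/* zb_objset is 0 when there is no dsl_dataset (the meta-objset) */
	zb.zb_objset = db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;	/* object number */
	zb.zb_level = db->db_level;		/* indirection level */
	zb.zb_blkid = db->db_blkid;		/* block id at that level */

	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    ZIO_FLAG_CANFAIL, ARC_NOWAIT, &zb);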
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 14fab6d420..f883842dad 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -40,6 +39,7 @@
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
+#include <sys/zio_checksum.h>
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "unallocated" },
@@ -70,101 +70,40 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, FALSE, "other uint8[]" },
{ byteswap_uint64_array, FALSE, "other uint64[]" },
{ zap_byteswap, TRUE, "other ZAP" },
+ { zap_byteswap, TRUE, "persistent error log" },
};
-static int
-dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
-{
- int i, err = 0;
- dnode_t *dn;
- zio_t *zio;
- int canfail;
- uint64_t rd_sz;
-
- if (numbufs == 0)
- return (0);
-
- rd_sz = numbufs * dbp[0]->db.db_size;
- ASSERT(rd_sz <= DMU_MAX_ACCESS);
-
- dn = dbp[0]->db_dnode;
- if (flags & DB_RF_CANFAIL) {
- canfail = 1;
- } else {
- canfail = 0;
- }
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, canfail);
-
- /* don't prefetch if read the read is large */
- if (rd_sz >= zfetch_array_rd_sz) {
- flags |= DB_RF_NOPREFETCH;
- }
-
- /* initiate async reads */
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- for (i = 0; i < numbufs; i++) {
- if (dbp[i]->db_state == DB_UNCACHED)
- dbuf_read_impl(dbp[i], zio, flags);
- }
- rw_exit(&dn->dn_struct_rwlock);
- err = zio_wait(zio);
-
- if (err)
- return (err);
-
- /* wait for other io to complete */
- for (i = 0; i < numbufs; i++) {
- mutex_enter(&dbp[i]->db_mtx);
- while (dbp[i]->db_state == DB_READ ||
- dbp[i]->db_state == DB_FILL)
- cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx);
- ASSERT(dbp[i]->db_state == DB_CACHED);
- mutex_exit(&dbp[i]->db_mtx);
- }
-
- return (0);
-}
-
-void
-dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
-{
- dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
- int err;
-
- err = dmu_buf_read_array_impl(dbp, numbufs, DB_RF_MUST_SUCCEED);
- ASSERT(err == 0);
-}
-
int
-dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
-{
- dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-
- return (dmu_buf_read_array_impl(dbp, numbufs, DB_RF_CANFAIL));
-}
-
-dmu_buf_t *
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
+ int err;
/* dataset_verify(dd); */
- dn = dnode_hold(os->os, object, FTAG);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
blkid = dbuf_whichblock(dn, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold(dn, blkid);
+ db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
- return (&db->db);
-}
+ if (db == NULL) {
+ err = EIO;
+ } else {
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ dbuf_rele(db, tag);
+ db = NULL;
+ }
+ }
-dmu_buf_t *
-dmu_bonus_hold(objset_t *os, uint64_t object)
-{
- return (dmu_bonus_hold_tag(os, object, NULL));
+ dnode_rele(dn, FTAG);
+ *dbp = &db->db;
+ return (err);
}
int
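dmu_buf_hold() now hands back an error instead of a raw buffer pointer, takes a caller tag, and performs the DB_RF_CANFAIL read itself, so consumers no longer need a separate dmu_buf_read()/dmu_buf_read_canfail() step. A sketch of the caller-side pattern under the new interface (the wrapper name is hypothetical; the error values are those returned by the code above):

static int
example_read_block(objset_t *os, uint64_t object, uint64_t offset, void *buf)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold(os, object, offset, FTAG, &db);
	if (err != 0)
		return (err);	/* ENOENT from dnode_hold(), or EIO */

	/* the buffer is already DB_CACHED; copy out one block's worth */
	bcopy(db->db_data, buf, db->db_size);

	dmu_buf_rele(db, FTAG);
	return (0);
}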
@@ -174,41 +113,69 @@ dmu_bonus_max(void)
}
/*
- * Returns held bonus buffer if the object exists, NULL if it doesn't.
+ * returns ENOENT, EIO, or 0.
*/
-dmu_buf_t *
-dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+ int err, count;
dmu_buf_impl_t *db;
- if (dn == NULL)
- return (NULL);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
- db = dbuf_hold_bonus(dn, tag);
- /* XXX - hack: hold the first block if this is a ZAP object */
- if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- dn->dn_db0 = dbuf_hold(dn, 0);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dn->dn_bonus = dbuf_create_bonus(dn);
}
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ count = refcount_add(&db->db_holds, tag);
+ mutex_exit(&db->db_mtx);
+ if (count == 1)
+ dnode_add_ref(dn, db);
dnode_rele(dn, FTAG);
- return (&db->db);
+
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+
+ *dbp = &db->db;
+ return (0);
}
-static dmu_buf_t **
-dbuf_hold_array(dnode_t *dn,
- uint64_t offset, uint64_t length, int *numbufsp)
+int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
+ dnode_t *dn;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
+ uint32_t flags;
+ int err;
+ zio_t *zio;
+
+ ASSERT(length <= DMU_MAX_ACCESS);
if (length == 0) {
if (numbufsp)
*numbufsp = 0;
- return (NULL);
+ *dbpp = NULL;
+ return (0);
}
+ flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+ if (length >= zfetch_array_rd_sz)
+ flags |= DB_RF_NOPREFETCH;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
@@ -218,83 +185,62 @@ dbuf_hold_array(dnode_t *dn,
ASSERT3U(offset + length, <=, dn->dn_datablksz);
nblks = 1;
}
- dbp = kmem_alloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
blkid = dbuf_whichblock(dn, offset);
for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *dbuf;
- dbuf = dbuf_hold(dn, blkid+i);
- dbp[i] = &dbuf->db;
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+ if (db == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_rele_array(dbp, nblks, tag);
+ dnode_rele(dn, FTAG);
+ zio_nowait(zio);
+ return (EIO);
+ }
+ /* initiate async i/o */
+ if (read && db->db_state == DB_UNCACHED) {
+ rw_exit(&dn->dn_struct_rwlock);
+ (void) dbuf_read(db, zio, flags);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ }
+ dbp[i] = &db->db;
}
rw_exit(&dn->dn_struct_rwlock);
-
- if (numbufsp)
- *numbufsp = nblks;
- return (dbp);
-}
-
-dmu_buf_t **
-dmu_buf_hold_array(objset_t *os, uint64_t object,
- uint64_t offset, uint64_t length, int *numbufsp)
-{
- dnode_t *dn;
- dmu_buf_t **dbp;
-
- ASSERT(length <= DMU_MAX_ACCESS);
-
- if (length == 0) {
- if (numbufsp)
- *numbufsp = 0;
- return (NULL);
- }
-
- dn = dnode_hold(os->os, object, FTAG);
- dbp = dbuf_hold_array(dn, offset, length, numbufsp);
dnode_rele(dn, FTAG);
- return (dbp);
-}
-
-void
-dmu_buf_add_ref(dmu_buf_t *dbuf, void *tag)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- dbuf_add_ref(db, tag);
-}
-
-void
-dmu_buf_remove_ref(dmu_buf_t *dbuf, void *tag)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- dbuf_remove_ref(db, tag);
-}
-
-void
-dmu_buf_rele(dmu_buf_t *dbuf_fake)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
-
- /* XXX - hack: hold the first block if this is a ZAP object */
- if (db->db_blkid == DB_BONUS_BLKID &&
- dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
- dbuf_rele(db->db_dnode->dn_db0);
- dbuf_rele(db);
-}
+ /* wait for async i/o */
+ err = zio_wait(zio);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
-void
-dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
+ /* wait for other io to complete */
+ if (read) {
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ mutex_exit(&db->db_mtx);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+ }
+ }
- /* XXX - hack: hold the first block if this is a ZAP object */
- if (db->db_blkid == DB_BONUS_BLKID &&
- dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
- dbuf_rele(db->db_dnode->dn_db0);
- dbuf_remove_ref(db, tag);
+ *numbufsp = nblks;
+ *dbpp = dbp;
+ return (0);
}
void
-dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
int i;
dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
@@ -302,10 +248,10 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
if (numbufs == 0)
return;
- ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS);
-
- for (i = 0; i < numbufs; i++)
- dbuf_rele(dbp[i]);
+ for (i = 0; i < numbufs; i++) {
+ if (dbp[i])
+ dbuf_rele(dbp[i], tag);
+ }
kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}
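dmu_buf_hold_array() follows the same model: on success every buffer in the array is held (and, when 'read' is TRUE, fully read in), while on any failure the routine cleans up after itself and returns the error; dmu_buf_rele_array() now needs the matching tag. A minimal caller sketch (hypothetical wrapper; it assumes offset and length are block-aligned, which the real dmu_read() above does not require):

static int
example_read_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, void *buf)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;
	char *dst = buf;

	err = dmu_buf_hold_array(os, object, offset, length,
	    TRUE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		/* with read == TRUE every dbp[i] is DB_CACHED here */
		bcopy(dbp[i]->db_data, dst, dbp[i]->db_size);
		dst += dbp[i]->db_size;
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (0);
}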
@@ -315,7 +261,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
dnode_t *dn;
uint64_t blkid;
- int nblks, i;
+ int nblks, i, err;
if (len == 0) { /* they're interested in the bonus buffer */
dn = os->os->os_meta_dnode;
@@ -335,8 +281,8 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
*/
- dn = dnode_hold(os->os, object, FTAG);
- if (dn == NULL)
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -359,39 +305,44 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
dnode_rele(dn, FTAG);
}
-void
+int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+ int err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
ASSERT(offset < UINT64_MAX);
ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
dnode_free_range(dn, offset, size, tx);
dnode_rele(dn, FTAG);
+ return (0);
}
-static int
-dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf, uint32_t flags)
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf)
{
dnode_t *dn;
dmu_buf_t **dbp;
- int numbufs, i;
-
- dn = dnode_hold(os->os, object, FTAG);
+ int numbufs, i, err;
+ /*
+ * Deal with odd block sizes, where there can't be data past the
+ * first block.
+ */
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
if (dn->dn_datablkshift == 0) {
int newsz = offset > dn->dn_datablksz ? 0 :
MIN(size, dn->dn_datablksz - offset);
bzero((char *)buf + newsz, size - newsz);
size = newsz;
}
-
dnode_rele(dn, FTAG);
- if (size == 0)
- return (0);
-
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
int err;
@@ -400,13 +351,10 @@ dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
- dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs);
- err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs,
- flags);
- if (err) {
- dmu_buf_rele_array(dbp, numbufs);
+ err = dmu_buf_hold_array(os, object, offset, mylen,
+ TRUE, FTAG, &numbufs, &dbp);
+ if (err)
return (err);
- }
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -424,36 +372,20 @@ dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
size -= tocpy;
buf = (char *)buf + tocpy;
}
- dmu_buf_rele_array(dbp, numbufs);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
}
return (0);
}
void
-dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf)
-{
- int err;
-
- err = dmu_read_impl(os, object, offset, size, buf, DB_RF_MUST_SUCCEED);
- ASSERT3U(err, ==, 0);
-}
-
-int
-dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf)
-{
- return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL));
-}
-
-void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs, i;
- dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -481,7 +413,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
size -= tocpy;
buf = (char *)buf + tocpy;
}
- dmu_buf_rele_array(dbp, numbufs);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
}
#ifdef _KERNEL
@@ -493,7 +425,10 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
int numbufs, i;
int err = 0;
- dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -530,7 +465,7 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
offset += tocpy;
size -= tocpy;
}
- dmu_buf_rele_array(dbp, numbufs);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
#endif
@@ -539,6 +474,7 @@ struct backuparg {
dmu_replay_record_t *drr;
vnode_t *vp;
objset_t *os;
+ zio_cksum_t zc;
int err;
};
@@ -546,8 +482,9 @@ static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
ssize_t resid; /* have to get resid to get detailed errno */
- /* Need to compute checksum here */
ASSERT3U(len % 8, ==, 0);
+
+ fletcher_4_incremental_native(buf, len, &ba->zc);
ba->err = vn_rdwr(UIO_WRITE, ba->vp,
(caddr_t)buf, len,
0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid);
@@ -652,7 +589,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
void *data = bc->bc_data;
int err = 0;
- if (issig(JUSTLOOKING))
+ if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
ASSERT(data || bp == NULL);
@@ -681,16 +618,21 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
int blksz = BP_GET_LSIZE(bp);
if (data == NULL) {
arc_buf_t *abuf;
+ zbookmark_t zb;
+ zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
+ zb.zb_object = object;
+ zb.zb_level = level;
+ zb.zb_blkid = blkid;
(void) arc_read(NULL, spa, bp,
dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
- ARC_WAIT);
+ ARC_WAIT, &zb);
if (abuf) {
err = dump_data(ba, type, object, blkid * blksz,
blksz, abuf->b_data);
- arc_buf_free(abuf, &abuf);
+ (void) arc_buf_remove_ref(abuf, &abuf);
}
} else {
err = dump_data(ba, type, object, blkid * blksz,
@@ -736,6 +678,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
ba.drr = drr;
ba.vp = vp;
ba.os = tosnap;
+ ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
kmem_free(drr, sizeof (dmu_replay_record_t));
@@ -755,6 +698,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
bzero(drr, sizeof (dmu_replay_record_t));
drr->drr_type = DRR_END;
+ drr->drr_u.drr_end.drr_checksum = ba.zc;
if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)))
return (ba.err);
@@ -773,6 +717,7 @@ struct restorearg {
int buflen; /* number of valid bytes in buf */
int bufoff; /* next offset to read */
int bufsize; /* amount of memory allocated for buf */
+ zio_cksum_t zc;
};
static int
@@ -789,8 +734,11 @@ replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
if (dd->dd_phys->dd_head_dataset_obj == 0)
goto die;
- ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
- NULL, DS_MODE_EXCLUSIVE, FTAG);
+ err = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj,
+ NULL, DS_MODE_EXCLUSIVE, FTAG, &ds);
+ if (err)
+ goto die;
if (ds == NULL) {
err = EBUSY;
@@ -804,9 +752,11 @@ replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
}
/* most recent snapshot must match fromguid */
- ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+ err = dsl_dataset_open_obj(dd->dd_pool,
ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds_prev);
+ if (err)
+ goto die;
if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) {
err = ENODEV;
goto die;
@@ -885,9 +835,8 @@ replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
/* the point of no (unsuccessful) return */
- err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
- DS_MODE_EXCLUSIVE, FTAG, &ds);
- ASSERT3U(err, ==, 0);
+ VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
+ DS_MODE_EXCLUSIVE, FTAG, &ds));
kmem_free(fsfullname, MAXNAMELEN);
(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
@@ -921,9 +870,8 @@ replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
return (err);
/* set snapshot's creation time and guid */
- err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
- DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds);
- ASSERT3U(err, ==, 0);
+ VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
+ DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
@@ -932,8 +880,9 @@ replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
- ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
- NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj,
+ NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG, &ds));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_restoring = FALSE;
dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
@@ -959,8 +908,6 @@ restore_read(struct restorearg *ra, int len)
ra->voff, UIO_SYSSPACE, FAPPEND,
RLIM_INFINITY, CRED(), &resid);
- /* Need to compute checksum */
-
ra->voff += ra->bufsize - leftover - resid;
ra->buflen = ra->bufsize - resid;
ra->bufoff = 0;
@@ -968,12 +915,17 @@ restore_read(struct restorearg *ra, int len)
ra->err = EINVAL;
if (ra->err)
return (NULL);
}
ASSERT3U(ra->bufoff % 8, ==, 0);
ASSERT3U(ra->buflen - ra->bufoff, >=, len);
rv = ra->buf + ra->bufoff;
ra->bufoff += len;
+ if (ra->byteswap)
+ fletcher_4_incremental_byteswap(rv, len, &ra->zc);
+ else
+ fletcher_4_incremental_native(rv, len, &ra->zc);
return (rv);
}
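
dump_bytes() above now folds every record it writes into a running
zio_cksum_t with fletcher_4_incremental_native(), and restore_read() folds
every record it consumes into ra->zc (using the byteswapping variant when
the stream was written on a host of the other endianness).  Fletcher-4 is a
four-accumulator sum over 32-bit words, which is what makes a
chunk-at-a-time update possible.  A stripped-down user-space version of the
incremental update (struct and function names are illustrative, not the
kernel's):

/* fletcher4_sketch.c: incremental Fletcher-4 over 32-bit words */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct cksum {
	uint64_t word[4];
} cksum_t;

/*
 * Fold another buffer into the running checksum.  The length must be a
 * multiple of 4 bytes; the backup stream pads records to 8-byte multiples.
 */
static void
fletcher4_incremental(const void *buf, size_t size, cksum_t *zc)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = zc->word[0], b = zc->word[1];
	uint64_t c = zc->word[2], d = zc->word[3];

	for (; ip < ipend; ip++) {
		a += *ip;	/* running sum of the words */
		b += a;		/* running sum of a */
		c += b;		/* running sum of b */
		d += c;		/* running sum of c */
	}
	zc->word[0] = a; zc->word[1] = b;
	zc->word[2] = c; zc->word[3] = d;
}

int
main(void)
{
	uint32_t stream[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	cksum_t incr = { { 0 } };
	cksum_t whole = { { 0 } };

	/* Checksumming the stream in two chunks ... */
	fletcher4_incremental(stream, 4 * sizeof (uint32_t), &incr);
	fletcher4_incremental(stream + 4, 4 * sizeof (uint32_t), &incr);

	/* ... matches checksumming it in one pass. */
	fletcher4_incremental(stream, sizeof (stream), &whole);

	printf("match: %d\n", memcmp(&incr, &whole, sizeof (incr)) == 0);
	return (0);
}
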
@@ -1016,7 +968,10 @@ backup_byteswap(dmu_replay_record_t *drr)
DO64(drr_free.drr_length);
break;
case DRR_END:
- DO64(drr_end.drr_checksum);
+ DO64(drr_end.drr_checksum.zc_word[0]);
+ DO64(drr_end.drr_checksum.zc_word[1]);
+ DO64(drr_end.drr_checksum.zc_word[2]);
+ DO64(drr_end.drr_checksum.zc_word[3]);
break;
}
#undef DO64
@@ -1089,7 +1044,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
if (drro->drr_bonuslen) {
dmu_buf_t *db;
void *data;
- db = dmu_bonus_hold(os, drro->drr_object);
+ VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
@@ -1103,7 +1058,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
drro->drr_bonuslen);
}
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
dmu_tx_commit(tx);
return (0);
@@ -1202,21 +1157,22 @@ restore_free(struct restorearg *ra, objset_t *os,
dmu_tx_abort(tx);
return (err);
}
- dmu_free_range(os, drrf->drr_object,
+ err = dmu_free_range(os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length, tx);
dmu_tx_commit(tx);
- return (0);
+ return (err);
}
int
-dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
vnode_t *vp, uint64_t voffset)
{
struct restorearg ra;
dmu_replay_record_t *drr;
- char *cp, *tosnap;
+ char *cp;
dsl_dir_t *dd = NULL;
objset_t *os = NULL;
+ zio_cksum_t pzc;
bzero(&ra, sizeof (ra));
ra.vp = vp;
@@ -1233,6 +1189,23 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
goto out;
}
+ /*
+ * NB: this assumes that struct drr_begin will be the largest in
+ * dmu_replay_record_t's drr_u, and thus we don't need to pad it
+ * with zeros to make it the same length as we wrote out.
+ */
+ ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
+ ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
+ ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
+ if (ra.byteswap) {
+ fletcher_4_incremental_byteswap(ra.buf,
+ sizeof (dmu_replay_record_t), &ra.zc);
+ } else {
+ fletcher_4_incremental_native(ra.buf,
+ sizeof (dmu_replay_record_t), &ra.zc);
+ }
+ (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */
+
if (ra.byteswap) {
drrb->drr_magic = BSWAP_64(drrb->drr_magic);
drrb->drr_version = BSWAP_64(drrb->drr_version);
@@ -1244,7 +1217,6 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
- tosnap = drrb->drr_toname;
if (drrb->drr_version != DMU_BACKUP_VERSION ||
drrb->drr_type >= DMU_OST_NUMTYPES ||
strchr(drrb->drr_toname, '@') == NULL) {
@@ -1260,12 +1232,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
cp = strchr(tosnap, '@');
*cp = '\0';
- dd = dsl_dir_open(tosnap, FTAG, NULL);
+ ra.err = dsl_dir_open(tosnap, FTAG, &dd, NULL);
*cp = '@';
- if (dd == NULL) {
- ra.err = ENOENT;
+ if (ra.err)
goto out;
- }
ra.err = dsl_dir_sync_task(dd, replay_incremental_sync,
drrb, 1<<20);
@@ -1275,12 +1245,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
cp = strchr(tosnap, '@');
*cp = '\0';
- dd = dsl_dir_open(tosnap, FTAG, &tail);
+ ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
*cp = '@';
- if (dd == NULL) {
- ra.err = ENOENT;
+ if (ra.err)
goto out;
- }
if (tail == NULL) {
ra.err = EEXIST;
goto out;
@@ -1306,9 +1274,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
/*
* Read records and process them.
*/
+ pzc = ra.zc;
while (ra.err == 0 &&
NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
- if (issig(JUSTLOOKING)) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
ra.err = EINTR;
goto out;
}
@@ -1348,7 +1317,22 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
break;
}
case DRR_END:
- /* Need to verify checksum. */
+ {
+ struct drr_end drre = drr->drr_u.drr_end;
+ /*
+ * We compare against the *previous* checksum
+ * value, because the stored checksum is of
+ * everything before the DRR_END record.
+ */
+ if (drre.drr_checksum.zc_word[0] != 0 &&
+ ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) |
+ (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) |
+ (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) |
+ (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) {
+ ra.err = ECKSUM;
+ goto out;
+ }
+
/*
* dd may be the parent of the dd we are
* restoring into (eg. if it's a full backup).
@@ -1356,10 +1340,12 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
ra.err = dsl_dir_sync_task(dmu_objset_ds(os)->
ds_dir, replay_end_sync, drrb, 1<<20);
goto out;
+ }
default:
ra.err = EINVAL;
goto out;
}
+ pzc = ra.zc;
}
out:
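
The receive loop above keeps two checksums: ra.zc, which restore_read()
advances as each record is consumed, and pzc, a copy taken just before the
current record was read and refreshed at the bottom of the loop.  The
checksum stored in a DRR_END record covers everything written before that
record, so it is compared against pzc rather than against ra.zc, which by
then also includes the END record itself.  A compact sketch of that
bookkeeping over an in-memory record stream (the additive checksum below is
only a stand-in for the Fletcher-4 state; all names are illustrative):

/* end_record_sketch.c: verify an END record against the prior running sum */
#include <stdint.h>
#include <stdio.h>

enum { REC_DATA, REC_END };

typedef struct record {
	int	 type;
	uint64_t payload;	/* data value, or stored checksum for REC_END */
} record_t;

/* Trivial running checksum standing in for the Fletcher-4 state. */
static void
cksum_update(uint64_t *zc, const record_t *r)
{
	*zc = *zc * 31 + (uint64_t)r->type + r->payload;
}

int
main(void)
{
	record_t stream[4] = {
		{ REC_DATA, 10 }, { REC_DATA, 20 }, { REC_DATA, 30 },
		{ REC_END, 0 }
	};
	uint64_t zc = 0, pzc = 0;
	int i;

	/* Writer side: END carries the checksum of everything before it. */
	for (i = 0; i < 3; i++)
		cksum_update(&stream[3].payload, &stream[i]);

	/* Reader side: snapshot the running sum before consuming a record. */
	for (i = 0; i < 4; i++) {
		pzc = zc;			/* checksum of records 0..i-1 */
		cksum_update(&zc, &stream[i]);	/* now includes record i */
		if (stream[i].type == REC_END) {
			/* Compare against pzc, not zc: END isn't covered. */
			printf("%s\n", stream[i].payload == pzc ?
			    "stream ok" : "checksum mismatch");
		}
	}
	return (0);
}
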
@@ -1443,6 +1429,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
dmu_buf_impl_t *db;
blkptr_t *blk;
int err;
+ zbookmark_t zb;
ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
ASSERT(BP_IS_HOLE(bp));
@@ -1452,6 +1439,11 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
/*
+ * XXX why is this routine using dmu_buf_*() and casting between
+ * dmu_buf_impl_t and dmu_buf_t?
+ */
+
+ /*
* If this txg already synced, there's nothing to do.
*/
if (txg <= tx->tx_synced_txg) {
@@ -1459,7 +1451,10 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
* If we're running ziltest, we need the blkptr regardless.
*/
if (txg > spa_freeze_txg(dp->dp_spa)) {
- db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+ err = dmu_buf_hold(os, object, offset,
+ FTAG, (dmu_buf_t **)&db);
+ if (err)
+ return (err);
/* if db_blkptr == NULL, this was an empty write */
if (db->db_blkptr)
*bp = *db->db_blkptr; /* structure assignment */
@@ -1467,7 +1462,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
bzero(bp, sizeof (blkptr_t));
*blkoff = offset - db->db.db_offset;
ASSERT3U(*blkoff, <, db->db.db_size);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
return (0);
}
return (EALREADY);
@@ -1481,7 +1476,9 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
return (EINPROGRESS);
}
- db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+ err = dmu_buf_hold(os, object, offset, FTAG, (dmu_buf_t **)&db);
+ if (err)
+ return (err);
mutex_enter(&db->db_mtx);
@@ -1491,7 +1488,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
*/
if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
return (ENOENT);
}
@@ -1505,7 +1502,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
ASSERT(blk != IN_DMU_SYNC);
if (blk == IN_DMU_SYNC) {
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
return (EBUSY);
}
arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
@@ -1522,11 +1519,15 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
blk->blk_birth = 0; /* mark as invalid */
+ zb.zb_objset = os->os->os_dsl_dataset->ds_object;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
err = arc_write(NULL, os->os->os_spa,
zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
ASSERT(err == 0);
if (!BP_IS_HOLE(blk)) {
@@ -1546,7 +1547,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
/* Note that this block does not free on disk until txg syncs */
/*
@@ -1563,7 +1564,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
ASSERT3U(txg, >, tx->tx_syncing_txg);
return (0);
}
@@ -1571,7 +1572,10 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
uint64_t
dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
uint64_t rv = dnode_max_nonzero_offset(dn);
dnode_rele(dn, FTAG);
return (rv);
@@ -1581,8 +1585,13 @@ int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dmu_tx_t *tx)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
- int err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_blksz(dn, size, ibs, tx);
dnode_rele(dn, FTAG);
return (err);
}
@@ -1591,7 +1600,10 @@ void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
dmu_tx_t *tx)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
@@ -1602,7 +1614,10 @@ void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
@@ -1615,7 +1630,9 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
dnode_t *dn;
int i, err;
- dn = dnode_hold(os->os, object, FTAG);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
/*
* Sync any current changes before
* we go trundling through the block pointers.
@@ -1627,7 +1644,9 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
if (i != TXG_SIZE) {
dnode_rele(dn, FTAG);
txg_wait_synced(dmu_objset_pool(os), 0);
- dn = dnode_hold(os->os, object, FTAG);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
}
err = dnode_next_offset(dn, hole, off, 1, 1);
@@ -1665,10 +1684,11 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+ int err = dnode_hold(os->os, object, FTAG, &dn);
- if (dn == NULL)
- return (ENOENT);
+ if (err)
+ return (err);
if (doi != NULL)
dmu_object_info_from_dnode(dn, doi);
@@ -1699,6 +1719,71 @@ dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
*nblk512 = dn->dn_phys->dn_secphys + 1; /* add 1 for dnode space */
}
+/*
+ * Given a bookmark, return the name of the dataset, object, and range in
+ * human-readable format.
+ */
+int
+spa_bookmark_name(spa_t *spa, zbookmark_t *zb, char *dsname, size_t dslen,
+ char *objname, size_t objlen, char *range, size_t rangelen)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+ objset_t *os = NULL;
+ dnode_t *dn = NULL;
+ int err, shift;
+
+ if (dslen < MAXNAMELEN || objlen < 32 || rangelen < 64)
+ return (ENOSPC);
+
+ dp = spa_get_dsl(spa);
+ if (zb->zb_objset != 0) {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_open_obj(dp, zb->zb_objset,
+ NULL, DS_MODE_NONE, FTAG, &ds);
+ if (err) {
+ rw_exit(&dp->dp_config_rwlock);
+ return (err);
+ }
+ dsl_dataset_name(ds, dsname);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+
+ err = dmu_objset_open(dsname, DMU_OST_ANY, DS_MODE_NONE, &os);
+ if (err)
+ goto out;
+
+ } else {
+ dsl_dataset_name(NULL, dsname);
+ os = dp->dp_meta_objset;
+ }
+
+
+ if (zb->zb_object == DMU_META_DNODE_OBJECT) {
+ (void) strncpy(objname, "mdn", objlen);
+ } else {
+ (void) snprintf(objname, objlen, "%lld",
+ (longlong_t)zb->zb_object);
+ }
+
+ err = dnode_hold(os->os, zb->zb_object, FTAG, &dn);
+ if (err)
+ goto out;
+
+ shift = (dn->dn_datablkshift?dn->dn_datablkshift:SPA_MAXBLOCKSHIFT) +
+ zb->zb_level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
+ (void) snprintf(range, rangelen, "%llu-%llu",
+ (u_longlong_t)(zb->zb_blkid << shift),
+ (u_longlong_t)((zb->zb_blkid+1) << shift));
+
+out:
+ if (dn)
+ dnode_rele(dn, FTAG);
+ if (os && os != dp->dp_meta_objset)
+ dmu_objset_close(os);
+ return (err);
+}
+
void
byteswap_uint64_array(void *vbuf, size_t size)
{
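
spa_bookmark_name() above turns a zbookmark (objset, object, level, blkid)
into dataset, object, and byte-range strings for error reporting.  The only
non-obvious step is the range: a block with id B at indirection level L
covers bytes B << shift through (B + 1) << shift, where shift is the data
block shift plus L times (indblkshift - SPA_BLKPTRSHIFT), i.e. log2 of the
number of 128-byte block pointers that fit in one indirect block.  A
standalone version of that arithmetic with made-up shift values:

/* bookmark_range_sketch.c: byte range covered by a block at a given level */
#include <stdint.h>
#include <stdio.h>

#define	BLKPTRSHIFT	7	/* a block pointer is 128 bytes, log2 = 7 */

/*
 * Return the shift that converts a block id at `level' into a byte offset.
 * datablkshift: log2 of the data block size.
 * indblkshift:  log2 of the indirect block size.
 */
static int
range_shift(int datablkshift, int indblkshift, int level)
{
	return (datablkshift + level * (indblkshift - BLKPTRSHIFT));
}

int
main(void)
{
	/* Example geometry: 128K data blocks, 16K indirect blocks. */
	int datablkshift = 17, indblkshift = 14;
	int level;

	for (level = 0; level <= 2; level++) {
		int shift = range_shift(datablkshift, indblkshift, level);
		uint64_t blkid = 3;	/* arbitrary block id */

		printf("level %d, blkid %llu covers bytes %llu-%llu\n",
		    level, (unsigned long long)blkid,
		    (unsigned long long)(blkid << shift),
		    (unsigned long long)((blkid + 1) << shift));
	}
	return (0);
}
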
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
index d150d6c400..99d40c5ec5 100644
--- a/usr/src/uts/common/fs/zfs/dmu_object.c
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,7 +38,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
uint64_t object;
uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
(osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
- dnode_t *dn;
+ dnode_t *dn = NULL;
int restarted = B_FALSE;
mutex_enter(&osi->os_obj_lock);
@@ -62,7 +61,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
}
osi->os_obj_next = ++object;
- dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+ FTAG, &dn);
if (dn)
break;
@@ -84,13 +90,14 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
dnode_t *dn;
+ int err;
- if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx))
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (EBADF);
- dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
- if (dn == NULL)
- return (EEXIST);
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ if (err)
+ return (err);
dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
dnode_rele(dn, FTAG);
@@ -103,13 +110,15 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
dnode_t *dn;
+ int err;
- if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx))
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (EBADF);
- dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
- if (dn == NULL)
- return (EBADF);
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err)
+ return (err);
dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
dnode_rele(dn, FTAG);
@@ -120,12 +129,14 @@ int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
dnode_t *dn;
+ int err;
- ASSERT(!(object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+ ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
- if (dn == NULL)
- return (ENOENT);
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err)
+ return (err);
ASSERT(dn->dn_type != DMU_OT_NONE);
dnode_free(dn, tx);
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 8d77ff70c0..6625fdb98d 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -127,8 +126,9 @@ dmu_objset_byteswap(void *buf, size_t size)
osp->os_type = BSWAP_64(osp->os_type);
}
-objset_impl_t *
-dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
+int
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ objset_impl_t **osip)
{
objset_impl_t *winner, *osi;
int i, err, checksum;
@@ -141,15 +141,25 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
osi->os_rootbp = *bp;
osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
if (!BP_IS_HOLE(&osi->os_rootbp)) {
+ zbookmark_t zb;
+ zb.zb_objset = ds ? ds->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+
dprintf_bp(&osi->os_rootbp, "reading %s", "");
- (void) arc_read(NULL, spa, &osi->os_rootbp,
+ err = arc_read(NULL, spa, &osi->os_rootbp,
dmu_ot[DMU_OT_OBJSET].ot_byteswap,
arc_bcopy_func, osi->os_phys,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT, &zb);
+ if (err) {
+ zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
} else {
bzero(osi->os_phys, sizeof (objset_phys_t));
}
- osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
/*
* Note: the changed_cb will be called once before the register
@@ -159,18 +169,22 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
if (ds) {
err = dsl_prop_register(ds, "checksum",
checksum_changed_cb, osi);
- ASSERT(err == 0);
-
- err = dsl_prop_register(ds, "compression",
- compression_changed_cb, osi);
- ASSERT(err == 0);
+ if (err == 0)
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ if (err) {
+ zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
} else {
/* It's the meta-objset. */
- /* XXX - turn off metadata compression temporarily */
osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_compress = ZIO_COMPRESS_OFF;
+ osi->os_compress = ZIO_COMPRESS_LZJB;
}
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
+
/*
* Metadata always gets compressed and checksummed.
* If the data checksum is multi-bit correctable, and it's not
@@ -184,9 +198,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
osi->os_md_checksum = checksum;
else
osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
-
- /* XXX - turn off metadata compression temporarily */
- osi->os_md_compress = ZIO_COMPRESS_OFF;
+ osi->os_md_compress = ZIO_COMPRESS_LZJB;
for (i = 0; i < TXG_SIZE; i++) {
list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
@@ -210,7 +222,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
}
}
- return (osi);
+ *osip = osi;
+ return (0);
}
/* called from zpl */
@@ -235,7 +248,13 @@ dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
blkptr_t bp;
dsl_dataset_get_blkptr(ds, &bp);
- osi = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &bp);
+ err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
+ ds, &bp, &osi);
+ if (err) {
+ dsl_dataset_close(ds, mode, os);
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
}
os->os = osi;
@@ -257,9 +276,51 @@ dmu_objset_close(objset_t *os)
}
void
+dmu_objset_evict_dbufs(objset_t *os)
+{
+ objset_impl_t *osi = os->os;
+ dnode_t *mdn = osi->os_meta_dnode;
+ dnode_t *dn;
+ int allzero = B_TRUE;
+
+ /*
+ * Each time we process an entry on the list, we first move it
+ * to the tail so that we don't process it over and over again.
+ * We use the meta-dnode as a marker: if we make a complete pass
+ * over the list without finding any work to do, we're done.
+ * This ensures that we complete in linear time rather than
+ * quadratic time, as described in detail in bug 1182169.
+ */
+ mutex_enter(&osi->os_lock);
+ list_remove(&osi->os_dnodes, mdn);
+ list_insert_tail(&osi->os_dnodes, mdn);
+ while ((dn = list_head(&osi->os_dnodes)) != NULL) {
+ list_remove(&osi->os_dnodes, dn);
+ list_insert_tail(&osi->os_dnodes, dn);
+ if (dn == mdn) {
+ if (allzero)
+ break;
+ allzero = B_TRUE;
+ continue;
+ }
+ if (!refcount_is_zero(&dn->dn_holds)) {
+ allzero = B_FALSE;
+ dnode_add_ref(dn, FTAG);
+ mutex_exit(&osi->os_lock);
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
+ mutex_enter(&osi->os_lock);
+ }
+ }
+ mutex_exit(&osi->os_lock);
+ dnode_evict_dbufs(mdn);
+}
+
+void
dmu_objset_evict(dsl_dataset_t *ds, void *arg)
{
objset_impl_t *osi = arg;
+ objset_t os;
int err, i;
for (i = 0; i < TXG_SIZE; i++) {
@@ -277,6 +338,13 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
ASSERT(err == 0);
}
+ /*
+ * We should need only a single pass over the dnode list, since
+ * nothing can be added to the list at this point.
+ */
+ os.os = osi;
+ dmu_objset_evict_dbufs(&os);
+
ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
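
dmu_objset_evict_dbufs() cannot simply iterate the dnode list, because
evicting dbufs drops os_lock and the list can shift underneath it.  Instead
it rotates each visited entry to the tail and uses the meta-dnode as a
sentinel: completing a lap back to the sentinel without finding any held
dnodes means the list is quiescent, which keeps the sweep linear rather
than quadratic.  The same rotate-to-tail-with-marker idea on a plain
circular list, without any locking (the types and the "work" performed per
entry are hypothetical):

/* marker_sweep_sketch.c: linear-time sweep of a list that may gain work */
#include <stdio.h>

typedef struct node {
	struct node *prev, *next;
	int	holds;		/* nonzero means "still busy" */
	int	is_marker;
} node_t;

static void
list_remove(node_t *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static void
list_insert_tail(node_t *head, node_t *n)	/* head is a sentinel */
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

int
main(void)
{
	node_t head = { &head, &head, 0, 0 };	/* empty circular list */
	node_t marker = { NULL, NULL, 0, 1 };
	node_t items[5];
	node_t *n;
	int i, did_work = 0, visits = 0;

	for (i = 0; i < 5; i++) {
		items[i].holds = i % 2;		/* some entries start busy */
		items[i].is_marker = 0;
		list_insert_tail(&head, &items[i]);
	}
	list_insert_tail(&head, &marker);

	/*
	 * Rotate each entry to the tail as we look at it.  Seeing the
	 * marker again means one full lap; a lap that found no work
	 * means the sweep is done.
	 */
	while ((n = head.next) != &head) {
		list_remove(n);
		list_insert_tail(&head, n);
		visits++;
		if (n->is_marker) {
			if (!did_work)
				break;
			did_work = 0;
			continue;
		}
		if (n->holds != 0) {
			n->holds--;	/* pretend the busy entry drained */
			did_work = 1;
		}
	}
	printf("visited %d entries total for 5 items\n", visits);
	return (0);
}
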
@@ -297,7 +365,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
- osi = dmu_objset_open_impl(spa, ds, NULL);
+ VERIFY(0 == dmu_objset_open_impl(spa, ds, NULL, &osi));
mdn = osi->os_meta_dnode;
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
@@ -314,9 +382,21 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
* needs to be synced multiple times as spa_sync() iterates
* to convergence, so minimizing its dn_nlevels matters.
*/
- if (ds != NULL)
+ if (ds != NULL) {
+ int levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the meta-dnode
+ * to contain DN_MAX_OBJECT dnodes.
+ */
+ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
+ (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT * sizeof (dnode_phys_t))
+ levels++;
+
mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
- mdn->dn_nlevels = DN_META_DNODE_LEVELS;
+ mdn->dn_nlevels = levels;
+ }
ASSERT(type != DMU_OST_NONE);
ASSERT(type != DMU_OST_ANY);
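
Rather than hard-coding DN_META_DNODE_LEVELS, dmu_objset_create_impl() now
grows the meta-dnode's indirection until its block pointers can address
DN_MAX_OBJECT dnodes' worth of bytes.  The same loop as a standalone
program, using representative on-disk constants (the real values come from
the format headers; treat the numbers below as assumptions for
illustration):

/* mdn_levels_sketch.c: how many indirection levels the meta-dnode needs */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Representative constants (values assumed for illustration). */
	uint64_t max_object = 1ULL << 48;	/* DN_MAX_OBJECT */
	uint64_t dnode_size = 512;		/* sizeof (dnode_phys_t) */
	int nblkptr = 3;			/* block pointers in the dnode */
	int datablkshift = 14;			/* 16K dnode blocks */
	int indblkshift = 14;			/* 16K indirect blocks */
	int blkptrshift = 7;			/* 128-byte block pointers */
	int levels = 1;

	/*
	 * Add levels until the tree rooted in the meta-dnode's block
	 * pointers can address max_object * dnode_size bytes.
	 */
	while (((uint64_t)nblkptr << (datablkshift +
	    (levels - 1) * (indblkshift - blkptrshift))) <
	    max_object * dnode_size)
		levels++;

	printf("meta-dnode needs %d levels\n", levels);
	return (0);
}
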
@@ -354,9 +434,8 @@ dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
if (err)
return (err);
- err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
- DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
- ASSERT3U(err, ==, 0);
+ VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
dsl_dataset_get_blkptr(ds, &bp);
if (BP_IS_HOLE(&bp)) {
objset_impl_t *osi;
@@ -382,9 +461,9 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
const char *tail;
int err = 0;
- pds = dsl_dir_open(name, FTAG, &tail);
- if (pds == NULL)
- return (ENOENT);
+ err = dsl_dir_open(name, FTAG, &pds, &tail);
+ if (err)
+ return (err);
if (tail == NULL) {
dsl_dir_close(pds, FTAG);
return (EEXIST);
@@ -554,6 +633,7 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
int txgoff;
list_t *dirty_list;
int err;
+ zbookmark_t zb;
arc_buf_t *abuf =
arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
@@ -586,11 +666,15 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
* Sync the root block.
*/
bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
err = arc_write(NULL, os->os_spa, os->os_md_checksum,
os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
ASSERT(err == 0);
- arc_buf_free(abuf, FTAG);
+ VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1);
dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
@@ -707,10 +791,10 @@ dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags)
zap_cursor_t zc;
zap_attribute_t attr;
char *child;
- int do_self;
+ int do_self, err;
- dd = dsl_dir_open(name, FTAG, NULL);
- if (dd == NULL)
+ err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (err)
return;
do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
index fedeba015d..fbc55fec86 100644
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -339,7 +338,7 @@ traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
} else {
error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- th->th_zio_flags | ZIO_FLAG_DONT_CACHE));
+ th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
if (BP_SHOULD_BYTESWAP(bp) && error == 0)
(zb->zb_level > 0 ? byteswap_uint64_array :
@@ -469,13 +468,70 @@ get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
return (rc);
}
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t maxtxg)
+{
+ traverse_handle_t *th = arg;
+ traverse_blk_cache_t *bc = &th->th_zil_cache;
+ zbookmark_t *zb = &bc->bc_bookmark;
+
+ if (bp->blk_birth < maxtxg) {
+ zb->zb_object = 0;
+ zb->zb_blkid = bp->blk_cksum.zc_word[3];
+ bc->bc_blkptr = *bp;
+ (void) th->th_func(bc, th->th_spa, th->th_arg);
+ }
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t maxtxg)
+{
+ traverse_handle_t *th = arg;
+ traverse_blk_cache_t *bc = &th->th_zil_cache;
+ zbookmark_t *zb = &bc->bc_bookmark;
+
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ if (bp->blk_birth != 0 && bp->blk_birth < maxtxg) {
+ zb->zb_object = lr->lr_foid;
+ zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+ bc->bc_blkptr = *bp;
+ (void) th->th_func(bc, th->th_spa, th->th_arg);
+ }
+ }
+}
+
+static void
+traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc, uint64_t maxtxg)
+{
+ spa_t *spa = th->th_spa;
+ objset_phys_t *osphys = bc->bc_data;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ zilog_t *zilog;
+
+ ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
+ ASSERT(bc->bc_bookmark.zb_level == -1);
+
+ th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
+
+ zilog = zil_alloc(dp->dp_meta_objset, &osphys->os_zil_header);
+
+ zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, maxtxg);
+
+ zil_free(zilog);
+}
+
static int
traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
{
zbookmark_t *zb = &zseg->seg_start;
traverse_blk_cache_t *bc;
dnode_phys_t *dn, *dn_tmp;
- int worklimit = 1000;
+ int worklimit = 100;
int rc;
dprintf("<%llu, %llu, %d, %llx>\n",
@@ -529,6 +585,8 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
if (zb->zb_level == -1) {
ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_blkid == 0);
+ ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
rc = traverse_callback(th, zseg, bc);
@@ -536,6 +594,9 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
ASSERT(rc == EINTR);
return (rc);
}
+ if ((th->th_advance & ADVANCE_ZIL) &&
+ zb->zb_objset != 0)
+ traverse_zil(th, bc, zseg->seg_maxtxg);
}
return (advance_from_osphys(zseg, th->th_advance));
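
With ADVANCE_ZIL set, traversal now covers a dataset's intent log as well:
traverse_zil() builds a throwaway zilog and lets zil_parse() drive one
callback per log block and one per log record, and the two callbacks above
convert anything born before maxtxg into the same bookmark-plus-blkptr
callback the rest of the traversal uses.  The block-callback/record-callback
shape, reduced to a user-space walk over an in-memory log (all names here
are hypothetical):

/* zil_walk_sketch.c: callback pair for walking a log's blocks and records */
#include <stdint.h>
#include <stdio.h>

typedef struct log_block {
	uint64_t birth_txg;
	int	 nrecords;
	uint64_t record_txg[4];
} log_block_t;

typedef void (*blk_cb_t)(const log_block_t *, void *arg, uint64_t maxtxg);
typedef void (*rec_cb_t)(uint64_t rec_txg, void *arg, uint64_t maxtxg);

/* Visit every block, then every record inside it, bounded by maxtxg. */
static void
log_parse(const log_block_t *log, int nblocks,
    blk_cb_t blk_cb, rec_cb_t rec_cb, void *arg, uint64_t maxtxg)
{
	int i, j;

	for (i = 0; i < nblocks; i++) {
		blk_cb(&log[i], arg, maxtxg);
		for (j = 0; j < log[i].nrecords; j++)
			rec_cb(log[i].record_txg[j], arg, maxtxg);
	}
}

static void
count_block(const log_block_t *lb, void *arg, uint64_t maxtxg)
{
	if (lb->birth_txg < maxtxg)
		(*(int *)arg)++;	/* block would be handed to the walk */
}

static void
count_record(uint64_t rec_txg, void *arg, uint64_t maxtxg)
{
	if (rec_txg != 0 && rec_txg < maxtxg)
		(*(int *)arg)++;	/* record's data block visited too */
}

int
main(void)
{
	log_block_t log[2] = {
		{ 5, 2, { 5, 6 } },
		{ 9, 2, { 9, 12 } },	/* second record is too new */
	};
	int visited = 0;

	log_parse(log, 2, count_block, count_record, &visited, 10);
	printf("visited %d blocks/records\n", visited);	/* prints 5 */
	return (0);
}
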
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 6576107ae2..894bd63f36 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -37,6 +37,9 @@
#include <sys/spa.h>
#include <sys/zfs_context.h>
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
#ifdef ZFS_DEBUG
int dmu_use_tx_debug_bufs = 1;
#endif
@@ -60,6 +63,7 @@ dmu_tx_create(objset_t *os)
{
dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir);
tx->tx_objset = os;
+ tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
return (tx);
}
@@ -85,7 +89,7 @@ dmu_tx_is_syncing(dmu_tx_t *tx)
int
dmu_tx_private_ok(dmu_tx_t *tx)
{
- return (tx->tx_anyobj || tx->tx_privateobj);
+ return (tx->tx_anyobj);
}
static void
@@ -95,11 +99,16 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
{
dmu_tx_hold_t *dth;
dnode_t *dn = NULL;
+ int err;
if (object != DMU_NEW_OBJECT) {
- dn = dnode_hold(os->os, object, tx);
+ err = dnode_hold(os->os, object, tx, &dn);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
- if (tx->tx_txg != 0) {
+ if (err == 0 && tx->tx_txg != 0) {
mutex_enter(&dn->dn_mtx);
/*
* dn->dn_assigned_txg == tx->tx_txg doesn't pose a
@@ -118,15 +127,12 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
dth->dth_dnode = dn;
dth->dth_type = type;
- dth->dth_func = func;
dth->dth_arg1 = arg1;
dth->dth_arg2 = arg2;
- /*
- * XXX Investigate using a different data structure to keep
- * track of dnodes in a tx. Maybe array, since there will
- * generally not be many entries?
- */
list_insert_tail(&tx->tx_holds, dth);
+
+ if (func)
+ func(tx, dn, arg1, arg2);
}
void
@@ -142,11 +148,27 @@ dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
}
}
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+ int err;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold_level(dn, level, blkid, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL)
+ return (EIO);
+ err = dbuf_read(db, zio, DB_RF_CANFAIL);
+ dbuf_rele(db, FTAG);
+ return (err);
+}
+
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
- uint64_t start, end, space;
+ uint64_t start, end, i, space;
int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
if (len == 0)
@@ -158,6 +180,64 @@ dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
max_ibs = DN_MAX_INDBLKSHIFT;
/*
+ * For i/o error checking, read the first and last level-0
+ * blocks, and all the level-1 blocks. We needn't do this on
+ * the meta-dnode, because we've already read it in.
+ */
+
+ if (dn && dn->dn_object != DMU_META_DNODE_OBJECT) {
+ int err;
+
+ if (dn->dn_maxblkid == 0) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ } else {
+ zio_t *zio = zio_root(tx->tx_pool->dp_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ start = off/dn->dn_datablksz;
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ /* last level-0 block */
+ end = (off+len)/dn->dn_datablksz;
+ if (end != start) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, end);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+
+ /* level-1 blocks */
+ if (dn->dn_nlevels > 1) {
+ start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (i = start+1; i < end; i++) {
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+ }
+
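
The block above pre-reads, with ZIO_FLAG_CANFAIL, exactly the blocks a
later write would have to read and modify: the first and last level-0
blocks touched by [off, off+len) and every level-1 indirect block spanning
that range, recording any failure in tx_err so the error surfaces at
dmu_tx_assign() time rather than during sync, when it could no longer be
returned cleanly.  The index arithmetic in isolation (the geometry below is
illustrative):

/* tx_probe_sketch.c: which blocks to probe before committing to a write */
#include <stdint.h>
#include <stdio.h>

#define	BLKPTRSHIFT	7	/* 128-byte block pointers */

int
main(void)
{
	/* Illustrative object geometry: 128K data blocks, 16K indirects. */
	uint64_t datablksz = 128 * 1024;
	int indblkshift = 14;
	int epbs = indblkshift - BLKPTRSHIFT;	/* log2(ptrs per indirect) */

	/* The write being checked. */
	uint64_t off = 1000ULL * 1024 * 1024;	/* 1000 MB in */
	uint64_t len = 300ULL * 1024 * 1024;	/* 300 MB long */

	/* First and last level-0 blocks: probe these two directly. */
	uint64_t start = off / datablksz;
	uint64_t end = (off + len) / datablksz;
	printf("probe level-0 blocks %llu and %llu\n",
	    (unsigned long long)start, (unsigned long long)end);

	/* Every level-1 block strictly between them gets probed as well. */
	start >>= epbs;
	end >>= epbs;
	printf("probe level-1 blocks %llu..%llu (%llu of them)\n",
	    (unsigned long long)(start + 1), (unsigned long long)(end - 1),
	    (unsigned long long)(end > start + 1 ? end - start - 1 : 0));
	return (0);
}
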
+ /*
* If there's more than one block, the blocksize can't change,
* so we can make a more precise estimate. Alternatively,
* if the dnode's ibs is larger than max_ibs, always use that.
@@ -218,7 +298,7 @@ dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn)
dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT);
if (dn && dn->dn_dbuf->db_blkptr &&
dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_dbuf->db_blkptr->blk_birth, tx)) {
+ dn->dn_dbuf->db_blkptr->blk_birth)) {
tx->tx_space_tooverwrite +=
tx->tx_space_towrite - pre_write_space;
tx->tx_space_towrite = pre_write_space;
@@ -237,7 +317,7 @@ void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
ASSERT(tx->tx_txg == 0);
- ASSERT(len > 0 && len < DMU_MAX_ACCESS);
+ ASSERT(len < DMU_MAX_ACCESS);
ASSERT(UINT64_MAX - off >= len - 1);
dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE,
@@ -251,8 +331,6 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
uint64_t space = 0;
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- ASSERT(dn->dn_assigned_tx == tx || dn->dn_assigned_tx == NULL);
-
if (dn->dn_datablkshift == 0)
return;
/*
@@ -264,8 +342,10 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
blkid = off >> dn->dn_datablkshift;
nblks = (off + len) >> dn->dn_datablkshift;
- if (blkid >= dn->dn_maxblkid)
- goto out;
+ if (blkid >= dn->dn_maxblkid) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
if (blkid + nblks > dn->dn_maxblkid)
nblks = dn->dn_maxblkid - blkid;
@@ -278,12 +358,12 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
blkptr_t *bp = dn->dn_phys->dn_blkptr;
ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
bp += blkid + i;
- if (dsl_dataset_block_freeable(ds, bp->blk_birth, tx)) {
+ if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
dprintf_bp(bp, "can free old%s", "");
space += BP_GET_ASIZE(bp);
}
}
- goto out;
+ nblks = 0;
}
while (nblks) {
@@ -299,20 +379,26 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
int i;
blkptr_t *bp;
- dbuf_read_havestruct(dbuf);
+ err = dbuf_read(dbuf, NULL,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+ if (err != 0) {
+ tx->tx_err = err;
+ dbuf_rele(dbuf, FTAG);
+ break;
+ }
bp = dbuf->db.db_data;
bp += blkoff;
for (i = 0; i < tochk; i++) {
if (dsl_dataset_block_freeable(ds,
- bp[i].blk_birth, tx)) {
+ bp[i].blk_birth)) {
dprintf_bp(&bp[i],
"can free old%s", "");
space += BP_GET_ASIZE(&bp[i]);
}
}
- dbuf_remove_ref(dbuf, FTAG);
+ dbuf_rele(dbuf, FTAG);
} else {
/* the indirect block is sparse */
ASSERT(err == ENOENT);
@@ -321,7 +407,6 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
blkid += tochk;
nblks -= tochk;
}
-out:
rw_exit(&dn->dn_struct_rwlock);
tx->tx_space_tofree += space;
@@ -330,7 +415,9 @@ out:
static void
dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
- int dirty;
+ uint64_t start, end, i;
+ int dirty, err, shift;
+ zio_t *zio;
/* first block */
if (off != 0 /* || dn->dn_maxblkid == 0 */)
@@ -339,13 +426,46 @@ dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
if (len != DMU_OBJECT_END)
dmu_tx_count_write(tx, dn, off+len, 1);
- dmu_tx_count_dnode(tx, dn);
-
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+ /*
+ * For i/o error checking, read the first and last level-0
+ * blocks, and all the level-1 blocks. The above count_write's
+ * will take care of the level-0 blocks.
+ */
+ shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ start = off >> shift;
+ end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+
+ zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (i = start+1; i < end; i++) {
+ uint64_t ibyte = i << shift;
+ err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1);
+ i = ibyte >> shift;
+ if (err == ESRCH)
+ break;
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+ err = zio_wait(zio);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ dmu_tx_count_dnode(tx, dn);
+
/* XXX locking */
dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
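
dmu_tx_hold_free_impl() probes level-1 blocks the same way, but a free can
span a huge and mostly sparse range, so instead of touching every slot it
asks dnode_next_offset() for the next allocated region and jumps the
level-1 index straight there; ESRCH means nothing further is allocated and
the scan stops.  The skip-the-holes loop shape in miniature (the present[]
map and the names are made up):

/* free_probe_sketch.c: probing only the allocated level-1 blocks of a range */
#include <stdio.h>
#include <errno.h>

#define	NSLOTS	16

/* Pretend level-1 map: 1 = indirect block exists, 0 = hole. */
static const int present[NSLOTS] = { 1, 0, 0, 1, 0, 1, 1, 0,
				     0, 0, 0, 0, 1, 0, 0, 0 };

/* Stand-in for dnode_next_offset(): advance *idx to the next allocated slot. */
static int
next_allocated(int *idx)
{
	int i;

	for (i = *idx; i < NSLOTS; i++) {
		if (present[i]) {
			*idx = i;
			return (0);
		}
	}
	return (ESRCH);		/* nothing allocated past *idx */
}

int
main(void)
{
	int start = 0, end = 15;	/* level-1 index range of the free */
	int i, err, probed = 0;

	for (i = start + 1; i < end; i++) {
		err = next_allocated(&i);
		if (err == ESRCH)
			break;		/* rest of the range is a hole */
		probed++;		/* here the real code reads block i */
		printf("probe level-1 block %d\n", i);
	}
	printf("probed %d of %d slots\n", probed, end - start - 1);
	return (0);
}
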
@@ -364,17 +484,17 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
/* ARGSUSED */
static void
-dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
+dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t add, uint64_t iname)
{
uint64_t nblocks;
- int epbs;
+ int epbs, err;
+ char *name = (char *)(uintptr_t)iname;
dmu_tx_count_dnode(tx, dn);
if (dn == NULL) {
/*
- * Assuming that nops+cops is not super huge, we will be
- * able to fit a new object's entries into one leaf
+ * We will be able to fit a new object's entries into one leaf
* block. So there will be at most 2 blocks total,
* including the header block.
*/
@@ -384,25 +504,44 @@ dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
- if (dn->dn_maxblkid == 0 && nops == 0) {
+ if (dn->dn_maxblkid == 0 && !add) {
/*
* If there is only one block (i.e. this is a micro-zap)
- * and we are only doing updates, the accounting is simple.
+ * and we are not adding anything, the accounting is simple.
*/
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_phys->dn_blkptr[0].blk_birth, tx))
+ dn->dn_phys->dn_blkptr[0].blk_birth))
tx->tx_space_tooverwrite += dn->dn_datablksz;
else
tx->tx_space_towrite += dn->dn_datablksz;
return;
}
+ if (dn->dn_maxblkid > 0 && name) {
+ /*
+ * access the name in this fat-zap so that we'll check
+ * for i/o errors to the leaf blocks, etc.
+ */
+ err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+ 8, 0, NULL);
+ if (err == EIO) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+
/*
- * 3 blocks overwritten per op: target leaf, ptrtbl block, header block
- * 3 new blocks written per op: new split leaf, 2 grown ptrtbl blocks
+ * 3 blocks overwritten: target leaf, ptrtbl block, header block
+ * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
*/
dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz,
- (nops * 6ULL + cops * 3ULL) << dn->dn_datablkshift);
+ (3 + add ? 3 : 0) << dn->dn_datablkshift);
/*
* If the modified blocks are scattered to the four winds,
@@ -410,17 +549,16 @@ dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
*/
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
- tx->tx_space_towrite +=
- ((nops + cops) * 3ULL) << dn->dn_indblkshift;
+ tx->tx_space_towrite += 3 << dn->dn_indblkshift;
}
void
-dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops)
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
{
ASSERT(tx->tx_txg == 0);
dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP,
- dmu_tx_hold_zap_impl, (ops > 0?ops:0), (ops < 0?-ops:0));
+ dmu_tx_hold_zap_impl, add, (uintptr_t)name);
}
void
@@ -492,7 +630,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
return;
/* XXX No checking on the meta dnode for now */
- if (db->db.db_object & DMU_PRIVATE_OBJECT)
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
return;
for (dth = list_head(&tx->tx_holds); dth;
@@ -572,20 +710,19 @@ static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
{
dmu_tx_hold_t *dth;
- uint64_t lsize, asize, fsize;
+ uint64_t lsize, asize, fsize, towrite;
*last_dth = NULL;
- tx->tx_space_towrite = 0;
- tx->tx_space_tofree = 0;
- tx->tx_space_tooverwrite = 0;
tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
return (ERESTART);
+ if (tx->tx_err)
+ return (tx->tx_err);
for (dth = list_head(&tx->tx_holds); dth;
- *last_dth = dth, dth = list_next(&tx->tx_holds, dth)) {
+ dth = list_next(&tx->tx_holds, dth)) {
dnode_t *dn = dth->dth_dnode;
if (dn != NULL) {
mutex_enter(&dn->dn_mtx);
@@ -608,8 +745,21 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
(void) refcount_add(&dn->dn_tx_holds, tx);
mutex_exit(&dn->dn_mtx);
}
- if (dth->dth_func)
- dth->dth_func(tx, dn, dth->dth_arg1, dth->dth_arg2);
+ *last_dth = dth;
+ if (tx->tx_err)
+ return (tx->tx_err);
+ }
+
+ /*
+ * If a snapshot has been taken since we made our estimates,
+ * assume that we won't be able to free or overwrite anything.
+ */
+ if (tx->tx_objset &&
+ dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+ tx->tx_lastsnap_txg) {
+ tx->tx_space_towrite += tx->tx_space_tooverwrite;
+ tx->tx_space_tooverwrite = 0;
+ tx->tx_space_tofree = 0;
}
/*
@@ -619,13 +769,16 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
tx->tx_space_tofree;
lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite;
asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+ towrite = tx->tx_space_towrite;
tx->tx_space_towrite = asize;
if (tx->tx_dir && asize != 0) {
int err = dsl_dir_tempreserve_space(tx->tx_dir,
lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
- if (err)
+ if (err) {
+ tx->tx_space_towrite = towrite;
return (err);
+ }
}
return (0);
@@ -688,8 +841,6 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
ASSERT(tx->tx_txg == 0);
ASSERT(txg_how != 0);
ASSERT(!dsl_pool_sync_context(tx->tx_pool));
- ASSERT3U(tx->tx_space_towrite, ==, 0);
- ASSERT3U(tx->tx_space_tofree, ==, 0);
while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
uint64_t txg = dmu_tx_unassign(tx, last_dth);
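
dmu_tx_create() now records the dataset's most recent snapshot txg in
tx_lastsnap_txg, and dmu_tx_try_assign() re-checks it: if a snapshot was
taken after the hold-time estimates were computed, blocks the tx expected
to free or overwrite may now be referenced by that snapshot, so the safe
move is to recount the overwrite estimate as new writes and drop the free
credit entirely.  The adjustment is only a few assignments; a stand-in
struct makes the bookkeeping explicit (this is not the real dmu_tx_t):

/* tx_snapshot_guard_sketch.c: degrade space estimates after a snapshot */
#include <stdint.h>
#include <stdio.h>

typedef struct tx_space {
	uint64_t lastsnap_txg;	/* snapshot txg when the tx was built */
	uint64_t towrite;	/* new space the tx will consume */
	uint64_t tooverwrite;	/* space we expected to rewrite in place */
	uint64_t tofree;	/* space we expected to give back */
} tx_space_t;

static void
apply_snapshot_guard(tx_space_t *tx, uint64_t cur_snap_txg)
{
	if (cur_snap_txg > tx->lastsnap_txg) {
		/*
		 * A snapshot now references the old blocks, so nothing can
		 * be overwritten in place or freed: charge it all as writes.
		 */
		tx->towrite += tx->tooverwrite;
		tx->tooverwrite = 0;
		tx->tofree = 0;
	}
}

int
main(void)
{
	tx_space_t tx = { 100, 1 << 20, 4 << 20, 2 << 20 };

	apply_snapshot_guard(&tx, 105);	/* a snapshot at txg 105 intervened */
	printf("towrite=%llu tooverwrite=%llu tofree=%llu\n",
	    (unsigned long long)tx.towrite,
	    (unsigned long long)tx.tooverwrite,
	    (unsigned long long)tx.tofree);
	return (0);
}
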
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 03ce2a0398..8adb692ec8 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -155,7 +155,7 @@ dnode_verify(dnode_t *dn)
}
if (dn->dn_phys->dn_type != DMU_OT_NONE)
ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
- ASSERT(IS_DNODE_DNODE(dn->dn_object) || dn->dn_dbuf);
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
if (dn->dn_dbuf != NULL) {
ASSERT3P(dn->dn_phys, ==,
(dnode_phys_t *)dn->dn_dbuf->db.db_data +
@@ -307,6 +307,11 @@ dnode_destroy(dnode_t *dn)
dn->dn_dirtyctx_firstset = NULL;
}
dmu_zfetch_rele(&dn->dn_zfetch);
+ if (dn->dn_bonus) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
kmem_cache_free(dnode_cache, dn);
}
@@ -381,13 +386,10 @@ void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- dmu_buf_impl_t *db = NULL;
-
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
- ASSERT(!(dn->dn_object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
@@ -398,6 +400,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT(dn->dn_dirtyblksz[2] == 0);
ASSERT(dn->dn_dirtyblksz[3] == 0);
+ /* clean up any unreferenced dbufs */
+ dnode_evict_dbufs(dn);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
/*
* XXX I should really have a generation number to tell if we
* need to do this...
@@ -421,17 +427,25 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_type = ot;
if (dn->dn_bonuslen != bonuslen) {
+ dmu_buf_impl_t *db = NULL;
+
/* change bonus size */
if (bonuslen == 0)
bonuslen = 1; /* XXX */
- db = dbuf_hold_bonus(dn, FTAG);
- dbuf_read(db);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dn->dn_bonus = dbuf_create_bonus(dn);
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+ if (refcount_add(&db->db_holds, FTAG) == 1)
+ dnode_add_ref(dn, db);
mutex_enter(&db->db_mtx);
ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
ASSERT(db->db.db_data != NULL);
db->db.db_size = bonuslen;
mutex_exit(&db->db_mtx);
dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
}
/* change bonus size and type */
@@ -445,14 +459,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_allocated_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
-
- if (db)
- dbuf_remove_ref(db, FTAG);
}
void
dnode_special_close(dnode_t *dn)
{
+ /*
+ * Wait for final references to the dnode to clear. This can
+	 * only happen if the arc is asynchronously evicting state that

+ * has a hold on this dnode while we are trying to evict this
+ * dnode.
+ */
+ while (refcount_count(&dn->dn_holds) > 0)
+ delay(1);
dnode_destroy(dn);
}
@@ -498,21 +517,25 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
}
/*
- * Returns held dnode if the object number is valid, NULL if not.
- * Note that this will succeed even for free dnodes.
+ * errors:
+ * EINVAL - invalid object number.
+ * EIO - i/o error.
+ * succeeds even for free dnodes.
*/
-dnode_t *
-dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
+int
+dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+ void *tag, dnode_t **dnp)
{
- int epb, idx;
+ int epb, idx, err;
int drop_struct_lock = FALSE;
+ int type;
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
dnode_t **children_dnodes;
if (object == 0 || object >= DN_MAX_OBJECT)
- return (NULL);
+ return (EINVAL);
mdn = os->os_meta_dnode;
@@ -525,10 +548,16 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
- db = dbuf_hold(mdn, blk);
+ db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
rw_exit(&mdn->dn_struct_rwlock);
- dbuf_read(db);
+ if (db == NULL)
+ return (EIO);
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ dbuf_rele(db, FTAG);
+ return (err);
+ }
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
@@ -559,51 +588,53 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
}
mutex_enter(&dn->dn_mtx);
+ type = dn->dn_type;
if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && dn->dn_type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) && dn->dn_type != DMU_OT_NONE)) {
+ ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
+ ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
mutex_exit(&dn->dn_mtx);
- dbuf_rele(db);
- return (NULL);
+ dbuf_rele(db, FTAG);
+ return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
mutex_exit(&dn->dn_mtx);
- if (refcount_add(&dn->dn_holds, ref) == 1)
+ if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dn);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
ASSERT3U(dn->dn_object, ==, object);
- dbuf_rele(db);
+ dbuf_rele(db, FTAG);
- return (dn);
+ *dnp = dn;
+ return (0);
}
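
The dnode_hold_impl() rework above is the template applied throughout this
commit: interfaces that used to return a pointer or NULL now return an
error number and hand the object back through a tag-tracked out parameter,
so an i/o error (EIO) can be distinguished from "no such object"
(EINVAL/ENOENT/EEXIST) and propagated to the caller instead of being
flattened into NULL.  A miniature before-and-after of that calling
convention (hypothetical names):

/* hold_refactor_sketch.c: pointer-return vs. errno-plus-out-parameter */
#include <errno.h>
#include <stdio.h>

typedef struct obj { int id; } obj_t;

static obj_t table[4] = { { 0 }, { 1 }, { 0 }, { 3 } };	/* id 0 == free slot */
static int io_broken = 0;		/* flip to simulate an i/o error */

/* Old style: the caller can't tell "no such object" from "read failed". */
static obj_t *
obj_hold_old(int id)
{
	if (io_broken || id <= 0 || id >= 4 || table[id].id == 0)
		return (NULL);
	return (&table[id]);
}

/* New style: distinct error codes, object returned through *objp. */
static int
obj_hold(int id, obj_t **objp)
{
	if (id <= 0 || id >= 4)
		return (EINVAL);	/* invalid object number */
	if (io_broken)
		return (EIO);		/* simulated error reading the slot */
	if (table[id].id == 0)
		return (ENOENT);	/* valid number, but nothing there */
	*objp = &table[id];
	return (0);
}

int
main(void)
{
	obj_t *o;
	int err;

	io_broken = 1;
	printf("old style: %s\n", obj_hold_old(1) ? "held" : "NULL (why?)");
	err = obj_hold(1, &o);
	printf("new style: err=%d (%s)\n", err,
	    err == EIO ? "i/o error, caller can retry or report" : "other");
	return (0);
}
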
/*
* Return held dnode if the object is allocated, NULL if not.
*/
-dnode_t *
-dnode_hold(objset_impl_t *os, uint64_t object, void *ref)
+int
+dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ref));
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
void
-dnode_add_ref(dnode_t *dn, void *ref)
+dnode_add_ref(dnode_t *dn, void *tag)
{
ASSERT(refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, ref);
+ (void) refcount_add(&dn->dn_holds, tag);
}
void
-dnode_rele(dnode_t *dn, void *ref)
+dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
- refs = refcount_remove(&dn->dn_holds, ref);
+ refs = refcount_remove(&dn->dn_holds, tag);
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && dn->dn_dbuf)
- dbuf_remove_ref(dn->dn_dbuf, dn);
+ dbuf_rele(dn->dn_dbuf, dn);
}
void
@@ -612,7 +643,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
objset_impl_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
- if (IS_DNODE_DNODE(dn->dn_object))
+ if (dn->dn_object == DMU_META_DNODE_OBJECT)
return;
DNODE_VERIFY(dn);
@@ -658,7 +689,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
* dnode will hang around after we finish processing its
* children.
*/
- (void) refcount_add(&dn->dn_holds, (void *)(uintptr_t)tx->tx_txg);
+ dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
dbuf_dirty(dn->dn_dbuf, tx);
@@ -764,7 +795,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
}
/* obtain the old block */
- db = dbuf_hold(dn, 0);
+ db = dbuf_hold(dn, 0, FTAG);
dbuf_new_size(db, size, tx);
@@ -773,7 +804,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
/* don't need dd_dirty_mtx, dnode is already dirty */
dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
- dbuf_rele(db);
+ dbuf_rele(db, FTAG);
err = 0;
end:
@@ -844,7 +875,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
dprintf("dn %p dirtying left indirects\n", dn);
dbuf_dirty(db, tx);
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
#ifdef ZFS_DEBUG
else if (old_nlevels > 1 && new_nlevels > old_nlevels) {
@@ -855,7 +886,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG);
ASSERT(!
list_link_active(&db->db_dirty_node[txgoff]));
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
}
#endif
@@ -976,7 +1007,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
data = db->db.db_data;
bzero(data + start, head);
}
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
off += head;
len -= head;
@@ -1009,7 +1040,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, tail);
}
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
len -= tail;
}
@@ -1022,7 +1053,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
db = dbuf_hold_level(dn, 1,
(off - head) >> (blkshift + epbs), FTAG);
dbuf_will_dirty(db, tx);
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
/* dirty the right indirects */
@@ -1030,7 +1061,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
db = dbuf_hold_level(dn, 1,
(off + len + tail - 1) >> (blkshift + epbs), FTAG);
dbuf_will_dirty(db, tx);
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
/*
@@ -1189,7 +1220,8 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
return (hole ? 0 : ESRCH);
return (error);
}
- dbuf_read_havestruct(db);
+ (void) dbuf_read(db, NULL,
+ DB_RF_MUST_SUCCEED | DB_RF_HAVESTRUCT);
data = db->db.db_data;
}
@@ -1228,7 +1260,7 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
}
if (db)
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
return (error);
}
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index 597cafb44e..dcfb9ee7d2 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -48,13 +47,15 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
/* this dnode can't be paged out because it's dirty */
db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
for (i = 0; i < dn->dn_phys->dn_nblkptr; i++)
if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
break;
if (i != dn->dn_phys->dn_nblkptr) {
ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]));
- dbuf_read_havestruct(db);
+ (void) dbuf_read(db, NULL,
+ DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED);
arc_release(db->db_buf, db);
/* copy dnode's block pointers to new indirect block */
ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=,
@@ -102,7 +103,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
bzero(dn->dn_phys->dn_blkptr,
sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
static void
@@ -163,7 +164,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
/* db_data_old better be zeroed */
if (child->db_d.db_data_old[txg & TXG_MASK]) {
- buf = (child->db_d.db_data_old[txg & TXG_MASK])->b_data;
+ buf = ((arc_buf_t *)child->db_d.db_data_old
+ [txg & TXG_MASK])->b_data;
for (j = 0; j < child->db.db_size >> 3; j++) {
if (buf[j] != 0) {
panic("freed data not zero: "
@@ -194,7 +196,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
mutex_exit(&child->db_mtx);
- dbuf_remove_ref(child, FTAG);
+ dbuf_rele(child, FTAG);
}
}
#endif
@@ -211,7 +213,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
int txgoff = tx->tx_txg & TXG_MASK;
int all = TRUE;
- dbuf_read(db);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
arc_release(db->db_buf, db);
bp = (blkptr_t *)db->db.db_data;
@@ -254,7 +256,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
} else {
all = FALSE;
}
- dbuf_remove_ref(subdb, FTAG);
+ dbuf_rele(subdb, FTAG);
}
#ifdef ZFS_DEBUG
bp -= (end-start)+1;
@@ -326,7 +328,7 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
ASSERT3P(db->db_blkptr, ==, bp);
free_blocks(dn, bp, 1, tx);
}
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
if (trunc) {
uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
@@ -338,6 +340,48 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
}
}
+/*
+ * Try to kick all the dnode's dbufs out of the cache...
+ */
+void
+dnode_evict_dbufs(dnode_t *dn)
+{
+ dmu_buf_impl_t *db;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ while (db = list_head(&dn->dn_dbufs)) {
+ int progress = 0;
+ for (; db; db = list_next(&dn->dn_dbufs, db)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING &&
+ refcount_is_zero(&db->db_holds))
+ break;
+ else if (db->db_state == DB_EVICTING)
+ progress = 1;
+ mutex_exit(&db->db_mtx);
+ }
+ if (db) {
+ ASSERT(!arc_released(db->db_buf));
+ dbuf_clear(db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ progress = 1;
+ } else {
+ if (progress == 0)
+ break;
+ mutex_exit(&dn->dn_dbufs_mtx);
+ }
+ mutex_enter(&dn->dn_dbufs_mtx);
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
static int
dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
{
@@ -352,32 +396,35 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
/* XXX - use dbuf_undirty()? */
list_remove(&dn->dn_dirty_dbufs[txgoff], db);
if (db->db_level == 0) {
- ASSERT3P(db->db_d.db_data_old[txgoff], ==, db->db_buf);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ db->db_d.db_data_old[txgoff] == db->db_buf);
if (db->db_d.db_overridden_by[txgoff])
dbuf_unoverride(db, tx->tx_txg);
db->db_d.db_data_old[txgoff] = NULL;
}
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ dbuf_rele(db, (void *)(uintptr_t)tx->tx_txg);
}
- ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+ dnode_evict_dbufs(dn);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ /*
+ * XXX - It would be nice to assert this, but we may still
+ * have residual holds from async evictions from the arc...
+ *
+ * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+ */
/* Undirty next bits */
dn->dn_next_nlevels[txgoff] = 0;
dn->dn_next_indblkshift[txgoff] = 0;
/* free up all the blocks in the file. */
- dbuf_free_range(dn, 0, -1, tx);
dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
ASSERT3U(dn->dn_phys->dn_secphys, ==, 0);
- /*
- * All dbufs should be gone, since all holds are gone...
- */
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
/* ASSERT(blkptrs are zero); */
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
ASSERT(dn->dn_type != DMU_OT_NONE);
@@ -394,7 +441,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_allocated_txg = 0;
mutex_exit(&dn->dn_mtx);
- ASSERT(!IS_DNODE_DNODE(dn->dn_object));
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
/*
@@ -420,7 +467,7 @@ dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
/* ASSERT(dn->dn_objset->dd_snapshot == NULL); */
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(IS_DNODE_DNODE(dn->dn_object) ||
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
dn->dn_dirtyblksz[txgoff] > 0);
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
@@ -533,7 +580,7 @@ dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
dn->dn_dirtyblksz[txgoff] = 0;
- if (!IS_DNODE_DNODE(dn->dn_object)) {
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
dbuf_will_dirty(dn->dn_dbuf, tx);
dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
}
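
The new dnode_evict_dbufs() added above scans the dnode's dbuf list for buffers with no remaining holds, evicts one, and restarts the scan, stopping only when a full pass finds nothing evictable and nothing already in the DB_EVICTING state. Below is a much-simplified, single-threaded sketch of that restart-until-no-progress pattern; node_t and the evict() helper are hypothetical, and the per-buffer locking and DB_EVICTING bookkeeping of the real code are omitted.

/*
 * Illustrative sketch only -- not ZFS code.  Walk the list for an entry
 * with no holds, evict it (which may unlink neighbours), restart the scan,
 * and stop once a full pass makes no progress.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct node {
	struct node	*n_next;
	int		n_holds;	/* outstanding holds; 0 == evictable */
} node_t;

static node_t *
evict(node_t *head, node_t *victim)
{
	node_t **pp;

	for (pp = &head; *pp != NULL; pp = &(*pp)->n_next) {
		if (*pp == victim) {
			*pp = victim->n_next;
			free(victim);
			break;
		}
	}
	return (head);
}

static node_t *
evict_all_idle(node_t *head)
{
	for (;;) {
		node_t *n;

		for (n = head; n != NULL; n = n->n_next)
			if (n->n_holds == 0)
				break;
		if (n == NULL)
			break;			/* full pass, no progress */
		head = evict(head, n);		/* then restart the scan */
	}
	return (head);
}

int
main(void)
{
	node_t *head = NULL;
	node_t *n;
	int holds[] = { 0, 2, 0, 0, 1 };
	int i, remaining = 0;

	for (i = 0; i < 5; i++) {
		n = calloc(1, sizeof (*n));
		n->n_holds = holds[i];
		n->n_next = head;
		head = n;
	}
	head = evict_all_idle(head);
	for (n = head; n != NULL; n = n->n_next)
		remaining++;
	printf("nodes still held: %d\n", remaining);	/* prints 2 */
	return (0);
}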
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index e77b772922..7db7745270 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -146,7 +145,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-used, -compressed, -uncompressed, tx);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
- bplist_enqueue(&ds->ds_deadlist, bp, tx);
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
if (ds->ds_phys->ds_prev_snap_obj != 0) {
ASSERT3U(ds->ds_prev->ds_object, ==,
@@ -175,14 +174,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
mutex_exit(&ds->ds_lock);
}
-int
-dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx)
+uint64_t
+dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
- uint64_t prev_snap_txg;
+ uint64_t txg;
dsl_dir_t *dd;
- /* ASSERT that it is not a snapshot */
+
if (ds == NULL)
- return (TRUE);
+ return (0);
/*
* The snapshot creation could fail, but that would cause an
* incorrect FALSE return, which would only result in an
@@ -195,13 +194,19 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx)
*/
dd = ds->ds_dir;
mutex_enter(&dd->dd_lock);
- if (dd->dd_sync_func == dsl_dataset_snapshot_sync &&
- dd->dd_sync_txg < tx->tx_txg)
- prev_snap_txg = dd->dd_sync_txg;
+ if (dd->dd_sync_func == dsl_dataset_snapshot_sync)
+ txg = dd->dd_sync_txg;
else
- prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+ txg = ds->ds_phys->ds_prev_snap_txg;
mutex_exit(&dd->dd_lock);
- return (blk_birth > prev_snap_txg);
+
+ return (txg);
+}
+
+int
+dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+{
+ return (blk_birth > dsl_dataset_prev_snap_txg(ds));
}
/* ARGSUSED */
@@ -236,7 +241,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
kmem_free(ds, sizeof (dsl_dataset_t));
}
-static void
+static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
dsl_dataset_phys_t *headphys;
@@ -246,34 +251,37 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds)
objset_t *mos = dp->dp_meta_objset;
if (ds->ds_snapname[0])
- return;
+ return (0);
if (ds->ds_phys->ds_next_snap_obj == 0)
- return;
+ return (0);
- headdbuf = dmu_bonus_hold_tag(mos,
- ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG);
- dmu_buf_read(headdbuf);
+ err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err)
+ return (err);
headphys = headdbuf->db_data;
err = zap_value_search(dp->dp_meta_objset,
headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
- ASSERT(err == 0);
- dmu_buf_rele_tag(headdbuf, FTAG);
+ dmu_buf_rele(headdbuf, FTAG);
+ return (err);
}
-dsl_dataset_t *
+int
dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
- int mode, void *tag)
+ int mode, void *tag, dsl_dataset_t **dsp)
{
uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
objset_t *mos = dp->dp_meta_objset;
dmu_buf_t *dbuf;
dsl_dataset_t *ds;
+ int err;
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
dsl_pool_sync_context(dp));
- dbuf = dmu_bonus_hold_tag(mos, dsobj, tag);
- dmu_buf_read(dbuf);
+ err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+ if (err)
+ return (err);
ds = dmu_buf_get_user(dbuf);
if (ds == NULL) {
dsl_dataset_t *winner;
@@ -282,47 +290,60 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
ds->ds_dbuf = dbuf;
ds->ds_object = dsobj;
ds->ds_phys = dbuf->db_data;
- ds->ds_dir = dsl_dir_open_obj(dp,
- ds->ds_phys->ds_dir_obj, NULL, ds);
- bplist_open(&ds->ds_deadlist,
+ err = bplist_open(&ds->ds_deadlist,
mos, ds->ds_phys->ds_deadlist_obj);
+ if (err == 0) {
+ err = dsl_dir_open_obj(dp,
+ ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+ }
+ if (err) {
+ /*
+ * we don't really need to close the blist if we
+ * just opened it.
+ */
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
ds->ds_snapname[0] = '\0';
if (ds->ds_phys->ds_prev_snap_obj) {
- ds->ds_prev =
- dsl_dataset_open_obj(dp,
+ err = dsl_dataset_open_obj(dp,
ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds);
+ DS_MODE_NONE, ds, &ds->ds_prev);
}
} else {
if (snapname) {
#ifdef ZFS_DEBUG
dsl_dataset_phys_t *headphys;
- int err;
- dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos,
- ds->ds_dir->dd_phys->
- dd_head_dataset_obj, FTAG);
- dmu_buf_read(headdbuf);
- headphys = headdbuf->db_data;
- uint64_t foundobj;
- err = zap_lookup(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj,
- snapname, sizeof (foundobj), 1, &foundobj);
- ASSERT3U(err, ==, 0);
- ASSERT3U(foundobj, ==, dsobj);
- dmu_buf_rele_tag(headdbuf, FTAG);
+ dmu_buf_t *headdbuf;
+ err = dmu_bonus_hold(mos,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err == 0) {
+ headphys = headdbuf->db_data;
+ uint64_t foundobj;
+ err = zap_lookup(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj,
+ snapname, sizeof (foundobj), 1,
+ &foundobj);
+ ASSERT3U(foundobj, ==, dsobj);
+ dmu_buf_rele(headdbuf, FTAG);
+ }
#endif
(void) strcat(ds->ds_snapname, snapname);
} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- dsl_dataset_get_snapname(ds);
+ err = dsl_dataset_get_snapname(ds);
}
}
- winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
- dsl_dataset_evict);
- if (winner) {
+ if (err == 0) {
+ winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
+ dsl_dataset_evict);
+ }
+ if (err || winner) {
bplist_close(&ds->ds_deadlist);
if (ds->ds_prev) {
dsl_dataset_close(ds->ds_prev,
@@ -330,6 +351,10 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
}
dsl_dir_close(ds->ds_dir, ds);
kmem_free(ds, sizeof (dsl_dataset_t));
+ if (err) {
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
ds = winner;
} else {
uint64_t new =
@@ -349,12 +374,13 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
(ds->ds_open_refcount + weight > DOS_REF_MAX)) {
mutex_exit(&ds->ds_lock);
dsl_dataset_close(ds, DS_MODE_NONE, tag);
- return (NULL);
+ return (EBUSY);
}
ds->ds_open_refcount += weight;
mutex_exit(&ds->ds_lock);
- return (ds);
+ *dsp = ds;
+ return (0);
}
int
@@ -368,9 +394,9 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
dsl_dataset_t *ds = NULL;
int err = 0;
- dd = dsl_dir_open_spa(spa, name, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
+ if (err)
+ return (err);
dp = dd->dd_pool;
obj = dd->dd_phys->dd_head_dataset_obj;
@@ -384,7 +410,10 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
if (tail != NULL) {
objset_t *mos = dp->dp_meta_objset;
- ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag);
+ err = dsl_dataset_open_obj(dp, obj, NULL,
+ DS_MODE_NONE, tag, &ds);
+ if (err)
+ goto out;
obj = ds->ds_phys->ds_snapnames_zapobj;
dsl_dataset_close(ds, DS_MODE_NONE, tag);
ds = NULL;
@@ -405,9 +434,7 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
if (err)
goto out;
}
- ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag);
- if (ds == NULL)
- err = EBUSY;
+ err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
out:
rw_exit(&dp->dp_config_rwlock);
@@ -433,7 +460,7 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name)
(void) strcpy(name, "mos");
} else {
dsl_dir_name(ds->ds_dir, name);
- dsl_dataset_get_snapname(ds);
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
if (ds->ds_snapname[0]) {
(void) strcat(name, "@");
if (!MUTEX_HELD(&ds->ds_lock)) {
@@ -462,7 +489,7 @@ dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
mode, ds->ds_open_refcount);
mutex_exit(&ds->ds_lock);
- dmu_buf_rele_tag(ds->ds_dbuf, tag);
+ dmu_buf_rele(ds->ds_dbuf, tag);
}
void
@@ -476,16 +503,16 @@ dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
dsl_dir_t *dd;
dsl_dir_create_root(mos, ddobjp, tx);
- dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG);
- ASSERT(dd != NULL);
+ VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- dbuf = dmu_bonus_hold(mos, dsobj);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
dsphys->ds_dir_obj = dd->dd_object;
dsphys->ds_fsid_guid = unique_create();
+ unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
sizeof (dsphys->ds_guid));
dsphys->ds_snapnames_zapobj =
@@ -494,13 +521,14 @@ dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
dsphys->ds_creation_txg = tx->tx_txg;
dsphys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_head_dataset_obj = dsobj;
dsl_dir_close(dd, FTAG);
- ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 ==
+ dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
(void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
}
@@ -537,14 +565,13 @@ dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
err = dsl_dir_create_sync(pds, lastname, tx);
if (err)
return (err);
- dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL);
- ASSERT(dd != NULL);
+ VERIFY(0 == dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, &dd, NULL));
/* This is the point of no (unsuccessful) return */
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- dbuf = dmu_bonus_hold(mos, dsobj);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
dsphys->ds_dir_obj = dd->dd_object;
@@ -576,7 +603,7 @@ dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
}
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_head_dataset_obj = dsobj;
@@ -594,9 +621,9 @@ dsl_dataset_destroy(const char *name)
dsl_dir_t *dd;
const char *tail;
- dd = dsl_dir_open(name, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(name, FTAG, &dd, &tail);
+ if (err)
+ return (err);
dp = dd->dd_pool;
if (tail != NULL) {
@@ -631,10 +658,12 @@ dsl_dataset_destroy(const char *name)
* dsl_dataset_destroy_sync() to destroy the head dataset.
*/
rw_enter(&dp->dp_config_rwlock, RW_READER);
- pds = dsl_dir_open_obj(dd->dd_pool,
- dd->dd_phys->dd_parent_obj, NULL, FTAG);
+ err = dsl_dir_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_parent_obj, NULL, FTAG, &pds);
dsl_dir_close(dd, FTAG);
rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ return (err);
(void) strcpy(buf, name);
cp = strrchr(buf, '/') + 1;
@@ -657,9 +686,9 @@ dsl_dataset_rollback(const char *name)
dsl_dir_t *dd;
const char *tail;
- dd = dsl_dir_open(name, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(name, FTAG, &dd, &tail);
+ if (err)
+ return (err);
if (tail != NULL) {
dsl_dir_close(dd, FTAG);
@@ -777,11 +806,14 @@ dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
dsl_dataset_t *ds;
+ int err;
if (dd->dd_phys->dd_head_dataset_obj == 0)
return (EINVAL);
- ds = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+ err = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &ds);
+ if (err)
+ return (err);
if (ds->ds_phys->ds_prev_snap_txg == 0) {
/*
@@ -823,7 +855,8 @@ dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
ds->ds_phys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
{
@@ -891,27 +924,23 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
drop_lock = TRUE;
}
- ds = dsl_dataset_open_obj(dd->dd_pool,
+ err = dsl_dataset_open_obj(dd->dd_pool,
dd->dd_phys->dd_head_dataset_obj, NULL,
- snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG);
+ snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG, &ds);
- if (snapname) {
+ if (err == 0 && snapname) {
err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
snapname, 8, 1, &obj);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- if (err) {
- if (drop_lock)
- rw_exit(&dp->dp_config_rwlock);
- return (err);
+ if (err == 0) {
+ err = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
+ DS_MODE_EXCLUSIVE, FTAG, &ds);
}
-
- ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
- DS_MODE_EXCLUSIVE, FTAG);
}
- if (ds == NULL) {
+ if (err) {
if (drop_lock)
rw_exit(&dp->dp_config_rwlock);
- return (EBUSY);
+ return (err);
}
obj = ds->ds_object;
@@ -942,22 +971,25 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
* them. Try again.
*/
if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
- mutex_exit(&ds->ds_lock);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
if (drop_lock)
rw_exit(&dp->dp_config_rwlock);
return (EAGAIN);
}
- /* THE POINT OF NO (unsuccessful) RETURN */
-
if (ds->ds_phys->ds_prev_snap_obj != 0) {
if (ds->ds_prev) {
ds_prev = ds->ds_prev;
} else {
- ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+ err = dsl_dataset_open_obj(dd->dd_pool,
ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, FTAG);
+ DS_MODE_NONE, FTAG, &ds_prev);
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (err);
+ }
}
after_branch_point =
(ds_prev->ds_phys->ds_next_snap_obj != obj);
@@ -974,6 +1006,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
}
}
+ /* THE POINT OF NO (unsuccessful) RETURN */
+
ASSERT3P(tx->tx_pool, ==, dd->dd_pool);
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -983,8 +1017,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
spa_scrub_restart(dp->dp_spa, tx->tx_txg);
- ds_next = dsl_dataset_open_obj(dd->dd_pool,
- ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_next));
ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
@@ -1006,7 +1041,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
while (bplist_iterate(&ds_next->ds_deadlist, &itor,
&bp) == 0) {
if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
- bplist_enqueue(&ds->ds_deadlist, &bp, tx);
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
+ &bp, tx));
if (ds_prev && !after_branch_point &&
bp.blk_birth >
ds_prev->ds_phys->ds_prev_snap_txg) {
@@ -1030,8 +1066,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
/* set next's deadlist to our deadlist */
ds_next->ds_phys->ds_deadlist_obj =
ds->ds_phys->ds_deadlist_obj;
- bplist_open(&ds_next->ds_deadlist, mos,
- ds_next->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj));
ds->ds_phys->ds_deadlist_obj = 0;
if (ds_next->ds_phys->ds_next_snap_obj != 0) {
@@ -1049,9 +1085,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
*/
dsl_dataset_t *ds_after_next;
- ds_after_next = dsl_dataset_open_obj(dd->dd_pool,
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
ds_next->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG);
+ DS_MODE_NONE, FTAG, &ds_after_next));
itor = 0;
while (bplist_iterate(&ds_after_next->ds_deadlist,
&itor, &bp) == 0) {
@@ -1078,9 +1114,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
ds_next);
if (ds_prev) {
- ds_next->ds_prev = dsl_dataset_open_obj(
- dd->dd_pool, ds->ds_phys->ds_prev_snap_obj,
- NULL, DS_MODE_NONE, ds_next);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, ds_next, &ds_next->ds_prev));
} else {
ds_next->ds_prev = NULL;
}
@@ -1144,8 +1180,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
} else {
/* remove from snapshot namespace */
dsl_dataset_t *ds_head;
- ds_head = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_head));
#ifdef ZFS_DEBUG
{
uint64_t val;
@@ -1195,8 +1232,10 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
if (dd->dd_phys->dd_head_dataset_obj == 0)
return (EINVAL);
- ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
- DS_MODE_NONE, FTAG);
+ err = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds);
+ if (err)
+ return (err);
err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
snapname, 8, 1, &value);
@@ -1217,7 +1256,7 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- dbuf = dmu_bonus_hold(mos, dsobj);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
dsphys->ds_dir_obj = dd->dd_object;
@@ -1237,13 +1276,14 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
dsphys->ds_restoring = ds->ds_phys->ds_restoring;
dsphys->ds_bp = ds->ds_phys->ds_bp;
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
if (ds->ds_phys->ds_prev_snap_obj != 0) {
dsl_dataset_t *ds_prev;
- ds_prev = dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_prev));
ASSERT(ds_prev->ds_phys->ds_next_snap_obj ==
ds->ds_object ||
ds_prev->ds_phys->ds_num_children > 1);
@@ -1266,7 +1306,8 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
ds->ds_phys->ds_unique_bytes = 0;
ds->ds_phys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
@@ -1275,8 +1316,9 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
if (ds->ds_prev)
dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
- ds->ds_prev = dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds);
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, snapname,
+ DS_MODE_NONE, ds, &ds->ds_prev));
rw_exit(&dp->dp_config_rwlock);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
@@ -1295,7 +1337,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
dsl_dir_dirty(ds->ds_dir, tx);
bplist_close(&ds->ds_deadlist);
- dmu_buf_remove_ref(ds->ds_dbuf, ds);
+ dmu_buf_rele(ds->ds_dbuf, ds);
}
void
@@ -1319,7 +1361,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
dds->dds_creation_txg = ds->ds_phys->ds_creation_txg;
dds->dds_space_refd = ds->ds_phys->ds_used_bytes;
dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid;
- dds->dds_guid = ds->ds_phys->ds_guid;
if (ds->ds_phys->ds_next_snap_obj) {
/*
@@ -1332,8 +1373,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
dds->dds_uncompressed_bytes =
ds->ds_phys->ds_uncompressed_bytes;
}
-
- dds->dds_objset_obj = ds->ds_object;
}
dsl_pool_t *
@@ -1375,10 +1414,11 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
}
/* new fs better exist */
- nds = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail);
- if (nds == NULL) {
+ err = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname,
+ FTAG, &nds, &tail);
+ if (err) {
dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
- return (ENOENT);
+ return (err);
}
dsl_dir_close(nds, FTAG);
@@ -1397,8 +1437,12 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
tail++;
- fsds = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+ err = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &fsds);
+ if (err) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (err);
+ }
/* new name better not be in use */
err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj,
@@ -1414,7 +1458,7 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
/* The point of no (unsuccessful) return */
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);
- dsl_dataset_get_snapname(snds);
+ VERIFY(0 == dsl_dataset_get_snapname(snds));
err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj,
snds->ds_snapname, tx);
ASSERT3U(err, ==, 0);
@@ -1440,9 +1484,9 @@ dsl_dataset_rename(const char *osname, const char *newname)
struct osrenamearg ora;
int err;
- dd = dsl_dir_open(osname, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(osname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
if (tail == NULL) {
err = dsl_dir_sync_task(dd,
dsl_dir_rename_sync, (void*)newname, 1<<12);
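
Most of the dsl_dataset.c changes above follow one calling-convention shift: routines such as dsl_dataset_open_obj() and dsl_dataset_get_snapname() no longer return a pointer (or silently assert success) but return an errno-style int and hand the object back through an out parameter, so callers can propagate I/O errors instead of guessing from a NULL. The sketch below is not ZFS code; thing_t and thing_open() are hypothetical, and it only contrasts the two conventions.

/*
 * Illustrative sketch only -- not ZFS code.  It mirrors the conversion the
 * dsl_dataset.c hunks above make: return 0 or an errno, and hand the
 * opened object back through an out parameter.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct thing {
	char	t_name[32];
} thing_t;

/* old style: NULL on any failure, the caller can only guess why */
static thing_t *
thing_open_old(const char *name)
{
	if (name == NULL || name[0] == '\0')
		return (NULL);
	thing_t *t = malloc(sizeof (*t));
	if (t == NULL)
		return (NULL);
	(void) strncpy(t->t_name, name, sizeof (t->t_name) - 1);
	t->t_name[sizeof (t->t_name) - 1] = '\0';
	return (t);
}

/* new style: 0 or an errno, the object comes back through *tp */
static int
thing_open(const char *name, thing_t **tp)
{
	if (name == NULL || name[0] == '\0')
		return (ENOENT);
	thing_t *t = malloc(sizeof (*t));
	if (t == NULL)
		return (ENOMEM);
	(void) strncpy(t->t_name, name, sizeof (t->t_name) - 1);
	t->t_name[sizeof (t->t_name) - 1] = '\0';
	*tp = t;
	return (0);
}

int
main(void)
{
	thing_t *t;
	int err = thing_open("tank/home", &t);	/* like dsl_dataset_open_obj() */

	if (err != 0) {
		fprintf(stderr, "open failed: %s\n", strerror(err));
		return (1);
	}
	printf("opened %s\n", t->t_name);
	free(t);

	/* old style cannot distinguish ENOENT from ENOMEM */
	if (thing_open_old("") == NULL)
		printf("old style: failed, reason unknown\n");
	return (0);
}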
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 4ea1d62de5..8ffa145477 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -76,18 +75,20 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
kmem_free(dd, sizeof (dsl_dir_t));
}
-dsl_dir_t *
+int
dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag)
+ const char *tail, void *tag, dsl_dir_t **ddp)
{
dmu_buf_t *dbuf;
dsl_dir_t *dd;
+ int err;
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
dsl_pool_sync_context(dp));
- dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag);
- dmu_buf_read(dbuf);
+ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
+ if (err)
+ return (err);
dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
{
@@ -112,8 +113,13 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
offsetof(dsl_prop_cb_record_t, cbr_node));
if (dd->dd_phys->dd_parent_obj) {
- dd->dd_parent = dsl_dir_open_obj(dp,
- dd->dd_phys->dd_parent_obj, NULL, dd);
+ err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
+ NULL, dd, &dd->dd_parent);
+ if (err) {
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
if (tail) {
#ifdef ZFS_DEBUG
uint64_t foundobj;
@@ -122,8 +128,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_parent->dd_phys->
dd_child_dir_zapobj,
tail, sizeof (foundobj), 1, &foundobj);
- ASSERT3U(err, ==, 0);
- ASSERT3U(foundobj, ==, ddobj);
+ ASSERT(err || foundobj == ddobj);
#endif
(void) strcpy(dd->dd_myname, tail);
} else {
@@ -131,11 +136,12 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_parent->dd_phys->
dd_child_dir_zapobj,
ddobj, dd->dd_myname);
- /*
- * The caller should be protecting this ddobj
- * from being deleted concurrently
- */
- ASSERT(err == 0);
+ }
+ if (err) {
+ dsl_dir_close(dd->dd_parent, dd);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
}
} else {
(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
@@ -166,7 +172,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
ASSERT3P(dd->dd_pool, ==, dp);
ASSERT3U(dd->dd_object, ==, ddobj);
ASSERT3P(dd->dd_dbuf, ==, dbuf);
- return (dd);
+ *ddp = dd;
+ return (0);
}
void
@@ -174,7 +181,7 @@ dsl_dir_close(dsl_dir_t *dd, void *tag)
{
dprintf_dd(dd, "%s\n", "");
spa_close(dd->dd_pool->dp_spa, tag);
- dmu_buf_rele_tag(dd->dd_dbuf, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
}
/* buf must be long enough (MAXNAMELEN should do) */
@@ -266,8 +273,9 @@ getcomponent(const char *path, char *component, const char **nextp)
* same as dsl_dir_open, ignore the first component of name and use the
* spa instead
*/
-dsl_dir_t *
-dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
+int
+dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+ dsl_dir_t **ddp, const char **tailp)
{
char buf[MAXNAMELEN];
const char *next, *nextnext = NULL;
@@ -280,15 +288,15 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
dprintf("%s\n", name);
if (name == NULL)
- return (NULL);
+ return (ENOENT);
err = getcomponent(name, buf, &next);
if (err)
- return (NULL);
+ return (err);
if (spa == NULL) {
err = spa_open(buf, &spa, FTAG);
if (err) {
dprintf("spa_open(%s) failed\n", buf);
- return (NULL);
+ return (err);
}
openedspa = TRUE;
@@ -299,17 +307,19 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
dp = spa_get_dsl(spa);
rw_enter(&dp->dp_config_rwlock, RW_READER);
- dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag);
+ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+ if (err) {
+ rw_exit(&dp->dp_config_rwlock);
+ if (openedspa)
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
while (next != NULL) {
dsl_dir_t *child_ds;
err = getcomponent(next, buf, &nextnext);
- if (err) {
- dsl_dir_close(dd, tag);
- rw_exit(&dp->dp_config_rwlock);
- if (openedspa)
- spa_close(spa, FTAG);
- return (NULL);
- }
+ if (err)
+ break;
ASSERT(next[0] != '\0');
if (next[0] == '@')
break;
@@ -321,18 +331,28 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
err = zap_lookup(dp->dp_meta_objset,
dd->dd_phys->dd_child_dir_zapobj,
buf, sizeof (ddobj), 1, &ddobj);
- if (err == ENOENT) {
+ if (err) {
+ if (err == ENOENT)
+ err = 0;
break;
}
- ASSERT(err == 0);
- child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag);
+ err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
+ if (err)
+ break;
dsl_dir_close(dd, tag);
dd = child_ds;
next = nextnext;
}
rw_exit(&dp->dp_config_rwlock);
+ if (err) {
+ dsl_dir_close(dd, tag);
+ if (openedspa)
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
/*
* It's an error if there's more than one component left, or
* tailp==NULL and there's any component left.
@@ -342,14 +362,14 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
/* bad path name */
dsl_dir_close(dd, tag);
dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
- next = NULL;
- dd = NULL;
+ err = ENOENT;
}
if (tailp)
*tailp = next;
if (openedspa)
spa_close(spa, FTAG);
- return (dd);
+ *ddp = dd;
+ return (err);
}
/*
@@ -358,10 +378,10 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
* tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@'
* means that the last component is a snapshot.
*/
-dsl_dir_t *
-dsl_dir_open(const char *name, void *tag, const char **tailp)
+int
+dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
{
- return (dsl_dir_open_spa(NULL, name, tag, tailp));
+ return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
}
int
@@ -397,7 +417,7 @@ dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n",
name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err);
- dbuf = dmu_bonus_hold(mos, ddobj);
+ VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
@@ -407,7 +427,7 @@ dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
dsphys->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
rw_exit(&pds->dd_pool->dp_config_rwlock);
@@ -431,7 +451,9 @@ dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx)
if (err)
goto out;
- dd = dsl_dir_open_obj(dp, obj, name, FTAG);
+ err = dsl_dir_open_obj(dp, obj, name, FTAG, &dd);
+ if (err)
+ goto out;
ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object);
if (dmu_buf_refcount(dd->dd_dbuf) > 1) {
@@ -512,7 +534,7 @@ dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
sizeof (uint64_t), 1, ddobjp, tx);
ASSERT3U(error, ==, 0);
- dbuf = dmu_bonus_hold(mos, *ddobjp);
+ VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsp = dbuf->db_data;
@@ -522,7 +544,7 @@ dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
dsp->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
}
void
@@ -530,7 +552,6 @@ dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds)
{
bzero(dds, sizeof (dmu_objset_stats_t));
- dds->dds_dir_obj = dd->dd_object;
dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE);
mutex_enter(&dd->dd_lock);
@@ -543,22 +564,17 @@ dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds)
dds->dds_creation_time = dd->dd_phys->dd_creation_time;
- dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0);
-
if (dd->dd_phys->dd_clone_parent_obj) {
dsl_dataset_t *ds;
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- ds = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_NONE, FTAG, &ds));
dsl_dataset_name(ds, dds->dds_clone_of);
- dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj;
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
rw_exit(&dd->dd_pool->dp_config_rwlock);
}
-
- spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot,
- sizeof (dds->dds_altroot));
}
int
@@ -668,7 +684,7 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
mutex_exit(&dd->dd_lock);
/* release the hold from dsl_dir_dirty */
- dmu_buf_remove_ref(dd->dd_dbuf, dd);
+ dmu_buf_rele(dd->dd_dbuf, dd);
}
static uint64_t
@@ -679,7 +695,7 @@ dsl_dir_estimated_space(dsl_dir_t *dd)
ASSERT(MUTEX_HELD(&dd->dd_lock));
- space = dd->dd_used_bytes;
+ space = dd->dd_phys->dd_used_bytes;
ASSERT(space >= 0);
for (i = 0; i < TXG_SIZE; i++) {
space += dd->dd_space_towrite[i&TXG_MASK];
@@ -788,6 +804,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
struct tempreserve *tr;
ASSERT3U(txg, !=, 0);
+ ASSERT3S(asize, >=, 0);
mutex_enter(&dd->dd_lock);
/*
@@ -827,10 +844,14 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
/*
* If they are requesting more space, and our current estimate
 * is over quota, they get to try again unless the actual
- * on-disk is over quota.
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
*/
if (asize > 0 && est_used > quota) {
- if (dd->dd_used_bytes < quota)
+ if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
+ dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
+ dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
+ dd->dd_used_bytes < quota)
edquot = ERESTART;
dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
"quota=%lluK tr=%lluK err=%d\n",
@@ -876,6 +897,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
list_create(tr_list, sizeof (struct tempreserve),
offsetof(struct tempreserve, tr_node));
+ ASSERT3S(asize, >=, 0);
+ ASSERT3S(fsize, >=, 0);
err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
tr_list, tx);
@@ -975,8 +998,6 @@ dsl_dir_diduse_space(dsl_dir_t *dd,
ASSERT(uncompressed >= 0 ||
dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
dd->dd_used_bytes += used;
- if (used > 0)
- dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used;
dd->dd_phys->dd_uncompressed_bytes += uncompressed;
dd->dd_phys->dd_compressed_bytes += compressed;
mutex_exit(&dd->dd_lock);
@@ -1013,9 +1034,9 @@ dsl_dir_set_quota(const char *ddname, uint64_t quota)
dsl_dir_t *dd;
int err;
- dd = dsl_dir_open(ddname, FTAG, NULL);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
/*
* If someone removes a file, then tries to set the quota, we
* want to make sure the file freeing takes effect.
@@ -1073,9 +1094,9 @@ dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
dsl_dir_t *dd;
int err;
- dd = dsl_dir_open(ddname, FTAG, NULL);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
err = dsl_dir_sync_task(dd,
dsl_dir_set_reservation_sync, &reservation, 0);
dsl_dir_close(dd, FTAG);
@@ -1128,11 +1149,10 @@ dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
return (ENXIO);
}
- newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail);
-
/* new parent should exist */
- if (newpds == NULL)
- return (ENOENT);
+ err = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &newpds, &tail);
+ if (err)
+ return (err);
/* new name should not already exist */
if (tail == NULL) {
@@ -1195,8 +1215,8 @@ dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
(void) strcpy(dd->dd_myname, tail);
dsl_dir_close(dd->dd_parent, dd);
dd->dd_phys->dd_parent_obj = newpds->dd_object;
- dd->dd_parent = dsl_dir_open_obj(dd->dd_pool,
- newpds->dd_object, NULL, dd);
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
+ newpds->dd_object, NULL, dd, &dd->dd_parent));
/* add to new parent zapobj */
err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj,
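
The dsl_dir_tempreserve_impl() hunk above changes when an over-quota reservation is retried: the caller now gets ERESTART as long as the last few txgs still have pending writes (which may free space once they sync) or the on-disk usage is still under quota. The sketch below restates that decision in isolation; it is not ZFS code, quota_check() is hypothetical, and the EDQUOT fallback is an assumption since the non-retryable branch lies outside the hunk shown.

/*
 * Illustrative sketch only -- not ZFS code.  Retry (ERESTART) while recent
 * txgs still have pending writes or on-disk usage is under quota; otherwise
 * fail hard (EDQUOT is assumed here, not shown in the hunk above).
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)
#ifndef ERESTART
#define	ERESTART	85	/* arbitrary fallback for this sketch */
#endif

static int
quota_check(uint64_t asize, uint64_t est_used, uint64_t used_on_disk,
    uint64_t quota, const uint64_t space_towrite[TXG_SIZE], uint64_t txg)
{
	if (asize == 0 || est_used <= quota)
		return (0);			/* fits, or freeing space */

	if (space_towrite[txg & TXG_MASK] != 0 ||
	    space_towrite[(txg - 1) & TXG_MASK] != 0 ||
	    space_towrite[(txg - 2) & TXG_MASK] != 0 ||
	    used_on_disk < quota)
		return (ERESTART);		/* pending changes may help */

	return (EDQUOT);			/* assumed hard failure */
}

int
main(void)
{
	uint64_t tw[TXG_SIZE] = { 0, 0, 4096, 0 };

	printf("%d\n", quota_check(8192, 1100, 900, 1000, tw, 7));  /* ERESTART */
	tw[2] = 0;
	printf("%d\n", quota_check(8192, 1100, 1050, 1000, tw, 7)); /* EDQUOT */
	return (0);
}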
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 5b71ccfaa9..b8e54be6f6 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,8 +38,8 @@
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
-static dsl_dir_t *
-dsl_pool_open_mos_dir(dsl_pool_t *dp)
+static int
+dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
{
uint64_t obj;
int err;
@@ -48,9 +47,10 @@ dsl_pool_open_mos_dir(dsl_pool_t *dp)
err = zap_lookup(dp->dp_meta_objset,
dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
MOS_DIR_NAME, sizeof (obj), 1, &obj);
- ASSERT3U(err, ==, 0);
+ if (err)
+ return (err);
- return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp));
+ return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
}
static dsl_pool_t *
@@ -74,38 +74,56 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
return (dp);
}
-dsl_pool_t *
-dsl_pool_open(spa_t *spa, uint64_t txg)
+int
+dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
-
- dp->dp_meta_objset =
- &dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp)->os;
+ objset_impl_t *osi;
rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+ if (err)
+ goto out;
+ dp->dp_meta_objset = &osi->os;
+
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
&dp->dp_root_dir_obj);
- ASSERT3U(err, ==, 0);
+ if (err)
+ goto out;
+
+ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir);
+ if (err)
+ goto out;
- dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
- NULL, dp);
- dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+ err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
+ if (err)
+ goto out;
+
+out:
rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ dsl_pool_close(dp);
+ else
+ *dpp = dp;
- return (dp);
+ return (err);
}
void
dsl_pool_close(dsl_pool_t *dp)
{
/* drop our reference from dsl_pool_open() */
- dsl_dir_close(dp->dp_mos_dir, dp);
- dsl_dir_close(dp->dp_root_dir, dp);
+ if (dp->dp_mos_dir)
+ dsl_dir_close(dp->dp_mos_dir, dp);
+ if (dp->dp_root_dir)
+ dsl_dir_close(dp->dp_root_dir, dp);
/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
- dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+ if (dp->dp_meta_objset)
+ dmu_objset_evict(NULL, dp->dp_meta_objset->os);
txg_list_destroy(&dp->dp_dirty_datasets);
txg_list_destroy(&dp->dp_dirty_dirs);
@@ -132,14 +150,13 @@ dsl_pool_create(spa_t *spa, uint64_t txg)
/* create and open the root dir */
dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
- dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
- NULL, dp);
+ VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir));
/* create and open the meta-objset dir */
- err = dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME,
- tx);
+ VERIFY(0 == dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx));
ASSERT3U(err, ==, 0);
- dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+ VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
dmu_tx_commit(tx);
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
index 3feb93e468..fc33b1c591 100644
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -75,7 +74,10 @@ dsl_prop_get_impl(dsl_pool_t *dp, uint64_t ddobj, const char *propname,
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
while (ddobj != 0) {
- dsl_dir_t *dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
+ dsl_dir_t *dd;
+ err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ break;
err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
propname, intsz, numint, buf);
if (err != ENOENT) {
@@ -136,7 +138,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
cbr->cbr_func(cbr->cbr_arg, value);
- (void) dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, cbr);
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
+ NULL, cbr, &dd));
rw_exit(&dd->dd_pool->dp_config_rwlock);
/* Leave dataset open until this callback is unregistered */
return (0);
@@ -164,9 +167,9 @@ dsl_prop_get(const char *ddname, const char *propname,
const char *tail;
int err;
- dd = dsl_dir_open(ddname, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(ddname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
if (tail && tail[0] != '@') {
dsl_dir_close(dd, FTAG);
return (ENOENT);
@@ -258,7 +261,9 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
int err;
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
- dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
+ err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ return;
if (!first) {
/*
@@ -353,15 +358,15 @@ dsl_prop_set(const char *ddname, const char *propname,
int err;
struct prop_set_arg psa;
- dd = dsl_dir_open(ddname, FTAG, NULL);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
psa.name = propname;
psa.intsz = intsz;
psa.numints = numints;
psa.buf = buf;
- err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 0);
+ err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 1<<20);
dsl_dir_close(dd, FTAG);
@@ -457,10 +462,12 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
if (dd->dd_phys->dd_parent_obj == 0)
parent = NULL;
else
- parent = dsl_dir_open_obj(dp,
- dd->dd_phys->dd_parent_obj, NULL, FTAG);
+ err = dsl_dir_open_obj(dp,
+ dd->dd_phys->dd_parent_obj, NULL, FTAG, &parent);
if (dd != ds->ds_dir)
dsl_dir_close(dd, FTAG);
+ if (err)
+ break;
dd = parent;
}
rw_exit(&dp->dp_config_rwlock);
diff --git a/usr/src/uts/common/fs/zfs/fletcher.c b/usr/src/uts/common/fs/zfs/fletcher.c
index 03186d1387..edda3c9a9d 100644
--- a/usr/src/uts/common/fs/zfs/fletcher.c
+++ b/usr/src/uts/common/fs/zfs/fletcher.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -98,3 +97,49 @@ fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
}
+
+void
+fletcher_4_incremental_native(const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
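
The new fletcher_4_incremental_native() and fletcher_4_incremental_byteswap() above differ from the one-shot routines only in that they resume from the running sums already stored in the checksum instead of starting from zero, so a stream can be checksummed chunk by chunk. The program below is a user-space re-implementation written purely for illustration; cksum_t stands in for zio_cksum_t, and the chunk boundaries must stay multiples of 4 bytes because the loop consumes 32-bit words. Feeding the buffer in two chunks from a zeroed checksum produces the same result as the one-shot pass.

/*
 * Illustrative sketch only -- a user-space re-implementation mirroring the
 * fletcher_4 routines above.  With incremental != 0 the running sums are
 * resumed from *zcp, so chunked updates equal the one-shot checksum.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct cksum { uint64_t zc_word[4]; } cksum_t;

static void
fletcher_4(const void *buf, uint64_t size, cksum_t *zcp, int incremental)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = 0, b = 0, c = 0, d = 0;

	if (incremental) {		/* resume from the stored sums */
		a = zcp->zc_word[0];
		b = zcp->zc_word[1];
		c = zcp->zc_word[2];
		d = zcp->zc_word[3];
	}
	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}
	zcp->zc_word[0] = a;
	zcp->zc_word[1] = b;
	zcp->zc_word[2] = c;
	zcp->zc_word[3] = d;
}

int
main(void)
{
	uint32_t data[1024];
	cksum_t whole, partial;
	uint64_t i;

	for (i = 0; i < 1024; i++)
		data[i] = (uint32_t)(i * 2654435761u);

	fletcher_4(data, sizeof (data), &whole, 0);	/* one shot */

	memset(&partial, 0, sizeof (partial));		/* zeroed running sums */
	fletcher_4(data, 512, &partial, 1);		/* first 512 bytes */
	fletcher_4((const char *)data + 512, sizeof (data) - 512,
	    &partial, 1);				/* remainder */

	printf("checksums match: %s\n",
	    memcmp(&whole, &partial, sizeof (whole)) == 0 ? "yes" : "no");
	return (0);
}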
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 9d682e4990..d31e6edda3 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -379,11 +378,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
os, tx);
}
- db = dmu_bonus_hold(os, smo->smo_object);
+ VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, ==, sizeof (*smo));
bcopy(smo, db->db_data, db->db_size);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
}
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 9b9bcab217..02be864b36 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -33,6 +32,7 @@
*/
#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
@@ -62,6 +62,44 @@ static uint32_t spa_active_count;
* ==========================================================================
*/
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+ spa_error_entry_t *sa = (spa_error_entry_t *)a;
+ spa_error_entry_t *sb = (spa_error_entry_t *)b;
+ int ret;
+
+ ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
+ sizeof (zbookmark_t));
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Utility function which retrieves copies of the current logs and
+ * re-initializes them in the process.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+ ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+ bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
+ bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
/*
* Activate an uninitialized pool.
*/
@@ -76,9 +114,6 @@ spa_activate(spa_t *spa)
spa->spa_normal_class = metaslab_class_create();
- spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
- 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
-
for (t = 0; t < ZIO_TYPES; t++) {
spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
8, maxclsyspri, 50, INT_MAX,
@@ -95,6 +130,13 @@ spa_activate(spa_t *spa)
txg_list_create(&spa->spa_vdev_txg_list,
offsetof(struct vdev, vdev_txg_node));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
}
/*
@@ -124,12 +166,18 @@ spa_deactivate(spa_t *spa)
spa->spa_zio_intr_taskq[t] = NULL;
}
- taskq_destroy(spa->spa_vdev_retry_taskq);
- spa->spa_vdev_retry_taskq = NULL;
-
metaslab_class_destroy(spa->spa_normal_class);
spa->spa_normal_class = NULL;
+ /*
+ * If this was part of an import or the open otherwise failed, we may
+ * still have errors left in the queues. Empty them just in case.
+ */
+ spa_errlog_drain(spa);
+
+ avl_destroy(&spa->spa_errlist_scrub);
+ avl_destroy(&spa->spa_errlist_last);
+
spa->spa_state = POOL_STATE_UNINITIALIZED;
}
@@ -175,6 +223,11 @@ static void
spa_unload(spa_t *spa)
{
/*
+ * Stop async tasks.
+ */
+ spa_async_suspend(spa);
+
+ /*
* Stop syncing.
*/
if (spa->spa_sync_on) {
@@ -185,8 +238,8 @@ spa_unload(spa_t *spa)
/*
* Wait for any outstanding prefetch I/O to complete.
*/
- spa_config_enter(spa, RW_WRITER);
- spa_config_exit(spa);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_config_exit(spa, FTAG);
/*
* Close the dsl pool.
@@ -203,16 +256,16 @@ spa_unload(spa_t *spa)
vdev_free(spa->spa_root_vdev);
spa->spa_root_vdev = NULL;
}
+
+ spa->spa_async_suspended = 0;
}
/*
* Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information. The 'readonly' flag will prevent us
- * from writing any updated state to disk, and can be use when testing a pool
- * for import.
+ * source of configuration information.
*/
static int
-spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
+spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
int error = 0;
nvlist_t *nvroot = NULL;
@@ -221,25 +274,34 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
uint64_t pool_guid;
zio_t *zio;
+ spa->spa_load_state = state;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
- return (EINVAL);
+ nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+ error = EINVAL;
+ goto out;
+ }
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
&spa->spa_config_txg);
- if (import && spa_guid_exists(pool_guid, 0))
- return (EEXIST);
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+ error = EEXIST;
+ goto out;
+ }
/*
* Parse the configuration into a vdev tree.
*/
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
- if (rvd == NULL)
- return (EINVAL);
+ if (rvd == NULL) {
+ error = EINVAL;
+ goto out;
+ }
spa->spa_root_vdev = rvd;
ASSERT(spa_guid(spa) == pool_guid);
@@ -247,8 +309,10 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
/*
* Try to open all vdevs, loading each label in the process.
*/
- if (vdev_open(rvd) != 0)
- return (ENXIO);
+ if (vdev_open(rvd) != 0) {
+ error = ENXIO;
+ goto out;
+ }
/*
* Find the best uberblock.
@@ -264,8 +328,16 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
* If we weren't able to find a single valid uberblock, return failure.
*/
if (ub->ub_txg == 0) {
- dprintf("ub_txg is zero\n");
- return (ENXIO);
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * If the pool is newer than the code, we can't open it.
+ */
+ if (ub->ub_version > UBERBLOCK_VERSION) {
+ error = ENOTSUP;
+ goto out;
}
/*
@@ -273,11 +345,10 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
* incomplete configuration.
*/
if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
- rvd->vdev_state = VDEV_STATE_CANT_OPEN;
- rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
- dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
- rvd->vdev_guid_sum, ub->ub_guid_sum);
- return (ENXIO);
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_GUID_SUM);
+ error = ENXIO;
+ goto out;
}
/*
@@ -286,12 +357,22 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
- spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
+ error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ goto out;
+ }
spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
- VERIFY(zap_lookup(spa->spa_meta_objset,
+ if (zap_lookup(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object) == 0);
+ sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
if (!mosconfig) {
dmu_buf_t *db;
@@ -299,21 +380,24 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
size_t nvsize = 0;
nvlist_t *newconfig = NULL;
- db = dmu_bonus_hold(spa->spa_meta_objset,
- spa->spa_config_object);
- dmu_buf_read(db);
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object, FTAG, &db));
nvsize = *(uint64_t *)db->db_data;
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
packed = kmem_alloc(nvsize, KM_SLEEP);
- error = dmu_read_canfail(spa->spa_meta_objset,
+ error = dmu_read(spa->spa_meta_objset,
spa->spa_config_object, 0, nvsize, packed);
if (error == 0)
error = nvlist_unpack(packed, nvsize, &newconfig, 0);
kmem_free(packed, nvsize);
- if (error)
- return (ENXIO);
+ if (error) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
spa_config_set(spa, newconfig);
@@ -321,39 +405,76 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
spa_deactivate(spa);
spa_activate(spa);
- return (spa_load(spa, newconfig, readonly, import, B_TRUE));
+ return (spa_load(spa, newconfig, state, B_TRUE));
}
- VERIFY(zap_lookup(spa->spa_meta_objset,
+ if (zap_lookup(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
/*
- * Load the vdev state for all top level vdevs.
+ * Load the persistent error log. If we have an older pool, this will
+ * not be present.
*/
- if ((error = vdev_load(rvd, import)) != 0)
- return (error);
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+ sizeof (uint64_t), 1, &spa->spa_errlog_last);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+ sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load the vdev state for all top level vdevs. We need to grab the
+ * config lock because all label I/O is done with the
+ * ZIO_FLAG_CONFIG_HELD flag.
+ */
+ spa_config_enter(spa, RW_READER, FTAG);
+ if ((error = vdev_load(rvd)) != 0) {
+ spa_config_exit(spa, FTAG);
+ goto out;
+ }
+ spa_config_exit(spa, FTAG);
/*
* Propagate the leaf DTLs we just loaded all the way up the tree.
*/
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
/*
* Check the state of the root vdev. If it can't be opened, it
* indicates one or more toplevel vdevs are faulted.
*/
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
- return (ENXIO);
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ error = ENXIO;
+ goto out;
+ }
/*
* Claim log blocks that haven't been committed yet, and update all
* top-level vdevs to sync any config changes found in vdev_load().
* This must all happen in a single txg.
*/
- if ((spa_mode & FWRITE) && !readonly) {
+ if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
spa_first_txg(spa));
dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
@@ -369,7 +490,14 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
txg_wait_synced(spa->spa_dsl_pool, 0);
}
- return (0);
+ error = 0;
+out:
+ if (error)
+ zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+ spa->spa_load_state = SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
}
/*
@@ -415,7 +543,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
spa_activate(spa);
error = spa_load(spa, spa->spa_config,
- B_FALSE, B_FALSE, B_FALSE);
+ SPA_LOAD_OPEN, B_FALSE);
if (error == EBADF) {
/*
@@ -432,7 +560,9 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
if (locked)
mutex_exit(&spa_namespace_lock);
return (ENOENT);
- } if (error) {
+ }
+
+ if (error) {
/*
* We can't open the pool, but we still have useful
* information: the state of each vdev after the
@@ -443,10 +573,14 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
B_TRUE);
spa_unload(spa);
spa_deactivate(spa);
+ spa->spa_last_open_failed = B_TRUE;
if (locked)
mutex_exit(&spa_namespace_lock);
*spapp = NULL;
return (error);
+ } else {
+ zfs_post_ok(spa, NULL);
+ spa->spa_last_open_failed = B_FALSE;
}
loaded = B_TRUE;
@@ -459,9 +593,9 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
*spapp = spa;
if (config != NULL) {
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
}
/*
@@ -479,8 +613,36 @@ spa_open(const char *name, spa_t **spapp, void *tag)
return (spa_open_common(name, spapp, tag, NULL));
}
+/*
+ * Lookup the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+ spa_t *spa;
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (NULL);
+ }
+ spa->spa_inject_ref++;
+ mutex_exit(&spa_namespace_lock);
+
+ return (spa);
+}
+
+void
+spa_inject_delref(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock);
+ spa->spa_inject_ref--;
+ mutex_exit(&spa_namespace_lock);
+}
+
int
-spa_get_stats(const char *name, nvlist_t **config)
+spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
int error;
spa_t *spa;
@@ -488,6 +650,29 @@ spa_get_stats(const char *name, nvlist_t **config)
*config = NULL;
error = spa_open_common(name, &spa, FTAG, config);
+ if (spa && *config != NULL)
+ VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
if (spa != NULL)
spa_close(spa, FTAG);
@@ -551,9 +736,11 @@ spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
DMU_OT_PACKED_NVLIST, 1 << 14,
DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
- VERIFY(zap_add(spa->spa_meta_objset,
+ if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool config");
+ }
/*
* Create the deferred-free bplist object. Turn off compression
@@ -565,9 +752,11 @@ spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
ZIO_COMPRESS_OFF, tx);
- VERIFY(zap_add(spa->spa_meta_objset,
+ if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bplist");
+ }
dmu_tx_commit(tx);
@@ -619,7 +808,7 @@ spa_import(const char *pool, nvlist_t *config, char *altroot)
* Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
* so that we don't try to open the pool if the config is damaged.
*/
- error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);
+ error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
if (error) {
spa_unload(spa);
@@ -694,7 +883,7 @@ spa_tryimport(nvlist_t *tryconfig)
* Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
* so we don't try to open the pool if the config is damaged.
*/
- (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);
+ (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
/*
* If 'tryconfig' was at least parsable, return the current config.
@@ -738,6 +927,16 @@ spa_export_common(char *pool, int new_state)
}
/*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ /*
* The pool will be in core if it's openable,
* in which case we can modify its state.
*/
@@ -749,17 +948,20 @@ spa_export_common(char *pool, int new_state)
spa_scrub_suspend(spa);
txg_wait_synced(spa->spa_dsl_pool, 0);
- if (!spa_refcount_zero(spa)) {
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ */
+ if (!spa_refcount_zero(spa) ||
+ (spa->spa_inject_ref != 0 &&
+ new_state != POOL_STATE_UNINITIALIZED)) {
spa_scrub_resume(spa);
+ spa_async_resume(spa);
mutex_exit(&spa_namespace_lock);
return (EBUSY);
}
- /*
- * Update the pool state.
- */
- spa->spa_state = new_state;
-
spa_scrub_resume(spa);
VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
@@ -771,7 +973,10 @@ spa_export_common(char *pool, int new_state)
* so mark them all dirty. spa_unload() will do the
* final sync that pushes these changes out.
*/
- vdev_config_dirty(spa->spa_root_vdev);
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa->spa_state = new_state;
+ vdev_config_dirty(spa->spa_root_vdev);
+ }
}
if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
@@ -779,8 +984,10 @@ spa_export_common(char *pool, int new_state)
spa_deactivate(spa);
}
- spa_remove(spa);
- spa_config_sync();
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa_remove(spa);
+ spa_config_sync();
+ }
mutex_exit(&spa_namespace_lock);
return (0);
@@ -805,6 +1012,17 @@ spa_export(char *pool)
}
/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
+}
+
+
+/*
* ==========================================================================
* Device manipulation
* ==========================================================================
@@ -845,7 +1063,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
tvd->vdev_id = rvd->vdev_children;
vdev_add_child(rvd, tvd);
}
- vdev_init(tvd, txg);
+ if ((error = vdev_init(tvd, txg)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
vdev_config_dirty(tvd);
}
@@ -871,7 +1090,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* is automatically detached.
*/
int
-spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
uint64_t txg, open_txg;
int error;
@@ -881,7 +1100,7 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
txg = spa_vdev_enter(spa);
- oldvd = vdev_lookup_by_path(rvd, path);
+ oldvd = vdev_lookup_by_guid(rvd, guid);
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
@@ -954,6 +1173,12 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
newvd->vdev_id = pvd->vdev_children;
vdev_add_child(pvd, newvd);
+ /*
+ * If newvd is smaller than oldvd, but larger than its rsize,
+ * the addition of newvd may have decreased our parent's asize.
+ */
+ pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
+
tvd = newvd->vdev_top;
ASSERT(pvd->vdev_top == tvd);
ASSERT(tvd->vdev_parent == rvd);
@@ -962,7 +1187,6 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
* Update the config based on the new in-core state.
*/
spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
-
vdev_config_dirty(tvd);
/*
@@ -976,14 +1200,14 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
open_txg - TXG_INITIAL + 1);
mutex_exit(&newvd->vdev_dtl_lock);
+ dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);
+
/*
* Mark newvd's DTL dirty in this txg.
*/
vdev_dirty(tvd, VDD_DTL, txg);
(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);
- dprintf("attached %s, replacing=%d\n", path, replacing);
-
(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
/*
@@ -1000,7 +1224,7 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
* is a replacing vdev.
*/
int
-spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
+spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
uint64_t txg;
int c, t, error;
@@ -1009,14 +1233,11 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
txg = spa_vdev_enter(spa);
- vd = vdev_lookup_by_path(rvd, path);
+ vd = vdev_lookup_by_guid(rvd, guid);
if (vd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
- if (guid != 0 && vd->vdev_guid != guid)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
pvd = vd->vdev_parent;
/*
@@ -1105,13 +1326,16 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
/*
* Reopen this top-level vdev to reassess health after detach.
*/
- vdev_reopen(tvd, NULL);
+ vdev_reopen(tvd);
/*
* If the device we just detached was smaller than the others,
- * it may be possible to add metaslabs (i.e. grow the pool).
+ * it may be possible to add metaslabs (i.e. grow the pool). We ignore
+ * the error here because the detach still succeeded - we just weren't
+ * able to reinitialize the metaslabs. This pool is in for a world of
+ * hurt, in any case.
*/
- vdev_metaslab_init(tvd, txg);
+ (void) vdev_metaslab_init(tvd, txg);
/*
* Update the config based on the new in-core state.
@@ -1133,72 +1357,59 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
- dprintf("detached %s\n", path);
+ dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);
return (spa_vdev_exit(spa, vd, txg, 0));
}
/*
- * If there are any replacing vdevs that have finished replacing, detach them.
- * We can't hold the config lock across detaches, so we lock the config,
- * build a list of candidates, unlock the config, and try each candidate.
+ * Find any device that's done replacing, so we can detach it.
*/
-typedef struct vdev_detach_link {
- char *vdl_path;
- uint64_t vdl_guid;
- list_node_t vdl_node;
-} vdev_detach_link_t;
-
-static void
-spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
+static vdev_t *
+spa_vdev_replace_done_hunt(vdev_t *vd)
{
+ vdev_t *newvd, *oldvd;
int c;
- for (c = 0; c < vd->vdev_children; c++)
- spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);
+ for (c = 0; c < vd->vdev_children; c++) {
+ oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
+ if (oldvd != NULL)
+ return (oldvd);
+ }
if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
- vdev_t *cvd0 = vd->vdev_child[0];
- vdev_t *cvd1 = vd->vdev_child[1];
- vdev_detach_link_t *vdl;
- int dirty1;
-
- mutex_enter(&cvd1->vdev_dtl_lock);
- dirty1 = cvd1->vdev_dtl_map.sm_space |
- cvd1->vdev_dtl_scrub.sm_space;
- mutex_exit(&cvd1->vdev_dtl_lock);
-
- if (!dirty1) {
- vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
- vdl->vdl_path = spa_strdup(cvd0->vdev_path);
- vdl->vdl_guid = cvd0->vdev_guid;
- list_insert_tail(l, vdl);
+ oldvd = vd->vdev_child[0];
+ newvd = vd->vdev_child[1];
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ if (newvd->vdev_dtl_map.sm_space == 0 &&
+ newvd->vdev_dtl_scrub.sm_space == 0) {
+ mutex_exit(&newvd->vdev_dtl_lock);
+ return (oldvd);
}
+ mutex_exit(&newvd->vdev_dtl_lock);
}
+
+ return (NULL);
}
-void
+static void
spa_vdev_replace_done(spa_t *spa)
{
- vdev_detach_link_t *vdl;
- list_t vdlist;
-
- list_create(&vdlist, sizeof (vdev_detach_link_t),
- offsetof(vdev_detach_link_t, vdl_node));
-
- spa_config_enter(spa, RW_READER);
- spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
- spa_config_exit(spa);
-
- while ((vdl = list_head(&vdlist)) != NULL) {
- list_remove(&vdlist, vdl);
- (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
- B_TRUE);
- spa_strfree(vdl->vdl_path);
- kmem_free(vdl, sizeof (*vdl));
+ vdev_t *vd;
+ uint64_t guid;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
+ guid = vd->vdev_guid;
+ spa_config_exit(spa, FTAG);
+ if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+ return;
+ spa_config_enter(spa, RW_READER, FTAG);
}
- list_destroy(&vdlist);
+ spa_config_exit(spa, FTAG);
}
/*
@@ -1234,7 +1445,16 @@ spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
* ==========================================================================
*/
-static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);
+void
+spa_scrub_throttle(spa_t *spa, int direction)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_throttled += direction;
+ ASSERT(spa->spa_scrub_throttled >= 0);
+ if (spa->spa_scrub_throttled == 0)
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
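For illustration, a producer of competing I/O would bracket its work with matching throttle calls; the function below is hypothetical, and only the pairing of the +1/-1 directions matters (the scrub thread sleeps while the count is nonzero and is woken when it returns to zero):

	/* Hypothetical caller: hold off scrub I/O while our work is in flight. */
	static void
	throttled_io(spa_t *spa)
	{
		spa_scrub_throttle(spa, 1);	/* one more throttler active */
		/* ... issue and wait for the competing I/O ... */
		spa_scrub_throttle(spa, -1);	/* last one out wakes the scrub */
	}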
static void
spa_scrub_io_done(zio_t *zio)
@@ -1244,22 +1464,23 @@ spa_scrub_io_done(zio_t *zio)
zio_buf_free(zio->io_data, zio->io_size);
mutex_enter(&spa->spa_scrub_lock);
- if (zio->io_error)
- spa->spa_scrub_errors++;
- if (--spa->spa_scrub_inflight == 0)
- cv_broadcast(&spa->spa_scrub_io_cv);
- mutex_exit(&spa->spa_scrub_lock);
-
- if (zio->io_error) {
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
vdev_t *vd = zio->io_vd;
+ spa->spa_scrub_errors++;
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_scrub_errors++;
mutex_exit(&vd->vdev_stat_lock);
}
+ if (--spa->spa_scrub_inflight == 0) {
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ ASSERT(spa->spa_scrub_throttled == 0);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
}
static void
-spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
+ zbookmark_t *zb)
{
size_t size = BP_GET_LSIZE(bp);
void *data = zio_buf_alloc(size);
@@ -1268,8 +1489,13 @@ spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
+ if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */
+
+ flags |= ZIO_FLAG_CANFAIL;
+
zio_nowait(zio_read(NULL, spa, bp, data, size,
- spa_scrub_io_done, NULL, priority, flags));
+ spa_scrub_io_done, NULL, priority, flags, zb));
}
/* ARGSUSED */
@@ -1319,12 +1545,11 @@ spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
}
if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
- ZIO_FLAG_RESILVER);
+ ZIO_FLAG_RESILVER, &bc->bc_bookmark);
}
} else {
spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
+ ZIO_FLAG_SCRUB, &bc->bc_bookmark);
}
return (0);
@@ -1348,19 +1573,25 @@ spa_scrub_thread(spa_t *spa)
*/
txg_wait_synced(spa_get_dsl(spa), 0);
- spa_config_enter(spa, RW_WRITER);
- vdev_reopen(rvd, NULL); /* purge all vdev caches */
+ dprintf("start %s mintxg=%llu maxtxg=%llu\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+ spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ vdev_reopen(rvd); /* purge all vdev caches */
vdev_config_dirty(rvd); /* rewrite all disk labels */
vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_errors = 0;
spa->spa_scrub_active = 1;
+ ASSERT(spa->spa_scrub_inflight == 0);
+ ASSERT(spa->spa_scrub_throttled == 0);
while (!spa->spa_scrub_stop) {
CALLB_CPR_SAFE_BEGIN(&cprinfo);
- while (spa->spa_scrub_suspend) {
+ while (spa->spa_scrub_suspended) {
spa->spa_scrub_active = 0;
cv_broadcast(&spa->spa_scrub_cv);
cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1376,6 +1607,9 @@ spa_scrub_thread(spa_t *spa)
mutex_enter(&spa->spa_scrub_lock);
if (error != EAGAIN)
break;
+
+ while (spa->spa_scrub_throttled > 0)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
}
while (spa->spa_scrub_inflight)
@@ -1384,16 +1618,25 @@ spa_scrub_thread(spa_t *spa)
if (spa->spa_scrub_restart_txg != 0)
error = ERESTART;
+ if (spa->spa_scrub_stop)
+ error = EINTR;
+
spa->spa_scrub_active = 0;
cv_broadcast(&spa->spa_scrub_cv);
/*
- * If the traverse completed, and there were no errors,
- * then the scrub was completely successful.
+ * Even if there were uncorrectable errors, we consider the scrub
+ * completed. The downside is that if there is a transient error during
+ * a resilver, we won't resilver the data properly to the target. But
+ * if the damage is permanent (more likely) we will resilver forever,
+ * which isn't really acceptable. Since there is enough information for
+ * the user to know what has failed and why, this seems like a more
+ * tractable approach.
*/
- complete = (error == 0 && spa->spa_scrub_errors == 0);
+ complete = (error == 0);
- dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
error, spa->spa_scrub_errors, spa->spa_scrub_stop);
@@ -1403,31 +1646,32 @@ spa_scrub_thread(spa_t *spa)
* If the scrub/resilver completed, update all DTLs to reflect this.
* Whether it succeeded or not, vacate all temporary scrub DTLs.
*/
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
- spa_config_exit(spa);
-
- spa_vdev_replace_done(spa);
-
- spa_config_enter(spa, RW_READER);
vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
- spa_config_exit(spa);
+ spa_errlog_rotate(spa);
+ spa_config_exit(spa, FTAG);
mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_type = POOL_SCRUB_NONE;
- spa->spa_scrub_active = 0;
- spa->spa_scrub_thread = NULL;
-
- cv_broadcast(&spa->spa_scrub_cv);
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
/*
* If we were told to restart, our final act is to start a new scrub.
*/
if (error == ERESTART)
- VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);
+ spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
+ SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
+ spa->spa_scrub_type = POOL_SCRUB_NONE;
+ spa->spa_scrub_active = 0;
+ spa->spa_scrub_thread = NULL;
+ cv_broadcast(&spa->spa_scrub_cv);
CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
thread_exit();
}
@@ -1436,7 +1680,7 @@ void
spa_scrub_suspend(spa_t *spa)
{
mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_suspend++;
+ spa->spa_scrub_suspended++;
while (spa->spa_scrub_active) {
cv_broadcast(&spa->spa_scrub_cv);
cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1450,8 +1694,8 @@ void
spa_scrub_resume(spa_t *spa)
{
mutex_enter(&spa->spa_scrub_lock);
- ASSERT(spa->spa_scrub_suspend != 0);
- if (--spa->spa_scrub_suspend == 0)
+ ASSERT(spa->spa_scrub_suspended != 0);
+ if (--spa->spa_scrub_suspended == 0)
cv_broadcast(&spa->spa_scrub_cv);
mutex_exit(&spa->spa_scrub_lock);
}
@@ -1469,17 +1713,19 @@ spa_scrub_restart(spa_t *spa, uint64_t txg)
mutex_exit(&spa->spa_scrub_lock);
}
-static int
-spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
space_seg_t *ss;
uint64_t mintxg, maxtxg;
vdev_t *rvd = spa->spa_root_vdev;
- int advance = 0;
+ int advance = ADVANCE_PRE | ADVANCE_ZIL;
if ((uint_t)type >= POOL_SCRUB_TYPES)
return (ENOTSUP);
+ mutex_enter(&spa->spa_scrub_lock);
+
/*
* If there's a scrub or resilver already in progress, stop it.
*/
@@ -1487,9 +1733,10 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
/*
* Don't stop a resilver unless forced.
*/
- if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
+ mutex_exit(&spa->spa_scrub_lock);
return (EBUSY);
-
+ }
spa->spa_scrub_stop = 1;
cv_broadcast(&spa->spa_scrub_cv);
cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1503,19 +1750,36 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
spa->spa_scrub_th = NULL;
}
- spa->spa_scrub_stop = 0;
- spa->spa_scrub_type = type;
- spa->spa_scrub_restart_txg = 0;
+ if (rvd == NULL) {
+ ASSERT(spa->spa_scrub_stop == 0);
+ ASSERT(spa->spa_scrub_type == type);
+ ASSERT(spa->spa_scrub_restart_txg == 0);
+ mutex_exit(&spa->spa_scrub_lock);
+ return (0);
+ }
mintxg = TXG_INITIAL - 1;
maxtxg = spa_last_synced_txg(spa) + 1;
- switch (type) {
+ mutex_enter(&rvd->vdev_dtl_lock);
- case POOL_SCRUB_NONE:
- break;
+ if (rvd->vdev_dtl_map.sm_space == 0) {
+ /*
+ * The pool-wide DTL is empty.
+ * If this is a resilver, there's nothing to do.
+ */
+ if (type == POOL_SCRUB_RESILVER)
+ type = POOL_SCRUB_NONE;
+ } else {
+ /*
+ * The pool-wide DTL is non-empty.
+ * If this is a normal scrub, upgrade to a resilver instead.
+ */
+ if (type == POOL_SCRUB_EVERYTHING)
+ type = POOL_SCRUB_RESILVER;
+ }
- case POOL_SCRUB_RESILVER:
+ if (type == POOL_SCRUB_RESILVER) {
/*
* Determine the resilvering boundaries.
*
@@ -1525,26 +1789,22 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
* Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
* so we don't claim to resilver a txg that's still changing.
*/
- mutex_enter(&rvd->vdev_dtl_lock);
ss = avl_first(&rvd->vdev_dtl_map.sm_root);
- mintxg = ss ? ss->ss_start - 1 : 0;
+ mintxg = ss->ss_start - 1;
ss = avl_last(&rvd->vdev_dtl_map.sm_root);
- maxtxg = ss ? ss->ss_end : 0;
- maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
- mutex_exit(&rvd->vdev_dtl_lock);
+ maxtxg = MIN(ss->ss_end, maxtxg);
- advance = ADVANCE_PRE | ADVANCE_PRUNE;
- break;
-
- case POOL_SCRUB_EVERYTHING:
- /*
- * A scrub is like a resilver, but not pruned by DTL.
- */
- advance = ADVANCE_PRE;
- break;
+ advance |= ADVANCE_PRUNE;
}
- if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) {
+ mutex_exit(&rvd->vdev_dtl_lock);
+
+ spa->spa_scrub_stop = 0;
+ spa->spa_scrub_type = type;
+ spa->spa_scrub_restart_txg = 0;
+
+ if (type != POOL_SCRUB_NONE) {
+ spa->spa_scrub_mintxg = mintxg;
spa->spa_scrub_maxtxg = maxtxg;
spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
advance, ZIO_FLAG_CANFAIL);
@@ -1553,24 +1813,119 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}
+ mutex_exit(&spa->spa_scrub_lock);
+
return (0);
}
-int
-spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
+static void
+spa_async_reopen(spa_t *spa)
{
- int error;
- traverse_handle_t *th;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd;
+ int c;
- mutex_enter(&spa->spa_scrub_lock);
- error = spa_scrub_locked(spa, type, force);
- th = spa->spa_scrub_th;
- mutex_exit(&spa->spa_scrub_lock);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ for (c = 0; c < rvd->vdev_children; c++) {
+ tvd = rvd->vdev_child[c];
+ if (tvd->vdev_reopen_wanted) {
+ tvd->vdev_reopen_wanted = 0;
+ vdev_reopen(tvd);
+ }
+ }
+
+ spa_config_exit(spa, FTAG);
+}
- if (th == NULL && type != POOL_SCRUB_NONE)
+static void
+spa_async_thread(spa_t *spa)
+{
+ int tasks;
+
+ ASSERT(spa->spa_sync_on);
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ spa->spa_async_tasks = 0;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if any devices need to be reopened.
+ */
+ if (tasks & SPA_ASYNC_REOPEN)
+ spa_async_reopen(spa);
+
+ /*
+ * If any devices are done replacing, detach them.
+ */
+ if (tasks & SPA_ASYNC_REPLACE_DONE)
spa_vdev_replace_done(spa);
- return (error);
+ /*
+ * Kick off a scrub.
+ */
+ if (tasks & SPA_ASYNC_SCRUB)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
+
+ /*
+ * Kick off a resilver.
+ */
+ if (tasks & SPA_ASYNC_RESILVER)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+void
+spa_async_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_suspended++;
+ while (spa->spa_async_thread != NULL)
+ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ ASSERT(spa->spa_async_suspended != 0);
+ spa->spa_async_suspended--;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+static void
+spa_async_dispatch(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+ spa->spa_async_thread == NULL)
+ spa->spa_async_thread = thread_create(NULL, 0,
+ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_request(spa_t *spa, int task)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks |= task;
+ mutex_exit(&spa->spa_async_lock);
}
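For illustration, requesting async work is just a matter of setting a task bit; the actual dispatch happens from spa_sync(). A minimal sketch with a hypothetical caller:

	/* Hypothetical caller: schedule a resilver without blocking. */
	static void
	schedule_resilver(spa_t *spa)
	{
		/*
		 * spa_async_dispatch(), invoked at the end of spa_sync(),
		 * creates the async thread if none is running and the spa
		 * is not suspended.
		 */
		spa_async_request(spa, SPA_ASYNC_RESILVER);
	}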
/*
@@ -1628,17 +1983,19 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
packed = kmem_alloc(nvsize, KM_SLEEP);
- VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);
+ VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
packed, tx);
kmem_free(packed, nvsize);
- db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
*(uint64_t *)db->db_data = nvsize;
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
/*
@@ -1651,7 +2008,6 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bplist_t *bpl = &spa->spa_sync_bplist;
- vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
int dirty_vdevs;
@@ -1659,12 +2015,12 @@ spa_sync(spa_t *spa, uint64_t txg)
/*
* Lock out configuration changes.
*/
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
- bplist_open(bpl, mos, spa->spa_sync_bplist_obj);
+ VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
/*
* If anything has changed in this txg, push the deferred frees
@@ -1685,6 +2041,8 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_sync_config_object(spa, tx);
dmu_tx_commit(tx);
+ spa_errlog_sync(spa, txg);
+
dsl_pool_sync(dp, txg);
dirty_vdevs = 0;
@@ -1707,11 +2065,7 @@ spa_sync(spa_t *spa, uint64_t txg)
* Rewrite the vdev configuration (which includes the uberblock)
* to commit the transaction group.
*/
- while (spa_sync_labels(spa, txg)) {
- dprintf("waiting for devices to heal\n");
- delay(hz);
- vdev_reopen(rvd, NULL);
- }
+ VERIFY(0 == spa_sync_labels(spa, txg));
/*
* Make a stable copy of the fully synced uberblock.
@@ -1748,7 +2102,12 @@ spa_sync(spa_t *spa, uint64_t txg)
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
ASSERT(bpl->bpl_queue == NULL);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * If any async tasks have been requested, kick them off.
+ */
+ spa_async_dispatch(spa);
}
/*
@@ -1800,13 +2159,13 @@ spa_evict_all(void)
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(NULL)) != NULL) {
/*
- * Stop all scrub and resilver activity. spa_scrub() needs to
- * wait for the scrub thread, which may do a detach and sync the
- * configs, which needs spa_namespace_lock. Drop the lock while
- * maintaining a hold on the spa_t.
+ * Stop async tasks. The async thread may need to detach
+ * a device that's been replaced, which requires grabbing
+ * spa_namespace_lock, so we must drop it here.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);
@@ -1819,3 +2178,9 @@ spa_evict_all(void)
}
mutex_exit(&spa_namespace_lock);
}
+
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid)
+{
+ return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index abcd67ddb9..addf3af885 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +32,11 @@
#include <sys/fs/zfs.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_ioctl.h>
+#ifdef _KERNEL
+#include <sys/kobj.h>
+#endif
+
+extern int modrootloaded;
/*
* Pool configuration repository.
@@ -65,43 +69,39 @@ const char *spa_config_dir = ZPOOL_CACHE_DIR;
void
spa_config_load(void)
{
- vnode_t *vp;
void *buf = NULL;
- vattr_t vattr;
- ssize_t resid;
nvlist_t *nvlist, *child;
nvpair_t *nvpair;
spa_t *spa;
char pathname[128];
+ struct _buf *file;
+ struct bootstat bst;
/*
* Open the configuration file.
*/
- (void) snprintf(pathname, sizeof (pathname), "./%s/%s", spa_config_dir,
- ZPOOL_CACHE_FILE);
- if (vn_openat(pathname, UIO_SYSSPACE, FREAD | FOFFMAX, 0, &vp, 0, 0,
- rootdir) != 0)
+ (void) snprintf(pathname, sizeof (pathname), "%s%s/%s",
+ (modrootloaded) ? "./" : "", spa_config_dir, ZPOOL_CACHE_FILE);
+
+ file = kobj_open_file(pathname);
+ if (file == (struct _buf *)-1)
return;
- /*
- * Read the nvlist from the file.
- */
- if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0)
+ if (kobj_fstat(file->_fd, &bst) != 0)
goto out;
- buf = kmem_alloc(vattr.va_size, KM_SLEEP);
+ buf = kmem_alloc(bst.st_size, KM_SLEEP);
- if (vn_rdwr(UIO_READ, vp, buf, vattr.va_size, 0, UIO_SYSSPACE,
- 0, RLIM64_INFINITY, kcred, &resid) != 0)
- goto out;
-
- if (resid != 0)
+ /*
+ * Read the nvlist from the file.
+ */
+ if (kobj_read_file(file, buf, bst.st_size, 0) < 0)
goto out;
/*
* Unpack the nvlist.
*/
- if (nvlist_unpack(buf, vattr.va_size, &nvlist, KM_SLEEP) != 0)
+ if (nvlist_unpack(buf, bst.st_size, &nvlist, KM_SLEEP) != 0)
goto out;
/*
@@ -133,10 +133,9 @@ spa_config_load(void)
out:
if (buf != NULL)
- kmem_free(buf, vattr.va_size);
+ kmem_free(buf, bst.st_size);
- (void) VOP_CLOSE(vp, FREAD | FOFFMAX, 1, 0, kcred);
- VN_RELE(vp);
+ kobj_close_file(file);
}
/*
@@ -157,7 +156,7 @@ spa_config_sync(void)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
/*
* Add all known pools to the configuration list, ignoring those with
@@ -179,7 +178,8 @@ spa_config_sync(void)
buf = kmem_alloc(buflen, KM_SLEEP);
- VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 0) == 0);
+ VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
/*
* Write the configuration to disk. We need to do the traditional
@@ -226,7 +226,7 @@ spa_all_configs(uint64_t *generation)
if (*generation == spa_config_generation)
return (NULL);
- VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);
spa = NULL;
mutex_enter(&spa_namespace_lock);
@@ -279,7 +279,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
else if (txg != 0 && vd == rvd)
spa->spa_config_txg = txg;
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
UBERBLOCK_VERSION) == 0);
diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c
new file mode 100644
index 0000000000..b52c3236d2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c
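A fault-injection consumer would pin the pool for the duration of its work so the pool cannot be exported or destroyed underneath it; a minimal sketch (the handler name is hypothetical):

	/* Hypothetical injection handler: hold the pool while it works. */
	static int
	zinject_example(char *pool)
	{
		spa_t *spa;

		if ((spa = spa_inject_addref(pool)) == NULL)
			return (ENOENT);

		/* ... record or evaluate the injection ... */

		spa_inject_delref(spa);
		return (0);
	}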
@@ -0,0 +1,436 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the current log becomes the last log, the last log is thrown
+ * out, and the current log is reinitialized. This way, if an error is somehow
+ * corrected, a new scrub will show that it no longer exists, and will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
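For example (values invented for illustration), an error at objset 0x21, object 0x1a5, level 0, blkid 0x40 is keyed in the ZAP as the string shown below; the value starts out as the empty string and is later filled in with the human-readable 'dataset:object' name.

	/* Illustration with invented values: the ZAP key produced for a bookmark. */
	static void
	example_key(void)
	{
		zbookmark_t zb;
		char key[64];

		zb.zb_objset = 0x21;
		zb.zb_object = 0x1a5;
		zb.zb_level = 0;
		zb.zb_blkid = 0x40;

		bookmark_to_name(&zb, key, sizeof (key));
		/* key now holds "21:1a5:0:40"; the ZAP value begins empty. */
	}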
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+static uint64_t
+strtonum(char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ *nptr = str;
+
+ return (val);
+}
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark
+ */
+static void
+name_to_bookmark(char *buf, zbookmark_t *zb)
+{
+ zb->zb_objset = strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, zio_t *zio)
+{
+ zbookmark_t *zb = &zio->io_logical->io_bookmark;
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_tree_t *tree;
+ avl_index_t where;
+
+ /*
+ * If we are trying to import a pool, ignore any errors, as we won't be
+ * writing to the pool any time soon.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * If we have had a request to rotate the log, log it to the next list
+ * instead of the current one.
+ */
+ if (spa->spa_scrub_active || spa->spa_scrub_finished)
+ tree = &spa->spa_errlist_scrub;
+ else
+ tree = &spa->spa_errlist_last;
+
+ search.se_bookmark = *zb;
+ if (avl_find(tree, &search, &where) != NULL) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *zb;
+ avl_insert(tree, new, where);
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
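The expected caller is an I/O completion path: when a logical read fails with no retries left, the zio is handed to spa_log_error() and the pending entry is flushed to disk by spa_errlog_sync() at the end of the txg. A hedged sketch (the surrounding function is hypothetical):

	/* Hypothetical error path: record an uncorrectable logical error. */
	static void
	example_io_done(zio_t *zio)
	{
		if (zio->io_error != 0 && zio->io_logical != NULL)
			spa_log_error(zio->io_spa, zio);
	}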
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0, count;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_t zb;
+
+ if (obj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (ENOMEM);
+ }
+
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_t),
+ sizeof (zbookmark_t)) != 0)
+ return (EFAULT);
+
+ *count -= 1;
+ }
+
+ zap_cursor_fini(&zc);
+
+ return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+ spa_error_entry_t *se;
+
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+ if (*count == 0)
+ return (ENOMEM);
+
+ if (copyout(&se->se_bookmark, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_t),
+ sizeof (zbookmark_t)) != 0)
+ return (EFAULT);
+
+ *count -= 1;
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+ int ret = 0;
+
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+
+ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+ if (!ret && !spa->spa_scrub_finished)
+ ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+ count);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ count);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ count);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+ return (ret);
+}
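Note the calling convention: *count is decremented in place and bookmarks are packed toward the end of the caller's buffer, so on return the number of entries copied is the original count minus the remaining count. A minimal sketch of a hypothetical kernel-side caller:

	/* Hypothetical caller draining the error log into a user buffer. */
	static int
	copy_errlog(spa_t *spa, void *uaddr, size_t nbookmarks, size_t *ncopied)
	{
		size_t count = nbookmarks;
		int error;

		error = spa_get_errlog(spa, uaddr, &count);
		if (error == 0)
			*ncopied = nbookmarks - count;	/* packed at buffer end */
		return (error);
	}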
+
+/*
+ * Called when a scrub completes. This simply sets a bit that tells us which
+ * AVL tree to add new errors to. spa_errlog_sync() is responsible for actually
+ * syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+ mutex_enter(&spa->spa_errlist_lock);
+
+ ASSERT(!spa->spa_scrub_finished);
+ spa->spa_scrub_finished = B_TRUE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ */
+void
+spa_errlog_drain(spa_t *spa)
+{
+ spa_error_entry_t *se;
+ void *cookie;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Process a list of errors into the current on-disk log.
+ */
+static void
+sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
+{
+ spa_error_entry_t *se;
+ char buf[64];
+ void *cookie;
+
+ if (avl_numnodes(t) != 0) {
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE,
+ 0, tx);
+
+ /* add errors to the current log */
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset,
+ *obj, buf, 1, strlen(name) + 1, name, tx);
+ }
+
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
+
+/*
+ * Sync the error log out to disk. This is a little tricky because the act of
+ * writing the error log requires the spa_errlist_lock. So, we need to lock the
+ * error lists, take a copy of the lists, and then reinitialize them. Then, we
+ * drop the error list lock and take the error log lock, at which point we
+ * do the errlog processing. Then, if we encounter an I/O error during this
+ * process, we can successfully add the error to the list. Note that this will
+ * result in the perpetual recycling of errors, but it is an unlikely situation
+ * and not a performance critical operation.
+ */
+void
+spa_errlog_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ avl_tree_t scrub, last;
+ int scrub_finished;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Bail out early under normal circumstances.
+ */
+ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
+ avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ !spa->spa_scrub_finished) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ spa_get_errlists(spa, &last, &scrub);
+ scrub_finished = spa->spa_scrub_finished;
+ spa->spa_scrub_finished = B_FALSE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+ mutex_enter(&spa->spa_errlog_lock);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ /*
+ * Sync out the current list of errors.
+ */
+ sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
+
+ /*
+ * Rotate the log if necessary.
+ */
+ if (scrub_finished) {
+ if (spa->spa_errlog_last != 0)
+ VERIFY(dmu_object_free(spa->spa_meta_objset,
+ spa->spa_errlog_last, tx) == 0);
+ spa->spa_errlog_last = spa->spa_errlog_scrub;
+ spa->spa_errlog_scrub = 0;
+
+ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
+ }
+
+ /*
+ * Sync out any pending scrub errors.
+ */
+ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
+
+ /*
+ * Update the MOS to reflect the new values.
+ */
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+ &spa->spa_errlog_last, tx);
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+ &spa->spa_errlog_scrub, tx);
+
+ dmu_tx_commit(tx);
+
+ mutex_exit(&spa->spa_errlog_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 1ea7edfb77..8e0f6ce722 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,6 +59,7 @@
* - Increase spa_refcount from non-zero
* - Check if spa_refcount is zero
* - Rename a spa_t
+ * - add/remove/attach/detach devices
* - Held for the duration of create/destroy/import/export
*
* It does not need to handle recursion. A create or destroy may
@@ -91,14 +91,6 @@
* must have the namespace lock or non-zero refcount to have any kind
* of spa_t pointer at all.
*
- * spa_vdev_lock (global mutex)
- *
- * This special lock is a global mutex used to serialize attempts to
- * access devices through ZFS. It makes sure that we do not try to add
- * a single vdev to multiple pools at the same time. It must be held
- * when adding or removing a device from the pool.
- *
- *
* The locking order is fairly straightforward:
*
* spa_namespace_lock -> spa_refcount
@@ -111,10 +103,9 @@
* There must be at least one valid reference on the spa_t to acquire
* the config lock.
*
- * spa_vdev_lock -> spa_config_lock
+ * spa_namespace_lock -> spa_config_lock
*
- * There are no locks required for spa_vdev_lock, but it must be
- * acquired before spa_config_lock.
+ * The namespace lock must always be taken before the config lock.
*
*
* The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
@@ -136,6 +127,7 @@
* spa_evict_all() Shutdown and remove all spa_t structures in
* the system.
*
+ * spa_guid_exists() Determine whether a pool/device guid exists.
*
* The spa_refcount is manipulated using the following functions:
*
@@ -162,15 +154,14 @@
* spa_config_held() Returns true if the config lock is currently
* held in the given state.
*
- * The spa_vdev_lock, while acquired directly, is hidden by the following
- * functions, which imply additional semantics that must be followed:
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
*
- * spa_vdev_enter() Acquire the vdev lock and the config lock for
- * writing.
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * for writing.
*
* spa_vdev_exit() Release the config lock, wait for all I/O
- * to complete, release the vdev lock, and sync
- * the updated configs to the cache.
+ * to complete, sync the updated configs to the
+ * cache, and release the namespace lock.
*
* The spa_name() function also requires either the spa_namespace_lock
* or the spa_config_lock, as both are needed to do a rename. spa_rename() is
@@ -191,8 +182,6 @@ int zfs_flags = ~0;
int zfs_flags = 0;
#endif
-static kmutex_t spa_vdev_lock;
-
#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
/*
@@ -238,6 +227,7 @@ spa_add(const char *name)
spa->spa_freeze_txg = UINT64_MAX;
refcount_create(&spa->spa_refcount);
+ refcount_create(&spa->spa_config_lock.scl_count);
avl_add(&spa_namespace_avl, spa);
@@ -268,6 +258,7 @@ spa_remove(spa_t *spa)
spa_config_set(spa, NULL);
refcount_destroy(&spa->spa_refcount);
+ refcount_destroy(&spa->spa_config_lock.scl_count);
kmem_free(spa, sizeof (spa_t));
}
@@ -351,7 +342,7 @@ spa_refcount_zero(spa_t *spa)
* valid use during create.
*/
void
-spa_config_enter(spa_t *spa, krw_t rw)
+spa_config_enter(spa_t *spa, krw_t rw, void *tag)
{
spa_config_lock_t *scl = &spa->spa_config_lock;
@@ -362,13 +353,14 @@ spa_config_enter(spa_t *spa, krw_t rw)
while (scl->scl_writer != NULL)
cv_wait(&scl->scl_cv, &scl->scl_lock);
} else {
- while (scl->scl_writer != NULL || scl->scl_count > 0)
+ while (scl->scl_writer != NULL ||
+ !refcount_is_zero(&scl->scl_count))
cv_wait(&scl->scl_cv, &scl->scl_lock);
scl->scl_writer = curthread;
}
}
- scl->scl_count++;
+ (void) refcount_add(&scl->scl_count, tag);
mutex_exit(&scl->scl_lock);
}
@@ -377,14 +369,14 @@ spa_config_enter(spa_t *spa, krw_t rw)
* Release the spa config lock, notifying any waiters in the process.
*/
void
-spa_config_exit(spa_t *spa)
+spa_config_exit(spa_t *spa, void *tag)
{
spa_config_lock_t *scl = &spa->spa_config_lock;
mutex_enter(&scl->scl_lock);
- ASSERT(scl->scl_count > 0);
- if (--scl->scl_count == 0) {
+ ASSERT(!refcount_is_zero(&scl->scl_count));
+ if (refcount_remove(&scl->scl_count, tag) == 0) {
cv_broadcast(&scl->scl_cv);
scl->scl_writer = NULL; /* OK in either case */
}
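Since the hold count is now a tagged refcount, every enter must be paired with an exit that passes the same tag (FTAG, the spa pointer used by spa_vdev_enter(), and so on); a minimal reader sketch:

	/* Minimal sketch: a tagged reader hold on the config lock. */
	static void
	example_config_reader(spa_t *spa)
	{
		spa_config_enter(spa, RW_READER, FTAG);
		/* ... walk the vdev tree safely ... */
		spa_config_exit(spa, FTAG);
	}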
@@ -405,7 +397,7 @@ spa_config_held(spa_t *spa, krw_t rw)
if (rw == RW_WRITER)
held = (scl->scl_writer == curthread);
else
- held = (scl->scl_count != 0);
+ held = !refcount_is_zero(&scl->scl_count);
mutex_exit(&scl->scl_lock);
return (held);
@@ -418,16 +410,22 @@ spa_config_held(spa_t *spa, krw_t rw)
*/
/*
- * Lock the given spa_t for the purpose of adding or removing a vdev. This
- * grabs the global spa_vdev_lock as well as the spa config lock for writing.
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
* It returns the next transaction group for the spa_t.
*/
uint64_t
spa_vdev_enter(spa_t *spa)
{
- mutex_enter(&spa_vdev_lock);
+ /*
+ * Suspend scrub activity while we mess with the config.
+ */
+ spa_scrub_suspend(spa);
- spa_config_enter(spa, RW_WRITER);
+ if (spa->spa_root_vdev != NULL) /* not spa_create() */
+ mutex_enter(&spa_namespace_lock);
+
+ spa_config_enter(spa, RW_WRITER, spa);
return (spa_last_synced_txg(spa) + 1);
}
@@ -441,14 +439,26 @@ spa_vdev_enter(spa_t *spa)
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
- vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+ ASSERT(txg != 0);
+
+ /*
+ * Reassess the DTLs. spa_scrub() looks at the DTLs without
+ * taking the config lock at all, so keep it safe.
+ */
+ if (spa->spa_root_vdev)
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+ spa_config_exit(spa, spa);
- spa_config_exit(spa);
+ /*
+ * If there was a scrub or resilver in progress, indicate that
+ * it must restart, and then allow it to resume.
+ */
+ spa_scrub_restart(spa, txg);
+ spa_scrub_resume(spa);
- if (vd == spa->spa_root_vdev) { /* spa_create() */
- mutex_exit(&spa_vdev_lock);
+ if (vd == spa->spa_root_vdev) /* spa_create() */
return (error);
- }
/*
* Note: this txg_wait_synced() is important because it ensures
@@ -458,8 +468,6 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
if (error == 0)
txg_wait_synced(spa->spa_dsl_pool, txg);
- mutex_exit(&spa_vdev_lock);
-
if (vd != NULL) {
ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
vdev_free(vd);
@@ -469,11 +477,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
* If we're in the middle of export or destroy, don't sync the
* config -- it will do that anyway, and we deadlock if we try.
*/
- if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) {
- mutex_enter(&spa_namespace_lock);
+ if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE)
spa_config_sync();
- mutex_exit(&spa_namespace_lock);
- }
+
+ mutex_exit(&spa_namespace_lock);
return (error);
}
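
/*
 * Illustrative sketch (editor's example, not part of this patch): the usual
 * calling pattern for a vdev configuration change under the new locking
 * scheme.  The body of the change is hypothetical.
 */
static int
example_vdev_change(spa_t *spa)
{
	uint64_t txg;
	int error = 0;

	txg = spa_vdev_enter(spa);	/* suspends scrub, takes the locks */
	/* ... add, remove, or modify vdevs here, setting 'error' ... */
	return (spa_vdev_exit(spa, NULL, txg, error));
}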
@@ -497,7 +504,7 @@ spa_rename(const char *name, const char *newname)
* Lookup the spa_t and grab the config lock for writing. We need to
* actually open the pool so that we can sync out the necessary labels.
* It's OK to call spa_open() with the namespace lock held because we
- * alllow recursive calls for other reasons.
+ * allow recursive calls for other reasons.
*/
mutex_enter(&spa_namespace_lock);
if ((err = spa_open(name, &spa, FTAG)) != 0) {
@@ -505,7 +512,7 @@ spa_rename(const char *name, const char *newname)
return (err);
}
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
avl_remove(&spa_namespace_avl, spa);
spa_strfree(spa->spa_name);
@@ -519,7 +526,7 @@ spa_rename(const char *name, const char *newname)
*/
vdev_config_dirty(spa->spa_root_vdev);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
txg_wait_synced(spa->spa_dsl_pool, 0);
@@ -548,12 +555,8 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
{
spa_t *spa;
avl_tree_t *t = &spa_namespace_avl;
- boolean_t locked = B_FALSE;
- if (mutex_owner(&spa_namespace_lock) != curthread) {
- mutex_enter(&spa_namespace_lock);
- locked = B_TRUE;
- }
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
if (spa->spa_state == POOL_STATE_UNINITIALIZED)
@@ -565,9 +568,6 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
break;
}
- if (locked)
- mutex_exit(&spa_namespace_lock);
-
return (spa != NULL);
}
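
/*
 * Illustrative sketch (editor's example, not part of this patch): callers
 * must now hold spa_namespace_lock themselves before asking about GUIDs.
 * The boolean_t return type is assumed here.
 */
static boolean_t
example_guid_check(uint64_t pool_guid, uint64_t device_guid)
{
	boolean_t exists;

	mutex_enter(&spa_namespace_lock);
	exists = spa_guid_exists(pool_guid, device_guid);
	mutex_exit(&spa_namespace_lock);

	return (exists);
}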
@@ -646,12 +646,12 @@ spa_freeze(spa_t *spa)
{
uint64_t freeze_txg = 0;
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
if (spa->spa_freeze_txg == UINT64_MAX) {
freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
spa->spa_freeze_txg = freeze_txg;
}
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
if (freeze_txg != 0)
txg_wait_synced(spa_get_dsl(spa), freeze_txg);
}
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
index 25f66bf94b..a99ec3f360 100644
--- a/usr/src/uts/common/fs/zfs/space_map.c
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -293,7 +292,8 @@ space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype,
dprintf("object=%llu offset=%llx size=%llx\n",
smo->smo_object, offset, size);
- dmu_read(os, smo->smo_object, offset, size, entry_map);
+ VERIFY(0 == dmu_read(os, smo->smo_object, offset, size,
+ entry_map));
entry_map_end = entry_map + (size / sizeof (uint64_t));
for (entry = entry_map; entry < entry_map_end; entry++) {
@@ -394,7 +394,8 @@ space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os,
{
uint64_t oldsize = smo->smo_objsize;
- dmu_free_range(os, smo->smo_object, 0, smo->smo_objsize, tx);
+ VERIFY(0 == dmu_free_range(os, smo->smo_object, 0,
+ smo->smo_objsize, tx));
smo->smo_objsize = 0;
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index b11cd42b6d..1a93d4e4ca 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -41,6 +40,7 @@ typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
typedef void arc_byteswap_func_t(void *buf, size_t size);
+typedef int arc_evict_func_t(void *private);
/* generic arc_done_func_t's which you can use */
arc_done_func_t arc_bcopy_func;
@@ -50,6 +50,8 @@ struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
void *b_data;
+ arc_evict_func_t *b_efunc;
+ void *b_private;
};
/*
@@ -60,22 +62,30 @@ struct arc_buf {
#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
-void arc_buf_free(arc_buf_t *buf, void *tag);
+void arc_buf_add_ref(arc_buf_t *buf, void *tag);
+int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
+int arc_has_callback(arc_buf_t *buf);
+#ifdef ZFS_DEBUG
+int arc_referenced(arc_buf_t *buf);
+#endif
int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags);
+ uint32_t arc_flags, zbookmark_t *zb);
int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags);
+ uint32_t arc_flags, zbookmark_t *zb);
int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_done_func_t *done, void *private, uint32_t arc_flags);
int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
+int arc_buf_evict(arc_buf_t *buf);
+
void arc_flush(void);
void arc_tempreserve_clear(uint64_t tempreserve);
int arc_tempreserve_space(uint64_t tempreserve);
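
/*
 * Illustrative sketch (editor's example, not part of this patch): the new
 * eviction-callback interface lets a consumer register a function the ARC
 * can call when it wants a buffer back.  The callback body and the meaning
 * of its return value are assumptions of this sketch, not taken from the
 * change itself.
 */
static int
example_evict_cb(void *private)
{
	/* drop our reference to whatever state 'private' describes */
	return (0);
}

static void
example_register(arc_buf_t *buf, void *private)
{
	arc_set_callback(buf, example_evict_cb, private);
}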
diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h
index 0933cb977b..c716fe7aa6 100644
--- a/usr/src/uts/common/fs/zfs/sys/bplist.h
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,11 +66,11 @@ typedef struct bplist {
extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern void bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
+extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
extern void bplist_close(bplist_t *bpl);
extern boolean_t bplist_empty(bplist_t *bpl);
extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern void bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
+extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
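
/*
 * Illustrative sketch (editor's example, not part of this patch):
 * bplist_open() and bplist_enqueue() now report I/O errors instead of
 * asserting success, so callers check their return values.
 */
static int
example_bplist(bplist_t *bpl, objset_t *mos, uint64_t object,
    blkptr_t *bp, dmu_tx_t *tx)
{
	int err;

	if ((err = bplist_open(bpl, mos, object)) != 0)
		return (err);
	err = bplist_enqueue(bpl, bp, tx);
	bplist_close(bpl);
	return (err);
}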
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index d67901b31a..5724f7a324 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -45,13 +44,14 @@ extern "C" {
#define IN_DMU_SYNC ((blkptr_t *)-1)
/*
- * define flags for dbuf_read and friends
+ * define flags for dbuf_read
*/
#define DB_RF_MUST_SUCCEED 0
#define DB_RF_CANFAIL (1 << 1)
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
+#define DB_RF_NEVERWAIT (1 << 4)
/*
* The state transition diagram for dbufs looks like:
@@ -59,7 +59,7 @@ extern "C" {
* +----> READ ----+
* | |
* | V
- * (alloc)-->UNCACHED CACHED-->(free)
+ * (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
* | ^
* | |
* +----> FILL ----+
@@ -68,7 +68,8 @@ typedef enum dbuf_states {
DB_UNCACHED,
DB_FILL,
DB_READ,
- DB_CACHED
+ DB_CACHED,
+ DB_EVICTING
} dbuf_states_t;
struct objset_impl;
@@ -158,8 +159,8 @@ typedef struct dmu_buf_impl {
uint64_t db_dirtied;
/*
- * If dd_dnode != NULL, our link on the owner dnodes's dn_dbufs list.
- * Protected by its dn_mtx.
+ * If db_dnode != NULL, our link on the owner dnode's dn_dbufs list.
+ * Protected by its dn_dbufs_mtx.
*/
list_node_t db_link;
@@ -194,7 +195,7 @@ typedef struct dmu_buf_impl {
* modify (dirty or clean). db_mtx must be held
* before dn_dirty_mtx.
*/
- arc_buf_t *db_data_old[TXG_SIZE];
+ void *db_data_old[TXG_SIZE];
blkptr_t *db_overridden_by[TXG_SIZE];
} db_d;
} dmu_buf_impl_t;
@@ -212,35 +213,32 @@ typedef struct dbuf_hash_table {
uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
-dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid);
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
void *tag);
-dmu_buf_impl_t *dbuf_hold_bonus(struct dnode *dn, void *tag);
int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
void *tag, dmu_buf_impl_t **dbp);
void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
-void dbuf_remove_ref(dmu_buf_impl_t *db, void *tag);
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
-void dbuf_rele(dmu_buf_impl_t *db);
+void dbuf_rele(dmu_buf_impl_t *db, void *tag);
dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
-void dbuf_read(dmu_buf_impl_t *db);
-int dbuf_read_canfail(dmu_buf_impl_t *db);
-void dbuf_read_havestruct(dmu_buf_impl_t *db);
-void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
@@ -250,7 +248,6 @@ void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg);
void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
struct dmu_tx *);
-void dbuf_downgrade(dmu_buf_impl_t *db, int evicting);
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
void dbuf_init(void);
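
/*
 * Illustrative sketch (editor's example, not part of this patch): dbuf holds
 * are now tagged and reads can fail.  Passing a NULL zio for a synchronous
 * read, and returning EIO on a failed hold, are assumptions of this sketch.
 */
static int
example_dbuf_read(struct dnode *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db;
	int err;

	db = dbuf_hold(dn, blkid, FTAG);
	if (db == NULL)
		return (EIO);	/* defensive; error code chosen arbitrarily */
	err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	/* ... use the buffer's data on success ... */
	dbuf_rele(db, FTAG);
	return (err);
}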
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 62cc46c4de..f0ba816a7c 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -99,6 +98,8 @@ typedef enum dmu_object_type {
DMU_OT_PLAIN_OTHER, /* UINT8 */
DMU_OT_UINT64_OTHER, /* UINT64 */
DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -146,6 +147,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
objset_t **osp);
void dmu_objset_close(objset_t *os);
+void dmu_objset_evict_dbufs(objset_t *os);
int dmu_objset_create(const char *name, dmu_objset_type_t type,
objset_t *clone_parent,
void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
@@ -177,6 +179,8 @@ typedef void dmu_byteswap_func_t(void *buf, size_t size);
#define DMU_POOL_CONFIG "config"
#define DMU_POOL_ROOT_DATASET "root_dataset"
#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
/*
* Allocate an object from this objset. The range of object numbers
@@ -268,8 +272,7 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
* buffer as well. You must release your hold with dmu_buf_rele().
*/
-dmu_buf_t *dmu_bonus_hold(objset_t *os, uint64_t object);
-dmu_buf_t *dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag);
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
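
/*
 * Illustrative sketch (editor's example, not part of this patch): the bonus
 * buffer hold now returns an errno and fills in an out parameter instead of
 * returning a pointer.  FTAG comes from refcount.h.
 */
static int
example_bonus(objset_t *os, uint64_t object)
{
	dmu_buf_t *db;
	int err;

	if ((err = dmu_bonus_hold(os, object, FTAG, &db)) != 0)
		return (err);
	/* ... read or dirty the bonus buffer ... */
	dmu_buf_rele(db, FTAG);
	return (0);
}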
/*
@@ -286,11 +289,10 @@ int dmu_bonus_max(void);
*
* The object number must be a valid, allocated object number.
*/
-dmu_buf_t *dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset);
+int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **);
void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
-void dmu_buf_remove_ref(dmu_buf_t *db, void* tag);
-void dmu_buf_rele(dmu_buf_t *db);
-void dmu_buf_rele_tag(dmu_buf_t *db, void *tag);
+void dmu_buf_rele(dmu_buf_t *db, void *tag);
uint64_t dmu_buf_refcount(dmu_buf_t *db);
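
/*
 * Illustrative sketch (editor's example, not part of this patch): single-block
 * holds follow the same errno-plus-out-parameter convention as dmu_bonus_hold.
 */
static int
example_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
{
	dmu_buf_t *db;
	int err;

	if ((err = dmu_buf_hold(os, object, offset, FTAG, &db)) != 0)
		return (err);
	/* ... inspect db->db_data ... */
	dmu_buf_rele(db, FTAG);
	return (0);
}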
/*
@@ -303,9 +305,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
* with dmu_buf_rele_array. You can NOT release the hold on each buffer
* individually with dmu_buf_rele.
*/
-dmu_buf_t **dmu_buf_hold_array(objset_t *os, uint64_t object,
- uint64_t offset, uint64_t length, int *numbufs);
-void dmu_buf_rele_array(dmu_buf_t **, int numbufs);
+int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
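
/*
 * Illustrative sketch (editor's example, not part of this patch): array holds
 * now take a read/write flag and a tag, and the whole array is released as a
 * unit with the same tag.
 */
static int
example_array_hold(objset_t *os, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_buf_t **dbp;
	int numbufs, err;

	err = dmu_buf_hold_array(os, object, off, len, TRUE /* read */,
	    FTAG, &numbufs, &dbp);
	if (err != 0)
		return (err);
	/* ... walk dbp[0 .. numbufs-1] ... */
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (0);
}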
/*
* Returns NULL on success, or the existing user ptr if it's already
@@ -348,19 +350,6 @@ void dmu_buf_rele_data(dmu_buf_t *db);
void *dmu_buf_get_user(dmu_buf_t *db);
/*
- * Indicate that you are going to read the buffer's data (db_data).
- *
- * This routine will read the data from disk if necessary.
- *
- * These routines will return 0 on success, or an errno if there is a
- * nonrecoverable I/O error.
- */
-void dmu_buf_read(dmu_buf_t *db);
-int dmu_buf_read_canfail(dmu_buf_t *db);
-void dmu_buf_read_array(dmu_buf_t **dbp, int numbufs);
-int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs);
-
-/*
* Indicate that you are going to modify the buffer's data (db_data).
*
* The transaction (tx) must be assigned to a txg (ie. you've called
@@ -370,20 +359,6 @@ int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs);
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
/*
- * Indicate that you are going to modify the entire contents of the
- * buffer's data ("fill" it).
- *
- * This routine is the same as dmu_buf_will_dirty, except that it won't
- * read the contents off the disk, so the contents may be uninitialized
- * and you must overwrite it.
- *
- * The transaction (tx) must be assigned to a txg (ie. you've called
- * dmu_tx_assign()). The buffer's object must be held in the tx (ie.
- * you've called dmu_tx_hold_object(tx, db->db_object)).
- */
-/* void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); */
-
-/*
* You must create a transaction, then hold the objects which you will
* (or might) modify as part of this transaction. Then you must assign
* the transaction to a transaction group. Once the transaction has
@@ -408,7 +383,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os);
void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len);
-void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
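
/*
 * Illustrative sketch (editor's example, not part of this patch): the standard
 * create/hold/assign/commit sequence described above.  TXG_WAIT is assumed to
 * be the usual "block until a txg is available" policy from txg.h.
 */
static int
example_tx_write(objset_t *os, uint64_t object, uint64_t off, int len,
    const void *buf)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_write(tx, object, off, len);
	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, object, off, len, buf, tx);
	dmu_tx_commit(tx);
	return (0);
}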
@@ -418,7 +393,7 @@ void dmu_tx_commit(dmu_tx_t *tx);
* Free up the data blocks for a defined range of a file. If size is
* zero, the range from offset to end-of-file is freed.
*/
-void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx);
/*
@@ -427,10 +402,8 @@ void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
* Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error.
*/
-void dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf);
-int dmu_read_canfail(objset_t *dd, uint64_t object, uint64_t offset,
- uint64_t size, void *buf);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
@@ -491,8 +464,7 @@ uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object);
typedef struct dmu_objset_stats {
dmu_objset_type_t dds_type;
uint8_t dds_is_snapshot;
- uint8_t dds_is_placeholder;
- uint8_t dds_pad[2];
+ uint8_t dds_pad[3];
uint64_t dds_creation_time;
uint64_t dds_creation_txg;
@@ -532,7 +504,6 @@ typedef struct dmu_objset_stats {
* change, so there is a small probability that it will collide.
*/
uint64_t dds_fsid_guid;
- uint64_t dds_guid;
uint64_t dds_objects_used; /* number of objects used */
uint64_t dds_objects_avail; /* number of objects available */
@@ -553,15 +524,9 @@ typedef struct dmu_objset_stats {
uint64_t dds_available;
/*
- * Miscellaneous
+ * Used for debugging purposes
*/
- char dds_altroot[MAXPATHLEN];
-
- /* The following are for debugging purposes only */
uint64_t dds_last_txg;
- uint64_t dds_dir_obj;
- uint64_t dds_objset_obj;
- uint64_t dds_clone_of_obj;
} dmu_objset_stats_t;
/*
@@ -617,7 +582,7 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
dmu_traverse_cb_t cb, void *arg);
int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp);
-int dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
struct vnode *vp, uint64_t voffset);
/* CRC64 table */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index d0a77fcfb9..ee14bfab85 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -86,12 +85,7 @@ typedef struct objset_impl {
list_t os_downgraded_dbufs;
} objset_impl_t;
-#define DMU_PRIVATE_OBJECT (1ULL << 63)
-
-#define DMU_META_DNODE_OBJECT (1ULL << 63)
-
-/* XXX rename this to DMU_IS_DNODE_OBJECT? */
-#define IS_DNODE_DNODE(object) ((object) == DMU_META_DNODE_OBJECT)
+#define DMU_META_DNODE_OBJECT 0
/* called from zpl */
int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
@@ -106,13 +100,14 @@ void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds);
void dmu_objset_find(char *name, void func(char *, void *), void *arg,
int flags);
void dmu_objset_byteswap(void *buf, size_t size);
+void dmu_objset_evict_dbufs(objset_t *os);
/* called from dsl */
void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx);
objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
dmu_objset_type_t type, dmu_tx_t *tx);
-objset_impl_t *dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds,
- blkptr_t *bp);
+int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
+ objset_impl_t **osip);
void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
index 7087912e00..a80345afd0 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -45,7 +44,8 @@ extern "C" {
#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
#define ADVANCE_DATA 0x04 /* read user data blocks */
#define ADVANCE_HOLES 0x08 /* visit holes */
-#define ADVANCE_NOLOCK 0x10 /* Don't grab SPA sync lock */
+#define ADVANCE_ZIL 0x10 /* visit intent log blocks */
+#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */
#define ZB_NO_LEVEL -2
#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
@@ -58,13 +58,6 @@ extern "C" {
#define ZB_DN_CACHE 2
#define ZB_DEPTH 3
-typedef struct zbookmark {
- uint64_t zb_objset;
- uint64_t zb_object;
- int zb_level;
- uint64_t zb_blkid;
-} zbookmark_t;
-
typedef struct zseg {
uint64_t seg_mintxg;
uint64_t seg_maxtxg;
@@ -93,6 +86,7 @@ struct traverse_handle {
int th_zio_flags;
list_t th_seglist;
traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
+ traverse_blk_cache_t th_zil_cache;
uint64_t th_hits;
uint64_t th_arc_hits;
uint64_t th_reads;
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
index d04c7c8d6b..9b55c56bc9 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -54,6 +53,7 @@ struct dmu_tx {
struct dsl_dir *tx_dir;
struct dsl_pool *tx_pool;
uint64_t tx_txg;
+ uint64_t tx_lastsnap_txg;
txg_handle_t tx_txgh;
uint64_t tx_space_towrite;
refcount_t tx_space_written;
@@ -62,7 +62,7 @@ struct dmu_tx {
uint64_t tx_space_tooverwrite;
void *tx_tempreserve_cookie;
uint8_t tx_anyobj;
- uint8_t tx_privateobj;
+ int tx_err;
#ifdef ZFS_DEBUG
char *tx_debug_buf;
int tx_debug_len;
@@ -79,15 +79,10 @@ enum dmu_tx_hold_type {
THT_NUMTYPES
};
-typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
- uint64_t arg1, uint64_t arg2);
-
-
typedef struct dmu_tx_hold {
list_node_t dth_node;
struct dnode *dth_dnode;
enum dmu_tx_hold_type dth_type;
- dmu_tx_hold_func_t dth_func;
uint64_t dth_arg1;
uint64_t dth_arg2;
/* XXX track what the actual estimates were for this hold */
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
index 1b43805e93..31b148f295 100644
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -63,23 +62,16 @@ extern "C" {
#define DNODE_SIZE (1 << DNODE_SHIFT)
#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
-#define DN_META_DNODE_LEVELS \
- (1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT - \
- DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT)
-
/* The +2 here is a cheesy way to round up */
#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
-#define DN_MAX_OBJECT \
- ((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT + \
- (DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT))
-
#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
@@ -213,15 +205,7 @@ typedef struct dnode {
kmutex_t dn_dbufs_mtx;
list_t dn_dbufs; /* linked list of descendent dbuf_t's */
- kcondvar_t dn_evicted; /* a child dbuf has been evicted */
-
- /*
- * Performance hack: whenever we have a hold on the bonus buffer of a
- * ZAP object, we will also have a hold on db0. This will keep the
- * meta-data for a micro-zap object cached as long as the znode for the
- * object is in the znode cache.
- */
- struct dmu_buf_impl *dn_db0;
+ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
/* holds prefetch structure */
struct zfetch dn_zfetch;
@@ -237,9 +221,10 @@ dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
uint64_t object);
void dnode_special_close(dnode_t *dn);
-dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref);
-dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
- void *ref);
+int dnode_hold(struct objset_impl *dd, uint64_t object,
+ void *ref, dnode_t **dnp);
+int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+ void *ref, dnode_t **dnp);
void dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
@@ -266,6 +251,7 @@ void dnode_init(void);
void dnode_fini(void);
int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
uint64_t blkfill);
+void dnode_evict_dbufs(dnode_t *dn);
#ifdef ZFS_DEBUG
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index e56c8a67d9..3411eba68b 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -108,8 +107,8 @@ int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
void *tag, dsl_dataset_t **dsp);
int dsl_dataset_open(const char *name, int mode, void *tag,
dsl_dataset_t **dsp);
-dsl_dataset_t *dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
- const char *tail, int mode, void *tag);
+int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
+ const char *tail, int mode, void *tag, dsl_dataset_t **);
void dsl_dataset_name(dsl_dataset_t *ds, char *name);
void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
int dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
@@ -134,8 +133,8 @@ void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth,
- dmu_tx_t *tx);
+int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
void dsl_dataset_stats(dsl_dataset_t *os, dmu_objset_stats_t *dds);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
index 0499d731e6..5c23fdc497 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -98,11 +97,11 @@ struct dsl_dir {
};
void dsl_dir_close(dsl_dir_t *dd, void *tag);
-dsl_dir_t *dsl_dir_open(const char *name, void *tag, const char **tail);
-dsl_dir_t *dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
+int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
const char **tailp);
-dsl_dir_t *dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag);
+int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **);
void dsl_dir_name(dsl_dir_t *dd, char *buf);
int dsl_dir_is_private(dsl_dir_t *dd);
int dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index 4fca4548ad..2eab6ae945 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,7 +66,7 @@ typedef struct dsl_pool {
krwlock_t dp_config_rwlock;
} dsl_pool_t;
-dsl_pool_t *dsl_pool_open(spa_t *spa, uint64_t txg);
+int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
void dsl_pool_close(dsl_pool_t *dp);
dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
index f9fffd2443..0b7e12f2cb 100644
--- a/usr/src/uts/common/fs/zfs/sys/refcount.h
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,7 +41,7 @@ extern "C" {
* particular object, use FTAG (which is a string) for the holder_tag.
* Otherwise, use the object that holds the reference.
*/
-#define FTAG ((void*)__func__)
+#define FTAG ((char *)__func__)
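
/*
 * Illustrative sketch (editor's example, not part of this patch): FTAG tags a
 * reference held only for the duration of the current function.
 */
static void
example_refcount(refcount_t *rc)
{
	(void) refcount_add(rc, FTAG);
	/* ... rc cannot drop to zero while we work ... */
	(void) refcount_remove(rc, FTAG);
}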
#if defined(DEBUG) || !defined(_KERNEL)
typedef struct reference {
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index fbe2822a13..2c8a43bb37 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -292,21 +291,30 @@ typedef struct blkptr {
/* state manipulation functions */
extern int spa_open(const char *pool, spa_t **, void *tag);
-extern int spa_get_stats(const char *pool, nvlist_t **config);
+extern int spa_get_stats(const char *pool, nvlist_t **config,
+ char *altroot, size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, char *altroot);
extern int spa_import(const char *pool, nvlist_t *config, char *altroot);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_export(char *pool);
+extern int spa_reset(char *pool);
+extern void spa_async_request(spa_t *spa, int flag);
+extern void spa_async_suspend(spa_t *spa);
+extern void spa_async_resume(spa_t *spa);
+extern spa_t *spa_inject_addref(char *pool);
+extern void spa_inject_delref(spa_t *spa);
+
+#define SPA_ASYNC_REOPEN 0x01
+#define SPA_ASYNC_REPLACE_DONE 0x02
+#define SPA_ASYNC_SCRUB 0x04
+#define SPA_ASYNC_RESILVER 0x08
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
-extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot);
-extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot,
+extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
int replacing);
-extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid,
- int replace_done);
-extern void spa_vdev_replace_done(spa_t *spa);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
/* scrubbing */
@@ -314,6 +322,7 @@ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
extern void spa_scrub_suspend(spa_t *spa);
extern void spa_scrub_resume(spa_t *spa);
extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
+extern void spa_scrub_throttle(spa_t *spa, int direction);
/* spa syncing */
extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
@@ -345,8 +354,8 @@ extern void spa_close(spa_t *spa, void *tag);
extern boolean_t spa_refcount_zero(spa_t *spa);
/* Pool configuration lock */
-extern void spa_config_enter(spa_t *spa, krw_t rw);
-extern void spa_config_exit(spa_t *spa);
+extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag);
+extern void spa_config_exit(spa_t *spa, void *tag);
extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
/* Pool vdev add/remove lock */
@@ -383,6 +392,23 @@ extern uint64_t spa_get_random(uint64_t range);
extern void sprintf_blkptr(char *buf, int len, blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern void spa_evict_all(void);
+extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid);
+
+/* error handling */
+struct zbookmark;
+struct zio;
+extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t stateoroffset, uint64_t length);
+extern void zfs_post_ok(spa_t *spa, vdev_t *vd);
+extern uint64_t spa_get_errlog_size(spa_t *spa);
+extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
+extern void spa_errlog_rotate(spa_t *spa);
+extern void spa_errlog_drain(spa_t *spa);
+extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
+extern int spa_bookmark_name(spa_t *spa, struct zbookmark *zb, char *ds,
+ size_t dsname, char *obj, size_t objname, char *range, size_t rangelen);
+extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
/* Initialization and termination */
extern void spa_init(int flags);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 0fcef6c48b..e9192956c3 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -46,27 +45,33 @@ extern "C" {
typedef struct spa_config_lock {
kmutex_t scl_lock;
- uint64_t scl_count;
+ refcount_t scl_count;
kthread_t *scl_writer;
kcondvar_t scl_cv;
} spa_config_lock_t;
+typedef struct spa_error_entry {
+ zbookmark_t se_bookmark;
+ char *se_name;
+ avl_node_t se_avl;
+} spa_error_entry_t;
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
*/
char *spa_name;
avl_node_t spa_avl;
- int spa_anon;
nvlist_t *spa_config;
uint64_t spa_config_txg; /* txg of last config change */
spa_config_lock_t spa_config_lock; /* configuration changes */
kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */
int spa_sync_pass; /* iterate-to-convergence */
int spa_state; /* pool state */
- uint8_t spa_minref; /* min refcnt of open pool */
+ int spa_inject_ref; /* injection references */
uint8_t spa_traverse_wanted; /* traverse lock wanted */
- taskq_t *spa_vdev_retry_taskq;
+ uint8_t spa_sync_on; /* sync threads are running */
+ spa_load_state_t spa_load_state; /* current load operation */
taskq_t *spa_zio_issue_taskq[ZIO_TYPES];
taskq_t *spa_zio_intr_taskq[ZIO_TYPES];
dsl_pool_t *spa_dsl_pool;
@@ -88,18 +93,33 @@ struct spa {
kthread_t *spa_scrub_thread; /* scrub/resilver thread */
traverse_handle_t *spa_scrub_th; /* scrub traverse handle */
uint64_t spa_scrub_restart_txg; /* need to restart */
+ uint64_t spa_scrub_mintxg; /* min txg we'll scrub */
uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ int64_t spa_scrub_throttled; /* over-throttle scrub I/Os */
uint64_t spa_scrub_errors; /* scrub I/O error count */
+ int spa_scrub_suspended; /* tell scrubber to suspend */
kcondvar_t spa_scrub_cv; /* scrub thread state change */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_stop; /* tell scrubber to stop */
- uint8_t spa_scrub_suspend; /* tell scrubber to suspend */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
- int spa_sync_on; /* sync threads are running */
+ kmutex_t spa_async_lock; /* protect async state */
+ kthread_t *spa_async_thread; /* thread doing async task */
+ int spa_async_suspended; /* async tasks suspended */
+ kcondvar_t spa_async_cv; /* wait for thread_exit() */
+ uint16_t spa_async_tasks; /* async task mask */
char *spa_root; /* alternate root directory */
kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */
+ uint64_t spa_ena; /* spa-wide ereport ENA */
+ boolean_t spa_last_open_failed; /* true if last open failed */
+ kmutex_t spa_errlog_lock; /* error log lock */
+ uint64_t spa_errlog_last; /* last error log object */
+ uint64_t spa_errlog_scrub; /* scrub error log object */
+ kmutex_t spa_errlist_lock; /* error list/ereport lock */
+ avl_tree_t spa_errlist_last; /* last error list */
+ avl_tree_t spa_errlist_scrub; /* scrub error list */
+ int spa_scrub_finished; /* indicator to rotate logs */
/*
* spa_refcnt must be the last element because it changes size based on
* compilation options. In order for the MDB module to function
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index 86d2f1b1ab..f3d7379049 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -60,11 +60,10 @@ typedef struct vdev_knob {
extern int vdev_open(vdev_t *);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg);
-extern void vdev_init(vdev_t *, uint64_t txg);
-extern void vdev_reopen(vdev_t *, zio_t **zq);
+extern int vdev_init(vdev_t *, uint64_t txg);
+extern void vdev_reopen(vdev_t *);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
-extern vdev_t *vdev_lookup_by_path(vdev_t *vd, const char *path);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
@@ -73,16 +72,16 @@ extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
extern const char *vdev_description(vdev_t *vd);
-extern void vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_stat_update(zio_t *zio);
extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
boolean_t complete);
-extern void vdev_checksum_error(zio_t *zio, vdev_t *vd);
extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
-extern void vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux);
+extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
+ vdev_aux_t aux);
extern void vdev_space_update(vdev_t *vd, uint64_t space_delta,
uint64_t alloc_delta);
@@ -92,11 +91,10 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
extern void vdev_io_start(zio_t *zio);
extern void vdev_io_done(zio_t *zio);
-extern int vdev_online(spa_t *spa, const char *path);
-extern int vdev_offline(spa_t *spa, const char *path, int istmp);
+extern int vdev_online(spa_t *spa, uint64_t guid);
+extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp);
+extern void vdev_clear(spa_t *spa, vdev_t *vd);
-extern int vdev_error_setup(spa_t *spa, const char *path, int mode, int mask,
- uint64_t arg);
extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
extern int vdev_is_dead(vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 53a202a906..2dfc45edff 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -103,9 +103,11 @@ struct vdev_cache {
struct vdev_queue {
uint64_t vq_min_pending;
uint64_t vq_max_pending;
+ uint64_t vq_scrub_limit;
uint64_t vq_agg_limit;
uint64_t vq_time_shift;
uint64_t vq_ramp_rate;
+ uint64_t vq_scrub_count;
avl_tree_t vq_deadline_tree;
avl_tree_t vq_read_tree;
avl_tree_t vq_write_tree;
@@ -150,10 +152,9 @@ struct vdev {
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
uint8_t vdev_dirty[TXG_SIZE]; /* per-txg dirty flags */
- int vdev_is_dirty; /* on config dirty list? */
+ uint8_t vdev_is_dirty; /* on config dirty list? */
+ uint8_t vdev_reopen_wanted; /* async reopen wanted? */
list_node_t vdev_dirty_node; /* config dirty list */
- zio_t *vdev_io_retry; /* I/O retry list */
- list_t vdev_io_pending; /* I/O pending list */
/*
* Leaf vdev state.
@@ -173,6 +174,8 @@ struct vdev {
uint8_t vdev_detached; /* device detached? */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
+ uint64_t vdev_not_present; /* not present during import */
+ hrtime_t vdev_last_try; /* last reopen time */
/*
* For DTrace to work in userland (libzpool) context, these fields must
@@ -183,8 +186,6 @@ struct vdev {
*/
kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
kmutex_t vdev_dirty_lock; /* vdev_dirty[] */
- kmutex_t vdev_io_lock; /* vdev_io_pending list */
- kcondvar_t vdev_io_cv; /* vdev_io_pending list empty? */
kmutex_t vdev_stat_lock; /* vdev_stat */
};
@@ -260,7 +261,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
-extern int vdev_load(vdev_t *vd, int import);
+extern int vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
index 9fb6a6c5a4..e77a2efa61 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -199,7 +199,7 @@ void zap_put_leaf(struct zap_leaf *l);
int fzap_add_cd(zap_t *zap, const char *name,
uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx, struct zap_leaf **lp);
+ const void *val, uint32_t cd, dmu_tx_t *tx);
void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
index 2ea27493f9..34057e83c9 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -103,7 +102,6 @@ int zfs_zaccess_rename(struct znode *, struct znode *,
struct znode *, struct znode *, cred_t *cr);
int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
void zfs_acl_free(zfs_acl_t *);
-zfs_acl_t *zfs_acl_node_read(struct znode *);
#endif
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index c914b23570..14ad31e629 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -31,6 +30,7 @@
#include <sys/cred.h>
#include <sys/dmu.h>
+#include <sys/zio.h>
#ifdef __cplusplus
extern "C" {
@@ -66,7 +66,7 @@ typedef struct dmu_replay_record {
char drr_toname[MAXNAMELEN];
} drr_begin;
struct drr_end {
- uint64_t drr_checksum;
+ zio_cksum_t drr_checksum;
} drr_end;
struct drr_object {
uint64_t drr_object;
@@ -97,15 +97,31 @@ typedef struct dmu_replay_record {
} drr_u;
} dmu_replay_record_t;
+typedef struct zinject_record {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+} zinject_record_t;
+
+#define ZINJECT_NULL 0x1
+#define ZINJECT_FLUSH_ARC 0x2
+#define ZINJECT_UNLOAD_SPA 0x4
+
typedef struct zfs_cmd {
char zc_name[MAXNAMELEN];
char zc_prop_name[MAXNAMELEN];
char zc_prop_value[MAXPATHLEN];
char zc_root[MAXPATHLEN];
- char zc_filename[MAXPATHLEN];
+ char zc_filename[MAXNAMELEN];
uint32_t zc_intsz;
uint32_t zc_numints;
- uint64_t zc_pool_guid;
+ uint64_t zc_guid;
uint64_t zc_config_src; /* really (char *) */
uint64_t zc_config_src_size;
uint64_t zc_config_dst; /* really (char *) */
@@ -116,9 +132,10 @@ typedef struct zfs_cmd {
uint64_t zc_volsize;
uint64_t zc_volblocksize;
uint64_t zc_objset_type;
- dmu_object_info_t zc_object_info;
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
+ zinject_record_t zc_inject_record;
+ zbookmark_t zc_bookmark;
} zfs_cmd_t;
#define ZVOL_MAX_MINOR (1 << 16)
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
index f9331be00a..02f4b3b247 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -133,8 +132,6 @@ typedef struct zfs_dirlock {
struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
} zfs_dirlock_t;
-struct zcache_state;
-
typedef struct znode {
struct zfsvfs *z_zfsvfs;
vnode_t *z_vnode;
@@ -150,16 +147,12 @@ typedef struct znode {
uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_dbuf_held; /* Is z_dbuf already held? */
uint8_t z_zn_prefetch; /* Prefetch znodes? */
- uint_t z_mapcnt; /* number of memory maps to file */
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
+ uint64_t z_mapcnt; /* number of pages mapped to file */
uint64_t z_last_itx; /* last ZIL itx on this znode */
kmutex_t z_acl_lock; /* acl data lock */
list_node_t z_link_node; /* all znodes in fs link */
- list_node_t z_zcache_node;
- struct zcache_state *z_zcache_state;
- uint64_t z_zcache_access;
-
/*
* These are dmu managed fields.
*/
@@ -241,14 +234,12 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, dmu_tx_t *,
cred_t *cr);
extern void zfs_znode_init(void);
extern void zfs_znode_fini(void);
-extern znode_t *zfs_znode_alloc(zfsvfs_t *, dmu_buf_t *, uint64_t, int);
extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
extern void zfs_zinactive(znode_t *);
extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
extern void zfs_znode_free(znode_t *);
extern int zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads);
extern void zfs_delete_wait_empty(zfsvfs_t *zfsvfs);
-extern void zfs_zcache_flush(zfsvfs_t *zfsvf);
extern void zfs_remove_op_tables();
extern int zfs_create_op_tables();
extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr);
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 5d3227e546..d80310f2fa 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -109,23 +108,25 @@ enum zio_compress {
#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
#define ZIO_PRIORITY_TABLE_SIZE 10
-#define ZIO_FLAG_MUSTSUCCEED 0x0000
-#define ZIO_FLAG_CANFAIL 0x0001
-#define ZIO_FLAG_FAILFAST 0x0002
-#define ZIO_FLAG_CONFIG_HELD 0x0004
+#define ZIO_FLAG_MUSTSUCCEED 0x00000
+#define ZIO_FLAG_CANFAIL 0x00001
+#define ZIO_FLAG_FAILFAST 0x00002
+#define ZIO_FLAG_CONFIG_HELD 0x00004
-#define ZIO_FLAG_DONT_CACHE 0x0010
-#define ZIO_FLAG_DONT_QUEUE 0x0020
-#define ZIO_FLAG_DONT_PROPAGATE 0x0040
-#define ZIO_FLAG_DONT_RETRY 0x0080
+#define ZIO_FLAG_DONT_CACHE 0x00010
+#define ZIO_FLAG_DONT_QUEUE 0x00020
+#define ZIO_FLAG_DONT_PROPAGATE 0x00040
+#define ZIO_FLAG_DONT_RETRY 0x00080
-#define ZIO_FLAG_PHYSICAL 0x0100
-#define ZIO_FLAG_IO_BYPASS 0x0200
-#define ZIO_FLAG_IO_REPAIR 0x0400
-#define ZIO_FLAG_SPECULATIVE 0x0800
+#define ZIO_FLAG_PHYSICAL 0x00100
+#define ZIO_FLAG_IO_BYPASS 0x00200
+#define ZIO_FLAG_IO_REPAIR 0x00400
+#define ZIO_FLAG_SPECULATIVE 0x00800
-#define ZIO_FLAG_RESILVER 0x1000
-#define ZIO_FLAG_SCRUB 0x2000
+#define ZIO_FLAG_RESILVER 0x01000
+#define ZIO_FLAG_SCRUB 0x02000
+
+#define ZIO_FLAG_NOBOOKMARK 0x10000
#define ZIO_FLAG_GANG_INHERIT \
(ZIO_FLAG_CANFAIL | \
@@ -155,11 +156,39 @@ typedef struct zio_transform zio_transform_t;
extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
extern char *zio_type_name[ZIO_TYPES];
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool. By convention, the meta-objset (MOS)
+ * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
+ * level -1 of the meta-dnode, and intent log blocks (which are chained
+ * off the root block) have blkid == sequence number. In summary:
+ *
+ * mos is objset 0
+ * meta-dnode is object 0
+ * root block is <objset, 0, -1, 0>
+ * intent log is <objset, 0, -1, ZIL sequence number>
+ *
+ * Note: this structure is called a bookmark because its first purpose was
+ * to remember where to resume a pool-wide traverse. The absolute ordering
+ * for block visitation during traversal is defined in compare_bookmark().
+ *
+ * Note: this structure is passed between userland and the kernel.
+ * Therefore it must not change size or alignment between 32/64 bit
+ * compilation options.
+ */
+typedef struct zbookmark {
+ uint64_t zb_objset;
+ uint64_t zb_object;
+ int64_t zb_level;
+ uint64_t zb_blkid;
+} zbookmark_t;
+
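As a quick editorial illustration of the conventions documented above (not part of the patch), the sketch below fills in zbookmark_t values for an ordinary data block, the root block, and an intent-log block of a hypothetical objset 5; the helper name and all numeric values are placeholders.

/*
 * Editorial sketch: zbookmark_t values following the stated conventions.
 * Objset 5, object 42, and the blkid values are arbitrary placeholders.
 */
static void
example_bookmarks(void)
{
	zbookmark_t data, root, zil;

	data.zb_objset = 5;	/* some objset */
	data.zb_object = 42;	/* a regular object */
	data.zb_level = 0;	/* leaf (data) block */
	data.zb_blkid = 7;	/* block 7 of that object */

	root.zb_objset = 5;
	root.zb_object = 0;	/* meta-dnode */
	root.zb_level = -1;	/* root block lives at level -1 */
	root.zb_blkid = 0;

	zil.zb_objset = 5;
	zil.zb_object = 0;
	zil.zb_level = -1;
	zil.zb_blkid = 123;	/* ZIL sequence number */
}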
struct zio {
/* Core information about this I/O */
zio_t *io_parent;
zio_t *io_root;
spa_t *io_spa;
+ zbookmark_t io_bookmark;
int io_checksum;
int io_compress;
int io_dva_index;
@@ -170,6 +199,7 @@ struct zio {
zio_t *io_sibling_prev;
zio_t *io_sibling_next;
zio_transform_t *io_transform_stack;
+ zio_t *io_logical;
/* Callback info */
zio_done_func_t *io_done;
@@ -191,8 +221,6 @@ struct zio {
avl_tree_t *io_vdev_tree;
zio_t *io_delegate_list;
zio_t *io_delegate_next;
- zio_t *io_retry_next;
- list_node_t io_pending;
/* Internal pipeline state */
int io_flags;
@@ -212,6 +240,9 @@ struct zio {
void *io_waiter;
kmutex_t io_lock;
kcondvar_t io_cv;
+
+ /* FMA state */
+ uint64_t io_ena;
};
extern zio_t *zio_null(zio_t *pio, spa_t *spa,
@@ -222,15 +253,17 @@ extern zio_t *zio_root(spa_t *spa,
extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags);
+ int priority, int flags, zbookmark_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb);
extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_done_func_t *done, void *private);
@@ -285,12 +318,27 @@ extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+boolean_t zio_should_retry(zio_t *zio);
+
/*
* Initial setup and teardown.
*/
extern void zio_init(void);
extern void zio_fini(void);
+/*
+ * Fault injection
+ */
+struct zinject_record;
+extern uint32_t zio_injection_enabled;
+extern int zio_inject_fault(char *name, int flags, int *id,
+ struct zinject_record *record);
+extern int zio_inject_list_next(int *id, char *name, size_t buflen,
+ struct zinject_record *record);
+extern int zio_clear_fault(int id);
+extern int zio_handle_fault_injection(zio_t *zio, int error);
+extern int zio_handle_device_injection(vdev_t *vd, int error);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
index ba3dc48d28..bb7bd41e0b 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,9 +56,11 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
*/
extern zio_checksum_t fletcher_2_native;
extern zio_checksum_t fletcher_4_native;
+extern zio_checksum_t fletcher_4_incremental_native;
extern zio_checksum_t fletcher_2_byteswap;
extern zio_checksum_t fletcher_4_byteswap;
+extern zio_checksum_t fletcher_4_incremental_byteswap;
extern zio_checksum_t zio_checksum_SHA256;
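The two incremental Fletcher-4 entry points added above let a consumer (for example, the backup/restore stream checksum work in this changeset) fold successive buffers into one running zio_cksum_t instead of checksumming a single contiguous buffer. A hedged usage sketch follows; the zio_checksum_t argument order (data, size, checksum pointer) and the ZIO_SET_CHECKSUM initializer are assumptions taken from the wider headers, not from this hunk.

/*
 * Editorial sketch: accumulate a running Fletcher-4 checksum over a stream
 * of buffers.  Assumes fletcher_4_incremental_native() takes
 * (const void *, uint64_t, zio_cksum_t *).
 */
static void
example_stream_checksum(const void *bufs[], const uint64_t sizes[],
    int nbufs, zio_cksum_t *zc)
{
	int i;

	ZIO_SET_CHECKSUM(zc, 0, 0, 0, 0);	/* start from zero */

	for (i = 0; i < nbufs; i++)
		fletcher_4_incremental_native(bufs[i], sizes[i], zc);
}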
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
index 0b2b07de29..e1abf0e49d 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -201,6 +200,9 @@ struct zio_transform {
zio_transform_t *zt_next;
};
+extern void zio_inject_init(void);
+extern void zio_inject_fini(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c
index 63bff0ae4b..b6d3fe9595 100644
--- a/usr/src/uts/common/fs/zfs/uberblock.c
+++ b/usr/src/uts/common/fs/zfs/uberblock.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -30,9 +29,6 @@
#include <sys/uberblock_impl.h>
#include <sys/vdev_impl.h>
-/* Keep the uberblock version in a variable so we can get at it with mdb */
-static uint64_t uberblock_version = UBERBLOCK_VERSION;
-
int
uberblock_verify(uberblock_t *ub)
{
@@ -42,9 +38,6 @@ uberblock_verify(uberblock_t *ub)
if (ub->ub_magic != UBERBLOCK_MAGIC)
return (EINVAL);
- if (ub->ub_version != UBERBLOCK_VERSION)
- return (ENOTSUP);
-
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 838e1bfc88..363be462ab 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -26,6 +26,7 @@
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
@@ -137,34 +138,6 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev)
}
vdev_t *
-vdev_lookup_by_path(vdev_t *vd, const char *path)
-{
- int c;
- vdev_t *mvd;
-
- if (vd->vdev_path != NULL) {
- if (vd->vdev_wholedisk == 1) {
- /*
- * For whole disks, the internal path has 's0', but the
- * path passed in by the user doesn't.
- */
- if (strlen(path) == strlen(vd->vdev_path) - 2 &&
- strncmp(path, vd->vdev_path, strlen(path)) == 0)
- return (vd);
- } else if (strcmp(path, vd->vdev_path) == 0) {
- return (vd);
- }
- }
-
- for (c = 0; c < vd->vdev_children; c++)
- if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
- NULL)
- return (mvd);
-
- return (NULL);
-}
-
-vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
int c;
@@ -305,10 +278,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
- mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
- list_create(&vd->vdev_io_pending, sizeof (zio_t),
- offsetof(zio_t, io_pending));
mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
@@ -343,9 +312,6 @@ vdev_free_common(vdev_t *vd)
mutex_exit(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_dirty_lock);
- list_destroy(&vd->vdev_io_pending);
- mutex_destroy(&vd->vdev_io_lock);
- cv_destroy(&vd->vdev_io_cv);
kmem_free(vd, sizeof (vdev_t));
}
@@ -402,6 +368,13 @@ vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
vd->vdev_wholedisk = -1ULL;
/*
+ * Look for the 'not present' flag. This will only be set if the device
+ * was not present at the time of import.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
+
+ /*
* If we're a top-level vdev, try to load the allocation parameters.
*/
if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
@@ -536,8 +509,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
vdev_config_dirty(tvd);
}
- ASSERT(svd->vdev_io_retry == NULL);
- ASSERT(list_is_empty(&svd->vdev_io_pending));
+ tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted;
+ svd->vdev_reopen_wanted = 0;
}
static void
@@ -611,7 +584,7 @@ vdev_remove_parent(vdev_t *cvd)
vdev_free(mvd);
}
-void
+int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
@@ -621,6 +594,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
space_map_obj_t *smo = vd->vdev_smo;
metaslab_t **mspp = vd->vdev_ms;
+ int ret;
dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
@@ -638,21 +612,29 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
ms_array = kmem_zalloc(newc * sizeof (uint64_t),
KM_SLEEP);
- dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
- 0, newc * sizeof (uint64_t), ms_array);
+ if ((ret = dmu_read(spa->spa_meta_objset,
+ vd->vdev_ms_array, 0,
+ newc * sizeof (uint64_t), ms_array)) != 0) {
+ kmem_free(ms_array, newc * sizeof (uint64_t));
+ goto error;
+ }
for (c = 0; c < newc; c++) {
if (ms_array[c] == 0)
continue;
- db = dmu_bonus_hold(spa->spa_meta_objset,
- ms_array[c]);
- dmu_buf_read(db);
+ if ((ret = dmu_bonus_hold(
+ spa->spa_meta_objset, ms_array[c],
+ FTAG, &db)) != 0) {
+ kmem_free(ms_array,
+ newc * sizeof (uint64_t));
+ goto error;
+ }
ASSERT3U(db->db_size, ==, sizeof (*smo));
bcopy(db->db_data, &vd->vdev_smo[c],
db->db_size);
ASSERT3U(vd->vdev_smo[c].smo_object, ==,
ms_array[c]);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
kmem_free(ms_array, newc * sizeof (uint64_t));
}
@@ -674,6 +656,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
kmem_free(mspp, oldc * sizeof (*mspp));
}
+ return (0);
+
+error:
+ /*
+ * On error, undo any partial progress we may have made, and restore the
+ * old metaslab values.
+ */
+ kmem_free(vd->vdev_smo, newc * sizeof (*smo));
+ kmem_free(vd->vdev_ms, newc * sizeof (*mspp));
+
+ vd->vdev_smo = smo;
+ vd->vdev_ms = mspp;
+ vd->vdev_ms_count = oldc;
+
+ return (ret);
}
void
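The rewritten block above is representative of a pattern applied throughout this patch: dmu_bonus_hold() and dmu_buf_hold() now return an error and take an owner tag, so callers must check the result and release with the same tag rather than assuming the hold succeeded. A condensed editorial sketch of the new idiom, using placeholder arguments:

/*
 * Editorial sketch of the error-checked hold/release idiom used above;
 * 'os' and 'object' are placeholders, FTAG identifies the holder.
 */
static int
example_read_bonus(objset_t *os, uint64_t object, void *buf, uint64_t len)
{
	dmu_buf_t *db;
	int err;

	if ((err = dmu_bonus_hold(os, object, FTAG, &db)) != 0)
		return (err);		/* I/O errors now reach the caller */

	ASSERT3U(db->db_size, >=, len);
	bcopy(db->db_data, buf, len);
	dmu_buf_rele(db, FTAG);		/* release with the matching tag */

	return (0);
}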
@@ -735,39 +732,39 @@ vdev_open(vdev_t *vd)
if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
- dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
- vd->vdev_state = VDEV_STATE_OFFLINE;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
return (ENXIO);
}
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, ENXIO);
+
dprintf("%s = %d, osize %llu, state = %d\n",
vdev_description(vd), error, osize, vd->vdev_state);
if (error) {
- dprintf("%s in %s failed to open, error %d, aux %d\n",
- vdev_description(vd),
- vdev_description(vd->vdev_parent),
- error,
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
vd->vdev_stat.vs_aux);
-
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
return (error);
}
vd->vdev_state = VDEV_STATE_HEALTHY;
for (c = 0; c < vd->vdev_children; c++)
- if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
- vd->vdev_state = VDEV_STATE_DEGRADED;
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_NONE);
+ break;
+ }
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
if (vd->vdev_children == 0) {
if (osize < SPA_MINDEVSIZE) {
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
- vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
return (EOVERFLOW);
}
psize = osize;
@@ -775,8 +772,8 @@ vdev_open(vdev_t *vd)
} else {
if (osize < SPA_MINDEVSIZE -
(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
- vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
return (EOVERFLOW);
}
psize = 0;
@@ -796,9 +793,8 @@ vdev_open(vdev_t *vd)
* Make sure the alignment requirement hasn't increased.
*/
if (ashift > vd->vdev_ashift) {
- dprintf("%s: ashift grew\n", vdev_description(vd));
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
@@ -806,9 +802,8 @@ vdev_open(vdev_t *vd)
* Make sure the device hasn't shrunk.
*/
if (asize < vd->vdev_asize) {
- dprintf("%s: device shrank\n", vdev_description(vd));
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
@@ -818,11 +813,29 @@ vdev_open(vdev_t *vd)
*/
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
asize > vd->vdev_asize) {
- dprintf("%s: device grew\n", vdev_description(vd));
vd->vdev_asize = asize;
}
}
+ /*
+ * If we were able to open a vdev that was marked permanently
+ * unavailable, clear that state now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ /*
+ * This allows the ZFS DE to close cases appropriately. If a device
+ * goes away and later returns, we want to close the associated case.
+ * But it's not enough to simply post this only when a device goes from
+ * CANT_OPEN -> HEALTHY. If we reboot the system and the device is
+ * back, we also need to close the case (otherwise we will try to replay
+ * it). So we have to post this notifier every time. Since this only
+ * occurs during pool open or error recovery, this should not be an
+ * issue.
+ */
+ zfs_post_ok(vd->vdev_spa, vd);
+
return (0);
}
@@ -832,8 +845,6 @@ vdev_open(vdev_t *vd)
void
vdev_close(vdev_t *vd)
{
- ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
-
vd->vdev_ops->vdev_op_close(vd);
if (vd->vdev_cache_active) {
@@ -846,43 +857,29 @@ vdev_close(vdev_t *vd)
vd->vdev_state = VDEV_STATE_OFFLINE;
else
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}
void
-vdev_reopen(vdev_t *vd, zio_t **rq)
+vdev_reopen(vdev_t *vd)
{
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
int c;
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
if (vd == rvd) {
- ASSERT(rq == NULL);
for (c = 0; c < rvd->vdev_children; c++)
- vdev_reopen(rvd->vdev_child[c], NULL);
+ vdev_reopen(rvd->vdev_child[c]);
return;
}
/* only valid for top-level vdevs */
ASSERT3P(vd, ==, vd->vdev_top);
- /*
- * vdev_state can change when spa_config_lock is held as writer,
- * or when it's held as reader and we're doing a vdev_reopen().
- * To handle the latter case, we grab rvd's io_lock to serialize
- * reopens. This ensures that there's never more than one vdev
- * state changer active at a time.
- */
- mutex_enter(&rvd->vdev_io_lock);
-
- mutex_enter(&vd->vdev_io_lock);
- while (list_head(&vd->vdev_io_pending) != NULL)
- cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
vdev_close(vd);
(void) vdev_open(vd);
- if (rq != NULL) {
- *rq = vd->vdev_io_retry;
- vd->vdev_io_retry = NULL;
- }
- mutex_exit(&vd->vdev_io_lock);
/*
* Reassess root vdev's health.
@@ -892,8 +889,6 @@ vdev_reopen(vdev_t *vd, zio_t **rq)
uint64_t state = rvd->vdev_child[c]->vdev_state;
rvd->vdev_state = MIN(rvd->vdev_state, state);
}
-
- mutex_exit(&rvd->vdev_io_lock);
}
int
@@ -930,7 +925,7 @@ vdev_create(vdev_t *vd, uint64_t txg)
* For creation, we want to try to create all vdevs at once and then undo it
* if anything fails; this is much harder if we have pending transactions.
*/
-void
+int
vdev_init(vdev_t *vd, uint64_t txg)
{
/*
@@ -942,7 +937,7 @@ vdev_init(vdev_t *vd, uint64_t txg)
/*
* Initialize the vdev's metaslabs.
*/
- vdev_metaslab_init(vd, txg);
+ return (vdev_metaslab_init(vd, txg));
}
void
@@ -993,9 +988,10 @@ vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
+ spa_t *spa = vd->vdev_spa;
int c;
- ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
+ ASSERT(spa_config_held(spa, RW_WRITER));
if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock);
@@ -1019,6 +1015,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
return;
}
+ /*
+ * Make sure the DTLs are always correct under the scrub lock.
+ */
+ if (vd == spa->spa_root_vdev)
+ mutex_enter(&spa->spa_scrub_lock);
+
mutex_enter(&vd->vdev_dtl_lock);
space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
@@ -1032,6 +1034,9 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
mutex_exit(&vd->vdev_dtl_lock);
}
+
+ if (vd == spa->spa_root_vdev)
+ mutex_exit(&spa->spa_scrub_lock);
}
static int
@@ -1047,11 +1052,12 @@ vdev_dtl_load(vdev_t *vd)
if (smo->smo_object == 0)
return (0);
- db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
- dmu_buf_read(db);
+ if ((error = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object,
+ FTAG, &db)) != 0)
+ return (error);
ASSERT3U(db->db_size, ==, sizeof (*smo));
bcopy(db->db_data, smo, db->db_size);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
mutex_enter(&vd->vdev_dtl_lock);
error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
@@ -1100,8 +1106,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
vdev_config_dirty(vd->vdev_top);
}
- dmu_free_range(spa->spa_meta_objset, smo->smo_object,
- 0, smo->smo_objsize, tx);
+ VERIFY(0 == dmu_free_range(spa->spa_meta_objset, smo->smo_object,
+ 0, smo->smo_objsize, tx));
mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
@@ -1124,17 +1130,18 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
mutex_exit(&smlock);
mutex_destroy(&smlock);
- db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object,
+ FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, ==, sizeof (*smo));
bcopy(smo, db->db_data, db->db_size);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
}
int
-vdev_load(vdev_t *vd, int import)
+vdev_load(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
int c, error;
@@ -1147,7 +1154,7 @@ vdev_load(vdev_t *vd, int import)
* Recursively load all children.
*/
for (c = 0; c < vd->vdev_children; c++)
- if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
+ if ((error = vdev_load(vd->vdev_child[c])) != 0)
return (error);
/*
@@ -1166,7 +1173,7 @@ vdev_load(vdev_t *vd, int import)
*/
if ((label = vdev_label_read_config(vd)) == NULL) {
dprintf("can't load label config\n");
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (0);
}
@@ -1174,7 +1181,7 @@ vdev_load(vdev_t *vd, int import)
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&guid) != 0 || guid != spa_guid(spa)) {
dprintf("bad or missing pool GUID (%llu)\n", guid);
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
@@ -1184,7 +1191,7 @@ vdev_load(vdev_t *vd, int import)
guid != vd->vdev_guid) {
dprintf("bad or missing vdev guid (%llu != %llu)\n",
guid, vd->vdev_guid);
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
@@ -1201,14 +1208,15 @@ vdev_load(vdev_t *vd, int import)
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state)) {
dprintf("missing pool state\n");
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
}
if (state != POOL_STATE_ACTIVE &&
- (!import || state != POOL_STATE_EXPORTED)) {
+ (spa->spa_load_state == SPA_LOAD_OPEN ||
+ state != POOL_STATE_EXPORTED)) {
dprintf("pool state not active (%llu)\n", state);
nvlist_free(label);
return (EBADF);
@@ -1227,12 +1235,16 @@ vdev_load(vdev_t *vd, int import)
vd->vdev_ms_shift == 0 ||
vd->vdev_ashift == 0 ||
vd->vdev_asize == 0) {
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (0);
}
- vdev_metaslab_init(vd, 0);
+ if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (0);
+ }
}
/*
@@ -1243,7 +1255,7 @@ vdev_load(vdev_t *vd, int import)
if (error) {
dprintf("can't load DTL for %s, error %d\n",
vdev_description(vd), error);
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (0);
}
@@ -1344,7 +1356,7 @@ vdev_description(vdev_t *vd)
}
int
-vdev_online(spa_t *spa, const char *path)
+vdev_online(spa_t *spa, uint64_t guid)
{
vdev_t *rvd, *vd;
uint64_t txg;
@@ -1352,24 +1364,14 @@ vdev_online(spa_t *spa, const char *path)
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
- if ((vd = vdev_lookup_by_path(rvd, path)) == NULL)
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
dprintf("ONLINE: %s\n", vdev_description(vd));
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
-
- /*
- * Clear the error counts. The idea is that you expect to see all
- * zeroes when everything is working, so if you've just onlined a
- * device, you don't want to keep hearing about errors from before.
- */
- vd->vdev_stat.vs_read_errors = 0;
- vd->vdev_stat.vs_write_errors = 0;
- vd->vdev_stat.vs_checksum_errors = 0;
-
- vdev_reopen(vd->vdev_top, NULL);
+ vdev_reopen(vd->vdev_top);
spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
@@ -1383,7 +1385,7 @@ vdev_online(spa_t *spa, const char *path)
}
int
-vdev_offline(spa_t *spa, const char *path, int istmp)
+vdev_offline(spa_t *spa, uint64_t guid, int istmp)
{
vdev_t *rvd, *vd;
uint64_t txg;
@@ -1391,7 +1393,7 @@ vdev_offline(spa_t *spa, const char *path, int istmp)
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
- if ((vd = vdev_lookup_by_path(rvd, path)) == NULL)
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
dprintf("OFFLINE: %s\n", vdev_description(vd));
@@ -1416,10 +1418,10 @@ vdev_offline(spa_t *spa, const char *path, int istmp)
* undo it and fail the request.
*/
vd->vdev_offline = B_TRUE;
- vdev_reopen(vd->vdev_top, NULL);
+ vdev_reopen(vd->vdev_top);
if (vdev_is_dead(vd->vdev_top)) {
vd->vdev_offline = B_FALSE;
- vdev_reopen(vd->vdev_top, NULL);
+ vdev_reopen(vd->vdev_top);
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
}
@@ -1434,25 +1436,25 @@ vdev_offline(spa_t *spa, const char *path, int istmp)
return (spa_vdev_exit(spa, NULL, txg, 0));
}
-int
-vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
+/*
+ * Clear the error counts associated with this vdev. Unlike vdev_online() and
+ * vdev_offline(), we assume the spa config is locked. We also clear all
+ * children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ */
+void
+vdev_clear(spa_t *spa, vdev_t *vd)
{
- vdev_t *vd;
-
- spa_config_enter(spa, RW_WRITER);
-
- if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
- spa_config_exit(spa);
- return (ENODEV);
- }
+ int c;
- vd->vdev_fault_mode = mode;
- vd->vdev_fault_mask = mask;
- vd->vdev_fault_arg = arg;
+ if (vd == NULL)
+ vd = spa->spa_root_vdev;
- spa_config_exit(spa);
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
- return (0);
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_clear(spa, vd->vdev_child[c]);
}
int
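Per the comment above, vdev_clear() assumes the caller already holds the spa configuration lock, unlike vdev_online() and vdev_offline(). A hedged sketch of how a 'zpool clear'-style caller might drive it, reusing the spa_config_enter()/spa_config_exit() calls visible in the removed vdev_error_setup() code:

/*
 * Editorial sketch (assumed caller, not part of the patch): clear the
 * error counts of every vdev in the pool under the config lock.
 */
static void
example_clear_pool(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER);
	vdev_clear(spa, NULL);		/* NULL means "all vdevs" */
	spa_config_exit(spa);
}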
@@ -1631,24 +1633,6 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
}
/*
- * Report checksum errors that a vdev didn't realize it made.
- * This can happen, for example, when RAID-Z combinatorial reconstruction
- * infers that one of its components returned bad data.
- */
-void
-vdev_checksum_error(zio_t *zio, vdev_t *vd)
-{
- dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
- vdev_description(vd));
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
-}
-
-/*
* Update the in-core space usage stats for this vdev and the root vdev.
*/
void
@@ -1709,6 +1693,14 @@ static vdev_knob_t vdev_knob[] = {
offsetof(struct vdev, vdev_queue.vq_max_pending)
},
{
+ "scrub_limit",
+ "maximum scrub/resilver I/O queue",
+ 0,
+ 10000,
+ 70,
+ offsetof(struct vdev, vdev_queue.vq_scrub_limit)
+ },
+ {
"agg_limit",
"maximum size of aggregated I/Os",
0,
@@ -1781,20 +1773,78 @@ vdev_config_clean(vdev_t *vd)
}
/*
- * Set a vdev's state, updating any parent's state as well.
+ * Set a vdev's state. If this is during an open, we don't update the parent
+ * state, because we're in the process of opening children depth-first.
+ * Otherwise, we propagate the change to the parent.
+ *
+ * If this routine places a device in a faulted state, an appropriate ereport is
+ * generated.
*/
void
-vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
+vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
- if (state == vd->vdev_state)
+ uint64_t prev_state;
+
+ if (state == vd->vdev_state) {
+ vd->vdev_stat.vs_aux = aux;
return;
+ }
+
+ prev_state = vd->vdev_state;
vd->vdev_state = state;
vd->vdev_stat.vs_aux = aux;
+ if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we fail to open a vdev during an import, we mark it as
+ * "not available", which signifies that it was never there to
+ * begin with. Failure to open such a device is not considered
+ * an error.
+ */
+ if (!vd->vdev_not_present &&
+ vd != vd->vdev_spa->spa_root_vdev) {
+ const char *class;
+
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ zfs_ereport_post(class, vd->vdev_spa,
+ vd, NULL, prev_state, 0);
+ }
+
+ if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_not_present = 1;
+ }
+
+ if (isopen)
+ return;
+
if (vd->vdev_parent != NULL) {
int c;
int degraded = 0, faulted = 0;
+ int corrupted = 0;
vdev_t *parent, *child;
parent = vd->vdev_parent;
@@ -1804,9 +1854,23 @@ vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
faulted++;
else if (child->vdev_state == VDEV_STATE_DEGRADED)
degraded++;
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
}
vd->vdev_parent->vdev_ops->vdev_op_state_change(
vd->vdev_parent, faulted, degraded);
- }
+
+ /*
+ * Root special: if this is a toplevel vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == vd->vdev_top)
+ vdev_set_state(vd->vdev_spa->spa_root_vdev,
+ B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ }
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
index e1e7c1a36f..67a8924b52 100644
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -286,7 +285,8 @@ vdev_cache_read(zio_t *zio)
fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ,
ZIO_PRIORITY_CACHE_FILL,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
vdev_cache_fill, ve);
ve->ve_fill_io = fio;
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index 1556c387b2..b4d7d7a0d2 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -323,6 +323,9 @@ vdev_disk_io_done(zio_t *zio)
if (zio->io_type == ZIO_TYPE_WRITE)
vdev_cache_write(zio);
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
zio_next_stage(zio);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index a789008e17..a82abf80b7 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -190,6 +189,9 @@ vdev_file_io_done(zio_t *zio)
if (zio->io_type == ZIO_TYPE_WRITE)
vdev_cache_write(zio);
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
zio_next_stage(zio);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 1282df0d9a..3571be9064 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -165,8 +165,8 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
zio_nowait(zio_read_phys(zio, vd,
vdev_label_offset(vd->vdev_psize, l, offset),
size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_SPECULATIVE |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
}
static void
@@ -178,8 +178,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
zio_nowait(zio_write_phys(zio, vd,
vdev_label_offset(vd->vdev_psize, l, offset),
size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_WRITE,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL));
}
/*
@@ -190,7 +189,7 @@ vdev_config_generate(vdev_t *vd, int getstats)
{
nvlist_t *nv = NULL;
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
vd->vdev_ops->vdev_op_type) == 0);
@@ -209,6 +208,9 @@ vdev_config_generate(vdev_t *vd, int getstats)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk) == 0);
+ if (vd->vdev_not_present)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+
if (vd == vd->vdev_top) {
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
vd->vdev_ms_array) == 0);
@@ -269,7 +271,6 @@ vdev_label_read_config(vdev_t *vd)
{
nvlist_t *config = NULL;
vdev_phys_t *vp;
- uint64_t version;
zio_t *zio;
int l;
@@ -280,8 +281,8 @@ vdev_label_read_config(vdev_t *vd)
for (l = 0; l < VDEV_LABELS; l++) {
- zio = zio_root(vd->vdev_spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD);
+ zio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD);
vdev_label_read(zio, vd, l, vp,
offsetof(vdev_label_t, vl_vdev_phys),
@@ -289,10 +290,7 @@ vdev_label_read_config(vdev_t *vd)
if (zio_wait(zio) == 0 &&
nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
- &config, 0) == 0 &&
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &version) == 0 &&
- version == UBERBLOCK_VERSION)
+ &config, 0) == 0)
break;
if (config != NULL) {
@@ -341,16 +339,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg)
* Check whether this device is already in use.
* Ignore the check if crtxg == 0, which we use for device removal.
*/
- if (crtxg != 0 && (label = vdev_label_read_config(vd)) != NULL) {
- uint64_t version, state, pool_guid, device_guid, txg;
+ if (crtxg != 0 &&
+ (label = vdev_label_read_config(vd)) != NULL) {
+ uint64_t state, pool_guid, device_guid, txg;
uint64_t mycrtxg = 0;
(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
&mycrtxg);
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION,
- &version) == 0 && version == UBERBLOCK_VERSION &&
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state) == 0 && state == POOL_STATE_ACTIVE &&
nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&pool_guid) == 0 &&
@@ -390,7 +387,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg)
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) != 0) {
nvlist_free(label);
zio_buf_free(vp, sizeof (vdev_phys_t));
return (EINVAL);
@@ -491,7 +488,7 @@ vdev_uberblock_load_done(zio_t *zio)
ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t));
- if (uberblock_verify(ub) == 0) {
+ if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&spa->spa_uberblock_lock);
if (vdev_uberblock_compare(ub, ubbest) > 0)
*ubbest = *ub;
@@ -645,7 +642,7 @@ vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) == 0)
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0)
vdev_label_write(zio, vd, l, vp,
offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
vdev_sync_label_done, NULL);
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 45eb7ce78b..b88b999c6f 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -209,7 +208,8 @@ vdev_mirror_io_start(zio_t *zio)
mm = vdev_mirror_map_alloc(zio);
if (zio->io_type == ZIO_TYPE_READ) {
- if (zio->io_flags & ZIO_FLAG_SCRUB) {
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ vd->vdev_ops != &vdev_replacing_ops) {
/*
* For scrubbing reads we need to allocate a read
* buffer for each child and issue reads to all
@@ -384,11 +384,12 @@ static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted == vd->vdev_children)
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
else if (degraded + faulted != 0)
- vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
- vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_mirror_ops = {
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 09831e1504..bb838fedd1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -103,6 +102,8 @@ vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
+ ASSERT(vq->vq_scrub_count == 0);
+
avl_destroy(&vq->vq_deadline_tree);
avl_destroy(&vq->vq_read_tree);
avl_destroy(&vq->vq_write_tree);
@@ -112,6 +113,28 @@ vdev_queue_fini(vdev_t *vd)
}
static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+ avl_add(&vq->vq_deadline_tree, zio);
+ avl_add(zio->io_vdev_tree, zio);
+
+ if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
+ ++vq->vq_scrub_count >= vq->vq_scrub_limit)
+ spa_scrub_throttle(zio->io_spa, 1);
+}
+
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
+ vq->vq_scrub_count-- >= vq->vq_scrub_limit)
+ spa_scrub_throttle(zio->io_spa, -1);
+
+ avl_remove(&vq->vq_deadline_tree, zio);
+ avl_remove(zio->io_vdev_tree, zio);
+}
+
+static void
vdev_queue_agg_io_done(zio_t *aio)
{
zio_t *dio;
@@ -182,18 +205,19 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
fio->io_offset, buf, size, fio->io_type,
ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_NOBOOKMARK,
vdev_queue_agg_io_done, NULL);
aio->io_delegate_list = fio;
for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
ASSERT(dio->io_type == aio->io_type);
+ ASSERT(dio->io_vdev_tree == tree);
if (dio->io_type == ZIO_TYPE_WRITE)
bcopy(dio->io_data, buf + offset, dio->io_size);
offset += dio->io_size;
- avl_remove(&vq->vq_deadline_tree, dio);
- avl_remove(tree, dio);
+ vdev_queue_io_remove(vq, dio);
zio_vdev_io_bypass(dio);
nagg++;
}
@@ -211,8 +235,8 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
return (aio);
}
- avl_remove(&vq->vq_deadline_tree, fio);
- avl_remove(tree, fio);
+ ASSERT(fio->io_vdev_tree == tree);
+ vdev_queue_io_remove(vq, fio);
avl_add(&vq->vq_pending_tree, fio);
@@ -245,8 +269,7 @@ vdev_queue_io(zio_t *zio)
zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) +
zio->io_priority;
- avl_add(&vq->vq_deadline_tree, zio);
- avl_add(zio->io_vdev_tree, zio);
+ vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func);
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index c2c4985856..157ae5001c 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -32,6 +31,7 @@
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
/*
* Virtual device vector for RAID-Z.
@@ -327,6 +327,28 @@ vdev_raidz_io_start(zio_t *zio)
zio_wait_children_done(zio);
}
+/*
+ * Report a checksum error for a child of a RAID-Z device.
+ */
+static void
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+{
+ vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col];
+ dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
+ vdev_description(vd));
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
+}
+
+
static void
vdev_raidz_io_done(zio_t *zio)
{
@@ -398,8 +420,7 @@ vdev_raidz_io_done(zio_t *zio)
bcopy(rc->rc_data, orig, rc->rc_size);
vdev_raidz_reconstruct(rm, c);
if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
- vdev_checksum_error(zio,
- vd->vdev_child[rc->rc_col]);
+ raidz_checksum_error(zio, rc);
rc->rc_error = ECKSUM;
unexpected_errors++;
}
@@ -500,8 +521,7 @@ vdev_raidz_io_done(zio_t *zio)
* inform it.
*/
if (rc->rc_tried && rc->rc_error == 0)
- vdev_checksum_error(zio,
- vd->vdev_child[rc->rc_col]);
+ raidz_checksum_error(zio, rc);
rc->rc_error = ECKSUM;
goto done;
}
@@ -511,9 +531,18 @@ vdev_raidz_io_done(zio_t *zio)
}
/*
- * All combinations failed to checksum.
+ * All combinations failed to checksum. Generate checksum ereports for
+ * every one.
*/
zio->io_error = ECKSUM;
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd->vdev_child[rc->rc_col], zio,
+ rc->rc_offset, rc->rc_size);
+ }
+ }
done:
zio_checksum_verified(zio);
@@ -558,11 +587,12 @@ static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted > 1)
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
else if (degraded + faulted != 0)
- vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
- vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_raidz_ops = {
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
index 4e44b5bb05..85671d00b1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_root.c
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -79,11 +78,12 @@ static void
vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted > 0)
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
else if (degraded != 0)
- vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
- vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_root_ops = {
diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c
index 2866b7f729..8dc17ed4b1 100644
--- a/usr/src/uts/common/fs/zfs/zap.c
+++ b/usr/src/uts/common/fs/zfs/zap.c
@@ -45,6 +45,7 @@
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
+#include <sys/refcount.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
@@ -54,8 +55,8 @@ int fzap_default_block_shift = 14; /* 16k blocksize */
static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
-static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
- dmu_tx_t *tx, krw_t lt);
+static int zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
+ dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp);
static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
@@ -120,8 +121,8 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
/*
* set up block 1 - the first leaf
*/
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- 1<<FZAP_BLOCK_SHIFT(zap));
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
dmu_buf_will_dirty(db, tx);
l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
@@ -131,7 +132,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
zap_leaf_init(l);
kmem_free(l, sizeof (zap_leaf_t));
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
static int
@@ -157,6 +158,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
{
uint64_t b, newblk;
dmu_buf_t *db_old, *db_new;
+ int err;
int bs = FZAP_BLOCK_SHIFT(zap);
int hepb = 1<<(bs-4);
/* hepb = half the number of entries in a block */
@@ -181,26 +183,27 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
*/
b = tbl->zt_blks_copied;
- db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + b) << bs);
- dmu_buf_read(db_old);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + b) << bs, FTAG, &db_old);
+ if (err)
+ return;
/* first half of entries in old[b] go to new[2*b+0] */
- db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+0) << bs);
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+0) << bs, FTAG, &db_new));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
- dmu_buf_rele(db_new);
+ dmu_buf_rele(db_new, FTAG);
/* second half of entries in old[b] go to new[2*b+1] */
- db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+1) << bs);
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+1) << bs, FTAG, &db_new));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
db_new->db_data, hepb);
- dmu_buf_rele(db_new);
+ dmu_buf_rele(db_new, FTAG);
- dmu_buf_rele(db_old);
+ dmu_buf_rele(db_old, FTAG);
tbl->zt_blks_copied++;
@@ -208,7 +211,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
tbl->zt_blks_copied, tbl->zt_numblks);
if (tbl->zt_blks_copied == tbl->zt_numblks) {
- dmu_free_range(zap->zap_objset, zap->zap_object,
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
tbl->zt_blk = newblk;
@@ -222,13 +225,14 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
}
}
-static uint64_t
+static int
zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
dmu_tx_t *tx)
{
- uint64_t blk, off, oldval;
- dmu_buf_t *db;
+ int err;
+ uint64_t blk, off;
int bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_buf_t *db;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT(tbl->zt_blk != 0);
@@ -238,33 +242,41 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
blk = idx >> (bs-3);
off = idx & ((1<<(bs-3))-1);
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
dmu_buf_will_dirty(db, tx);
- oldval = ((uint64_t *)db->db_data)[off];
- ((uint64_t *)db->db_data)[off] = val;
- dmu_buf_rele(db);
if (tbl->zt_nextblk != 0) {
- idx *= 2;
- blk = idx >> (bs-3);
- off = idx & ((1<<(bs-3))-1);
-
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk) << bs);
- dmu_buf_will_dirty(db, tx);
- ((uint64_t *)db->db_data)[off] = val;
- ((uint64_t *)db->db_data)[off+1] = val;
- dmu_buf_rele(db);
+ uint64_t idx2 = idx * 2;
+ uint64_t blk2 = idx2 >> (bs-3);
+ uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+ dmu_buf_t *db2;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+ if (err) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+ dmu_buf_will_dirty(db2, tx);
+ ((uint64_t *)db2->db_data)[off2] = val;
+ ((uint64_t *)db2->db_data)[off2+1] = val;
+ dmu_buf_rele(db2, FTAG);
}
- return (oldval);
+ ((uint64_t *)db->db_data)[off] = val;
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
}
-static uint64_t
-zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
{
- uint64_t blk, off, val;
+ uint64_t blk, off;
+ int err;
dmu_buf_t *db;
int bs = FZAP_BLOCK_SHIFT(zap);
@@ -273,12 +285,26 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
blk = idx >> (bs-3);
off = idx & ((1<<(bs-3))-1);
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs);
- dmu_buf_read(db);
- val = ((uint64_t *)db->db_data)[off];
- dmu_buf_rele(db);
- return (val);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
+ *valp = ((uint64_t *)db->db_data)[off];
+ dmu_buf_rele(db, FTAG);
+
+ if (tbl->zt_nextblk != 0) {
+ /*
+ * read the nextblk for the sake of i/o error checking,
+ * so that zap_table_load() will catch errors for
+ * zap_table_store.
+ */
+ blk = (idx*2) >> (bs-3);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db);
+ dmu_buf_rele(db, FTAG);
+ }
+ return (err);
}
/*
@@ -310,19 +336,21 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
*/
uint64_t newblk;
dmu_buf_t *db_new;
+ int err;
ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
newblk = zap_allocate_blocks(zap, 1, tx);
- db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- newblk << FZAP_BLOCK_SHIFT(zap));
-
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+ if (err)
+ return;
dmu_buf_will_dirty(db_new, tx);
zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- dmu_buf_rele(db_new);
+ dmu_buf_rele(db_new, FTAG);
zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
@@ -386,8 +414,8 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
l->l_dbuf = NULL;
l->l_phys = NULL;
- l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- l->l_blkid << FZAP_BLOCK_SHIFT(zap));
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
ASSERT(winner == NULL);
dmu_buf_will_dirty(l->l_dbuf, tx);
@@ -403,7 +431,7 @@ zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
{
/* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
rw_exit(&l->l_rwlock);
- dmu_buf_rele(l->l_dbuf);
+ dmu_buf_rele(l->l_dbuf, NULL);
/* XXX there are still holds on this block, so we can't free it? */
/* dmu_free_range(zap->zap_objset, zap->zap_object, */
/* offset, 1<<ZAP_BLOCK_SHIFT, tx); */
@@ -430,11 +458,11 @@ zap_put_leaf(zap_leaf_t *l)
while (nl) {
zap_leaf_t *nnl = nl->l_next;
rw_exit(&nl->l_rwlock);
- dmu_buf_rele(nl->l_dbuf);
+ dmu_buf_rele(nl->l_dbuf, NULL);
nl = nnl;
}
rw_exit(&l->l_rwlock);
- dmu_buf_rele(l->l_dbuf);
+ dmu_buf_rele(l->l_dbuf, NULL);
}
_NOTE(ARGSUSED(0))
@@ -489,23 +517,27 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
return (l);
}
-static zap_leaf_t *
-zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+static int
+zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
{
dmu_buf_t *db;
zap_leaf_t *l;
int bs = FZAP_BLOCK_SHIFT(zap);
+ int err;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object, blkid << bs);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ blkid << bs, NULL, &db);
+ if (err)
+ return (err);
ASSERT3U(db->db_object, ==, zap->zap_object);
ASSERT3U(db->db_offset, ==, blkid << bs);
ASSERT3U(db->db_size, ==, 1 << bs);
ASSERT(blkid != 0);
- dmu_buf_read(db);
l = dmu_buf_get_user(db);
if (l == NULL)
@@ -524,43 +556,53 @@ zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
ASSERT3U(l->lh_block_type, ==, ZBT_LEAF);
ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
- return (l);
+ *lp = l;
+ return (0);
}
-static zap_leaf_t *
-zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
{
- zap_leaf_t *l, *nl;
+ int err;
+ zap_leaf_t *nl;
- l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt);
+ err = zap_get_leaf_byblk_impl(zap, blkid, tx, lt, lp);
+ if (err)
+ return (err);
- nl = l;
+ nl = *lp;
while (nl->lh_next != 0) {
zap_leaf_t *nnl;
- nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt);
+ err = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt, &nnl);
+ if (err) {
+ zap_put_leaf(*lp);
+ return (err);
+ }
nl->l_next = nnl;
nl = nnl;
}
- return (l);
+ return (err);
}
-static uint64_t
-zap_idx_to_blk(zap_t *zap, uint64_t idx)
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
{
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
ASSERT3U(idx, <,
(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
- return (ZAP_EMBEDDED_PTRTBL_ENT(zap, idx));
+ *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+ return (0);
} else {
return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
- idx));
+ idx, valp));
}
}
-static void
+static int
zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
{
ASSERT(tx != NULL);
@@ -568,32 +610,37 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+ return (0);
} else {
- (void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
- idx, blk, tx);
+ return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx, blk, tx));
}
}
-static zap_leaf_t *
-zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{
- uint64_t idx;
- zap_leaf_t *l;
+ uint64_t idx, blk;
+ int err;
ASSERT(zap->zap_dbuf == NULL ||
zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
- l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
-
- ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
+ err = zap_idx_to_blk(zap, idx, &blk);
+ if (err != 0)
+ return (err);
+ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
- return (l);
+ ASSERT(err ||
+ ZAP_HASH_IDX(h, (*lp)->lh_prefix_len) == (*lp)->lh_prefix);
+ return (err);
}
-static zap_leaf_t *
-zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
+static int
+zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx,
+ zap_leaf_t **lp)
{
zap_leaf_t *nl;
int prefix_diff, i, err;
@@ -616,11 +663,13 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
ASSERT3U(err, ==, 0);
ASSERT(!zap->zap_ismicro);
- l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ (void) zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (l->lh_prefix_len != old_prefix_len)
+ if (l->lh_prefix_len != old_prefix_len) {
/* it split while our locks were down */
- return (l);
+ *lp = l;
+ return (0);
+ }
}
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -629,21 +678,33 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
(void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
dprintf("chaining leaf %x/%d\n", l->lh_prefix,
l->lh_prefix_len);
- return (l);
+ *lp = l;
+ return (0);
}
ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
/* There's more than one pointer to us. Split this leaf. */
- nl = zap_leaf_split(zap, l, tx);
/* set sibling pointers */
prefix_diff =
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
- sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - (l->lh_prefix_len + 1);
+ sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len + 1) | 1) << prefix_diff;
+
+ /* check for i/o errors before doing zap_leaf_split */
for (i = 0; i < (1ULL<<prefix_diff); i++) {
- ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
- zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, sibling+i, &blk);
+ if (err)
+ return (err);
+ ASSERT3U(blk, ==, l->l_blkid);
+ }
+
+ nl = zap_leaf_split(zap, l, tx);
+
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
/* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
}
@@ -657,7 +718,8 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
zap_put_leaf(nl);
}
- return (l);
+ *lp = l;
+ return (0);
}
static void
@@ -682,7 +744,8 @@ again:
err = zap_lockdir(os, zapobj, tx,
RW_WRITER, FALSE, &zap);
ASSERT3U(err, ==, 0);
- l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER);
+ (void) zap_get_leaf_byblk(zap, blkid, tx,
+ RW_READER, &l);
goto again;
}
@@ -734,7 +797,9 @@ fzap_lookup(zap_t *zap, const char *name,
return (err);
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, NULL, RW_READER);
+ err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
err = zap_leaf_lookup(l, name, hash, &zeh);
if (err != 0)
goto out;
@@ -747,7 +812,7 @@ out:
int
fzap_add_cd(zap_t *zap, const char *name,
uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
+ const void *val, uint32_t cd, dmu_tx_t *tx)
{
zap_leaf_t *l;
uint64_t hash;
@@ -759,14 +824,17 @@ fzap_add_cd(zap_t *zap, const char *name,
ASSERT(fzap_checksize(integer_size, num_integers) == 0);
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
retry:
err = zap_leaf_lookup(l, name, hash, &zeh);
if (err == 0) {
err = EEXIST;
goto out;
}
- ASSERT(err == ENOENT);
+ if (err != ENOENT)
+ goto out;
/* XXX If this leaf is chained, split it if we can. */
err = zap_entry_create(l, name, hash, cd,
@@ -775,15 +843,14 @@ retry:
if (err == 0) {
zap_increment_num_entries(zap, 1, tx);
} else if (err == EAGAIN) {
- l = zap_expand_leaf(zap, l, hash, tx);
+ err = zap_expand_leaf(zap, l, hash, tx, &l);
+ if (err != 0)
+ goto out;
goto retry;
}
out:
- if (lp)
- *lp = l;
- else
- zap_put_leaf(l);
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
return (err);
}
@@ -793,16 +860,14 @@ fzap_add(zap_t *zap, const char *name,
const void *val, dmu_tx_t *tx)
{
int err;
- zap_leaf_t *l;
err = fzap_checksize(integer_size, num_integers);
if (err != 0)
return (err);
err = fzap_add_cd(zap, name, integer_size, num_integers,
- val, ZAP_MAXCD, tx, &l);
+ val, ZAP_MAXCD, tx);
- zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
return (err);
}
@@ -821,7 +886,9 @@ fzap_update(zap_t *zap, const char *name,
return (err);
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
retry:
err = zap_leaf_lookup(l, name, hash, &zeh);
create = (err == ENOENT);
@@ -839,10 +906,13 @@ retry:
}
if (err == EAGAIN) {
- l = zap_expand_leaf(zap, l, hash, tx);
+ err = zap_expand_leaf(zap, l, hash, tx, &l);
+ if (err != 0)
+ goto out;
goto retry;
}
+out:
zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
return (err);
}
@@ -857,7 +927,9 @@ fzap_length(zap_t *zap, const char *name,
zap_entry_handle_t zeh;
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, NULL, RW_READER);
+ err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
err = zap_leaf_lookup(l, name, hash, &zeh);
if (err != 0)
goto out;
@@ -880,7 +952,9 @@ fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
zap_entry_handle_t zeh;
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
err = zap_leaf_lookup(l, name, hash, &zeh);
if (err == 0) {
zap_entry_remove(&zeh);
@@ -938,7 +1012,10 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
again:
if (zc->zc_leaf == NULL) {
- zc->zc_leaf = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
+ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+ &zc->zc_leaf);
+ if (err != 0)
+ return (err);
} else {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
}
@@ -982,7 +1059,7 @@ again:
static void
zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
{
- int i;
+ int i, err;
uint64_t lastblk = 0;
/*
@@ -997,10 +1074,11 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
continue;
lastblk = tbl[i];
- l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER);
-
- zap_stats_leaf(zap, l, zs);
- zap_put_leaf(l);
+ err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ if (err == 0) {
+ zap_stats_leaf(zap, l, zs);
+ zap_put_leaf(l);
+ }
}
}
@@ -1028,12 +1106,16 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
b++) {
dmu_buf_t *db;
-
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs);
- dmu_buf_read(db);
- zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs);
- dmu_buf_rele(db);
+ int err;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+ FTAG, &db);
+ if (err == 0) {
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(bs-3), zs);
+ dmu_buf_rele(db, FTAG);
+ }
}
}
}
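
The zap.c hunks above follow one shape of change throughout: routines that used to return a pointer (or nothing) now return 0 or an errno and hand their result back through an out-parameter, and zap_expand_leaf() probes every zap_idx_to_blk() read before calling zap_leaf_split(), so the zap_set_idx_to_blk() stores that follow cannot fail part-way through the update. Below is a minimal, self-contained sketch of that "check all reads, then do the irreversible writes" pattern; table_load()/table_store() are made-up stand-ins, not the ZFS interfaces.

#include <errno.h>
#include <stdio.h>

#define	TBL_SIZE	16
static unsigned long tbl[TBL_SIZE];

/* hypothetical lookup: 0 on success, an errno-style code on failure */
static int
table_load(unsigned long idx, unsigned long *blkp)
{
	if (idx >= TBL_SIZE)
		return (EIO);
	*blkp = tbl[idx];
	return (0);
}

static void
table_store(unsigned long idx, unsigned long blk)
{
	tbl[idx] = blk;
}

/*
 * Retarget 'count' sibling pointers from old_blk to new_blk, checking
 * every read up front so the stores below cannot fail part-way.
 */
static int
split_update(unsigned long sibling, unsigned long count,
    unsigned long old_blk, unsigned long new_blk)
{
	unsigned long i, blk;
	int err;

	for (i = 0; i < count; i++) {
		if ((err = table_load(sibling + i, &blk)) != 0)
			return (err);	/* nothing has been modified yet */
		if (blk != old_blk)
			return (EINVAL);
	}
	for (i = 0; i < count; i++)
		table_store(sibling + i, new_blk);
	return (0);
}

int
main(void)
{
	tbl[2] = tbl[3] = 7;
	printf("%d\n", split_update(2, 2, 7, 9));	/* 0: both updated */
	printf("%d\n", split_update(14, 4, 0, 9));	/* EIO: runs off the table, nothing stored */
	return (0);
}
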
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
index 3e150b9b1d..2d3180e37f 100644
--- a/usr/src/uts/common/fs/zfs/zap_micro.c
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -29,6 +29,7 @@
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
+#include <sys/refcount.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/avl.h>
@@ -269,7 +270,9 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
*zapp = NULL;
- db = dmu_buf_hold(os, obj, 0);
+ err = dmu_buf_hold(os, obj, 0, NULL, &db);
+ if (err)
+ return (err);
#ifdef ZFS_DEBUG
{
@@ -279,12 +282,6 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
}
#endif
- /*
- * The zap can deal with EIO here, but its callers don't yet, so
- * spare them by doing a mustsucceed read.
- */
- dmu_buf_read(db);
-
zap = dmu_buf_get_user(db);
if (zap == NULL)
zap = mzap_open(os, obj, db);
@@ -340,7 +337,7 @@ void
zap_unlockdir(zap_t *zap)
{
rw_exit(&zap->zap_rwlock);
- dmu_buf_rele(zap->zap_dbuf);
+ dmu_buf_rele(zap->zap_dbuf, NULL);
}
static void
@@ -375,7 +372,7 @@ mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
mze->mze_name, mze->mze_value);
err = fzap_add_cd(zap,
mze->mze_name, 8, 1, &mze->mze_value,
- mze->mze_cd, tx, NULL);
+ mze->mze_cd, tx);
ASSERT3U(err, ==, 0);
}
kmem_free(mzp, sz);
@@ -411,7 +408,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
dmu_buf_t *db;
mzap_phys_t *zp;
- db = dmu_buf_hold(os, obj, 0);
+ VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
#ifdef ZFS_DEBUG
{
@@ -426,7 +423,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
zp->mz_block_type = ZBT_MICRO;
zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
ASSERT(zp->mz_salt != 0);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
int
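
zap_micro.c shows the two idioms used with the new error-returning dmu_buf_hold(): propagate the error when the block may genuinely be unreadable (zap_lockdir), and VERIFY success when a failure would indicate a bug, as in mzap_create_impl(), which holds a block it has just created. A self-contained sketch of that split, with hypothetical buf_hold()/buf_rele() helpers standing in for the DMU calls:

#include <assert.h>
#include <errno.h>
#include <stdio.h>

/* hypothetical hold: 0 on success, an errno-style code on I/O error */
static int
buf_hold(int obj, const char *tag, int *dbp)
{
	(void) tag;			/* tags pair holds with releases */
	if (obj < 0)
		return (EIO);
	*dbp = obj;
	return (0);
}

static void
buf_rele(int db, const char *tag)
{
	(void) db;
	(void) tag;
}

/* the object lives on disk: the read can fail, so pass the error up */
static int
open_existing(int obj)
{
	int db, err;

	if ((err = buf_hold(obj, "open", &db)) != 0)
		return (err);
	/* ... use db ... */
	buf_rele(db, "open");
	return (0);
}

/* the object was just created: a failed hold would be a bug */
static void
create_new(int obj)
{
	int db, err;

	err = buf_hold(obj, "create", &db);
	assert(err == 0);		/* VERIFY-style: must succeed */
	/* ... initialize db ... */
	buf_rele(db, "create");
}

int
main(void)
{
	printf("%d\n", open_existing(-1));	/* EIO, propagated */
	create_new(1);
	return (0);
}
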
diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c
index 69acccf493..c70986b853 100644
--- a/usr/src/uts/common/fs/zfs/zfs_acl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -288,25 +287,33 @@ zfs_acl_node_read_internal(znode_t *zp)
/*
* Read an external acl object.
*/
-zfs_acl_t *
-zfs_acl_node_read(znode_t *zp)
+static int
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp)
{
uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
zfs_acl_t *aclp;
+ int error;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- if (zp->z_phys->zp_acl.z_acl_extern_obj == 0)
- return (zfs_acl_node_read_internal(zp));
+ if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
+ *aclpp = zfs_acl_node_read_internal(zp);
+ return (0);
+ }
aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count);
- dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
+ error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl);
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ return (error);
+ }
aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
- return (aclp);
+ *aclpp = aclp;
+ return (0);
}
static boolean_t
@@ -868,15 +875,17 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
int
zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
{
- zfs_acl_t *aclp;
+ zfs_acl_t *aclp = NULL;
int error;
ASSERT(MUTEX_HELD(&zp->z_lock));
mutex_enter(&zp->z_acl_lock);
- aclp = zfs_acl_node_read(zp);
- error = zfs_acl_chmod(zp, mode, aclp, tx);
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error == 0)
+ error = zfs_acl_chmod(zp, mode, aclp, tx);
mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
+ if (aclp)
+ zfs_acl_free(aclp);
return (error);
}
@@ -1047,7 +1056,7 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
if (pull_down) {
mutex_enter(&parent->z_acl_lock);
- paclp = zfs_acl_node_read(parent);
+ VERIFY(0 == zfs_acl_node_read(parent, &paclp));
mutex_exit(&parent->z_acl_lock);
aclp = zfs_acl_inherit(zp, paclp);
zfs_acl_free(paclp);
@@ -1106,7 +1115,12 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
mutex_enter(&zp->z_acl_lock);
- aclp = zfs_acl_node_read(zp);
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
if (mask & VSA_ACECNT) {
vsecp->vsa_aclcnt = aclp->z_acl_count;
@@ -1240,6 +1254,7 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
int mode_wanted = v4_mode;
int cnt;
int i;
+ int error;
int access_deny = ACCESS_UNDETERMINED;
uint_t entry_type;
uid_t uid = crgetuid(cr);
@@ -1257,7 +1272,12 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
mutex_enter(&zp->z_acl_lock);
- aclp = zfs_acl_node_read(zp);
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
zacep = aclp->z_acl;
cnt = aclp->z_acl_count;
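
The zfs_acl.c callers above all follow the same cleanup discipline once zfs_acl_node_read() can fail: initialize the out-pointer to NULL, drop z_acl_lock on every path, and free the ACL only if it was actually allocated. A small userland sketch of that shape (acl_read()/acl_chmod() are made-up names, and a pthread mutex stands in for the kernel mutex):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t acl_lock = PTHREAD_MUTEX_INITIALIZER;

struct acl {
	int count;
};

/* hypothetical reader: allocates *aclpp only when it returns 0 */
static int
acl_read(int obj, struct acl **aclpp)
{
	struct acl *aclp;

	if (obj < 0)
		return (EIO);		/* nothing allocated, nothing to free */
	if ((aclp = malloc(sizeof (*aclp))) == NULL)
		return (ENOMEM);
	aclp->count = obj;
	*aclpp = aclp;
	return (0);
}

static int
acl_chmod(int obj)
{
	struct acl *aclp = NULL;	/* keeps the cleanup below safe */
	int error;

	(void) pthread_mutex_lock(&acl_lock);
	error = acl_read(obj, &aclp);
	if (error == 0) {
		/* ... modify aclp under the lock ... */
	}
	(void) pthread_mutex_unlock(&acl_lock);	/* released on every path */
	if (aclp != NULL)
		free(aclp);
	return (error);
}

int
main(void)
{
	printf("%d %d\n", acl_chmod(3), acl_chmod(-1));
	return (0);
}
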
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
index ebdce10c33..d73315b47d 100644
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -289,6 +289,21 @@ zfs_dq_hexname(char namebuf[17], uint64_t x)
return (name);
}
+/*
+ * Delete Queue Error Handling
+ *
+ * When dealing with the delete queue, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * delete queue, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the delete queue is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the delete queue below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+
void
zfs_dq_add(znode_t *zp, dmu_tx_t *tx)
{
@@ -338,9 +353,9 @@ zfs_purgedir(znode_t *dzp)
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, -1);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -579,10 +594,10 @@ zfs_rmnode(znode_t *zp)
*/
tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
if (xzp) {
dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, TRUE, NULL);
}
if (acl_obj)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
@@ -764,7 +779,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
dmu_tx_abort(tx);
diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c
new file mode 100644
index 0000000000..007445c713
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_fm.c
@@ -0,0 +1,316 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/sysevent.h>
+
+/*
+ * This general routine is responsible for generating all the different ZFS
+ * ereports. The payload is dependent on the class, and which arguments are
+ * supplied to the function:
+ *
+ * EREPORT POOL VDEV IO
+ * block X X X
+ * data X X
+ * device X X
+ * pool X
+ *
+ * If we are in a loading state, all errors are chained together by the same
+ * SPA-wide ENA.
+ *
+ * For isolated I/O requests, we get the ENA from the zio_t. The propagation
+ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
+ * to chain together all ereports associated with a logical piece of data. For
+ * read I/Os, there are basically three 'types' of I/O, which form a roughly
+ * layered diagram:
+ *
+ * +---------------+
+ * | Aggregate I/O | No associated logical data or device
+ * +---------------+
+ * |
+ * V
+ * +---------------+ Reads associated with a piece of logical data.
+ * | Read I/O | This includes reads on behalf of RAID-Z,
+ * +---------------+ mirrors, gang blocks, retries, etc.
+ * |
+ * V
+ * +---------------+ Reads associated with a particular device, but
+ * | Physical I/O | no logical data. Issued as part of vdev caching
+ * +---------------+ and I/O aggregation.
+ *
+ * Note that 'physical I/O' here is not the same terminology as used in the rest
+ * of ZIO. Typically, 'physical I/O' simply means that there is no attached
+ * blockpointer. But I/O with no associated block pointer can still be related
+ * to a logical piece of data (i.e. RAID-Z requests).
+ *
+ * Purely physical I/Os always have unique ENAs. They are not related to a
+ * particular piece of logical data, and therefore cannot be chained together.
+ * We still generate an ereport, but the DE doesn't correlate it with any
+ * logical piece of data. When such an I/O fails, the delegated I/O requests
+ * will issue a retry, which will trigger the 'real' ereport with the correct
+ * ENA.
+ *
+ * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
+ * When a new logical I/O is issued, we set this to point to itself. Child I/Os
+ * then inherit this pointer, so that when it is first set subsequent failures
+ * will use the same ENA. If a physical I/O is issued (by passing the
+ * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
+ * unique ENA will be generated. For an aggregate I/O, this pointer is set to
+ * NULL, and no ereport will be generated (since it doesn't actually correspond
+ * to any particular device or piece of data).
+ */
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+#ifdef _KERNEL
+ nvlist_t *ereport, *detector;
+ uint64_t ena;
+ char class[64];
+
+ /*
+ * If we are doing a spa_tryimport(), ignore errors.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return;
+
+ /*
+ * Ignore any errors from I/Os that we are going to retry anyway - we
+ * only generate errors from the final failure.
+ */
+ if (zio && zio_should_retry(zio))
+ return;
+
+ if ((ereport = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ if ((detector = fm_nvlist_create(NULL)) == NULL) {
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ return;
+ }
+
+ /*
+ * Serialize ereport generation
+ */
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Determine the ENA to use for this event. If we are in a loading
+ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
+ * a root zio-wide ENA. Otherwise, simply use a unique ENA.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE) {
+ if (spa->spa_ena == 0)
+ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
+ ena = spa->spa_ena;
+ } else if (zio != NULL && zio->io_logical != NULL) {
+ if (zio->io_logical->io_ena == 0)
+ zio->io_logical->io_ena =
+ fm_ena_generate(0, FM_ENA_FMT1);
+ ena = zio->io_logical->io_ena;
+ } else {
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+ }
+
+ /*
+ * Construct the full class, detector, and other standard FMA fields.
+ */
+ (void) snprintf(class, sizeof (class), "%s.%s",
+ ZFS_ERROR_CLASS, subclass);
+
+ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
+ vd != NULL ? vd->vdev_guid : 0);
+
+ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
+
+ /*
+ * Construct the per-ereport payload, depending on which parameters are
+ * passed in.
+ */
+
+ /*
+ * Generic payload members common to all ereports.
+ *
+ * The direct reference to spa_name is used rather than spa_name()
+ * because of the asynchronous nature of the zio pipeline. spa_name()
+ * asserts that the config lock is held in some form. This is always
+ * the case in I/O context, but because the check for RW_WRITER compares
+ * against 'curthread', we may be in an asynchronous context and blow
+ * this assert. Rather than loosen this assert, we acknowledge that all
+ * contexts in which this function is called (pool open, I/O) are safe,
+ * and dereference the name directly.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
+ DATA_TYPE_STRING, spa->spa_name, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ DATA_TYPE_UINT64, spa_guid(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
+ spa->spa_load_state, NULL);
+
+ if (vd != NULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ DATA_TYPE_UINT64, vd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
+ if (vd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
+ if (vd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
+ DATA_TYPE_STRING, vd->vdev_devid, NULL);
+
+ if (pvd != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
+ DATA_TYPE_UINT64, pvd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
+ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
+ NULL);
+ if (pvd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
+ if (pvd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
+ DATA_TYPE_STRING, pvd->vdev_devid, NULL);
+ }
+ }
+
+ if (zio != NULL) {
+ /*
+ * Payload common to all I/Os.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ DATA_TYPE_INT32, zio->io_error, NULL);
+
+ /*
+ * If the 'size' parameter is non-zero, it indicates this is a
+ * RAID-Z or other I/O where the physical offset and length are
+ * provided for us, instead of within the zio_t.
+ */
+ if (vd != NULL) {
+ if (size)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, stateoroffset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, size);
+ else
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, zio->io_offset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, zio->io_size);
+ }
+
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zio->io_logical != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_objset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_object,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ DATA_TYPE_INT32,
+ zio->io_logical->io_bookmark.zb_level,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_blkid);
+ } else if (vd != NULL) {
+ /*
+ * If we have a vdev but no zio, this is a device fault, and the
+ * 'stateoroffset' parameter indicates the previous state of the
+ * vdev.
+ */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ DATA_TYPE_UINT64, stateoroffset, NULL);
+ }
+ mutex_exit(&spa->spa_errlist_lock);
+
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+#endif
+}
+
+/*
+ * The 'resource.fs.zfs.ok' event is an internal signal that the associated
+ * resource (pool or disk) has been identified by ZFS as healthy. This will
+ * then trigger the DE to close the associated case, if any.
+ */
+void
+zfs_post_ok(spa_t *spa, vdev_t *vd)
+{
+#ifdef _KERNEL
+ nvlist_t *resource;
+ char class[64];
+
+ if ((resource = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
+ ZFS_ERROR_CLASS, FM_RESOURCE_OK);
+ VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
+ VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
+ if (vd)
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
+
+ fm_ereport_post(resource, EVCH_SLEEP);
+
+ fm_nvlist_destroy(resource, FM_NVA_FREE);
+#endif
+}
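
A tiny userland model of the ENA-chaining rule described at the top of zfs_fm.c: child I/Os that point back at a logical parent share the parent's lazily generated ENA, while purely physical I/Os (io_logical == NULL) always get a fresh one. The struct and helpers below are illustrative stand-ins, not the zio_t interfaces.

#include <stdint.h>
#include <stdio.h>

/* toy request: children point at the logical (root) request */
struct req {
	struct req *logical;	/* NULL for physical/aggregate I/O */
	uint64_t ena;
};

static uint64_t
ena_generate(void)
{
	static uint64_t next = 1;
	return (next++);
}

/* pick an ENA the way zfs_ereport_post() does for the I/O case */
static uint64_t
req_ena(struct req *r)
{
	if (r->logical != NULL) {
		if (r->logical->ena == 0)
			r->logical->ena = ena_generate();
		return (r->logical->ena);	/* chained with siblings */
	}
	return (ena_generate());		/* unique every time */
}

int
main(void)
{
	struct req logical = { &logical, 0 };
	struct req child_a = { &logical, 0 };
	struct req child_b = { &logical, 0 };
	struct req physical = { NULL, 0 };

	/* prints "1 1 2": the children chain, the physical I/O does not */
	printf("%llu %llu %llu\n",
	    (unsigned long long)req_ena(&child_a),
	    (unsigned long long)req_ena(&child_b),
	    (unsigned long long)req_ena(&physical));
	return (0);
}
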
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 29b01e4331..422b24a993 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -297,6 +297,16 @@ zfs_secpolicy_config(const char *unused, const char *unused2, cred_t *cr)
}
/*
+ * Policy for fault injection. Requires all privileges.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(const char *unused, const char *unused2, cred_t *cr)
+{
+ return (secpolicy_zinject(cr));
+}
+
+/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
static int
@@ -368,7 +378,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
return (error);
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
- guid != zc->zc_pool_guid)
+ guid != zc->zc_guid)
error = EINVAL;
else
error = spa_import(zc->zc_name, config,
@@ -396,7 +406,8 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
return (EEXIST);
- VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, 0) == 0);
+ VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE,
+ KM_SLEEP) == 0);
if (size > zc->zc_config_dst_size)
error = ENOMEM;
@@ -420,7 +431,7 @@ zfs_ioc_pool_guid(zfs_cmd_t *zc)
error = spa_open(zc->zc_name, &spa, FTAG);
if (error == 0) {
- zc->zc_pool_guid = spa_guid(spa);
+ zc->zc_guid = spa_guid(spa);
spa_close(spa, FTAG);
}
return (error);
@@ -433,28 +444,37 @@ zfs_ioc_pool_stats(zfs_cmd_t *zc)
char *packed = NULL;
size_t size = 0;
int error;
+ int ret = 0;
- error = spa_get_stats(zc->zc_name, &config);
+ error = spa_get_stats(zc->zc_name, &config, zc->zc_root,
+ sizeof (zc->zc_root));
if (config != NULL) {
VERIFY(nvlist_pack(config, &packed, &size,
- NV_ENCODE_NATIVE, 0) == 0);
+ NV_ENCODE_NATIVE, KM_SLEEP) == 0);
if (size > zc->zc_config_dst_size)
- error = ENOMEM;
+ ret = ENOMEM;
else if (xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
size))
- error = EFAULT;
+ ret = EFAULT;
zc->zc_config_dst_size = size;
kmem_free(packed, size);
nvlist_free(config);
+
+ /*
+ * The config may be present even if 'error' is non-zero.
+ * In this case we return success, and preserve the real errno
+ * in 'zc_cookie'.
+ */
+ zc->zc_cookie = error;
} else {
- ASSERT(error != 0);
+ ret = error;
}
- return (error);
+ return (ret);
}
/*
@@ -479,7 +499,8 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
if (config == NULL)
return (EINVAL);
- VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, 0) == 0);
+ VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE,
+ KM_SLEEP) == 0);
if (size > zc->zc_config_dst_size)
error = ENOMEM;
@@ -554,13 +575,12 @@ static int
zfs_ioc_vdev_online(zfs_cmd_t *zc)
{
spa_t *spa;
- char *path = zc->zc_prop_value;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
- error = vdev_online(spa, path);
+ error = vdev_online(spa, zc->zc_guid);
spa_close(spa, FTAG);
return (error);
}
@@ -569,14 +589,13 @@ static int
zfs_ioc_vdev_offline(zfs_cmd_t *zc)
{
spa_t *spa;
- char *path = zc->zc_prop_value;
int istmp = zc->zc_cookie;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
- error = vdev_offline(spa, path, istmp);
+ error = vdev_offline(spa, zc->zc_guid, istmp);
spa_close(spa, FTAG);
return (error);
}
@@ -585,7 +604,6 @@ static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
spa_t *spa;
- char *path = zc->zc_prop_value;
int replacing = zc->zc_cookie;
nvlist_t *config;
int error;
@@ -595,7 +613,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
return (error);
if ((error = get_config(zc, &config)) == 0) {
- error = spa_vdev_attach(spa, path, config, replacing);
+ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
nvlist_free(config);
}
@@ -607,14 +625,13 @@ static int
zfs_ioc_vdev_detach(zfs_cmd_t *zc)
{
spa_t *spa;
- char *path = zc->zc_prop_value;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
- error = spa_vdev_detach(spa, path, 0, B_FALSE);
+ error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
spa_close(spa, FTAG);
return (error);
@@ -625,7 +642,7 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
{
spa_t *spa;
char *path = zc->zc_prop_value;
- uint64_t guid = zc->zc_pool_guid;
+ uint64_t guid = zc->zc_guid;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
@@ -688,6 +705,8 @@ retry:
if (!error && zc->zc_objset_stats.dds_type == DMU_OST_ZVOL)
error = zvol_get_stats(zc, os);
+ spa_altroot(dmu_objset_spa(os), zc->zc_root, sizeof (zc->zc_root));
+
dmu_objset_close(os);
return (error);
}
@@ -1008,8 +1027,8 @@ zfs_ioc_recvbackup(zfs_cmd_t *zc)
fp = getf(fd);
if (fp == NULL)
return (EBADF);
- error = dmu_recvbackup(&zc->zc_begin_record, &zc->zc_cookie,
- fp->f_vnode, fp->f_offset);
+ error = dmu_recvbackup(zc->zc_filename, &zc->zc_begin_record,
+ &zc->zc_cookie, fp->f_vnode, fp->f_offset);
releasef(fd);
return (error);
}
@@ -1053,6 +1072,110 @@ zfs_ioc_sendbackup(zfs_cmd_t *zc)
return (error);
}
+static int
+zfs_ioc_inject_fault(zfs_cmd_t *zc)
+{
+ int id, error;
+
+ error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
+ &zc->zc_inject_record);
+
+ if (error == 0)
+ zc->zc_guid = (uint64_t)id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear_fault(zfs_cmd_t *zc)
+{
+ return (zio_clear_fault((int)zc->zc_guid));
+}
+
+static int
+zfs_ioc_inject_list_next(zfs_cmd_t *zc)
+{
+ int id = (int)zc->zc_guid;
+ int error;
+
+ error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
+ &zc->zc_inject_record);
+
+ zc->zc_guid = id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_error_log(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ size_t count = (size_t)zc->zc_config_dst_size;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_config_dst,
+ &count);
+ if (error == 0)
+ zc->zc_config_dst_size = count;
+ else
+ zc->zc_config_dst_size = spa_get_errlog_size(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ vdev_t *vd;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ if (zc->zc_prop_value[0] == '\0')
+ vd = NULL;
+ else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) {
+ spa_config_exit(spa, FTAG);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
+
+ vdev_clear(spa, vd);
+
+ spa_config_exit(spa, FTAG);
+
+ spa_close(spa, FTAG);
+
+ return (0);
+}
+
+static int
+zfs_ioc_bookmark_name(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_bookmark_name(spa, &zc->zc_bookmark,
+ zc->zc_prop_name, sizeof (zc->zc_prop_name), zc->zc_prop_value,
+ sizeof (zc->zc_prop_value), zc->zc_filename,
+ sizeof (zc->zc_filename));
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
{ zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name },
@@ -1087,6 +1210,12 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_rename, zfs_secpolicy_write, dataset_name },
{ zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name },
{ zfs_ioc_sendbackup, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name },
+ { zfs_ioc_clear, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_bookmark_name, zfs_secpolicy_inject, pool_name }
};
static int
@@ -1279,7 +1408,7 @@ _fini(void)
{
int error;
- if (spa_busy() || zfs_busy() || zvol_busy())
+ if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
return (EBUSY);
if ((error = mod_remove(&modlinkage)) != 0)
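
The new entries above slot into zfs_ioc_vec[], where each ioctl is described by a handler plus a security-policy callback that runs before it (the fault-injection ioctls use the new zfs_secpolicy_inject). A self-contained sketch of that table-driven dispatch, with made-up handler and policy names:

#include <errno.h>
#include <stdio.h>

typedef int (*ioc_func_t)(int arg);
typedef int (*ioc_policy_t)(void);

typedef struct ioc_vec {
	ioc_func_t	zvec_func;
	ioc_policy_t	zvec_secpolicy;
} ioc_vec_t;

static int policy_any(void) { return (0); }
static int policy_privileged(void) { return (EPERM); }	/* pretend non-root */

static int ioc_stats(int arg) { (void) arg; return (0); }
static int ioc_inject(int arg) { (void) arg; return (0); }

static ioc_vec_t ioc_vec[] = {
	{ ioc_stats, policy_any },
	{ ioc_inject, policy_privileged },
};

static int
ioc_dispatch(unsigned int cmd, int arg)
{
	int error;

	if (cmd >= sizeof (ioc_vec) / sizeof (ioc_vec[0]))
		return (EINVAL);
	/* run the per-ioctl policy check before the handler */
	if ((error = ioc_vec[cmd].zvec_secpolicy()) != 0)
		return (error);
	return (ioc_vec[cmd].zvec_func(arg));
}

int
main(void)
{
	/* prints "0 1": the privileged ioctl is rejected with EPERM */
	printf("%d %d\n", ioc_dispatch(0, 0), ioc_dispatch(1, 0) == EPERM);
	return (0);
}
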
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 17771b2e26..68a3e414eb 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -52,6 +52,7 @@
#include <sys/modctl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_ctldir.h>
+#include <sys/bootconf.h>
#include <sys/sunddi.h>
#include <sys/dnlc.h>
@@ -61,8 +62,11 @@ static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t zfs_dev_mtx;
+extern char zfs_bootpath[BO_MAXOBJNAME];
+
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
+static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
@@ -71,6 +75,7 @@ static void zfs_objset_close(zfsvfs_t *zfsvfs);
static const fs_operation_def_t zfs_vfsops_template[] = {
VFSNAME_MOUNT, zfs_mount,
+ VFSNAME_MOUNTROOT, zfs_mountroot,
VFSNAME_UNMOUNT, zfs_umount,
VFSNAME_ROOT, zfs_root,
VFSNAME_STATVFS, zfs_statvfs,
@@ -150,6 +155,58 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
return (0);
}
+static int
+zfs_create_unique_device(dev_t *dev)
+{
+ major_t new_major;
+
+ do {
+ ASSERT3U(zfs_minor, <=, MAXMIN32);
+ minor_t start = zfs_minor;
+ do {
+ mutex_enter(&zfs_dev_mtx);
+ if (zfs_minor >= MAXMIN32) {
+ /*
+ * If we're still using the real major number,
+ * keep out of /dev/zfs and /dev/zvol minor
+ * number space. If we're using a getudev()'ed
+ * major number, we can use all of its minors.
+ */
+ if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
+ zfs_minor = ZFS_MIN_MINOR;
+ else
+ zfs_minor = 0;
+ } else {
+ zfs_minor++;
+ }
+ *dev = makedevice(zfs_major, zfs_minor);
+ mutex_exit(&zfs_dev_mtx);
+ } while (vfs_devismounted(*dev) && zfs_minor != start);
+ if (zfs_minor == start) {
+ /*
+ * We are using all ~262,000 minor numbers for the
+ * current major number. Create a new major number.
+ */
+ if ((new_major = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN,
+ "zfs_mount: Can't get unique major "
+ "device number.");
+ return (-1);
+ }
+ mutex_enter(&zfs_dev_mtx);
+ zfs_major = new_major;
+ zfs_minor = 0;
+
+ mutex_exit(&zfs_dev_mtx);
+ } else {
+ break;
+ }
+ /* CONSTANTCONDITION */
+ } while (1);
+
+ return (0);
+}
+
static void
atime_changed_cb(void *arg, uint64_t newval)
{
@@ -271,110 +328,182 @@ acl_inherit_changed_cb(void *arg, uint64_t newval)
zfsvfs->z_acl_inherit = newval;
}
-/*ARGSUSED*/
static int
-zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+zfs_refresh_properties(vfs_t *vfsp)
{
- zfsvfs_t *zfsvfs = NULL;
- znode_t *zp = NULL;
- vnode_t *vp = NULL;
- objset_t *os = NULL;
- struct dsl_dataset *ds;
- char *osname;
- uint64_t readonly, recordsize;
- pathname_t spn;
- dev_t mount_dev;
- major_t new_major;
- int mode;
- int error = 0;
- uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
- UIO_SYSSPACE : UIO_USERSPACE;
- int canwrite;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
- if (mvp->v_type != VDIR)
- return (ENOTDIR);
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ if (dmu_objset_is_snapshot(zfsvfs->z_os))
+ return (EROFS);
+ readonly_changed_cb(zfsvfs, B_FALSE);
+ }
- mutex_enter(&mvp->v_lock);
- if ((uap->flags & MS_REMOUNT) == 0 &&
- (uap->flags & MS_OVERLAY) == 0 &&
- (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
- mutex_exit(&mvp->v_lock);
- return (EBUSY);
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ devices_changed_cb(zfsvfs, B_FALSE);
+ setuid_changed_cb(zfsvfs, B_FALSE);
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+ devices_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+ devices_changed_cb(zfsvfs, B_TRUE);
+
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+ setuid_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+ setuid_changed_cb(zfsvfs, B_TRUE);
}
- mutex_exit(&mvp->v_lock);
- /*
- * ZFS does not support passing unparsed data in via MS_DATA.
- * Users should use the MS_OPTIONSTR interface; this means
- * that all option parsing is already done and the options struct
- * can be interrogated.
- */
- if ((uap->flags & MS_DATA) && uap->datalen > 0)
- return (EINVAL);
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+ exec_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+ exec_changed_cb(zfsvfs, B_TRUE);
+
+ return (0);
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ int do_readonly = FALSE, readonly;
+ int do_setuid = FALSE, setuid;
+ int do_exec = FALSE, exec;
+ int do_devices = FALSE, devices;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
/*
- * When doing a remount, we simply refresh our temporary properties
- * according to those options set in the current VFS options.
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
*/
- if (uap->flags & MS_REMOUNT) {
- zfsvfs = vfsp->vfs_data;
-
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
- readonly_changed_cb(zfsvfs, B_TRUE);
- else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
- if (dmu_objset_is_snapshot(zfsvfs->z_os))
- return (EROFS);
- readonly_changed_cb(zfsvfs, B_FALSE);
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ readonly = B_TRUE;
+ do_readonly = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ readonly = B_FALSE;
+ do_readonly = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ devices = B_FALSE;
+ setuid = B_FALSE;
+ do_devices = B_TRUE;
+ do_setuid = B_TRUE;
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
+ devices = B_FALSE;
+ do_devices = B_TRUE;
+ } else if (vfs_optionisset(vfsp,
+ MNTOPT_DEVICES, NULL)) {
+ devices = B_TRUE;
+ do_devices = B_TRUE;
}
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- devices_changed_cb(zfsvfs, B_FALSE);
- setuid_changed_cb(zfsvfs, B_FALSE);
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
- devices_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
- devices_changed_cb(zfsvfs, B_TRUE);
-
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
- setuid_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
- setuid_changed_cb(zfsvfs, B_TRUE);
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+ setuid = B_TRUE;
+ do_setuid = B_TRUE;
}
-
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
- exec_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
- exec_changed_cb(zfsvfs, B_TRUE);
-
- return (0);
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+ exec = B_FALSE;
+ do_exec = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+ exec = B_TRUE;
+ do_exec = B_TRUE;
}
/*
- * Get the objset name (the "special" mount argument).
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
*/
- if (error = pn_get(uap->spec, fromspace, &spn))
- return (error);
+ ds = dmu_objset_ds(os);
+ error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "recordsize", blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "readonly", readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "devices", devices_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "setuid", setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "exec", exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "snapdir", snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "aclmode", acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "aclinherit", acl_inherit_changed_cb, zfsvfs);
+ if (error)
+ goto unregister;
- osname = spn.pn_path;
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (do_readonly)
+ readonly_changed_cb(zfsvfs, readonly);
+ if (do_setuid)
+ setuid_changed_cb(zfsvfs, setuid);
+ if (do_exec)
+ exec_changed_cb(zfsvfs, exec);
+ if (do_devices)
+ devices_changed_cb(zfsvfs, devices);
- if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
- goto out;
+ return (0);
+unregister:
/*
- * Refuse to mount a filesystem if we are in a local zone and the
- * dataset is not visible.
+ * We may attempt to unregister some callbacks that are not
+ * registered, but this is OK; it will simply return ENOMSG,
+ * which we will ignore.
*/
- if (!INGLOBALZONE(curproc) &&
- (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
- error = EPERM;
- goto out;
- }
+ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
+ zfsvfs);
+ return (error);
+
+}
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
+{
+ dev_t mount_dev;
+ uint64_t recordsize, readonly;
+ int error = 0;
+ int mode;
+ zfsvfs_t *zfsvfs;
+ znode_t *zp = NULL;
+
+ ASSERT(vfsp);
+ ASSERT(osname);
/*
* Initialize the zfs-specific filesystem structure.
* Should probably make this a kmem cache, shuffle fields,
- * and just bzero upto z_hold_mtx[].
+ * and just bzero up to z_hold_mtx[].
*/
zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
zfsvfs->z_vfs = vfsp;
@@ -388,63 +517,19 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
offsetof(znode_t, z_link_node));
rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
- /*
- * Initialize the generic filesystem structure.
- */
+ /* Initialize the generic filesystem structure. */
vfsp->vfs_bcount = 0;
vfsp->vfs_data = NULL;
- /*
- * Create a unique device for the mount.
- */
- do {
- ASSERT3U(zfs_minor, <=, MAXMIN32);
- minor_t start = zfs_minor;
- do {
- mutex_enter(&zfs_dev_mtx);
- if (zfs_minor >= MAXMIN32) {
- /*
- * If we're still using the real major number,
- * keep out of /dev/zfs and /dev/zvol minor
- * number space. If we're using a getudev()'ed
- * major number, we can use all of its minors.
- */
- if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
- zfs_minor = ZFS_MIN_MINOR;
- else
- zfs_minor = 0;
- } else {
- zfs_minor++;
- }
- mount_dev = makedevice(zfs_major, zfs_minor);
- mutex_exit(&zfs_dev_mtx);
- } while (vfs_devismounted(mount_dev) && zfs_minor != start);
- if (zfs_minor == start) {
- /*
- * We are using all ~262,000 minor numbers
- * for the current major number. Create a
- * new major number.
- */
- if ((new_major = getudev()) == (major_t)-1) {
- cmn_err(CE_WARN,
- "zfs_mount: Can't get unique"
- " major device number.");
- goto out;
- }
- mutex_enter(&zfs_dev_mtx);
- zfs_major = new_major;
- zfs_minor = 0;
- mutex_exit(&zfs_dev_mtx);
- } else {
- break;
- }
- /* CONSTANTCONDITION */
- } while (1);
-
+ if (zfs_create_unique_device(&mount_dev) == -1) {
+ error = ENODEV;
+ goto out;
+ }
ASSERT(vfs_devismounted(mount_dev) == 0);
- if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0)
- recordsize = SPA_MAXBLOCKSIZE;
+ if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
+ NULL))
+ goto out;
vfsp->vfs_dev = mount_dev;
vfsp->vfs_fstype = zfsfstype;
@@ -452,8 +537,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
vfsp->vfs_flag |= VFS_NOTRUNC;
vfsp->vfs_data = zfsvfs;
- error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL);
- if (error)
+ if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
goto out;
if (readonly)
@@ -467,7 +551,6 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
&zfsvfs->z_os);
}
- os = zfsvfs->z_os;
if (error)
goto out;
@@ -475,16 +558,18 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
if (error = zfs_init_fs(zfsvfs, &zp, cr))
goto out;
- if (dmu_objset_is_snapshot(os)) {
+ /* The call to zfs_init_fs leaves the vnode held; release it here. */
+ VN_RELE(ZTOV(zp));
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
ASSERT(mode & DS_MODE_READONLY);
atime_changed_cb(zfsvfs, B_FALSE);
readonly_changed_cb(zfsvfs, B_TRUE);
zfsvfs->z_issnap = B_TRUE;
} else {
- int do_readonly = FALSE, readonly;
- int do_setuid = FALSE, setuid;
- int do_exec = FALSE, exec;
- int do_devices = FALSE, devices;
+ error = zfs_register_callbacks(vfsp);
+ if (error)
+ goto out;
/*
* Start a delete thread running.
@@ -494,119 +579,216 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
/*
* Parse and replay the intent log.
*/
- zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector,
- (void (*)(void *))zfs_delete_wait_empty);
+ zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
+ zfs_replay_vector, (void (*)(void *))zfs_delete_wait_empty);
if (!zil_disable)
- zfsvfs->z_log = zil_open(os, zfs_get_data);
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+ }
- /*
- * The act of registering our callbacks will destroy any mount
- * options we may have. In order to enable temporary overrides
- * of mount options, we stash away the current values and
- * restore them after we register the callbacks.
- */
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
- readonly = B_TRUE;
- do_readonly = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
- readonly = B_FALSE;
- do_readonly = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- devices = B_FALSE;
- setuid = B_FALSE;
- do_devices = B_TRUE;
- do_setuid = B_TRUE;
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
- devices = B_FALSE;
- do_devices = B_TRUE;
- } else if (vfs_optionisset(vfsp,
- MNTOPT_DEVICES, NULL)) {
- devices = B_TRUE;
- do_devices = B_TRUE;
- }
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ if (zfsvfs->z_os)
+ dmu_objset_close(zfsvfs->z_os);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ } else {
+ atomic_add_32(&zfs_active_fs_count, 1);
+ }
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
- setuid = B_TRUE;
- do_setuid = B_TRUE;
- }
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
- exec = B_FALSE;
- do_exec = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
- exec = B_TRUE;
- do_exec = B_TRUE;
- }
+ return (error);
- /*
- * Register property callbacks.
- */
+}
+
+void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+ struct dsl_dataset *ds;
+
+ /*
+ * Unregister properties.
+ */
+ if (!dmu_objset_is_snapshot(os)) {
ds = dmu_objset_ds(os);
- VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "aclinherit",
+ VERIFY(dsl_prop_unregister(ds, "aclinherit",
acl_inherit_changed_cb, zfsvfs) == 0);
+ }
+}
+static int
+zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
+{
+ int error = 0;
+ int ret = 0;
+ static int zfsrootdone = 0;
+ zfsvfs_t *zfsvfs = NULL;
+ znode_t *zp = NULL;
+ vnode_t *vp = NULL;
+
+ ASSERT(vfsp);
+
+ /*
+ * The filesystem that we mount as root is defined in
+ * /etc/system using the zfsroot variable. The value defined
+ * there is copied early in startup code to zfs_bootpath
+ * (defined in modsysfile.c).
+ */
+ if (why == ROOT_INIT) {
+ if (zfsrootdone++)
+ return (EBUSY);
/*
- * Invoke our callbacks to restore temporary mount options.
+ * This needs to be done here, so that when we return from
+ * mountroot, the vfs resource name will be set correctly.
*/
- if (do_readonly)
- readonly_changed_cb(zfsvfs, readonly);
- if (do_setuid)
- setuid_changed_cb(zfsvfs, setuid);
- if (do_exec)
- exec_changed_cb(zfsvfs, exec);
- if (do_devices)
- devices_changed_cb(zfsvfs, devices);
- }
+ if (snprintf(rootfs.bo_name, BO_MAXOBJNAME, "%s", zfs_bootpath)
+ >= BO_MAXOBJNAME)
+ return (ENAMETOOLONG);
- vp = ZTOV(zp);
- if (!zfsvfs->z_issnap)
- zfsctl_create(zfsvfs);
-out:
- if (error) {
- if (zp)
- VN_RELE(vp);
+ if (error = vfs_lock(vfsp))
+ return (error);
- if (zfsvfs) {
- if (os)
- dmu_objset_close(os);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- }
- } else {
- atomic_add_32(&zfs_active_fs_count, 1);
+ if (error = zfs_domount(vfsp, zfs_bootpath, CRED()))
+ goto out;
+
+ zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp))
+ goto out;
+
+ vp = ZTOV(zp);
+ mutex_enter(&vp->v_lock);
+ vp->v_flag |= VROOT;
+ mutex_exit(&vp->v_lock);
+ rootvp = vp;
+
+ /*
+ * The zfs_zget call above returns with a hold on vp; we release
+ * it here.
+ */
VN_RELE(vp);
+
+ /*
+ * Mount root as readonly initially; it will be remounted
+ * read/write by /lib/svc/method/fs-usr.
+ */
+ readonly_changed_cb(vfsp->vfs_data, B_TRUE);
+ vfs_add((struct vnode *)0, vfsp,
+ (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
+out:
+ vfs_unlock(vfsp);
+ ret = (error) ? error : 0;
+ return (ret);
+
+ } else if (why == ROOT_REMOUNT) {
+
+ readonly_changed_cb(vfsp->vfs_data, B_FALSE);
+ vfsp->vfs_flag |= VFS_REMOUNT;
+ return (zfs_refresh_properties(vfsp));
+
+ } else if (why == ROOT_UNMOUNT) {
+ zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
+ (void) zfs_sync(vfsp, 0, 0);
+ return (0);
+ }
+
+ /*
+ * if "why" is equal to anything else other than ROOT_INIT,
+ * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
+ */
+ return (ENOTSUP);
+}
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ char *osname;
+ pathname_t spn;
+ int error = 0;
+ uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
+ UIO_SYSSPACE : UIO_USERSPACE;
+ int canwrite;
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_REMOUNT) == 0 &&
+ (uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * ZFS does not support passing unparsed data in via MS_DATA.
+ * Users should use the MS_OPTIONSTR interface; this means
+ * that all option parsing is already done and the options struct
+ * can be interrogated.
+ */
+ if ((uap->flags & MS_DATA) && uap->datalen > 0)
+ return (EINVAL);
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (uap->flags & MS_REMOUNT) {
+ return (zfs_refresh_properties(vfsp));
}
+ /*
+ * Get the objset name (the "special" mount argument).
+ */
+ if (error = pn_get(uap->spec, fromspace, &spn))
+ return (error);
+
+ osname = spn.pn_path;
+
+ if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+ goto out;
+
+ /*
+ * Refuse to mount a filesystem if we are in a local zone and the
+ * dataset is not visible.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+ error = EPERM;
+ goto out;
+ }
+
+ error = zfs_domount(vfsp, osname, cr);
+
+out:
pn_free(&spn);
return (error);
}
@@ -739,9 +921,6 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
return (0);
}
-
- zfs_zcache_flush(zfsvfs);
-
/*
* Stop all delete threads.
*/
@@ -866,7 +1045,6 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
zfs_delete_t *zd = &zfsvfs->z_delete_head;
znode_t *zp, *nextzp;
objset_t *os = zfsvfs->z_os;
- struct dsl_dataset *ds;
/*
* Stop all delete threads.
@@ -881,8 +1059,6 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
*/
rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
- zfs_zcache_flush(zfsvfs);
-
/*
* Release all delete in progress znodes
* They will be processed when the file system remounts.
@@ -891,7 +1067,7 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
while (zp = list_head(&zd->z_znodes)) {
list_remove(&zd->z_znodes, zp);
zp->z_dbuf_held = 0;
- dmu_buf_rele(zp->z_dbuf);
+ dmu_buf_rele(zp->z_dbuf, NULL);
}
mutex_exit(&zd->z_mutex);
@@ -911,7 +1087,7 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
/* dbufs should only be held when force unmounting */
zp->z_dbuf_held = 0;
mutex_exit(&zfsvfs->z_znodes_lock);
- dmu_buf_rele(zp->z_dbuf);
+ dmu_buf_rele(zp->z_dbuf, NULL);
/* Start again */
mutex_enter(&zfsvfs->z_znodes_lock);
nextzp = list_head(&zfsvfs->z_all_znodes);
@@ -922,36 +1098,8 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
/*
* Unregister properties.
*/
- if (!dmu_objset_is_snapshot(os)) {
- ds = dmu_objset_ds(os);
-
- VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclinherit",
- acl_inherit_changed_cb, zfsvfs) == 0);
- }
+ if (!dmu_objset_is_snapshot(os))
+ zfs_unregister_callbacks(zfsvfs);
/*
* Make the dmu drop all it dbuf holds so that zfs_inactive
@@ -977,6 +1125,11 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
}
/*
+ * Evict all dbufs so that cached znodes will be freed
+ */
+ dmu_objset_evict_dbufs(os);
+
+ /*
* Finally close the objset
*/
dmu_objset_close(os);
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index da5b41101a..2b9da086cc 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -229,6 +229,14 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
case _FIOFFS:
return (zfs_sync(vp->v_vfsp, 0, cred));
+ /*
+	 * The following two ioctls are used by bfu.  Faking them out by
+	 * returning success is necessary to avoid bfu errors.
+ */
+ case _FIOGDIO:
+ case _FIOSDIO:
+ return (0);
+
case _FIO_SEEK_DATA:
case _FIO_SEEK_HOLE:
if (ddi_copyin((void *)data, &off, sizeof (off), flag))
@@ -436,12 +444,10 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
n = MIN(zfs_read_chunk_size,
zp->z_phys->zp_size - uio->uio_loffset);
n = MIN(n, cnt);
- dbpp = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
- uio->uio_loffset, n, &numbufs);
- if (error = dmu_buf_read_array_canfail(dbpp, numbufs)) {
- dmu_buf_rele_array(dbpp, numbufs);
+ error = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
+ uio->uio_loffset, n, TRUE, FTAG, &numbufs, &dbpp);
+ if (error)
goto out;
- }
/*
* Compute the adjustment to align the dmu buffers
* with the uio buffer.
@@ -467,7 +473,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
(n < size ? n : size), UIO_READ, uio);
}
if (error) {
- dmu_buf_rele_array(dbpp, numbufs);
+ dmu_buf_rele_array(dbpp, numbufs, FTAG);
goto out;
}
n -= dbp->db_size;
@@ -476,7 +482,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
delta = 0;
}
}
- dmu_buf_rele_array(dbpp, numbufs);
+ dmu_buf_rele_array(dbpp, numbufs, FTAG);
}
out:
rw_exit(&zp->z_grow_lock);
@@ -850,10 +856,10 @@ zfs_get_data(void *arg, lr_write_t *lr)
*/
if (sizeof (lr_write_t) + dlen <= reclen) { /* immediate write */
rw_enter(&zp->z_grow_lock, RW_READER);
- dmu_buf_t *db = dmu_buf_hold(os, lr->lr_foid, off);
- dmu_buf_read(db);
+ dmu_buf_t *db;
+ VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, off, FTAG, &db));
bcopy((char *)db->db_data + off - db->db_offset, lr + 1, dlen);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
rw_exit(&zp->z_grow_lock);
} else {
/*
@@ -1071,7 +1077,7 @@ top:
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, SPA_MAXBLOCKSIZE);
@@ -1266,7 +1272,7 @@ top:
* allow for either case.
*/
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, -1);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_bonus(tx, zp->z_id);
if (may_delete_now)
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
@@ -1289,7 +1295,7 @@ top:
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
/* charge as an update -- would be nice not to charge at all */
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
@@ -1427,8 +1433,8 @@ top:
* Add a new entry to the directory.
*/
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, SPA_MAXBLOCKSIZE);
@@ -1534,9 +1540,9 @@ top:
rw_enter(&zp->z_parent_lock, RW_WRITER);
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
dmu_tx_abort(tx);
@@ -2059,8 +2065,7 @@ top:
have_grow_lock = TRUE;
if (off < zp->z_phys->zp_size)
dmu_tx_hold_free(tx, zp->z_id, off, DMU_OBJECT_END);
- else if (zp->z_phys->zp_size &&
- zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz)
+ else if (zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz)
/* we will rewrite this block if we grow */
dmu_tx_hold_write(tx, zp->z_id, 0, zp->z_phys->zp_size);
}
@@ -2419,17 +2424,13 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
- if (sdzp != tdzp) {
- dmu_tx_hold_zap(tx, sdzp->z_id, 1);
- dmu_tx_hold_zap(tx, tdzp->z_id, 1);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp)
dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
- } else {
- dmu_tx_hold_zap(tx, sdzp->z_id, 2);
- }
- if (tzp) {
- dmu_tx_hold_bonus(tx, tzp->z_id); /* nlink changes */
- }
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ if (tzp)
+ dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
dmu_tx_abort(tx);
@@ -2532,7 +2533,7 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
@@ -2569,12 +2570,12 @@ top:
if (error)
goto out;
- dbp = dmu_buf_hold(zfsvfs->z_os, zoid, 0);
+ VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
dmu_buf_will_dirty(dbp, tx);
ASSERT3U(len, <=, dbp->db_size);
bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp);
+ dmu_buf_rele(dbp, FTAG);
}
zp->z_phys->zp_size = len;
@@ -2631,15 +2632,15 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
error = uiomove(zp->z_phys + 1,
MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
} else {
- dmu_buf_t *dbp = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0);
- if ((error = dmu_buf_read_canfail(dbp)) != 0) {
- dmu_buf_rele(dbp);
+ dmu_buf_t *dbp;
+ error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
+ if (error) {
ZFS_EXIT(zfsvfs);
return (error);
}
error = uiomove(dbp->db_data,
MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- dmu_buf_rele(dbp);
+ dmu_buf_rele(dbp, FTAG);
}
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
@@ -2732,7 +2733,7 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, szp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
dmu_tx_abort(tx);
@@ -2921,8 +2922,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr)
B_INVAL, cr);
}
+ mutex_enter(&zp->z_lock);
vp->v_count = 0; /* count arrives as 1 */
- zfs_znode_free(zp);
+ if (zp->z_dbuf == NULL) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_free(zp);
+ } else {
+ mutex_exit(&zp->z_lock);
+ }
rw_exit(&zfsvfs->z_um_lock);
VFS_RELE(zfsvfs->z_vfs);
return;
@@ -2986,27 +2993,21 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint_t cnt = 1;
int error;
ZFS_ENTER(zfsvfs);
/*
- * If file is being mapped, disallow frlock. We set the mapcnt to
- * -1 here to signal that we are in the process of setting a lock.
- * This prevents a race with zfs_map().
- * XXX - well, sort of; since zfs_map() does not change z_mapcnt,
- * we could be in the middle of zfs_map() and still call fs_frlock().
- * Also, we are doing no checking in zfs_addmap() (where z_mapcnt
- * *is* manipulated).
+ * We are following the UFS semantics with respect to mapcnt
+ * here: If we see that the file is mapped already, then we will
+ * return an error, but we don't worry about races between this
+ * function and zfs_map().
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
- (int)(cnt = atomic_cas_32(&zp->z_mapcnt, 0, -1)) > 0) {
+ if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) {
ZFS_EXIT(zfsvfs);
return (EAGAIN);
}
error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr);
- ASSERT((cnt != 0) || ((int)atomic_cas_32(&zp->z_mapcnt, -1, 0) == -1));
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -3074,7 +3075,7 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
ASSERT(io_off == cur_pp->p_offset);
va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
- err = dmu_read_canfail(os, oid, io_off, PAGESIZE, va);
+ err = dmu_read(os, oid, io_off, PAGESIZE, va);
ppmapout(va);
if (err) {
/* On error, toss the entire kluster */
@@ -3241,6 +3242,20 @@ out:
return (err);
}
+/*
+ * Request a memory map for a section of a file. This code interacts
+ * with common code and the VM system as follows:
+ *
+ * common code calls mmap(), which ends up in smmap_common()
+ *
+ * this calls VOP_MAP(), which takes you into (say) zfs
+ *
+ * zfs_map() calls as_map(), passing segvn_create() as the callback
+ *
+ * segvn_create() creates the new segment and calls VOP_ADDMAP()
+ *
+ * zfs_addmap() updates z_mapcnt
+ */
static int
zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
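
Aside (not part of the patch): to make the VOP chain described in the comment above concrete, here is a hedged sketch of the z_mapcnt page accounting that zfs_addmap() and zfs_delmap() perform with the new 64-bit counter; the helper name is invented.

/*
 * Illustrative sketch only: the z_mapcnt bookkeeping done by zfs_addmap()
 * and zfs_delmap() below.  The mapping length is converted to whole pages
 * with btopr() and added to or subtracted from the 64-bit counter.
 */
static void
example_mapcnt_update(znode_t *zp, size_t len, boolean_t add)
{
	uint64_t pages = btopr(len);

	if (add) {
		atomic_add_64(&zp->z_mapcnt, pages);
	} else {
		ASSERT3U(zp->z_mapcnt, >=, pages);
		atomic_add_64(&zp->z_mapcnt, -pages);
	}
}
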
@@ -3269,15 +3284,10 @@ zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
/*
* If file is locked, disallow mapping.
- * XXX - since we don't modify z_mapcnt here, there is nothing
- * to stop a file lock being placed immediately after we complete
- * this check.
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
- if (vn_has_flocks(vp) || zp->z_mapcnt == -1) {
- ZFS_EXIT(zfsvfs);
- return (EAGAIN);
- }
+ if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
+ ZFS_EXIT(zfsvfs);
+ return (EAGAIN);
}
as_rangelock(as);
@@ -3318,11 +3328,9 @@ static int
zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
{
- /*
- * XXX - shouldn't we be checking for file locks here?
- */
- ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0);
- atomic_add_32(&VTOZ(vp)->z_mapcnt, btopr(len));
+ uint64_t pages = btopr(len);
+
+ atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
return (0);
}
@@ -3331,8 +3339,10 @@ static int
zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
{
- atomic_add_32(&VTOZ(vp)->z_mapcnt, -btopr(len));
- ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0);
+ uint64_t pages = btopr(len);
+
+ ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
+ atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 7eb3a2410d..3fd338940e 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -55,251 +54,6 @@
struct kmem_cache *znode_cache = NULL;
-/*
- * Note that znodes can be on one of 2 states:
- * ZCACHE_mru - recently used, currently cached
- * ZCACHE_mfu - frequently used, currently cached
- * When there are no active references to the znode, they
- * are linked onto one of the lists in zcache. These are the
- * only znodes that can be evicted.
- */
-
-typedef struct zcache_state {
- list_t list; /* linked list of evictable znodes in state */
- uint64_t lcnt; /* total number of znodes in the linked list */
- uint64_t cnt; /* total number of all znodes in this state */
- uint64_t hits;
- kmutex_t mtx;
-} zcache_state_t;
-
-/* The 2 states: */
-static zcache_state_t ZCACHE_mru;
-static zcache_state_t ZCACHE_mfu;
-
-static struct zcache {
- zcache_state_t *mru;
- zcache_state_t *mfu;
- uint64_t p; /* Target size of mru */
- uint64_t c; /* Target size of cache */
- uint64_t c_max; /* Maximum target cache size */
-
- /* performance stats */
- uint64_t missed;
- uint64_t evicted;
- uint64_t skipped;
-} zcache;
-
-void zcache_kmem_reclaim(void);
-
-#define ZCACHE_MINTIME (hz>>4) /* 62 ms */
-
-/*
- * Move the supplied znode to the indicated state. The mutex
- * for the znode must be held by the caller.
- */
-static void
-zcache_change_state(zcache_state_t *new_state, znode_t *zp)
-{
- /* ASSERT(MUTEX_HELD(hash_mtx)); */
- ASSERT(zp->z_active);
-
- if (zp->z_zcache_state) {
- ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
- atomic_add_64(&zp->z_zcache_state->cnt, -1);
- }
- atomic_add_64(&new_state->cnt, 1);
- zp->z_zcache_state = new_state;
-}
-
-static void
-zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ASSERT(zp->z_phys);
- ASSERT(zp->z_dbuf_held);
-
- zp->z_dbuf_held = 0;
- mutex_exit(&zp->z_lock);
- dmu_buf_rele(zp->z_dbuf);
- mutex_exit(hash_mtx);
- VFS_RELE(zfsvfs->z_vfs);
-}
-
-/*
- * Evict znodes from list until we've removed the specified number
- */
-static void
-zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs)
-{
- int znodes_evicted = 0;
- znode_t *zp, *zp_prev;
- kmutex_t *hash_mtx;
-
- ASSERT(state == zcache.mru || state == zcache.mfu);
-
- mutex_enter(&state->mtx);
-
- for (zp = list_tail(&state->list); zp; zp = zp_prev) {
- zp_prev = list_prev(&state->list, zp);
- if (zfsvfs && zp->z_zfsvfs != zfsvfs)
- continue;
- hash_mtx = ZFS_OBJ_MUTEX(zp);
- if (mutex_tryenter(hash_mtx)) {
- mutex_enter(&zp->z_lock);
- list_remove(&zp->z_zcache_state->list, zp);
- zp->z_zcache_state->lcnt -= 1;
- ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
- atomic_add_64(&zp->z_zcache_state->cnt, -1);
- zp->z_zcache_state = NULL;
- zp->z_zcache_access = 0;
- /* drops z_lock and hash_mtx */
- zfs_zcache_evict(zp, hash_mtx);
- znodes_evicted += 1;
- atomic_add_64(&zcache.evicted, 1);
- if (znodes_evicted >= cnt)
- break;
- } else {
- atomic_add_64(&zcache.skipped, 1);
- }
- }
- mutex_exit(&state->mtx);
-
- if (znodes_evicted < cnt)
- dprintf("only evicted %lld znodes from %x",
- (longlong_t)znodes_evicted, state);
-}
-
-static void
-zcache_adjust(void)
-{
- uint64_t mrucnt = zcache.mru->lcnt;
- uint64_t mfucnt = zcache.mfu->lcnt;
- uint64_t p = zcache.p;
- uint64_t c = zcache.c;
-
- if (mrucnt > p)
- zcache_evict_state(zcache.mru, mrucnt - p, NULL);
-
- if (mfucnt > 0 && mrucnt + mfucnt > c) {
- int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c);
- zcache_evict_state(zcache.mfu, toevict, NULL);
- }
-}
-
-/*
- * Flush all *evictable* data from the cache.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
-void
-zfs_zcache_flush(zfsvfs_t *zfsvfs)
-{
- zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs);
- zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs);
-}
-
-static void
-zcache_try_grow(int64_t cnt)
-{
- int64_t size;
- /*
- * If we're almost to the current target cache size,
- * increment the target cache size
- */
- size = zcache.mru->lcnt + zcache.mfu->lcnt;
- if ((zcache.c - size) <= 1) {
- atomic_add_64(&zcache.c, cnt);
- if (zcache.c > zcache.c_max)
- zcache.c = zcache.c_max;
- else if (zcache.p + cnt < zcache.c)
- atomic_add_64(&zcache.p, cnt);
- }
-}
-
-/*
- * This routine is called whenever a znode is accessed.
- */
-static void
-zcache_access(znode_t *zp, kmutex_t *hash_mtx)
-{
- ASSERT(MUTEX_HELD(hash_mtx));
-
- if (zp->z_zcache_state == NULL) {
- /*
- * This znode is not in the cache.
- * Add the new znode to the MRU state.
- */
-
- zcache_try_grow(1);
-
- ASSERT(zp->z_zcache_access == 0);
- zp->z_zcache_access = lbolt;
- zcache_change_state(zcache.mru, zp);
- mutex_exit(hash_mtx);
-
- /*
- * If we are using less than 2/3 of our total target
- * cache size, bump up the target size for the MRU
- * list.
- */
- if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) {
- zcache.p = zcache.mru->lcnt + zcache.c/6;
- }
-
- zcache_adjust();
-
- atomic_add_64(&zcache.missed, 1);
- } else if (zp->z_zcache_state == zcache.mru) {
- /*
- * This znode has been "accessed" only once so far,
- * Move it to the MFU state.
- */
- if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) {
- /*
- * More than 125ms have passed since we
- * instantiated this buffer. Move it to the
- * most frequently used state.
- */
- zp->z_zcache_access = lbolt;
- zcache_change_state(zcache.mfu, zp);
- }
- atomic_add_64(&zcache.mru->hits, 1);
- mutex_exit(hash_mtx);
- } else {
- ASSERT(zp->z_zcache_state == zcache.mfu);
- /*
- * This buffer has been accessed more than once.
- * Keep it in the MFU state.
- */
- atomic_add_64(&zcache.mfu->hits, 1);
- mutex_exit(hash_mtx);
- }
-}
-
-static void
-zcache_init(void)
-{
- zcache.c = 20;
- zcache.c_max = 50;
-
- zcache.mru = &ZCACHE_mru;
- zcache.mfu = &ZCACHE_mfu;
-
- list_create(&zcache.mru->list, sizeof (znode_t),
- offsetof(znode_t, z_zcache_node));
- list_create(&zcache.mfu->list, sizeof (znode_t),
- offsetof(znode_t, z_zcache_node));
-}
-
-static void
-zcache_fini(void)
-{
- zfs_zcache_flush(NULL);
-
- list_destroy(&zcache.mru->list);
- list_destroy(&zcache.mfu->list);
-}
-
/*ARGSUSED*/
static void
znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
@@ -307,9 +61,15 @@ znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
znode_t *zp = user_ptr;
vnode_t *vp = ZTOV(zp);
+ mutex_enter(&zp->z_lock);
if (vp->v_count == 0) {
+ mutex_exit(&zp->z_lock);
vn_invalid(vp);
zfs_znode_free(zp);
+ } else {
+ /* signal force unmount that this znode can be freed */
+ zp->z_dbuf = NULL;
+ mutex_exit(&zp->z_lock);
}
}
@@ -359,15 +119,11 @@ zfs_znode_init(void)
znode_cache = kmem_cache_create("zfs_znode_cache",
sizeof (znode_t), 0, zfs_znode_cache_constructor,
zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
-
- zcache_init();
}
void
zfs_znode_fini(void)
{
- zcache_fini();
-
/*
* Cleanup vfs & vnode ops
*/
@@ -488,8 +244,8 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
error = dmu_tx_assign(tx, TXG_WAIT);
ASSERT3U(error, ==, 0);
@@ -497,8 +253,10 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
dmu_tx_commit(tx);
}
- if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) {
- return (EINVAL);
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1,
+ &version);
+ if (error) {
+ return (error);
} else if (version != ZFS_VERSION) {
(void) printf("Mismatched versions: File system "
"is version %lld on-disk format, which is "
@@ -524,9 +282,9 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
kmem_free(stats, sizeof (dmu_objset_stats_t));
stats = NULL;
- if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) {
- return (EINVAL);
- }
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid);
+ if (error)
+ return (error);
ASSERT(zoid != 0);
zfsvfs->z_root = zoid;
@@ -545,9 +303,9 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
return (error);
ASSERT3U((*zpp)->z_id, ==, zoid);
- if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid)) {
- return (EINVAL);
- }
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid);
+ if (error)
+ return (error);
zfsvfs->z_dqueue = zoid;
@@ -570,7 +328,7 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
* up to the caller to do, in case you don't want to
* return the znode
*/
-znode_t *
+static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
{
znode_t *zp;
@@ -593,8 +351,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
- bzero(&zp->z_zcache_node, sizeof (list_node_t));
-
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
mutex_exit(&zfsvfs->z_znodes_lock);
@@ -662,9 +418,6 @@ zfs_znode_dmu_init(znode_t *zp)
ZTOV(zp)->v_flag |= VROOT;
}
- zp->z_zcache_state = NULL;
- zp->z_zcache_access = 0;
-
ASSERT(zp->z_dbuf_held == 0);
zp->z_dbuf_held = 1;
VFS_HOLD(zfsvfs->z_vfs);
@@ -715,6 +468,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
/*
* Create a new DMU object.
*/
+ /*
+ * There's currently no mechanism for pre-reading the blocks that will
+	 * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
if (vap->va_type == VDIR) {
if (flag & IS_REPLAY) {
err = zap_create_claim(zfsvfs->z_os, *oid,
@@ -738,7 +497,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
}
}
- dbp = dmu_bonus_hold(zfsvfs->z_os, *oid);
+ VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
dmu_buf_will_dirty(dbp, tx);
/*
@@ -803,11 +562,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
mutex_enter(hash_mtx);
zfs_znode_dmu_init(zp);
- zcache_access(zp, hash_mtx);
+ mutex_exit(hash_mtx);
+
*zpp = zp;
} else {
ZTOV(zp)->v_count = 0;
- dmu_buf_rele(dbp);
+ dmu_buf_rele(dbp, NULL);
zfs_znode_free(zp);
}
}
@@ -818,25 +578,25 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
dmu_object_info_t doi;
dmu_buf_t *db;
znode_t *zp;
+ int err;
*zpp = NULL;
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- db = dmu_bonus_hold(zfsvfs->z_os, obj_num);
- if (db == NULL) {
+ err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (ENOENT);
+ return (err);
}
dmu_object_info_from_db(db, &doi);
if (doi.doi_bonus_type != DMU_OT_ZNODE ||
doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db);
+ dmu_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EINVAL);
}
- dmu_buf_read(db);
ASSERT(db->db_object == obj_num);
ASSERT(db->db_offset == -1);
@@ -849,29 +609,23 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
ASSERT3U(zp->z_id, ==, obj_num);
if (zp->z_reap) {
- dmu_buf_rele(db);
+ dmu_buf_rele(db, NULL);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (ENOENT);
} else if (zp->z_dbuf_held) {
- dmu_buf_rele(db);
+ dmu_buf_rele(db, NULL);
} else {
zp->z_dbuf_held = 1;
VFS_HOLD(zfsvfs->z_vfs);
}
- if (zp->z_active == 0) {
+ if (zp->z_active == 0)
zp->z_active = 1;
- if (list_link_active(&zp->z_zcache_node)) {
- mutex_enter(&zp->z_zcache_state->mtx);
- list_remove(&zp->z_zcache_state->list, zp);
- zp->z_zcache_state->lcnt -= 1;
- mutex_exit(&zp->z_zcache_state->mtx);
- }
- }
+
VN_HOLD(ZTOV(zp));
mutex_exit(&zp->z_lock);
- zcache_access(zp, ZFS_OBJ_MUTEX(zp));
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
*zpp = zp;
return (0);
}
@@ -882,7 +636,7 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
ASSERT3U(zp->z_id, ==, obj_num);
zfs_znode_dmu_init(zp);
- zcache_access(zp, ZFS_OBJ_MUTEX(zp));
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
*zpp = zp;
return (0);
}
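
Aside (not part of the patch): a hedged sketch of the zfs_zget() calling convention the hunks above rely on; the helper is hypothetical, but the hold/release pattern is exactly what zfs_mountroot() does earlier in this diff.

/*
 * Sketch of the zfs_zget() contract: on success *zpp is returned with a
 * hold on its vnode, which the caller must release when done.
 */
static int
example_hold_root(zfsvfs_t *zfsvfs)
{
	znode_t *zp;
	int error;

	if ((error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) != 0)
		return (error);
	/* ... operate on ZTOV(zp) ... */
	VN_RELE(ZTOV(zp));
	return (0);
}
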
@@ -899,15 +653,11 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
zp->z_phys->zp_acl.z_acl_extern_obj, tx);
ASSERT3U(error, ==, 0);
}
- if (zp->z_zcache_state) {
- ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
- atomic_add_64(&zp->z_zcache_state->cnt, -1);
- }
error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
ASSERT3U(error, ==, 0);
zp->z_dbuf_held = 0;
ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
- dmu_buf_rele(zp->z_dbuf);
+ dmu_buf_rele(zp->z_dbuf, NULL);
}
void
@@ -954,9 +704,6 @@ zfs_zinactive(znode_t *zp)
if (zp->z_reap) {
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
- ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
- atomic_add_64(&zp->z_zcache_state->cnt, -1);
- zp->z_zcache_state = NULL;
/* XATTR files are not put on the delete queue */
if (zp->z_phys->zp_flags & ZFS_XATTR) {
zfs_rmnode(zp);
@@ -970,23 +717,14 @@ zfs_zinactive(znode_t *zp)
VFS_RELE(zfsvfs->z_vfs);
return;
}
+ ASSERT(zp->z_phys);
+ ASSERT(zp->z_dbuf_held);
- /*
- * If the file system for this znode is no longer mounted,
- * evict the znode now, don't put it in the cache.
- */
- if (zfsvfs->z_unmounted1) {
- zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp));
- return;
- }
-
- /* put znode on evictable list */
- mutex_enter(&zp->z_zcache_state->mtx);
- list_insert_head(&zp->z_zcache_state->list, zp);
- zp->z_zcache_state->lcnt += 1;
- mutex_exit(&zp->z_zcache_state->mtx);
+ zp->z_dbuf_held = 0;
mutex_exit(&zp->z_lock);
+ dmu_buf_rele(zp->z_dbuf, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ VFS_RELE(zfsvfs->z_vfs);
}
void
@@ -1206,7 +944,8 @@ zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
len = -1;
else if (end > size)
len = size - from;
- dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx);
+ VERIFY(0 == dmu_free_range(zp->z_zfsvfs->z_os,
+ zp->z_id, from, len, tx));
if (!have_grow_lock)
rw_exit(&zp->z_grow_lock);
@@ -1214,7 +953,6 @@ zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
return (0);
}
-
void
zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
{
@@ -1229,6 +967,10 @@ zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
/*
* First attempt to create master node.
*/
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
moid = MASTER_NODE_OBJ;
error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
DMU_OT_NONE, 0, tx);
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 14b989fbd3..55040166b4 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -136,11 +136,17 @@ zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
uint64_t blksz = BP_GET_LSIZE(bp);
zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
zio_cksum_t cksum;
+ zbookmark_t zb;
int error;
+ zb.zb_objset = bp->blk_cksum.zc_word[2];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = bp->blk_cksum.zc_word[3];
+
error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
if (error) {
dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
zilog, bp, error);
@@ -551,6 +557,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
uint64_t txg;
uint64_t zil_blksz;
+ zbookmark_t zb;
int error;
ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
@@ -579,11 +586,21 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
zil_blksz, &ztp->zit_next_blk, txg);
if (error) {
+ /*
+ * Reinitialise the lwb.
+			 * By returning NULL the caller will call txg_wait_synced().
+ */
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(lwb->lwb_state == UNWRITTEN);
+ lwb->lwb_nused = 0;
+ lwb->lwb_seq = 0;
+ mutex_exit(&zilog->zl_lock);
txg_rele_to_sync(&lwb->lwb_txgh);
return (NULL);
}
ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
+ ztp->zit_pad = 0;
ztp->zit_nused = lwb->lwb_nused;
ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -617,9 +634,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
* write the old log block
*/
dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
+
+ zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3];
+
zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
&lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
- ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED));
+ ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb));
return (nlwb);
}
@@ -674,7 +697,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
lwb = zil_lwb_write_start(zilog, lwb);
if (lwb == NULL)
return (NULL);
- if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
+ ASSERT(lwb->lwb_nused == 0);
+ if (reclen > ZIL_BLK_DATA_SZ(lwb)) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
mutex_enter(&zilog->zl_lock);
zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
@@ -1157,10 +1181,17 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
* checksum error. We can safely ignore this because
* the later write will provide the correct data.
*/
+ zbookmark_t zb;
+
+ zb.zb_objset = dmu_objset_id(zilog->zl_os);
+ zb.zb_object = lrw->lr_foid;
+ zb.zb_level = -1;
+ zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
+
(void) zio_wait(zio_read(NULL, zilog->zl_spa,
wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
}
}
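
Aside (not part of the patch): the zil.c hunks above each fill in a zbookmark_t by hand before issuing a read or rewrite. A small sketch of that pattern follows; the helper is hypothetical.

/*
 * Sketch of the bookmark initialization repeated above: a ZIL block is
 * identified by the objset and block id stashed in the block pointer's
 * embedded checksum words, with a level of -1.
 */
static void
example_zil_bookmark(zbookmark_t *zb, uint64_t objset, uint64_t object,
    uint64_t blkid)
{
	zb->zb_objset = objset;
	zb->zb_object = object;
	zb->zb_level = -1;
	zb->zb_blkid = blkid;
}
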
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 1554504a93..b9741ee5c2 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,13 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
@@ -35,9 +35,6 @@
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
-static void zio_vdev_io_enter(zio_t *zio);
-static void zio_vdev_io_exit(zio_t *zio);
-
/*
* ==========================================================================
* I/O priority table
@@ -128,6 +125,8 @@ zio_init(void)
if (zio_buf_cache[c - 1] == NULL)
zio_buf_cache[c - 1] = zio_buf_cache[c];
}
+
+ zio_inject_init();
}
void
@@ -143,6 +142,8 @@ zio_fini(void)
}
zio_buf_cache[c] = NULL;
}
+
+ zio_inject_fini();
}
/*
@@ -263,11 +264,12 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
if (pio == NULL) {
if (!(flags & ZIO_FLAG_CONFIG_HELD))
- spa_config_enter(zio->io_spa, RW_READER);
+ spa_config_enter(zio->io_spa, RW_READER, zio);
zio->io_root = zio;
} else {
zio->io_root = pio->io_root;
-
+ if (!(flags & ZIO_FLAG_NOBOOKMARK))
+ zio->io_logical = pio->io_logical;
mutex_enter(&pio->io_lock);
if (stage < ZIO_STAGE_READY)
pio->io_children_notready++;
@@ -305,7 +307,7 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags)
+ int priority, int flags, zbookmark_t *zb)
{
zio_t *zio;
dva_t *dva;
@@ -314,6 +316,9 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+ zio->io_bookmark = *zb;
+
+ zio->io_logical = zio;
/*
* Work off our copy of the bp so the caller can free it.
@@ -345,7 +350,8 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags)
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb)
{
zio_t *zio;
@@ -359,6 +365,10 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
ZIO_TYPE_WRITE, priority, flags,
ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+ zio->io_bookmark = *zb;
+
+ zio->io_logical = zio;
+
zio->io_checksum = checksum;
zio->io_compress = compress;
@@ -378,7 +388,8 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags)
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb)
{
zio_t *zio;
@@ -387,6 +398,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
ZIO_TYPE_WRITE, priority, flags,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+ zio->io_bookmark = *zb;
zio->io_checksum = checksum;
zio->io_compress = ZIO_COMPRESS_OFF;
@@ -667,8 +679,6 @@ zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
mutex_exit(&zio->io_lock);
zio_next_stage(zio);
} else {
- if (zio->io_stage == ZIO_STAGE_VDEV_IO_START)
- zio_vdev_io_exit(zio);
zio->io_stalled = stage;
mutex_exit(&zio->io_lock);
}
@@ -683,8 +693,6 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
pio->io_error = zio->io_error;
if (--*countp == 0 && pio->io_stalled == stage) {
- if (pio->io_stage == ZIO_STAGE_VDEV_IO_START)
- zio_vdev_io_enter(pio);
pio->io_stalled = 0;
mutex_exit(&pio->io_lock);
zio_next_stage_async(pio);
@@ -748,36 +756,45 @@ zio_done(zio_t *zio)
vdev_stat_update(zio);
if (zio->io_error) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
- bp ? bp : &zio->io_bp_copy);
- dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n",
- zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
- zio_type_name[zio->io_type],
- vdev_description(vd),
- (u_longlong_t)zio->io_offset,
- zio, blkbuf, zio->io_error);
- }
-
- if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
- bp ? bp : &zio->io_bp_copy);
- dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n",
- "partial write",
- zio_type_name[zio->io_type],
- vdev_description(vd),
- (u_longlong_t)zio->io_offset,
- zio, blkbuf, zio->io_numerrors);
- }
+ /*
+ * If this I/O is attached to a particular vdev,
+ * generate an error message describing the I/O failure
+ * at the block level. We ignore these errors if the
+ * device is currently unavailable.
+ */
+ if (zio->io_error != ECKSUM && zio->io_vd &&
+ !vdev_is_dead(zio->io_vd))
+ zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, zio->io_vd, zio, 0, 0);
+
+ if ((zio->io_error == EIO ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
+ zio->io_logical == zio) {
+ /*
+ * For root I/O requests, tell the SPA to log the error
+ * appropriately. Also, generate a logical data
+ * ereport.
+ */
+ spa_log_error(zio->io_spa, zio);
+
+ zfs_ereport_post(FM_EREPORT_ZFS_DATA,
+ zio->io_spa, NULL, zio, 0, 0);
+ }
- if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
- bp ? bp : &zio->io_bp_copy);
- panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
- zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
- zio_type_name[zio->io_type],
- vdev_description(vd),
- (u_longlong_t)zio->io_offset,
- zio, blkbuf, zio->io_error);
+ /*
+ * For I/O requests that cannot fail, panic appropriately.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
+ bp ? bp : &zio->io_bp_copy);
+ panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
+ "%d", zio->io_error == ECKSUM ?
+ "bad checksum" : "I/O failure",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf, zio->io_error);
+ }
}
zio_clear_transform_stack(zio);
@@ -807,7 +824,7 @@ zio_done(zio_t *zio)
}
if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD))
- spa_config_exit(spa);
+ spa_config_exit(spa, zio);
if (zio->io_waiter != NULL) {
mutex_enter(&zio->io_lock);
@@ -988,7 +1005,8 @@ zio_read_gang_members(zio_t *zio)
zio_nowait(zio_read(zio, zio->io_spa, gbp,
(char *)zio->io_data + loff, lsize, NULL, NULL,
- zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT));
+ zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ &zio->io_bookmark));
}
zio_buf_free(gbh, gbufsize);
@@ -1022,7 +1040,8 @@ zio_rewrite_gang_members(zio_t *zio)
zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
- NULL, NULL, zio->io_priority, zio->io_flags));
+ NULL, NULL, zio->io_priority, zio->io_flags,
+ &zio->io_bookmark));
}
zio_push_transform(zio, gbh, gsize, gbufsize);
@@ -1153,7 +1172,8 @@ zio_write_allocate_gang_members(zio_t *zio)
zio->io_checksum, zio->io_txg, gbp,
(char *)zio->io_data + loff, lsize,
zio_write_allocate_gang_member_done, NULL,
- zio->io_priority, zio->io_flags));
+ zio->io_priority, zio->io_flags,
+ &zio->io_bookmark));
} else {
lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
ASSERT(lsize != SPA_MINBLOCKSIZE);
@@ -1263,51 +1283,6 @@ zio_dva_translate(zio_t *zio)
* Read and write to physical devices
* ==========================================================================
*/
-static void
-zio_vdev_io_enter(zio_t *zio)
-{
- vdev_t *tvd = zio->io_vd->vdev_top;
-
- mutex_enter(&tvd->vdev_io_lock);
- ASSERT(zio->io_pending.list_next == NULL);
- list_insert_tail(&tvd->vdev_io_pending, zio);
- mutex_exit(&tvd->vdev_io_lock);
-}
-
-static void
-zio_vdev_io_exit(zio_t *zio)
-{
- vdev_t *tvd = zio->io_vd->vdev_top;
-
- mutex_enter(&tvd->vdev_io_lock);
- ASSERT(zio->io_pending.list_next != NULL);
- list_remove(&tvd->vdev_io_pending, zio);
- if (list_head(&tvd->vdev_io_pending) == NULL)
- cv_broadcast(&tvd->vdev_io_cv);
- mutex_exit(&tvd->vdev_io_lock);
-}
-
-static void
-zio_vdev_io_retry(void *vdarg)
-{
- vdev_t *vd = vdarg;
- zio_t *zio, *zq;
-
- ASSERT(vd == vd->vdev_top);
-
- /* XXPOLICY */
- delay(hz);
-
- vdev_reopen(vd, &zq);
-
- while ((zio = zq) != NULL) {
- zq = zio->io_retry_next;
- zio->io_retry_next = NULL;
- dprintf("async retry #%d for I/O to %s offset %llx\n",
- zio->io_retries, vdev_description(vd), zio->io_offset);
- zio_next_stage_async(zio);
- }
-}
static void
zio_vdev_io_setup(zio_t *zio)
@@ -1323,8 +1298,6 @@ zio_vdev_io_setup(zio_t *zio)
zio->io_offset += VDEV_LABEL_START_SIZE;
}
- zio_vdev_io_enter(zio);
-
zio_next_stage(zio);
}
@@ -1350,7 +1323,7 @@ zio_vdev_io_done(zio_t *zio)
}
/* XXPOLICY */
-static boolean_t
+boolean_t
zio_should_retry(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1363,11 +1336,7 @@ zio_should_retry(zio_t *zio)
return (B_FALSE);
if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
return (B_FALSE);
- if (zio->io_retries > 300 &&
- (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL)))
- return (B_FALSE);
- if (zio->io_retries > 1 &&
- (zio->io_error == ECKSUM || zio->io_error == ENXIO))
+ if (zio->io_retries > 0)
return (B_FALSE);
return (B_TRUE);
@@ -1379,17 +1348,16 @@ zio_vdev_io_assess(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
- zio_vdev_io_exit(zio);
-
ASSERT(zio->io_vsd == NULL);
+ if (zio_injection_enabled && !zio->io_error)
+ zio->io_error = zio_handle_fault_injection(zio, EIO);
+
/*
* If the I/O failed, determine whether we should attempt to retry it.
*/
/* XXPOLICY */
if (zio_should_retry(zio)) {
- zio_t *zq;
-
ASSERT(tvd == vd);
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));
@@ -1405,29 +1373,27 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_retries, zio_type_name[zio->io_type],
vdev_description(vd), zio->io_offset);
- /*
- * If this is the first retry, do it immediately.
- */
- /* XXPOLICY */
- if (zio->io_retries == 1) {
- zio_next_stage_async(zio);
- return;
- }
+ zio_next_stage_async(zio);
+ return;
+ }
+ if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
+ zio->io_error != ECKSUM) {
/*
- * This was not the first retry, so go through the
- * longer enqueue/delay/vdev_reopen() process.
+ * Poor man's hotplug support. Even if we're done retrying this
+ * I/O, try to reopen the vdev to see if it's still attached.
+ * To avoid excessive thrashing, we only try it once a minute.
+ * This also has the effect of detecting when missing devices
+ * have come back, by polling the device once a minute.
+ *
+ * We need to do this asynchronously because we can't grab
+ * all the necessary locks way down here.
*/
- mutex_enter(&tvd->vdev_io_lock);
- ASSERT(zio->io_retry_next == NULL);
- zio->io_retry_next = zq = tvd->vdev_io_retry;
- tvd->vdev_io_retry = zio;
- mutex_exit(&tvd->vdev_io_lock);
- if (zq == NULL)
- (void) taskq_dispatch(
- tvd->vdev_spa->spa_vdev_retry_taskq,
- zio_vdev_io_retry, tvd, TQ_SLEEP);
- return;
+ if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
+ vd->vdev_last_try = gethrtime();
+ tvd->vdev_reopen_wanted = 1;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
+ }
}
zio_next_stage(zio);
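
Aside (not part of the patch): the reopen throttle in the hunk above fires at most once a minute per vdev. A minimal sketch of that rate limit, using the new vdev_last_try field; the helper name is invented.

/*
 * Sketch of the once-a-minute reopen throttle: request the asynchronous
 * reopen only if at least a minute has passed since the last attempt.
 */
static boolean_t
example_should_request_reopen(vdev_t *vd)
{
	if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
		vd->vdev_last_try = gethrtime();
		return (B_TRUE);
	}
	return (B_FALSE);
}
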
@@ -1502,10 +1468,9 @@ zio_checksum_verify(zio_t *zio)
{
if (zio->io_bp != NULL) {
zio->io_error = zio_checksum_error(zio);
- if (zio->io_error) {
- dprintf("bad checksum on vdev %s\n",
- vdev_description(zio->io_vd));
- }
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, zio->io_vd, zio, 0, 0);
}
zio_next_stage(zio);
@@ -1660,7 +1625,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
{
int error;
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
BP_ZERO(bp);
@@ -1677,7 +1642,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
bp->blk_birth = txg;
}
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
return (error);
}
@@ -1693,9 +1658,9 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
dprintf_bp(bp, "txg %llu: ", txg);
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
metaslab_free(spa, BP_IDENTITY(bp), txg);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
}
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
index dc31527ce8..d57ab6d525 100644
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -170,5 +169,8 @@ zio_checksum_error(zio_t *zio)
(actual_cksum.zc_word[3] - zc.zc_word[3]))
return (ECKSUM);
+ if (zio_injection_enabled && !zio->io_error)
+ return (zio_handle_fault_injection(zio, ECKSUM));
+
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
new file mode 100644
index 0000000000..4cada09d83
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -0,0 +1,315 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list. Each record corresponds to a given
+ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
+ * or exported while the injection record exists.
+ *
+ * Device level injection is done using the 'zi_guid' field. If this is set, it
+ * means that the error is destined for a particular device, not a piece of
+ * data.
+ *
+ * This is a rather poor data structure and algorithm, but we don't expect more
+ * than a few faults at any one time, so it should be sufficient for our needs.
+ */
+
+#include <sys/arc.h>
+#include <sys/zio_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+
+uint32_t zio_injection_enabled;
+
+typedef struct inject_handler {
+ int zi_id;
+ spa_t *zi_spa;
+ zinject_record_t zi_record;
+ list_node_t zi_link;
+} inject_handler_t;
+
+static list_t inject_handlers;
+static krwlock_t inject_lock;
+static int inject_next_id = 1;
+
+/*
+ * Returns true if the given record matches the I/O in progress.
+ */
+static boolean_t
+zio_match_handler(zbookmark_t *zb, uint64_t type,
+ zinject_record_t *record, int error)
+{
+ /*
+ * Check for a match against the MOS, which is based on type
+ */
+ if (zb->zb_objset == 0 && record->zi_objset == 0 &&
+ record->zi_object == 0) {
+ if (record->zi_type == DMU_OT_NONE ||
+ type == record->zi_type)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+ else
+ return (B_FALSE);
+ }
+
+ /*
+ * Check for an exact match.
+ */
+ if (zb->zb_objset == record->zi_objset &&
+ zb->zb_object == record->zi_object &&
+ zb->zb_level == record->zi_level &&
+ zb->zb_blkid >= record->zi_start &&
+ zb->zb_blkid <= record->zi_end &&
+ error == record->zi_error)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the I/O in question should return failure. Returns the errno
+ * to be returned to the caller.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ /*
+ * Currently, we only support fault injection on reads.
+ */
+ if (zio->io_type != ZIO_TYPE_READ)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa)
+ continue;
+
+ /* Ignore device errors */
+ if (handler->zi_record.zi_guid != 0)
+ continue;
+
+ /* If this handler matches, return EIO */
+ if (zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+int
+zio_handle_device_injection(vdev_t *vd, int error)
+{
+ inject_handler_t *handler;
+ int ret = 0;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ if (handler->zi_record.zi_error == error) {
+ /*
+ * For a failed open, pretend like the device
+ * has gone away.
+ */
+ if (error == ENXIO)
+ vd->vdev_stat.vs_aux =
+ VDEV_AUX_OPEN_FAILED;
+ ret = error;
+ break;
+ }
+ if (handler->zi_record.zi_error == ENXIO) {
+ ret = EIO;
+ break;
+ }
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+/*
+ * Create a new handler for the given record. We add it to the list, adding
+ * a reference to the spa_t in the process. We increment zio_injection_enabled,
+ * which is the switch to trigger all fault injection.
+ */
+int
+zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int error;
+ spa_t *spa;
+
+ /*
+ * If this is pool-wide metadata, make sure we unload the corresponding
+ * spa_t, so that the next attempt to load it will trigger the fault.
+ * We call spa_reset() to unload the pool appropriately.
+ */
+ if (flags & ZINJECT_UNLOAD_SPA)
+ if ((error = spa_reset(name)) != 0)
+ return (error);
+
+ if (!(flags & ZINJECT_NULL)) {
+ /*
+ * spa_inject_ref() will add an injection reference, which will
+ * prevent the pool from being removed from the namespace while
+ * still allowing it to be unloaded.
+ */
+ if ((spa = spa_inject_addref(name)) == NULL)
+ return (ENOENT);
+
+ handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ *id = handler->zi_id = inject_next_id++;
+ handler->zi_spa = spa;
+ handler->zi_record = *record;
+ list_insert_tail(&inject_handlers, handler);
+ atomic_add_32(&zio_injection_enabled, 1);
+
+ rw_exit(&inject_lock);
+ }
+
+ /*
+ * Flush the ARC, so that any attempts to read this data will end up
+ * going to the ZIO layer. Note that this is a little overkill, but
+ * we don't have the necessary ARC interfaces to do anything else, and
+ * fault injection isn't a performance critical path.
+ */
+ if (flags & ZINJECT_FLUSH_ARC)
+ arc_flush();
+
+ return (0);
+}
+
+/*
+ * Returns the next record with an ID greater than that supplied to the
+ * function. Used to iterate over all handlers in the system.
+ */
+int
+zio_inject_list_next(int *id, char *name, size_t buflen,
+ zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ mutex_enter(&spa_namespace_lock);
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id > *id)
+ break;
+
+ if (handler) {
+ *record = handler->zi_record;
+ *id = handler->zi_id;
+ (void) strncpy(name, spa_name(handler->zi_spa), buflen);
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+
+ rw_exit(&inject_lock);
+ mutex_exit(&spa_namespace_lock);
+
+ return (ret);
+}
+
+/*
+ * Clear the fault handler with the given identifier, or return ENOENT if none
+ * exists.
+ */
+int
+zio_clear_fault(int id)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id == id)
+ break;
+
+ if (handler == NULL) {
+ ret = ENOENT;
+ } else {
+ list_remove(&inject_handlers, handler);
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_add_32(&zio_injection_enabled, -1);
+ ret = 0;
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+void
+zio_inject_init(void)
+{
+ list_create(&inject_handlers, sizeof (inject_handler_t),
+ offsetof(inject_handler_t, zi_link));
+}
+
+void
+zio_inject_fini(void)
+{
+ list_destroy(&inject_handlers);
+}
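
Aside (not part of the patch): tying the pieces of the new zio_inject.c together, a hedged usage sketch. The caller, record contents, and helper name are placeholders for illustration; in the real system handlers are installed through the ZFS ioctl path and consulted from zio_vdev_io_assess() and zio_checksum_error().

/*
 * Illustrative sketch: install an injection handler for reads of one
 * object, then show where the pipeline consults it.  All record values
 * here are placeholders.
 */
static int
example_inject_and_check(char *poolname, zio_t *zio)
{
	zinject_record_t record = { 0 };
	int id, error;

	record.zi_objset = 5;		/* hypothetical objset id */
	record.zi_object = 42;		/* hypothetical object number */
	record.zi_start = 0;		/* match every block ... */
	record.zi_end = -1ULL;		/* ... of that object */
	record.zi_error = EIO;		/* errno to inject */

	/* ZINJECT_FLUSH_ARC forces future reads down to the ZIO layer. */
	error = zio_inject_fault(poolname, ZINJECT_FLUSH_ARC, &id, &record);
	if (error != 0)
		return (error);

	/* In the pipeline proper this check lives in zio_vdev_io_assess(). */
	if (zio_injection_enabled && !zio->io_error)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	return (zio_clear_fault(id));
}
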
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index a570d4d971..69fb50c2c3 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -418,6 +417,7 @@ zvol_create_minor(zfs_cmd_t *zc)
zvol_size_changed(zv, dev);
+ /* XXX this should handle the possible i/o error */
VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
"readonly", zvol_readonly_changed_cb, zv) == 0);
@@ -500,7 +500,7 @@ zvol_set_volsize(zfs_cmd_t *zc)
}
tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, 1);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
dmu_tx_hold_free(tx, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
@@ -511,9 +511,10 @@ zvol_set_volsize(zfs_cmd_t *zc)
error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
&zc->zc_volsize, tx);
- if (error == 0)
- dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize,
+ if (error == 0) {
+ error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize,
DMU_OBJECT_END, tx);
+ }
dmu_tx_commit(tx);
@@ -744,7 +745,7 @@ zvol_strategy(buf_t *bp)
size = volsize - off;
if (bp->b_flags & B_READ) {
- error = dmu_read_canfail(os, ZVOL_OBJ,
+ error = dmu_read(os, ZVOL_OBJ,
off, size, addr);
} else {
dmu_tx_t *tx = dmu_tx_create(os);
diff --git a/usr/src/uts/common/krtld/kobj.c b/usr/src/uts/common/krtld/kobj.c
index 003022d104..1cdf93e98f 100644
--- a/usr/src/uts/common/krtld/kobj.c
+++ b/usr/src/uts/common/krtld/kobj.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -108,6 +107,7 @@ static int kobj_boot_open(char *, int);
static int kobj_boot_close(int);
static int kobj_boot_seek(int, off_t, off_t);
static int kobj_boot_read(int, caddr_t, size_t);
+static int kobj_boot_fstat(int, struct bootstat *);
static Sym *lookup_one(struct module *, const char *);
static void sym_insert(struct module *, char *, symid_t);
@@ -3324,8 +3324,8 @@ kobj_open(char *filename)
*/
cred_t *saved_cred = curthread->t_cred;
curthread->t_cred = kcred;
- Errno = vn_open(filename, UIO_SYSSPACE, FREAD, 0, &vp,
- 0, 0);
+ Errno = vn_openat(filename, UIO_SYSSPACE, FREAD, 0, &vp,
+ 0, 0, rootdir);
curthread->t_cred = saved_cred;
}
kobjopen_free(ltp);
@@ -3458,6 +3458,47 @@ kobj_close(intptr_t descr)
(void) kobj_boot_close((int)descr);
}
+int
+kobj_fstat(intptr_t descr, struct bootstat *buf)
+{
+ if (buf == NULL)
+ return (-1);
+
+ if (_modrootloaded) {
+ vattr_t vattr;
+ struct vnode *vp = (struct vnode *)descr;
+ if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0)
+ return (-1);
+
+ /*
+ * The vattr and bootstat structures are similar, but not
+ * identical. We do our best to fill in the bootstat structure
+		 * from the contents of vattr (transferring only the fields
+		 * that are obvious).
+ */
+
+ buf->st_mode = (uint32_t)vattr.va_mode;
+ buf->st_nlink = (uint32_t)vattr.va_nlink;
+ buf->st_uid = (int32_t)vattr.va_uid;
+ buf->st_gid = (int32_t)vattr.va_gid;
+ buf->st_rdev = (uint64_t)vattr.va_rdev;
+ buf->st_size = (uint64_t)vattr.va_size;
+ buf->st_atim.tv_sec = (int64_t)vattr.va_atime.tv_sec;
+ buf->st_atim.tv_nsec = (int64_t)vattr.va_atime.tv_nsec;
+ buf->st_mtim.tv_sec = (int64_t)vattr.va_mtime.tv_sec;
+ buf->st_mtim.tv_nsec = (int64_t)vattr.va_mtime.tv_nsec;
+ buf->st_ctim.tv_sec = (int64_t)vattr.va_ctime.tv_sec;
+ buf->st_ctim.tv_nsec = (int64_t)vattr.va_ctime.tv_nsec;
+ buf->st_blksize = (int32_t)vattr.va_blksize;
+ buf->st_blocks = (int64_t)vattr.va_nblocks;
+
+ return (0);
+ }
+
+ return (kobj_boot_fstat((int)descr, buf));
+}
+
+
struct _buf *
kobj_open_file(char *name)
{
@@ -4097,6 +4138,18 @@ kobj_record_file(char *filename)
}
#endif /* __x86 */
+static int
+kobj_boot_fstat(int fd, struct bootstat *stp)
+{
+#if defined(__sparc)
+ if (!standalone && _ioquiesced)
+ return (-1);
+ return (BOP_FSTAT(ops, fd, stp));
+#else
+ return (BRD_FSTAT(bfs_ops, fd, stp));
+#endif
+}
+
/*
* XXX these wrappers should go away when sparc is converted
* boot from ramdisk
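
For context, kobj_fstat() operates on the same descriptor returned by kobj_open() and released by kobj_close(). A minimal sketch of a caller, assuming kobj_open()'s conventional -1 failure return; the filename and the cmn_err() reporting are illustrative only, not taken from this change:

	struct bootstat st;
	intptr_t fd;

	/* hedged sketch: "/etc/system" and the message are illustrative */
	if ((fd = kobj_open("/etc/system")) != -1) {
		if (kobj_fstat(fd, &st) == 0)
			cmn_err(CE_CONT, "size = %llu\n",
			    (u_longlong_t)st.st_size);
		kobj_close(fd);
	}
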
diff --git a/usr/src/uts/common/krtld/kobj_stubs.c b/usr/src/uts/common/krtld/kobj_stubs.c
index 3d972194bb..c592fb5317 100644
--- a/usr/src/uts/common/krtld/kobj_stubs.c
+++ b/usr/src/uts/common/krtld/kobj_stubs.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -108,6 +107,13 @@ kobj_close(intptr_t descr)
/*ARGSUSED*/
int
+kobj_fstat(intptr_t descr, struct bootstat *buf)
+{
+ return (-1);
+}
+
+/*ARGSUSED*/
+int
kobj_filbuf(struct _buf *f)
{
return (-1);
diff --git a/usr/src/uts/common/krtld/mapfile b/usr/src/uts/common/krtld/mapfile
index 398c6dcf32..cb1f85b04a 100644
--- a/usr/src/uts/common/krtld/mapfile
+++ b/usr/src/uts/common/krtld/mapfile
@@ -1,13 +1,9 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -22,6 +18,9 @@
#
# CDDL HEADER END
#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
#pragma ident "%Z%%M% %I% %E% SMI"
#
@@ -36,6 +35,7 @@
kobj_export_module;
kobj_filbuf;
kobj_free;
+ kobj_fstat;
kobj_getelfsym;
kobj_getmodinfo;
kobj_getpagesize;
diff --git a/usr/src/uts/common/os/fm.c b/usr/src/uts/common/os/fm.c
index 6ff4626405..43c3acbef0 100644
--- a/usr/src/uts/common/os/fm.c
+++ b/usr/src/uts/common/os/fm.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -1070,6 +1069,37 @@ fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
}
}
+void
+fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
+ uint64_t vdev_guid)
+{
+ if (version != ZFS_SCHEME_VERSION0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+
+ if (vdev_guid != 0) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+}
+
uint64_t
fm_ena_increment(uint64_t ena)
{
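
For context, fm_fmri_zfs_set() populates a caller-supplied nvlist with the zfs-scheme members defined in <sys/fm/protocol.h>. A minimal sketch, assuming kernel nvlist allocation with NV_UNIQUE_NAME/KM_SLEEP; the GUID values are illustrative, and passing a vdev GUID of 0 simply omits the vdev member:

	nvlist_t *fmri;
	uint64_t pool_guid = 0x1234;	/* illustrative */
	uint64_t vdev_guid = 0x5678;	/* illustrative; 0 omits the member */

	if (nvlist_alloc(&fmri, NV_UNIQUE_NAME, KM_SLEEP) == 0) {
		fm_fmri_zfs_set(fmri, FM_ZFS_SCHEME_VERSION,
		    pool_guid, vdev_guid);
		/* ... attach fmri to an ereport or resource ... */
		nvlist_free(fmri);
	}
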
diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c
index 7ffcf66d10..0e36f3e2cc 100644
--- a/usr/src/uts/common/os/modsysfile.c
+++ b/usr/src/uts/common/os/modsysfile.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -73,6 +72,7 @@ static vmem_t *mod_sysfile_arena; /* parser memory */
char obp_bootpath[BO_MAXOBJNAME]; /* bootpath from obp */
char svm_bootpath[BO_MAXOBJNAME]; /* bootpath redirected via rootdev */
+char zfs_bootpath[BO_MAXOBJNAME]; /* zfs bootpath, set via zfsroot */
#if defined(_PSM_MODULES)
@@ -489,6 +489,8 @@ static struct modcmd modcmd[] = {
{ "set32", MOD_SET32 },
{ "SET64", MOD_SET64 },
{ "set64", MOD_SET64 },
+ { "ZFSROOT", MOD_ZFSROOT },
+ { "zfsroot", MOD_ZFSROOT },
{ NULL, MOD_UNKNOWN }
};
@@ -528,6 +530,7 @@ do_sysfile_cmd(struct _buf *file, const char *cmd)
*/
case MOD_ROOTFS:
case MOD_SWAPFS:
+ case MOD_ZFSROOT:
if ((token = kobj_lex(file, tok1, sizeof (tok1))) == COLON) {
token = kobj_lex(file, tok1, sizeof (tok1));
} else {
@@ -1520,7 +1523,10 @@ setparams()
(void) copystr(sysp->sys_ptr, bootobjp->bo_fstype,
BO_MAXOBJNAME, NULL);
break;
-
+ case MOD_ZFSROOT:
+ (void) copystr(sysp->sys_ptr, zfs_bootpath,
+ BO_MAXOBJNAME, NULL);
+ break;
default:
break;
}
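
For context, MOD_ZFSROOT goes through the same "name: value" parsing path as MOD_ROOTFS and MOD_SWAPFS, so the directive takes a single argument that setparams() copies into zfs_bootpath. An illustrative /etc/system line (the pool/dataset name is an assumption, not taken from this change):

	zfsroot: tank/ROOT/zfsroot
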
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index fe4a5c82df..2e027b7ba5 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -1741,13 +1740,10 @@ secpolicy_contract_event_choice(const cred_t *cr)
}
/*
- * Name: secpolicy_gart_access
- *
- * Normal: Verify if the subject has sufficient priveleges to make ioctls
- * to agpgart device
- *
- * Output: EPERM - if not privileged
+ * secpolicy_gart_access
*
+ * Determine if the subject has sufficient privileges to make ioctls to the
+ * agpgart device.
*/
int
secpolicy_gart_access(const cred_t *cr)
@@ -1756,13 +1752,10 @@ secpolicy_gart_access(const cred_t *cr)
}
/*
- * Name: secpolicy_gart_map
- *
- * Normal: Verify if the subject has sufficient privelegs to map aperture
- * range through agpgart driver
- *
- * Output: EPERM - if not privileged
+ * secpolicy_gart_map
*
+ * Determine if the subject has sufficient privileges to map an aperture range
+ * through the agpgart driver.
*/
int
secpolicy_gart_map(const cred_t *cr)
@@ -1774,10 +1767,22 @@ secpolicy_gart_map(const cred_t *cr)
}
/*
+ * secpolicy_zinject
+ *
+ * Determine if the subject can inject faults in the ZFS fault injection
+ * framework. Requires all privileges.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+ return (secpolicy_require_set(cr, PRIV_FULLSET, NULL));
+}
+
+/*
* secpolicy_zfs
*
- * Determine if the user has permission to manipulate ZFS datasets (not pools).
- * Equivalent to the SYS_MOUNT privilege.
+ * Determine if the subject has permission to manipulate ZFS datasets
+ * (not pools). Equivalent to the SYS_MOUNT privilege.
*/
int
secpolicy_zfs(const cred_t *cr)
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index f82a933903..516ecc0a5a 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -657,6 +657,9 @@ FMHDRS= \
protocol.h \
util.h
+FMFSHDRS= \
+ zfs.h
+
FMIOHDRS= \
ddi.h \
pci.h \
@@ -914,6 +917,7 @@ CHECKHDRS= \
$(TAVORHDRS:%.h=ib/adapters/tavor/%.check) \
$(ISOHDRS:%.h=iso/%.check) \
$(FMHDRS:%.h=fm/%.check) \
+ $(FMFSHDRS:%.h=fm/fs/%.check) \
$(FMIOHDRS:%.h=fm/io/%.check) \
$(FSHDRS:%.h=fs/%.check) \
$(LVMHDRS:%.h=lvm/%.check) \
@@ -949,6 +953,7 @@ CHECKHDRS= \
$(ROOTISOHDRS) \
$(ROOTFMHDRS) \
$(ROOTFMIOHDRS) \
+ $(ROOTFMFSHDRS) \
$(ROOTFSHDRS) \
$(ROOTIBDHDRS) \
$(ROOTIBHDRS) \
@@ -992,7 +997,8 @@ install_h: \
$(ROOTDCAMHDRS) \
$(ROOTISOHDRS) \
$(ROOTFMHDRS) \
- $(ROOTFMIOHDRS) \
+ $(ROOTFMFSHDRS) \
+ $(ROOTFMIOHDRS) \
$(ROOTFSHDRS) \
$(ROOTIBDHDRS) \
$(ROOTIBHDRS) \
diff --git a/usr/src/uts/common/sys/Makefile.syshdrs b/usr/src/uts/common/sys/Makefile.syshdrs
index cdc3436049..d9c363b48b 100644
--- a/usr/src/uts/common/sys/Makefile.syshdrs
+++ b/usr/src/uts/common/sys/Makefile.syshdrs
@@ -1,5 +1,5 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -18,10 +18,13 @@ av/%.check: av/%.h
fm/%.check: fm/%.h
$(DOT_H_CHECK)
-fm/cpu/%.check: fm/cpu/%.h
+fm/cpu/%.check: fm/cpu/%.h
$(DOT_H_CHECK)
-fm/io/%.check: fm/io/%.h
+fm/fs/%.check: fm/fs/%.h
+ $(DOT_H_CHECK)
+
+fm/io/%.check: fm/io/%.h
$(DOT_H_CHECK)
fs/%.check: fs/%.h
@@ -129,6 +132,7 @@ ROOTDIRS= \
$(ROOTDIR)/iso \
$(ROOTDIR)/fm \
$(ROOTDIR)/fm/cpu \
+ $(ROOTDIR)/fm/fs \
$(ROOTDIR)/fm/io \
$(ROOTDIR)/fs \
$(ROOTDIR)/ib \
@@ -187,6 +191,7 @@ ROOTISOHDRS= $(ISOHDRS:%=$(ROOTDIR)/iso/%)
ROOTFMHDRS= $(FMHDRS:%=$(ROOTDIR)/fm/%)
ROOTFMCPUHDRS= $(FMCPUHDRS:%=$(ROOTDIR)/fm/cpu/%)
ROOTFMIOHDRS= $(FMIOHDRS:%=$(ROOTDIR)/fm/io/%)
+ROOTFMFSHDRS= $(FMFSHDRS:%=$(ROOTDIR)/fm/fs/%)
ROOTFSHDRS= $(FSHDRS:%=$(ROOTDIR)/fs/%)
diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h
new file mode 100644
index 0000000000..aa5c7ee0d7
--- /dev/null
+++ b/usr/src/uts/common/sys/fm/fs/zfs.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FM_FS_ZFS_H
+#define _SYS_FM_FS_ZFS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_ERROR_CLASS "fs.zfs"
+
+#define FM_EREPORT_ZFS_CHECKSUM "checksum"
+#define FM_EREPORT_ZFS_IO "io"
+#define FM_EREPORT_ZFS_DATA "data"
+#define FM_EREPORT_ZFS_POOL "zpool"
+#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
+#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
+#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data"
+#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas"
+#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum"
+#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small"
+#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label"
+
+#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
+#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
+
+#define FM_RESOURCE_OK "ok"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_FS_ZFS_H */
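
For context, the FM_EREPORT_ZFS_* names above are leaf class components that sit under ZFS_ERROR_CLASS and the generic ereport prefix. A minimal sketch of composing a full class string, assuming FM_EREPORT_CLASS ("ereport") from <sys/fm/protocol.h>; the buffer size is illustrative:

	char class[64];

	/* yields "ereport.fs.zfs.checksum" */
	(void) snprintf(class, sizeof (class), "%s.%s.%s",
	    FM_EREPORT_CLASS, ZFS_ERROR_CLASS, FM_EREPORT_ZFS_CHECKSUM);
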
diff --git a/usr/src/uts/common/sys/fm/protocol.h b/usr/src/uts/common/sys/fm/protocol.h
index 89b761ef6c..1afa67f66b 100644
--- a/usr/src/uts/common/sys/fm/protocol.h
+++ b/usr/src/uts/common/sys/fm/protocol.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -168,6 +167,7 @@ extern "C" {
#define FM_FMRI_SCHEME_MOD "mod"
#define FM_FMRI_SCHEME_PKG "pkg"
#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
+#define FM_FMRI_SCHEME_ZFS "zfs"
/* Scheme versions */
#define FMD_SCHEME_VERSION0 0
@@ -187,6 +187,8 @@ extern "C" {
#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0
#define LEGACY_SCHEME_VERSION0 0
#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0
+#define ZFS_SCHEME_VERSION0 0
+#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
/* hc scheme member names */
#define FM_FMRI_HC_SERIAL_ID "serial"
@@ -253,6 +255,10 @@ extern "C" {
#define FM_FMRI_MOD_ID "mod-id"
#define FM_FMRI_MOD_DESC "mod-desc"
+/* zfs scheme member names */
+#define FM_FMRI_ZFS_POOL "pool"
+#define FM_FMRI_ZFS_VDEV "vdev"
+
extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
extern void fm_nva_xdestroy(nv_alloc_t *);
@@ -277,6 +283,7 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
const char *, uint64_t);
extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
const char *, const char *);
+extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
extern uint64_t fm_ena_increment(uint64_t);
extern uint64_t fm_ena_generate(uint64_t, uchar_t);
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 65425c829c..0fa884dcaa 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -133,6 +133,8 @@ uint64_t zfs_prop_default_numeric(zfs_prop_t);
#define ZPOOL_CONFIG_STATS "stats"
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_OFFLINE "offline"
+#define ZPOOL_CONFIG_ERRCOUNT "error_count"
+#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
@@ -304,9 +306,25 @@ typedef enum zfs_ioc {
ZFS_IOC_ROLLBACK,
ZFS_IOC_RENAME,
ZFS_IOC_RECVBACKUP,
- ZFS_IOC_SENDBACKUP
+ ZFS_IOC_SENDBACKUP,
+ ZFS_IOC_INJECT_FAULT,
+ ZFS_IOC_CLEAR_FAULT,
+ ZFS_IOC_INJECT_LIST_NEXT,
+ ZFS_IOC_ERROR_LOG,
+ ZFS_IOC_CLEAR,
+ ZFS_IOC_BOOKMARK_NAME
} zfs_ioc_t;
+/*
+ * Internal SPA load state. Used by FMA diagnosis engine.
+ */
+typedef enum {
+ SPA_LOAD_NONE, /* no load in progress */
+ SPA_LOAD_OPEN, /* normal open */
+ SPA_LOAD_IMPORT, /* import in progress */
+ SPA_LOAD_TRYIMPORT /* tryimport in progress */
+} spa_load_state_t;
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/kobj.h b/usr/src/uts/common/sys/kobj.h
index 7d2bd0922e..9276aa370f 100644
--- a/usr/src/uts/common/sys/kobj.h
+++ b/usr/src/uts/common/sys/kobj.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -34,6 +33,7 @@
#include <sys/machelf.h>
#include <sys/vmem.h>
#include <sys/sdt.h>
+#include <sys/bootstat.h>
#ifdef __cplusplus
extern "C" {
@@ -162,6 +162,7 @@ extern uintptr_t kobj_getsymvalue(char *, int);
extern char *kobj_getsymname(uintptr_t, ulong_t *);
extern char *kobj_searchsym(struct module *, uintptr_t, ulong_t *);
+extern int kobj_fstat(intptr_t, struct bootstat *);
extern intptr_t kobj_open(char *);
extern int kobj_path_exists(char *, int);
extern struct _buf *kobj_open_path(char *, int, int);
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index 9653a58b0e..beabb63818 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -141,6 +140,7 @@ int secpolicy_vnode_setdac(const cred_t *, uid_t);
int secpolicy_vnode_setid_retain(const cred_t *, boolean_t);
int secpolicy_vnode_setids_setgids(const cred_t *, gid_t);
int secpolicy_vnode_stky_modify(const cred_t *);
+int secpolicy_zinject(const cred_t *);
int secpolicy_zfs(const cred_t *);
void secpolicy_setid_clear(vattr_t *, cred_t *);
diff --git a/usr/src/uts/common/sys/sysconf.h b/usr/src/uts/common/sys/sysconf.h
index 4594d91287..654436a115 100644
--- a/usr/src/uts/common/sys/sysconf.h
+++ b/usr/src/uts/common/sys/sysconf.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 1990-2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -72,6 +71,7 @@ struct modcmd {
#define MOD_UNKNOWN 9 /* unknown command */
#define MOD_SET32 10 /* like MOD_SET but -only- on 32-bit kernel */
#define MOD_SET64 11 /* like MOD_SET but -only- on 64-bit kernel */
+#define MOD_ZFSROOT 12 /* use zfs as the root filesystem */
/*
* Commands for mod_sysctl()
diff --git a/usr/src/uts/intel/sys/bootconf.h b/usr/src/uts/intel/sys/bootconf.h
index 0a7e6a3d58..2e1ccbf326 100644
--- a/usr/src/uts/intel/sys/bootconf.h
+++ b/usr/src/uts/intel/sys/bootconf.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -185,6 +184,7 @@ extern struct bootobj swapfile;
extern char obp_bootpath[BO_MAXOBJNAME];
extern char svm_bootpath[BO_MAXOBJNAME];
+extern char zfs_bootpath[BO_MAXOBJNAME];
extern dev_t getrootdev(void);
extern void getfsname(char *, char *, size_t);
diff --git a/usr/src/uts/intel/sys/bootvfs.h b/usr/src/uts/intel/sys/bootvfs.h
index d572520821..63696395da 100644
--- a/usr/src/uts/intel/sys/bootvfs.h
+++ b/usr/src/uts/intel/sys/bootvfs.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -76,6 +75,7 @@ extern struct boot_fs_ops *bfs_ops;
#define BRD_CLOSE(ops, fd) ((ops)->fsw_close)(fd)
#define BRD_READ(ops, fd, buf, s) ((ops)->fsw_read)(fd, buf, s)
#define BRD_SEEK(ops, fd, addr, w) ((ops)->fsw_lseek)(fd, addr, w)
+#define BRD_FSTAT(ops, fd, stp) ((ops)->fsw_fstat)(fd, stp)
#ifdef _BOOT
diff --git a/usr/src/uts/sparc/krtld/mapfile b/usr/src/uts/sparc/krtld/mapfile
index 4fb702b4f6..6d40dc91c1 100644
--- a/usr/src/uts/sparc/krtld/mapfile
+++ b/usr/src/uts/sparc/krtld/mapfile
@@ -1,13 +1,12 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -35,6 +34,7 @@
kobj_export_module;
kobj_filbuf;
kobj_free;
+ kobj_fstat;
kobj_getelfsym;
kobj_getmodinfo;
kobj_getpagesize;
diff --git a/usr/src/uts/sun/sys/bootconf.h b/usr/src/uts/sun/sys/bootconf.h
index 40e7d25fac..c96f8aa90b 100644
--- a/usr/src/uts/sun/sys/bootconf.h
+++ b/usr/src/uts/sun/sys/bootconf.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -219,6 +218,7 @@ extern struct bootobj swapfile;
extern char obp_bootpath[BO_MAXOBJNAME];
extern char svm_bootpath[BO_MAXOBJNAME];
+extern char zfs_bootpath[BO_MAXOBJNAME];
extern dev_t getrootdev(void);
extern void getfsname(char *, char *, size_t);