| field | value | date |
|---|---|---|
| author | eschrock <none@none> | 2006-03-03 20:08:16 -0800 |
| committer | eschrock <none@none> | 2006-03-03 20:08:16 -0800 |
| commit | ea8dc4b6d2251b437950c0056bc626b311c73c27 (patch) | |
| tree | 69cc1808568f2ef8fd1e21c61e186ba452ea64da /usr/src/uts/common | |
| parent | 5c18afbc96a46bc3a9e6f3667512daa374d6cd79 (diff) | |
| download | illumos-joyent-ea8dc4b6d2251b437950c0056bc626b311c73c27.tar.gz | |
PSARC 2006/077 zpool clear
PSARC 2006/139 FMA for ZFS
6284889 arc should replace the znode cache
6333006 DMU & DSL should not panic upon I/O error
6333092 concurrent reads to a file not scaling with number of readers
6338081 ZFS/FMA phase 1
6338386 need persistent error log
6341326 i/o error causes arc buf hash table corruption
6341639 zfs backup/restore should compute/verify checksum of backup stream
6348002 out of space due to changing properties
6354724 inaccurate error message from zfs restore
6354872 dmu_sync() blows predictive accounting
6355416 zpool scrubbing consumes all memory, system hung
6363995 df should only load libzfs when it encounters a ZFS filesystem
6366320 zfs backup/restore doesn't like signals
6368892 mount -m support needed for legacy mounts
6368902 boot archive fstat support needed for ZFS Mountroot
6369424 BFU complains when bfu'ing a ZFS root filesystem
6374062 mountroot support needed for ZFS
6376356 dirtying dbuf obj=43 lvl=0 blkid=0 but not tx_held
6378391 unused members of dmu_objset_stats_t
6378392 clean up zfs_cmd_t structure
6378685 buf_init should allocate its hash table more carefully
6378976 ziltest should be a first class citizen
6381086 zdb segfaults if there is a spa deferred-free bplist
6381203 deadlock due to i/o while assigning (tc_lock held)
6381209 freed space is not immediately available
6381344 'zpool clear'
6381345 FAULTED devices should really be UNAVAIL
6381346 import should mark devices as persistently unavailable
6383272 recursive mutex_enter() during log replay with zfs root
6386326 origin property is not displayed
6386354 libzfs does too much in its _init section, calls exit(1)
6386624 zpool should not complain about non-existent devices from libdiskmgt
6386910 spa needs to be i/o error hardened
6387735 need a mechanism to inject faults into ZFS
6387736 internal ZFS utilities should be placed in an ON-private package
6389928 libzfs should ship a lint library
6390609 malformed vdev config panics on zpool_create()
6390677 version number checking makes upgrades challenging
6390713 ztest hangs in zil_suspend()
6391873 metadata compression should be turned back on
6392113 ztest sometimes reports leaked blocks because ZIL isn't resilvered
6393004 minor memory leak in unique_insert()
Diffstat (limited to 'usr/src/uts/common')
83 files changed, 6432 insertions, 3331 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index f2d155fd25..587e9e1535 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -864,6 +864,7 @@ ZFS_COMMON_OBJS += \
     sha256.o \
     spa.o \
     spa_config.o \
+    spa_errlog.o \
     spa_misc.o \
     space_map.o \
     txg.o \
@@ -882,10 +883,12 @@ ZFS_COMMON_OBJS += \
     zap_leaf.o \
     zap_micro.o \
     zfs_byteswap.o \
+    zfs_fm.o \
     zil.o \
     zio.o \
     zio_checksum.o \
-    zio_compress.o
+    zio_compress.o \
+    zio_inject.o
 
 ZFS_SHARED_OBJS += \
     zfs_namecheck.o \
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index bd8a110990..904e746721 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -28,8 +28,8 @@
 /*
  * DVA-based Adjustable Relpacement Cache
  *
- * While much of the theory of operation and algorithms used here
- * are based on the self-tuning, low overhead replacement cache
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
  * presented by Megiddo and Modha at FAST 2003, there are some
  * significant differences:
  *
@@ -98,6 +98,15 @@
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the "top" state mutex must be held before the "bot" state mutex.
  *
+ * Arc buffers may have an associated eviction callback function.
+ * This function will be invoked prior to removing the buffer (e.g.
+ * in arc_do_user_evicts()).  Note however that the data associated
+ * with the buffer may be evicted prior to the callback.  The callback
+ * must be made with *no locks held* (to prevent deadlock).  Additionally,
+ * the users of callbacks must ensure that their private data is
+ * protected from simultaneous callbacks from arc_buf_evict()
+ * and arc_do_user_evicts().
+ *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  */
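The comment block added above spells out a locking contract for the new eviction callbacks (b_efunc): the data may vanish before the callback fires, the callback must run with no ARC locks held, and users must guard their own private state. Below is a minimal userland sketch of that contract; every name here (user_buf_t, cache_evict, cache_do_user_evicts) is invented for illustration and is not code from this patch:

```c
/*
 * Hedged sketch of the eviction-callback contract described above.
 * The names mirror b_efunc/b_private and arc_do_user_evicts() only
 * in spirit; this is not the kernel implementation.
 */
#include <pthread.h>
#include <stdlib.h>

typedef struct user_buf {
    void *data;                       /* may be reclaimed before callback */
    int (*efunc)(struct user_buf *);  /* eviction callback, like b_efunc */
    void *private;                    /* like b_private */
    struct user_buf *next;            /* eviction list linkage */
} user_buf_t;

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static user_buf_t *eviction_list;

/* Cache side: free the data and queue the buffer under the lock... */
static void
cache_evict(user_buf_t *b)
{
    pthread_mutex_lock(&cache_lock);
    free(b->data);            /* the data can go away before the callback */
    b->data = NULL;
    b->next = eviction_list;
    eviction_list = b;
    pthread_mutex_unlock(&cache_lock);
}

/* ...but invoke the callbacks with *no* cache locks held. */
static void
cache_do_user_evicts(void)
{
    pthread_mutex_lock(&cache_lock);
    while (eviction_list != NULL) {
        user_buf_t *b = eviction_list;
        eviction_list = b->next;
        pthread_mutex_unlock(&cache_lock);  /* drop lock across callback */
        (void) b->efunc(b);                 /* callee locks its own state */
        pthread_mutex_lock(&cache_lock);
    }
    pthread_mutex_unlock(&cache_lock);
}
```

Dropping the lock across the callback is what forces the second half of the contract: the user's efunc can race with another eviction path, so it must serialize on its own private data.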
@@ -136,10 +145,10 @@ static int arc_dead;
 /*
  * Note that buffers can be on one of 5 states:
  *  ARC_anon      - anonymous (discussed below)
- *  ARC_mru_top   - recently used, currently cached
- *  ARC_mru_bot   - recentely used, no longer in cache
- *  ARC_mfu_top   - frequently used, currently cached
- *  ARC_mfu_bot   - frequently used, no longer in cache
+ *  ARC_mru       - recently used, currently cached
+ *  ARC_mru_ghost - recentely used, no longer in cache
+ *  ARC_mfu       - frequently used, currently cached
+ *  ARC_mfu_ghost - frequently used, no longer in cache
  * When there are no active references to the buffer, they
  * are linked onto one of the lists in arc.  These are the
  * only buffers that can be evicted or deleted.
@@ -147,9 +156,9 @@ static int arc_dead;
  * Anonymous buffers are buffers that are not associated with
  * a DVA.  These are buffers that hold dirty block copies
  * before they are written to stable storage.  By definition,
- * they are "ref'd" and are considered part of arc_mru_top
+ * they are "ref'd" and are considered part of arc_mru
  * that cannot be freed.  Generally, they will aquire a DVA
- * as they are written and migrate onto the arc_mru_top list.
+ * as they are written and migrate onto the arc_mru list.
  */
 
 typedef struct arc_state {
@@ -162,24 +171,22 @@ typedef struct arc_state {
 
 /* The 5 states: */
 static arc_state_t ARC_anon;
-static arc_state_t ARC_mru_top;
-static arc_state_t ARC_mru_bot;
-static arc_state_t ARC_mfu_top;
-static arc_state_t ARC_mfu_bot;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
 
 static struct arc {
     arc_state_t *anon;
-    arc_state_t *mru_top;
-    arc_state_t *mru_bot;
-    arc_state_t *mfu_top;
-    arc_state_t *mfu_bot;
+    arc_state_t *mru;
+    arc_state_t *mru_ghost;
+    arc_state_t *mfu;
+    arc_state_t *mfu_ghost;
 
     uint64_t size;       /* Actual total arc size */
-    uint64_t p;          /* Target size (in bytes) of mru_top */
+    uint64_t p;          /* Target size (in bytes) of mru */
     uint64_t c;          /* Target size of cache (in bytes) */
     uint64_t c_min;      /* Minimum target cache size */
     uint64_t c_max;      /* Maximum target cache size */
-    uint64_t incr;       /* Size by which to increment arc.c */
-    int64_t  size_check;
 
     /* performance stats */
     uint64_t hits;
@@ -195,12 +202,6 @@ static struct arc {
     int no_grow;         /* Don't try to grow cache size */
 } arc;
 
-/* Default amount to grow arc.incr */
-static int64_t arc_incr_size = 1024;
-
-/* > 0 ==> time to increment arc.c */
-static int64_t arc_size_check_default = -1000;
-
 static uint64_t arc_tempreserve;
 
 typedef struct arc_callback arc_callback_t;
@@ -227,6 +228,7 @@ struct arc_buf_hdr {
     arc_buf_hdr_t *b_hash_next;
     arc_buf_t *b_buf;
     uint32_t b_flags;
+    uint32_t b_datacnt;
 
     kcondvar_t b_cv;
     arc_callback_t *b_acb;
@@ -242,6 +244,13 @@ struct arc_buf_hdr {
     refcount_t b_refcnt;
 };
 
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static void arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+
+#define GHOST_STATE(state) \
+    ((state) == arc.mru_ghost || (state) == arc.mfu_ghost)
+
 /*
  * Private ARC flags.  These flags are private ARC only flags that will show up
  * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
@@ -250,13 +259,17 @@ struct arc_buf_hdr {
  * public flags, make sure not to smash the private ones.
  */
 
+#define ARC_IN_HASH_TABLE   (1 << 9)    /* this buffer is hashed */
 #define ARC_IO_IN_PROGRESS  (1 << 10)   /* I/O in progress for buf */
 #define ARC_IO_ERROR        (1 << 11)   /* I/O failed for buf */
 #define ARC_FREED_IN_READ   (1 << 12)   /* buf freed while in read */
+#define ARC_BUF_AVAILABLE   (1 << 13)   /* block not in active use */
 
+#define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
+#define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 
 /*
  * Hash table routines
@@ -353,6 +366,7 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
     arc_buf_hdr_t *fbuf;
     uint32_t max, i;
 
+    ASSERT(!HDR_IN_HASH_TABLE(buf));
     fbufs_lastthread = curthread;
     *lockp = hash_lock;
     mutex_enter(hash_lock);
@@ -366,6 +380,7 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 
     buf->b_hash_next = buf_hash_table.ht_table[idx];
     buf_hash_table.ht_table[idx] = buf;
+    buf->b_flags |= ARC_IN_HASH_TABLE;
 
     /* collect some hash table performance data */
     if (i > 0) {
@@ -391,6 +406,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
     uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 
     ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+    ASSERT(HDR_IN_HASH_TABLE(buf));
 
     bufp = &buf_hash_table.ht_table[idx];
     while ((fbuf = *bufp) != buf) {
@@ -399,6 +415,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
     }
     *bufp = buf->b_hash_next;
     buf->b_hash_next = NULL;
+    buf->b_flags &= ~ARC_IN_HASH_TABLE;
 
     /* collect some hash table performance data */
     atomic_add_64(&arc.hash_elements, -1);
@@ -456,6 +473,7 @@ hdr_dest(void *vbuf, void *unused)
     cv_destroy(&buf->b_cv);
 }
 
+static int arc_reclaim_needed(void);
 void arc_kmem_reclaim(void);
 
 /*
@@ -466,27 +484,33 @@ static void
 hdr_recl(void *unused)
 {
     dprintf("hdr_recl called\n");
-    arc_kmem_reclaim();
+    if (arc_reclaim_needed())
+        arc_kmem_reclaim();
 }
 
 static void
 buf_init(void)
 {
     uint64_t *ct;
-    uint64_t hsize = 1ULL << 10;
+    uint64_t hsize = 1ULL << 12;
     int i, j;
 
     /*
      * The hash table is big enough to fill all of physical memory
-     * with an average 4k block size.  The table will take up
-     * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
-     * pointers).
+     * with an average 64K block size.  The table will take up
+     * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
      */
-    while (hsize * 4096 < physmem * PAGESIZE)
+    while (hsize * 65536 < physmem * PAGESIZE)
         hsize <<= 1;
-
+retry:
     buf_hash_table.ht_mask = hsize - 1;
-    buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+    buf_hash_table.ht_table =
+        kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+    if (buf_hash_table.ht_table == NULL) {
+        ASSERT(hsize > (1ULL << 8));
+        hsize >>= 1;
+        goto retry;
+    }
 
     hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
         0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
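The reworked buf_init() sizes the hash table for an average 64K block instead of 4K, and, now that it allocates with KM_NOSLEEP, halves the table until the allocation succeeds. A standalone arithmetic sketch of that policy follows; the 2GB physmem figure and all names are made up for illustration:

```c
/*
 * Sketch of the buf_init() sizing policy above: pick the smallest
 * power of two such that one hash slot covers an average 64K block,
 * and halve it on allocation failure (mimicking KM_NOSLEEP).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    uint64_t physmem = 2ULL * 1024 * 1024 * 1024;   /* assume 2GB */
    uint64_t hsize = 1ULL << 12;
    void **table;

    while (hsize * 65536 < physmem)
        hsize <<= 1;
retry:
    table = calloc(hsize, sizeof (void *));
    if (table == NULL) {        /* fall back like the KM_NOSLEEP path */
        hsize >>= 1;
        goto retry;
    }
    /* 2GB / 64K = 32768 slots -> 256KB of pointers on LP64 */
    printf("hsize=%llu (%llu KB of table)\n",
        (unsigned long long)hsize,
        (unsigned long long)(hsize * sizeof (void *) / 1024));
    free(table);
    return (0);
}
```

This matches the updated comment's figure of roughly 128KB of table per GB of memory with 8-byte pointers.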
@@ -505,8 +529,6 @@ buf_init(void)
 
 #define ARC_MINTIME (hz>>4) /* 62 ms */
 
-#define ARC_TAG (void *)0x05201962
-
 static void
 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 {
@@ -514,14 +536,21 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 
     if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
         (ab->b_state != arc.anon)) {
+        int delta = ab->b_size * ab->b_datacnt;
 
         ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
         mutex_enter(&ab->b_state->mtx);
-        ASSERT(!refcount_is_zero(&ab->b_refcnt));
+        ASSERT(refcount_count(&ab->b_refcnt) > 0);
         ASSERT(list_link_active(&ab->b_arc_node));
        list_remove(&ab->b_state->list, ab);
-        ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
-        ab->b_state->lsize -= ab->b_size;
+        if (GHOST_STATE(ab->b_state)) {
+            ASSERT3U(ab->b_datacnt, ==, 0);
+            ASSERT3P(ab->b_buf, ==, NULL);
+            delta = ab->b_size;
+        }
+        ASSERT(delta > 0);
+        ASSERT3U(ab->b_state->lsize, >=, delta);
+        atomic_add_64(&ab->b_state->lsize, -delta);
        mutex_exit(&ab->b_state->mtx);
     }
 }
@@ -531,7 +560,8 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 {
     int cnt;
 
-    ASSERT(MUTEX_HELD(hash_lock));
+    ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock));
+    ASSERT(!GHOST_STATE(ab->b_state));
 
     if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
         (ab->b_state != arc.anon)) {
@@ -540,8 +570,9 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
        mutex_enter(&ab->b_state->mtx);
        ASSERT(!list_link_active(&ab->b_arc_node));
        list_insert_head(&ab->b_state->list, ab);
-        ASSERT(ab->b_buf != NULL);
-        ab->b_state->lsize += ab->b_size;
+        ASSERT(ab->b_datacnt > 0);
+        atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt);
+        ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize);
        mutex_exit(&ab->b_state->mtx);
     }
     return (cnt);
@@ -552,49 +583,70 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
  * for the buffer must be held by the caller.
  */
 static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
-    kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 {
-    arc_buf_t *buf;
+    arc_state_t *old_state = ab->b_state;
+    int refcnt = refcount_count(&ab->b_refcnt);
+    int from_delta, to_delta;
 
     ASSERT(MUTEX_HELD(hash_lock));
+    ASSERT(new_state != old_state);
+    ASSERT(refcnt == 0 || ab->b_datacnt > 0);
+    ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+
+    from_delta = to_delta = ab->b_datacnt * ab->b_size;
 
     /*
      * If this buffer is evictable, transfer it from the
      * old state list to the new state list.
     */
-    if (refcount_is_zero(&ab->b_refcnt)) {
-        if (ab->b_state != arc.anon) {
-            int drop_mutex = FALSE;
+    if (refcnt == 0) {
+        if (old_state != arc.anon) {
+            int use_mutex = !MUTEX_HELD(&old_state->mtx);
+
+            if (use_mutex)
+                mutex_enter(&old_state->mtx);
 
-            if (!MUTEX_HELD(&ab->b_state->mtx)) {
-                mutex_enter(&ab->b_state->mtx);
-                drop_mutex = TRUE;
-            }
            ASSERT(list_link_active(&ab->b_arc_node));
-            list_remove(&ab->b_state->list, ab);
-            ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
-            ab->b_state->lsize -= ab->b_size;
-            if (drop_mutex)
-                mutex_exit(&ab->b_state->mtx);
+            list_remove(&old_state->list, ab);
+
+            /* ghost elements have a ghost size */
+            if (GHOST_STATE(old_state)) {
+                ASSERT(ab->b_datacnt == 0);
+                ASSERT(ab->b_buf == NULL);
+                from_delta = ab->b_size;
+            }
+            ASSERT3U(old_state->lsize, >=, from_delta);
+            atomic_add_64(&old_state->lsize, -from_delta);
+
+            if (use_mutex)
+                mutex_exit(&old_state->mtx);
        }
        if (new_state != arc.anon) {
-            int drop_mutex = FALSE;
+            int use_mutex = !MUTEX_HELD(&new_state->mtx);
 
-            if (!MUTEX_HELD(&new_state->mtx)) {
+            if (use_mutex)
                mutex_enter(&new_state->mtx);
-                drop_mutex = TRUE;
-            }
+
            list_insert_head(&new_state->list, ab);
-            ASSERT(ab->b_buf != NULL);
-            new_state->lsize += ab->b_size;
-            if (drop_mutex)
+
+            /* ghost elements have a ghost size */
+            if (GHOST_STATE(new_state)) {
+                ASSERT(ab->b_datacnt == 0);
+                ASSERT(ab->b_buf == NULL);
+                to_delta = ab->b_size;
+            }
+            atomic_add_64(&new_state->lsize, to_delta);
+            ASSERT3U(new_state->size + to_delta, >=,
+                new_state->lsize);
+
+            if (use_mutex)
                mutex_exit(&new_state->mtx);
        }
     }
 
     ASSERT(!BUF_EMPTY(ab));
-    if (new_state == arc.anon && ab->b_state != arc.anon) {
+    if (new_state == arc.anon && old_state != arc.anon) {
        buf_hash_remove(ab);
     }
 
@@ -602,22 +654,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
     /*
      * If this buffer isn't being transferred to the MRU-top
      * state, it's safe to clear its prefetch flag
     */
-    if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
+    if ((new_state != arc.mru) && (new_state != arc.mru_ghost)) {
        ab->b_flags &= ~ARC_PREFETCH;
     }
 
-    buf = ab->b_buf;
-    if (buf == NULL) {
-        ASSERT3U(ab->b_state->size, >=, ab->b_size);
-        atomic_add_64(&ab->b_state->size, -ab->b_size);
-        /* we should only be here if we are deleting state */
-        ASSERT(new_state == arc.anon &&
-            (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
-    } else while (buf) {
-        ASSERT3U(ab->b_state->size, >=, ab->b_size);
-        atomic_add_64(&ab->b_state->size, -ab->b_size);
-        atomic_add_64(&new_state->size, ab->b_size);
-        buf = buf->b_next;
+    /* adjust state sizes */
+    if (to_delta)
+        atomic_add_64(&new_state->size, to_delta);
+    if (from_delta) {
+        ASSERT3U(old_state->size, >=, from_delta);
+        atomic_add_64(&old_state->size, -from_delta);
     }
     ab->b_state = new_state;
 }
@@ -637,9 +683,12 @@ arc_buf_alloc(spa_t *spa, int size, void *tag)
     hdr->b_arc_access = 0;
     buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
     buf->b_hdr = hdr;
+    buf->b_efunc = NULL;
+    buf->b_private = NULL;
     buf->b_next = NULL;
     buf->b_data = zio_buf_alloc(size);
     hdr->b_buf = buf;
+    hdr->b_datacnt = 1;
     hdr->b_flags = 0;
     ASSERT(refcount_is_zero(&hdr->b_refcnt));
     (void) refcount_add(&hdr->b_refcnt, tag);
@@ -650,35 +699,124 @@ arc_buf_alloc(spa_t *spa, int size, void *tag)
     return (buf);
 }
 
+static void *
+arc_data_copy(arc_buf_hdr_t *hdr, void *old_data)
+{
+    void *new_data = zio_buf_alloc(hdr->b_size);
+
+    atomic_add_64(&arc.size, hdr->b_size);
+    bcopy(old_data, new_data, hdr->b_size);
+    atomic_add_64(&hdr->b_state->size, hdr->b_size);
+    if (list_link_active(&hdr->b_arc_node)) {
+        ASSERT(refcount_is_zero(&hdr->b_refcnt));
+        atomic_add_64(&hdr->b_state->lsize, hdr->b_size);
+    }
+    return (new_data);
+}
+
+void
+arc_buf_add_ref(arc_buf_t *buf, void* tag)
+{
+    arc_buf_hdr_t *hdr;
+    kmutex_t *hash_lock;
+
+    mutex_enter(&arc_eviction_mtx);
+    hdr = buf->b_hdr;
+    if (buf->b_data == NULL) {
+        /*
+         * This buffer is evicted.
         */
+        mutex_exit(&arc_eviction_mtx);
+        return;
+    } else {
+        /*
+         * Prevent this buffer from being evicted
+         * while we add a reference.
         */
+        buf->b_hdr = NULL;
+    }
+    mutex_exit(&arc_eviction_mtx);
+
+    ASSERT(hdr->b_state != arc.anon);
+    hash_lock = HDR_LOCK(hdr);
+    mutex_enter(hash_lock);
+    ASSERT(!GHOST_STATE(hdr->b_state));
+    buf->b_hdr = hdr;
+    add_reference(hdr, hash_lock, tag);
+    arc_access_and_exit(hdr, hash_lock);
+    atomic_add_64(&arc.hits, 1);
+}
+
+static void
+arc_buf_destroy(arc_buf_t *buf, boolean_t all)
+{
+    arc_buf_t **bufp;
+
+    /* free up data associated with the buf */
+    if (buf->b_data) {
+        arc_state_t *state = buf->b_hdr->b_state;
+        uint64_t size = buf->b_hdr->b_size;
+
+        zio_buf_free(buf->b_data, size);
+        atomic_add_64(&arc.size, -size);
+        if (list_link_active(&buf->b_hdr->b_arc_node)) {
+            ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
+            ASSERT(state != arc.anon);
+            ASSERT3U(state->lsize, >=, size);
+            atomic_add_64(&state->lsize, -size);
+        }
+        ASSERT3U(state->size, >=, size);
+        atomic_add_64(&state->size, -size);
+        buf->b_data = NULL;
+        ASSERT(buf->b_hdr->b_datacnt > 0);
+        buf->b_hdr->b_datacnt -= 1;
+    }
+
+    /* only remove the buf if requested */
+    if (!all)
+        return;
+
+    /* remove the buf from the hdr list */
+    for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+        continue;
+    *bufp = buf->b_next;
+
+    ASSERT(buf->b_efunc == NULL);
+
+    /* clean up the buf */
+    buf->b_hdr = NULL;
+    kmem_cache_free(buf_cache, buf);
+}
+
 static void
-arc_hdr_free(arc_buf_hdr_t *hdr)
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
     ASSERT(refcount_is_zero(&hdr->b_refcnt));
     ASSERT3P(hdr->b_state, ==, arc.anon);
+    ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 
     if (!BUF_EMPTY(hdr)) {
-        /*
-         * We can be called with an arc state lock held,
-         * so we can't hold a hash lock here.
-         * ASSERT(not in hash table)
-         */
-        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+        ASSERT(!HDR_IN_HASH_TABLE(hdr));
        bzero(&hdr->b_dva, sizeof (dva_t));
        hdr->b_birth = 0;
        hdr->b_cksum0 = 0;
     }
-    if (hdr->b_buf) {
+    while (hdr->b_buf) {
        arc_buf_t *buf = hdr->b_buf;
 
-        ASSERT3U(hdr->b_size, >, 0);
-        zio_buf_free(buf->b_data, hdr->b_size);
-        atomic_add_64(&arc.size, -hdr->b_size);
-        ASSERT3U(arc.anon->size, >=, hdr->b_size);
-        atomic_add_64(&arc.anon->size, -hdr->b_size);
-        ASSERT3P(buf->b_next, ==, NULL);
-        kmem_cache_free(buf_cache, buf);
-        hdr->b_buf = NULL;
+        if (buf->b_efunc) {
+            mutex_enter(&arc_eviction_mtx);
+            ASSERT(buf->b_hdr != NULL);
+            arc_buf_destroy(hdr->b_buf, FALSE);
+            hdr->b_buf = buf->b_next;
+            buf->b_next = arc_eviction_list;
+            arc_eviction_list = buf;
+            mutex_exit(&arc_eviction_mtx);
+        } else {
+            arc_buf_destroy(hdr->b_buf, TRUE);
+        }
     }
+
     ASSERT(!list_link_active(&hdr->b_arc_node));
     ASSERT3P(hdr->b_hash_next, ==, NULL);
     ASSERT3P(hdr->b_acb, ==, NULL);
@@ -689,36 +827,73 @@ void
 arc_buf_free(arc_buf_t *buf, void *tag)
 {
     arc_buf_hdr_t *hdr = buf->b_hdr;
-    kmutex_t *hash_lock = HDR_LOCK(hdr);
-    int freeable;
+    int hashed = hdr->b_state != arc.anon;
 
-    mutex_enter(hash_lock);
-    if (remove_reference(hdr, hash_lock, tag) > 0) {
-        arc_buf_t **bufp = &hdr->b_buf;
-        arc_state_t *state = hdr->b_state;
-        uint64_t size = hdr->b_size;
-
-        ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
-        while (*bufp != buf) {
-            ASSERT(*bufp);
-            bufp = &(*bufp)->b_next;
-        }
-        *bufp = buf->b_next;
+    ASSERT(buf->b_efunc == NULL);
+    ASSERT(buf->b_data != NULL);
+
+    if (hashed) {
+        kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+        mutex_enter(hash_lock);
+        (void) remove_reference(hdr, hash_lock, tag);
+        if (hdr->b_datacnt > 1)
+            arc_buf_destroy(buf, TRUE);
+        else
+            hdr->b_flags |= ARC_BUF_AVAILABLE;
        mutex_exit(hash_lock);
-        zio_buf_free(buf->b_data, size);
-        atomic_add_64(&arc.size, -size);
-        kmem_cache_free(buf_cache, buf);
-        ASSERT3U(state->size, >=, size);
-        atomic_add_64(&state->size, -size);
-        return;
+    } else if (HDR_IO_IN_PROGRESS(hdr)) {
+        int destroy_hdr;
+        /*
+         * We are in the middle of an async write.  Don't destroy
+         * this buffer unless the write completes before we finish
+         * decrementing the reference count.
         */
+        mutex_enter(&arc_eviction_mtx);
+        (void) remove_reference(hdr, NULL, tag);
+        ASSERT(refcount_is_zero(&hdr->b_refcnt));
+        destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
+        mutex_exit(&arc_eviction_mtx);
+        if (destroy_hdr)
+            arc_hdr_destroy(hdr);
+    } else {
+        if (remove_reference(hdr, NULL, tag) > 0) {
+            ASSERT(HDR_IO_ERROR(hdr));
+            arc_buf_destroy(buf, TRUE);
+        } else {
+            arc_hdr_destroy(hdr);
+        }
     }
+}
 
-    /* don't free buffers that are in the middle of an async write */
-    freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
-    mutex_exit(hash_lock);
+int
+arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+{
+    arc_buf_hdr_t *hdr = buf->b_hdr;
+    kmutex_t *hash_lock = HDR_LOCK(hdr);
+    int no_callback = (buf->b_efunc == NULL);
 
-    if (freeable)
-        arc_hdr_free(hdr);
+    if (hdr->b_state == arc.anon) {
+        arc_buf_free(buf, tag);
+        return (no_callback);
+    }
+
+    mutex_enter(hash_lock);
+    ASSERT(hdr->b_state != arc.anon);
+    ASSERT(buf->b_data != NULL);
+
+    (void) remove_reference(hdr, hash_lock, tag);
+    if (hdr->b_datacnt > 1) {
+        if (no_callback)
+            arc_buf_destroy(buf, TRUE);
+    } else if (no_callback) {
+        ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+        hdr->b_flags |= ARC_BUF_AVAILABLE;
+    }
+    ASSERT(no_callback || hdr->b_datacnt > 1 ||
+        refcount_is_zero(&hdr->b_refcnt));
+    mutex_exit(hash_lock);
+    return (no_callback);
 }
 
 int
@@ -732,19 +907,16 @@ arc_buf_size(arc_buf_t *buf)
  * bytes.  Move the removed buffers to the appropriate evict state.
 */
 static uint64_t
-arc_evict_state(arc_state_t *state, int64_t bytes)
+arc_evict(arc_state_t *state, int64_t bytes)
 {
     arc_state_t *evicted_state;
-    uint64_t bytes_evicted = 0;
+    uint64_t bytes_evicted = 0, skipped = 0;
     arc_buf_hdr_t *ab, *ab_prev;
     kmutex_t *hash_lock;
 
-    ASSERT(state == arc.mru_top || state == arc.mfu_top);
+    ASSERT(state == arc.mru || state == arc.mfu);
 
-    if (state == arc.mru_top)
-        evicted_state = arc.mru_bot;
-    else
-        evicted_state = arc.mfu_bot;
+    evicted_state = (state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
 
     mutex_enter(&state->mtx);
     mutex_enter(&evicted_state->mtx);
@@ -754,19 +926,42 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
        hash_lock = HDR_LOCK(ab);
        if (mutex_tryenter(hash_lock)) {
            ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+            ASSERT(ab->b_datacnt > 0);
+            while (ab->b_buf) {
+                arc_buf_t *buf = ab->b_buf;
+                if (buf->b_data)
+                    bytes_evicted += ab->b_size;
+                if (buf->b_efunc) {
+                    mutex_enter(&arc_eviction_mtx);
+                    /*
+                     * arc_buf_add_ref() could derail
+                     * this eviction.
                     */
+                    if (buf->b_hdr == NULL) {
+                        mutex_exit(&arc_eviction_mtx);
+                        mutex_exit(hash_lock);
+                        goto skip;
+                    }
+                    arc_buf_destroy(buf, FALSE);
+                    ab->b_buf = buf->b_next;
+                    buf->b_next = arc_eviction_list;
+                    arc_eviction_list = buf;
+                    mutex_exit(&arc_eviction_mtx);
+                } else {
+                    arc_buf_destroy(buf, TRUE);
+                }
+            }
+            ASSERT(ab->b_datacnt == 0);
            arc_change_state(evicted_state, ab, hash_lock);
-            zio_buf_free(ab->b_buf->b_data, ab->b_size);
-            atomic_add_64(&arc.size, -ab->b_size);
-            ASSERT3P(ab->b_buf->b_next, ==, NULL);
-            kmem_cache_free(buf_cache, ab->b_buf);
-            ab->b_buf = NULL;
+            ASSERT(HDR_IN_HASH_TABLE(ab));
+            ab->b_flags = ARC_IN_HASH_TABLE;
            DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
-            bytes_evicted += ab->b_size;
            mutex_exit(hash_lock);
-            if (bytes_evicted >= bytes)
+            if (bytes >= 0 && bytes_evicted >= bytes)
                break;
        } else {
-            atomic_add_64(&arc.skipped, 1);
+skip:
+            skipped += 1;
        }
     }
     mutex_exit(&evicted_state->mtx);
@@ -776,6 +971,9 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
        dprintf("only evicted %lld bytes from %x",
            (longlong_t)bytes_evicted, state);
 
+    atomic_add_64(&arc.skipped, skipped);
+    if (bytes < 0)
+        return (skipped);
 
     return (bytes_evicted);
 }
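Note what arc_evict() (formerly arc_evict_state) now leaves behind: the data buffers are destroyed, but the header moves to a ghost list (mru_ghost/mfu_ghost) that remembers only the block size. A toy illustration of that bookkeeping — every name here is invented, and this is a model of the idea, not the kernel code:

```c
/*
 * Mock of ghost-list bookkeeping: evicting a block frees its data but
 * keeps a small header recording its size, so a later "ghost hit" can
 * tell the cache it guessed wrong about what to keep.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct ghost_hdr {
    uint64_t size;              /* like b_size: the "ghost size" */
    struct ghost_hdr *next;
} ghost_hdr_t;

static ghost_hdr_t *ghost_list;
static uint64_t cached_bytes, ghost_bytes;

static void
evict_to_ghost(void *data, uint64_t size)
{
    ghost_hdr_t *h = malloc(sizeof (*h));

    free(data);                 /* the data itself is gone... */
    cached_bytes -= size;
    h->size = size;             /* ...but its size is remembered */
    h->next = ghost_list;
    ghost_list = h;
    ghost_bytes += size;        /* the ghost state's "size" accounting */
}

int
main(void)
{
    void *blk = malloc(65536);

    cached_bytes = 65536;
    evict_to_ghost(blk, 65536);
    /* a later hit on this entry is a ghost hit: see arc_adapt() below */
    printf("cached=%llu ghost=%llu\n",
        (unsigned long long)cached_bytes,
        (unsigned long long)ghost_bytes);
    return (0);
}
```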
@@ -784,25 +982,27 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
  * bytes.  Destroy the buffers that are removed.
 */
 static void
-arc_delete_state(arc_state_t *state, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, int64_t bytes)
 {
-    uint_t bufs_skipped = 0;
-    uint64_t bytes_deleted = 0;
     arc_buf_hdr_t *ab, *ab_prev;
     kmutex_t *hash_lock;
+    uint64_t bytes_deleted = 0;
+    uint_t bufs_skipped = 0;
 
+    ASSERT(GHOST_STATE(state));
 top:
     mutex_enter(&state->mtx);
     for (ab = list_tail(&state->list); ab; ab = ab_prev) {
        ab_prev = list_prev(&state->list, ab);
        hash_lock = HDR_LOCK(ab);
        if (mutex_tryenter(hash_lock)) {
+            ASSERT(ab->b_buf == NULL);
            arc_change_state(arc.anon, ab, hash_lock);
            mutex_exit(hash_lock);
            atomic_add_64(&arc.deleted, 1);
-            DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
            bytes_deleted += ab->b_size;
-            arc_hdr_free(ab);
+            arc_hdr_destroy(ab);
+            DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
            if (bytes >= 0 && bytes_deleted >= bytes)
                break;
        } else {
@@ -832,41 +1032,62 @@
 arc_adjust(void)
 {
     int64_t top_sz, mru_over, arc_over;
 
-    top_sz = arc.anon->size + arc.mru_top->size;
+    top_sz = arc.anon->size + arc.mru->size;
 
-    if (top_sz > arc.p && arc.mru_top->lsize > 0) {
-        int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
-        (void) arc_evict_state(arc.mru_top, toevict);
-        top_sz = arc.anon->size + arc.mru_top->size;
+    if (top_sz > arc.p && arc.mru->lsize > 0) {
+        int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
+        (void) arc_evict(arc.mru, toevict);
+        top_sz = arc.anon->size + arc.mru->size;
     }
 
-    mru_over = top_sz + arc.mru_bot->size - arc.c;
+    mru_over = top_sz + arc.mru_ghost->size - arc.c;
 
     if (mru_over > 0) {
-        if (arc.mru_bot->lsize > 0) {
-            int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
-            arc_delete_state(arc.mru_bot, todelete);
+        if (arc.mru_ghost->lsize > 0) {
+            int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over);
+            arc_evict_ghost(arc.mru_ghost, todelete);
        }
     }
 
     if ((arc_over = arc.size - arc.c) > 0) {
-        int64_t table_over;
+        int64_t tbl_over;
 
-        if (arc.mfu_top->lsize > 0) {
-            int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
-            (void) arc_evict_state(arc.mfu_top, toevict);
+        if (arc.mfu->lsize > 0) {
+            int64_t toevict = MIN(arc.mfu->lsize, arc_over);
+            (void) arc_evict(arc.mfu, toevict);
        }
 
-        table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
-            - arc.c*2;
+        tbl_over = arc.size + arc.mru_ghost->lsize +
+            arc.mfu_ghost->lsize - arc.c*2;
 
-        if (table_over > 0 && arc.mfu_bot->lsize > 0) {
-            int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
-            arc_delete_state(arc.mfu_bot, todelete);
+        if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) {
+            int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over);
+            arc_evict_ghost(arc.mfu_ghost, todelete);
        }
     }
 }
 
+static void
+arc_do_user_evicts(void)
+{
+    mutex_enter(&arc_eviction_mtx);
+    while (arc_eviction_list != NULL) {
+        arc_buf_t *buf = arc_eviction_list;
+        arc_eviction_list = buf->b_next;
+        buf->b_hdr = NULL;
+        mutex_exit(&arc_eviction_mtx);
+
+        ASSERT(buf->b_efunc != NULL);
+        VERIFY(buf->b_efunc(buf) == 0);
+
+        buf->b_efunc = NULL;
+        buf->b_private = NULL;
+        kmem_cache_free(buf_cache, buf);
+        mutex_enter(&arc_eviction_mtx);
+    }
+    mutex_exit(&arc_eviction_mtx);
+}
+
 /*
  * Flush all *evictable* data from the cache.
  * NOTE: this will not touch "active" (i.e. referenced) data.
@@ -874,17 +1095,22 @@ arc_adjust(void)
 void
 arc_flush(void)
 {
-    arc_delete_state(arc.mru_top, -1);
-    arc_delete_state(arc.mfu_top, -1);
+    while (arc_evict(arc.mru, -1));
+    while (arc_evict(arc.mfu, -1));
 
-    arc_delete_state(arc.mru_bot, -1);
-    arc_delete_state(arc.mfu_bot, -1);
+    arc_evict_ghost(arc.mru_ghost, -1);
+    arc_evict_ghost(arc.mfu_ghost, -1);
+
+    mutex_enter(&arc_reclaim_thr_lock);
+    arc_do_user_evicts();
+    mutex_exit(&arc_reclaim_thr_lock);
+    ASSERT(arc_eviction_list == NULL);
 }
 
 void
 arc_kmem_reclaim(void)
 {
-    /* Remove 6.25% */
+    /* Remove 12.5% */
     /*
      * We need arc_reclaim_lock because we don't want multiple
      * threads trying to reclaim concurrently.
@@ -898,19 +1124,23 @@ arc_kmem_reclaim(void)
     if (arc_dead)
        return;
 
+    if (arc.c <= arc.c_min)
+        return;
+
     mutex_enter(&arc_reclaim_lock);
 
-    atomic_add_64(&arc.c, -(arc.c >> 4));
+    atomic_add_64(&arc.c, -(arc.c >> 3));
+    atomic_add_64(&arc.p, -(arc.p >> 3));
+    if (arc.c > arc.size)
+        arc.c = arc.size;
     if (arc.c < arc.c_min)
        arc.c = arc.c_min;
-    atomic_add_64(&arc.p, -(arc.p >> 4));
+    if (arc.p > arc.c)
+        arc.p = (arc.c >> 1);
+    ASSERT((int64_t)arc.p >= 0);
 
     arc_adjust();
 
-    /* Cool it for a while */
-    arc.incr = 0;
-    arc.size_check = arc_size_check_default << 3;
-
     mutex_exit(&arc_reclaim_lock);
 }
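arc_kmem_reclaim() now sheds 12.5% (c >> 3) of both the cache target c and the MRU target p instead of 6.25%, and clamps the results. A small worked example of that arithmetic, with made-up starting values:

```c
/*
 * Arithmetic sketch of the new arc_kmem_reclaim() policy above.
 * Starting values are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t c = 1024ULL * 1024 * 1024;     /* 1GB cache target */
    uint64_t p = c / 2;                     /* MRU target */
    uint64_t size = 900ULL * 1024 * 1024;   /* actual arc size */
    uint64_t c_min = 64ULL * 1024 * 1024;

    c -= c >> 3;        /* shed 12.5%; the old code used c >> 4 (6.25%) */
    p -= p >> 3;
    if (c > size)       /* don't target more than we actually hold */
        c = size;
    if (c < c_min)
        c = c_min;
    if (p > c)          /* keep the MRU target inside the cache target */
        p = c >> 1;
    printf("c=%lluMB p=%lluMB\n",
        (unsigned long long)(c >> 20), (unsigned long long)(p >> 20));
    return (0);
}
```

With these inputs the targets land at c=896MB and p=448MB: a single reclaim pass is noticeably more aggressive than before, which is why the new `arc.c <= arc.c_min` early return matters.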
@@ -985,16 +1215,11 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 #endif
 
     /*
-     * an agressive reclamation will shrink the cache size as well as reap
-     * free kmem buffers.  The arc_kmem_reclaim function is called when the
-     * header-cache is reaped, so we only reap the header cache if we're
-     * performing an agressive reclaim.  If we're not, just clean the kmem
-     * buffer caches.
+     * An agressive reclamation will shrink the cache size as well as
+     * reap free buffers from the arc kmem caches.
     */
     if (strat == ARC_RECLAIM_AGGR)
-        kmem_cache_reap_now(hdr_cache);
-
-    kmem_cache_reap_now(buf_cache);
+        arc_kmem_reclaim();
 
     for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
        if (zio_buf_cache[i] != prev_cache) {
@@ -1002,6 +1227,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
            prev_cache = zio_buf_cache[i];
            kmem_cache_reap_now(zio_buf_cache[i]);
        }
     }
+    kmem_cache_reap_now(buf_cache);
+    kmem_cache_reap_now(hdr_cache);
 }
 
 static void
@@ -1038,6 +1265,9 @@ arc_reclaim_thread(void)
            arc.no_grow = FALSE;
        }
 
+        if (arc_eviction_list != NULL)
+            arc_do_user_evicts();
+
        /* block until needed, or one second, whichever is shorter */
        CALLB_CPR_SAFE_BEGIN(&cpr);
        (void) cv_timedwait(&arc_reclaim_thr_cv,
@@ -1051,14 +1281,37 @@ arc_reclaim_thread(void)
     thread_exit();
 }
 
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are comming from.  This function is only called
+ * when we are adding new content to the cache.
+ */
 static void
-arc_try_grow(int64_t bytes)
+arc_adapt(int bytes, arc_state_t *state)
 {
+    int mult;
+
+    ASSERT(bytes > 0);
     /*
-     * If we're within (2 * maxblocksize) bytes of the target
-     * cache size, increment the target cache size
+     * Adapt the target size of the MRU list:
+     *  - if we just hit in the MRU ghost list, then increase
+     *    the target size of the MRU list.
+     *  - if we just hit in the MFU ghost list, then increase
+     *    the target size of the MFU list by decreasing the
+     *    target size of the MRU list.
     */
-    atomic_add_64((uint64_t *)&arc.size_check, 1);
+    if (state == arc.mru_ghost) {
+        mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ?
+            1 : (arc.mfu_ghost->size/arc.mru_ghost->size));
+
+        arc.p = MIN(arc.c, arc.p + bytes * mult);
+    } else if (state == arc.mfu_ghost) {
+        mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ?
+            1 : (arc.mru_ghost->size/arc.mfu_ghost->size));
+
+        arc.p = MAX(0, (int64_t)arc.p - bytes * mult);
+    }
+    ASSERT((int64_t)arc.p >= 0);
 
     if (arc_reclaim_needed()) {
        cv_signal(&arc_reclaim_thr_cv);
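arc_adapt() is the heart of the new sizing logic: a hit in a ghost list nudges the MRU target p toward whichever side was wrongly evicted, scaled by the relative ghost-list sizes. A self-contained mock of just that rule — all values and names are arbitrary, and this is a paraphrase rather than the kernel code:

```c
/*
 * Sketch of the arc_adapt() balancing rule: MRU-ghost hits grow p
 * (favor recency), MFU-ghost hits shrink p (favor frequency), each
 * scaled by how lopsided the two ghost lists are.
 */
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)   ((a) < (b) ? (a) : (b))
#define MAX(a, b)   ((a) > (b) ? (a) : (b))

static uint64_t c = 1000, p = 500;          /* arbitrary units */
static uint64_t mru_ghost = 100, mfu_ghost = 300;

static void
adapt(int bytes, int mru_ghost_hit)
{
    int64_t mult;

    if (mru_ghost_hit) {    /* recency was being punished: grow p */
        mult = (mru_ghost >= mfu_ghost) ? 1 : (mfu_ghost / mru_ghost);
        p = MIN(c, p + (uint64_t)(bytes * mult));
    } else {                /* frequency was being punished: shrink p */
        mult = (mfu_ghost >= mru_ghost) ? 1 : (mru_ghost / mfu_ghost);
        p = (uint64_t)MAX(0, (int64_t)p - bytes * mult);
    }
}

int
main(void)
{
    adapt(10, 1);   /* MRU ghost hit: mult = 300/100 = 3, so p += 30 */
    printf("p after mru-ghost hit: %llu\n", (unsigned long long)p);
    adapt(10, 0);   /* MFU ghost hit: mult = 1, so p -= 10 */
    printf("p after mfu-ghost hit: %llu\n", (unsigned long long)p);
    return (0);
}
```

The asymmetry in `mult` means the smaller ghost list pushes harder: if MRU ghosts are rare but one gets hit anyway, that is strong evidence p is too small.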
@@ -1068,52 +1321,36 @@ arc_adapt(int bytes, arc_state_t *state)
     if (arc.no_grow)
        return;
 
+    if (arc.c >= arc.c_max)
+        return;
+
     /*
-     * return true if we successfully grow, or if there's enough space that
-     * we don't have to grow.  Above, we return false if we can't grow, or
-     * if we shouldn't because a reclaim is in progress.
+     * If we're within (2 * maxblocksize) bytes of the target
+     * cache size, increment the target cache size
     */
-    if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
-        if (arc.size_check > 0) {
-            arc.size_check = arc_size_check_default;
-            atomic_add_64(&arc.incr, arc_incr_size);
-        }
-        atomic_add_64(&arc.c, MIN(bytes, arc.incr));
+    if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+        atomic_add_64(&arc.c, (int64_t)bytes);
        if (arc.c > arc.c_max)
            arc.c = arc.c_max;
-        else
-            atomic_add_64(&arc.p, MIN(bytes, arc.incr));
-    } else if (arc.size > arc.c) {
-        if (arc.size_check > 0) {
-            arc.size_check = arc_size_check_default;
-            atomic_add_64(&arc.incr, arc_incr_size);
-        }
-        atomic_add_64(&arc.c, MIN(bytes, arc.incr));
-        if (arc.c > arc.c_max)
-            arc.c = arc.c_max;
-        else
-            atomic_add_64(&arc.p, MIN(bytes, arc.incr));
+        else if (state == arc.anon)
+            atomic_add_64(&arc.p, (int64_t)bytes);
+        if (arc.p > arc.c)
+            arc.p = arc.c;
     }
+    ASSERT((int64_t)arc.p >= 0);
 }
 
 /*
- * check if the cache has reached its limits and eviction is required prior to
- * insert.  In this situation, we want to evict if no_grow is set Otherwise, the
- * cache is either big enough that we can insert, or a arc_try_grow will result
- * in more space being made available.
+ * Check if the cache has reached its limits and eviction is required
+ * prior to insert.
 */
-
 static int
 arc_evict_needed()
 {
-
     if (arc_reclaim_needed())
        return (1);
 
-    if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
-        return (1);
-
-    return (0);
+    return (arc.size > arc.c);
 }
 
@@ -1121,21 +1358,21 @@ arc_evict_needed()
  * inserted on its behalf. So, determine which cache must be victimized to
  * satisfy an insertion for this state.  We have the following cases:
 *
- * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
+ * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) ->
  * In this situation if we're out of space, but the resident size of the MFU is
  * under the limit, victimize the MFU cache to satisfy this insertion request.
 *
- * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
+ * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) ->
  * Here, we've used up all of the available space for the MRU, so we need to
  * evict from our own cache instead.  Evict from the set of resident MRU
  * entries.
 *
- * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
+ * 3. Insert for MFU (c - p) > sizeof(arc.mfu) ->
  * c minus p represents the MFU space in the cache, since p is the size of the
  * cache that is dedicated to the MRU.  In this situation there's still space on
  * the MFU side, so the MRU side needs to be victimized.
 *
- * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
+ * 4. Insert for MFU (c - p) < sizeof(arc.mfu) ->
  * MFU's resident set is consuming more space than it has been allotted.  In
  * this situation, we must victimize our own cache, the MFU, for this insertion.
 */
@@ -1146,35 +1383,35 @@ arc_evict_for_state(arc_state_t *state, uint64_t bytes)
     uint64_t mfu_space;
     uint64_t evicted;
 
-    ASSERT(state == arc.mru_top || state == arc.mfu_top);
+    ASSERT(state == arc.mru || state == arc.mfu);
 
-    if (state == arc.mru_top) {
-        mru_used = arc.anon->size + arc.mru_top->size;
+    if (state == arc.mru) {
+        mru_used = arc.anon->size + arc.mru->size;
        if (arc.p > mru_used) {
            /* case 1 */
-            evicted = arc_evict_state(arc.mfu_top, bytes);
+            evicted = arc_evict(arc.mfu, bytes);
            if (evicted < bytes) {
                arc_adjust();
            }
        } else {
            /* case 2 */
-            evicted = arc_evict_state(arc.mru_top, bytes);
+            evicted = arc_evict(arc.mru, bytes);
            if (evicted < bytes) {
                arc_adjust();
            }
        }
     } else {
-        /* MFU_top case */
+        /* MFU case */
        mfu_space = arc.c - arc.p;
-        if (mfu_space > arc.mfu_top->size) {
+        if (mfu_space > arc.mfu->size) {
            /* case 3 */
-            evicted = arc_evict_state(arc.mru_top, bytes);
+            evicted = arc_evict(arc.mru, bytes);
            if (evicted < bytes) {
                arc_adjust();
            }
        } else {
            /* case 4 */
-            evicted = arc_evict_state(arc.mfu_top, bytes);
+            evicted = arc_evict(arc.mfu, bytes);
            if (evicted < bytes) {
                arc_adjust();
            }
        }
     }
 }
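The four cases in the comment above reduce to a small decision rule. Here it is as a pure function — a paraphrase of arc_evict_for_state()'s logic with invented names, not the kernel code itself:

```c
/*
 * Decision-rule sketch of arc_evict_for_state()'s four cases: given an
 * insert into MRU or MFU, which resident list should give up space?
 */
#include <stdint.h>

typedef enum { LIST_MRU, LIST_MFU } arc_list_t;

static arc_list_t
choose_victim(arc_list_t state, uint64_t p, uint64_t c,
    uint64_t anon_size, uint64_t mru_size, uint64_t mfu_size)
{
    if (state == LIST_MRU) {
        uint64_t mru_used = anon_size + mru_size;

        /* case 1: MRU still under its target p -> victimize MFU */
        if (p > mru_used)
            return (LIST_MFU);
        /* case 2: MRU over its target -> evict from MRU itself */
        return (LIST_MRU);
    } else {
        uint64_t mfu_space = c - p;     /* MFU's share of the cache */

        /* case 3: MFU under its share -> victimize MRU */
        if (mfu_space > mfu_size)
            return (LIST_MRU);
        /* case 4: MFU over its share -> evict from MFU itself */
        return (LIST_MFU);
    }
    (void) c;
}
```

In every case the rule is the same: the list that is currently over its allotment (p for MRU, c - p for MFU) pays for the insert.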
@@ -1184,11 +1421,13 @@ arc_evict_for_state(arc_state_t *state, uint64_t bytes)
 
 /*
  * This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
 */
 static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 {
-    int blksz, mult;
+    arc_state_t *evict_state = NULL;
+    int blksz;
 
     ASSERT(MUTEX_HELD(hash_lock));
 
@@ -1201,27 +1440,16 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
         * to the MRU state.
        */
 
-        arc_try_grow(blksz);
-        if (arc_evict_needed()) {
-            arc_evict_for_state(arc.mru_top, blksz);
-        }
+        arc_adapt(blksz, arc.anon);
+        if (arc_evict_needed())
+            evict_state = arc.mru;
 
        ASSERT(buf->b_arc_access == 0);
        buf->b_arc_access = lbolt;
-        DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
-            buf);
-        arc_change_state(arc.mru_top, buf, hash_lock);
-
-        /*
-         * If we are using less than 2/3 of our total target
-         * cache size, bump up the target size for the MRU
-         * list.
-         */
-        if (arc.size < arc.c*2/3) {
-            arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
-        }
-
-    } else if (buf->b_state == arc.mru_top) {
+        DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+        arc_change_state(arc.mru, buf, hash_lock);
+
+    } else if (buf->b_state == arc.mru) {
        /*
         * If this buffer is in the MRU-top state and has the prefetch
         * flag, the first read was actually part of a prefetch.  In
@@ -1230,7 +1458,8 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
        */
        if ((buf->b_flags & ARC_PREFETCH) != 0) {
            buf->b_flags &= ~ARC_PREFETCH;
-            atomic_add_64(&arc.mru_top->hits, 1);
+            atomic_add_64(&arc.mru->hits, 1);
+            mutex_exit(hash_lock);
            return;
        }
 
@@ -1246,12 +1475,11 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
             * most frequently used state.
            */
            buf->b_arc_access = lbolt;
-            DTRACE_PROBE1(new_state__mfu_top,
-                arc_buf_hdr_t *, buf);
-            arc_change_state(arc.mfu_top, buf, hash_lock);
+            DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+            arc_change_state(arc.mfu, buf, hash_lock);
        }
-        atomic_add_64(&arc.mru_top->hits, 1);
-    } else if (buf->b_state == arc.mru_bot) {
+        atomic_add_64(&arc.mru->hits, 1);
+    } else if (buf->b_state == arc.mru_ghost) {
        arc_state_t *new_state;
        /*
         * This buffer has been "accessed" recently, but
@@ -1260,30 +1488,23 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
        */
 
        if (buf->b_flags & ARC_PREFETCH) {
-            new_state = arc.mru_top;
-            DTRACE_PROBE1(new_state__mru_top,
-                arc_buf_hdr_t *, buf);
+            new_state = arc.mru;
+            buf->b_flags &= ~ARC_PREFETCH;
+            DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
        } else {
-            new_state = arc.mfu_top;
-            DTRACE_PROBE1(new_state__mfu_top,
-                arc_buf_hdr_t *, buf);
-        }
-
-        arc_try_grow(blksz);
-        if (arc_evict_needed()) {
-            arc_evict_for_state(new_state, blksz);
+            new_state = arc.mfu;
+            DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
        }
 
-        /* Bump up the target size of the MRU list */
-        mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
-            1 : (arc.mfu_bot->size/arc.mru_bot->size));
-        arc.p = MIN(arc.c, arc.p + blksz * mult);
+        arc_adapt(blksz, arc.mru_ghost);
+        if (arc_evict_needed())
+            evict_state = new_state;
 
        buf->b_arc_access = lbolt;
        arc_change_state(new_state, buf, hash_lock);
 
-        atomic_add_64(&arc.mru_bot->hits, 1);
-    } else if (buf->b_state == arc.mfu_top) {
+        atomic_add_64(&arc.mru_ghost->hits, 1);
+    } else if (buf->b_state == arc.mfu) {
        /*
         * This buffer has been accessed more than once and is
         * still in the cache.  Keep it in the MFU state.
@@ -1293,34 +1514,30 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
         * so even if it was a prefetch, it will be put back at
         * the head of the list when we remove_reference().
        */
-        atomic_add_64(&arc.mfu_top->hits, 1);
-    } else if (buf->b_state == arc.mfu_bot) {
+        atomic_add_64(&arc.mfu->hits, 1);
+    } else if (buf->b_state == arc.mfu_ghost) {
        /*
         * This buffer has been accessed more than once but has
         * been evicted from the cache.  Move it back to the
         * MFU state.
        */
 
-        arc_try_grow(blksz);
-        if (arc_evict_needed()) {
-            arc_evict_for_state(arc.mfu_top, blksz);
-        }
-
-        /* Bump up the target size for the MFU list */
-        mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
-            1 : (arc.mru_bot->size/arc.mfu_bot->size));
-        arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
+        arc_adapt(blksz, arc.mfu_ghost);
+        if (arc_evict_needed())
+            evict_state = arc.mfu;
 
        buf->b_arc_access = lbolt;
-        DTRACE_PROBE1(new_state__mfu_top,
-            arc_buf_hdr_t *, buf);
-        arc_change_state(arc.mfu_top, buf, hash_lock);
+        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+        arc_change_state(arc.mfu, buf, hash_lock);
 
-        atomic_add_64(&arc.mfu_bot->hits, 1);
+        atomic_add_64(&arc.mfu_ghost->hits, 1);
     } else {
        ASSERT(!"invalid arc state");
     }
+
+    mutex_exit(hash_lock);
+    if (evict_state)
+        arc_evict_for_state(evict_state, blksz);
 }
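Taken together, arc_access_and_exit() implements a small state machine over the five buffer states. A hypothetical, much-simplified model of those transitions follows; the ARC_MINTIME value and all names are illustrative, and details like prefetch handling are reduced to a comment:

```c
/*
 * Toy model of the access-time transitions above: anonymous buffers
 * enter MRU; a second hit more than ARC_MINTIME after the first
 * promotes MRU to MFU; ghost hits re-enter the resident lists.
 */
#include <stdio.h>

typedef enum { ANON, MRU, MRU_GHOST, MFU, MFU_GHOST } state_t;

#define ARC_MINTIME 62  /* ticks; stands in for (hz>>4), ~62ms */

static state_t
on_access(state_t s, long now, long *last_access)
{
    switch (s) {
    case ANON:                  /* first insert: join MRU */
        *last_access = now;
        return (MRU);
    case MRU:                   /* re-reference: maybe promote */
        if (now > *last_access + ARC_MINTIME) {
            *last_access = now;
            return (MFU);
        }
        return (MRU);
    case MRU_GHOST:             /* ghost hit: back into the cache */
        *last_access = now;
        return (MFU);           /* (MRU again if it was a prefetch) */
    case MFU:
        return (MFU);           /* stays frequently-used */
    case MFU_GHOST:
        *last_access = now;
        return (MFU);
    }
    return (s);
}

int
main(void)
{
    long last = 0;
    state_t s = ANON;

    s = on_access(s, 0, &last);     /* ANON -> MRU */
    s = on_access(s, 100, &last);   /* MRU -> MFU (past ARC_MINTIME) */
    printf("final state: %d\n", (int)s);
    return (0);
}
```

The one structural change the patch makes to this machine is in the locking: eviction for the chosen state now happens after the hash lock is dropped, which is why the function gained its `_and_exit` suffix.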
 
 /* a generic arc_done_func_t which you can use */
@@ -1329,7 +1546,7 @@ void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
     bcopy(buf->b_data, arg, buf->b_hdr->b_size);
-    arc_buf_free(buf, arg);
+    VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
 /* a generic arc_done_func_t which you can use */
@@ -1338,7 +1555,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
     arc_buf_t **bufp = arg;
     if (zio && zio->io_error) {
-        arc_buf_free(buf, arg);
+        VERIFY(arc_buf_remove_ref(buf, arg) == 1);
        *bufp = NULL;
     } else {
        *bufp = buf;
@@ -1387,13 +1604,13 @@ arc_read_done(zio_t *zio)
     if (acb->acb_done) {
        if (abuf == NULL) {
            abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
-            abuf->b_data = zio_buf_alloc(hdr->b_size);
-            atomic_add_64(&arc.size, hdr->b_size);
-            bcopy(buf->b_data, abuf->b_data, hdr->b_size);
+            abuf->b_data = arc_data_copy(hdr, buf->b_data);
            abuf->b_hdr = hdr;
+            abuf->b_efunc = NULL;
+            abuf->b_private = NULL;
            abuf->b_next = hdr->b_buf;
            hdr->b_buf = abuf;
-            atomic_add_64(&hdr->b_state->size, hdr->b_size);
+            hdr->b_datacnt += 1;
        }
        acb->acb_buf = abuf;
        abuf = NULL;
@@ -1414,6 +1631,9 @@ arc_read_done(zio_t *zio)
     }
     hdr->b_acb = NULL;
     hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+    ASSERT(!HDR_BUF_AVAILABLE(hdr));
+    if (abuf == buf)
+        hdr->b_flags |= ARC_BUF_AVAILABLE;
 
     ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
 
@@ -1421,9 +1641,21 @@ arc_read_done(zio_t *zio)
        hdr->b_flags |= ARC_IO_ERROR;
        if (hdr->b_state != arc.anon)
            arc_change_state(arc.anon, hdr, hash_lock);
+        if (HDR_IN_HASH_TABLE(hdr))
+            buf_hash_remove(hdr);
        freeable = refcount_is_zero(&hdr->b_refcnt);
+        /* translate checksum errors into IO errors */
+        if (zio->io_error == ECKSUM)
+            zio->io_error = EIO;
     }
 
+    /*
+     * Broadcast before we drop the hash_lock.  This is less efficient,
+     * but avoids the possibility that the hdr (and hence the cv) might
+     * be freed before we get to the cv_broadcast().
+     */
+    cv_broadcast(&hdr->b_cv);
+
     if (!HDR_FREED_IN_READ(hdr)) {
        /*
         * Only call arc_access on anonymous buffers.  This is because
@@ -1432,8 +1664,9 @@ arc_read_done(zio_t *zio)
         * getting confused).
        */
        if (zio->io_error == 0 && hdr->b_state == arc.anon)
-            arc_access(hdr, hash_lock);
-        mutex_exit(hash_lock);
+            arc_access_and_exit(hdr, hash_lock);
+        else
+            mutex_exit(hash_lock);
     } else {
        /*
         * This block was freed while we waited for the read to
@@ -1445,8 +1678,6 @@ arc_read_done(zio_t *zio)
        freeable = refcount_is_zero(&hdr->b_refcnt);
     }
 
-    cv_broadcast(&hdr->b_cv);
-
     /* execute each callback and free its structure */
     while ((acb = callback_list) != NULL) {
        if (acb->acb_done)
@@ -1462,7 +1693,7 @@ arc_read_done(zio_t *zio)
     }
 
     if (freeable)
-        arc_hdr_free(hdr);
+        arc_hdr_destroy(hdr);
 }
 
 /*
@@ -1486,7 +1717,7 @@ arc_read_done(zio_t *zio)
 int
 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
     arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t arc_flags)
+    uint32_t arc_flags, zbookmark_t *zb)
 {
     arc_buf_hdr_t *hdr;
     arc_buf_t *buf;
@@ -1495,15 +1726,9 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
 
 top:
     hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
-    if (hdr && hdr->b_buf) {
-
-        ASSERT((hdr->b_state == arc.mru_top) ||
-            (hdr->b_state == arc.mfu_top) ||
-            ((hdr->b_state == arc.anon) &&
-            (HDR_IO_IN_PROGRESS(hdr))));
+    if (hdr && hdr->b_datacnt > 0) {
 
        if (HDR_IO_IN_PROGRESS(hdr)) {
-
            if ((arc_flags & ARC_NOWAIT) && done) {
                arc_callback_t *acb = NULL;
 
@@ -1527,35 +1752,39 @@ top:
                mutex_exit(hash_lock);
                goto top;
            }
-
            mutex_exit(hash_lock);
            return (0);
        }
 
-        /*
-         * If there is already a reference on this block, create
-         * a new copy of the data so that we will be guaranteed
-         * that arc_release() will always succeed.
-         */
+        ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
 
-        if (done)
-            add_reference(hdr, hash_lock, private);
-        if (done && refcount_count(&hdr->b_refcnt) > 1) {
-            buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
-            buf->b_data = zio_buf_alloc(hdr->b_size);
-            ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
-            atomic_add_64(&arc.size, hdr->b_size);
-            bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
-            buf->b_hdr = hdr;
-            buf->b_next = hdr->b_buf;
-            hdr->b_buf = buf;
-            atomic_add_64(&hdr->b_state->size, hdr->b_size);
-        } else {
+        if (done) {
+            /*
+             * If this block is already in use, create a new
+             * copy of the data so that we will be guaranteed
+             * that arc_release() will always succeed.
             */
            buf = hdr->b_buf;
+            ASSERT(buf);
+            ASSERT(buf->b_data);
+            if (!HDR_BUF_AVAILABLE(hdr)) {
+                void *data = arc_data_copy(hdr, buf->b_data);
+                buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+                buf->b_hdr = hdr;
+                buf->b_data = data;
+                buf->b_efunc = NULL;
+                buf->b_private = NULL;
+                buf->b_next = hdr->b_buf;
+                hdr->b_buf = buf;
+                hdr->b_datacnt += 1;
+            } else {
+                ASSERT(buf->b_efunc == NULL);
+                hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+            }
+            add_reference(hdr, hash_lock, private);
        }
        DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
-        arc_access(hdr, hash_lock);
-        mutex_exit(hash_lock);
+        arc_access_and_exit(hdr, hash_lock);
        atomic_add_64(&arc.hits, 1);
        if (done)
            done(NULL, buf, private);
@@ -1579,24 +1808,28 @@ top:
                bzero(&hdr->b_dva, sizeof (dva_t));
                hdr->b_birth = 0;
                hdr->b_cksum0 = 0;
-                arc_buf_free(buf, private);
+                (void) arc_buf_remove_ref(buf, private);
                goto top; /* restart the IO request */
            }
        } else {
            /* this block is in the ghost cache */
-            ASSERT((hdr->b_state == arc.mru_bot) ||
-                (hdr->b_state == arc.mfu_bot));
+            ASSERT(GHOST_STATE(hdr->b_state));
+            ASSERT(!HDR_IO_IN_PROGRESS(hdr));
            add_reference(hdr, hash_lock, private);
+            ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+            ASSERT(hdr->b_buf == NULL);
 
            buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
-            buf->b_data = zio_buf_alloc(hdr->b_size);
-            atomic_add_64(&arc.size, hdr->b_size);
-            ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-            ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
            buf->b_hdr = hdr;
+            buf->b_efunc = NULL;
+            buf->b_private = NULL;
            buf->b_next = NULL;
            hdr->b_buf = buf;
+            buf->b_data = zio_buf_alloc(hdr->b_size);
+            atomic_add_64(&arc.size, hdr->b_size);
+            ASSERT(hdr->b_datacnt == 0);
+            hdr->b_datacnt = 1;
        }
 
        acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -1623,18 +1856,17 @@ top:
         * buffer ought to notice that it's legit but has a pending I/O.
        */
 
-        if ((hdr->b_state == arc.mru_bot) ||
-            (hdr->b_state == arc.mfu_bot))
-            arc_access(hdr, hash_lock);
-
-        mutex_exit(hash_lock);
+        if (GHOST_STATE(hdr->b_state))
+            arc_access_and_exit(hdr, hash_lock);
+        else
+            mutex_exit(hash_lock);
 
        ASSERT3U(hdr->b_size, ==, size);
-        DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
-            uint64_t, size);
+        DTRACE_PROBE2(arc__miss, blkptr_t *, bp, uint64_t, size);
        atomic_add_64(&arc.misses, 1);
+
        rzio = zio_read(pio, spa, bp, buf->b_data, size,
-            arc_read_done, buf, priority, flags);
+            arc_read_done, buf, priority, flags, zb);
 
        if (arc_flags & ARC_WAIT)
            return (zio_wait(rzio));
@@ -1660,10 +1892,18 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
 
     hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
 
-    if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
-        bcopy(hdr->b_buf->b_data, data, hdr->b_size);
-    else
+    if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
+        arc_buf_t *buf = hdr->b_buf;
+
+        ASSERT(buf);
+        while (buf->b_data == NULL) {
+            buf = buf->b_next;
+            ASSERT(buf);
+        }
+        bcopy(buf->b_data, data, hdr->b_size);
+    } else {
        rc = ENOENT;
+    }
 
     if (hash_mtx)
        mutex_exit(hash_mtx);
@@ -1671,6 +1911,104 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
     return (rc);
 }
 
+void
+arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
+{
+    ASSERT(buf->b_hdr != NULL);
+    ASSERT(buf->b_hdr->b_state != arc.anon);
+    ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+    buf->b_efunc = func;
+    buf->b_private = private;
+}
+
+/*
+ * This is used by the DMU to let the ARC know that a buffer is
+ * being evicted, so the ARC should clean up.  If this arc buf
+ * is not yet in the evicted state, it will be put there.
+ */
+int
+arc_buf_evict(arc_buf_t *buf)
+{
+    arc_buf_hdr_t *hdr;
+    kmutex_t *hash_lock;
+    arc_buf_t **bufp;
+
+    mutex_enter(&arc_eviction_mtx);
+    hdr = buf->b_hdr;
+    if (hdr == NULL) {
+        /*
+         * We are in arc_do_user_evicts().
+         * NOTE: We can't be in arc_buf_add_ref() because
+         * that would violate the interface rules.
         */
+        ASSERT(buf->b_data == NULL);
+        mutex_exit(&arc_eviction_mtx);
+        return (0);
+    } else if (buf->b_data == NULL) {
+        /*
+         * We are on the eviction list, pull us off.
         */
+        bufp = &arc_eviction_list;
+        while (*bufp != buf)
+            bufp = &(*bufp)->b_next;
+        *bufp = buf->b_next;
+        mutex_exit(&arc_eviction_mtx);
+        goto out;
+    } else {
+        /*
+         * Prevent a race with arc_evict()
         */
+        ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+        buf->b_hdr = NULL;
+    }
+    mutex_exit(&arc_eviction_mtx);
+
+    hash_lock = HDR_LOCK(hdr);
+    mutex_enter(hash_lock);
+
+    ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
+
+    /*
+     * Pull this buffer off of the hdr
     */
+    bufp = &hdr->b_buf;
+    while (*bufp != buf)
+        bufp = &(*bufp)->b_next;
+    *bufp = buf->b_next;
+
+    ASSERT(buf->b_data != NULL);
+    buf->b_hdr = hdr;
+    arc_buf_destroy(buf, FALSE);
+
+    if (hdr->b_datacnt == 0) {
+        arc_state_t *old_state = hdr->b_state;
+        arc_state_t *evicted_state;
+
+        ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+        evicted_state =
+            (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
+
+        mutex_enter(&old_state->mtx);
+        mutex_enter(&evicted_state->mtx);
+
+        arc_change_state(evicted_state, hdr, hash_lock);
+        ASSERT(HDR_IN_HASH_TABLE(hdr));
+        hdr->b_flags = ARC_IN_HASH_TABLE;
+
+        mutex_exit(&evicted_state->mtx);
+        mutex_exit(&old_state->mtx);
+    }
+    mutex_exit(hash_lock);
+out:
+    VERIFY(buf->b_efunc(buf) == 0);
+    buf->b_efunc = NULL;
+    buf->b_private = NULL;
+    buf->b_hdr = NULL;
+    kmem_cache_free(buf_cache, buf);
+    return (1);
+}
+
 /*
  * Release this buffer from the cache.  This must be done
  * after a read and prior to modifying the buffer contents.
@@ -1690,30 +2028,40 @@ arc_release(arc_buf_t *buf, void *tag)
        /* this buffer is already released */
        ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
        ASSERT(BUF_EMPTY(hdr));
+        ASSERT(buf->b_efunc == NULL);
        return;
     }
 
     mutex_enter(hash_lock);
 
-    if (refcount_count(&hdr->b_refcnt) > 1) {
+    /*
+     * Do we have more than one buf?
     */
+    if (hdr->b_buf != buf || buf->b_next != NULL) {
        arc_buf_hdr_t *nhdr;
        arc_buf_t **bufp;
        uint64_t blksz = hdr->b_size;
        spa_t *spa = hdr->b_spa;
 
+        ASSERT(hdr->b_datacnt > 1);
        /*
         * Pull the data off of this buf and attach it to
         * a new anonymous buf.
        */
+        (void) remove_reference(hdr, hash_lock, tag);
        bufp = &hdr->b_buf;
-        while (*bufp != buf) {
-            ASSERT(*bufp);
+        while (*bufp != buf)
            bufp = &(*bufp)->b_next;
-        }
        *bufp = (*bufp)->b_next;
-        (void) refcount_remove(&hdr->b_refcnt, tag);
+
        ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
        atomic_add_64(&hdr->b_state->size, -hdr->b_size);
+        if (refcount_is_zero(&hdr->b_refcnt)) {
+            ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
+            atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
+        }
+        hdr->b_datacnt -= 1;
+
        mutex_exit(hash_lock);
 
        nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
@@ -1723,6 +2071,7 @@ arc_release(arc_buf_t *buf, void *tag)
        nhdr->b_state = arc.anon;
        nhdr->b_arc_access = 0;
        nhdr->b_flags = 0;
+        nhdr->b_datacnt = 1;
        buf->b_hdr = nhdr;
        buf->b_next = NULL;
        (void) refcount_add(&nhdr->b_refcnt, tag);
@@ -1730,6 +2079,7 @@ arc_release(arc_buf_t *buf, void *tag)
 
        hdr = nhdr;
     } else {
+        ASSERT(refcount_count(&hdr->b_refcnt) == 1);
        ASSERT(!list_link_active(&hdr->b_arc_node));
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
        arc_change_state(arc.anon, hdr, hash_lock);
@@ -1739,14 +2089,30 @@ arc_release(arc_buf_t *buf, void *tag)
        hdr->b_birth = 0;
        hdr->b_cksum0 = 0;
     }
+    buf->b_efunc = NULL;
+    buf->b_private = NULL;
 }
 
 int
 arc_released(arc_buf_t *buf)
 {
-    return (buf->b_hdr->b_state == arc.anon);
+    return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
+}
+
+int
+arc_has_callback(arc_buf_t *buf)
+{
+    return (buf->b_efunc != NULL);
 }
 
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+    return (refcount_count(&buf->b_hdr->b_refcnt));
+}
+#endif
+
 static void
 arc_write_done(zio_t *zio)
 {
@@ -1758,6 +2124,7 @@ arc_write_done(zio_t *zio)
     hdr = buf->b_hdr;
     acb = hdr->b_acb;
     hdr->b_acb = NULL;
+    ASSERT(acb != NULL);
 
     /* this buffer is on no lists and is not in the hash table */
     ASSERT3P(hdr->b_state, ==, arc.anon);
@@ -1765,9 +2132,12 @@ arc_write_done(zio_t *zio)
     hdr->b_dva = *BP_IDENTITY(zio->io_bp);
     hdr->b_birth = zio->io_bp->blk_birth;
     hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
-    /* clear the "in-write" flag */
-    hdr->b_hash_next = NULL;
-    /* This write may be all-zero */
+    /*
+     * If the block to be written was all-zero, we may have
+     * compressed it away.  In this case no write was performed
+     * so there will be no dva/birth-date/checksum.  The buffer
+     * must therefor remain anonymous (and uncached).
     */
     if (!BUF_EMPTY(hdr)) {
        arc_buf_hdr_t *exists;
        kmutex_t *hash_lock;
@@ -1787,27 +2157,41 @@ arc_write_done(zio_t *zio)
            ASSERT(refcount_is_zero(&exists->b_refcnt));
            arc_change_state(arc.anon, exists, hash_lock);
            mutex_exit(hash_lock);
-            arc_hdr_free(exists);
+            arc_hdr_destroy(exists);
            exists = buf_hash_insert(hdr, &hash_lock);
            ASSERT3P(exists, ==, NULL);
        }
-        arc_access(hdr, hash_lock);
-        mutex_exit(hash_lock);
+        hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+        arc_access_and_exit(hdr, hash_lock);
+    } else if (acb->acb_done == NULL) {
+        int destroy_hdr;
+        /*
+         * This is an anonymous buffer with no user callback,
+         * destroy it if there are no active references.
         */
+        mutex_enter(&arc_eviction_mtx);
+        destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
+        hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+        mutex_exit(&arc_eviction_mtx);
+        if (destroy_hdr)
+            arc_hdr_destroy(hdr);
+    } else {
+        hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
     }
-    if (acb && acb->acb_done) {
+
+    if (acb->acb_done) {
        ASSERT(!refcount_is_zero(&hdr->b_refcnt));
        acb->acb_done(zio, buf, acb->acb_private);
     }
 
-    if (acb)
-        kmem_free(acb, sizeof (arc_callback_t));
+    kmem_free(acb, sizeof (arc_callback_t));
 }
 
 int
 arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t arc_flags)
+    uint32_t arc_flags, zbookmark_t *zb)
 {
     arc_buf_hdr_t *hdr = buf->b_hdr;
     arc_callback_t *acb;
@@ -1822,8 +2206,9 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
     acb->acb_private = private;
     acb->acb_byteswap = (arc_byteswap_func_t *)-1;
     hdr->b_acb = acb;
+    hdr->b_flags |= ARC_IO_IN_PROGRESS;
     rzio = zio_write(pio, spa, checksum, compress, txg, bp,
-        buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
+        buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
 
     if (arc_flags & ARC_WAIT)
        return (zio_wait(rzio));
@@ -1858,16 +2243,21 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
        arc_change_state(arc.anon, ab, hash_lock);
        if (refcount_is_zero(&ab->b_refcnt)) {
            mutex_exit(hash_lock);
-            arc_hdr_free(ab);
+            arc_hdr_destroy(ab);
            atomic_add_64(&arc.deleted, 1);
        } else {
            ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
+            ASSERT3U(ab->b_datacnt, ==, 1);
            if (HDR_IO_IN_PROGRESS(ab))
                ab->b_flags |= ARC_FREED_IN_READ;
+            if (HDR_IN_HASH_TABLE(ab))
+                buf_hash_remove(ab);
            ab->b_arc_access = 0;
            bzero(&ab->b_dva, sizeof (dva_t));
            ab->b_birth = 0;
            ab->b_cksum0 = 0;
+            ab->b_buf->b_efunc = NULL;
+            ab->b_buf->b_private = NULL;
            mutex_exit(hash_lock);
        }
     }
@@ -1967,23 +2357,26 @@ arc_init(void)
     arc.c = arc.c_min;
 
     arc.anon = &ARC_anon;
-    arc.mru_top = &ARC_mru_top;
-    arc.mru_bot = &ARC_mru_bot;
-    arc.mfu_top = &ARC_mfu_top;
-    arc.mfu_bot = &ARC_mfu_bot;
+    arc.mru = &ARC_mru;
+    arc.mru_ghost = &ARC_mru_ghost;
+    arc.mfu = &ARC_mfu;
+    arc.mfu_ghost = &ARC_mfu_ghost;
+    arc.size = 0;
 
-    list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
+    list_create(&arc.mru->list, sizeof (arc_buf_hdr_t),
        offsetof(arc_buf_hdr_t, b_arc_node));
-    list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
+    list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t),
        offsetof(arc_buf_hdr_t, b_arc_node));
-    list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
+    list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t),
        offsetof(arc_buf_hdr_t, b_arc_node));
-    list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
+    list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t),
        offsetof(arc_buf_hdr_t, b_arc_node));
 
     buf_init();
 
     arc_thread_exit = 0;
+    arc_eviction_list = NULL;
+    mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
 
     (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
        TS_RUN, minclsyspri);
@@ -2002,14 +2395,15 @@ arc_fini(void)
 
     arc_dead = TRUE;
 
+    mutex_destroy(&arc_eviction_mtx);
     mutex_destroy(&arc_reclaim_lock);
     mutex_destroy(&arc_reclaim_thr_lock);
     cv_destroy(&arc_reclaim_thr_cv);
 
-    list_destroy(&arc.mru_top->list);
-    list_destroy(&arc.mru_bot->list);
-    list_destroy(&arc.mfu_top->list);
-    list_destroy(&arc.mfu_bot->list);
+    list_destroy(&arc.mru->list);
+    list_destroy(&arc.mru_ghost->list);
+    list_destroy(&arc.mfu->list);
+    list_destroy(&arc.mfu_ghost->list);
 
     buf_fini();
 }
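The bplist.c diff that follows belongs to the same error-hardening theme as 6333006 ("DMU & DSL should not panic upon I/O error") and 6386910: routines such as bplist_hold() and bplist_open() stop VERIFY()ing success and instead return an errno for the caller to propagate. A generic sketch of that conversion pattern, with invented names (object_hold, object_use):

```c
/*
 * Sketch of the void -> int error-return conversion used throughout
 * this patch.  Not ZFS code; the names are hypothetical.
 */
#include <errno.h>

struct object { int valid; };

/*
 * Before the change, a routine like this would assert or panic on
 * failure.  After it, the failure is reported to the caller.
 */
static int
object_hold(struct object *obj)
{
    if (!obj->valid)
        return (EIO);       /* propagate instead of panicking */
    return (0);
}

static int
object_use(struct object *obj)
{
    int err = object_hold(obj);

    if (err != 0)
        return (err);       /* every caller now handles the error */
    /* ... use the object ... */
    return (0);
}
```

Note the one place below where the old behavior deliberately survives: bplist_empty() still wraps bplist_hold() in a VERIFY, flagged with an "XXX" as a known remaining panic path.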
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
index 68f79ac5a2..db0d3534d6 100644
--- a/usr/src/uts/common/fs/zfs/bplist.c
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,16 +28,18 @@
 #include <sys/bplist.h>
 #include <sys/zfs_context.h>
 
-static void
+static int
 bplist_hold(bplist_t *bpl)
 {
 	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
 	if (bpl->bpl_dbuf == NULL) {
-		bpl->bpl_dbuf = dmu_bonus_hold_tag(bpl->bpl_mos,
-		    bpl->bpl_object, bpl);
-		dmu_buf_read(bpl->bpl_dbuf);
+		int err = dmu_bonus_hold(bpl->bpl_mos,
+		    bpl->bpl_object, bpl, &bpl->bpl_dbuf);
+		if (err)
+			return (err);
 		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
 	}
+	return (0);
 }
 
 uint64_t
@@ -58,12 +59,15 @@ bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
 	VERIFY(dmu_object_free(mos, object, tx) == 0);
 }
 
-void
+int
 bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
 {
 	dmu_object_info_t doi;
+	int err;
 
-	VERIFY(dmu_object_info(mos, object, &doi) == 0);
+	err = dmu_object_info(mos, object, &doi);
+	if (err)
+		return (err);
 
 	mutex_enter(&bpl->bpl_lock);
 
@@ -79,6 +83,7 @@ bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
 	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
 
 	mutex_exit(&bpl->bpl_lock);
+	return (0);
 }
 
 void
@@ -89,11 +94,11 @@ bplist_close(bplist_t *bpl)
 	ASSERT(bpl->bpl_queue == NULL);
 
 	if (bpl->bpl_cached_dbuf) {
-		dmu_buf_rele(bpl->bpl_cached_dbuf);
+		dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
 		bpl->bpl_cached_dbuf = NULL;
 	}
 	if (bpl->bpl_dbuf) {
-		dmu_buf_rele_tag(bpl->bpl_dbuf, bpl);
+		dmu_buf_rele(bpl->bpl_dbuf, bpl);
 		bpl->bpl_dbuf = NULL;
 		bpl->bpl_phys = NULL;
 	}
@@ -110,22 +115,45 @@ bplist_empty(bplist_t *bpl)
 		return (B_TRUE);
 
 	mutex_enter(&bpl->bpl_lock);
-	bplist_hold(bpl);
+	VERIFY(0 == bplist_hold(bpl));	/* XXX */
 	rv = (bpl->bpl_phys->bpl_entries == 0);
 	mutex_exit(&bpl->bpl_lock);
 
 	return (rv);
 }
 
+static int
+bplist_cache(bplist_t *bpl, uint64_t blkid)
+{
+	int err = 0;
+
+	if (bpl->bpl_cached_dbuf == NULL ||
+	    bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
+		if (bpl->bpl_cached_dbuf != NULL)
+			dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+		err = dmu_buf_hold(bpl->bpl_mos,
+		    bpl->bpl_object, blkid << bpl->bpl_blockshift,
+		    bpl, &bpl->bpl_cached_dbuf);
+		ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
+		    1ULL << bpl->bpl_blockshift);
+	}
+	return (err);
+}
+
 int
 bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
 {
 	uint64_t blk, off;
 	blkptr_t *bparray;
-	dmu_buf_t *db;
+	int err;
 
 	mutex_enter(&bpl->bpl_lock);
-	bplist_hold(bpl);
+
+	err = bplist_hold(bpl);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
+	}
 
 	if (*itorp >= bpl->bpl_phys->bpl_entries) {
 		mutex_exit(&bpl->bpl_lock);
@@ -134,51 +162,44 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
 
 	blk = *itorp >> bpl->bpl_bpshift;
 	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-	db = bpl->bpl_cached_dbuf;
-	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
-		if (db != NULL)
-			dmu_buf_rele(db);
-		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
-		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	err = bplist_cache(bpl, blk);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
 	}
-	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
-
-	dmu_buf_read(db);
-	bparray = db->db_data;
+	bparray = bpl->bpl_cached_dbuf->db_data;
 	*bp = bparray[off];
 	(*itorp)++;
 	mutex_exit(&bpl->bpl_lock);
 	return (0);
 }
 
-void
+int
 bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
 {
 	uint64_t blk, off;
 	blkptr_t *bparray;
-	dmu_buf_t *db;
+	int err;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	mutex_enter(&bpl->bpl_lock);
-	bplist_hold(bpl);
+	err = bplist_hold(bpl);
+	if (err)
+		return (err);
 
 	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
 	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
 
-	db = bpl->bpl_cached_dbuf;
-	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
-		if (db != NULL)
-			dmu_buf_rele(db);
-		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
-		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	err = bplist_cache(bpl, blk);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
 	}
-	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
-
-	dmu_buf_will_dirty(db, tx);
-	bparray = db->db_data;
+	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
+	bparray = bpl->bpl_cached_dbuf->db_data;
 	bparray[off] = *bp;
 
 	/* We never need the fill count. */
@@ -191,6 +212,8 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
 	bpl->bpl_phys->bpl_entries++;
 	bpl->bpl_phys->bpl_bytes += BP_GET_ASIZE(bp);
 	mutex_exit(&bpl->bpl_lock);
+
+	return (0);
 }
 
 /*
@@ -218,7 +241,7 @@ bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
 	while ((bpq = bpl->bpl_queue) != NULL) {
 		bpl->bpl_queue = bpq->bpq_next;
 		mutex_exit(&bpl->bpl_lock);
-		bplist_enqueue(bpl, &bpq->bpq_blk, tx);
+		VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
 		kmem_free(bpq, sizeof (*bpq));
 		mutex_enter(&bpl->bpl_lock);
 	}
@@ -230,9 +253,10 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
 {
 	mutex_enter(&bpl->bpl_lock);
 	ASSERT3P(bpl->bpl_queue, ==, NULL);
-	bplist_hold(bpl);
+	VERIFY(0 == bplist_hold(bpl));
 	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-	dmu_free_range(bpl->bpl_mos, bpl->bpl_object, 0, -1ULL, tx);
+	VERIFY(0 == dmu_free_range(bpl->bpl_mos,
+	    bpl->bpl_object, 0, -1ULL, tx));
 	bpl->bpl_phys->bpl_entries = 0;
 	bpl->bpl_phys->bpl_bytes = 0;
 	mutex_exit(&bpl->bpl_lock);
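
With bplist_hold(), bplist_open(), bplist_iterate() and bplist_enqueue() now returning error codes instead of VERIFYing (part of hardening the SPA against I/O errors, per 6386910), callers are expected to propagate failures rather than panic. A minimal caller-side sketch follows; walk_bplist is an illustrative name, and it assumes the exhausted-iterator case returns ENOENT, which is not shown in the hunks above.

	/* Illustrative only: drain a bplist, propagating I/O errors. */
	static int
	walk_bplist(bplist_t *bpl)
	{
		uint64_t itor = 0;
		blkptr_t bp;
		int err;

		while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
			/* ... process &bp here ... */
		}
		/* assumed: ENOENT simply marks the end of the list */
		return (err == ENOENT ? 0 : err);
	}
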
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 6f93e86078..13f4fdb202 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -118,7 +118,7 @@ dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 			mutex_enter(&db->db_mtx);
-			if (!refcount_is_zero(&db->db_holds)) {
+			if (db->db_state != DB_EVICTING) {
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (db);
 			}
@@ -151,7 +151,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
 			mutex_enter(&dbf->db_mtx);
-			if (!refcount_is_zero(&dbf->db_holds)) {
+			if (dbf->db_state != DB_EVICTING) {
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (dbf);
 			}
@@ -186,7 +186,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(refcount_is_zero(&db->db_holds));
-	ASSERT(db->db_dnode != NULL);
+	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
@@ -201,20 +201,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 	atomic_add_64(&dbuf_hash_count, -1);
 }
 
-static int dbuf_evictable(dmu_buf_impl_t *db);
-static void dbuf_clear(dmu_buf_impl_t *db);
-
-void
-dbuf_evict(dmu_buf_impl_t *db)
-{
-	int err;
-
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	err = dbuf_evictable(db);
-	ASSERT(err == TRUE);
-	dbuf_clear(db);
-	dbuf_destroy(db);
-}
+static arc_evict_func_t dbuf_do_evict;
 
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
@@ -233,23 +220,47 @@ dbuf_evict_user(dmu_buf_impl_t *db)
 }
 
 void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+	int i;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db_buf == NULL);
+
+#ifdef ZFS_DEBUG
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT(!list_link_active(&db->db_dirty_node[i]));
+		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+	}
+#endif
+	dbuf_clear(db);
+	dbuf_destroy(db);
+}
+
+void
 dbuf_init(void)
 {
-	uint64_t hsize = 1;
+	uint64_t hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	int i;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
-	 * with an average 64k block size.  The table will take up
-	 * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte
-	 * pointers).
+	 * with an average 4K block size.  The table will take up
+	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
 	 */
-	while (hsize * 65536 < physmem * PAGESIZE)
+	while (hsize * 4096 < physmem * PAGESIZE)
 		hsize <<= 1;
 
+retry:
 	h->hash_table_mask = hsize - 1;
-	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+	if (h->hash_table == NULL) {
+		/* XXX - we should really return an error instead of assert */
+		ASSERT(hsize > (1ULL << 10));
+		hsize >>= 1;
+		goto retry;
+	}
 
 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
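
The sizing comment in dbuf_init() above works out as follows. This standalone arithmetic sketch (not part of the patch; the 4GB figure is a hypothetical example) reproduces the loop to show where "2MB/GB with 8-byte pointers" comes from:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t physmem_bytes = 4ULL << 30;	/* hypothetical 4GB */
		uint64_t hsize = 1ULL << 16;

		/* same loop as dbuf_init(): one slot per average 4K block */
		while (hsize * 4096 < physmem_bytes)
			hsize <<= 1;

		/* 2^20 slots * 8-byte pointers = 8MB, i.e. 2MB per GB */
		(void) printf("%llu slots, %llu bytes\n",
		    (unsigned long long)hsize,
		    (unsigned long long)(hsize * sizeof (void *)));
		return (0);
	}
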
@@ -299,8 +310,9 @@ dbuf_verify(dmu_buf_impl_t *db)
 	} else {
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
-		ASSERT(list_head(&dn->dn_dbufs));
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
+		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+		    list_head(&dn->dn_dbufs));
 	}
 	if (db->db_blkid == DB_BONUS_BLKID) {
 		ASSERT(dn != NULL);
@@ -311,19 +323,11 @@ dbuf_verify(dmu_buf_impl_t *db)
 	}
 	if (db->db_level == 0) {
-		void **udpp = db->db_d.db_user_data_ptr_ptr;
 		/* we can be momentarily larger in dnode_set_blksz() */
 		if (db->db_blkid != DB_BONUS_BLKID && dn) {
 			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
 		}
-		if (udpp) {
-			ASSERT((refcount_is_zero(&db->db_holds) &&
-			    *udpp == NULL) ||
-			    (!refcount_is_zero(&db->db_holds) &&
-			    *udpp == db->db.db_data));
-		}
-
-		if (IS_DNODE_DNODE(db->db.db_object)) {
+		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 			for (i = 0; i < TXG_SIZE; i++) {
 				/*
 				 * it should only be modified in syncing
@@ -341,7 +345,7 @@ dbuf_verify(dmu_buf_impl_t *db)
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
-			if (IS_DNODE_DNODE(db->db.db_object))
+			if (db->db.db_object == DMU_META_DNODE_OBJECT)
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
@@ -399,10 +403,19 @@ static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(buf->b_data != NULL);
+	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
 	db->db_buf = buf;
-	db->db.db_data = buf->b_data;
-	dbuf_update_data(db);
+	if (buf != NULL) {
+		ASSERT(buf->b_data != NULL);
+		db->db.db_data = buf->b_data;
+		if (!arc_released(buf))
+			arc_set_callback(buf, dbuf_do_evict, db);
+		dbuf_update_data(db);
+	} else {
+		dbuf_evict_user(db);
+		db->db.db_data = NULL;
+		db->db_state = DB_UNCACHED;
+	}
 }
 
 uint64_t
@@ -427,6 +440,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	 * All reads are synchronous, so we must have a hold on the dbuf
 	 */
 	ASSERT(refcount_count(&db->db_holds) > 0);
+	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
 		/* we were freed in flight; disregard any error */
@@ -440,60 +454,36 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		db->db_state = DB_CACHED;
 	} else {
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
-		arc_buf_free(buf, db);
-		db->db_state = DB_UNCACHED;
 		ASSERT3P(db->db_buf, ==, NULL);
+		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
+	dbuf_rele(db, NULL);
 }
 
-void
+static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
-	arc_buf_t *buf;
 	blkptr_t *bp;
+	zbookmark_t zb;
 
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
-
-	/*
-	 * prefetch only data blocks (level 0) -- don't prefetch indirect
-	 * blocks
-	 */
-	if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
-		flags |= DB_RF_NOPREFETCH;
-	}
-
-	if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
-		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
-		    db->db.db_size);
-	}
-
-	if (db->db_state == DB_CACHED) {
-		ASSERT(db->db.db_data != NULL);
-		return;
-	}
-
-	mutex_enter(&db->db_mtx);
-
-	if (db->db_state != DB_UNCACHED) {
-		mutex_exit(&db->db_mtx);
-		return;
-	}
-
-	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db_state == DB_UNCACHED);
+	ASSERT(db->db_buf == NULL);
 
 	if (db->db_blkid == DB_BONUS_BLKID) {
 		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
-		buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    DN_MAX_BONUSLEN, db);
+		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		if (db->db.db_size < DN_MAX_BONUSLEN)
-			bzero(buf->b_data, DN_MAX_BONUSLEN);
-		bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
+			bzero(db->db.db_data, DN_MAX_BONUSLEN);
+		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
 		    db->db.db_size);
-		dbuf_set_data(db, buf);
+		dbuf_update_data(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
@@ -522,20 +512,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	db->db_state = DB_READ;
 	mutex_exit(&db->db_mtx);
 
+	zb.zb_objset = db->db_objset->os_dsl_dataset ?
+	    db->db_objset->os_dsl_dataset->ds_object : 0;
+	zb.zb_object = db->db.db_object;
+	zb.zb_level = db->db_level;
+	zb.zb_blkid = db->db_blkid;
+
+	dbuf_add_ref(db, NULL);
 	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
 	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
 	    db->db_level > 0 ? byteswap_uint64_array :
 	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
-	    ARC_NOWAIT);
+	    ARC_NOWAIT, &zb);
 }
 
-static int
-dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
-	zio_t *zio;
-	int err;
+	int err = 0;
+	int havepzio = (zio != NULL);
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
@@ -545,71 +542,67 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	if (db->db_state == DB_CACHED)
 		return (0);
 
-	if (db->db_state == DB_UNCACHED) {
-		zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
-		    ZIO_FLAG_CANFAIL);
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+
+	mutex_enter(&db->db_mtx);
+	if (db->db_state == DB_CACHED) {
+		mutex_exit(&db->db_mtx);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+			rw_exit(&db->db_dnode->dn_struct_rwlock);
+	} else if (db->db_state == DB_UNCACHED) {
+		if (zio == NULL) {
+			zio = zio_root(db->db_dnode->dn_objset->os_spa,
+			    NULL, NULL, ZIO_FLAG_CANFAIL);
+		}
 		dbuf_read_impl(db, zio, flags);
+		/* dbuf_read_impl has dropped db_mtx for us */
+
+		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+		    (flags & DB_RF_NOPREFETCH) == 0 &&
+		    db->db_dnode != NULL) {
+			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+			    db->db.db_size);
+		}
+
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&db->db_dnode->dn_struct_rwlock);
-		err = zio_wait(zio);
-		if (err)
-			return (err);
-	}
 
-	mutex_enter(&db->db_mtx);
-	while (db->db_state == DB_READ || db->db_state == DB_FILL) {
-		ASSERT(db->db_state == DB_READ ||
-		    (flags & DB_RF_HAVESTRUCT) == 0);
-		cv_wait(&db->db_changed, &db->db_mtx);
+		if (!havepzio)
+			err = zio_wait(zio);
+	} else {
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&db->db_dnode->dn_struct_rwlock);
+		if ((flags & DB_RF_NEVERWAIT) == 0) {
+			while (db->db_state == DB_READ ||
+			    db->db_state == DB_FILL) {
+				ASSERT(db->db_state == DB_READ ||
+				    (flags & DB_RF_HAVESTRUCT) == 0);
+				cv_wait(&db->db_changed, &db->db_mtx);
+			}
+			if (db->db_state == DB_UNCACHED)
+				err = EIO;
+		}
+		mutex_exit(&db->db_mtx);
 	}
-	ASSERT3U(db->db_state, ==, DB_CACHED);
-	mutex_exit(&db->db_mtx);
-
-	return (0);
-}
-
-#pragma weak dmu_buf_read = dbuf_read
-void
-dbuf_read(dmu_buf_impl_t *db)
-{
-	int err;
-
-	err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
-	ASSERT(err == 0);
-}
-
-#pragma weak dmu_buf_read_canfail = dbuf_read_canfail
-int
-dbuf_read_canfail(dmu_buf_impl_t *db)
-{
-	return (dbuf_read_generic(db, DB_RF_CANFAIL));
-}
-
-void
-dbuf_read_havestruct(dmu_buf_impl_t *db)
-{
-	int err;
-
-	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
-	err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
-	ASSERT(err == 0);
+	ASSERT(err || havepzio || db->db_state == DB_CACHED);
+	return (err);
 }
 
 static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
-		int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
-		    DN_MAX_BONUSLEN : db->db.db_size;
+		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db,
 		    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    blksz, db));
+		    db->db.db_size, db));
 		db->db_state = DB_FILL;
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
@@ -634,14 +627,13 @@ static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	arc_buf_t **quiescing, **syncing;
-	int size = (db->db_blkid == DB_BONUS_BLKID) ?
-	    DN_MAX_BONUSLEN : db->db.db_size;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
-	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
-	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+	quiescing = (arc_buf_t **)&db->db_d.db_data_old[(txg-1)&TXG_MASK];
+	syncing = (arc_buf_t **)&db->db_d.db_data_old[(txg-2)&TXG_MASK];
 
 	/*
 	 * If this buffer is referenced from the current quiescing
@@ -656,13 +648,12 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 		 */
 		ASSERT(*syncing != db->db_buf);
 		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+			int size = db->db.db_size;
 			*quiescing = arc_buf_alloc(
 			    db->db_dnode->dn_objset->os_spa, size, db);
 			bcopy(db->db.db_data, (*quiescing)->b_data, size);
 		} else {
-			db->db.db_data = NULL;
-			db->db_buf = NULL;
-			db->db_state = DB_UNCACHED;
+			dbuf_set_data(db, NULL);
 		}
 		return;
 	}
@@ -677,22 +668,49 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 	ASSERT3P(*quiescing, ==, NULL);
 	ASSERT3U(db->db_dirtycnt, ==, 1);
 	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		int size = db->db.db_size;
 		/* we can't copy if we have already started a write */
 		ASSERT(*syncing != db->db_data_pending);
 		*syncing = arc_buf_alloc(
 		    db->db_dnode->dn_objset->os_spa, size, db);
 		bcopy(db->db.db_data, (*syncing)->b_data, size);
 	} else {
-		db->db.db_data = NULL;
-		db->db_buf = NULL;
-		db->db_state = DB_UNCACHED;
+		dbuf_set_data(db, NULL);
 	}
 }
 
+/*
+ * This is the "bonus buffer" version of the above routine
+ */
+static void
+dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+	void **quiescing, **syncing;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db.db_data != NULL);
+	ASSERT(db->db_blkid == DB_BONUS_BLKID);
+
+	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
+	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+
+	if (*quiescing == db->db.db_data) {
+		ASSERT(*syncing != db->db.db_data);
+		*quiescing = zio_buf_alloc(DN_MAX_BONUSLEN);
+		bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN);
+	} else if (*syncing == db->db.db_data) {
+		ASSERT3P(*quiescing, ==, NULL);
+		ASSERT3U(db->db_dirtycnt, ==, 1);
+		*syncing = zio_buf_alloc(DN_MAX_BONUSLEN);
+		bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN);
+	}
+}
+
 void
 dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
 {
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
 		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
@@ -724,7 +742,8 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 	mutex_enter(&dn->dn_dbufs_mtx);
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
-		if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		if (db->db_level != 0)
 			continue;
 		dprintf_dbuf(db, "found buf %s\n", "");
 		if (db->db_blkid < blkid ||
@@ -736,7 +755,8 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 			continue;
 
 		mutex_enter(&db->db_mtx);
-		if (db->db_state == DB_UNCACHED) {
+		if (db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
@@ -753,22 +773,40 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
+		if (refcount_count(&db->db_holds) == 0) {
+			ASSERT(db->db_buf);
+			dbuf_clear(db);
+			continue;
+		}
+		/* The dbuf is CACHED and referenced */
 
-		/* make a copy of the data if necessary */
-		dbuf_fix_old_data(db, txg);
-
-		if (db->db.db_data) {
-			/* fill in with appropriate data */
+		if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) {
+			/*
+			 * This dbuf is not currently dirty.  We will either
+			 * uncache it (if it's not referenced in the open
+			 * context) or reset its contents to empty.
+			 */
+			dbuf_fix_old_data(db, txg);
+		} else if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) {
+			/*
+			 * This dbuf is overridden.  Clear that state.
+			 */
+			dbuf_unoverride(db, txg);
+		}
+		/* fill in with appropriate data */
+		if (db->db_state == DB_CACHED) {
+			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			bzero(db->db.db_data, db->db.db_size);
 		}
+
 		mutex_exit(&db->db_mtx);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 }
 
 static int
-dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dbuf_new_block(dmu_buf_impl_t *db)
 {
 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
 	uint64_t birth_txg = 0;
@@ -790,7 +828,7 @@ dbuf_new_block(dmu_buf_impl_t *db)
 		birth_txg = db->db_blkptr->blk_birth;
 
 	if (birth_txg)
-		return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
+		return (!dsl_dataset_block_freeable(ds, birth_txg));
 	else
 		return (TRUE);
 }
@@ -801,6 +839,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 	arc_buf_t *buf, *obuf;
 	int osize = db->db.db_size;
 
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
 	/* XXX does *this* func really need the lock? */
 	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
 
@@ -814,6 +854,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 	 * be happening.
 	 */
 	/* Make a copy of the data if necessary */
+	/*
+	 * XXX we should be doing a dbuf_read, checking the return
+	 * value and returning that up to our callers
+	 */
 	dbuf_will_dirty(db, tx);
 
 	/* create the data buffer for the new block */
@@ -829,7 +873,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 	mutex_enter(&db->db_mtx);
 	/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
 	dbuf_set_data(db, buf);
-	arc_buf_free(obuf, db);
+	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
 	db->db.db_size = size;
 
 	/* fix up the dirty info */
@@ -861,7 +905,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	 */
 	ASSERT(!(dmu_tx_is_syncing(tx) &&
 	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
-	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    dn->dn_objset->os_dsl_dataset != NULL &&
 	    !dsl_dir_is_private(
 	    dn->dn_objset->os_dsl_dataset->ds_dir)));
@@ -871,7 +915,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	 * check if we're already dirty.  They are allowed to re-dirty
 	 * in syncing context.
 	 */
-	ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED ||
 	    dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ?
 	    DN_DIRTY_SYNC : DN_DIRTY_OPEN));
@@ -940,22 +984,27 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-	if (db->db_level == 0) {
+	/*
+	 * If this buffer is dirty in an old transaction group we need
+	 * to make a copy of it so that the changes we make in this
+	 * transaction group won't leak out when we sync the older txg.
+	 */
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		ASSERT(db->db.db_data != NULL);
+		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
+		dbuf_fix_old_bonus_data(db, tx->tx_txg);
+		db->db_d.db_data_old[txgoff] = db->db.db_data;
+	} else if (db->db_level == 0) {
 		/*
 		 * Release the data buffer from the cache so that we
 		 * can modify it without impacting possible other users
 		 * of this cached data block.  Note that indirect blocks
 		 * and private objects are not released until the syncing
 		 * state (since they are only modified then).
-		 *
-		 * If this buffer is dirty in an old transaction group we need
-		 * to make a copy of it so that the changes we make in this
-		 * transaction group won't leak out when we sync the older txg.
 		 */
 		ASSERT(db->db_buf != NULL);
-		ASSERT(db->db.db_data != NULL);
 		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
-		if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
+		if (db->db.db_object != DMU_META_DNODE_OBJECT) {
 			arc_release(db->db_buf, db);
 			dbuf_fix_old_data(db, tx->tx_txg);
 			ASSERT(db->db_buf != NULL);
@@ -978,12 +1027,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
 	mutex_exit(&dn->dn_mtx);
 
-	/*
-	 * If writting this buffer will consume a new block on disk,
-	 * then update the accounting.
-	 */
 	if (db->db_blkid != DB_BONUS_BLKID) {
-		if (!dbuf_new_block(db, tx) && db->db_blkptr) {
+		/*
+		 * Update the accounting.
+		 */
+		if (!dbuf_new_block(db) && db->db_blkptr) {
 			/*
 			 * This is only a guess -- if the dbuf is dirty
 			 * in a previous txg, we don't know how much
@@ -1028,7 +1076,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 		dbuf_dirty(parent, tx);
-		dbuf_remove_ref(parent, FTAG);
+		dbuf_rele(parent, FTAG);
 	} else {
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
@@ -1042,8 +1090,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn = db->db_dnode;
 	int txgoff = tx->tx_txg & TXG_MASK;
+	int64_t holds;
 
 	ASSERT(tx->tx_txg != 0);
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
 	mutex_enter(&db->db_mtx);
 
@@ -1080,7 +1130,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		ASSERT(db->db_buf != NULL);
 		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
 		if (db->db_d.db_data_old[txgoff] != db->db_buf)
-			arc_buf_free(db->db_d.db_data_old[txgoff], db);
+			VERIFY(arc_buf_remove_ref(
+			    db->db_d.db_data_old[txgoff], db) == 1);
 		db->db_d.db_data_old[txgoff] = NULL;
 	}
 
@@ -1095,15 +1146,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
-	if (refcount_remove(&db->db_holds,
-	    (void *)(uintptr_t)tx->tx_txg) == 0) {
-		/* make duf_verify() happy */
-		if (db->db.db_data)
-			bzero(db->db.db_data, db->db.db_size);
+	if ((holds = refcount_remove(&db->db_holds,
+	    (void *)(uintptr_t)tx->tx_txg)) == 0) {
+		arc_buf_t *buf = db->db_buf;
+
+		ASSERT(arc_released(buf));
+		dbuf_set_data(db, NULL);
+		VERIFY(arc_buf_remove_ref(buf, db) == 1);
 		dbuf_evict(db);
 		return (1);
 	}
+	ASSERT(holds > 0);
 
 	mutex_exit(&db->db_mtx);
 	return (0);
@@ -1120,19 +1173,21 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
 		rf |= DB_RF_HAVESTRUCT;
 
-	(void) dbuf_read_generic(db, rf);
+	(void) dbuf_read(db, NULL, rf);
 	dbuf_dirty(db, tx);
 }
 
-#pragma weak dmu_buf_will_fill = dbuf_will_fill
 void
-dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(db->db_level == 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
-	ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
+	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
 	    dmu_tx_private_ok(tx));
 
 	dbuf_noread(db);
@@ -1149,6 +1204,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
 	if (db->db_state == DB_FILL) {
 		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+			ASSERT(db->db_blkid != DB_BONUS_BLKID);
 			/* we were freed while filling */
 			/* XXX dbuf_undirty? */
 			bzero(db->db.db_data, db->db.db_size);
@@ -1160,47 +1216,62 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	mutex_exit(&db->db_mtx);
 }
 
-
-static void
+/*
+ * "Clear" the contents of this dbuf.  This will mark the dbuf
+ * EVICTING and clear *most* of its references.  Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
+ * in this case.  For callers from the DMU we will usually see:
+ *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ *	DMU: dbuf_clear()->arc_buf_evict()
+ *	ARC: dbuf_do_evict()->dbuf_destroy()
+ */
+void
 dbuf_clear(dmu_buf_impl_t *db)
 {
 	dnode_t *dn = db->db_dnode;
+	dmu_buf_impl_t *parent = db->db_parent;
+	int dbuf_gone = FALSE;
 
-	ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(refcount_is_zero(&db->db_holds));
 
+	dbuf_evict_user(db);
+
 	if (db->db_state == DB_CACHED) {
-		ASSERT(db->db_buf != NULL);
-		arc_buf_free(db->db_buf, db);
+		ASSERT(db->db.db_data != NULL);
+		if (db->db_blkid == DB_BONUS_BLKID)
+			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
 		db->db.db_data = NULL;
-		db->db_buf = NULL;
 		db->db_state = DB_UNCACHED;
 	}
 
 	ASSERT3U(db->db_state, ==, DB_UNCACHED);
-	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db_data_pending == NULL);
 
-	mutex_exit(&db->db_mtx);
+	db->db_state = DB_EVICTING;
+	db->db_blkptr = NULL;
+
+	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+		list_remove(&dn->dn_dbufs, db);
+		dnode_rele(dn, db);
+	}
+
+	if (db->db_buf)
+		dbuf_gone = arc_buf_evict(db->db_buf);
+
+	if (!dbuf_gone)
+		mutex_exit(&db->db_mtx);
 
 	/*
 	 * If this dbuf is referened from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
-	if (db->db_parent && db->db_parent != dn->dn_dbuf)
-		dbuf_remove_ref(db->db_parent, db);
-
-	/* remove from dn_dbufs */
-	list_remove(&dn->dn_dbufs, db);
-
-	dnode_rele(dn, db);
-
-	dbuf_hash_remove(db);
-
-	db->db_dnode = NULL;
-	db->db_parent = NULL;
-	db->db_blkptr = NULL;
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_rele(parent, db);
 }
 
 static int
@@ -1209,6 +1280,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 {
 	int nlevels, epbs;
 
+	ASSERT(blkid != DB_BONUS_BLKID);
+
 	if (dn->dn_phys->dn_nlevels == 0)
 		nlevels = 1;
 	else
@@ -1218,12 +1291,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 	ASSERT3U(level * epbs, <, 64);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
-	if (blkid == DB_BONUS_BLKID) {
-		/* this is the bonus buffer */
-		*parentp = NULL;
-		*bpp = NULL;
-		return (0);
-	} else if (level >= nlevels ||
+	if (level >= nlevels ||
 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
 		*parentp = NULL;
@@ -1235,10 +1303,13 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 		    blkid >> epbs, fail_sparse, NULL, parentp);
 		if (err)
 			return (err);
-		dbuf_read_havestruct(*parentp);
-		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
-		    (blkid & ((1ULL << epbs) - 1));
-		return (0);
+		err = dbuf_read(*parentp, NULL,
+		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+		if (err == 0) {
+			*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+			    (blkid & ((1ULL << epbs) - 1));
+		}
+		return (err);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
@@ -1266,11 +1337,21 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
-	db->db_state = DB_UNCACHED;
+	db->db_dirtied = 0;
+	db->db_dirtycnt = 0;
+	db->db_dnode = dn;
+	db->db_parent = parent;
+	db->db_blkptr = blkptr;
 
-	if (db->db_blkid == DB_BONUS_BLKID) {
+	bzero(&db->db_d, sizeof (db->db_d));
+
+	if (blkid == DB_BONUS_BLKID) {
+		ASSERT3P(parent, ==, dn->dn_dbuf);
 		db->db.db_size = dn->dn_bonuslen;
 		db->db.db_offset = DB_BONUS_BLKID;
+		db->db_state = DB_UNCACHED;
+		/* the bonus dbuf is not placed in the hash table */
+		return (db);
 	} else {
 		int blocksize =
 		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
@@ -1278,11 +1359,6 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
 
-	db->db_dirtied = 0;
-	db->db_dirtycnt = 0;
-
-	bzero(&db->db_d, sizeof (db->db_d));
-
 	/*
 	 * Hold the dn_dbufs_mtx while we get the new dbuf
 	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
 	 * dn_dbufs list.
 	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
+	db->db_state = DB_EVICTING;
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
 		kmem_cache_free(dbuf_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
 	list_insert_head(&dn->dn_dbufs, db);
+	db->db_state = DB_UNCACHED;
 	mutex_exit(&dn->dn_dbufs_mtx);
 
 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);
 
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+	    refcount_count(&dn->dn_holds) > 0);
 	(void) refcount_add(&dn->dn_holds, db);
 
-	db->db_dnode = dn;
-	db->db_parent = parent;
-	db->db_blkptr = blkptr;
-
 	dprintf_dbuf(db, "db=%p\n", db);
 
 	return (db);
 }
 
 static int
-dbuf_evictable(dmu_buf_impl_t *db)
+dbuf_do_evict(void *private)
 {
-	int i;
-
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	DBUF_VERIFY(db);
+	arc_buf_t *buf = private;
+	dmu_buf_impl_t *db = buf->b_private;
 
-	if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
-		return (FALSE);
+	if (!MUTEX_HELD(&db->db_mtx))
+		mutex_enter(&db->db_mtx);
 
-	if (!refcount_is_zero(&db->db_holds))
-		return (FALSE);
+	ASSERT(db->db_buf == buf);
+	ASSERT(refcount_is_zero(&db->db_holds));
 
-#ifdef ZFS_DEBUG
-	for (i = 0; i < TXG_SIZE; i++) {
-		ASSERT(!list_link_active(&db->db_dirty_node[i]));
-		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+	if (db->db_state != DB_EVICTING) {
+		ASSERT(db->db_state == DB_CACHED);
+		DBUF_VERIFY(db);
+		db->db_buf = NULL;
+		dbuf_evict(db);
+	} else {
+		mutex_exit(&db->db_mtx);
+		dbuf_destroy(db);
 	}
-#endif
-
-	/*
-	 * Now we know we want to free it.
-	 * This call must be done last, since it has side effects -
-	 * calling the db_evict_func().
-	 */
-	dbuf_evict_user(db);
-	return (TRUE);
+	return (0);
 }
 
 static void
@@ -1349,9 +1419,36 @@ dbuf_destroy(dmu_buf_impl_t *db)
 {
 	ASSERT(refcount_is_zero(&db->db_holds));
 
+	if (db->db_blkid != DB_BONUS_BLKID) {
+		dnode_t *dn = db->db_dnode;
+
+		/*
+		 * If this dbuf is still on the dn_dbufs list,
+		 * remove it from that list.
+		 */
+		if (list_link_active(&db->db_link)) {
+			int need_mutex;
+
+			ASSERT(!MUTEX_HELD(&dn->dn_dbufs_mtx));
+			need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+			if (need_mutex)
+				mutex_enter(&dn->dn_dbufs_mtx);
+
+			/* remove from dn_dbufs */
+			list_remove(&dn->dn_dbufs, db);
+
+			if (need_mutex)
+				mutex_exit(&dn->dn_dbufs_mtx);
+
+			dnode_rele(dn, db);
+		}
+		dbuf_hash_remove(db);
+	}
+	db->db_parent = NULL;
+	db->db_dnode = NULL;
+	db->db_buf = NULL;
+
 	ASSERT(db->db.db_data == NULL);
-	ASSERT(db->db_dnode == NULL);
-	ASSERT(db->db_parent == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
@@ -1384,14 +1481,21 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 	if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
 		if (bp && !BP_IS_HOLE(bp)) {
+			zbookmark_t zb;
+
+			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
+			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
+			zb.zb_object = dn->dn_object;
+			zb.zb_level = 0;
+			zb.zb_blkid = blkid;
+
 			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
 			    dmu_ot[dn->dn_type].ot_byteswap,
 			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-			    (ARC_NOWAIT | ARC_PREFETCH));
+			    (ARC_NOWAIT | ARC_PREFETCH), &zb);
 		}
 		if (parent && parent != dn->dn_dbuf)
-			dbuf_rele(parent);
+			dbuf_rele(parent, NULL);
 	}
 }
 
@@ -1405,11 +1509,12 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 {
 	dmu_buf_impl_t *db, *parent = NULL;
 
+	ASSERT(blkid != DB_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);
 
 	*dbp = NULL;
-
+top:
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn, level, blkid);
 
@@ -1423,13 +1528,26 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 				err = ENOENT;
 			if (err) {
 				if (parent && parent != dn->dn_dbuf)
-					dbuf_rele(parent);
+					dbuf_rele(parent, NULL);
 				return (err);
 			}
 		}
+		if (err && err != ENOENT)
+			return (err);
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}
 
+	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+		arc_buf_add_ref(db->db_buf, db);
+		if (db->db_buf->b_data == NULL) {
+			dbuf_clear(db);
+			goto top;
+		}
+		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+	}
+
+	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
 	/*
 	 * If this buffer is currently syncing out, and we are
	 * are still referencing it from db_data, we need to make
@@ -1437,7 +1555,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 	 * again in this txg.
 	 */
 	if (db->db_level == 0 && db->db_state == DB_CACHED &&
-	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_data_pending == db->db_buf) {
 		int size = (db->db_blkid == DB_BONUS_BLKID) ?
 		    DN_MAX_BONUSLEN : db->db.db_size;
@@ -1448,14 +1566,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 		    db->db.db_size);
 	}
 
-	dbuf_add_ref(db, tag);
+	(void) refcount_add(&db->db_holds, tag);
 	dbuf_update_data(db);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
 	if (parent && parent != dn->dn_dbuf)
-		dbuf_rele(parent);
+		dbuf_rele(parent, NULL);
 
 	ASSERT3P(db->db_dnode, ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
@@ -1466,81 +1584,83 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 }
 
 dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid)
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
-	(void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
-	return (db);
+	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
+	return (err ? NULL : db);
 }
 
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
-	(void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
-	return (db);
+	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	return (err ? NULL : db);
 }
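
A theme of the hunks above is that every dbuf hold is now tagged with its owner and can fail, and the matching release must use the same tag. A minimal sketch of the pairing (my_func is illustrative; the FTAG idiom is the one used throughout this patch, and the EIO mapping follows dmu_buf_hold() below):

	/* Illustrative only: tagged hold/release pairing. */
	static int
	my_func(dnode_t *dn, uint64_t blkid)
	{
		dmu_buf_impl_t *db;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		db = dbuf_hold(dn, blkid, FTAG);	/* hold named by FTAG */
		rw_exit(&dn->dn_struct_rwlock);
		if (db == NULL)
			return (EIO);		/* holds can now fail */

		/* ... read and use db->db.db_data ... */

		dbuf_rele(db, FTAG);		/* must match the hold tag */
		return (0);
	}
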
 
 dmu_buf_impl_t *
-dbuf_hold_bonus(dnode_t *dn, void *tag)
+dbuf_create_bonus(dnode_t *dn)
 {
-	dmu_buf_impl_t *db;
-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	(void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
-	rw_exit(&dn->dn_struct_rwlock);
+	dmu_buf_impl_t *db = dn->dn_bonus;
+
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+	ASSERT(dn->dn_bonus == NULL);
+	db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
 	return (db);
 }
 
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
 void
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
-	(void) refcount_add(&db->db_holds, tag);
-	/* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
+	int64_t holds = refcount_add(&db->db_holds, tag);
+	ASSERT(holds > 1);
 }
 
+#pragma weak dmu_buf_rele = dbuf_rele
 void
-dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds;
-	dnode_t *dn = db->db_dnode;
-	int need_mutex;
-
-	ASSERT(dn != NULL);
-	need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
-
-	if (need_mutex) {
-		dnode_add_ref(dn, FTAG);
-		mutex_enter(&dn->dn_dbufs_mtx);
-	}
 
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
 	holds = refcount_remove(&db->db_holds, tag);
+	ASSERT(holds >= 0);
+
+	if (holds == db->db_dirtycnt &&
+	    db->db_level == 0 && db->db_d.db_immediate_evict)
+		dbuf_evict_user(db);
 
 	if (holds == 0) {
-		ASSERT3U(db->db_state, !=, DB_FILL);
-		if (db->db_level == 0 &&
-		    db->db_d.db_user_data_ptr_ptr != NULL)
-			*db->db_d.db_user_data_ptr_ptr = NULL;
-		dbuf_evict(db);
+		if (db->db_blkid == DB_BONUS_BLKID) {
+			mutex_exit(&db->db_mtx);
+			dnode_rele(db->db_dnode, db);
+		} else if (db->db_buf == NULL) {
+			/*
+			 * This is a special case: we never associated this
+			 * dbuf with any data allocated from the ARC.
+			 */
+			ASSERT3U(db->db_state, ==, DB_UNCACHED);
+			dbuf_evict(db);
+		} else if (arc_released(db->db_buf)) {
+			arc_buf_t *buf = db->db_buf;
+			/*
+			 * This dbuf has anonymous data associated with it.
+			 */
+			dbuf_set_data(db, NULL);
+			VERIFY(arc_buf_remove_ref(buf, db) == 1);
+			dbuf_evict(db);
+		} else {
+			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+			mutex_exit(&db->db_mtx);
+		}
 	} else {
-		if (holds == db->db_dirtycnt &&
-		    db->db_level == 0 && db->db_d.db_immediate_evict)
-			dbuf_evict_user(db);
 		mutex_exit(&db->db_mtx);
 	}
-
-	if (need_mutex) {
-		mutex_exit(&dn->dn_dbufs_mtx);
-		dnode_rele(dn, FTAG);
-	}
-}
-
-void
-dbuf_rele(dmu_buf_impl_t *db)
-{
-	dbuf_remove_ref(db, NULL);
 }
 
 #pragma weak dmu_buf_refcount = dbuf_refcount
@@ -1611,6 +1731,8 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 	dnode_t *dn = db->db_dnode;
 	objset_impl_t *os = dn->dn_objset;
 	int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int checksum, compress;
+	zbookmark_t zb;
 	int blksz;
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -1638,8 +1760,38 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 	 * be modified yet.
 	 */
 
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		void **datap = &db->db_d.db_data_old[txg&TXG_MASK];
+		/*
+		 * Simply copy the bonus data into the dnode.  It will
+		 * be written out when the dnode is synced (and it will
+		 * be synced, since it must have been dirty for dbuf_sync
+		 * to be called).
+		 */
+		/*
+		 * Use dn_phys->dn_bonuslen since db.db_size is the length
+		 * of the bonus buffer in the open transaction rather than
+		 * the syncing transaction.
+		 */
+		ASSERT(*datap != NULL);
+		ASSERT3U(db->db_level, ==, 0);
+		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+		if (*datap != db->db.db_data)
+			zio_buf_free(*datap, DN_MAX_BONUSLEN);
+		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
+		db->db_data_pending = NULL;
+		if (db->db_dirtied == txg)
+			db->db_dirtied = 0;
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_rele(db, (void *)(uintptr_t)txg);
+		return;
+	}
+
 	if (db->db_level == 0) {
-		data = &db->db_d.db_data_old[txg&TXG_MASK];
+		data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
 		blksz = arc_buf_size(*data);
 		/*
 		 * If this buffer is currently "in use" (i.e., there are
@@ -1651,17 +1803,15 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
 		 * or if there is no actual write involved (bonus blocks).
 		 */
-		if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
-		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
-		    db->db_blkid != DB_BONUS_BLKID) {
+		if (dn->dn_object != DMU_META_DNODE_OBJECT &&
+		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
 			if (refcount_count(&db->db_holds) > 1 &&
 			    *data == db->db_buf) {
-				*data = arc_buf_alloc(
-				    db->db_dnode->dn_objset->os_spa, blksz, db);
+				*data = arc_buf_alloc(os->os_spa, blksz, db);
 				bcopy(db->db.db_data, (*data)->b_data, blksz);
 			}
 			db->db_data_pending = *data;
-		} else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
+		} else if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 			/*
 			 * Private object buffers are released here rather
 			 * than in dbuf_dirty() since they are only modified
@@ -1683,7 +1833,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		dbuf_rele(db, (void *)(uintptr_t)txg);
 		return;
 	}
 	blksz = db->db.db_size;
@@ -1692,35 +1842,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 
 	ASSERT(*data != NULL);
 
-	if (db->db_blkid == DB_BONUS_BLKID) {
-		/*
-		 * Simply copy the bonus data into the dnode.  It will
-		 * be written out when the dnode is synced (and it will
-		 * be synced, since it must have been dirty for dbuf_sync
-		 * to be called).  The bonus data will be byte swapped
-		 * in dnode_byteswap.
-		 */
-		/*
-		 * Use dn_phys->dn_bonuslen since db.db_size is the length
-		 * of the bonus buffer in the open transaction rather than
-		 * the syncing transaction.
-		 */
-		ASSERT3U(db->db_level, ==, 0);
-		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
-		bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
-		    dn->dn_phys->dn_bonuslen);
-		if (*data != db->db_buf)
-			arc_buf_free(*data, db);
-		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
-		db->db_data_pending = NULL;
-		if (db->db_dirtied == txg)
-			db->db_dirtied = 0;
-		ASSERT(db->db_dirtycnt > 0);
-		db->db_dirtycnt -= 1;
-		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
-		return;
-	} else if (db->db_level > 0 && !arc_released(db->db_buf)) {
+	if (db->db_level > 0 && !arc_released(db->db_buf)) {
 		/*
 		 * This indirect buffer was marked dirty, but
 		 * never modified (if it had been modified, then
@@ -1733,7 +1855,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		dbuf_rele(db, (void *)(uintptr_t)txg);
 		return;
 	} else if (db->db_blkptr == NULL &&
 	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
@@ -1757,18 +1879,18 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		if (parent == NULL) {
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			(void) dbuf_hold_impl(dn, db->db_level+1,
-			    db->db_blkid >> epbs, FALSE, NULL, &parent);
+			    db->db_blkid >> epbs, FALSE, FTAG, &parent);
 			rw_exit(&dn->dn_struct_rwlock);
 			dbuf_add_ref(parent, db);
 			db->db_parent = parent;
-			dbuf_rele(parent);
+			dbuf_rele(parent, FTAG);
 		}
-		dbuf_read(parent);
+		(void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED);
 	} else {
 		mutex_exit(&db->db_mtx);
 	}
 
-	ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL);
 
 	if (db->db_level > 0 &&
 	    db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
@@ -1801,7 +1923,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		mutex_enter(&db->db_mtx);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		dbuf_rele(db, (void *)(uintptr_t)txg);
 		return;
 	}
 
@@ -1812,20 +1934,17 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_level == parent->db_level-1);
 		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
 		/*
-		 * We may have read this block after we dirtied it,
+		 * We may have read this indirect block after we dirtied it,
 		 * so never released it from the cache.
 		 */
-		arc_release(parent->db_buf, parent);
+		arc_release(parent->db_buf, db->db_parent);
 
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
 		DBUF_VERIFY(db);
 		mutex_exit(&db->db_mtx);
-	}
-
-	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
-
 #ifdef ZFS_DEBUG
-	if (db->db_parent == dn->dn_dbuf) {
+	} else {
 		/*
 		 * We don't need to dnode_setdirty(dn) because if we got
 		 * here then the parent is already dirty.
@@ -1833,11 +1952,14 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
 		ASSERT3P(db->db_blkptr, ==,
 		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
-	}
 #endif
+	}
+	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
 	if (db->db_level == 0 &&
 	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
-		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+		arc_buf_t **old =
+		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
 		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
 		int old_size = BP_GET_ASIZE(db->db_blkptr);
 		int new_size = BP_GET_ASIZE(*bpp);
@@ -1861,7 +1983,11 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		*bpp = NULL;
 
 		if (*old != db->db_buf)
-			arc_buf_free(*old, db);
+			VERIFY(arc_buf_remove_ref(*old, db) == 1);
+		else if (!BP_IS_HOLE(db->db_blkptr))
+			arc_set_callback(db->db_buf, dbuf_do_evict, db);
+		else
+			ASSERT(arc_released(db->db_buf));
 		*old = NULL;
 		db->db_data_pending = NULL;
 
@@ -1870,54 +1996,55 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
-	} else {
-		int checksum, compress;
+		dbuf_rele(db, (void *)(uintptr_t)txg);
+		return;
+	}
 
-		if (db->db_level > 0) {
-			/*
-			 * XXX -- we should design a compression algorithm
-			 * that specializes in arrays of bps.
-			 */
-			checksum = ZIO_CHECKSUM_FLETCHER_4;
-			/* XXX - disable compresssion for now */
-			compress = ZIO_COMPRESS_OFF;
+	if (db->db_level > 0) {
+		/*
+		 * XXX -- we should design a compression algorithm
+		 * that specializes in arrays of bps.
+		 */
+		checksum = ZIO_CHECKSUM_FLETCHER_4;
+		compress = ZIO_COMPRESS_LZJB;
+	} else {
+		/*
+		 * Allow dnode settings to override objset settings,
+		 * except for metadata checksums.
+		 */
+		if (dmu_ot[dn->dn_type].ot_metadata) {
+			checksum = os->os_md_checksum;
+			compress = zio_compress_select(dn->dn_compress,
+			    os->os_md_compress);
 		} else {
-			/*
-			 * Allow dnode settings to override objset settings,
-			 * except for metadata checksums.
-			 */
-			if (dmu_ot[dn->dn_type].ot_metadata) {
-				checksum = os->os_md_checksum;
-				compress = zio_compress_select(dn->dn_compress,
-				    os->os_md_compress);
-			} else {
-				checksum = zio_checksum_select(dn->dn_checksum,
-				    os->os_checksum);
-				compress = zio_compress_select(dn->dn_compress,
-				    os->os_compress);
-			}
+			checksum = zio_checksum_select(dn->dn_checksum,
+			    os->os_checksum);
+			compress = zio_compress_select(dn->dn_compress,
+			    os->os_compress);
 		}
+	}
 #ifdef ZFS_DEBUG
-		if (db->db_parent) {
-			ASSERT(list_link_active(
-			    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
-			ASSERT(db->db_parent == dn->dn_dbuf ||
-			    db->db_parent->db_level > 0);
-			if (dn->dn_object & DMU_PRIVATE_OBJECT ||
-			    db->db_level > 0)
-				ASSERT(*data == db->db_buf);
-		}
-#endif
-		ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
-		(void) arc_write(zio, os->os_spa, checksum, compress, txg,
-		    db->db_blkptr, *data, dbuf_write_done, db,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
-		/*
-		 * We can't access db after arc_write, since it could finish
-		 * and be freed, and we have no locks on it.
-		 */
+	if (db->db_parent) {
+		ASSERT(list_link_active(
+		    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
+		ASSERT(db->db_parent == dn->dn_dbuf ||
+		    db->db_parent->db_level > 0);
+		if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
+			ASSERT(*data == db->db_buf);
 	}
+#endif
+	ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+	zb.zb_object = db->db.db_object;
+	zb.zb_level = db->db_level;
+	zb.zb_blkid = db->db_blkid;
+
+	(void) arc_write(zio, os->os_spa, checksum, compress, txg,
+	    db->db_blkptr, *data, dbuf_write_done, db,
+	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
+	/*
+	 * We can't access db after arc_write, since it could finish
+	 * and be freed, and we have no locks on it.
+	 */
 }
 
 struct dbuf_arg {
@@ -1970,12 +2097,17 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	db->db_dirtied = 0;
 
 	if (db->db_level == 0) {
-		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+		arc_buf_t **old =
+		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
 
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
 		if (*old != db->db_buf)
-			arc_buf_free(*old, db);
+			VERIFY(arc_buf_remove_ref(*old, db) == 1);
+		else if (!BP_IS_HOLE(db->db_blkptr))
+			arc_set_callback(db->db_buf, dbuf_do_evict, db);
+		else
+			ASSERT(arc_released(db->db_buf));
 		*old = NULL;
 		db->db_data_pending = NULL;
 
@@ -2007,6 +2139,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		    db->db.db_size);
 		ASSERT3U(dn->dn_phys->dn_maxblkid
 		    >> (db->db_level * epbs), >=, db->db_blkid);
+		arc_set_callback(db->db_buf, dbuf_do_evict, db);
 	}
 	for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
 		if (BP_IS_HOLE(bp))
@@ -2053,5 +2186,5 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		}
 	}
 
-	dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+	dbuf_rele(db, (void *)(uintptr_t)txg);
 }
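
Throughout dbuf.c every read and write now carries a zbookmark_t identifying the block as <objset, object, level, blkid>, which is what lets the FMA machinery name the exact block when an I/O fails. The four assignments recur verbatim at each call site; a hypothetical helper capturing the idiom would look like the sketch below (the patch itself repeats the assignments inline rather than adding such a helper):

	/* Illustrative only -- not a function this patch introduces. */
	static void
	my_fill_zbookmark(zbookmark_t *zb, objset_impl_t *os,
	    dmu_buf_impl_t *db)
	{
		/* objset 0 is used when there is no owning dataset */
		zb->zb_objset = os->os_dsl_dataset ?
		    os->os_dsl_dataset->ds_object : 0;
		zb->zb_object = db->db.db_object;
		zb->zb_level = db->db_level;
		zb->zb_blkid = db->db_blkid;
	}
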
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 14fab6d420..f883842dad 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
 */
@@ -40,6 +39,7 @@
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
+#include <sys/zio_checksum.h>
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
@@ -70,101 +70,40 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
 	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
 	{	zap_byteswap,		TRUE,	"other ZAP"		},
+	{	zap_byteswap,		TRUE,	"persistent error log"	},
 };
 
-static int
-dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
-{
-	int i, err = 0;
-	dnode_t *dn;
-	zio_t *zio;
-	int canfail;
-	uint64_t rd_sz;
-
-	if (numbufs == 0)
-		return (0);
-
-	rd_sz = numbufs * dbp[0]->db.db_size;
-	ASSERT(rd_sz <= DMU_MAX_ACCESS);
-
-	dn = dbp[0]->db_dnode;
-	if (flags & DB_RF_CANFAIL) {
-		canfail = 1;
-	} else {
-		canfail = 0;
-	}
-	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, canfail);
-
-	/* don't prefetch if read the read is large */
-	if (rd_sz >= zfetch_array_rd_sz) {
-		flags |= DB_RF_NOPREFETCH;
-	}
-
-	/* initiate async reads */
-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	for (i = 0; i < numbufs; i++) {
-		if (dbp[i]->db_state == DB_UNCACHED)
-			dbuf_read_impl(dbp[i], zio, flags);
-	}
-	rw_exit(&dn->dn_struct_rwlock);
-	err = zio_wait(zio);
-
-	if (err)
-		return (err);
-
-	/* wait for other io to complete */
-	for (i = 0; i < numbufs; i++) {
-		mutex_enter(&dbp[i]->db_mtx);
-		while (dbp[i]->db_state == DB_READ ||
-		    dbp[i]->db_state == DB_FILL)
-			cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx);
-		ASSERT(dbp[i]->db_state == DB_CACHED);
-		mutex_exit(&dbp[i]->db_mtx);
-	}
-
-	return (0);
-}
-
-void
-dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
-{
-	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-	int err;
-
-	err = dmu_buf_read_array_impl(dbp, numbufs, DB_RF_MUST_SUCCEED);
-	ASSERT(err == 0);
-}
-
 int
-dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
-{
-	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-
-	return (dmu_buf_read_array_impl(dbp, numbufs, DB_RF_CANFAIL));
-}
-
-dmu_buf_t *
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
+	int err;
 
 	/* dataset_verify(dd); */
 
-	dn = dnode_hold(os->os, object, FTAG);
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err)
+		return (err);
 	blkid = dbuf_whichblock(dn, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	db = dbuf_hold(dn, blkid);
+	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
-	dnode_rele(dn, FTAG);
-	return (&db->db);
-}
+	if (db == NULL) {
+		err = EIO;
+	} else {
+		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+		if (err) {
+			dbuf_rele(db, tag);
+			db = NULL;
+		}
+	}
 
-dmu_buf_t *
-dmu_bonus_hold(objset_t *os, uint64_t object)
-{
-	return (dmu_bonus_hold_tag(os, object, NULL));
+	dnode_rele(dn, FTAG);
+	*dbp = &db->db;
+	return (err);
 }
 
 int
@@ -174,41 +113,69 @@ dmu_bonus_max(void)
 }
 
 /*
- * Returns held bonus buffer if the object exists, NULL if it doesn't.
+ * returns ENOENT, EIO, or 0.
 */
-dmu_buf_t *
-dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 {
-	dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	dnode_t *dn;
+	int err, count;
 	dmu_buf_impl_t *db;
 
-	if (dn == NULL)
-		return (NULL);
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err)
+		return (err);
 
-	db = dbuf_hold_bonus(dn, tag);
-	/* XXX - hack: hold the first block if this is a ZAP object */
-	if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		dn->dn_db0 = dbuf_hold(dn, 0);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_bonus == NULL) {
 		rw_exit(&dn->dn_struct_rwlock);
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		if (dn->dn_bonus == NULL)
+			dn->dn_bonus = dbuf_create_bonus(dn);
 	}
+	db = dn->dn_bonus;
+	rw_exit(&dn->dn_struct_rwlock);
+	mutex_enter(&db->db_mtx);
+	count = refcount_add(&db->db_holds, tag);
+	mutex_exit(&db->db_mtx);
+	if (count == 1)
+		dnode_add_ref(dn, db);
 	dnode_rele(dn, FTAG);
-	return (&db->db);
+
+	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+
+	*dbp = &db->db;
+	return (0);
 }
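
Since dmu_bonus_hold() (and dmu_buf_hold() above it) now return an error code and hand back the held buffer through an out parameter, callers follow a check-then-rele pattern. A hedged sketch of the new calling convention; read_bonus is an illustrative name, not an interface in this patch:

	/* Illustrative only: the new hold/read/rele convention. */
	static int
	read_bonus(objset_t *os, uint64_t object)
	{
		dmu_buf_t *db;
		int err;

		/* per the comment above: returns ENOENT, EIO, or 0 */
		err = dmu_bonus_hold(os, object, FTAG, &db);
		if (err)
			return (err);

		/* ... examine db->db_data for db->db_size bytes ... */

		dmu_buf_rele(db, FTAG);		/* release with the same tag */
		return (0);
	}
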
dbuf_remove_ref(db, tag); -} - -void -dmu_buf_rele(dmu_buf_t *dbuf_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake; - - /* XXX - hack: hold the first block if this is a ZAP object */ - if (db->db_blkid == DB_BONUS_BLKID && - dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap) - dbuf_rele(db->db_dnode->dn_db0); - dbuf_rele(db); -} + /* wait for async i/o */ + err = zio_wait(zio); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } -void -dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake; + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + } + } - /* XXX - hack: hold the first block if this is a ZAP object */ - if (db->db_blkid == DB_BONUS_BLKID && - dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap) - dbuf_rele(db->db_dnode->dn_db0); - dbuf_remove_ref(db, tag); + *numbufsp = nblks; + *dbpp = dbp; + return (0); } void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs) +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; @@ -302,10 +248,10 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs) if (numbufs == 0) return; - ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS); - - for (i = 0; i < numbufs; i++) - dbuf_rele(dbp[i]); + for (i = 0; i < numbufs; i++) { + if (dbp[i]) + dbuf_rele(dbp[i], tag); + } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } @@ -315,7 +261,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) { dnode_t *dn; uint64_t blkid; - int nblks, i; + int nblks, i, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = os->os->os_meta_dnode; @@ -335,8 +281,8 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ - dn = dnode_hold(os->os, object, FTAG); - if (dn == NULL) + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); @@ -359,39 +305,44 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } -void +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + int err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); + return (0); } -static int -dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf) { dnode_t *dn; dmu_buf_t **dbp; - int numbufs, i; - - dn = dnode_hold(os->os, object, FTAG); + int numbufs, i, err; + /* + * Deal with odd block sizes, where there can't be data past the + * first block. + */ + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); if (dn->dn_datablkshift == 0) { int newsz = offset > dn->dn_datablksz ? 
0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } - dnode_rele(dn, FTAG); - if (size == 0) - return (0); - while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int err; @@ -400,13 +351,10 @@ dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ - dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs); - err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs, - flags); - if (err) { - dmu_buf_rele_array(dbp, numbufs); + err = dmu_buf_hold_array(os, object, offset, mylen, + TRUE, FTAG, &numbufs, &dbp); + if (err) return (err); - } for (i = 0; i < numbufs; i++) { int tocpy; @@ -424,36 +372,20 @@ dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dbp, numbufs); + dmu_buf_rele_array(dbp, numbufs, FTAG); } return (0); } void -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) -{ - int err; - - err = dmu_read_impl(os, object, offset, size, buf, DB_RF_MUST_SUCCEED); - ASSERT3U(err, ==, 0); -} - -int -dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) -{ - return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL)); -} - -void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; - dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs); + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { int tocpy; @@ -481,7 +413,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dbp, numbufs); + dmu_buf_rele_array(dbp, numbufs, FTAG); } #ifdef _KERNEL @@ -493,7 +425,10 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, int numbufs, i; int err = 0; - dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs); + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); for (i = 0; i < numbufs; i++) { int tocpy; @@ -530,7 +465,7 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, offset += tocpy; size -= tocpy; } - dmu_buf_rele_array(dbp, numbufs); + dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #endif @@ -539,6 +474,7 @@ struct backuparg { dmu_replay_record_t *drr; vnode_t *vp; objset_t *os; + zio_cksum_t zc; int err; }; @@ -546,8 +482,9 @@ static int dump_bytes(struct backuparg *ba, void *buf, int len) { ssize_t resid; /* have to get resid to get detailed errno */ - /* Need to compute checksum here */ ASSERT3U(len % 8, ==, 0); + + fletcher_4_incremental_native(buf, len, &ba->zc); ba->err = vn_rdwr(UIO_WRITE, ba->vp, (caddr_t)buf, len, 0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid); @@ -652,7 +589,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) void *data = bc->bc_data; int err = 0; - if (issig(JUSTLOOKING)) + if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); ASSERT(data || bp == NULL); @@ -681,16 +618,21 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) int blksz = BP_GET_LSIZE(bp); if (data == NULL) { arc_buf_t *abuf; + zbookmark_t zb; + zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object; + zb.zb_object = object; + zb.zb_level = level; + 
zb.zb_blkid = blkid; (void) arc_read(NULL, spa, bp, dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, - ARC_WAIT); + ARC_WAIT, &zb); if (abuf) { err = dump_data(ba, type, object, blkid * blksz, blksz, abuf->b_data); - arc_buf_free(abuf, &abuf); + (void) arc_buf_remove_ref(abuf, &abuf); } } else { err = dump_data(ba, type, object, blkid * blksz, @@ -736,6 +678,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp) ba.drr = drr; ba.vp = vp; ba.os = tosnap; + ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { kmem_free(drr, sizeof (dmu_replay_record_t)); @@ -755,6 +698,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp) bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = ba.zc; if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) return (ba.err); @@ -773,6 +717,7 @@ struct restorearg { int buflen; /* number of valid bytes in buf */ int bufoff; /* next offset to read */ int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t zc; }; static int @@ -789,8 +734,11 @@ replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) if (dd->dd_phys->dd_head_dataset_obj == 0) goto die; - ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG); + err = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, + NULL, DS_MODE_EXCLUSIVE, FTAG, &ds); + if (err) + goto die; if (ds == NULL) { err = EBUSY; @@ -804,9 +752,11 @@ replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) } /* most recent snapshot must match fromguid */ - ds_prev = dsl_dataset_open_obj(dd->dd_pool, + err = dsl_dataset_open_obj(dd->dd_pool, ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); + DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds_prev); + if (err) + goto die; if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) { err = ENODEV; goto die; @@ -885,9 +835,8 @@ replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) /* the point of no (unsuccessful) return */ - err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname, - DS_MODE_EXCLUSIVE, FTAG, &ds); - ASSERT3U(err, ==, 0); + VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname, + DS_MODE_EXCLUSIVE, FTAG, &ds)); kmem_free(fsfullname, MAXNAMELEN); (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), @@ -921,9 +870,8 @@ replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) return (err); /* set snapshot's creation time and guid */ - err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname, - DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds); - ASSERT3U(err, ==, 0); + VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname, + DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_creation_time = drrb->drr_creation_time; @@ -932,8 +880,9 @@ replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); - ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, - NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, + NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG, &ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_restoring = FALSE; dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); @@ -959,8 +908,6 @@ restore_read(struct restorearg *ra, 
int len) ra->voff, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid); - /* Need to compute checksum */ - ra->voff += ra->bufsize - leftover - resid; ra->buflen = ra->bufsize - resid; ra->bufoff = 0; @@ -968,12 +915,17 @@ restore_read(struct restorearg *ra, int len) ra->err = EINVAL; if (ra->err) return (NULL); + /* Could compute checksum here? */ } ASSERT3U(ra->bufoff % 8, ==, 0); ASSERT3U(ra->buflen - ra->bufoff, >=, len); rv = ra->buf + ra->bufoff; ra->bufoff += len; + if (ra->byteswap) + fletcher_4_incremental_byteswap(rv, len, &ra->zc); + else + fletcher_4_incremental_native(rv, len, &ra->zc); return (rv); } @@ -1016,7 +968,10 @@ backup_byteswap(dmu_replay_record_t *drr) DO64(drr_free.drr_length); break; case DRR_END: - DO64(drr_end.drr_checksum); + DO64(drr_end.drr_checksum.zc_word[0]); + DO64(drr_end.drr_checksum.zc_word[1]); + DO64(drr_end.drr_checksum.zc_word[2]); + DO64(drr_end.drr_checksum.zc_word[3]); break; } #undef DO64 @@ -1089,7 +1044,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) if (drro->drr_bonuslen) { dmu_buf_t *db; void *data; - db = dmu_bonus_hold(os, drro->drr_object); + VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, ==, drro->drr_bonuslen); @@ -1103,7 +1058,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, drro->drr_bonuslen); } - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); return (0); @@ -1202,21 +1157,22 @@ restore_free(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } - dmu_free_range(os, drrf->drr_object, + err = dmu_free_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length, tx); dmu_tx_commit(tx); - return (0); + return (err); } int -dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, +dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, vnode_t *vp, uint64_t voffset) { struct restorearg ra; dmu_replay_record_t *drr; - char *cp, *tosnap; + char *cp; dsl_dir_t *dd = NULL; objset_t *os = NULL; + zio_cksum_t pzc; bzero(&ra, sizeof (ra)); ra.vp = vp; @@ -1233,6 +1189,23 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, goto out; } + /* + * NB: this assumes that struct drr_begin will be the largest in + * dmu_replay_record_t's drr_u, and thus we don't need to pad it + * with zeros to make it the same length as we wrote out. 
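+	 * If drr_u ever grew a larger member, the BEGIN record rebuilt
+	 * here would checksum differently from the one dmu_sendbackup()
+	 * wrote, and the DRR_END comparison below would always fail.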
+ */ + ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; + ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; + ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(ra.buf, + sizeof (dmu_replay_record_t), &ra.zc); + } else { + fletcher_4_incremental_native(ra.buf, + sizeof (dmu_replay_record_t), &ra.zc); + } + (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ + if (ra.byteswap) { drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_version = BSWAP_64(drrb->drr_version); @@ -1244,7 +1217,6 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - tosnap = drrb->drr_toname; if (drrb->drr_version != DMU_BACKUP_VERSION || drrb->drr_type >= DMU_OST_NUMTYPES || strchr(drrb->drr_toname, '@') == NULL) { @@ -1260,12 +1232,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, cp = strchr(tosnap, '@'); *cp = '\0'; - dd = dsl_dir_open(tosnap, FTAG, NULL); + ra.err = dsl_dir_open(tosnap, FTAG, &dd, NULL); *cp = '@'; - if (dd == NULL) { - ra.err = ENOENT; + if (ra.err) goto out; - } ra.err = dsl_dir_sync_task(dd, replay_incremental_sync, drrb, 1<<20); @@ -1275,12 +1245,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, cp = strchr(tosnap, '@'); *cp = '\0'; - dd = dsl_dir_open(tosnap, FTAG, &tail); + ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); *cp = '@'; - if (dd == NULL) { - ra.err = ENOENT; + if (ra.err) goto out; - } if (tail == NULL) { ra.err = EEXIST; goto out; @@ -1306,9 +1274,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, /* * Read records and process them. */ + pzc = ra.zc; while (ra.err == 0 && NULL != (drr = restore_read(&ra, sizeof (*drr)))) { - if (issig(JUSTLOOKING)) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { ra.err = EINTR; goto out; } @@ -1348,7 +1317,22 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, break; } case DRR_END: - /* Need to verify checksum. */ + { + struct drr_end drre = drr->drr_u.drr_end; + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. + */ + if (drre.drr_checksum.zc_word[0] != 0 && + ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) | + (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) | + (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) | + (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) { + ra.err = ECKSUM; + goto out; + } + /* * dd may be the parent of the dd we are * restoring into (eg. if it's a full backup). @@ -1356,10 +1340,12 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, ra.err = dsl_dir_sync_task(dmu_objset_ds(os)-> ds_dir, replay_end_sync, drrb, 1<<20); goto out; + } default: ra.err = EINVAL; goto out; } + pzc = ra.zc; } out: @@ -1443,6 +1429,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, dmu_buf_impl_t *db; blkptr_t *blk; int err; + zbookmark_t zb; ASSERT(RW_LOCK_HELD(&tx->tx_suspend)); ASSERT(BP_IS_HOLE(bp)); @@ -1452,6 +1439,11 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); /* + * XXX why is this routine using dmu_buf_*() and casting between + * dmu_buf_impl_t and dmu_buf_t? + */ + + /* * If this txg already synced, there's nothing to do. */ if (txg <= tx->tx_synced_txg) { @@ -1459,7 +1451,10 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, * If we're running ziltest, we need the blkptr regardless. 
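	 * (spa_freeze_txg() is UINT64_MAX unless the pool has been
	 * frozen for ZIL testing, so this path is never taken in
	 * normal operation.)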
*/ if (txg > spa_freeze_txg(dp->dp_spa)) { - db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset); + err = dmu_buf_hold(os, object, offset, + FTAG, (dmu_buf_t **)&db); + if (err) + return (err); /* if db_blkptr == NULL, this was an empty write */ if (db->db_blkptr) *bp = *db->db_blkptr; /* structure assignment */ @@ -1467,7 +1462,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, bzero(bp, sizeof (blkptr_t)); *blkoff = offset - db->db.db_offset; ASSERT3U(*blkoff, <, db->db.db_size); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); return (0); } return (EALREADY); @@ -1481,7 +1476,9 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, return (EINPROGRESS); } - db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset); + err = dmu_buf_hold(os, object, offset, FTAG, (dmu_buf_t **)&db); + if (err) + return (err); mutex_enter(&db->db_mtx); @@ -1491,7 +1488,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, */ if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) { mutex_exit(&db->db_mtx); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); return (ENOENT); } @@ -1505,7 +1502,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, ASSERT(blk != IN_DMU_SYNC); if (blk == IN_DMU_SYNC) { mutex_exit(&db->db_mtx); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); return (EBUSY); } arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); @@ -1522,11 +1519,15 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); blk->blk_birth = 0; /* mark as invalid */ + zb.zb_objset = os->os->os_dsl_dataset->ds_object; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; err = arc_write(NULL, os->os->os_spa, zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum), zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress), txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb); ASSERT(err == 0); if (!BP_IS_HOLE(blk)) { @@ -1546,7 +1547,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL); arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); mutex_exit(&db->db_mtx); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); /* Note that this block does not free on disk until txg syncs */ /* @@ -1563,7 +1564,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, db->db_d.db_overridden_by[txg&TXG_MASK] = blk; mutex_exit(&db->db_mtx); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); ASSERT3U(txg, >, tx->tx_syncing_txg); return (0); } @@ -1571,7 +1572,10 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); uint64_t rv = dnode_max_nonzero_offset(dn); dnode_rele(dn, FTAG); return (rv); @@ -1581,8 +1585,13 @@ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); - int err = dnode_set_blksz(dn, 
size, ibs, tx); + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } @@ -1591,7 +1600,10 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); @@ -1602,7 +1614,10 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); dn->dn_compress = compress; dnode_setdirty(dn, tx); @@ -1615,7 +1630,9 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) dnode_t *dn; int i, err; - dn = dnode_hold(os->os, object, FTAG); + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); /* * Sync any current changes before * we go trundling through the block pointers. @@ -1627,7 +1644,9 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) if (i != TXG_SIZE) { dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); - dn = dnode_hold(os->os, object, FTAG); + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); } err = dnode_next_offset(dn, hole, off, 1, 1); @@ -1665,10 +1684,11 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + int err = dnode_hold(os->os, object, FTAG, &dn); - if (dn == NULL) - return (ENOENT); + if (err) + return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); @@ -1699,6 +1719,71 @@ dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) *nblk512 = dn->dn_phys->dn_secphys + 1; /* add 1 for dnode space */ } +/* + * Given a bookmark, return the name of the dataset, object, and range in + * human-readable format. 
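+ * This exists for the error-reporting paths added with this change
+ * (the persistent error log and FMA ereports), where the raw
+ * zbookmark_t fields would be meaningless to an administrator.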
+ */ +int +spa_bookmark_name(spa_t *spa, zbookmark_t *zb, char *dsname, size_t dslen, + char *objname, size_t objlen, char *range, size_t rangelen) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + objset_t *os = NULL; + dnode_t *dn = NULL; + int err, shift; + + if (dslen < MAXNAMELEN || objlen < 32 || rangelen < 64) + return (ENOSPC); + + dp = spa_get_dsl(spa); + if (zb->zb_objset != 0) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_open_obj(dp, zb->zb_objset, + NULL, DS_MODE_NONE, FTAG, &ds); + if (err) { + rw_exit(&dp->dp_config_rwlock); + return (err); + } + dsl_dataset_name(ds, dsname); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + rw_exit(&dp->dp_config_rwlock); + + err = dmu_objset_open(dsname, DMU_OST_ANY, DS_MODE_NONE, &os); + if (err) + goto out; + + } else { + dsl_dataset_name(NULL, dsname); + os = dp->dp_meta_objset; + } + + + if (zb->zb_object == DMU_META_DNODE_OBJECT) { + (void) strncpy(objname, "mdn", objlen); + } else { + (void) snprintf(objname, objlen, "%lld", + (longlong_t)zb->zb_object); + } + + err = dnode_hold(os->os, zb->zb_object, FTAG, &dn); + if (err) + goto out; + + shift = (dn->dn_datablkshift?dn->dn_datablkshift:SPA_MAXBLOCKSHIFT) + + zb->zb_level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); + (void) snprintf(range, rangelen, "%llu-%llu", + (u_longlong_t)(zb->zb_blkid << shift), + (u_longlong_t)((zb->zb_blkid+1) << shift)); + +out: + if (dn) + dnode_rele(dn, FTAG); + if (os && os != dp->dp_meta_objset) + dmu_objset_close(os); + return (err); +} + void byteswap_uint64_array(void *vbuf, size_t size) { diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c index d150d6c400..99d40c5ec5 100644 --- a/usr/src/uts/common/fs/zfs/dmu_object.c +++ b/usr/src/uts/common/fs/zfs/dmu_object.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,7 +38,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); - dnode_t *dn; + dnode_t *dn = NULL; int restarted = B_FALSE; mutex_enter(&osi->os_obj_lock); @@ -62,7 +61,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, } osi->os_obj_next = ++object; - dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG); + /* + * XXX We should check for an i/o error here and return + * up to our caller. Actually we should pre-read it in + * dmu_tx_assign(), but there is currently no mechanism + * to do so. 
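+	 * Until then, an i/o error here is indistinguishable from an
+	 * in-use slot: dn stays NULL and we keep searching for a free
+	 * object number.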
+ */ + (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, + FTAG, &dn); if (dn) break; @@ -84,13 +90,14 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { dnode_t *dn; + int err; - if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx)) + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (EBADF); - dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG); - if (dn == NULL) - return (EEXIST); + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + if (err) + return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); dnode_rele(dn, FTAG); @@ -103,13 +110,15 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { dnode_t *dn; + int err; - if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx)) + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (EBADF); - dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG); - if (dn == NULL) - return (EBADF); + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); dnode_rele(dn, FTAG); @@ -120,12 +129,14 @@ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; + int err; - ASSERT(!(object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx)); + ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG); - if (dn == NULL) - return (ENOENT); + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); ASSERT(dn->dn_type != DMU_OT_NONE); dnode_free(dn, tx); diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 8d77ff70c0..6625fdb98d 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -127,8 +126,9 @@ dmu_objset_byteswap(void *buf, size_t size) osp->os_type = BSWAP_64(osp->os_type); } -objset_impl_t * -dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) +int +dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + objset_impl_t **osip) { objset_impl_t *winner, *osi; int i, err, checksum; @@ -141,15 +141,25 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) osi->os_rootbp = *bp; osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t)); if (!BP_IS_HOLE(&osi->os_rootbp)) { + zbookmark_t zb; + zb.zb_objset = ds ? 
ds->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; + dprintf_bp(&osi->os_rootbp, "reading %s", ""); - (void) arc_read(NULL, spa, &osi->os_rootbp, + err = arc_read(NULL, spa, &osi->os_rootbp, dmu_ot[DMU_OT_OBJSET].ot_byteswap, arc_bcopy_func, osi->os_phys, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT, &zb); + if (err) { + zio_buf_free(osi->os_phys, sizeof (objset_phys_t)); + kmem_free(osi, sizeof (objset_impl_t)); + return (err); + } } else { bzero(osi->os_phys, sizeof (objset_phys_t)); } - osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); /* * Note: the changed_cb will be called once before the register @@ -159,18 +169,22 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) if (ds) { err = dsl_prop_register(ds, "checksum", checksum_changed_cb, osi); - ASSERT(err == 0); - - err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); - ASSERT(err == 0); + if (err == 0) + err = dsl_prop_register(ds, "compression", + compression_changed_cb, osi); + if (err) { + zio_buf_free(osi->os_phys, sizeof (objset_phys_t)); + kmem_free(osi, sizeof (objset_impl_t)); + return (err); + } } else { /* It's the meta-objset. */ - /* XXX - turn off metadata compression temporarily */ osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_compress = ZIO_COMPRESS_OFF; + osi->os_compress = ZIO_COMPRESS_LZJB; } + osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); + /* * Metadata always gets compressed and checksummed. * If the data checksum is multi-bit correctable, and it's not @@ -184,9 +198,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) osi->os_md_checksum = checksum; else osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4; - - /* XXX - turn off metadata compression temporarily */ - osi->os_md_compress = ZIO_COMPRESS_OFF; + osi->os_md_compress = ZIO_COMPRESS_LZJB; for (i = 0; i < TXG_SIZE; i++) { list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), @@ -210,7 +222,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) } } - return (osi); + *osip = osi; + return (0); } /* called from zpl */ @@ -235,7 +248,13 @@ dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, blkptr_t bp; dsl_dataset_get_blkptr(ds, &bp); - osi = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &bp); + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), + ds, &bp, &osi); + if (err) { + dsl_dataset_close(ds, mode, os); + kmem_free(os, sizeof (objset_t)); + return (err); + } } os->os = osi; @@ -257,9 +276,51 @@ dmu_objset_close(objset_t *os) } void +dmu_objset_evict_dbufs(objset_t *os) +{ + objset_impl_t *osi = os->os; + dnode_t *mdn = osi->os_meta_dnode; + dnode_t *dn; + int allzero = B_TRUE; + + /* + * Each time we process an entry on the list, we first move it + * to the tail so that we don't process it over and over again. + * We use the meta-dnode as a marker: if we make a complete pass + * over the list without finding any work to do, we're done. + * This ensures that we complete in linear time rather than + * quadratic time, as described in detail in bug 1182169. 
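+	 * Concretely: every dnode we visit is rotated to the tail
+	 * behind the meta-dnode marker; reaching the marker again with
+	 * allzero still set means a whole pass saw only hold-free
+	 * dnodes, so there is no more eviction work to queue.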
+ */ + mutex_enter(&osi->os_lock); + list_remove(&osi->os_dnodes, mdn); + list_insert_tail(&osi->os_dnodes, mdn); + while ((dn = list_head(&osi->os_dnodes)) != NULL) { + list_remove(&osi->os_dnodes, dn); + list_insert_tail(&osi->os_dnodes, dn); + if (dn == mdn) { + if (allzero) + break; + allzero = B_TRUE; + continue; + } + if (!refcount_is_zero(&dn->dn_holds)) { + allzero = B_FALSE; + dnode_add_ref(dn, FTAG); + mutex_exit(&osi->os_lock); + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); + mutex_enter(&osi->os_lock); + } + } + mutex_exit(&osi->os_lock); + dnode_evict_dbufs(mdn); +} + +void dmu_objset_evict(dsl_dataset_t *ds, void *arg) { objset_impl_t *osi = arg; + objset_t os; int err, i; for (i = 0; i < TXG_SIZE; i++) { @@ -277,6 +338,13 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) ASSERT(err == 0); } + /* + * We should need only a single pass over the dnode list, since + * nothing can be added to the list at this point. + */ + os.os = osi; + dmu_objset_evict_dbufs(&os); + ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); @@ -297,7 +365,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type, dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); - osi = dmu_objset_open_impl(spa, ds, NULL); + VERIFY(0 == dmu_objset_open_impl(spa, ds, NULL, &osi)); mdn = osi->os_meta_dnode; dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, @@ -314,9 +382,21 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type, * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ - if (ds != NULL) + if (ds != NULL) { + int levels = 1; + + /* + * Determine the number of levels necessary for the meta-dnode + * to contain DN_MAX_OBJECT dnodes. + */ + while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < + DN_MAX_OBJECT * sizeof (dnode_phys_t)) + levels++; + mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = - mdn->dn_nlevels = DN_META_DNODE_LEVELS; + mdn->dn_nlevels = levels; + } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); @@ -354,9 +434,8 @@ dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) if (err) return (err); - err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname, - DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds); - ASSERT3U(err, ==, 0); + VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname, + DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds)); dsl_dataset_get_blkptr(ds, &bp); if (BP_IS_HOLE(&bp)) { objset_impl_t *osi; @@ -382,9 +461,9 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, const char *tail; int err = 0; - pds = dsl_dir_open(name, FTAG, &tail); - if (pds == NULL) - return (ENOENT); + err = dsl_dir_open(name, FTAG, &pds, &tail); + if (err) + return (err); if (tail == NULL) { dsl_dir_close(pds, FTAG); return (EEXIST); @@ -554,6 +633,7 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx) int txgoff; list_t *dirty_list; int err; + zbookmark_t zb; arc_buf_t *abuf = arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG); @@ -586,11 +666,15 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx) * Sync the root block. */ bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t)); + zb.zb_objset = os->os_dsl_dataset ? 
os->os_dsl_dataset->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; err = arc_write(NULL, os->os_spa, os->os_md_checksum, os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb); ASSERT(err == 0); - arc_buf_free(abuf, FTAG); + VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1); dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx); @@ -707,10 +791,10 @@ dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags) zap_cursor_t zc; zap_attribute_t attr; char *child; - int do_self; + int do_self, err; - dd = dsl_dir_open(name, FTAG, NULL); - if (dd == NULL) + err = dsl_dir_open(name, FTAG, &dd, NULL); + if (err) return; do_self = (dd->dd_phys->dd_head_dataset_obj != 0); diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index fedeba015d..fbc55fec86 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -339,7 +338,7 @@ traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, } else { error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, - th->th_zio_flags | ZIO_FLAG_DONT_CACHE)); + th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb)); if (BP_SHOULD_BYTESWAP(bp) && error == 0) (zb->zb_level > 0 ? 
byteswap_uint64_array : @@ -469,13 +468,70 @@ get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, return (rc); } +/* ARGSUSED */ +static void +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t maxtxg) +{ + traverse_handle_t *th = arg; + traverse_blk_cache_t *bc = &th->th_zil_cache; + zbookmark_t *zb = &bc->bc_bookmark; + + if (bp->blk_birth < maxtxg) { + zb->zb_object = 0; + zb->zb_blkid = bp->blk_cksum.zc_word[3]; + bc->bc_blkptr = *bp; + (void) th->th_func(bc, th->th_spa, th->th_arg); + } +} + +/* ARGSUSED */ +static void +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t maxtxg) +{ + traverse_handle_t *th = arg; + traverse_blk_cache_t *bc = &th->th_zil_cache; + zbookmark_t *zb = &bc->bc_bookmark; + + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + + if (bp->blk_birth != 0 && bp->blk_birth < maxtxg) { + zb->zb_object = lr->lr_foid; + zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + bc->bc_blkptr = *bp; + (void) th->th_func(bc, th->th_spa, th->th_arg); + } + } +} + +static void +traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc, uint64_t maxtxg) +{ + spa_t *spa = th->th_spa; + objset_phys_t *osphys = bc->bc_data; + dsl_pool_t *dp = spa_get_dsl(spa); + zilog_t *zilog; + + ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]); + ASSERT(bc->bc_bookmark.zb_level == -1); + + th->th_zil_cache.bc_bookmark = bc->bc_bookmark; + + zilog = zil_alloc(dp->dp_meta_objset, &osphys->os_zil_header); + + zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, maxtxg); + + zil_free(zilog); +} + static int traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) { zbookmark_t *zb = &zseg->seg_start; traverse_blk_cache_t *bc; dnode_phys_t *dn, *dn_tmp; - int worklimit = 1000; + int worklimit = 100; int rc; dprintf("<%llu, %llu, %d, %llx>\n", @@ -529,6 +585,8 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) if (zb->zb_level == -1) { ASSERT(zb->zb_object == 0); + ASSERT(zb->zb_blkid == 0); + ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET); if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { rc = traverse_callback(th, zseg, bc); @@ -536,6 +594,9 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) ASSERT(rc == EINTR); return (rc); } + if ((th->th_advance & ADVANCE_ZIL) && + zb->zb_objset != 0) + traverse_zil(th, bc, zseg->seg_maxtxg); } return (advance_from_osphys(zseg, th->th_advance)); diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 6576107ae2..894bd63f36 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -37,6 +37,9 @@ #include <sys/spa.h> #include <sys/zfs_context.h> +typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, + uint64_t arg1, uint64_t arg2); + #ifdef ZFS_DEBUG int dmu_use_tx_debug_bufs = 1; #endif @@ -60,6 +63,7 @@ dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir); tx->tx_objset = os; + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); return (tx); } @@ -85,7 +89,7 @@ dmu_tx_is_syncing(dmu_tx_t *tx) int dmu_tx_private_ok(dmu_tx_t *tx) { - return (tx->tx_anyobj || tx->tx_privateobj); + return (tx->tx_anyobj); } static void @@ -95,11 +99,16 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, { dmu_tx_hold_t *dth; dnode_t *dn = NULL; + int err; if (object != DMU_NEW_OBJECT) { - dn = dnode_hold(os->os, object, tx); + err = 
dnode_hold(os->os, object, tx, &dn); + if (err) { + tx->tx_err = err; + return; + } - if (tx->tx_txg != 0) { + if (err == 0 && tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a @@ -118,15 +127,12 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); dth->dth_dnode = dn; dth->dth_type = type; - dth->dth_func = func; dth->dth_arg1 = arg1; dth->dth_arg2 = arg2; - /* - * XXX Investigate using a different data structure to keep - * track of dnodes in a tx. Maybe array, since there will - * generally not be many entries? - */ list_insert_tail(&tx->tx_holds, dth); + + if (func) + func(tx, dn, arg1, arg2); } void @@ -142,11 +148,27 @@ dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) } } +static int +dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) +{ + int err; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, level, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) + return (EIO); + err = dbuf_read(db, zio, DB_RF_CANFAIL); + dbuf_rele(db, FTAG); + return (err); +} + /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { - uint64_t start, end, space; + uint64_t start, end, i, space; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; if (len == 0) @@ -158,6 +180,64 @@ dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) max_ibs = DN_MAX_INDBLKSHIFT; /* + * For i/o error checking, read the first and last level-0 + * blocks, and all the level-1 blocks. We needn't do this on + * the meta-dnode, because we've already read it in. + */ + + if (dn && dn->dn_object != DMU_META_DNODE_OBJECT) { + int err; + + if (dn->dn_maxblkid == 0) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) { + tx->tx_err = err; + return; + } + } else { + zio_t *zio = zio_root(tx->tx_pool->dp_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + start = off/dn->dn_datablksz; + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err) { + tx->tx_err = err; + return; + } + + /* last level-0 block */ + end = (off+len)/dn->dn_datablksz; + if (end != start) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err) { + tx->tx_err = err; + return; + } + } + + /* level-1 blocks */ + if (dn->dn_nlevels > 1) { + start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; + end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = start+1; i < end; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) { + tx->tx_err = err; + return; + } + } + } + + err = zio_wait(zio); + if (err) { + tx->tx_err = err; + return; + } + } + } + + /* * If there's more than one block, the blocksize can't change, * so we can make a more precise estimate. Alternatively, * if the dnode's ibs is larger than max_ibs, always use that. 
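
The hold-time checks above all follow one pattern worth naming: issue every speculative read as a child of a single CANFAIL root zio so the reads run in parallel, then let zio_wait() fold the children into a single errno. A minimal sketch using only the dmu_tx_check_ioerr() helper and the zio_root()/zio_wait() calls shown in this hunk — the wrapper name is illustrative, and reaping the root zio on the early-error path is a choice this sketch makes, not something the hunk itself does:

static int
dmu_tx_preread_l1(dmu_tx_t *tx, dnode_t *dn, uint64_t start, uint64_t end)
{
	zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL);
	uint64_t i;
	int err;

	/* one async level-1 read per indirect block in [start, end] */
	for (i = start; i <= end; i++) {
		err = dmu_tx_check_ioerr(zio, dn, 1, i);
		if (err) {
			(void) zio_wait(zio);	/* reap issued children */
			return (err);
		}
	}

	/* zio_wait() returns the first error, if any, among the reads */
	return (zio_wait(zio));
}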
@@ -218,7 +298,7 @@ dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn) dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT); if (dn && dn->dn_dbuf->db_blkptr && dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr->blk_birth, tx)) { + dn->dn_dbuf->db_blkptr->blk_birth)) { tx->tx_space_tooverwrite += tx->tx_space_towrite - pre_write_space; tx->tx_space_towrite = pre_write_space; @@ -237,7 +317,7 @@ void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { ASSERT(tx->tx_txg == 0); - ASSERT(len > 0 && len < DMU_MAX_ACCESS); + ASSERT(len < DMU_MAX_ACCESS); ASSERT(UINT64_MAX - off >= len - 1); dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, @@ -251,8 +331,6 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) uint64_t space = 0; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - ASSERT(dn->dn_assigned_tx == tx || dn->dn_assigned_tx == NULL); - if (dn->dn_datablkshift == 0) return; /* @@ -264,8 +342,10 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) blkid = off >> dn->dn_datablkshift; nblks = (off + len) >> dn->dn_datablkshift; - if (blkid >= dn->dn_maxblkid) - goto out; + if (blkid >= dn->dn_maxblkid) { + rw_exit(&dn->dn_struct_rwlock); + return; + } if (blkid + nblks > dn->dn_maxblkid) nblks = dn->dn_maxblkid - blkid; @@ -278,12 +358,12 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) blkptr_t *bp = dn->dn_phys->dn_blkptr; ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp->blk_birth, tx)) { + if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); space += BP_GET_ASIZE(bp); } } - goto out; + nblks = 0; } while (nblks) { @@ -299,20 +379,26 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) int i; blkptr_t *bp; - dbuf_read_havestruct(dbuf); + err = dbuf_read(dbuf, NULL, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { + tx->tx_err = err; + dbuf_rele(dbuf, FTAG); + break; + } bp = dbuf->db.db_data; bp += blkoff; for (i = 0; i < tochk; i++) { if (dsl_dataset_block_freeable(ds, - bp[i].blk_birth, tx)) { + bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); space += BP_GET_ASIZE(&bp[i]); } } - dbuf_remove_ref(dbuf, FTAG); + dbuf_rele(dbuf, FTAG); } else { /* the indirect block is sparse */ ASSERT(err == ENOENT); @@ -321,7 +407,6 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) blkid += tochk; nblks -= tochk; } -out: rw_exit(&dn->dn_struct_rwlock); tx->tx_space_tofree += space; @@ -330,7 +415,9 @@ out: static void dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { - int dirty; + uint64_t start, end, i; + int dirty, err, shift; + zio_t *zio; /* first block */ if (off != 0 /* || dn->dn_maxblkid == 0 */) @@ -339,13 +426,46 @@ dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) if (len != DMU_OBJECT_END) dmu_tx_count_write(tx, dn, off+len, 1); - dmu_tx_count_dnode(tx, dn); - if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + /* + * For i/o error checking, read the first and last level-0 + * blocks, and all the level-1 blocks. The above count_write's + * will take care of the level-0 blocks. + */ + shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; + start = off >> shift; + end = dn->dn_datablkshift ? 
((off+len) >> shift) : 0; + + zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (i = start+1; i < end; i++) { + uint64_t ibyte = i << shift; + err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1); + i = ibyte >> shift; + if (err == ESRCH) + break; + if (err) { + tx->tx_err = err; + return; + } + + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) { + tx->tx_err = err; + return; + } + } + err = zio_wait(zio); + if (err) { + tx->tx_err = err; + return; + } + + dmu_tx_count_dnode(tx, dn); + /* XXX locking */ dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] | dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3]; @@ -364,17 +484,17 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) /* ARGSUSED */ static void -dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops) +dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t add, uint64_t iname) { uint64_t nblocks; - int epbs; + int epbs, err; + char *name = (char *)(uintptr_t)iname; dmu_tx_count_dnode(tx, dn); if (dn == NULL) { /* - * Assuming that nops+cops is not super huge, we will be - * able to fit a new object's entries into one leaf + * We will be able to fit a new object's entries into one leaf * block. So there will be at most 2 blocks total, * including the header block. */ @@ -384,25 +504,44 @@ dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops) ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); - if (dn->dn_maxblkid == 0 && nops == 0) { + if (dn->dn_maxblkid == 0 && !add) { /* * If there is only one block (i.e. this is a micro-zap) - * and we are only doing updates, the accounting is simple. + * and we are not adding anything, the accounting is simple. */ + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) { + tx->tx_err = err; + return; + } + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth, tx)) + dn->dn_phys->dn_blkptr[0].blk_birth)) tx->tx_space_tooverwrite += dn->dn_datablksz; else tx->tx_space_towrite += dn->dn_datablksz; return; } + if (dn->dn_maxblkid > 0 && name) { + /* + * access the name in this fat-zap so that we'll check + * for i/o errors to the leaf blocks, etc. + */ + err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + 8, 0, NULL); + if (err == EIO) { + tx->tx_err = err; + return; + } + } + /* - * 3 blocks overwritten per op: target leaf, ptrtbl block, header block - * 3 new blocks written per op: new split leaf, 2 grown ptrtbl blocks + * 3 blocks overwritten: target leaf, ptrtbl block, header block + * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks */ dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz, - (nops * 6ULL + cops * 3ULL) << dn->dn_datablkshift); + (3 + add ? 
3 : 0) << dn->dn_datablkshift); /* * If the modified blocks are scattered to the four winds, @@ -410,17 +549,16 @@ dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops) */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - tx->tx_space_towrite += - ((nops + cops) * 3ULL) << dn->dn_indblkshift; + tx->tx_space_towrite += 3 << dn->dn_indblkshift; } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops) +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, - dmu_tx_hold_zap_impl, (ops > 0?ops:0), (ops < 0?-ops:0)); + dmu_tx_hold_zap_impl, add, (uintptr_t)name); } void @@ -492,7 +630,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) return; /* XXX No checking on the meta dnode for now */ - if (db->db.db_object & DMU_PRIVATE_OBJECT) + if (db->db.db_object == DMU_META_DNODE_OBJECT) return; for (dth = list_head(&tx->tx_holds); dth; @@ -572,20 +710,19 @@ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth) { dmu_tx_hold_t *dth; - uint64_t lsize, asize, fsize; + uint64_t lsize, asize, fsize, towrite; *last_dth = NULL; - tx->tx_space_towrite = 0; - tx->tx_space_tofree = 0; - tx->tx_space_tooverwrite = 0; tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) return (ERESTART); + if (tx->tx_err) + return (tx->tx_err); for (dth = list_head(&tx->tx_holds); dth; - *last_dth = dth, dth = list_next(&tx->tx_holds, dth)) { + dth = list_next(&tx->tx_holds, dth)) { dnode_t *dn = dth->dth_dnode; if (dn != NULL) { mutex_enter(&dn->dn_mtx); @@ -608,8 +745,21 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth) (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } - if (dth->dth_func) - dth->dth_func(tx, dn, dth->dth_arg1, dth->dth_arg2); + *last_dth = dth; + if (tx->tx_err) + return (tx->tx_err); + } + + /* + * If a snapshot has been taken since we made our estimates, + * assume that we won't be able to free or overwrite anything. 
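+	 * (A snapshot pins every block born at or before its txg, so
+	 * space we counted as freeable, or overwritable in place, must
+	 * now be charged as brand-new writes.)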
+ */ + if (tx->tx_objset && + dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + tx->tx_lastsnap_txg) { + tx->tx_space_towrite += tx->tx_space_tooverwrite; + tx->tx_space_tooverwrite = 0; + tx->tx_space_tofree = 0; } /* @@ -619,13 +769,16 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth) tx->tx_space_tofree; lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite; asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + towrite = tx->tx_space_towrite; tx->tx_space_towrite = asize; if (tx->tx_dir && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); - if (err) + if (err) { + tx->tx_space_towrite = towrite; return (err); + } } return (0); @@ -688,8 +841,6 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) ASSERT(tx->tx_txg == 0); ASSERT(txg_how != 0); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); - ASSERT3U(tx->tx_space_towrite, ==, 0); - ASSERT3U(tx->tx_space_tofree, ==, 0); while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) { uint64_t txg = dmu_tx_unassign(tx, last_dth); diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index 03ce2a0398..8adb692ec8 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -155,7 +155,7 @@ dnode_verify(dnode_t *dn) } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(IS_DNODE_DNODE(dn->dn_object) || dn->dn_dbuf); + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + @@ -307,6 +307,11 @@ dnode_destroy(dnode_t *dn) dn->dn_dirtyctx_firstset = NULL; } dmu_zfetch_rele(&dn->dn_zfetch); + if (dn->dn_bonus) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } kmem_cache_free(dnode_cache, dn); } @@ -381,13 +386,10 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - dmu_buf_impl_t *db = NULL; - ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - ASSERT(!(dn->dn_object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx)); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); @@ -398,6 +400,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT(dn->dn_dirtyblksz[2] == 0); ASSERT(dn->dn_dirtyblksz[3] == 0); + /* clean up any unreferenced dbufs */ + dnode_evict_dbufs(dn); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + /* * XXX I should really have a generation number to tell if we * need to do this... 
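
Looping back to the dmu_tx_try_assign() hunk above: the snapshot check there is the reason dmu_tx_create() now records tx_lastsnap_txg. Pulled out as a stand-alone helper it would read as below — the name is hypothetical and the tx->tx_objset NULL check from the hunk is assumed to have passed; the body is otherwise lifted from the hunk:

static void
dmu_tx_downgrade_estimates(dmu_tx_t *tx)
{
	dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;

	/*
	 * A snapshot created after dmu_tx_create() pins the blocks we
	 * planned to free or overwrite in place, so recount that space
	 * as new writes and drop the free credit entirely.
	 */
	if (dsl_dataset_prev_snap_txg(ds) > tx->tx_lastsnap_txg) {
		tx->tx_space_towrite += tx->tx_space_tooverwrite;
		tx->tx_space_tooverwrite = 0;
		tx->tx_space_tofree = 0;
	}
}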
@@ -421,17 +427,25 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_type = ot; if (dn->dn_bonuslen != bonuslen) { + dmu_buf_impl_t *db = NULL; + /* change bonus size */ if (bonuslen == 0) bonuslen = 1; /* XXX */ - db = dbuf_hold_bonus(dn, FTAG); - dbuf_read(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dn->dn_bonus = dbuf_create_bonus(dn); + db = dn->dn_bonus; + rw_exit(&dn->dn_struct_rwlock); + if (refcount_add(&db->db_holds, FTAG) == 1) + dnode_add_ref(dn, db); mutex_enter(&db->db_mtx); ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); ASSERT(db->db.db_data != NULL); db->db.db_size = bonuslen; mutex_exit(&db->db_mtx); dbuf_dirty(db, tx); + dbuf_rele(db, FTAG); } /* change bonus size and type */ @@ -445,14 +459,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); - - if (db) - dbuf_remove_ref(db, FTAG); } void dnode_special_close(dnode_t *dn) { + /* + * Wait for final references to the dnode to clear. This can + * only happen if the arc is asyncronously evicting state that + * has a hold on this dnode while we are trying to evict this + * dnode. + */ + while (refcount_count(&dn->dn_holds) > 0) + delay(1); dnode_destroy(dn); } @@ -498,21 +517,25 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg) } /* - * Returns held dnode if the object number is valid, NULL if not. - * Note that this will succeed even for free dnodes. + * errors: + * EINVAL - invalid object number. + * EIO - i/o error. + * succeeds even for free dnodes. */ -dnode_t * -dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref) +int +dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, + void *tag, dnode_t **dnp) { - int epb, idx; + int epb, idx, err; int drop_struct_lock = FALSE; + int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; dnode_t **children_dnodes; if (object == 0 || object >= DN_MAX_OBJECT) - return (NULL); + return (EINVAL); mdn = os->os_meta_dnode; @@ -525,10 +548,16 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref) blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); - db = dbuf_hold(mdn, blk); + db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); - dbuf_read(db); + if (db == NULL) + return (EIO); + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err) { + dbuf_rele(db, FTAG); + return (err); + } ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); epb = db->db.db_size >> DNODE_SHIFT; @@ -559,51 +588,53 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref) } mutex_enter(&dn->dn_mtx); + type = dn->dn_type; if (dn->dn_free_txg || - ((flag & DNODE_MUST_BE_ALLOCATED) && dn->dn_type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && dn->dn_type != DMU_OT_NONE)) { + ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || + ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { mutex_exit(&dn->dn_mtx); - dbuf_rele(db); - return (NULL); + dbuf_rele(db, FTAG); + return (type == DMU_OT_NONE ? ENOENT : EEXIST); } mutex_exit(&dn->dn_mtx); - if (refcount_add(&dn->dn_holds, ref) == 1) + if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dn); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); - dbuf_rele(db); + dbuf_rele(db, FTAG); - return (dn); + *dnp = dn; + return (0); } /* * Return held dnode if the object is allocated, NULL if not. 
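 * With the errno-based interface this now means, for a typical
 * caller (there is no longer a NULL return to check):
 *
 *	dnode_t *dn;
 *	int err = dnode_hold(os->os, object, FTAG, &dn);
 *	if (err)
 *		return (err);	... EINVAL, EIO, or ENOENT
 *	... use dn ...
 *	dnode_rele(dn, FTAG);	... tag must match the hold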
*/ -dnode_t * -dnode_hold(objset_impl_t *os, uint64_t object, void *ref) +int +dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) { - return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ref)); + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } void -dnode_add_ref(dnode_t *dn, void *ref) +dnode_add_ref(dnode_t *dn, void *tag) { ASSERT(refcount_count(&dn->dn_holds) > 0); - (void) refcount_add(&dn->dn_holds, ref); + (void) refcount_add(&dn->dn_holds, tag); } void -dnode_rele(dnode_t *dn, void *ref) +dnode_rele(dnode_t *dn, void *tag) { uint64_t refs; - refs = refcount_remove(&dn->dn_holds, ref); + refs = refcount_remove(&dn->dn_holds, tag); /* NOTE: the meta-dnode does not have a dn_dbuf */ if (refs == 0 && dn->dn_dbuf) - dbuf_remove_ref(dn->dn_dbuf, dn); + dbuf_rele(dn->dn_dbuf, dn); } void @@ -612,7 +643,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - if (IS_DNODE_DNODE(dn->dn_object)) + if (dn->dn_object == DMU_META_DNODE_OBJECT) return; DNODE_VERIFY(dn); @@ -658,7 +689,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. */ - (void) refcount_add(&dn->dn_holds, (void *)(uintptr_t)tx->tx_txg); + dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); dbuf_dirty(dn->dn_dbuf, tx); @@ -764,7 +795,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) } /* obtain the old block */ - db = dbuf_hold(dn, 0); + db = dbuf_hold(dn, 0, FTAG); dbuf_new_size(db, size, tx); @@ -773,7 +804,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) /* don't need dd_dirty_mtx, dnode is already dirty */ dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; - dbuf_rele(db); + dbuf_rele(db, FTAG); err = 0; end: @@ -844,7 +875,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); dprintf("dn %p dirtying left indirects\n", dn); dbuf_dirty(db, tx); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } #ifdef ZFS_DEBUG else if (old_nlevels > 1 && new_nlevels > old_nlevels) { @@ -855,7 +886,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG); ASSERT(!
list_link_active(&db->db_dirty_node[txgoff])); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } } #endif @@ -976,7 +1007,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) data = db->db.db_data; bzero(data + start, head); } - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } off += head; len -= head; @@ -1009,7 +1040,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } len -= tail; } @@ -1022,7 +1053,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) db = dbuf_hold_level(dn, 1, (off - head) >> (blkshift + epbs), FTAG); dbuf_will_dirty(db, tx); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } /* dirty the right indirects */ @@ -1030,7 +1061,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) db = dbuf_hold_level(dn, 1, (off + len + tail - 1) >> (blkshift + epbs), FTAG); dbuf_will_dirty(db, tx); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } /* @@ -1189,7 +1220,8 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, return (hole ? 0 : ESRCH); return (error); } - dbuf_read_havestruct(db); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_HAVESTRUCT); data = db->db.db_data; } @@ -1228,7 +1260,7 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, } if (db) - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); return (error); } diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 597cafb44e..dcfb9ee7d2 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
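From here on, both dnode.c above and dnode_sync.c below funnel every read through the single flag-driven dbuf_read() entry point, replacing the old dbuf_read_havestruct() and unchecked dbuf_read() calls. A hedged sketch of the two call shapes the change standardizes on, using only the flags visible in these hunks:

	/* fallible read: the caller propagates the error */
	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
	if (err) {
		dbuf_rele(db, FTAG);
		return (err);
	}

	/* infallible read; the caller already holds dn_struct_rwlock */
	(void) dbuf_read(db, NULL, DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED);

The (void) cast marks the call sites that assert success rather than handle failure, which keeps the remaining unchecked reads easy to audit for the I/O-error hardening this change is after.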
@@ -48,13 +47,15 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* this dnode can't be paged out because it's dirty */ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); + ASSERT(db != NULL); for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) break; if (i != dn->dn_phys->dn_nblkptr) { ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK])); - dbuf_read_havestruct(db); + (void) dbuf_read(db, NULL, + DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED); arc_release(db->db_buf, db); /* copy dnode's block pointers to new indirect block */ ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=, @@ -102,7 +103,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } static void @@ -163,7 +164,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) /* db_data_old better be zeroed */ if (child->db_d.db_data_old[txg & TXG_MASK]) { - buf = (child->db_d.db_data_old[txg & TXG_MASK])->b_data; + buf = ((arc_buf_t *)child->db_d.db_data_old + [txg & TXG_MASK])->b_data; for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " @@ -194,7 +196,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } mutex_exit(&child->db_mtx); - dbuf_remove_ref(child, FTAG); + dbuf_rele(child, FTAG); } } #endif @@ -211,7 +213,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, int txgoff = tx->tx_txg & TXG_MASK; int all = TRUE; - dbuf_read(db); + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); arc_release(db->db_buf, db); bp = (blkptr_t *)db->db.db_data; @@ -254,7 +256,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, } else { all = FALSE; } - dbuf_remove_ref(subdb, FTAG); + dbuf_rele(subdb, FTAG); } #ifdef ZFS_DEBUG bp -= (end-start)+1; @@ -326,7 +328,7 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) ASSERT3P(db->db_blkptr, ==, bp); free_blocks(dn, bp, 1, tx); } - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } if (trunc) { uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * @@ -338,6 +340,48 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) } } +/* + * Try to kick all the dnode's dbufs out of the cache... + */ +void +dnode_evict_dbufs(dnode_t *dn) +{ + dmu_buf_impl_t *db; + + mutex_enter(&dn->dn_dbufs_mtx); + while (db = list_head(&dn->dn_dbufs)) { + int progress = 0; + for (; db; db = list_next(&dn->dn_dbufs, db)) { + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + refcount_is_zero(&db->db_holds)) + break; + else if (db->db_state == DB_EVICTING) + progress = 1; + mutex_exit(&db->db_mtx); + } + if (db) { + ASSERT(!arc_released(db->db_buf)); + dbuf_clear(db); + mutex_exit(&dn->dn_dbufs_mtx); + progress = 1; + } else { + if (progress == 0) + break; + mutex_exit(&dn->dn_dbufs_mtx); + } + mutex_enter(&dn->dn_dbufs_mtx); + } + mutex_exit(&dn->dn_dbufs_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } + rw_exit(&dn->dn_struct_rwlock); +} + static int dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) { @@ -352,32 +396,35 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* XXX - use dbuf_undirty()?
*/ list_remove(&dn->dn_dirty_dbufs[txgoff], db); if (db->db_level == 0) { - ASSERT3P(db->db_d.db_data_old[txgoff], ==, db->db_buf); + ASSERT(db->db_blkid == DB_BONUS_BLKID || + db->db_d.db_data_old[txgoff] == db->db_buf); if (db->db_d.db_overridden_by[txgoff]) dbuf_unoverride(db, tx->tx_txg); db->db_d.db_data_old[txgoff] = NULL; } db->db_dirtycnt -= 1; mutex_exit(&db->db_mtx); - dbuf_remove_ref(db, (void *)(uintptr_t)tx->tx_txg); + dbuf_rele(db, (void *)(uintptr_t)tx->tx_txg); } - ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); + dnode_evict_dbufs(dn); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + /* + * XXX - It would be nice to assert this, but we may still + * have residual holds from async evictions from the arc... + * + * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); + */ /* Undirty next bits */ dn->dn_next_nlevels[txgoff] = 0; dn->dn_next_indblkshift[txgoff] = 0; /* free up all the blocks in the file. */ - dbuf_free_range(dn, 0, -1, tx); dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); ASSERT3U(dn->dn_phys->dn_secphys, ==, 0); - /* - * All dbufs should be gone, since all holds are gone... - */ - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); @@ -394,7 +441,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_allocated_txg = 0; mutex_exit(&dn->dn_mtx); - ASSERT(!IS_DNODE_DNODE(dn->dn_object)); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); /* @@ -420,7 +467,7 @@ dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx) /* ASSERT(dn->dn_objset->dd_snapshot == NULL); */ ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(IS_DNODE_DNODE(dn->dn_object) || + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyblksz[txgoff] > 0); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); @@ -533,7 +580,7 @@ dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx) dn->dn_dirtyblksz[txgoff] = 0; - if (!IS_DNODE_DNODE(dn->dn_object)) { + if (dn->dn_object != DMU_META_DNODE_OBJECT) { dbuf_will_dirty(dn->dn_dbuf, tx); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index e77b772922..7db7745270 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -146,7 +145,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -used, -compressed, -uncompressed, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); - bplist_enqueue(&ds->ds_deadlist, bp, tx); + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ if (ds->ds_phys->ds_prev_snap_obj != 0) { ASSERT3U(ds->ds_prev->ds_object, ==, @@ -175,14 +174,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) mutex_exit(&ds->ds_lock); } -int -dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx) +uint64_t +dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) { - uint64_t prev_snap_txg; + uint64_t txg; dsl_dir_t *dd; - /* ASSERT that it is not a snapshot */ + if (ds == NULL) - return (TRUE); + return (0); /* * The snapshot creation could fail, but that would cause an * incorrect FALSE return, which would only result in an @@ -195,13 +194,19 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx) */ dd = ds->ds_dir; mutex_enter(&dd->dd_lock); - if (dd->dd_sync_func == dsl_dataset_snapshot_sync && - dd->dd_sync_txg < tx->tx_txg) - prev_snap_txg = dd->dd_sync_txg; + if (dd->dd_sync_func == dsl_dataset_snapshot_sync) + txg = dd->dd_sync_txg; else - prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; + txg = ds->ds_phys->ds_prev_snap_txg; mutex_exit(&dd->dd_lock); - return (blk_birth > prev_snap_txg); + + return (txg); +} + +int +dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) +{ + return (blk_birth > dsl_dataset_prev_snap_txg(ds)); } /* ARGSUSED */ @@ -236,7 +241,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) kmem_free(ds, sizeof (dsl_dataset_t)); } -static void +static int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; @@ -246,34 +251,37 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) objset_t *mos = dp->dp_meta_objset; if (ds->ds_snapname[0]) - return; + return (0); if (ds->ds_phys->ds_next_snap_obj == 0) - return; + return (0); - headdbuf = dmu_bonus_hold_tag(mos, - ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG); - dmu_buf_read(headdbuf); + err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &headdbuf); + if (err) + return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname); - ASSERT(err == 0); - dmu_buf_rele_tag(headdbuf, FTAG); + dmu_buf_rele(headdbuf, FTAG); + return (err); } -dsl_dataset_t * +int dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, - int mode, void *tag) + int mode, void *tag, dsl_dataset_t **dsp) { uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; + int err; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); - dbuf = dmu_bonus_hold_tag(mos, dsobj, tag); - dmu_buf_read(dbuf); + err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); + if (err) + return (err); ds = dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner; @@ -282,47 +290,60 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, ds->ds_dbuf = dbuf; ds->ds_object = dsobj; ds->ds_phys = dbuf->db_data; - ds->ds_dir = dsl_dir_open_obj(dp, - ds->ds_phys->ds_dir_obj, NULL, ds); - bplist_open(&ds->ds_deadlist, + err = bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + if (err == 0) { + err = dsl_dir_open_obj(dp, + ds->ds_phys->ds_dir_obj, NULL, 
ds, &ds->ds_dir); + } + if (err) { + /* + * we don't really need to close the blist if we + * just opened it. + */ + kmem_free(ds, sizeof (dsl_dataset_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { ds->ds_snapname[0] = '\0'; if (ds->ds_phys->ds_prev_snap_obj) { - ds->ds_prev = - dsl_dataset_open_obj(dp, + err = dsl_dataset_open_obj(dp, ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds); + DS_MODE_NONE, ds, &ds->ds_prev); } } else { if (snapname) { #ifdef ZFS_DEBUG dsl_dataset_phys_t *headphys; - int err; - dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos, - ds->ds_dir->dd_phys-> - dd_head_dataset_obj, FTAG); - dmu_buf_read(headdbuf); - headphys = headdbuf->db_data; - uint64_t foundobj; - err = zap_lookup(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, - snapname, sizeof (foundobj), 1, &foundobj); - ASSERT3U(err, ==, 0); - ASSERT3U(foundobj, ==, dsobj); - dmu_buf_rele_tag(headdbuf, FTAG); + dmu_buf_t *headdbuf; + err = dmu_bonus_hold(mos, + ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &headdbuf); + if (err == 0) { + headphys = headdbuf->db_data; + uint64_t foundobj; + err = zap_lookup(dp->dp_meta_objset, + headphys->ds_snapnames_zapobj, + snapname, sizeof (foundobj), 1, + &foundobj); + ASSERT3U(foundobj, ==, dsobj); + dmu_buf_rele(headdbuf, FTAG); + } #endif (void) strcat(ds->ds_snapname, snapname); } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - dsl_dataset_get_snapname(ds); + err = dsl_dataset_get_snapname(ds); } } - winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, - dsl_dataset_evict); - if (winner) { + if (err == 0) { + winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, + dsl_dataset_evict); + } + if (err || winner) { bplist_close(&ds->ds_deadlist); if (ds->ds_prev) { dsl_dataset_close(ds->ds_prev, @@ -330,6 +351,10 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, } dsl_dir_close(ds->ds_dir, ds); kmem_free(ds, sizeof (dsl_dataset_t)); + if (err) { + dmu_buf_rele(dbuf, tag); + return (err); + } ds = winner; } else { uint64_t new = @@ -349,12 +374,13 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, (ds->ds_open_refcount + weight > DOS_REF_MAX)) { mutex_exit(&ds->ds_lock); dsl_dataset_close(ds, DS_MODE_NONE, tag); - return (NULL); + return (EBUSY); } ds->ds_open_refcount += weight; mutex_exit(&ds->ds_lock); - return (ds); + *dsp = ds; + return (0); } int @@ -368,9 +394,9 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, dsl_dataset_t *ds = NULL; int err = 0; - dd = dsl_dir_open_spa(spa, name, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail); + if (err) + return (err); dp = dd->dd_pool; obj = dd->dd_phys->dd_head_dataset_obj; @@ -384,7 +410,10 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, if (tail != NULL) { objset_t *mos = dp->dp_meta_objset; - ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag); + err = dsl_dataset_open_obj(dp, obj, NULL, + DS_MODE_NONE, tag, &ds); + if (err) + goto out; obj = ds->ds_phys->ds_snapnames_zapobj; dsl_dataset_close(ds, DS_MODE_NONE, tag); ds = NULL; @@ -405,9 +434,7 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, if (err) goto out; } - ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag); - if (ds == NULL) - err = EBUSY; + err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds); out: rw_exit(&dp->dp_config_rwlock); @@ -433,7 +460,7 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) (void) strcpy(name, "mos"); } 
else { dsl_dir_name(ds->ds_dir, name); - dsl_dataset_get_snapname(ds); + VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); if (!MUTEX_HELD(&ds->ds_lock)) { @@ -462,7 +489,7 @@ dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag) mode, ds->ds_open_refcount); mutex_exit(&ds->ds_lock); - dmu_buf_rele_tag(ds->ds_dbuf, tag); + dmu_buf_rele(ds->ds_dbuf, tag); } void @@ -476,16 +503,16 @@ dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) dsl_dir_t *dd; dsl_dir_create_root(mos, ddobjp, tx); - dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG); - ASSERT(dd != NULL); + VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd)); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - dbuf = dmu_bonus_hold(mos, dsobj); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_fsid_guid = unique_create(); + unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_snapnames_zapobj = @@ -494,13 +521,14 @@ dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) dsphys->ds_creation_txg = tx->tx_txg; dsphys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; dsl_dir_close(dd, FTAG); - ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == + dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds)); (void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); } @@ -537,14 +565,13 @@ dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname, err = dsl_dir_create_sync(pds, lastname, tx); if (err) return (err); - dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL); - ASSERT(dd != NULL); + VERIFY(0 == dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, &dd, NULL)); /* This is the point of no (unsuccessful) return */ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - dbuf = dmu_bonus_hold(mos, dsobj); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->ds_dir_obj = dd->dd_object; @@ -576,7 +603,7 @@ dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname, dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object; } - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; @@ -594,9 +621,9 @@ dsl_dataset_destroy(const char *name) dsl_dir_t *dd; const char *tail; - dd = dsl_dir_open(name, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(name, FTAG, &dd, &tail); + if (err) + return (err); dp = dd->dd_pool; if (tail != NULL) { @@ -631,10 +658,12 @@ dsl_dataset_destroy(const char *name) * dsl_dataset_destroy_sync() to destroy the head dataset. 
*/ rw_enter(&dp->dp_config_rwlock, RW_READER); - pds = dsl_dir_open_obj(dd->dd_pool, - dd->dd_phys->dd_parent_obj, NULL, FTAG); + err = dsl_dir_open_obj(dd->dd_pool, + dd->dd_phys->dd_parent_obj, NULL, FTAG, &pds); dsl_dir_close(dd, FTAG); rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); (void) strcpy(buf, name); cp = strrchr(buf, '/') + 1; @@ -657,9 +686,9 @@ dsl_dataset_rollback(const char *name) dsl_dir_t *dd; const char *tail; - dd = dsl_dir_open(name, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(name, FTAG, &dd, &tail); + if (err) + return (err); if (tail != NULL) { dsl_dir_close(dd, FTAG); @@ -777,11 +806,14 @@ dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *ds; + int err; if (dd->dd_phys->dd_head_dataset_obj == 0) return (EINVAL); - ds = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); + err = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &ds); + if (err) + return (err); if (ds->ds_phys->ds_prev_snap_txg == 0) { /* @@ -823,7 +855,8 @@ dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, + ds->ds_phys->ds_deadlist_obj)); dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj); { @@ -891,27 +924,23 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) drop_lock = TRUE; } - ds = dsl_dataset_open_obj(dd->dd_pool, + err = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, NULL, - snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG); + snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG, &ds); - if (snapname) { + if (err == 0 && snapname) { err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &obj); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - if (err) { - if (drop_lock) - rw_exit(&dp->dp_config_rwlock); - return (err); + if (err == 0) { + err = dsl_dataset_open_obj(dd->dd_pool, obj, NULL, + DS_MODE_EXCLUSIVE, FTAG, &ds); } - - ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL, - DS_MODE_EXCLUSIVE, FTAG); } - if (ds == NULL) { + if (err) { if (drop_lock) rw_exit(&dp->dp_config_rwlock); - return (EBUSY); + return (err); } obj = ds->ds_object; @@ -942,22 +971,25 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) * them. Try again. 
*/ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) { - mutex_exit(&ds->ds_lock); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); if (drop_lock) rw_exit(&dp->dp_config_rwlock); return (EAGAIN); } - /* THE POINT OF NO (unsuccessful) RETURN */ - if (ds->ds_phys->ds_prev_snap_obj != 0) { if (ds->ds_prev) { ds_prev = ds->ds_prev; } else { - ds_prev = dsl_dataset_open_obj(dd->dd_pool, + err = dsl_dataset_open_obj(dd->dd_pool, ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, FTAG); + DS_MODE_NONE, FTAG, &ds_prev); + if (err) { + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + if (drop_lock) + rw_exit(&dp->dp_config_rwlock); + return (err); + } } after_branch_point = (ds_prev->ds_phys->ds_next_snap_obj != obj); @@ -974,6 +1006,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) } } + /* THE POINT OF NO (unsuccessful) RETURN */ + ASSERT3P(tx->tx_pool, ==, dd->dd_pool); zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); @@ -983,8 +1017,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) spa_scrub_restart(dp->dp_spa, tx->tx_txg); - ds_next = dsl_dataset_open_obj(dd->dd_pool, - ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + ds->ds_phys->ds_next_snap_obj, NULL, + DS_MODE_NONE, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); dmu_buf_will_dirty(ds_next->ds_dbuf, tx); @@ -1006,7 +1041,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { - bplist_enqueue(&ds->ds_deadlist, &bp, tx); + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, + &bp, tx)); if (ds_prev && !after_branch_point && bp.blk_birth > ds_prev->ds_phys->ds_prev_snap_txg) { @@ -1030,8 +1066,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) /* set next's deadlist to our deadlist */ ds_next->ds_phys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - bplist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj)); ds->ds_phys->ds_deadlist_obj = 0; if (ds_next->ds_phys->ds_next_snap_obj != 0) { @@ -1049,9 +1085,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) */ dsl_dataset_t *ds_after_next; - ds_after_next = dsl_dataset_open_obj(dd->dd_pool, + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, ds_next->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG); + DS_MODE_NONE, FTAG, &ds_after_next)); itor = 0; while (bplist_iterate(&ds_after_next->ds_deadlist, &itor, &bp) == 0) { @@ -1078,9 +1114,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, ds_next); if (ds_prev) { - ds_next->ds_prev = dsl_dataset_open_obj( - dd->dd_pool, ds->ds_phys->ds_prev_snap_obj, - NULL, DS_MODE_NONE, ds_next); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, ds_next, &ds_next->ds_prev)); } else { ds_next->ds_prev = NULL; } @@ -1144,8 +1180,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) } else { /* remove from snapshot namespace */ dsl_dataset_t *ds_head; - ds_head = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, + DS_MODE_NONE, FTAG, &ds_head)); #ifdef ZFS_DEBUG { uint64_t val; @@ -1195,8 +1232,10 
@@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) if (dd->dd_phys->dd_head_dataset_obj == 0) return (EINVAL); - ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL, - DS_MODE_NONE, FTAG); + err = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL, + DS_MODE_NONE, FTAG, &ds); + if (err) + return (err); err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &value); @@ -1217,7 +1256,7 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - dbuf = dmu_bonus_hold(mos, dsobj); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->ds_dir_obj = dd->dd_object; @@ -1237,13 +1276,14 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; dsphys->ds_restoring = ds->ds_phys->ds_restoring; dsphys->ds_bp = ds->ds_phys->ds_bp; - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); if (ds->ds_phys->ds_prev_snap_obj != 0) { dsl_dataset_t *ds_prev; - ds_prev = dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, FTAG, &ds_prev)); ASSERT(ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object || ds_prev->ds_phys->ds_num_children > 1); @@ -1266,7 +1306,8 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) ds->ds_phys->ds_unique_bytes = 0; ds->ds_phys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, + ds->ds_phys->ds_deadlist_obj)); dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, @@ -1275,8 +1316,9 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) if (ds->ds_prev) dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); - ds->ds_prev = dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds); + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, snapname, + DS_MODE_NONE, ds, &ds->ds_prev)); rw_exit(&dp->dp_config_rwlock); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); @@ -1295,7 +1337,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_dir_dirty(ds->ds_dir, tx); bplist_close(&ds->ds_deadlist); - dmu_buf_remove_ref(ds->ds_dbuf, ds); + dmu_buf_rele(ds->ds_dbuf, ds); } void @@ -1319,7 +1361,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds) dds->dds_creation_txg = ds->ds_phys->ds_creation_txg; dds->dds_space_refd = ds->ds_phys->ds_used_bytes; dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid; - dds->dds_guid = ds->ds_phys->ds_guid; if (ds->ds_phys->ds_next_snap_obj) { /* @@ -1332,8 +1373,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds) dds->dds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; } - - dds->dds_objset_obj = ds->ds_object; } dsl_pool_t * @@ -1375,10 +1414,11 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) } /* new fs better exist */ - nds = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail); - if (nds == NULL) { + err = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, + FTAG, &nds, &tail); + if (err) { dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); - return (ENOENT); + return (err); } dsl_dir_close(nds, 
FTAG); @@ -1397,8 +1437,12 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) tail++; - fsds = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); + err = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &fsds); + if (err) { + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (err); + } /* new name better not be in use */ err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj, @@ -1414,7 +1458,7 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) /* The point of no (unsuccessful) return */ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER); - dsl_dataset_get_snapname(snds); + VERIFY(0 == dsl_dataset_get_snapname(snds)); err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj, snds->ds_snapname, tx); ASSERT3U(err, ==, 0); @@ -1440,9 +1484,9 @@ dsl_dataset_rename(const char *osname, const char *newname) struct osrenamearg ora; int err; - dd = dsl_dir_open(osname, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(osname, FTAG, &dd, &tail); + if (err) + return (err); if (tail == NULL) { err = dsl_dir_sync_task(dd, dsl_dir_rename_sync, (void*)newname, 1<<12); diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 4ea1d62de5..8ffa145477 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
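The dsl_dir open interfaces below get the same conversion as the dataset interfaces above: they return an errno and hand the directory back through an out parameter, instead of returning a pointer whose NULL conflates ENOENT, EIO, and EBUSY. A sketch of the resulting caller shape, assembled from the call sites in this diff (variable names illustrative):

	dsl_dir_t *dd;
	const char *tail;
	int err;

	err = dsl_dir_open(name, FTAG, &dd, &tail);
	if (err)
		return (err);
	/* tail, if non-NULL, names a trailing snapshot or new child */
	...
	dsl_dir_close(dd, FTAG);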
@@ -76,18 +75,20 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) kmem_free(dd, sizeof (dsl_dir_t)); } -dsl_dir_t * +int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag) + const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; + int err; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); - dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag); - dmu_buf_read(dbuf); + err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); + if (err) + return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { @@ -112,8 +113,13 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, offsetof(dsl_prop_cb_record_t, cbr_node)); if (dd->dd_phys->dd_parent_obj) { - dd->dd_parent = dsl_dir_open_obj(dp, - dd->dd_phys->dd_parent_obj, NULL, dd); + err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, + NULL, dd, &dd->dd_parent); + if (err) { + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; @@ -122,8 +128,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_parent->dd_phys-> dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); - ASSERT3U(err, ==, 0); - ASSERT3U(foundobj, ==, ddobj); + ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { @@ -131,11 +136,12 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_parent->dd_phys-> dd_child_dir_zapobj, ddobj, dd->dd_myname); - /* - * The caller should be protecting this ddobj - * from being deleted concurrently - */ - ASSERT(err == 0); + } + if (err) { + dsl_dir_close(dd->dd_parent, dd); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); } } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); @@ -166,7 +172,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); - return (dd); + *ddp = dd; + return (0); } void @@ -174,7 +181,7 @@ dsl_dir_close(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); - dmu_buf_rele_tag(dd->dd_dbuf, tag); + dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be long enough (MAXNAMELEN should do) */ @@ -266,8 +273,9 @@ getcomponent(const char *path, char *component, const char **nextp) * same as dsl_dir_open, ignore the first component of name and use the * spa instead */ -dsl_dir_t * -dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) +int +dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, + dsl_dir_t **ddp, const char **tailp) { char buf[MAXNAMELEN]; const char *next, *nextnext = NULL; @@ -280,15 +288,15 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) dprintf("%s\n", name); if (name == NULL) - return (NULL); + return (ENOENT); err = getcomponent(name, buf, &next); if (err) - return (NULL); + return (err); if (spa == NULL) { err = spa_open(buf, &spa, FTAG); if (err) { dprintf("spa_open(%s) failed\n", buf); - return (NULL); + return (err); } openedspa = TRUE; @@ -299,17 +307,19 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); - dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag); + err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err) { + rw_exit(&dp->dp_config_rwlock); + if (openedspa) + spa_close(spa, FTAG); + return (err); + } + while (next != NULL) { dsl_dir_t *child_ds;
err = getcomponent(next, buf, &nextnext); - if (err) { - dsl_dir_close(dd, tag); - rw_exit(&dp->dp_config_rwlock); - if (openedspa) - spa_close(spa, FTAG); - return (NULL); - } + if (err) + break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; @@ -321,18 +331,28 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) err = zap_lookup(dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); - if (err == ENOENT) { + if (err) { + if (err == ENOENT) + err = 0; break; } - ASSERT(err == 0); - child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag); + err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); + if (err) + break; dsl_dir_close(dd, tag); dd = child_ds; next = nextnext; } rw_exit(&dp->dp_config_rwlock); + if (err) { + dsl_dir_close(dd, tag); + if (openedspa) + spa_close(spa, FTAG); + return (err); + } + /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. @@ -342,14 +362,14 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) /* bad path name */ dsl_dir_close(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); - next = NULL; - dd = NULL; + err = ENOENT; } if (tailp) *tailp = next; if (openedspa) spa_close(spa, FTAG); - return (dd); + *ddp = dd; + return (err); } /* @@ -358,10 +378,10 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' * means that the last component is a snapshot. */ -dsl_dir_t * -dsl_dir_open(const char *name, void *tag, const char **tailp) +int +dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { - return (dsl_dir_open_spa(NULL, name, tag, tailp)); + return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); } int @@ -397,7 +417,7 @@ dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n", name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err); - dbuf = dmu_bonus_hold(mos, ddobj); + VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; @@ -407,7 +427,7 @@ dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); dsphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); rw_exit(&pds->dd_pool->dp_config_rwlock); @@ -431,7 +451,9 @@ dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx) if (err) goto out; - dd = dsl_dir_open_obj(dp, obj, name, FTAG); + err = dsl_dir_open_obj(dp, obj, name, FTAG, &dd); + if (err) + goto out; ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object); if (dmu_buf_refcount(dd->dd_dbuf) > 1) { @@ -512,7 +534,7 @@ dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) sizeof (uint64_t), 1, ddobjp, tx); ASSERT3U(error, ==, 0); - dbuf = dmu_bonus_hold(mos, *ddobjp); + VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsp = dbuf->db_data; @@ -522,7 +544,7 @@ dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) dsp->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); } void @@ -530,7 +552,6 @@ dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds) { bzero(dds, sizeof (dmu_objset_stats_t)); - dds->dds_dir_obj = dd->dd_object; dds->dds_available = 
dsl_dir_space_available(dd, NULL, 0, TRUE); mutex_enter(&dd->dd_lock); @@ -543,22 +564,17 @@ dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds) dds->dds_creation_time = dd->dd_phys->dd_creation_time; - dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0); - if (dd->dd_phys->dd_clone_parent_obj) { dsl_dataset_t *ds; rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - ds = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_clone_parent_obj, + NULL, DS_MODE_NONE, FTAG, &ds)); dsl_dataset_name(ds, dds->dds_clone_of); - dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj; dsl_dataset_close(ds, DS_MODE_NONE, FTAG); rw_exit(&dd->dd_pool->dp_config_rwlock); } - - spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot, - sizeof (dds->dds_altroot)); } int @@ -668,7 +684,7 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ - dmu_buf_remove_ref(dd->dd_dbuf, dd); + dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t @@ -679,7 +695,7 @@ dsl_dir_estimated_space(dsl_dir_t *dd) ASSERT(MUTEX_HELD(&dd->dd_lock)); - space = dd->dd_used_bytes; + space = dd->dd_phys->dd_used_bytes; ASSERT(space >= 0); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; @@ -788,6 +804,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, struct tempreserve *tr; ASSERT3U(txg, !=, 0); + ASSERT3S(asize, >=, 0); mutex_enter(&dd->dd_lock); /* @@ -827,10 +844,14 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual - * on-disk is over quota. + * on-disk is over quota and there are no pending changes (which + * may free up space for us). */ if (asize > 0 && est_used > quota) { - if (dd->dd_used_bytes < quota) + if (dd->dd_space_towrite[txg & TXG_MASK] != 0 || + dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 || + dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 || + dd->dd_used_bytes < quota) edquot = ERESTART; dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " "quota=%lluK tr=%lluK err=%d\n", @@ -876,6 +897,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); + ASSERT3S(asize, >=, 0); + ASSERT3S(fsize, >=, 0); err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, tr_list, tx); @@ -975,8 +998,6 @@ dsl_dir_diduse_space(dsl_dir_t *dd, ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); dd->dd_used_bytes += used; - if (used > 0) - dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; mutex_exit(&dd->dd_lock); @@ -1013,9 +1034,9 @@ dsl_dir_set_quota(const char *ddname, uint64_t quota) dsl_dir_t *dd; int err; - dd = dsl_dir_open(ddname, FTAG, NULL); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); /* * If someone removes a file, then tries to set the quota, we * want to make sure the file freeing takes effect.
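The widened ERESTART condition above matters to callers: pending writes in the current or two previous txgs may carry frees that make the reservation fit, so the writer should back off and retry rather than fail outright. The retry loop in the dmu_tx_assign() hunk near the top of this section already has that shape; a sketch (the txg_wait_open() backoff step is an assumption, it does not appear in this diff):

	while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
		uint64_t txg = dmu_tx_unassign(tx, last_dth);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);
		/* assumed: wait for the pending txg so frees land */
		txg_wait_open(tx->tx_pool, txg + 1);
	}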
@@ -1073,9 +1094,9 @@ dsl_dir_set_reservation(const char *ddname, uint64_t reservation) dsl_dir_t *dd; int err; - dd = dsl_dir_open(ddname, FTAG, NULL); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); err = dsl_dir_sync_task(dd, dsl_dir_set_reservation_sync, &reservation, 0); dsl_dir_close(dd, FTAG); @@ -1128,11 +1149,10 @@ dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) return (ENXIO); } - newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail); - /* new parent should exist */ - if (newpds == NULL) - return (ENOENT); + err = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &newpds, &tail); + if (err) + return (err); /* new name should not already exist */ if (tail == NULL) { @@ -1195,8 +1215,8 @@ dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) (void) strcpy(dd->dd_myname, tail); dsl_dir_close(dd->dd_parent, dd); dd->dd_phys->dd_parent_obj = newpds->dd_object; - dd->dd_parent = dsl_dir_open_obj(dd->dd_pool, - newpds->dd_object, NULL, dd); + VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, + newpds->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj, diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 5b71ccfaa9..b8e54be6f6 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -39,8 +38,8 @@ /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" -static dsl_dir_t * -dsl_pool_open_mos_dir(dsl_pool_t *dp) +static int +dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp) { uint64_t obj; int err; @@ -48,9 +47,10 @@ dsl_pool_open_mos_dir(dsl_pool_t *dp) err = zap_lookup(dp->dp_meta_objset, dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, MOS_DIR_NAME, sizeof (obj), 1, &obj); - ASSERT3U(err, ==, 0); + if (err) + return (err); - return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp)); + return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp)); } static dsl_pool_t * @@ -74,38 +74,56 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) return (dp); } -dsl_pool_t * -dsl_pool_open(spa_t *spa, uint64_t txg) +int +dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - - dp->dp_meta_objset = - &dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp)->os; + objset_impl_t *osi; rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); + if (err) + goto out; + dp->dp_meta_objset = &osi->os; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); - ASSERT3U(err, ==, 0); + if (err) + goto out; + + err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir); + if (err) + goto out; - dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, - NULL, dp); - dp->dp_mos_dir = dsl_pool_open_mos_dir(dp); + err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir); + if (err) + goto out; + +out: rw_exit(&dp->dp_config_rwlock); + if (err) + dsl_pool_close(dp); + else + *dpp = dp; - return (dp); + return (err); } void dsl_pool_close(dsl_pool_t *dp) { /* drop our reference from dsl_pool_open() */ - dsl_dir_close(dp->dp_mos_dir, dp); - dsl_dir_close(dp->dp_root_dir, dp); + if (dp->dp_mos_dir) + dsl_dir_close(dp->dp_mos_dir, dp); + if (dp->dp_root_dir) + dsl_dir_close(dp->dp_root_dir, dp); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ - dmu_objset_evict(NULL, dp->dp_meta_objset->os); + if (dp->dp_meta_objset) + dmu_objset_evict(NULL, dp->dp_meta_objset->os); txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_dirs); @@ -132,14 +150,13 @@ dsl_pool_create(spa_t *spa, uint64_t txg) /* create and open the root dir */ dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx); - dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, - NULL, dp); + VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ - err = dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, - tx); + VERIFY(0 == dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx)); - ASSERT3U(err, ==, 0); - dp->dp_mos_dir = dsl_pool_open_mos_dir(dp); + VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir)); dmu_tx_commit(tx); diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c index 3feb93e468..fc33b1c591 100644 --- a/usr/src/uts/common/fs/zfs/dsl_prop.c +++ b/usr/src/uts/common/fs/zfs/dsl_prop.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -75,7 +74,10 @@ dsl_prop_get_impl(dsl_pool_t *dp, uint64_t ddobj, const char *propname, ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); while (ddobj != 0) { - dsl_dir_t *dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG); + dsl_dir_t *dd; + err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + if (err) + break; err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, intsz, numint, buf); if (err != ENOENT) { @@ -136,7 +138,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, cbr->cbr_func(cbr->cbr_arg, value); - (void) dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, cbr); + VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object, + NULL, cbr, &dd)); rw_exit(&dd->dd_pool->dp_config_rwlock); /* Leave dataset open until this callback is unregistered */ return (0); @@ -164,9 +167,9 @@ dsl_prop_get(const char *ddname, const char *propname, const char *tail; int err; - dd = dsl_dir_open(ddname, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(ddname, FTAG, &dd, &tail); + if (err) + return (err); if (tail && tail[0] != '@') { dsl_dir_close(dd, FTAG); return (ENOENT); @@ -258,7 +261,9 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, int err; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG); + err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + if (err) + return; if (!first) { /* @@ -353,15 +358,15 @@ dsl_prop_set(const char *ddname, const char *propname, int err; struct prop_set_arg psa; - dd = dsl_dir_open(ddname, FTAG, NULL); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); psa.name = propname; psa.intsz = intsz; psa.numints = numints; psa.buf = buf; - err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 0); + err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 1<<20); dsl_dir_close(dd, FTAG); @@ -457,10 +462,12 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) if (dd->dd_phys->dd_parent_obj == 0) parent = NULL; else - parent = dsl_dir_open_obj(dp, - dd->dd_phys->dd_parent_obj, NULL, FTAG); + err = dsl_dir_open_obj(dp, + dd->dd_phys->dd_parent_obj, NULL, FTAG, &parent); if (dd != ds->ds_dir) dsl_dir_close(dd, FTAG); + if (err) + break; dd = parent; } rw_exit(&dp->dp_config_rwlock); diff --git a/usr/src/uts/common/fs/zfs/fletcher.c b/usr/src/uts/common/fs/zfs/fletcher.c index 03186d1387..edda3c9a9d 100644 --- a/usr/src/uts/common/fs/zfs/fletcher.c +++ b/usr/src/uts/common/fs/zfs/fletcher.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -98,3 +97,49 @@ fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) ZIO_SET_CHECKSUM(zcp, a, b, c, d); } + +void +fletcher_4_incremental_native(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 9d682e4990..d31e6edda3 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -379,11 +378,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) os, tx); } - db = dmu_bonus_hold(os, smo->smo_object); + VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(smo, db->db_data, db->db_size); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 9b9bcab217..02be864b36 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
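The incremental fletcher-4 routines added to fletcher.c above exist so a backup stream can be checksummed piecewise as it is produced or consumed (bug 6341639 in the summary): feeding consecutive chunks through the incremental function gives the same result as a one-shot fletcher_4_native() over the concatenation, provided each chunk is a multiple of four bytes, since the loop consumes whole 32-bit words. A hypothetical userland consumer:

	zio_cksum_t zc;
	char buf[4096];
	ssize_t n;

	ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);	/* running state */
	while ((n = read(fd, buf, sizeof (buf))) > 0)
		fletcher_4_incremental_native(buf, n, &zc);
	/* zc now equals the one-shot checksum of the whole stream */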
@@ -33,6 +32,7 @@ */ #include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> #include <sys/spa_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> @@ -62,6 +62,44 @@ static uint32_t spa_active_count; * ========================================================================== */ +static int +spa_error_entry_compare(const void *a, const void *b) +{ + spa_error_entry_t *sa = (spa_error_entry_t *)a; + spa_error_entry_t *sb = (spa_error_entry_t *)b; + int ret; + + ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, + sizeof (zbookmark_t)); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +/* + * Utility function which retrieves copies of the current logs and + * re-initializes them in the process. + */ +void +spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) +{ + ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); + + bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); + bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + /* * Activate an uninitialized pool. */ @@ -76,9 +114,6 @@ spa_activate(spa_t *spa) spa->spa_normal_class = metaslab_class_create(); - spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry", - 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); - for (t = 0; t < ZIO_TYPES; t++) { spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 8, maxclsyspri, 50, INT_MAX, @@ -95,6 +130,13 @@ spa_activate(spa_t *spa) txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); } /* @@ -124,12 +166,18 @@ spa_deactivate(spa_t *spa) spa->spa_zio_intr_taskq[t] = NULL; } - taskq_destroy(spa->spa_vdev_retry_taskq); - spa->spa_vdev_retry_taskq = NULL; - metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; + /* + * If this was part of an import or the open otherwise failed, we may + * still have errors left in the queues. Empty them just in case. + */ + spa_errlog_drain(spa); + + avl_destroy(&spa->spa_errlist_scrub); + avl_destroy(&spa->spa_errlist_last); + spa->spa_state = POOL_STATE_UNINITIALIZED; } @@ -175,6 +223,11 @@ static void spa_unload(spa_t *spa) { /* + * Stop async tasks. + */ + spa_async_suspend(spa); + + /* * Stop syncing. */ if (spa->spa_sync_on) { @@ -185,8 +238,8 @@ spa_unload(spa_t *spa) /* * Wait for any outstanding prefetch I/O to complete. */ - spa_config_enter(spa, RW_WRITER); - spa_config_exit(spa); + spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_exit(spa, FTAG); /* * Close the dsl pool. @@ -203,16 +256,16 @@ spa_unload(spa_t *spa) vdev_free(spa->spa_root_vdev); spa->spa_root_vdev = NULL; } + + spa->spa_async_suspended = 0; } /* * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. The 'readonly' flag will prevent us - * from writing any updated state to disk, and can be use when testing a pool - * for import. + * source of configuration information.
*/ static int -spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) +spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) { int error = 0; nvlist_t *nvroot = NULL; @@ -221,25 +274,34 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) uint64_t pool_guid; zio_t *zio; + spa->spa_load_state = state; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) - return (EINVAL); + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { + error = EINVAL; + goto out; + } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); - if (import && spa_guid_exists(pool_guid, 0)) - return (EEXIST); + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + goto out; + } /* * Parse the configuration into a vdev tree. */ - spa_config_enter(spa, RW_WRITER); + spa_config_enter(spa, RW_WRITER, FTAG); rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); - if (rvd == NULL) - return (EINVAL); + if (rvd == NULL) { + error = EINVAL; + goto out; + } spa->spa_root_vdev = rvd; ASSERT(spa_guid(spa) == pool_guid); @@ -247,8 +309,10 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) /* * Try to open all vdevs, loading each label in the process. */ - if (vdev_open(rvd) != 0) - return (ENXIO); + if (vdev_open(rvd) != 0) { + error = ENXIO; + goto out; + } /* * Find the best uberblock. @@ -264,8 +328,16 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { - dprintf("ub_txg is zero\n"); - return (ENXIO); + error = ENXIO; + goto out; + } + + /* + * If the pool is newer than the code, we can't open it. + */ + if (ub->ub_version > UBERBLOCK_VERSION) { + error = ENOTSUP; + goto out; } /* @@ -273,11 +345,10 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) * incomplete configuration. 
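 * (vdev_guid_sum is the sum of the guids of all vdevs in the tree, and
 * ub_guid_sum is the sum recorded in the uberblock when the config was
 * last written; a mismatch means a device is missing or foreign.)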
*/ if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { - rvd->vdev_state = VDEV_STATE_CANT_OPEN; - rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM; - dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n", - rvd->vdev_guid_sum, ub->ub_guid_sum); - return (ENXIO); + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_GUID_SUM); + error = ENXIO; + goto out; } /* @@ -286,12 +357,22 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_first_txg = spa_last_synced_txg(spa) + 1; - spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg); + error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + if (error) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + goto out; + } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - VERIFY(zap_lookup(spa->spa_meta_objset, + if (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object) == 0); + sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } if (!mosconfig) { dmu_buf_t *db; @@ -299,21 +380,24 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) size_t nvsize = 0; nvlist_t *newconfig = NULL; - db = dmu_bonus_hold(spa->spa_meta_objset, - spa->spa_config_object); - dmu_buf_read(db); + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db)); nvsize = *(uint64_t *)db->db_data; - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read_canfail(spa->spa_meta_objset, + error = dmu_read(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, packed); if (error == 0) error = nvlist_unpack(packed, nvsize, &newconfig, 0); kmem_free(packed, nvsize); - if (error) - return (ENXIO); + if (error) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } spa_config_set(spa, newconfig); @@ -321,39 +405,76 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) spa_deactivate(spa); spa_activate(spa); - return (spa_load(spa, newconfig, readonly, import, B_TRUE)); + return (spa_load(spa, newconfig, state, B_TRUE)); } - VERIFY(zap_lookup(spa->spa_meta_objset, + if (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0); + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } /* - * Load the vdev state for all top level vdevs. + * Load the persistent error log. If we have an older pool, this will + * not be present. 
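+ * A return of ENOENT from the zap_lookup() calls below simply means the
+ * log object has not been created yet, so only other errors are treated
+ * as evidence of a damaged MOS.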
 */
-	if ((error = vdev_load(rvd, import)) != 0)
-		return (error);
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	/*
+	 * Load the vdev state for all top level vdevs. We need to grab the
+	 * config lock because all label I/O is done with the
+	 * ZIO_FLAG_CONFIG_HELD flag.
+	 */
+	spa_config_enter(spa, RW_READER, FTAG);
+	if ((error = vdev_load(rvd)) != 0) {
+		spa_config_exit(spa, FTAG);
+		goto out;
+	}
+	spa_config_exit(spa, FTAG);

 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
 	 */
-	spa_config_enter(spa, RW_WRITER);
+	spa_config_enter(spa, RW_WRITER, FTAG);
 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
-	spa_config_exit(spa);
+	spa_config_exit(spa, FTAG);

 	/*
 	 * Check the state of the root vdev. If it can't be opened, it
 	 * indicates one or more toplevel vdevs are faulted.
 	 */
-	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
-		return (ENXIO);
+	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+		error = ENXIO;
+		goto out;
+	}

 	/*
 	 * Claim log blocks that haven't been committed yet, and update all
 	 * top-level vdevs to sync any config changes found in vdev_load().
 	 * This must all happen in a single txg.
 	 */
-	if ((spa_mode & FWRITE) && !readonly) {
+	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
 		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
@@ -369,7 +490,14 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 	}

-	return (0);
+	error = 0;
+out:
+	if (error)
+		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+	spa->spa_load_state = SPA_LOAD_NONE;
+	spa->spa_ena = 0;
+
+	return (error);
 }

 /*
@@ -415,7 +543,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
 		spa_activate(spa);

 		error = spa_load(spa, spa->spa_config,
-		    B_FALSE, B_FALSE, B_FALSE);
+		    SPA_LOAD_OPEN, B_FALSE);

 		if (error == EBADF) {
 			/*
@@ -432,7 +560,9 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (ENOENT);
-		} if (error) {
+		}
+
+		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
@@ -443,10 +573,14 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
 			    B_TRUE);
 			spa_unload(spa);
 			spa_deactivate(spa);
+			spa->spa_last_open_failed = B_TRUE;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
+		} else {
+			zfs_post_ok(spa, NULL);
+			spa->spa_last_open_failed = B_FALSE;
 		}

 		loaded = B_TRUE;
@@ -459,9 +593,9 @@
 	*spapp = spa;

 	if (config != NULL) {
-		spa_config_enter(spa, RW_READER);
+		spa_config_enter(spa, RW_READER, FTAG);
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-		spa_config_exit(spa);
+		spa_config_exit(spa, FTAG);
 	}

 	/*
@@ -479,8 +613,36 @@ spa_open(const char *name, spa_t **spapp, void *tag)
 	return
(spa_open_common(name, spapp, tag, NULL)); } +/* + * Lookup the given spa_t, incrementing the inject count in the process, + * preventing it from being exported or destroyed. + */ +spa_t * +spa_inject_addref(char *name) +{ + spa_t *spa; + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (NULL); + } + spa->spa_inject_ref++; + mutex_exit(&spa_namespace_lock); + + return (spa); +} + +void +spa_inject_delref(spa_t *spa) +{ + mutex_enter(&spa_namespace_lock); + spa->spa_inject_ref--; + mutex_exit(&spa_namespace_lock); +} + int -spa_get_stats(const char *name, nvlist_t **config) +spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; @@ -488,6 +650,29 @@ spa_get_stats(const char *name, nvlist_t **config) *config = NULL; error = spa_open_common(name, &spa, FTAG, config); + if (spa && *config != NULL) + VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, + spa_get_errlog_size(spa)) == 0); + + /* + * We want to get the alternate root even for faulted pools, so we cheat + * and call spa_lookup() directly. + */ + if (altroot) { + if (spa == NULL) { + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(name); + if (spa) + spa_altroot(spa, altroot, buflen); + else + altroot[0] = '\0'; + spa = NULL; + mutex_exit(&spa_namespace_lock); + } else { + spa_altroot(spa, altroot, buflen); + } + } + if (spa != NULL) spa_close(spa, FTAG); @@ -551,9 +736,11 @@ spa_create(const char *pool, nvlist_t *nvroot, char *altroot) DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); - VERIFY(zap_add(spa->spa_meta_objset, + if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0); + sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool config"); + } /* * Create the deferred-free bplist object. Turn off compression @@ -565,9 +752,11 @@ spa_create(const char *pool, nvlist_t *nvroot, char *altroot) dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, ZIO_COMPRESS_OFF, tx); - VERIFY(zap_add(spa->spa_meta_objset, + if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0); + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bplist"); + } dmu_tx_commit(tx); @@ -619,7 +808,7 @@ spa_import(const char *pool, nvlist_t *config, char *altroot) * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig * so that we don't try to open the pool if the config is damaged. */ - error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE); + error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); if (error) { spa_unload(spa); @@ -694,7 +883,7 @@ spa_tryimport(nvlist_t *tryconfig) * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig * so we don't try to open the pool if the config is damaged. */ - (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE); + (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -738,6 +927,16 @@ spa_export_common(char *pool, int new_state) } /* + * Put a hold on the pool, drop the namespace lock, stop async tasks, + * reacquire the namespace lock, and see if we can export. 
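+ * The temporary hold keeps the spa_t from disappearing while the
+ * namespace lock is dropped. We cannot hold the lock across
+ * spa_async_suspend(), because the async thread may itself need the
+ * namespace lock to detach a completed replacement.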
+ */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + /* * The pool will be in core if it's openable, * in which case we can modify its state. */ @@ -749,17 +948,20 @@ spa_export_common(char *pool, int new_state) spa_scrub_suspend(spa); txg_wait_synced(spa->spa_dsl_pool, 0); - if (!spa_refcount_zero(spa)) { + /* + * A pool cannot be exported or destroyed if there are active + * references. If we are resetting a pool, allow references by + * fault injection handlers. + */ + if (!spa_refcount_zero(spa) || + (spa->spa_inject_ref != 0 && + new_state != POOL_STATE_UNINITIALIZED)) { spa_scrub_resume(spa); + spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (EBUSY); } - /* - * Update the pool state. - */ - spa->spa_state = new_state; - spa_scrub_resume(spa); VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); @@ -771,7 +973,10 @@ spa_export_common(char *pool, int new_state) * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ - vdev_config_dirty(spa->spa_root_vdev); + if (new_state != POOL_STATE_UNINITIALIZED) { + spa->spa_state = new_state; + vdev_config_dirty(spa->spa_root_vdev); + } } if (spa->spa_state != POOL_STATE_UNINITIALIZED) { @@ -779,8 +984,10 @@ spa_export_common(char *pool, int new_state) spa_deactivate(spa); } - spa_remove(spa); - spa_config_sync(); + if (new_state != POOL_STATE_UNINITIALIZED) { + spa_remove(spa); + spa_config_sync(); + } mutex_exit(&spa_namespace_lock); return (0); @@ -805,6 +1012,17 @@ spa_export(char *pool) } /* + * Similar to spa_export(), this unloads the spa_t without actually removing it + * from the namespace in any way. + */ +int +spa_reset(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); +} + + +/* * ========================================================================== * Device manipulation * ========================================================================== @@ -845,7 +1063,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); } - vdev_init(tvd, txg); + if ((error = vdev_init(tvd, txg)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); vdev_config_dirty(tvd); } @@ -871,7 +1090,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * is automatically detached. */ int -spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, open_txg; int error; @@ -881,7 +1100,7 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) txg = spa_vdev_enter(spa); - oldvd = vdev_lookup_by_path(rvd, path); + oldvd = vdev_lookup_by_guid(rvd, guid); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -954,6 +1173,12 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) newvd->vdev_id = pvd->vdev_children; vdev_add_child(pvd, newvd); + /* + * If newvd is smaller than oldvd, but larger than its rsize, + * the addition of newvd may have decreased our parent's asize. + */ + pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); + tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); @@ -962,7 +1187,6 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) * Update the config based on the new in-core state. 
*/ spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); - vdev_config_dirty(tvd); /* @@ -976,14 +1200,14 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) open_txg - TXG_INITIAL + 1); mutex_exit(&newvd->vdev_dtl_lock); + dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); + /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, txg); (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); - dprintf("attached %s, replacing=%d\n", path, replacing); - (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); /* @@ -1000,7 +1224,7 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) * is a replacing vdev. */ int -spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) +spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) { uint64_t txg; int c, t, error; @@ -1009,14 +1233,11 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) txg = spa_vdev_enter(spa); - vd = vdev_lookup_by_path(rvd, path); + vd = vdev_lookup_by_guid(rvd, guid); if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (guid != 0 && vd->vdev_guid != guid) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - pvd = vd->vdev_parent; /* @@ -1105,13 +1326,16 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) /* * Reopen this top-level vdev to reassess health after detach. */ - vdev_reopen(tvd, NULL); + vdev_reopen(tvd); /* * If the device we just detached was smaller than the others, - * it may be possible to add metaslabs (i.e. grow the pool). + * it may be possible to add metaslabs (i.e. grow the pool). We ignore + * the error here because the detach still succeeded - we just weren't + * able to reinitialize the metaslabs. This pool is in for a world of + * hurt, in any case. */ - vdev_metaslab_init(tvd, txg); + (void) vdev_metaslab_init(tvd, txg); /* * Update the config based on the new in-core state. @@ -1133,72 +1357,59 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); - dprintf("detached %s\n", path); + dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); return (spa_vdev_exit(spa, vd, txg, 0)); } /* - * If there are any replacing vdevs that have finished replacing, detach them. - * We can't hold the config lock across detaches, so we lock the config, - * build a list of candidates, unlock the config, and try each candidate. + * Find any device that's done replacing, so we can detach it. 
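+ * A replacement is complete once the new child's DTL and scrub DTL are
+ * both empty, i.e. no txg range remains to be resilvered.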
*/ -typedef struct vdev_detach_link { - char *vdl_path; - uint64_t vdl_guid; - list_node_t vdl_node; -} vdev_detach_link_t; - -static void -spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd) +static vdev_t * +spa_vdev_replace_done_hunt(vdev_t *vd) { + vdev_t *newvd, *oldvd; int c; - for (c = 0; c < vd->vdev_children; c++) - spa_vdev_replace_done_make_list(l, vd->vdev_child[c]); + for (c = 0; c < vd->vdev_children; c++) { + oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); + if (oldvd != NULL) + return (oldvd); + } if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { - vdev_t *cvd0 = vd->vdev_child[0]; - vdev_t *cvd1 = vd->vdev_child[1]; - vdev_detach_link_t *vdl; - int dirty1; - - mutex_enter(&cvd1->vdev_dtl_lock); - dirty1 = cvd1->vdev_dtl_map.sm_space | - cvd1->vdev_dtl_scrub.sm_space; - mutex_exit(&cvd1->vdev_dtl_lock); - - if (!dirty1) { - vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP); - vdl->vdl_path = spa_strdup(cvd0->vdev_path); - vdl->vdl_guid = cvd0->vdev_guid; - list_insert_tail(l, vdl); + oldvd = vd->vdev_child[0]; + newvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); } + mutex_exit(&newvd->vdev_dtl_lock); } + + return (NULL); } -void +static void spa_vdev_replace_done(spa_t *spa) { - vdev_detach_link_t *vdl; - list_t vdlist; - - list_create(&vdlist, sizeof (vdev_detach_link_t), - offsetof(vdev_detach_link_t, vdl_node)); - - spa_config_enter(spa, RW_READER); - spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev); - spa_config_exit(spa); - - while ((vdl = list_head(&vdlist)) != NULL) { - list_remove(&vdlist, vdl); - (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid, - B_TRUE); - spa_strfree(vdl->vdl_path); - kmem_free(vdl, sizeof (*vdl)); + vdev_t *vd; + uint64_t guid; + + spa_config_enter(spa, RW_READER, FTAG); + + while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { + guid = vd->vdev_guid; + spa_config_exit(spa, FTAG); + if (spa_vdev_detach(spa, guid, B_TRUE) != 0) + return; + spa_config_enter(spa, RW_READER, FTAG); } - list_destroy(&vdlist); + spa_config_exit(spa, FTAG); } /* @@ -1234,7 +1445,16 @@ spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) * ========================================================================== */ -static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t); +void +spa_scrub_throttle(spa_t *spa, int direction) +{ + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_throttled += direction; + ASSERT(spa->spa_scrub_throttled >= 0); + if (spa->spa_scrub_throttled == 0) + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); +} static void spa_scrub_io_done(zio_t *zio) @@ -1244,22 +1464,23 @@ spa_scrub_io_done(zio_t *zio) zio_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); - if (zio->io_error) - spa->spa_scrub_errors++; - if (--spa->spa_scrub_inflight == 0) - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); - - if (zio->io_error) { + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { vdev_t *vd = zio->io_vd; + spa->spa_scrub_errors++; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_scrub_errors++; mutex_exit(&vd->vdev_stat_lock); } + if (--spa->spa_scrub_inflight == 0) { + cv_broadcast(&spa->spa_scrub_io_cv); + ASSERT(spa->spa_scrub_throttled == 0); + } + mutex_exit(&spa->spa_scrub_lock); } static void -spa_scrub_io_start(spa_t *spa, blkptr_t 
*bp, int priority, int flags) +spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, + zbookmark_t *zb) { size_t size = BP_GET_LSIZE(bp); void *data = zio_buf_alloc(size); @@ -1268,8 +1489,13 @@ spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags) spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); + if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ + + flags |= ZIO_FLAG_CANFAIL; + zio_nowait(zio_read(NULL, spa, bp, data, size, - spa_scrub_io_done, NULL, priority, flags)); + spa_scrub_io_done, NULL, priority, flags, zb)); } /* ARGSUSED */ @@ -1319,12 +1545,11 @@ spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) } if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | - ZIO_FLAG_RESILVER); + ZIO_FLAG_RESILVER, &bc->bc_bookmark); } } else { spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB); + ZIO_FLAG_SCRUB, &bc->bc_bookmark); } return (0); @@ -1348,19 +1573,25 @@ spa_scrub_thread(spa_t *spa) */ txg_wait_synced(spa_get_dsl(spa), 0); - spa_config_enter(spa, RW_WRITER); - vdev_reopen(rvd, NULL); /* purge all vdev caches */ + dprintf("start %s mintxg=%llu maxtxg=%llu\n", + scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", + spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); + + spa_config_enter(spa, RW_WRITER, FTAG); + vdev_reopen(rvd); /* purge all vdev caches */ vdev_config_dirty(rvd); /* rewrite all disk labels */ vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_errors = 0; spa->spa_scrub_active = 1; + ASSERT(spa->spa_scrub_inflight == 0); + ASSERT(spa->spa_scrub_throttled == 0); while (!spa->spa_scrub_stop) { CALLB_CPR_SAFE_BEGIN(&cprinfo); - while (spa->spa_scrub_suspend) { + while (spa->spa_scrub_suspended) { spa->spa_scrub_active = 0; cv_broadcast(&spa->spa_scrub_cv); cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); @@ -1376,6 +1607,9 @@ spa_scrub_thread(spa_t *spa) mutex_enter(&spa->spa_scrub_lock); if (error != EAGAIN) break; + + while (spa->spa_scrub_throttled > 0) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } while (spa->spa_scrub_inflight) @@ -1384,16 +1618,25 @@ spa_scrub_thread(spa_t *spa) if (spa->spa_scrub_restart_txg != 0) error = ERESTART; + if (spa->spa_scrub_stop) + error = EINTR; + spa->spa_scrub_active = 0; cv_broadcast(&spa->spa_scrub_cv); /* - * If the traverse completed, and there were no errors, - * then the scrub was completely successful. + * Even if there were uncorrectable errors, we consider the scrub + * completed. The downside is that if there is a transient error during + * a resilver, we won't resilver the data properly to the target. But + * if the damage is permanent (more likely) we will resilver forever, + * which isn't really acceptable. Since there is enough information for + * the user to know what has failed and why, this seems like a more + * tractable approach. */ - complete = (error == 0 && spa->spa_scrub_errors == 0); + complete = (error == 0); - dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", + dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", + scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", spa->spa_scrub_maxtxg, complete ? 
"done" : "FAILED", error, spa->spa_scrub_errors, spa->spa_scrub_stop); @@ -1403,31 +1646,32 @@ spa_scrub_thread(spa_t *spa) * If the scrub/resilver completed, update all DTLs to reflect this. * Whether it succeeded or not, vacate all temporary scrub DTLs. */ - spa_config_enter(spa, RW_WRITER); + spa_config_enter(spa, RW_WRITER, FTAG); vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); - spa_config_exit(spa); - - spa_vdev_replace_done(spa); - - spa_config_enter(spa, RW_READER); vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); - spa_config_exit(spa); + spa_errlog_rotate(spa); + spa_config_exit(spa, FTAG); mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_type = POOL_SCRUB_NONE; - spa->spa_scrub_active = 0; - spa->spa_scrub_thread = NULL; - - cv_broadcast(&spa->spa_scrub_cv); + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); /* * If we were told to restart, our final act is to start a new scrub. */ if (error == ERESTART) - VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0); + spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? + SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); + spa->spa_scrub_type = POOL_SCRUB_NONE; + spa->spa_scrub_active = 0; + spa->spa_scrub_thread = NULL; + cv_broadcast(&spa->spa_scrub_cv); CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ thread_exit(); } @@ -1436,7 +1680,7 @@ void spa_scrub_suspend(spa_t *spa) { mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_suspend++; + spa->spa_scrub_suspended++; while (spa->spa_scrub_active) { cv_broadcast(&spa->spa_scrub_cv); cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); @@ -1450,8 +1694,8 @@ void spa_scrub_resume(spa_t *spa) { mutex_enter(&spa->spa_scrub_lock); - ASSERT(spa->spa_scrub_suspend != 0); - if (--spa->spa_scrub_suspend == 0) + ASSERT(spa->spa_scrub_suspended != 0); + if (--spa->spa_scrub_suspended == 0) cv_broadcast(&spa->spa_scrub_cv); mutex_exit(&spa->spa_scrub_lock); } @@ -1469,17 +1713,19 @@ spa_scrub_restart(spa_t *spa, uint64_t txg) mutex_exit(&spa->spa_scrub_lock); } -static int -spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) +int +spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) { space_seg_t *ss; uint64_t mintxg, maxtxg; vdev_t *rvd = spa->spa_root_vdev; - int advance = 0; + int advance = ADVANCE_PRE | ADVANCE_ZIL; if ((uint_t)type >= POOL_SCRUB_TYPES) return (ENOTSUP); + mutex_enter(&spa->spa_scrub_lock); + /* * If there's a scrub or resilver already in progress, stop it. */ @@ -1487,9 +1733,10 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) /* * Don't stop a resilver unless forced. 
*/ - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) + if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { + mutex_exit(&spa->spa_scrub_lock); return (EBUSY); - + } spa->spa_scrub_stop = 1; cv_broadcast(&spa->spa_scrub_cv); cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); @@ -1503,19 +1750,36 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) spa->spa_scrub_th = NULL; } - spa->spa_scrub_stop = 0; - spa->spa_scrub_type = type; - spa->spa_scrub_restart_txg = 0; + if (rvd == NULL) { + ASSERT(spa->spa_scrub_stop == 0); + ASSERT(spa->spa_scrub_type == type); + ASSERT(spa->spa_scrub_restart_txg == 0); + mutex_exit(&spa->spa_scrub_lock); + return (0); + } mintxg = TXG_INITIAL - 1; maxtxg = spa_last_synced_txg(spa) + 1; - switch (type) { + mutex_enter(&rvd->vdev_dtl_lock); - case POOL_SCRUB_NONE: - break; + if (rvd->vdev_dtl_map.sm_space == 0) { + /* + * The pool-wide DTL is empty. + * If this is a resilver, there's nothing to do. + */ + if (type == POOL_SCRUB_RESILVER) + type = POOL_SCRUB_NONE; + } else { + /* + * The pool-wide DTL is non-empty. + * If this is a normal scrub, upgrade to a resilver instead. + */ + if (type == POOL_SCRUB_EVERYTHING) + type = POOL_SCRUB_RESILVER; + } - case POOL_SCRUB_RESILVER: + if (type == POOL_SCRUB_RESILVER) { /* * Determine the resilvering boundaries. * @@ -1525,26 +1789,22 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 * so we don't claim to resilver a txg that's still changing. */ - mutex_enter(&rvd->vdev_dtl_lock); ss = avl_first(&rvd->vdev_dtl_map.sm_root); - mintxg = ss ? ss->ss_start - 1 : 0; + mintxg = ss->ss_start - 1; ss = avl_last(&rvd->vdev_dtl_map.sm_root); - maxtxg = ss ? ss->ss_end : 0; - maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1); - mutex_exit(&rvd->vdev_dtl_lock); + maxtxg = MIN(ss->ss_end, maxtxg); - advance = ADVANCE_PRE | ADVANCE_PRUNE; - break; - - case POOL_SCRUB_EVERYTHING: - /* - * A scrub is like a resilver, but not pruned by DTL. 
- */ - advance = ADVANCE_PRE; - break; + advance |= ADVANCE_PRUNE; } - if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) { + mutex_exit(&rvd->vdev_dtl_lock); + + spa->spa_scrub_stop = 0; + spa->spa_scrub_type = type; + spa->spa_scrub_restart_txg = 0; + + if (type != POOL_SCRUB_NONE) { + spa->spa_scrub_mintxg = mintxg; spa->spa_scrub_maxtxg = maxtxg; spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, advance, ZIO_FLAG_CANFAIL); @@ -1553,24 +1813,119 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); } + mutex_exit(&spa->spa_scrub_lock); + return (0); } -int -spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) +/* + * ========================================================================== + * SPA async task processing + * ========================================================================== + */ + +static void +spa_async_reopen(spa_t *spa) { - int error; - traverse_handle_t *th; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd; + int c; - mutex_enter(&spa->spa_scrub_lock); - error = spa_scrub_locked(spa, type, force); - th = spa->spa_scrub_th; - mutex_exit(&spa->spa_scrub_lock); + spa_config_enter(spa, RW_WRITER, FTAG); + + for (c = 0; c < rvd->vdev_children; c++) { + tvd = rvd->vdev_child[c]; + if (tvd->vdev_reopen_wanted) { + tvd->vdev_reopen_wanted = 0; + vdev_reopen(tvd); + } + } + + spa_config_exit(spa, FTAG); +} - if (th == NULL && type != POOL_SCRUB_NONE) +static void +spa_async_thread(spa_t *spa) +{ + int tasks; + + ASSERT(spa->spa_sync_on); + + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; + spa->spa_async_tasks = 0; + mutex_exit(&spa->spa_async_lock); + + /* + * See if any devices need to be reopened. + */ + if (tasks & SPA_ASYNC_REOPEN) + spa_async_reopen(spa); + + /* + * If any devices are done replacing, detach them. + */ + if (tasks & SPA_ASYNC_REPLACE_DONE) spa_vdev_replace_done(spa); - return (error); + /* + * Kick off a scrub. + */ + if (tasks & SPA_ASYNC_SCRUB) + VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); + + /* + * Kick off a resilver. + */ + if (tasks & SPA_ASYNC_RESILVER) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + /* + * Let the world know that we're done. 
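+ * (Clearing spa_async_thread under spa_async_lock is the handshake
+ * that spa_async_suspend() waits on.)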
+ */ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_thread = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + +void +spa_async_suspend(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_suspended++; + while (spa->spa_async_thread != NULL) + cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_resume(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + ASSERT(spa->spa_async_suspended != 0); + spa->spa_async_suspended--; + mutex_exit(&spa->spa_async_lock); +} + +static void +spa_async_dispatch(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + if (spa->spa_async_tasks && !spa->spa_async_suspended && + spa->spa_async_thread == NULL) + spa->spa_async_thread = thread_create(NULL, 0, + spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_request(spa_t *spa, int task) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks |= task; + mutex_exit(&spa->spa_async_lock); } /* @@ -1628,17 +1983,19 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0); + VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, + KM_SLEEP) == 0); dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, packed, tx); kmem_free(packed, nvsize); - db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object); + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } /* @@ -1651,7 +2008,6 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *bpl = &spa->spa_sync_bplist; - vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int dirty_vdevs; @@ -1659,12 +2015,12 @@ spa_sync(spa_t *spa, uint64_t txg) /* * Lock out configuration changes. */ - spa_config_enter(spa, RW_READER); + spa_config_enter(spa, RW_READER, FTAG); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; - bplist_open(bpl, mos, spa->spa_sync_bplist_obj); + VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); /* * If anything has changed in this txg, push the deferred frees @@ -1685,6 +2041,8 @@ spa_sync(spa_t *spa, uint64_t txg) spa_sync_config_object(spa, tx); dmu_tx_commit(tx); + spa_errlog_sync(spa, txg); + dsl_pool_sync(dp, txg); dirty_vdevs = 0; @@ -1707,11 +2065,7 @@ spa_sync(spa_t *spa, uint64_t txg) * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. */ - while (spa_sync_labels(spa, txg)) { - dprintf("waiting for devices to heal\n"); - delay(hz); - vdev_reopen(rvd, NULL); - } + VERIFY(0 == spa_sync_labels(spa, txg)); /* * Make a stable copy of the fully synced uberblock. @@ -1748,7 +2102,12 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); ASSERT(bpl->bpl_queue == NULL); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); + + /* + * If any async tasks have been requested, kick them off. + */ + spa_async_dispatch(spa); } /* @@ -1800,13 +2159,13 @@ spa_evict_all(void) mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* - * Stop all scrub and resilver activity. spa_scrub() needs to - * wait for the scrub thread, which may do a detach and sync the - * configs, which needs spa_namespace_lock. 
Drop the lock while - * maintaining a hold on the spa_t. + * Stop async tasks. The async thread may need to detach + * a device that's been replaced, which requires grabbing + * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); @@ -1819,3 +2178,9 @@ spa_evict_all(void) } mutex_exit(&spa_namespace_lock); } + +vdev_t * +spa_lookup_by_guid(spa_t *spa, uint64_t guid) +{ + return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); +} diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index abcd67ddb9..addf3af885 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +32,11 @@ #include <sys/fs/zfs.h> #include <sys/vdev_impl.h> #include <sys/zfs_ioctl.h> +#ifdef _KERNEL +#include <sys/kobj.h> +#endif + +extern int modrootloaded; /* * Pool configuration repository. @@ -65,43 +69,39 @@ const char *spa_config_dir = ZPOOL_CACHE_DIR; void spa_config_load(void) { - vnode_t *vp; void *buf = NULL; - vattr_t vattr; - ssize_t resid; nvlist_t *nvlist, *child; nvpair_t *nvpair; spa_t *spa; char pathname[128]; + struct _buf *file; + struct bootstat bst; /* * Open the configuration file. */ - (void) snprintf(pathname, sizeof (pathname), "./%s/%s", spa_config_dir, - ZPOOL_CACHE_FILE); - if (vn_openat(pathname, UIO_SYSSPACE, FREAD | FOFFMAX, 0, &vp, 0, 0, - rootdir) != 0) + (void) snprintf(pathname, sizeof (pathname), "%s%s/%s", + (modrootloaded) ? "./" : "", spa_config_dir, ZPOOL_CACHE_FILE); + + file = kobj_open_file(pathname); + if (file == (struct _buf *)-1) return; - /* - * Read the nvlist from the file. - */ - if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0) + if (kobj_fstat(file->_fd, &bst) != 0) goto out; - buf = kmem_alloc(vattr.va_size, KM_SLEEP); + buf = kmem_alloc(bst.st_size, KM_SLEEP); - if (vn_rdwr(UIO_READ, vp, buf, vattr.va_size, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid) != 0) - goto out; - - if (resid != 0) + /* + * Read the nvlist from the file. + */ + if (kobj_read_file(file, buf, bst.st_size, 0) < 0) goto out; /* * Unpack the nvlist. 
 */
-	if (nvlist_unpack(buf, vattr.va_size, &nvlist, KM_SLEEP) != 0)
+	if (nvlist_unpack(buf, bst.st_size, &nvlist, KM_SLEEP) != 0)
 		goto out;

 	/*
@@ -133,10 +133,9 @@ spa_config_load(void)

 out:
 	if (buf != NULL)
-		kmem_free(buf, vattr.va_size);
+		kmem_free(buf, bst.st_size);

-	(void) VOP_CLOSE(vp, FREAD | FOFFMAX, 1, 0, kcred);
-	VN_RELE(vp);
+	kobj_close_file(file);
 }

 /*
@@ -157,7 +156,7 @@ spa_config_sync(void)

 	ASSERT(MUTEX_HELD(&spa_namespace_lock));

-	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);

 	/*
 	 * Add all known pools to the configuration list, ignoring those with
@@ -179,7 +178,8 @@ spa_config_sync(void)

 	buf = kmem_alloc(buflen, KM_SLEEP);

-	VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 0) == 0);
+	VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR,
+	    KM_SLEEP) == 0);

 	/*
 	 * Write the configuration to disk. We need to do the traditional
@@ -226,7 +226,7 @@ spa_all_configs(uint64_t *generation)
 	if (*generation == spa_config_generation)
 		return (NULL);

-	VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);

 	spa = NULL;
 	mutex_enter(&spa_namespace_lock);
@@ -279,7 +279,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 	else if (txg != 0 && vd == rvd)
 		spa->spa_config_txg = txg;

-	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);

 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 	    UBERBLOCK_VERSION) == 0);
diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c
new file mode 100644
index 0000000000..b52c3236d2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c
@@ -0,0 +1,436 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the current log becomes the last log, the last log is thrown
+ * out, and the current log is reinitialized. This way, if an error is somehow
+ * corrected, a new scrub will show that it no longer exists, and it will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */

+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+static uint64_t
+strtonum(char *str, char **nptr)
+{
+	uint64_t val = 0;
+	char c;
+	int digit;
+
+	while ((c = *str) != '\0') {
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'a' && c <= 'f')
+			digit = 10 + c - 'a';
+		else
+			break;
+
+		val *= 16;
+		val += digit;
+
+		str++;
+	}
+
+	*nptr = str;
+
+	return (val);
+}
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+{
+	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark
+ */
+static void
+name_to_bookmark(char *buf, zbookmark_t *zb)
+{
+	zb->zb_objset = strtonum(buf, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_object = strtonum(buf + 1, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_level = (int)strtonum(buf + 1, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_blkid = strtonum(buf + 1, &buf);
+	ASSERT(*buf == '\0');
+}
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, zio_t *zio)
+{
+	zbookmark_t *zb = &zio->io_logical->io_bookmark;
+	spa_error_entry_t search;
+	spa_error_entry_t *new;
+	avl_tree_t *tree;
+	avl_index_t where;
+
+	/*
+	 * If we are trying to import a pool, ignore any errors, as we won't be
+	 * writing to the pool any time soon.
+	 */
+	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+		return;
+
+	mutex_enter(&spa->spa_errlist_lock);
+
+	/*
+	 * If we have had a request to rotate the log, log it to the next list
+	 * instead of the current one.
+	 */
+	if (spa->spa_scrub_active || spa->spa_scrub_finished)
+		tree = &spa->spa_errlist_scrub;
+	else
+		tree = &spa->spa_errlist_last;
+
+	search.se_bookmark = *zb;
+	if (avl_find(tree, &search, &where) != NULL) {
+		mutex_exit(&spa->spa_errlist_lock);
+		return;
+	}
+
+	new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+	new->se_bookmark = *zb;
+	avl_insert(tree, new, where);
+
+	mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
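+ * The total can therefore overcount: a bookmark present in both logs is
+ * counted twice here and only collapsed once userland uniquifies the
+ * list.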
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+	uint64_t total = 0, count;
+
+	mutex_enter(&spa->spa_errlog_lock);
+	if (spa->spa_errlog_scrub != 0 &&
+	    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+	    &count) == 0)
+		total += count;
+
+	if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+	    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+	    &count) == 0)
+		total += count;
+	mutex_exit(&spa->spa_errlog_lock);
+
+	mutex_enter(&spa->spa_errlist_lock);
+	total += avl_numnodes(&spa->spa_errlist_last);
+	total += avl_numnodes(&spa->spa_errlist_scrub);
+	mutex_exit(&spa->spa_errlist_lock);
+
+	return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	zbookmark_t zb;
+
+	if (obj == 0)
+		return (0);
+
+	for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		if (*count == 0) {
+			zap_cursor_fini(&zc);
+			return (ENOMEM);
+		}
+
+		name_to_bookmark(za.za_name, &zb);
+
+		if (copyout(&zb, (char *)addr +
+		    (*count - 1) * sizeof (zbookmark_t),
+		    sizeof (zbookmark_t)) != 0)
+			return (EFAULT);
+
+		*count -= 1;
+	}
+
+	zap_cursor_fini(&zc);
+
+	return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+	spa_error_entry_t *se;
+
+	for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+		if (*count == 0)
+			return (ENOMEM);
+
+		if (copyout(&se->se_bookmark, (char *)addr +
+		    (*count - 1) * sizeof (zbookmark_t),
+		    sizeof (zbookmark_t)) != 0)
+			return (EFAULT);
+
+		*count -= 1;
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+	int ret = 0;
+
+#ifdef _KERNEL
+	mutex_enter(&spa->spa_errlog_lock);
+
+	ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+	if (!ret && !spa->spa_scrub_finished)
+		ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+		    count);
+
+	mutex_enter(&spa->spa_errlist_lock);
+	if (!ret)
+		ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+		    count);
+	if (!ret)
+		ret = process_error_list(&spa->spa_errlist_last, uaddr,
+		    count);
+	mutex_exit(&spa->spa_errlist_lock);
+
+	mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+	return (ret);
+}
+
+/*
+ * Called when a scrub completes. This simply sets a bit telling
+ * spa_log_error() which AVL tree to add new errors to. spa_errlog_sync() is
+ * responsible for actually syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+	mutex_enter(&spa->spa_errlist_lock);
+
+	ASSERT(!spa->spa_scrub_finished);
+	spa->spa_scrub_finished = B_TRUE;
+
+	mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
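+ * (Only the in-core AVL lists are emptied; any on-disk log objects are
+ * left alone.)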
+ */ +void +spa_errlog_drain(spa_t *spa) +{ + spa_error_entry_t *se; + void *cookie; + + mutex_enter(&spa->spa_errlist_lock); + + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_last, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Process a list of errors into the current on-disk log. + */ +static void +sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) +{ + spa_error_entry_t *se; + char buf[64]; + void *cookie; + + if (avl_numnodes(t) != 0) { + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, + 0, tx); + + /* add errors to the current log */ + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + + (void) zap_update(spa->spa_meta_objset, + *obj, buf, 1, strlen(name) + 1, name, tx); + } + + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + } +} + +/* + * Sync the error log out to disk. This is a little tricky because the act of + * writing the error log requires the spa_errlist_lock. So, we need to lock the + * error lists, take a copy of the lists, and then reinitialize them. Then, we + * drop the error list lock and take the error log lock, at which point we + * do the errlog processing. Then, if we encounter an I/O error during this + * process, we can successfully add the error to the list. Note that this will + * result in the perpetual recycling of errors, but it is an unlikely situation + * and not a performance critical operation. + */ +void +spa_errlog_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + avl_tree_t scrub, last; + int scrub_finished; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * Bail out early under normal circumstances. + */ + if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && + avl_numnodes(&spa->spa_errlist_last) == 0 && + !spa->spa_scrub_finished) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + spa_get_errlists(spa, &last, &scrub); + scrub_finished = spa->spa_scrub_finished; + spa->spa_scrub_finished = B_FALSE; + + mutex_exit(&spa->spa_errlist_lock); + mutex_enter(&spa->spa_errlog_lock); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + /* + * Sync out the current list of errors. + */ + sync_error_list(spa, &last, &spa->spa_errlog_last, tx); + + /* + * Rotate the log if necessary. + */ + if (scrub_finished) { + if (spa->spa_errlog_last != 0) + VERIFY(dmu_object_free(spa->spa_meta_objset, + spa->spa_errlog_last, tx) == 0); + spa->spa_errlog_last = spa->spa_errlog_scrub; + spa->spa_errlog_scrub = 0; + + sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); + } + + /* + * Sync out any pending scrub errors. + */ + sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); + + /* + * Update the MOS to reflect the new values. 
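+ * (The object numbers live in the MOS directory under the
+ * DMU_POOL_ERRLOG_LAST and DMU_POOL_ERRLOG_SCRUB keys; a value of zero
+ * means the corresponding log does not exist yet.)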
+ */ + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, + &spa->spa_errlog_last, tx); + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, + &spa->spa_errlog_scrub, tx); + + dmu_tx_commit(tx); + + mutex_exit(&spa->spa_errlog_lock); +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 1ea7edfb77..8e0f6ce722 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +59,7 @@ * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t + * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may @@ -91,14 +91,6 @@ * must have the namespace lock or non-zero refcount to have any kind * of spa_t pointer at all. * - * spa_vdev_lock (global mutex) - * - * This special lock is a global mutex used to serialize attempts to - * access devices through ZFS. It makes sure that we do not try to add - * a single vdev to multiple pools at the same time. It must be held - * when adding or removing a device from the pool. - * - * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount @@ -111,10 +103,9 @@ * There must be at least one valid reference on the spa_t to acquire * the config lock. * - * spa_vdev_lock -> spa_config_lock + * spa_namespace_lock -> spa_config_lock * - * There are no locks required for spa_vdev_lock, but it must be - * acquired before spa_config_lock. + * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and @@ -136,6 +127,7 @@ * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * + * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * @@ -162,15 +154,14 @@ * spa_config_held() Returns true if the config lock is currently * held in the given state. * - * The spa_vdev_lock, while acquired directly, is hidden by the following - * functions, which imply additional semantics that must be followed: + * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * - * spa_vdev_enter() Acquire the vdev lock and the config lock for - * writing. + * spa_vdev_enter() Acquire the namespace lock and the config lock + * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O - * to complete, release the vdev lock, and sync - * the updated configs to the cache. + * to complete, sync the updated configs to the + * cache, and release the namespace lock. 
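+ *
+ * A typical device operation is therefore bracketed as follows (sketch
+ * only; 'vd' is whatever vdev, if any, should be freed on exit):
+ *
+ *	txg = spa_vdev_enter(spa);
+ *
+ *	... modify the vdev tree ...
+ *
+ *	return (spa_vdev_exit(spa, vd, txg, error));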
* * The spa_name() function also requires either the spa_namespace_lock * or the spa_config_lock, as both are needed to do a rename. spa_rename() is @@ -191,8 +182,6 @@ int zfs_flags = ~0; int zfs_flags = 0; #endif -static kmutex_t spa_vdev_lock; - #define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */ /* @@ -238,6 +227,7 @@ spa_add(const char *name) spa->spa_freeze_txg = UINT64_MAX; refcount_create(&spa->spa_refcount); + refcount_create(&spa->spa_config_lock.scl_count); avl_add(&spa_namespace_avl, spa); @@ -268,6 +258,7 @@ spa_remove(spa_t *spa) spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); + refcount_destroy(&spa->spa_config_lock.scl_count); kmem_free(spa, sizeof (spa_t)); } @@ -351,7 +342,7 @@ spa_refcount_zero(spa_t *spa) * valid use during create. */ void -spa_config_enter(spa_t *spa, krw_t rw) +spa_config_enter(spa_t *spa, krw_t rw, void *tag) { spa_config_lock_t *scl = &spa->spa_config_lock; @@ -362,13 +353,14 @@ spa_config_enter(spa_t *spa, krw_t rw) while (scl->scl_writer != NULL) cv_wait(&scl->scl_cv, &scl->scl_lock); } else { - while (scl->scl_writer != NULL || scl->scl_count > 0) + while (scl->scl_writer != NULL || + !refcount_is_zero(&scl->scl_count)) cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_writer = curthread; } } - scl->scl_count++; + (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } @@ -377,14 +369,14 @@ spa_config_enter(spa_t *spa, krw_t rw) * Release the spa config lock, notifying any waiters in the process. */ void -spa_config_exit(spa_t *spa) +spa_config_exit(spa_t *spa, void *tag) { spa_config_lock_t *scl = &spa->spa_config_lock; mutex_enter(&scl->scl_lock); - ASSERT(scl->scl_count > 0); - if (--scl->scl_count == 0) { + ASSERT(!refcount_is_zero(&scl->scl_count)); + if (refcount_remove(&scl->scl_count, tag) == 0) { cv_broadcast(&scl->scl_cv); scl->scl_writer = NULL; /* OK in either case */ } @@ -405,7 +397,7 @@ spa_config_held(spa_t *spa, krw_t rw) if (rw == RW_WRITER) held = (scl->scl_writer == curthread); else - held = (scl->scl_count != 0); + held = !refcount_is_zero(&scl->scl_count); mutex_exit(&scl->scl_lock); return (held); @@ -418,16 +410,22 @@ spa_config_held(spa_t *spa, krw_t rw) */ /* - * Lock the given spa_t for the purpose of adding or removing a vdev. This - * grabs the global spa_vdev_lock as well as the spa config lock for writing. + * Lock the given spa_t for the purpose of adding or removing a vdev. + * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { - mutex_enter(&spa_vdev_lock); + /* + * Suspend scrub activity while we mess with the config. + */ + spa_scrub_suspend(spa); - spa_config_enter(spa, RW_WRITER); + if (spa->spa_root_vdev != NULL) /* not spa_create() */ + mutex_enter(&spa_namespace_lock); + + spa_config_enter(spa, RW_WRITER, spa); return (spa_last_synced_txg(spa) + 1); } @@ -441,14 +439,26 @@ spa_vdev_enter(spa_t *spa) int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + ASSERT(txg != 0); + + /* + * Reassess the DTLs. spa_scrub() looks at the DTLs without + * taking the config lock at all, so keep it safe. + */ + if (spa->spa_root_vdev) + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + + spa_config_exit(spa, spa); - spa_config_exit(spa); + /* + * If there was a scrub or resilver in progress, indicate that + * it must restart, and then allow it to resume. 
+ */ + spa_scrub_restart(spa, txg); + spa_scrub_resume(spa); - if (vd == spa->spa_root_vdev) { /* spa_create() */ - mutex_exit(&spa_vdev_lock); + if (vd == spa->spa_root_vdev) /* spa_create() */ return (error); - } /* * Note: this txg_wait_synced() is important because it ensures @@ -458,8 +468,6 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); - mutex_exit(&spa_vdev_lock); - if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); vdev_free(vd); @@ -469,11 +477,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) * If we're in the middle of export or destroy, don't sync the * config -- it will do that anyway, and we deadlock if we try. */ - if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) { - mutex_enter(&spa_namespace_lock); + if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) spa_config_sync(); - mutex_exit(&spa_namespace_lock); - } + + mutex_exit(&spa_namespace_lock); return (error); } @@ -497,7 +504,7 @@ spa_rename(const char *name, const char *newname) * Lookup the spa_t and grab the config lock for writing. We need to * actually open the pool so that we can sync out the necessary labels. * It's OK to call spa_open() with the namespace lock held because we - * alllow recursive calls for other reasons. + * allow recursive calls for other reasons. */ mutex_enter(&spa_namespace_lock); if ((err = spa_open(name, &spa, FTAG)) != 0) { @@ -505,7 +512,7 @@ spa_rename(const char *name, const char *newname) return (err); } - spa_config_enter(spa, RW_WRITER); + spa_config_enter(spa, RW_WRITER, FTAG); avl_remove(&spa_namespace_avl, spa); spa_strfree(spa->spa_name); @@ -519,7 +526,7 @@ spa_rename(const char *name, const char *newname) */ vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); txg_wait_synced(spa->spa_dsl_pool, 0); @@ -548,12 +555,8 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; - boolean_t locked = B_FALSE; - if (mutex_owner(&spa_namespace_lock) != curthread) { - mutex_enter(&spa_namespace_lock); - locked = B_TRUE; - } + ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) @@ -565,9 +568,6 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) break; } - if (locked) - mutex_exit(&spa_namespace_lock); - return (spa != NULL); } @@ -646,12 +646,12 @@ spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; - spa_config_enter(spa, RW_WRITER); + spa_config_enter(spa, RW_WRITER, FTAG); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } - spa_config_exit(spa); + spa_config_exit(spa, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c index 25f66bf94b..a99ec3f360 100644 --- a/usr/src/uts/common/fs/zfs/space_map.c +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -293,7 +292,8 @@ space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype, dprintf("object=%llu offset=%llx size=%llx\n", smo->smo_object, offset, size); - dmu_read(os, smo->smo_object, offset, size, entry_map); + VERIFY(0 == dmu_read(os, smo->smo_object, offset, size, + entry_map)); entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { @@ -394,7 +394,8 @@ space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os, { uint64_t oldsize = smo->smo_objsize; - dmu_free_range(os, smo->smo_object, 0, smo->smo_objsize, tx); + VERIFY(0 == dmu_free_range(os, smo->smo_object, 0, + smo->smo_objsize, tx)); smo->smo_objsize = 0; diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index b11cd42b6d..1a93d4e4ca 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
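The space_map.c hunks above show the error-hardening idiom used throughout this change: dmu_read() and dmu_free_range() now return an errno instead of panicking internally, and call sites that genuinely cannot proceed on failure say so explicitly with VERIFY(0 == ...). A self-contained sketch of the idiom (the VERIFY macro below is a simplified stand-in for the kernel's, and demo_read() is a hypothetical helper):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

/* Simplified stand-in for the kernel's VERIFY(): abort on failure. */
#define	VERIFY(cond)	\
	((void)((cond) || \
	(fprintf(stderr, "VERIFY(%s) failed\n", #cond), abort(), 0)))

/* A hypothetical read routine converted from void to int-returning. */
static int
demo_read(const char *src, size_t size, void *buf)
{
	if (src == NULL)
		return (EIO);	/* report the error; don't panic here */
	(void) memcpy(buf, src, size);
	return (0);
}

int
main(void)
{
	char buf[8];

	/* A caller that cannot tolerate failure asserts success. */
	VERIFY(0 == demo_read("metadata", sizeof (buf), buf));
	(void) printf("read: %.8s\n", buf);
	return (0);
}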
*/ @@ -41,6 +40,7 @@ typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); typedef void arc_byteswap_func_t(void *buf, size_t size); +typedef int arc_evict_func_t(void *private); /* generic arc_done_func_t's which you can use */ arc_done_func_t arc_bcopy_func; @@ -50,6 +50,8 @@ struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; void *b_data; + arc_evict_func_t *b_efunc; + void *b_private; }; /* @@ -60,22 +62,30 @@ struct arc_buf { #define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag); -void arc_buf_free(arc_buf_t *buf, void *tag); +void arc_buf_add_ref(arc_buf_t *buf, void *tag); +int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); +int arc_has_callback(arc_buf_t *buf); +#ifdef ZFS_DEBUG +int arc_referenced(arc_buf_t *buf); +#endif int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, arc_done_func_t *done, void *private, int priority, int flags, - uint32_t arc_flags); + uint32_t arc_flags, zbookmark_t *zb); int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *done, void *private, int priority, int flags, - uint32_t arc_flags); + uint32_t arc_flags, zbookmark_t *zb); int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_done_func_t *done, void *private, uint32_t arc_flags); int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); +void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); +int arc_buf_evict(arc_buf_t *buf); + void arc_flush(void); void arc_tempreserve_clear(uint64_t tempreserve); int arc_tempreserve_space(uint64_t tempreserve); diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h index 0933cb977b..c716fe7aa6 100644 --- a/usr/src/uts/common/fs/zfs/sys/bplist.h +++ b/usr/src/uts/common/fs/zfs/sys/bplist.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
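The arc.h changes earlier in this hunk add an eviction callback to arc_buf_t: a consumer registers b_efunc/b_private via arc_set_callback(), and the callback fires when the buffer is evicted, with no locks held and possibly after b_data is already gone. A toy userland model of that contract (only the arc-style field names come from the header; everything else is illustrative):

#include <stdio.h>
#include <stdlib.h>

typedef int arc_evict_func_t(void *private);

/* Reduced arc_buf_t: only the fields this example needs. */
typedef struct buf {
	void *b_data;
	arc_evict_func_t *b_efunc;
	void *b_private;
} buf_t;

/* Model of arc_set_callback(): remember who to notify at eviction. */
static void
set_callback(buf_t *buf, arc_evict_func_t *func, void *private)
{
	buf->b_efunc = func;
	buf->b_private = private;
}

/*
 * Model of eviction: the data may be freed before the callback runs,
 * and the callback is invoked with no locks held.
 */
static void
evict(buf_t *buf)
{
	free(buf->b_data);
	buf->b_data = NULL;
	if (buf->b_efunc != NULL)
		(void) buf->b_efunc(buf->b_private);
}

static int
my_evict_cb(void *private)
{
	(void) printf("consumer dropped state for %s\n", (char *)private);
	return (0);
}

int
main(void)
{
	buf_t buf = { NULL, NULL, NULL };

	buf.b_data = malloc(16);
	set_callback(&buf, my_evict_cb, "znode-42");
	evict(&buf);
	return (0);
}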
*/ @@ -67,11 +66,11 @@ typedef struct bplist { extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); -extern void bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); +extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); extern void bplist_close(bplist_t *bpl); extern boolean_t bplist_empty(bplist_t *bpl); extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); -extern void bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); +extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp); extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index d67901b31a..5724f7a324 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,13 +44,14 @@ extern "C" { #define IN_DMU_SYNC ((blkptr_t *)-1) /* - * define flags for dbuf_read and friends + * define flags for dbuf_read */ #define DB_RF_MUST_SUCCEED 0 #define DB_RF_CANFAIL (1 << 1) #define DB_RF_HAVESTRUCT (1 << 2) #define DB_RF_NOPREFETCH (1 << 3) +#define DB_RF_NEVERWAIT (1 << 4) /* * The state transition diagram for dbufs looks like: @@ -59,7 +59,7 @@ extern "C" { * +----> READ ----+ * | | * | V - * (alloc)-->UNCACHED CACHED-->(free) + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) * | ^ * | | * +----> FILL ----+ @@ -68,7 +68,8 @@ typedef enum dbuf_states { DB_UNCACHED, DB_FILL, DB_READ, - DB_CACHED + DB_CACHED, + DB_EVICTING } dbuf_states_t; struct objset_impl; @@ -158,8 +159,8 @@ typedef struct dmu_buf_impl { uint64_t db_dirtied; /* - * If dd_dnode != NULL, our link on the owner dnodes's dn_dbufs list. - * Protected by its dn_mtx. + * If db_dnode != NULL, our link on the owner dnode's dn_dbufs list. + * Protected by its dn_dbufs_mtx. */ list_node_t db_link; @@ -194,7 +195,7 @@ typedef struct dmu_buf_impl { * modify (dirty or clean). db_mtx must be held * before dn_dirty_mtx.
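The dbuf.h hunk above extends the state diagram with DB_EVICTING between DB_CACHED and the final free. A standalone checker for the transitions, read directly off the diagram (the real dbuf code enforces these with locks and refcounts; this is only the shape of the state machine):

#include <stdio.h>
#include <assert.h>

typedef enum dbuf_states {
	DB_UNCACHED,
	DB_FILL,
	DB_READ,
	DB_CACHED,
	DB_EVICTING
} dbuf_states_t;

/*
 * Legal transitions per the diagram:
 * UNCACHED -> READ -> CACHED, UNCACHED -> FILL -> CACHED,
 * CACHED -> EVICTING -> (free).
 */
static int
transition_ok(dbuf_states_t from, dbuf_states_t to)
{
	switch (from) {
	case DB_UNCACHED:
		return (to == DB_READ || to == DB_FILL);
	case DB_READ:
	case DB_FILL:
		return (to == DB_CACHED);
	case DB_CACHED:
		return (to == DB_EVICTING);
	default:
		return (0);
	}
}

int
main(void)
{
	assert(transition_ok(DB_UNCACHED, DB_READ));
	assert(transition_ok(DB_READ, DB_CACHED));
	assert(transition_ok(DB_CACHED, DB_EVICTING));
	assert(!transition_ok(DB_EVICTING, DB_CACHED));
	(void) printf("dbuf state transitions check out\n");
	return (0);
}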
*/ - arc_buf_t *db_data_old[TXG_SIZE]; + void *db_data_old[TXG_SIZE]; blkptr_t *db_overridden_by[TXG_SIZE]; } db_d; } dmu_buf_impl_t; @@ -212,35 +213,32 @@ typedef struct dbuf_hash_table { uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); +dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn); -dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid); +dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, void *tag); -dmu_buf_impl_t *dbuf_hold_bonus(struct dnode *dn, void *tag); int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, void *tag, dmu_buf_impl_t **dbp); void dbuf_prefetch(struct dnode *dn, uint64_t blkid); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); -void dbuf_remove_ref(dmu_buf_impl_t *db, void *tag); uint64_t dbuf_refcount(dmu_buf_impl_t *db); -void dbuf_rele(dmu_buf_impl_t *db); +void dbuf_rele(dmu_buf_impl_t *db, void *tag); dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); -void dbuf_read(dmu_buf_impl_t *db); -int dbuf_read_canfail(dmu_buf_impl_t *db); -void dbuf_read_havestruct(dmu_buf_impl_t *db); -void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); @@ -250,7 +248,6 @@ void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg); void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, struct dmu_tx *); -void dbuf_downgrade(dmu_buf_impl_t *db, int evicting); void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); void dbuf_init(void); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 62cc46c4de..f0ba816a7c 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -99,6 +98,8 @@ typedef enum dmu_object_type { DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -146,6 +147,7 @@ void zfs_znode_byteswap(void *buf, size_t size); int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, objset_t **osp); void dmu_objset_close(objset_t *os); +void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, objset_t *clone_parent, void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); @@ -177,6 +179,8 @@ typedef void dmu_byteswap_func_t(void *buf, size_t size); #define DMU_POOL_CONFIG "config" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" +#define DMU_POOL_ERRLOG_LAST "errlog_last" /* * Allocate an object from this objset. The range of object numbers @@ -268,8 +272,7 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). */ -dmu_buf_t *dmu_bonus_hold(objset_t *os, uint64_t object); -dmu_buf_t *dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag); +int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); /* @@ -286,11 +289,10 @@ int dmu_bonus_max(void); * * The object number must be a valid, allocated object number. */ -dmu_buf_t *dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset); +int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **); void dmu_buf_add_ref(dmu_buf_t *db, void* tag); -void dmu_buf_remove_ref(dmu_buf_t *db, void* tag); -void dmu_buf_rele(dmu_buf_t *db); -void dmu_buf_rele_tag(dmu_buf_t *db, void *tag); +void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); /* @@ -303,9 +305,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db); * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ -dmu_buf_t **dmu_buf_hold_array(objset_t *os, uint64_t object, - uint64_t offset, uint64_t length, int *numbufs); -void dmu_buf_rele_array(dmu_buf_t **, int numbufs); +int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); /* * Returns NULL on success, or the existing user ptr if it's already @@ -348,19 +350,6 @@ void dmu_buf_rele_data(dmu_buf_t *db); void *dmu_buf_get_user(dmu_buf_t *db); /* - * Indicate that you are going to read the buffer's data (db_data). - * - * This routine will read the data from disk if necessary. - * - * These routines will return 0 on success, or an errno if there is a - * nonrecoverable I/O error. - */ -void dmu_buf_read(dmu_buf_t *db); -int dmu_buf_read_canfail(dmu_buf_t *db); -void dmu_buf_read_array(dmu_buf_t **dbp, int numbufs); -int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs); - -/* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. 
you've called @@ -370,20 +359,6 @@ int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* - * Indicate that you are going to modify the entire contents of the - * buffer's data ("fill" it). - * - * This routine is the same as dmu_buf_will_dirty, except that it won't - * read the contents off the disk, so the contents may be uninitialized - * and you must overwrite it. - * - * The transaction (tx) must be assigned to a txg (ie. you've called - * dmu_tx_assign()). The buffer's object must be held in the tx (ie. - * you've called dmu_tx_hold_object(tx, db->db_object)). - */ -/* void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); */ - -/* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has @@ -408,7 +383,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); @@ -418,7 +393,7 @@ void dmu_tx_commit(dmu_tx_t *tx); * Free up the data blocks for a defined range of a file. If size is * zero, the range from offset to end-of-file is freed. */ -void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); /* @@ -427,10 +402,8 @@ void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ -void dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, +int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf); -int dmu_read_canfail(objset_t *dd, uint64_t object, uint64_t offset, - uint64_t size, void *buf); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, @@ -491,8 +464,7 @@ uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object); typedef struct dmu_objset_stats { dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; - uint8_t dds_is_placeholder; - uint8_t dds_pad[2]; + uint8_t dds_pad[3]; uint64_t dds_creation_time; uint64_t dds_creation_txg; @@ -532,7 +504,6 @@ typedef struct dmu_objset_stats { * change, so there is a small probability that it will collide. 
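The transaction lifecycle spelled out in the dmu.h comment above (create, hold, assign, then modify and commit) has one subtlety worth showing: assignment can fail transiently when the current open txg cannot take more work, and callers loop. A toy model of that retry shape (tx_assign() and its forced first failure are fabricated for illustration; ERESTART is used here as the conventional transient error):

#include <stdio.h>
#include <errno.h>

#ifndef ERESTART
#define	ERESTART	85	/* illustrative value if the host lacks it */
#endif

/* Toy tx: a real dmu_tx carries holds and a txg handle. */
typedef struct tx {
	int tx_assigned;
} tx_t;

/*
 * Stand-in for assignment: fail once with ERESTART to show the retry
 * shape a caller uses when the open txg cannot take more work.
 */
static int
tx_assign(tx_t *tx)
{
	static int busy = 1;

	if (busy-- > 0)
		return (ERESTART);
	tx->tx_assigned = 1;
	return (0);
}

int
main(void)
{
	tx_t tx = { 0 };
	int err;

	/* create the tx and declare (hold) what we might modify ... */
	while ((err = tx_assign(&tx)) == ERESTART) {
		/* ... abort, wait for the next open txg, retry ... */
		(void) printf("txg full, retrying\n");
	}
	if (err == 0)
		(void) printf("assigned; dirty buffers, then commit\n");
	return (err);
}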
*/ uint64_t dds_fsid_guid; - uint64_t dds_guid; uint64_t dds_objects_used; /* number of objects used */ uint64_t dds_objects_avail; /* number of objects available */ @@ -553,15 +524,9 @@ typedef struct dmu_objset_stats { uint64_t dds_available; /* - * Miscellaneous + * Used for debugging purposes */ - char dds_altroot[MAXPATHLEN]; - - /* The following are for debugging purposes only */ uint64_t dds_last_txg; - uint64_t dds_dir_obj; - uint64_t dds_objset_obj; - uint64_t dds_clone_of_obj; } dmu_objset_stats_t; /* @@ -617,7 +582,7 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp); -int dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, +int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, struct vnode *vp, uint64_t voffset); /* CRC64 table */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index d0a77fcfb9..ee14bfab85 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -86,12 +85,7 @@ typedef struct objset_impl { list_t os_downgraded_dbufs; } objset_impl_t; -#define DMU_PRIVATE_OBJECT (1ULL << 63) - -#define DMU_META_DNODE_OBJECT (1ULL << 63) - -/* XXX rename this to DMU_IS_DNODE_OBJECT? */ -#define IS_DNODE_DNODE(object) ((object) == DMU_META_DNODE_OBJECT) +#define DMU_META_DNODE_OBJECT 0 /* called from zpl */ int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, @@ -106,13 +100,14 @@ void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds); void dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); +void dmu_objset_evict_dbufs(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx); objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, dmu_objset_type_t type, dmu_tx_t *tx); -objset_impl_t *dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, - blkptr_t *bp); +int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, + objset_impl_t **osip); void dmu_objset_evict(struct dsl_dataset *ds, void *arg); #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h index 7087912e00..a80345afd0 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,7 +44,8 @@ extern "C" { #define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */ #define ADVANCE_DATA 0x04 /* read user data blocks */ #define ADVANCE_HOLES 0x08 /* visit holes */ -#define ADVANCE_NOLOCK 0x10 /* Don't grab SPA sync lock */ +#define ADVANCE_ZIL 0x10 /* visit intent log blocks */ +#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */ #define ZB_NO_LEVEL -2 #define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */ @@ -58,13 +58,6 @@ extern "C" { #define ZB_DN_CACHE 2 #define ZB_DEPTH 3 -typedef struct zbookmark { - uint64_t zb_objset; - uint64_t zb_object; - int zb_level; - uint64_t zb_blkid; -} zbookmark_t; - typedef struct zseg { uint64_t seg_mintxg; uint64_t seg_maxtxg; @@ -93,6 +86,7 @@ struct traverse_handle { int th_zio_flags; list_t th_seglist; traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL]; + traverse_blk_cache_t th_zil_cache; uint64_t th_hits; uint64_t th_arc_hits; uint64_t th_reads; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h index d04c7c8d6b..9b55c56bc9 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,6 +53,7 @@ struct dmu_tx { struct dsl_dir *tx_dir; struct dsl_pool *tx_pool; uint64_t tx_txg; + uint64_t tx_lastsnap_txg; txg_handle_t tx_txgh; uint64_t tx_space_towrite; refcount_t tx_space_written; @@ -62,7 +62,7 @@ struct dmu_tx { uint64_t tx_space_tooverwrite; void *tx_tempreserve_cookie; uint8_t tx_anyobj; - uint8_t tx_privateobj; + int tx_err; #ifdef ZFS_DEBUG char *tx_debug_buf; int tx_debug_len; @@ -79,15 +79,10 @@ enum dmu_tx_hold_type { THT_NUMTYPES }; -typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, - uint64_t arg1, uint64_t arg2); - - typedef struct dmu_tx_hold { list_node_t dth_node; struct dnode *dth_dnode; enum dmu_tx_hold_type dth_type; - dmu_tx_hold_func_t dth_func; uint64_t dth_arg1; uint64_t dth_arg2; /* XXX track what the actual estimates were for this hold */ diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h index 1b43805e93..31b148f295 100644 --- a/usr/src/uts/common/fs/zfs/sys/dnode.h +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -63,23 +62,16 @@ extern "C" { #define DNODE_SIZE (1 << DNODE_SHIFT) #define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) -#define DN_META_DNODE_LEVELS \ - (1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT - \ - DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT) - /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) -#define DN_MAX_OBJECT \ - ((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT + \ - (DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT)) - #define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) @@ -213,15 +205,7 @@ typedef struct dnode { kmutex_t dn_dbufs_mtx; list_t dn_dbufs; /* linked list of descendent dbuf_t's */ - kcondvar_t dn_evicted; /* a child dbuf has been evicted */ - - /* - * Performance hack: whenever we have a hold on the bonus buffer of a - * ZAP object, we will also have a hold on db0. This will keep the - * meta-data for a micro-zap object cached as long as the znode for the - * object is in the znode cache. - */ - struct dmu_buf_impl *dn_db0; + struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ /* holds prefetch structure */ struct zfetch dn_zfetch; @@ -237,9 +221,10 @@ dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, uint64_t object); void dnode_special_close(dnode_t *dn); -dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref); -dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, - void *ref); +int dnode_hold(struct objset_impl *dd, uint64_t object, + void *ref, dnode_t **dnp); +int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, + void *ref, dnode_t **dnp); void dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); @@ -266,6 +251,7 @@ void dnode_init(void); void dnode_fini(void); int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, uint64_t blkfill); +void dnode_evict_dbufs(dnode_t *dn); #ifdef ZFS_DEBUG diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index e56c8a67d9..3411eba68b 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -108,8 +107,8 @@ int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, void *tag, dsl_dataset_t **dsp); int dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp); -dsl_dataset_t *dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, - const char *tail, int mode, void *tag); +int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, + const char *tail, int mode, void *tag, dsl_dataset_t **); void dsl_dataset_name(dsl_dataset_t *ds, char *name); void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag); int dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname, @@ -134,8 +133,8 @@ void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, - dmu_tx_t *tx); +int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); void dsl_dataset_stats(dsl_dataset_t *os, dmu_objset_stats_t *dds); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h index 0499d731e6..5c23fdc497 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
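dsl_dataset_open_obj() and its neighbors above follow the same conversion as the DMU routines: instead of returning a pointer and failing by panic, they return 0 or an errno and hand the object back through an out parameter. The shape of that conversion in a self-contained sketch (dataset_open_obj() is a made-up example, not the DSL function):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

typedef struct dataset {
	int ds_object;
} dataset_t;

/*
 * Old style, for contrast: return the object, no way to report errors.
 *	dataset_t *dataset_open_obj(int obj);
 *
 * New style: return 0 or an errno; hand the object back via *dspp.
 */
static int
dataset_open_obj(int obj, dataset_t **dspp)
{
	dataset_t *ds;

	if (obj < 0)
		return (ENOENT);	/* caller decides how to handle it */
	if ((ds = malloc(sizeof (*ds))) == NULL)
		return (ENOMEM);
	ds->ds_object = obj;
	*dspp = ds;
	return (0);
}

int
main(void)
{
	dataset_t *ds;
	int err;

	if ((err = dataset_open_obj(7, &ds)) != 0) {
		(void) fprintf(stderr, "open failed: %d\n", err);
		return (1);
	}
	(void) printf("opened object %d\n", ds->ds_object);
	free(ds);
	return (0);
}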
*/ @@ -98,11 +97,11 @@ struct dsl_dir { }; void dsl_dir_close(dsl_dir_t *dd, void *tag); -dsl_dir_t *dsl_dir_open(const char *name, void *tag, const char **tail); -dsl_dir_t *dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, +int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); +int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, const char **tailp); -dsl_dir_t *dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag); +int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **); void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_is_private(dsl_dir_t *dd); int dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h index 4fca4548ad..2eab6ae945 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,7 +66,7 @@ typedef struct dsl_pool { krwlock_t dp_config_rwlock; } dsl_pool_t; -dsl_pool_t *dsl_pool_open(spa_t *spa, uint64_t txg); +int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h index f9fffd2443..0b7e12f2cb 100644 --- a/usr/src/uts/common/fs/zfs/sys/refcount.h +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,7 +41,7 @@ extern "C" { * particular object, use FTAG (which is a string) for the holder_tag. * Otherwise, use the object that holds the reference. 
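The refcount.h comment later in this hunk gives the tagging rule: use FTAG when a function takes at most one hold, otherwise tag with the object that holds the reference. A toy tagged refcount showing why tags help, since a release must name the same holder as the hold (refcount_add/refcount_remove here are simplified models of the ZFS functions of the same name):

#include <stdio.h>
#include <assert.h>

#define	FTAG		((char *)__func__)
#define	MAXHOLDS	16

typedef struct refcount {
	const void *rc_tags[MAXHOLDS];
	int rc_count;
} refcount_t;

/* Record the hold together with who took it. */
static void
refcount_add(refcount_t *rc, const void *tag)
{
	assert(rc->rc_count < MAXHOLDS);
	rc->rc_tags[rc->rc_count++] = tag;
}

/* A release must match an outstanding hold; return remaining holds. */
static int
refcount_remove(refcount_t *rc, const void *tag)
{
	int i;

	for (i = 0; i < rc->rc_count; i++) {
		if (rc->rc_tags[i] == tag) {
			rc->rc_tags[i] = rc->rc_tags[--rc->rc_count];
			return (rc->rc_count);
		}
	}
	assert(!"release without matching hold");
	return (-1);
}

int
main(void)
{
	refcount_t rc = { { 0 }, 0 };

	refcount_add(&rc, FTAG);	/* FTAG is "main" here */
	(void) printf("remaining holds: %d\n", refcount_remove(&rc, FTAG));
	return (0);
}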
*/ -#define FTAG ((void*)__func__) +#define FTAG ((char *)__func__) #if defined(DEBUG) || !defined(_KERNEL) typedef struct reference { diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index fbe2822a13..2c8a43bb37 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -292,21 +291,30 @@ typedef struct blkptr { /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); -extern int spa_get_stats(const char *pool, nvlist_t **config); +extern int spa_get_stats(const char *pool, nvlist_t **config, + char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, char *altroot); extern int spa_import(const char *pool, nvlist_t *config, char *altroot); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool); +extern int spa_reset(char *pool); +extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_suspend(spa_t *spa); +extern void spa_async_resume(spa_t *spa); +extern spa_t *spa_inject_addref(char *pool); +extern void spa_inject_delref(spa_t *spa); + +#define SPA_ASYNC_REOPEN 0x01 +#define SPA_ASYNC_REPLACE_DONE 0x02 +#define SPA_ASYNC_SCRUB 0x04 +#define SPA_ASYNC_RESILVER 0x08 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); -extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot); -extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, +extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); -extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, - int replace_done); -extern void spa_vdev_replace_done(spa_t *spa); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); /* scrubbing */ @@ -314,6 +322,7 @@ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); extern void spa_scrub_suspend(spa_t *spa); extern void spa_scrub_resume(spa_t *spa); extern void spa_scrub_restart(spa_t *spa, uint64_t txg); +extern void spa_scrub_throttle(spa_t *spa, int direction); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ @@ -345,8 +354,8 @@ extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); /* Pool configuration lock */ -extern void spa_config_enter(spa_t *spa, krw_t rw); -extern void spa_config_exit(spa_t *spa); +extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag); +extern void spa_config_exit(spa_t *spa, void *tag); extern boolean_t spa_config_held(spa_t *spa, krw_t rw); /* Pool vdev add/remove lock */ @@ -383,6 +392,23 @@ extern uint64_t spa_get_random(uint64_t range); extern void sprintf_blkptr(char *buf, int len, blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern void spa_evict_all(void); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid); + +/* error handling 
*/ +struct zbookmark; +struct zio; +extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t stateoroffset, uint64_t length); +extern void zfs_post_ok(spa_t *spa, vdev_t *vd); +extern uint64_t spa_get_errlog_size(spa_t *spa); +extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern void spa_errlog_rotate(spa_t *spa); +extern void spa_errlog_drain(spa_t *spa); +extern void spa_errlog_sync(spa_t *spa, uint64_t txg); +extern int spa_bookmark_name(spa_t *spa, struct zbookmark *zb, char *ds, + size_t dsname, char *obj, size_t objname, char *range, size_t rangelen); +extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); /* Initialization and termination */ extern void spa_init(int flags); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 0fcef6c48b..e9192956c3 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -46,27 +45,33 @@ extern "C" { typedef struct spa_config_lock { kmutex_t scl_lock; - uint64_t scl_count; + refcount_t scl_count; kthread_t *scl_writer; kcondvar_t scl_cv; } spa_config_lock_t; +typedef struct spa_error_entry { + zbookmark_t se_bookmark; + char *se_name; + avl_node_t se_avl; +} spa_error_entry_t; + struct spa { /* * Fields protected by spa_namespace_lock. 
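The error-handling additions above (spa_log_error(), the spa_errlist_* trees keyed by spa_error_entry_t, and the persistent errlog objects) amount to: remember each failing bookmark once in memory, then sync the list out to a ZAP object. A toy model of the dedup step (a flat array stands in for the AVL tree, and the comparison is a plain field-wise memcmp):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Same shape as the zbookmark_t defined in zio.h below. */
typedef struct zbookmark {
	uint64_t zb_objset;
	uint64_t zb_object;
	int64_t zb_level;
	uint64_t zb_blkid;
} zbookmark_t;

typedef struct error_entry {
	zbookmark_t se_bookmark;
	int se_valid;
} error_entry_t;

#define	ERRLIST_SIZE	64
static error_entry_t errlist[ERRLIST_SIZE];

/* Model of spa_log_error(): record each failing bookmark once. */
static void
log_error(const zbookmark_t *zb)
{
	int i, slot = -1;

	for (i = 0; i < ERRLIST_SIZE; i++) {
		if (errlist[i].se_valid &&
		    memcmp(&errlist[i].se_bookmark, zb, sizeof (*zb)) == 0)
			return;		/* already logged */
		if (!errlist[i].se_valid && slot == -1)
			slot = i;
	}
	if (slot != -1) {
		errlist[slot].se_bookmark = *zb;
		errlist[slot].se_valid = 1;
	}
}

int
main(void)
{
	zbookmark_t zb = { 21, 5, 0, 17 };
	int i, n = 0;

	log_error(&zb);
	log_error(&zb);		/* duplicate: not logged twice */
	for (i = 0; i < ERRLIST_SIZE; i++)
		n += errlist[i].se_valid;
	(void) printf("%d error(s) logged\n", n);
	return (0);
}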
*/ char *spa_name; avl_node_t spa_avl; - int spa_anon; nvlist_t *spa_config; uint64_t spa_config_txg; /* txg of last config change */ spa_config_lock_t spa_config_lock; /* configuration changes */ kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */ int spa_sync_pass; /* iterate-to-convergence */ int spa_state; /* pool state */ - uint8_t spa_minref; /* min refcnt of open pool */ + int spa_inject_ref; /* injection references */ uint8_t spa_traverse_wanted; /* traverse lock wanted */ - taskq_t *spa_vdev_retry_taskq; + uint8_t spa_sync_on; /* sync threads are running */ + spa_load_state_t spa_load_state; /* current load operation */ taskq_t *spa_zio_issue_taskq[ZIO_TYPES]; taskq_t *spa_zio_intr_taskq[ZIO_TYPES]; dsl_pool_t *spa_dsl_pool; @@ -88,18 +93,33 @@ struct spa { kthread_t *spa_scrub_thread; /* scrub/resilver thread */ traverse_handle_t *spa_scrub_th; /* scrub traverse handle */ uint64_t spa_scrub_restart_txg; /* need to restart */ + uint64_t spa_scrub_mintxg; /* min txg we'll scrub */ uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ + int64_t spa_scrub_throttled; /* over-throttle scrub I/Os */ uint64_t spa_scrub_errors; /* scrub I/O error count */ + int spa_scrub_suspended; /* tell scrubber to suspend */ kcondvar_t spa_scrub_cv; /* scrub thread state change */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_stop; /* tell scrubber to stop */ - uint8_t spa_scrub_suspend; /* tell scrubber to suspend */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ - int spa_sync_on; /* sync threads are running */ + kmutex_t spa_async_lock; /* protect async state */ + kthread_t *spa_async_thread; /* thread doing async task */ + int spa_async_suspended; /* async tasks suspended */ + kcondvar_t spa_async_cv; /* wait for thread_exit() */ + uint16_t spa_async_tasks; /* async task mask */ char *spa_root; /* alternate root directory */ kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */ + uint64_t spa_ena; /* spa-wide ereport ENA */ + boolean_t spa_last_open_failed; /* true if last open failed */ + kmutex_t spa_errlog_lock; /* error log lock */ + uint64_t spa_errlog_last; /* last error log object */ + uint64_t spa_errlog_scrub; /* scrub error log object */ + kmutex_t spa_errlist_lock; /* error list/ereport lock */ + avl_tree_t spa_errlist_last; /* last error list */ + avl_tree_t spa_errlist_scrub; /* scrub error list */ + int spa_scrub_finished; /* indicator to rotate logs */ /* * spa_refcnt must be the last element because it changes size based on * compilation options.
In order for the MDB module to function diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 86d2f1b1ab..f3d7379049 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -60,11 +60,10 @@ typedef struct vdev_knob { extern int vdev_open(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg); -extern void vdev_init(vdev_t *, uint64_t txg); -extern void vdev_reopen(vdev_t *, zio_t **zq); +extern int vdev_init(vdev_t *, uint64_t txg); +extern void vdev_reopen(vdev_t *); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); -extern vdev_t *vdev_lookup_by_path(vdev_t *vd, const char *path); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); @@ -73,16 +72,16 @@ extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, extern const char *vdev_description(vdev_t *vd); -extern void vdev_metaslab_init(vdev_t *vd, uint64_t txg); +extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_stat_update(zio_t *zio); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); -extern void vdev_checksum_error(zio_t *zio, vdev_t *vd); extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); -extern void vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux); +extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, + vdev_aux_t aux); extern void vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta); @@ -92,11 +91,10 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); extern void vdev_io_start(zio_t *zio); extern void vdev_io_done(zio_t *zio); -extern int vdev_online(spa_t *spa, const char *path); -extern int vdev_offline(spa_t *spa, const char *path, int istmp); +extern int vdev_online(spa_t *spa, uint64_t guid); +extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); +extern void vdev_clear(spa_t *spa, vdev_t *vd); -extern int vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, - uint64_t arg); extern int vdev_error_inject(vdev_t *vd, zio_t *zio); extern int vdev_is_dead(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 53a202a906..2dfc45edff 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -103,9 +103,11 @@ struct vdev_cache { struct vdev_queue { uint64_t vq_min_pending; uint64_t vq_max_pending; + uint64_t vq_scrub_limit; uint64_t vq_agg_limit; uint64_t vq_time_shift; uint64_t vq_ramp_rate; + uint64_t vq_scrub_count; avl_tree_t vq_deadline_tree; avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; @@ -150,10 +152,9 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ uint8_t vdev_dirty[TXG_SIZE]; /* per-txg dirty flags */ - int vdev_is_dirty; /* on config dirty list? */ + uint8_t vdev_is_dirty; /* on config dirty list? */ + uint8_t vdev_reopen_wanted; /* async reopen wanted? */ list_node_t vdev_dirty_node; /* config dirty list */ - zio_t *vdev_io_retry; /* I/O retry list */ - list_t vdev_io_pending; /* I/O pending list */ /* * Leaf vdev state. 
@@ -173,6 +174,8 @@ struct vdev { uint8_t vdev_detached; /* device detached? */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ vdev_cache_t vdev_cache; /* physical block cache */ + uint64_t vdev_not_present; /* not present during import */ + hrtime_t vdev_last_try; /* last reopen time */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -183,8 +186,6 @@ struct vdev { */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_dirty_lock; /* vdev_dirty[] */ - kmutex_t vdev_io_lock; /* vdev_io_pending list */ - kcondvar_t vdev_io_cv; /* vdev_io_pending list empty? */ kmutex_t vdev_stat_lock; /* vdev_stat */ }; @@ -260,7 +261,7 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ -extern int vdev_load(vdev_t *vd, int import); +extern int vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h index 9fb6a6c5a4..e77a2efa61 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h @@ -199,7 +199,7 @@ void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx, struct zap_leaf **lp); + const void *val, uint32_t cd, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h index 2ea27493f9..34057e83c9 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -103,7 +102,6 @@ int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); int zfs_zaccess_v4_perm(struct znode *, int, cred_t *); void zfs_acl_free(zfs_acl_t *); -zfs_acl_t *zfs_acl_node_read(struct znode *); #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index c914b23570..14ad31e629 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -31,6 +30,7 @@ #include <sys/cred.h> #include <sys/dmu.h> +#include <sys/zio.h> #ifdef __cplusplus extern "C" { @@ -66,7 +66,7 @@ typedef struct dmu_replay_record { char drr_toname[MAXNAMELEN]; } drr_begin; struct drr_end { - uint64_t drr_checksum; + zio_cksum_t drr_checksum; } drr_end; struct drr_object { uint64_t drr_object; @@ -97,15 +97,31 @@ typedef struct dmu_replay_record { } drr_u; } dmu_replay_record_t; +typedef struct zinject_record { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; +} zinject_record_t; + +#define ZINJECT_NULL 0x1 +#define ZINJECT_FLUSH_ARC 0x2 +#define ZINJECT_UNLOAD_SPA 0x4 + typedef struct zfs_cmd { char zc_name[MAXNAMELEN]; char zc_prop_name[MAXNAMELEN]; char zc_prop_value[MAXPATHLEN]; char zc_root[MAXPATHLEN]; - char zc_filename[MAXPATHLEN]; + char zc_filename[MAXNAMELEN]; uint32_t zc_intsz; uint32_t zc_numints; - uint64_t zc_pool_guid; + uint64_t zc_guid; uint64_t zc_config_src; /* really (char *) */ uint64_t zc_config_src_size; uint64_t zc_config_dst; /* really (char *) */ @@ -116,9 +132,10 @@ typedef struct zfs_cmd { uint64_t zc_volsize; uint64_t zc_volblocksize; uint64_t zc_objset_type; - dmu_object_info_t zc_object_info; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; + zbookmark_t zc_bookmark; } zfs_cmd_t; #define ZVOL_MAX_MINOR (1 << 16) diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h index f9331be00a..02f4b3b247 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,8 +132,6 @@ typedef struct zfs_dirlock { struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ } zfs_dirlock_t; -struct zcache_state; - typedef struct znode { struct zfsvfs *z_zfsvfs; vnode_t *z_vnode; @@ -150,16 +147,12 @@ typedef struct znode { uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_dbuf_held; /* Is z_dbuf already held? */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ - uint_t z_mapcnt; /* number of memory maps to file */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ + uint64_t z_mapcnt; /* number of pages mapped to file */ uint64_t z_last_itx; /* last ZIL itx on this znode */ kmutex_t z_acl_lock; /* acl data lock */ list_node_t z_link_node; /* all znodes in fs link */ - list_node_t z_zcache_node; - struct zcache_state *z_zcache_state; - uint64_t z_zcache_access; - /* * These are dmu managed fields. 
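zinject_record_t above is the wire format for fault injection: a handler compares each I/O against the record's objset/object/level and offset range and, if it matches (subject to zi_freq), fails the I/O with zi_error. A guess at the general shape of that matching, as a standalone program (the struct is trimmed and the logic is illustrative, not zio_inject.c):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Trimmed zinject_record_t: just the fields this sketch matches on. */
typedef struct zinject_record {
	uint64_t zi_objset;
	uint64_t zi_object;
	uint64_t zi_start;
	uint64_t zi_end;
	uint32_t zi_level;
	uint32_t zi_error;
	uint32_t zi_freq;	/* inject on this %% of matching I/Os */
} zinject_record_t;

/* Return the injected errno if the I/O matches the record, else 0. */
static int
handle_injection(const zinject_record_t *zi, uint64_t objset,
    uint64_t object, uint32_t level, uint64_t offset)
{
	if (objset != zi->zi_objset || object != zi->zi_object ||
	    level != zi->zi_level)
		return (0);
	if (offset < zi->zi_start || offset > zi->zi_end)
		return (0);
	if (zi->zi_freq != 0 && (uint32_t)(rand() % 100) >= zi->zi_freq)
		return (0);
	return ((int)zi->zi_error);
}

int
main(void)
{
	zinject_record_t zi = { 21, 5, 0, UINT64_MAX, 0, 5 /* EIO */, 0 };

	(void) printf("injected error = %d\n",
	    handle_injection(&zi, 21, 5, 0, 8192));
	return (0);
}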
*/ @@ -241,14 +234,12 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, dmu_tx_t *, cred_t *cr); extern void zfs_znode_init(void); extern void zfs_znode_fini(void); -extern znode_t *zfs_znode_alloc(zfsvfs_t *, dmu_buf_t *, uint64_t, int); extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); extern void zfs_zinactive(znode_t *); extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_znode_free(znode_t *); extern int zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads); extern void zfs_delete_wait_empty(zfsvfs_t *zfsvfs); -extern void zfs_zcache_flush(zfsvfs_t *zfsvf); extern void zfs_remove_op_tables(); extern int zfs_create_op_tables(); extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 5d3227e546..d80310f2fa 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -109,23 +108,25 @@ enum zio_compress { #define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) #define ZIO_PRIORITY_TABLE_SIZE 10 -#define ZIO_FLAG_MUSTSUCCEED 0x0000 -#define ZIO_FLAG_CANFAIL 0x0001 -#define ZIO_FLAG_FAILFAST 0x0002 -#define ZIO_FLAG_CONFIG_HELD 0x0004 +#define ZIO_FLAG_MUSTSUCCEED 0x00000 +#define ZIO_FLAG_CANFAIL 0x00001 +#define ZIO_FLAG_FAILFAST 0x00002 +#define ZIO_FLAG_CONFIG_HELD 0x00004 -#define ZIO_FLAG_DONT_CACHE 0x0010 -#define ZIO_FLAG_DONT_QUEUE 0x0020 -#define ZIO_FLAG_DONT_PROPAGATE 0x0040 -#define ZIO_FLAG_DONT_RETRY 0x0080 +#define ZIO_FLAG_DONT_CACHE 0x00010 +#define ZIO_FLAG_DONT_QUEUE 0x00020 +#define ZIO_FLAG_DONT_PROPAGATE 0x00040 +#define ZIO_FLAG_DONT_RETRY 0x00080 -#define ZIO_FLAG_PHYSICAL 0x0100 -#define ZIO_FLAG_IO_BYPASS 0x0200 -#define ZIO_FLAG_IO_REPAIR 0x0400 -#define ZIO_FLAG_SPECULATIVE 0x0800 +#define ZIO_FLAG_PHYSICAL 0x00100 +#define ZIO_FLAG_IO_BYPASS 0x00200 +#define ZIO_FLAG_IO_REPAIR 0x00400 +#define ZIO_FLAG_SPECULATIVE 0x00800 -#define ZIO_FLAG_RESILVER 0x1000 -#define ZIO_FLAG_SCRUB 0x2000 +#define ZIO_FLAG_RESILVER 0x01000 +#define ZIO_FLAG_SCRUB 0x02000 + +#define ZIO_FLAG_NOBOOKMARK 0x10000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -155,11 +156,39 @@ typedef struct zio_transform zio_transform_t; extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; extern char *zio_type_name[ZIO_TYPES]; +/* + * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely + * identifies any block in the pool. By convention, the meta-objset (MOS) + * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is + * level -1 of the meta-dnode, and intent log blocks (which are chained + * off the root block) have blkid == sequence number. 
In summary: + * + * mos is objset 0 + * meta-dnode is object 0 + * root block is <objset, 0, -1, 0> + * intent log is <objset, 0, -1, ZIL sequence number> + * + * Note: this structure is called a bookmark because its first purpose was + * to remember where to resume a pool-wide traverse. The absolute ordering + * for block visitation during traversal is defined in compare_bookmark(). + * + * Note: this structure is passed between userland and the kernel. + * Therefore it must not change size or alignment between 32/64 bit + * compilation options. + */ +typedef struct zbookmark { + uint64_t zb_objset; + uint64_t zb_object; + int64_t zb_level; + uint64_t zb_blkid; +} zbookmark_t; + struct zio { /* Core information about this I/O */ zio_t *io_parent; zio_t *io_root; spa_t *io_spa; + zbookmark_t io_bookmark; int io_checksum; int io_compress; int io_dva_index; @@ -170,6 +199,7 @@ struct zio { zio_t *io_sibling_prev; zio_t *io_sibling_next; zio_transform_t *io_transform_stack; + zio_t *io_logical; /* Callback info */ zio_done_func_t *io_done; @@ -191,8 +221,6 @@ struct zio { avl_tree_t *io_vdev_tree; zio_t *io_delegate_list; zio_t *io_delegate_next; - zio_t *io_retry_next; - list_node_t io_pending; /* Internal pipeline state */ int io_flags; @@ -212,6 +240,9 @@ struct zio { void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; + + /* FMA state */ + uint64_t io_ena; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, @@ -222,15 +253,17 @@ extern zio_t *zio_root(spa_t *spa, extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags); + int priority, int flags, zbookmark_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb); extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_done_func_t *done, void *private); @@ -285,12 +318,27 @@ extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); +boolean_t zio_should_retry(zio_t *zio); + /* * Initial setup and teardown. 
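Because every member of `zbookmark_t` is a fixed-width integer, the struct is 32 bytes with 8-byte alignment under both 32- and 64-bit compilation, which is what lets it cross the user/kernel boundary safely. The comment defers the traversal ordering to `compare_bookmark()`; as an illustration only (not the actual ordering), a plain lexicographic comparator over the four-tuple might look like this:

```c
#include <stdint.h>
#include <stdio.h>

typedef struct zbookmark {
	uint64_t zb_objset;
	uint64_t zb_object;
	int64_t	 zb_level;
	uint64_t zb_blkid;
} zbookmark_t;

/* illustrative lexicographic compare; the real ordering is compare_bookmark() */
static int
zbookmark_cmp(const zbookmark_t *a, const zbookmark_t *b)
{
	if (a->zb_objset != b->zb_objset)
		return (a->zb_objset < b->zb_objset ? -1 : 1);
	if (a->zb_object != b->zb_object)
		return (a->zb_object < b->zb_object ? -1 : 1);
	if (a->zb_level != b->zb_level)
		return (a->zb_level < b->zb_level ? -1 : 1);
	if (a->zb_blkid != b->zb_blkid)
		return (a->zb_blkid < b->zb_blkid ? -1 : 1);
	return (0);
}

int
main(void)
{
	zbookmark_t root = { 21, 0, -1, 0 };	/* root block of objset 21 */
	zbookmark_t zil  = { 21, 0, -1, 7 };	/* intent log block, sequence 7 */

	printf("sizeof (zbookmark_t) = %zu\n", sizeof (zbookmark_t));
	printf("root %s zil\n", zbookmark_cmp(&root, &zil) < 0 ? "<" : ">=");
	return (0);
}
```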
*/ extern void zio_init(void); extern void zio_fini(void); +/* + * Fault injection + */ +struct zinject_record; +extern uint32_t zio_injection_enabled; +extern int zio_inject_fault(char *name, int flags, int *id, + struct zinject_record *record); +extern int zio_inject_list_next(int *id, char *name, size_t buflen, + struct zinject_record *record); +extern int zio_clear_fault(int id); +extern int zio_handle_fault_injection(zio_t *zio, int error); +extern int zio_handle_device_injection(vdev_t *vd, int error); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h index ba3dc48d28..bb7bd41e0b 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -57,9 +56,11 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; */ extern zio_checksum_t fletcher_2_native; extern zio_checksum_t fletcher_4_native; +extern zio_checksum_t fletcher_4_incremental_native; extern zio_checksum_t fletcher_2_byteswap; extern zio_checksum_t fletcher_4_byteswap; +extern zio_checksum_t fletcher_4_incremental_byteswap; extern zio_checksum_t zio_checksum_SHA256; diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h index 0b2b07de29..e1abf0e49d 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -201,6 +200,9 @@ struct zio_transform { zio_transform_t *zt_next; }; +extern void zio_inject_init(void); +extern void zio_inject_fini(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c index 63bff0ae4b..b6d3fe9595 100644 --- a/usr/src/uts/common/fs/zfs/uberblock.c +++ b/usr/src/uts/common/fs/zfs/uberblock.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
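The incremental Fletcher-4 variants declared in the zio_checksum.h hunk above exist so a consumer can checksum a stream piecewise: because the four running sums carry over in the `zio_cksum_t`, feeding the data in fragments yields the same result as one pass. A self-contained sketch of that property, with a local stand-in for `zio_cksum_t` and the Fletcher-4 recurrence (a minimal model, not the shipped implementation):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct zio_cksum { uint64_t zc_word[4]; } zio_cksum_t;

/* continue a Fletcher-4 sum from whatever state *zcp already holds */
static void
fletcher_4_incremental(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = zcp->zc_word[0], b = zcp->zc_word[1];
	uint64_t c = zcp->zc_word[2], d = zcp->zc_word[3];

	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}
	zcp->zc_word[0] = a; zcp->zc_word[1] = b;
	zcp->zc_word[2] = c; zcp->zc_word[3] = d;
}

int
main(void)
{
	uint32_t data[1024];
	zio_cksum_t one = { { 0 } }, two = { { 0 } };
	int i;

	for (i = 0; i < 1024; i++)
		data[i] = i * 2654435761u;	/* arbitrary test pattern */

	/* whole buffer in one call ... */
	fletcher_4_incremental(data, sizeof (data), &one);

	/* ... versus the same stream fed in two pieces */
	fletcher_4_incremental(data, 1000 * sizeof (uint32_t), &two);
	fletcher_4_incremental(data + 1000, 24 * sizeof (uint32_t), &two);

	printf("%s\n", memcmp(&one, &two, sizeof (one)) == 0 ?
	    "checksums match" : "mismatch");
	return (0);
}
```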
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,9 +29,6 @@ #include <sys/uberblock_impl.h> #include <sys/vdev_impl.h> -/* Keep the uberblock version in a varialbe so we can get at it with mdb */ -static uint64_t uberblock_version = UBERBLOCK_VERSION; - int uberblock_verify(uberblock_t *ub) { @@ -42,9 +38,6 @@ uberblock_verify(uberblock_t *ub) if (ub->ub_magic != UBERBLOCK_MAGIC) return (EINVAL); - if (ub->ub_version != UBERBLOCK_VERSION) - return (ENOTSUP); - return (0); } diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 838e1bfc88..363be462ab 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -26,6 +26,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/dmu.h> @@ -137,34 +138,6 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) } vdev_t * -vdev_lookup_by_path(vdev_t *vd, const char *path) -{ - int c; - vdev_t *mvd; - - if (vd->vdev_path != NULL) { - if (vd->vdev_wholedisk == 1) { - /* - * For whole disks, the internal path has 's0', but the - * path passed in by the user doesn't. - */ - if (strlen(path) == strlen(vd->vdev_path) - 2 && - strncmp(path, vd->vdev_path, strlen(path)) == 0) - return (vd); - } else if (strcmp(path, vd->vdev_path) == 0) { - return (vd); - } - } - - for (c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != - NULL) - return (mvd); - - return (NULL); -} - -vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { int c; @@ -305,10 +278,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; - mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL); - list_create(&vd->vdev_io_pending, sizeof (zio_t), - offsetof(zio_t, io_pending)); mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); @@ -343,9 +312,6 @@ vdev_free_common(vdev_t *vd) mutex_exit(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_dirty_lock); - list_destroy(&vd->vdev_io_pending); - mutex_destroy(&vd->vdev_io_lock); - cv_destroy(&vd->vdev_io_cv); kmem_free(vd, sizeof (vdev_t)); } @@ -402,6 +368,13 @@ vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) vd->vdev_wholedisk = -1ULL; /* + * Look for the 'not present' flag. This will only be set if the device + * was not present at the time of import. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); + + /* * If we're a top-level vdev, try to load the allocation parameters. 
*/ if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { @@ -536,8 +509,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) vdev_config_dirty(tvd); } - ASSERT(svd->vdev_io_retry == NULL); - ASSERT(list_is_empty(&svd->vdev_io_pending)); + tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; + svd->vdev_reopen_wanted = 0; } static void @@ -611,7 +584,7 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } -void +int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; @@ -621,6 +594,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; space_map_obj_t *smo = vd->vdev_smo; metaslab_t **mspp = vd->vdev_ms; + int ret; dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); @@ -638,21 +612,29 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) ms_array = kmem_zalloc(newc * sizeof (uint64_t), KM_SLEEP); - dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, - 0, newc * sizeof (uint64_t), ms_array); + if ((ret = dmu_read(spa->spa_meta_objset, + vd->vdev_ms_array, 0, + newc * sizeof (uint64_t), ms_array)) != 0) { + kmem_free(ms_array, newc * sizeof (uint64_t)); + goto error; + } for (c = 0; c < newc; c++) { if (ms_array[c] == 0) continue; - db = dmu_bonus_hold(spa->spa_meta_objset, - ms_array[c]); - dmu_buf_read(db); + if ((ret = dmu_bonus_hold( + spa->spa_meta_objset, ms_array[c], + FTAG, &db)) != 0) { + kmem_free(ms_array, + newc * sizeof (uint64_t)); + goto error; + } ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(db->db_data, &vd->vdev_smo[c], db->db_size); ASSERT3U(vd->vdev_smo[c].smo_object, ==, ms_array[c]); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } kmem_free(ms_array, newc * sizeof (uint64_t)); } @@ -674,6 +656,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) kmem_free(mspp, oldc * sizeof (*mspp)); } + return (0); + +error: + /* + * On error, undo any partial progress we may have made, and restore the + * old metaslab values. 
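`vdev_metaslab_init()` now returns an error, and the new `error:` label below frees the partially built arrays and restores the old `vdev_smo`/`vdev_ms` values before returning. That grow-then-unwind pattern, distilled into a standalone sketch with a hypothetical `fill()` callback standing in for the dmu reads:

```c
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* hypothetical fill callback; fails at slot 5 to exercise the unwind path */
static int
fill(uint64_t *slot, size_t c)
{
	if (c == 5)
		return (EIO);
	*slot = c;
	return (0);
}

static int
grow_array(uint64_t **arrp, size_t *countp, size_t newc)
{
	uint64_t *oldarr = *arrp;
	size_t oldc = *countp;
	uint64_t *newarr;
	size_t c;
	int err;

	newarr = calloc(newc, sizeof (uint64_t));
	if (newarr == NULL)
		return (ENOMEM);
	if (oldc != 0)
		memcpy(newarr, oldarr, oldc * sizeof (uint64_t));

	*arrp = newarr;
	*countp = newc;

	for (c = oldc; c < newc; c++)
		if ((err = fill(&newarr[c], c)) != 0)
			goto error;

	free(oldarr);
	return (0);

error:
	/* undo partial progress and restore the old values */
	free(newarr);
	*arrp = oldarr;
	*countp = oldc;
	return (err);
}

int
main(void)
{
	uint64_t *arr = NULL;
	size_t count = 0;
	int err;

	err = grow_array(&arr, &count, 4);
	printf("grow to 4: %d, count %zu\n", err, count);
	err = grow_array(&arr, &count, 8);	/* EIO; count stays 4 */
	printf("grow to 8: %d, count %zu\n", err, count);
	free(arr);
	return (0);
}
```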
+ */ + kmem_free(vd->vdev_smo, newc * sizeof (*smo)); + kmem_free(vd->vdev_ms, newc * sizeof (*mspp)); + + vd->vdev_smo = smo; + vd->vdev_ms = mspp; + vd->vdev_ms_count = oldc; + + return (ret); } void @@ -735,39 +732,39 @@ vdev_open(vdev_t *vd) if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); - dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd)); - vd->vdev_state = VDEV_STATE_OFFLINE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (ENXIO); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + if (zio_injection_enabled && error == 0) + error = zio_handle_device_injection(vd, ENXIO); + dprintf("%s = %d, osize %llu, state = %d\n", vdev_description(vd), error, osize, vd->vdev_state); if (error) { - dprintf("%s in %s failed to open, error %d, aux %d\n", - vdev_description(vd), - vdev_description(vd->vdev_parent), - error, + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); - - vd->vdev_state = VDEV_STATE_CANT_OPEN; return (error); } vd->vdev_state = VDEV_STATE_HEALTHY; for (c = 0; c < vd->vdev_children; c++) - if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) - vd->vdev_state = VDEV_STATE_DEGRADED; + if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_NONE); + break; + } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { - vd->vdev_state = VDEV_STATE_CANT_OPEN; - vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); return (EOVERFLOW); } psize = osize; @@ -775,8 +772,8 @@ vdev_open(vdev_t *vd) } else { if (osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { - vd->vdev_state = VDEV_STATE_CANT_OPEN; - vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); return (EOVERFLOW); } psize = 0; @@ -796,9 +793,8 @@ vdev_open(vdev_t *vd) * Make sure the alignment requirement hasn't increased. */ if (ashift > vd->vdev_ashift) { - dprintf("%s: ashift grew\n", vdev_description(vd)); - vd->vdev_state = VDEV_STATE_CANT_OPEN; - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); return (EINVAL); } @@ -806,9 +802,8 @@ vdev_open(vdev_t *vd) * Make sure the device hasn't shrunk. */ if (asize < vd->vdev_asize) { - dprintf("%s: device shrank\n", vdev_description(vd)); - vd->vdev_state = VDEV_STATE_CANT_OPEN; - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); return (EINVAL); } @@ -818,11 +813,29 @@ vdev_open(vdev_t *vd) */ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize) { - dprintf("%s: device grew\n", vdev_description(vd)); vd->vdev_asize = asize; } } + /* + * If we were able to open a vdev that was marked permanently + * unavailable, clear that state now. + */ + if (vd->vdev_not_present) + vd->vdev_not_present = 0; + + /* + * This allows the ZFS DE to close cases appropriately. If a device + * goes away and later returns, we want to close the associated case. + * But it's not enough to simply post this only when a device goes from + * CANT_OPEN -> HEALTHY. If we reboot the system and the device is + * back, we also need to close the case (otherwise we will try to replay + * it). So we have to post this notifier every time. Since this only + * occurs during pool open or error recovery, this should not be an + * issue. 
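The `zio_injection_enabled` check added to `vdev_open()` (and to the disk/file I/O-done paths later in this diff) gates the injection framework behind a global flag, so when no faults are registered the common case pays only a load and a branch. The gate pattern in isolation, with a stub standing in for `zio_handle_device_injection()`:

```c
#include <errno.h>
#include <stdio.h>

static unsigned int injection_enabled;	/* bumped when a handler registers */

/* stand-in for zio_handle_device_injection(): match registered faults */
static int
handle_injection(int err_to_inject)
{
	return (err_to_inject);
}

static int
device_open(void)
{
	int error = 0;	/* pretend the real open succeeded */

	/* only consult the framework when it can possibly matter */
	if (injection_enabled && error == 0)
		error = handle_injection(ENXIO);
	return (error);
}

int
main(void)
{
	printf("open: %d\n", device_open());	/* 0: no faults registered */
	injection_enabled = 1;
	printf("open: %d\n", device_open());	/* ENXIO: fault injected */
	return (0);
}
```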
+ */ + zfs_post_ok(vd->vdev_spa, vd); + return (0); } @@ -832,8 +845,6 @@ vdev_open(vdev_t *vd) void vdev_close(vdev_t *vd) { - ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL); - vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_cache_active) { @@ -846,43 +857,29 @@ vdev_close(vdev_t *vd) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void -vdev_reopen(vdev_t *vd, zio_t **rq) +vdev_reopen(vdev_t *vd) { - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int c; + ASSERT(spa_config_held(spa, RW_WRITER)); + if (vd == rvd) { - ASSERT(rq == NULL); for (c = 0; c < rvd->vdev_children; c++) - vdev_reopen(rvd->vdev_child[c], NULL); + vdev_reopen(rvd->vdev_child[c]); return; } /* only valid for top-level vdevs */ ASSERT3P(vd, ==, vd->vdev_top); - /* - * vdev_state can change when spa_config_lock is held as writer, - * or when it's held as reader and we're doing a vdev_reopen(). - * To handle the latter case, we grab rvd's io_lock to serialize - * reopens. This ensures that there's never more than one vdev - * state changer active at a time. - */ - mutex_enter(&rvd->vdev_io_lock); - - mutex_enter(&vd->vdev_io_lock); - while (list_head(&vd->vdev_io_pending) != NULL) - cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock); vdev_close(vd); (void) vdev_open(vd); - if (rq != NULL) { - *rq = vd->vdev_io_retry; - vd->vdev_io_retry = NULL; - } - mutex_exit(&vd->vdev_io_lock); /* * Reassess root vdev's health. @@ -892,8 +889,6 @@ vdev_reopen(vdev_t *vd, zio_t **rq) uint64_t state = rvd->vdev_child[c]->vdev_state; rvd->vdev_state = MIN(rvd->vdev_state, state); } - - mutex_exit(&rvd->vdev_io_lock); } int @@ -930,7 +925,7 @@ vdev_create(vdev_t *vd, uint64_t txg) * For creation, we want to try to create all vdevs at once and then undo it * if anything fails; this is much harder if we have pending transactions. */ -void +int vdev_init(vdev_t *vd, uint64_t txg) { /* @@ -942,7 +937,7 @@ vdev_init(vdev_t *vd, uint64_t txg) /* * Initialize the vdev's metaslabs. */ - vdev_metaslab_init(vd, txg); + return (vdev_metaslab_init(vd, txg)); } void @@ -993,9 +988,10 @@ vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { + spa_t *spa = vd->vdev_spa; int c; - ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER)); + ASSERT(spa_config_held(spa, RW_WRITER)); if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); @@ -1019,6 +1015,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) return; } + /* + * Make sure the DTLs are always correct under the scrub lock. 
+ */ + if (vd == spa->spa_root_vdev) + mutex_enter(&spa->spa_scrub_lock); + mutex_enter(&vd->vdev_dtl_lock); space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); @@ -1032,6 +1034,9 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); mutex_exit(&vd->vdev_dtl_lock); } + + if (vd == spa->spa_root_vdev) + mutex_exit(&spa->spa_scrub_lock); } static int @@ -1047,11 +1052,12 @@ vdev_dtl_load(vdev_t *vd) if (smo->smo_object == 0) return (0); - db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object); - dmu_buf_read(db); + if ((error = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object, + FTAG, &db)) != 0) + return (error); ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(db->db_data, smo, db->db_size); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC, @@ -1100,8 +1106,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) vdev_config_dirty(vd->vdev_top); } - dmu_free_range(spa->spa_meta_objset, smo->smo_object, - 0, smo->smo_objsize, tx); + VERIFY(0 == dmu_free_range(spa->spa_meta_objset, smo->smo_object, + 0, smo->smo_objsize, tx)); mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); @@ -1124,17 +1130,18 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) mutex_exit(&smlock); mutex_destroy(&smlock); - db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object); + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object, + FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(smo, db->db_data, db->db_size); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } int -vdev_load(vdev_t *vd, int import) +vdev_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int c, error; @@ -1147,7 +1154,7 @@ vdev_load(vdev_t *vd, int import) * Recursively load all children. 
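Throughout this change, `dmu_bonus_hold()` and `dmu_buf_hold()` grow a tag argument plus an error return, and every hold is paired with a `dmu_buf_rele()` carrying the same tag (`FTAG` names the calling function). A toy version of the tagged-hold discipline, simplified to a single remembered tag where the real dbuf layer tracks holders more thoroughly:

```c
#include <assert.h>
#include <stdio.h>

typedef struct buf {
	int b_holds;
	const void *b_tag;	/* toy: remembers only the last hold's tag */
} buf_t;

static int
buf_hold(buf_t *b, const void *tag)
{
	b->b_holds++;
	b->b_tag = tag;
	return (0);		/* the real dmu_buf_hold() can return EIO */
}

static void
buf_rele(buf_t *b, const void *tag)
{
	assert(b->b_tag == tag);	/* mismatched hold/rele pairs trip here */
	b->b_holds--;
}

#define	FTAG	((const void *)__func__)	/* identify holds by caller */

int
main(void)
{
	buf_t b = { 0, NULL };

	if (buf_hold(&b, FTAG) == 0) {
		/* ... read the buffer ... */
		buf_rele(&b, FTAG);
	}
	printf("outstanding holds: %d\n", b.b_holds);
	return (0);
}
```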
*/ for (c = 0; c < vd->vdev_children; c++) - if ((error = vdev_load(vd->vdev_child[c], import)) != 0) + if ((error = vdev_load(vd->vdev_child[c])) != 0) return (error); /* @@ -1166,7 +1173,7 @@ vdev_load(vdev_t *vd, int import) */ if ((label = vdev_label_read_config(vd)) == NULL) { dprintf("can't load label config\n"); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (0); } @@ -1174,7 +1181,7 @@ vdev_load(vdev_t *vd, int import) if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa)) { dprintf("bad or missing pool GUID (%llu)\n", guid); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); @@ -1184,7 +1191,7 @@ vdev_load(vdev_t *vd, int import) guid != vd->vdev_guid) { dprintf("bad or missing vdev guid (%llu != %llu)\n", guid, vd->vdev_guid); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); @@ -1201,14 +1208,15 @@ vdev_load(vdev_t *vd, int import) if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state)) { dprintf("missing pool state\n"); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (state != POOL_STATE_ACTIVE && - (!import || state != POOL_STATE_EXPORTED)) { + (spa->spa_load_state == SPA_LOAD_OPEN || + state != POOL_STATE_EXPORTED)) { dprintf("pool state not active (%llu)\n", state); nvlist_free(label); return (EBADF); @@ -1227,12 +1235,16 @@ vdev_load(vdev_t *vd, int import) vd->vdev_ms_shift == 0 || vd->vdev_ashift == 0 || vd->vdev_asize == 0) { - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (0); } - vdev_metaslab_init(vd, 0); + if ((error = vdev_metaslab_init(vd, 0)) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (0); + } } /* @@ -1243,7 +1255,7 @@ vdev_load(vdev_t *vd, int import) if (error) { dprintf("can't load DTL for %s, error %d\n", vdev_description(vd), error); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (0); } @@ -1344,7 +1356,7 @@ vdev_description(vdev_t *vd) } int -vdev_online(spa_t *spa, const char *path) +vdev_online(spa_t *spa, uint64_t guid) { vdev_t *rvd, *vd; uint64_t txg; @@ -1352,24 +1364,14 @@ vdev_online(spa_t *spa, const char *path) txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; - if ((vd = vdev_lookup_by_path(rvd, path)) == NULL) + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); dprintf("ONLINE: %s\n", vdev_description(vd)); vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; - - /* - * Clear the error counts. The idea is that you expect to see all - * zeroes when everything is working, so if you've just onlined a - * device, you don't want to keep hearing about errors from before. 
- */ - vd->vdev_stat.vs_read_errors = 0; - vd->vdev_stat.vs_write_errors = 0; - vd->vdev_stat.vs_checksum_errors = 0; - - vdev_reopen(vd->vdev_top, NULL); + vdev_reopen(vd->vdev_top); spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); @@ -1383,7 +1385,7 @@ vdev_online(spa_t *spa, const char *path) } int -vdev_offline(spa_t *spa, const char *path, int istmp) +vdev_offline(spa_t *spa, uint64_t guid, int istmp) { vdev_t *rvd, *vd; uint64_t txg; @@ -1391,7 +1393,7 @@ vdev_offline(spa_t *spa, const char *path, int istmp) txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; - if ((vd = vdev_lookup_by_path(rvd, path)) == NULL) + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); dprintf("OFFLINE: %s\n", vdev_description(vd)); @@ -1416,10 +1418,10 @@ vdev_offline(spa_t *spa, const char *path, int istmp) * undo it and fail the request. */ vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top, NULL); + vdev_reopen(vd->vdev_top); if (vdev_is_dead(vd->vdev_top)) { vd->vdev_offline = B_FALSE; - vdev_reopen(vd->vdev_top, NULL); + vdev_reopen(vd->vdev_top); return (spa_vdev_exit(spa, NULL, txg, EBUSY)); } @@ -1434,25 +1436,25 @@ vdev_offline(spa_t *spa, const char *path, int istmp) return (spa_vdev_exit(spa, NULL, txg, 0)); } -int -vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg) +/* + * Clear the error counts associated with this vdev. Unlike vdev_online() and + * vdev_offline(), we assume the spa config is locked. We also clear all + * children. If 'vd' is NULL, then the user wants to clear all vdevs. + */ +void +vdev_clear(spa_t *spa, vdev_t *vd) { - vdev_t *vd; - - spa_config_enter(spa, RW_WRITER); - - if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) { - spa_config_exit(spa); - return (ENODEV); - } + int c; - vd->vdev_fault_mode = mode; - vd->vdev_fault_mask = mask; - vd->vdev_fault_arg = arg; + if (vd == NULL) + vd = spa->spa_root_vdev; - spa_config_exit(spa); + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; - return (0); + for (c = 0; c < vd->vdev_children; c++) + vdev_clear(spa, vd->vdev_child[c]); } int @@ -1631,24 +1633,6 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) } /* - * Report checksum errors that a vdev that didn't realize it made. - * This can happen, for example, when RAID-Z combinatorial reconstruction - * infers that one of its components returned bad data. - */ -void -vdev_checksum_error(zio_t *zio, vdev_t *vd) -{ - dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", - vdev_description(vd)); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - } -} - -/* * Update the in-core space usage stats for this vdev and the root vdev. */ void @@ -1709,6 +1693,14 @@ static vdev_knob_t vdev_knob[] = { offsetof(struct vdev, vdev_queue.vq_max_pending) }, { + "scrub_limit", + "maximum scrub/resilver I/O queue", + 0, + 10000, + 70, + offsetof(struct vdev, vdev_queue.vq_scrub_limit) + }, + { "agg_limit", "maximum size of aggregated I/Os", 0, @@ -1781,20 +1773,78 @@ vdev_config_clean(vdev_t *vd) } /* - * Set a vdev's state, updating any parent's state as well. + * Set a vdev's state. If this is during an open, we don't update the parent + * state, because we're in the process of opening children depth-first. + * Otherwise, we propagate the change to the parent. 
+ * + * If this routine places a device in a faulted state, an appropriate ereport is + * generated. */ void -vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux) +vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { - if (state == vd->vdev_state) + uint64_t prev_state; + + if (state == vd->vdev_state) { + vd->vdev_stat.vs_aux = aux; return; + } + + prev_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; + if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we fail to open a vdev during an import, we mark it as + * "not available", which signifies that it was never there to + * begin with. Failure to open such a device is not considered + * an error. + */ + if (!vd->vdev_not_present && + vd != vd->vdev_spa->spa_root_vdev) { + const char *class; + + switch (aux) { + case VDEV_AUX_OPEN_FAILED: + class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; + break; + case VDEV_AUX_CORRUPT_DATA: + class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; + break; + case VDEV_AUX_NO_REPLICAS: + class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; + break; + case VDEV_AUX_BAD_GUID_SUM: + class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; + break; + case VDEV_AUX_TOO_SMALL: + class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; + break; + case VDEV_AUX_BAD_LABEL: + class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; + break; + default: + class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; + } + + zfs_ereport_post(class, vd->vdev_spa, + vd, NULL, prev_state, 0); + } + + if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT && + vd->vdev_ops->vdev_op_leaf) + vd->vdev_not_present = 1; + } + + if (isopen) + return; + if (vd->vdev_parent != NULL) { int c; int degraded = 0, faulted = 0; + int corrupted = 0; vdev_t *parent, *child; parent = vd->vdev_parent; @@ -1804,9 +1854,23 @@ vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux) faulted++; else if (child->vdev_state == VDEV_STATE_DEGRADED) degraded++; + + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; } vd->vdev_parent->vdev_ops->vdev_op_state_change( vd->vdev_parent, faulted, degraded); - } + + /* + * Root special: if this is a toplevel vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == vd->vdev_top) + vdev_set_state(vd->vdev_spa->spa_root_vdev, + B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } } diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c index e1e7c1a36f..67a8924b52 100644 --- a/usr/src/uts/common/fs/zfs/vdev_cache.c +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
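The switch added to `vdev_set_state()` maps each `VDEV_AUX_*` reason to an FMA ereport class before posting. Since the mapping is a pure aux-to-string lookup, a table-driven equivalent is a natural alternative shape; the sketch below uses placeholder class strings and a trimmed enum, as the real values live in sys/fm/fs/zfs.h:

```c
#include <stdio.h>

/* trimmed stand-ins for vdev_aux_t and the FM_EREPORT_ZFS_* classes */
enum aux { AUX_OPEN_FAILED, AUX_CORRUPT_DATA, AUX_NO_REPLICAS, AUX_UNKNOWN };

static const char *const aux_class[] = {
	"ereport.fs.zfs.device.open_failed",	/* assumed class strings */
	"ereport.fs.zfs.device.corrupt_data",
	"ereport.fs.zfs.device.no_replicas",
	"ereport.fs.zfs.device.unknown",
};

static const char *
class_for(enum aux a)
{
	if (a < 0 || a > AUX_UNKNOWN)
		a = AUX_UNKNOWN;	/* the switch's default case */
	return (aux_class[a]);
}

int
main(void)
{
	printf("%s\n", class_for(AUX_CORRUPT_DATA));
	return (0);
}
```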
*/ @@ -286,7 +285,8 @@ vdev_cache_read(zio_t *zio) fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, vdev_cache_fill, ve); ve->ve_fill_io = fio; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 1556c387b2..b4d7d7a0d2 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -323,6 +323,9 @@ vdev_disk_io_done(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + zio_next_stage(zio); } diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index a789008e17..a82abf80b7 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -190,6 +189,9 @@ vdev_file_io_done(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + zio_next_stage(zio); } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 1282df0d9a..3571be9064 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -165,8 +165,8 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY)); + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); } static void @@ -178,8 +178,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL)); } /* @@ -190,7 +189,7 @@ vdev_config_generate(vdev_t *vd, int getstats) { nvlist_t *nv = NULL; - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); @@ -209,6 +208,9 @@ vdev_config_generate(vdev_t *vd, int getstats) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk) == 0); + if (vd->vdev_not_present) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); + if (vd == vd->vdev_top) { VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array) == 0); @@ -269,7 +271,6 @@ vdev_label_read_config(vdev_t *vd) { nvlist_t *config = NULL; vdev_phys_t *vp; - uint64_t version; zio_t *zio; int l; @@ -280,8 +281,8 @@ vdev_label_read_config(vdev_t *vd) for (l = 0; l < VDEV_LABELS; l++) { - zio = zio_root(vd->vdev_spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD); + zio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD); vdev_label_read(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), @@ -289,10 +290,7 @@ vdev_label_read_config(vdev_t *vd) if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), - &config, 0) == 0 && - nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &version) == 0 && - version == UBERBLOCK_VERSION) + &config, 0) == 0) break; if (config != NULL) { @@ -341,16 +339,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg) * Check whether this device is already in use. * Ignore the check if crtxg == 0, which we use for device removal. 
*/ - if (crtxg != 0 && (label = vdev_label_read_config(vd)) != NULL) { - uint64_t version, state, pool_guid, device_guid, txg; + if (crtxg != 0 && + (label = vdev_label_read_config(vd)) != NULL) { + uint64_t state, pool_guid, device_guid, txg; uint64_t mycrtxg = 0; (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, &mycrtxg); - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, - &version) == 0 && version == UBERBLOCK_VERSION && - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) == 0 && state == POOL_STATE_ACTIVE && nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0 && @@ -390,7 +387,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg) buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) { + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) != 0) { nvlist_free(label); zio_buf_free(vp, sizeof (vdev_phys_t)); return (EINVAL); @@ -491,7 +488,7 @@ vdev_uberblock_load_done(zio_t *zio) ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t)); - if (uberblock_verify(ub) == 0) { + if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&spa->spa_uberblock_lock); if (vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; @@ -645,7 +642,7 @@ vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) == 0) + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) vdev_label_write(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_sync_label_done, NULL); diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index 45eb7ce78b..b88b999c6f 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -209,7 +208,8 @@ vdev_mirror_io_start(zio_t *zio) mm = vdev_mirror_map_alloc(zio); if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_flags & ZIO_FLAG_SCRUB) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && + vd->vdev_ops != &vdev_replacing_ops) { /* * For scrubbing reads we need to allocate a read * buffer for each child and issue reads to all @@ -384,11 +384,12 @@ static void vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted == vd->vdev_children) - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) - vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else - vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } vdev_ops_t vdev_mirror_ops = { diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 09831e1504..bb838fedd1 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -103,6 +102,8 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; + ASSERT(vq->vq_scrub_count == 0); + avl_destroy(&vq->vq_deadline_tree); avl_destroy(&vq->vq_read_tree); avl_destroy(&vq->vq_write_tree); @@ -112,6 +113,28 @@ vdev_queue_fini(vdev_t *vd) } static void +vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) +{ + avl_add(&vq->vq_deadline_tree, zio); + avl_add(zio->io_vdev_tree, zio); + + if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && + ++vq->vq_scrub_count >= vq->vq_scrub_limit) + spa_scrub_throttle(zio->io_spa, 1); +} + +static void +vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) +{ + if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && + vq->vq_scrub_count-- >= vq->vq_scrub_limit) + spa_scrub_throttle(zio->io_spa, -1); + + avl_remove(&vq->vq_deadline_tree, zio); + avl_remove(zio->io_vdev_tree, zio); +} + +static void vdev_queue_agg_io_done(zio_t *aio) { zio_t *dio; @@ -182,18 +205,19 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, aio = zio_vdev_child_io(fio, NULL, fio->io_vd, fio->io_offset, buf, size, fio->io_type, ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_NOBOOKMARK, vdev_queue_agg_io_done, NULL); aio->io_delegate_list = fio; for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { ASSERT(dio->io_type == aio->io_type); + ASSERT(dio->io_vdev_tree == tree); if (dio->io_type == ZIO_TYPE_WRITE) bcopy(dio->io_data, buf + offset, dio->io_size); offset += dio->io_size; - avl_remove(&vq->vq_deadline_tree, dio); - avl_remove(tree, dio); + vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); nagg++; } @@ -211,8 +235,8 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, return (aio); } - avl_remove(&vq->vq_deadline_tree, fio); - avl_remove(tree, fio); + ASSERT(fio->io_vdev_tree == tree); + vdev_queue_io_remove(vq, fio); avl_add(&vq->vq_pending_tree, fio); @@ -245,8 +269,7 @@ vdev_queue_io(zio_t *zio) zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) + zio->io_priority; - avl_add(&vq->vq_deadline_tree, zio); - avl_add(zio->io_vdev_tree, zio); + vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func); diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index c2c4985856..157ae5001c 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,6 +31,7 @@ #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> /* * Virtual device vector for RAID-Z. @@ -327,6 +327,28 @@ vdev_raidz_io_start(zio_t *zio) zio_wait_children_done(zio); } +/* + * Report a checksum error for a child of a RAID-Z device. 
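The new `vdev_queue_io_add()`/`vdev_queue_io_remove()` helpers keep a running count of queued scrub/resilver I/Os and nudge `spa_scrub_throttle()` by +1 or -1 whenever the count crosses `vq_scrub_limit`. The pre-increment on add and post-decrement on remove are deliberately paired so the throttle calls balance out; a toy model demonstrating that pairing:

```c
#include <stdio.h>

typedef struct queue {
	int q_scrub_count;
	int q_scrub_limit;
	int q_throttled;	/* stands in for spa_scrub_throttle() state */
} queue_t;

static void
queue_add(queue_t *q, int is_scrub)
{
	/* fires for every add that lands at or above the limit */
	if (is_scrub && ++q->q_scrub_count >= q->q_scrub_limit)
		q->q_throttled++;
}

static void
queue_remove(queue_t *q, int is_scrub)
{
	/* fires for every remove that starts at or above the limit */
	if (is_scrub && q->q_scrub_count-- >= q->q_scrub_limit)
		q->q_throttled--;
}

int
main(void)
{
	queue_t q = { 0, 3, 0 };
	int i;

	for (i = 0; i < 5; i++)
		queue_add(&q, 1);
	printf("count %d, throttled %d\n", q.q_scrub_count, q.q_throttled);
	for (i = 0; i < 5; i++)
		queue_remove(&q, 1);
	printf("count %d, throttled %d\n", q.q_scrub_count, q.q_throttled);
	return (0);
}
```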
+ */ +static void +raidz_checksum_error(zio_t *zio, raidz_col_t *rc) +{ + vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col]; + dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", + vdev_description(vd)); + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); +} + + static void vdev_raidz_io_done(zio_t *zio) { @@ -398,8 +420,7 @@ vdev_raidz_io_done(zio_t *zio) bcopy(rc->rc_data, orig, rc->rc_size); vdev_raidz_reconstruct(rm, c); if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) { - vdev_checksum_error(zio, - vd->vdev_child[rc->rc_col]); + raidz_checksum_error(zio, rc); rc->rc_error = ECKSUM; unexpected_errors++; } @@ -500,8 +521,7 @@ vdev_raidz_io_done(zio_t *zio) * inform it. */ if (rc->rc_tried && rc->rc_error == 0) - vdev_checksum_error(zio, - vd->vdev_child[rc->rc_col]); + raidz_checksum_error(zio, rc); rc->rc_error = ECKSUM; goto done; } @@ -511,9 +531,18 @@ vdev_raidz_io_done(zio_t *zio) } /* - * All combinations failed to checksum. + * All combinations failed to checksum. Generate checksum ereports for + * every one. */ zio->io_error = ECKSUM; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd->vdev_child[rc->rc_col], zio, + rc->rc_offset, rc->rc_size); + } + } done: zio_checksum_verified(zio); @@ -558,11 +587,12 @@ static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted > 1) - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) - vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else - vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } vdev_ops_t vdev_raidz_ops = { diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c index 4e44b5bb05..85671d00b1 100644 --- a/usr/src/uts/common/fs/zfs/vdev_root.c +++ b/usr/src/uts/common/fs/zfs/vdev_root.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -79,11 +78,12 @@ static void vdev_root_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted > 0) - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); else if (degraded != 0) - vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else - vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } vdev_ops_t vdev_root_ops = { diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c index 2866b7f729..8dc17ed4b1 100644 --- a/usr/src/uts/common/fs/zfs/zap.c +++ b/usr/src/uts/common/fs/zfs/zap.c @@ -45,6 +45,7 @@ #include <sys/dmu.h> #include <sys/zfs_context.h> #include <sys/zap.h> +#include <sys/refcount.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> @@ -54,8 +55,8 @@ int fzap_default_block_shift = 14; /* 16k blocksize */ static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx); static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx); -static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, - dmu_tx_t *tx, krw_t lt); +static int zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, + dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp); static void zap_leaf_pageout(dmu_buf_t *db, void *vl); @@ -120,8 +121,8 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) /* * set up block 1 - the first leaf */ - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - 1<<FZAP_BLOCK_SHIFT(zap)); + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db)); dmu_buf_will_dirty(db, tx); l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); @@ -131,7 +132,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) zap_leaf_init(l); kmem_free(l, sizeof (zap_leaf_t)); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } static int @@ -157,6 +158,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, { uint64_t b, newblk; dmu_buf_t *db_old, *db_new; + int err; int bs = FZAP_BLOCK_SHIFT(zap); int hepb = 1<<(bs-4); /* hepb = half the number of entries in a block */ @@ -181,26 +183,27 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, */ b = tbl->zt_blks_copied; - db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + b) << bs); - dmu_buf_read(db_old); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + b) << bs, FTAG, &db_old); + if (err) + return; /* first half of entries in old[b] go to new[2*b+0] */ - db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+0) << bs); + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+0) << bs, FTAG, &db_new)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); - dmu_buf_rele(db_new); + dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+1) << bs); + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+1) << bs, FTAG, &db_new)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, db_new->db_data, hepb); - dmu_buf_rele(db_new); + dmu_buf_rele(db_new, FTAG); - dmu_buf_rele(db_old); + dmu_buf_rele(db_old, FTAG); tbl->zt_blks_copied++; @@ -208,7 +211,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, tbl->zt_blks_copied, tbl->zt_numblks); if (tbl->zt_blks_copied == tbl->zt_numblks) { - dmu_free_range(zap->zap_objset, zap->zap_object, + (void) 
dmu_free_range(zap->zap_objset, zap->zap_object, tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); tbl->zt_blk = newblk; @@ -222,13 +225,14 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, } } -static uint64_t +static int zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_tx_t *tx) { - uint64_t blk, off, oldval; - dmu_buf_t *db; + int err; + uint64_t blk, off; int bs = FZAP_BLOCK_SHIFT(zap); + dmu_buf_t *db; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); @@ -238,33 +242,41 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, blk = idx >> (bs-3); off = idx & ((1<<(bs-3))-1); - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db); + if (err) + return (err); dmu_buf_will_dirty(db, tx); - oldval = ((uint64_t *)db->db_data)[off]; - ((uint64_t *)db->db_data)[off] = val; - dmu_buf_rele(db); if (tbl->zt_nextblk != 0) { - idx *= 2; - blk = idx >> (bs-3); - off = idx & ((1<<(bs-3))-1); - - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk) << bs); - dmu_buf_will_dirty(db, tx); - ((uint64_t *)db->db_data)[off] = val; - ((uint64_t *)db->db_data)[off+1] = val; - dmu_buf_rele(db); + uint64_t idx2 = idx * 2; + uint64_t blk2 = idx2 >> (bs-3); + uint64_t off2 = idx2 & ((1<<(bs-3))-1); + dmu_buf_t *db2; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2); + if (err) { + dmu_buf_rele(db, FTAG); + return (err); + } + dmu_buf_will_dirty(db2, tx); + ((uint64_t *)db2->db_data)[off2] = val; + ((uint64_t *)db2->db_data)[off2+1] = val; + dmu_buf_rele(db2, FTAG); } - return (oldval); + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db, FTAG); + + return (0); } -static uint64_t -zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx) +static int +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) { - uint64_t blk, off, val; + uint64_t blk, off; + int err; dmu_buf_t *db; int bs = FZAP_BLOCK_SHIFT(zap); @@ -273,12 +285,26 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx) blk = idx >> (bs-3); off = idx & ((1<<(bs-3))-1); - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs); - dmu_buf_read(db); - val = ((uint64_t *)db->db_data)[off]; - dmu_buf_rele(db); - return (val); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db); + if (err) + return (err); + *valp = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db, FTAG); + + if (tbl->zt_nextblk != 0) { + /* + * read the nextblk for the sake of i/o error checking, + * so that zap_table_load() will catch errors for + * zap_table_store. 
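While a pointer table is doubling, old entry `idx` fans out to new entries `2*idx` and `2*idx+1`, which is why `zap_table_store()` above mirrors the value into the half-grown next table and why the hunk below makes `zap_table_load()` touch the nextblk too. The index arithmetic, worked through for the default 16K block (entries are 8 bytes, so each block holds 2^(bs-3) of them):

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int bs = 14;			/* FZAP block shift: 16K blocks */
	uint64_t idx = 12345;
	uint64_t blk  = idx >> (bs - 3);		/* block within old table */
	uint64_t off  = idx & ((1ULL << (bs - 3)) - 1);	/* entry within block */
	uint64_t idx2 = idx * 2;			/* first new-table entry */

	printf("old: block %llu, offset %llu\n",
	    (unsigned long long)blk, (unsigned long long)off);
	printf("new: entries %llu and %llu, block %llu\n",
	    (unsigned long long)idx2, (unsigned long long)(idx2 + 1),
	    (unsigned long long)(idx2 >> (bs - 3)));
	return (0);
}
```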
+ */ + blk = (idx*2) >> (bs-3); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk) << bs, FTAG, &db); + dmu_buf_rele(db, FTAG); + } + return (err); } /* @@ -310,19 +336,21 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) */ uint64_t newblk; dmu_buf_t *db_new; + int err; ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); newblk = zap_allocate_blocks(zap, 1, tx); - db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, - newblk << FZAP_BLOCK_SHIFT(zap)); - + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new); + if (err) + return; dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - dmu_buf_rele(db_new); + dmu_buf_rele(db_new, FTAG); zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; @@ -386,8 +414,8 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_dbuf = NULL; l->l_phys = NULL; - l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap)); + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf)); winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); @@ -403,7 +431,7 @@ zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) { /* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */ rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf); + dmu_buf_rele(l->l_dbuf, NULL); /* XXX there are still holds on this block, so we can't free it? */ /* dmu_free_range(zap->zap_objset, zap->zap_object, */ /* offset, 1<<ZAP_BLOCK_SHIFT, tx); */ @@ -430,11 +458,11 @@ zap_put_leaf(zap_leaf_t *l) while (nl) { zap_leaf_t *nnl = nl->l_next; rw_exit(&nl->l_rwlock); - dmu_buf_rele(nl->l_dbuf); + dmu_buf_rele(nl->l_dbuf, NULL); nl = nnl; } rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf); + dmu_buf_rele(l->l_dbuf, NULL); } _NOTE(ARGSUSED(0)) @@ -489,23 +517,27 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) return (l); } -static zap_leaf_t * -zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt) +static int +zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) { dmu_buf_t *db; zap_leaf_t *l; int bs = FZAP_BLOCK_SHIFT(zap); + int err; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, blkid << bs); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + blkid << bs, NULL, &db); + if (err) + return (err); ASSERT3U(db->db_object, ==, zap->zap_object); ASSERT3U(db->db_offset, ==, blkid << bs); ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); - dmu_buf_read(db); l = dmu_buf_get_user(db); if (l == NULL) @@ -524,43 +556,53 @@ zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt) ASSERT3U(l->lh_block_type, ==, ZBT_LEAF); ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC); - return (l); + *lp = l; + return (0); } -static zap_leaf_t * -zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt) +static int +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) { - zap_leaf_t *l, *nl; + int err; + zap_leaf_t *nl; - l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt); + err = zap_get_leaf_byblk_impl(zap, blkid, tx, lt, lp); + if (err) + return (err); - nl = l; + nl = *lp; while 
(nl->lh_next != 0) { zap_leaf_t *nnl; - nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt); + err = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt, &nnl); + if (err) { + zap_put_leaf(*lp); + return (err); + } nl->l_next = nnl; nl = nnl; } - return (l); + return (err); } -static uint64_t -zap_idx_to_blk(zap_t *zap, uint64_t idx) +static int +zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { ASSERT3U(idx, <, (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); - return (ZAP_EMBEDDED_PTRTBL_ENT(zap, idx)); + *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); + return (0); } else { return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, - idx)); + idx, valp)); } } -static void +static int zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) { ASSERT(tx != NULL); @@ -568,32 +610,37 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; + return (0); } else { - (void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, - idx, blk, tx); + return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + idx, blk, tx)); } } -static zap_leaf_t * -zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt) +static int +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { - uint64_t idx; - zap_leaf_t *l; + uint64_t idx, blk; + int err; ASSERT(zap->zap_dbuf == NULL || zap->zap_f.zap_phys == zap->zap_dbuf->db_data); ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); - l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt); - - ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix); + err = zap_idx_to_blk(zap, idx, &blk); + if (err != 0) + return (err); + err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - return (l); + ASSERT(err || + ZAP_HASH_IDX(h, (*lp)->lh_prefix_len) == (*lp)->lh_prefix); + return (err); } -static zap_leaf_t * -zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) +static int +zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, + zap_leaf_t **lp) { zap_leaf_t *nl; int prefix_diff, i, err; @@ -616,11 +663,13 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap); ASSERT3U(err, ==, 0); ASSERT(!zap->zap_ismicro); - l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + (void) zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (l->lh_prefix_len != old_prefix_len) + if (l->lh_prefix_len != old_prefix_len) { /* it split while our locks were down */ - return (l); + *lp = l; + return (0); + } } ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -629,21 +678,33 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) (void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx)); dprintf("chaining leaf %x/%d\n", l->lh_prefix, l->lh_prefix_len); - return (l); + *lp = l; + return (0); } ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix); /* There's more than one pointer to us. Split this leaf. 
*/ - nl = zap_leaf_split(zap, l, tx); /* set sibling pointers */ prefix_diff = - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len; - sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff; + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - (l->lh_prefix_len + 1); + sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len + 1) | 1) << prefix_diff; + + /* check for i/o errors before doing zap_leaf_split */ for (i = 0; i < (1ULL<<prefix_diff); i++) { - ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid); - zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + uint64_t blk; + err = zap_idx_to_blk(zap, sibling+i, &blk); + if (err) + return (err); + ASSERT3U(blk, ==, l->l_blkid); + } + + nl = zap_leaf_split(zap, l, tx); + + for (i = 0; i < (1ULL<<prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + ASSERT3U(err, ==, 0); /* we checked for i/o errors above */ /* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */ } @@ -657,7 +718,8 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) zap_put_leaf(nl); } - return (l); + *lp = l; + return (0); } static void @@ -682,7 +744,8 @@ again: err = zap_lockdir(os, zapobj, tx, RW_WRITER, FALSE, &zap); ASSERT3U(err, ==, 0); - l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER); + (void) zap_get_leaf_byblk(zap, blkid, tx, + RW_READER, &l); goto again; } @@ -734,7 +797,9 @@ fzap_lookup(zap_t *zap, const char *name, return (err); hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, NULL, RW_READER); + err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + if (err != 0) + return (err); err = zap_leaf_lookup(l, name, hash, &zeh); if (err != 0) goto out; @@ -747,7 +812,7 @@ out: int fzap_add_cd(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp) + const void *val, uint32_t cd, dmu_tx_t *tx) { zap_leaf_t *l; uint64_t hash; @@ -759,14 +824,17 @@ fzap_add_cd(zap_t *zap, const char *name, ASSERT(fzap_checksize(integer_size, num_integers) == 0); hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); retry: err = zap_leaf_lookup(l, name, hash, &zeh); if (err == 0) { err = EEXIST; goto out; } - ASSERT(err == ENOENT); + if (err != ENOENT) + goto out; /* XXX If this leaf is chained, split it if we can. 
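+ * (zap_entry_create() below returns EAGAIN when the leaf is full;
+ * the leaf is then expanded and the lookup retried.  An i/o error
+ * from the expansion is now returned through 'out' rather than
+ * causing a panic in the lower layers.)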
*/ err = zap_entry_create(l, name, hash, cd, @@ -775,15 +843,14 @@ retry: if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { - l = zap_expand_leaf(zap, l, hash, tx); + err = zap_expand_leaf(zap, l, hash, tx, &l); + if (err != 0) + goto out; goto retry; } out: - if (lp) - *lp = l; - else - zap_put_leaf(l); + zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); return (err); } @@ -793,16 +860,14 @@ fzap_add(zap_t *zap, const char *name, const void *val, dmu_tx_t *tx) { int err; - zap_leaf_t *l; err = fzap_checksize(integer_size, num_integers); if (err != 0) return (err); err = fzap_add_cd(zap, name, integer_size, num_integers, - val, ZAP_MAXCD, tx, &l); + val, ZAP_MAXCD, tx); - zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); return (err); } @@ -821,7 +886,9 @@ fzap_update(zap_t *zap, const char *name, return (err); hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); retry: err = zap_leaf_lookup(l, name, hash, &zeh); create = (err == ENOENT); @@ -839,10 +906,13 @@ retry: } if (err == EAGAIN) { - l = zap_expand_leaf(zap, l, hash, tx); + err = zap_expand_leaf(zap, l, hash, tx, &l); + if (err != 0) + goto out; goto retry; } +out: zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); return (err); } @@ -857,7 +927,9 @@ fzap_length(zap_t *zap, const char *name, zap_entry_handle_t zeh; hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, NULL, RW_READER); + err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + if (err != 0) + return (err); err = zap_leaf_lookup(l, name, hash, &zeh); if (err != 0) goto out; @@ -880,7 +952,9 @@ fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx) zap_entry_handle_t zeh; hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); err = zap_leaf_lookup(l, name, hash, &zeh); if (err == 0) { zap_entry_remove(&zeh); @@ -938,7 +1012,10 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) again: if (zc->zc_leaf == NULL) { - zc->zc_leaf = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER); + err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, + &zc->zc_leaf); + if (err != 0) + return (err); } else { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } @@ -982,7 +1059,7 @@ again: static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { - int i; + int i, err; uint64_t lastblk = 0; /* @@ -997,10 +1074,11 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) continue; lastblk = tbl[i]; - l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER); - - zap_stats_leaf(zap, l, zs); - zap_put_leaf(l); + err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + if (err == 0) { + zap_stats_leaf(zap, l, zs); + zap_put_leaf(l); + } } } @@ -1028,12 +1106,16 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; - - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs); - dmu_buf_read(db); - zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs); - dmu_buf_rele(db); + int err; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db); + if (err == 0) { + zap_stats_ptrtbl(zap, db->db_data, + 1<<(bs-3), zs); + dmu_buf_rele(db, FTAG); + } } } } diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c 
b/usr/src/uts/common/fs/zfs/zap_micro.c index 3e150b9b1d..2d3180e37f 100644 --- a/usr/src/uts/common/fs/zfs/zap_micro.c +++ b/usr/src/uts/common/fs/zfs/zap_micro.c @@ -29,6 +29,7 @@ #include <sys/dmu.h> #include <sys/zfs_context.h> #include <sys/zap.h> +#include <sys/refcount.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> #include <sys/avl.h> @@ -269,7 +270,9 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, *zapp = NULL; - db = dmu_buf_hold(os, obj, 0); + err = dmu_buf_hold(os, obj, 0, NULL, &db); + if (err) + return (err); #ifdef ZFS_DEBUG { @@ -279,12 +282,6 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, } #endif - /* - * The zap can deal with EIO here, but its callers don't yet, so - * spare them by doing a mustsucceed read. - */ - dmu_buf_read(db); - zap = dmu_buf_get_user(db); if (zap == NULL) zap = mzap_open(os, obj, db); @@ -340,7 +337,7 @@ void zap_unlockdir(zap_t *zap) { rw_exit(&zap->zap_rwlock); - dmu_buf_rele(zap->zap_dbuf); + dmu_buf_rele(zap->zap_dbuf, NULL); } static void @@ -375,7 +372,7 @@ mzap_upgrade(zap_t *zap, dmu_tx_t *tx) mze->mze_name, mze->mze_value); err = fzap_add_cd(zap, mze->mze_name, 8, 1, &mze->mze_value, - mze->mze_cd, tx, NULL); + mze->mze_cd, tx); ASSERT3U(err, ==, 0); } kmem_free(mzp, sz); @@ -411,7 +408,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) dmu_buf_t *db; mzap_phys_t *zp; - db = dmu_buf_hold(os, obj, 0); + VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); #ifdef ZFS_DEBUG { @@ -426,7 +423,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; ASSERT(zp->mz_salt != 0); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } int diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c index 69acccf493..c70986b853 100644 --- a/usr/src/uts/common/fs/zfs/zfs_acl.c +++ b/usr/src/uts/common/fs/zfs/zfs_acl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -288,25 +287,33 @@ zfs_acl_node_read_internal(znode_t *zp) /* * Read an external acl object. 
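+ * Returns 0 and fills in *aclpp on success, or an errno if reading
+ * the external acl fails.  The caller must hold z_acl_lock and, on
+ * success, eventually free the acl; roughly:
+ *
+ *	error = zfs_acl_node_read(zp, &aclp);
+ *	if (error == 0) {
+ *		... use aclp ...
+ *		zfs_acl_free(aclp);
+ *	}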
*/ -zfs_acl_t * -zfs_acl_node_read(znode_t *zp) +static int +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp) { uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; zfs_acl_t *aclp; + int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) - return (zfs_acl_node_read_internal(zp)); + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { + *aclpp = zfs_acl_node_read_internal(zp); + return (0); + } aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count); - dmu_read(zp->z_zfsvfs->z_os, extacl, 0, + error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl); + if (error != 0) { + zfs_acl_free(aclp); + return (error); + } aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - return (aclp); + *aclpp = aclp; + return (0); } static boolean_t @@ -868,15 +875,17 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, int zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) { - zfs_acl_t *aclp; + zfs_acl_t *aclp = NULL; int error; ASSERT(MUTEX_HELD(&zp->z_lock)); mutex_enter(&zp->z_acl_lock); - aclp = zfs_acl_node_read(zp); - error = zfs_acl_chmod(zp, mode, aclp, tx); + error = zfs_acl_node_read(zp, &aclp); + if (error == 0) + error = zfs_acl_chmod(zp, mode, aclp, tx); mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); + if (aclp) + zfs_acl_free(aclp); return (error); } @@ -1047,7 +1056,7 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE); if (pull_down) { mutex_enter(&parent->z_acl_lock); - paclp = zfs_acl_node_read(parent); + VERIFY(0 == zfs_acl_node_read(parent, &paclp)); mutex_exit(&parent->z_acl_lock); aclp = zfs_acl_inherit(zp, paclp); zfs_acl_free(paclp); @@ -1106,7 +1115,12 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) mutex_enter(&zp->z_acl_lock); - aclp = zfs_acl_node_read(zp); + error = zfs_acl_node_read(zp, &aclp); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = aclp->z_acl_count; @@ -1240,6 +1254,7 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) int mode_wanted = v4_mode; int cnt; int i; + int error; int access_deny = ACCESS_UNDETERMINED; uint_t entry_type; uid_t uid = crgetuid(cr); @@ -1257,7 +1272,12 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) mutex_enter(&zp->z_acl_lock); - aclp = zfs_acl_node_read(zp); + error = zfs_acl_node_read(zp, &aclp); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + zacep = aclp->z_acl; cnt = aclp->z_acl_count; diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c index ebdce10c33..d73315b47d 100644 --- a/usr/src/uts/common/fs/zfs/zfs_dir.c +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -289,6 +289,21 @@ zfs_dq_hexname(char namebuf[17], uint64_t x) return (name); } +/* + * Delete Queue Error Handling + * + * When dealing with the delete queue, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * delete queue, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the delete queue is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. 
+ * Therefore it is remotely possible for some of the assertions + * regarding the delete queue below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ + void zfs_dq_add(znode_t *zp, dmu_tx_t *tx) { @@ -338,9 +353,9 @@ zfs_purgedir(znode_t *dzp) tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, -1); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); dmu_tx_hold_bonus(tx, xzp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); @@ -579,10 +594,10 @@ zfs_rmnode(znode_t *zp) */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); if (xzp) { dmu_tx_hold_bonus(tx, xzp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, TRUE, NULL); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); @@ -764,7 +779,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { dmu_tx_abort(tx); diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c new file mode 100644 index 0000000000..007445c713 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_fm.c @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> + +#include <sys/fm/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/sysevent.h> + +/* + * This general routine is responsible for generating all the different ZFS + * ereports. The payload is dependent on the class, and which arguments are + * supplied to the function: + * + * EREPORT POOL VDEV IO + * block X X X + * data X X + * device X X + * pool X + * + * If we are in a loading state, all errors are chained together by the same + * SPA-wide ENA. + * + * For isolated I/O requests, we get the ENA from the zio_t. The propagation + * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want + * to chain together all ereports associated with a logical piece of data. 
For + * read I/Os, there are basically three 'types' of I/O, which form a roughly + * layered diagram: + * + * +---------------+ + * | Aggregate I/O | No associated logical data or device + * +---------------+ + * | + * V + * +---------------+ Reads associated with a piece of logical data. + * | Read I/O | This includes reads on behalf of RAID-Z, + * +---------------+ mirrors, gang blocks, retries, etc. + * | + * V + * +---------------+ Reads associated with a particular device, but + * | Physical I/O | no logical data. Issued as part of vdev caching + * +---------------+ and I/O aggregation. + * + * Note that 'physical I/O' here is not the same terminology as used in the rest + * of ZIO. Typically, 'physical I/O' simply means that there is no attached + * blockpointer. But I/O with no associated block pointer can still be related + * to a logical piece of data (i.e. RAID-Z requests). + * + * Purely physical I/Os always have unique ENAs. They are not related to a + * particular piece of logical data, and therefore cannot be chained together. + * We still generate an ereport, but the DE doesn't correlate it with any + * logical piece of data. When such an I/O fails, the delegated I/O requests + * will issue a retry, which will trigger the 'real' ereport with the correct + * ENA. + * + * We keep track of the ENA for a ZIO chain through the 'io_logical' member. + * When a new logical I/O is issued, we set this to point to itself. Child I/Os + * then inherit this pointer, so that, once it is first set, subsequent failures + * will use the same ENA. If a physical I/O is issued (by passing the + * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a + * unique ENA will be generated. For an aggregate I/O, this pointer is set to + * NULL, and no ereport will be generated (since it doesn't actually correspond + * to any particular device or piece of data). + */ +void +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ +#ifdef _KERNEL + nvlist_t *ereport, *detector; + uint64_t ena; + char class[64]; + + /* + * If we are doing a spa_tryimport(), ignore errors. + */ + if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + return; + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa->spa_load_state != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return; + + /* + * Ignore any errors from I/Os that we are going to retry anyway - we + * only generate errors from the final failure. + */ + if (zio && zio_should_retry(zio)) + return; + + if ((ereport = fm_nvlist_create(NULL)) == NULL) + return; + + if ((detector = fm_nvlist_create(NULL)) == NULL) { + fm_nvlist_destroy(ereport, FM_NVA_FREE); + return; + } + + /* + * Serialize ereport generation + */ + mutex_enter(&spa->spa_errlist_lock); + + /* + * Determine the ENA to use for this event. If we are in a loading + * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use + * a root zio-wide ENA. Otherwise, simply use a unique ENA. 
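+ * In outline:
+ *
+ *	pool load in progress      ->  SPA-wide ENA (spa_ena)
+ *	i/o with a logical parent  ->  ENA of the logical chain
+ *	                               (io_logical->io_ena)
+ *	anything else              ->  a fresh ENA for this event alone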
+ */ + if (spa->spa_load_state != SPA_LOAD_NONE) { + if (spa->spa_ena == 0) + spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); + ena = spa->spa_ena; + } else if (zio != NULL && zio->io_logical != NULL) { + if (zio->io_logical->io_ena == 0) + zio->io_logical->io_ena = + fm_ena_generate(0, FM_ENA_FMT1); + ena = zio->io_logical->io_ena; + } else { + ena = fm_ena_generate(0, FM_ENA_FMT1); + } + + /* + * Construct the full class, detector, and other standard FMA fields. + */ + (void) snprintf(class, sizeof (class), "%s.%s", + ZFS_ERROR_CLASS, subclass); + + fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), + vd != NULL ? vd->vdev_guid : 0); + + fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); + + /* + * Construct the per-ereport payload, depending on which parameters are + * passed in. + */ + + /* + * Generic payload members common to all ereports. + * + * The direct reference to spa_name is used rather than spa_name() + * because of the asynchronous nature of the zio pipeline. spa_name() + * asserts that the config lock is held in some form. This is always + * the case in I/O context, but because the check for RW_WRITER compares + * against 'curthread', we may be in an asynchronous context and blow + * this assert. Rather than loosen this assert, we acknowledge that all + * contexts in which this function is called (pool open, I/O) are safe, + * and dereference the name directly. + */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, + DATA_TYPE_STRING, spa->spa_name, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + DATA_TYPE_UINT64, spa_guid(spa), + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, + spa->spa_load_state, NULL); + + if (vd != NULL) { + vdev_t *pvd = vd->vdev_parent; + + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + DATA_TYPE_UINT64, vd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); + if (vd->vdev_path) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + DATA_TYPE_STRING, vd->vdev_path, NULL); + if (vd->vdev_devid) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, + DATA_TYPE_STRING, vd->vdev_devid, NULL); + + if (pvd != NULL) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, + DATA_TYPE_UINT64, pvd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, + DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, + NULL); + if (pvd->vdev_path) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, + DATA_TYPE_STRING, pvd->vdev_path, NULL); + if (pvd->vdev_devid) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, + DATA_TYPE_STRING, pvd->vdev_devid, NULL); + } + } + + if (zio != NULL) { + /* + * Payload common to all I/Os. + */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, + DATA_TYPE_INT32, zio->io_error, NULL); + + /* + * If the 'size' parameter is non-zero, it indicates this is a + * RAID-Z or other I/O where the physical offset and length are + * provided for us, instead of within the zio_t. + */ + if (vd != NULL) { + if (size) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, stateoroffset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, size, NULL); + else + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, zio->io_offset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, zio->io_size, NULL); + } + + /* + * Payload for I/Os with corresponding logical information. 
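+ * (the objset/object/level/blkid bookmark of the logical parent).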
+ */ + if (zio->io_logical != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_objset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_object, + FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, + DATA_TYPE_INT32, + zio->io_logical->io_bookmark.zb_level, + FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_blkid); + } else if (vd != NULL) { + /* + * If we have a vdev but no zio, this is a device fault, and the + * 'stateoroffset' parameter indicates the previous state of the + * vdev. + */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, + DATA_TYPE_UINT64, stateoroffset, NULL); + } + mutex_exit(&spa->spa_errlist_lock); + + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); +#endif +} + +/* + * The 'resource.fs.zfs.ok' event is an internal signal that the associated + * resource (pool or disk) has been identified by ZFS as healthy. This will + * then trigger the DE to close the associated case, if any. + */ +void +zfs_post_ok(spa_t *spa, vdev_t *vd) +{ +#ifdef _KERNEL + nvlist_t *resource; + char class[64]; + + if ((resource = fm_nvlist_create(NULL)) == NULL) + return; + + (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, + ZFS_ERROR_CLASS, FM_RESOURCE_OK); + VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); + VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); + if (vd) + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); + + fm_ereport_post(resource, EVCH_SLEEP); + + fm_nvlist_destroy(resource, FM_NVA_FREE); +#endif +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 29b01e4331..422b24a993 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -297,6 +297,16 @@ zfs_secpolicy_config(const char *unused, const char *unused2, cred_t *cr) } /* + * Policy for fault injection. Requires all privileges. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_inject(const char *unused, const char *unused2, cred_t *cr) +{ + return (secpolicy_zinject(cr)); +} + +/* * Returns the nvlist as specified by the user in the zfs_cmd_t. 
*/ static int @@ -368,7 +378,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) return (error); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || - guid != zc->zc_pool_guid) + guid != zc->zc_guid) error = EINVAL; else error = spa_import(zc->zc_name, config, @@ -396,7 +406,8 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (EEXIST); - VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, 0) == 0); + VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); if (size > zc->zc_config_dst_size) error = ENOMEM; @@ -420,7 +431,7 @@ zfs_ioc_pool_guid(zfs_cmd_t *zc) error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { - zc->zc_pool_guid = spa_guid(spa); + zc->zc_guid = spa_guid(spa); spa_close(spa, FTAG); } return (error); @@ -433,28 +444,37 @@ zfs_ioc_pool_stats(zfs_cmd_t *zc) char *packed = NULL; size_t size = 0; int error; + int ret = 0; - error = spa_get_stats(zc->zc_name, &config); + error = spa_get_stats(zc->zc_name, &config, zc->zc_root, + sizeof (zc->zc_root)); if (config != NULL) { VERIFY(nvlist_pack(config, &packed, &size, - NV_ENCODE_NATIVE, 0) == 0); + NV_ENCODE_NATIVE, KM_SLEEP) == 0); if (size > zc->zc_config_dst_size) - error = ENOMEM; + ret = ENOMEM; else if (xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst, size)) - error = EFAULT; + ret = EFAULT; zc->zc_config_dst_size = size; kmem_free(packed, size); nvlist_free(config); + + /* + * The config may be present even if 'error' is non-zero. + * In this case we return success, and preserve the real errno + * in 'zc_cookie'. + */ + zc->zc_cookie = error; } else { - ASSERT(error != 0); + ret = error; } - return (error); + return (ret); } /* @@ -479,7 +499,8 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) if (config == NULL) return (EINVAL); - VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, 0) == 0); + VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); if (size > zc->zc_config_dst_size) error = ENOMEM; @@ -554,13 +575,12 @@ static int zfs_ioc_vdev_online(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_prop_value; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); - error = vdev_online(spa, path); + error = vdev_online(spa, zc->zc_guid); spa_close(spa, FTAG); return (error); } @@ -569,14 +589,13 @@ static int zfs_ioc_vdev_offline(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_prop_value; int istmp = zc->zc_cookie; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); - error = vdev_offline(spa, path, istmp); + error = vdev_offline(spa, zc->zc_guid, istmp); spa_close(spa, FTAG); return (error); } @@ -585,7 +604,6 @@ static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_prop_value; int replacing = zc->zc_cookie; nvlist_t *config; int error; @@ -595,7 +613,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) return (error); if ((error = get_config(zc, &config)) == 0) { - error = spa_vdev_attach(spa, path, config, replacing); + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } @@ -607,14 +625,13 @@ static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_prop_value; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); - error = spa_vdev_detach(spa, path, 0, B_FALSE); + error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); spa_close(spa, FTAG); return (error); @@ -625,7 +642,7 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t 
*spa; char *path = zc->zc_prop_value; - uint64_t guid = zc->zc_pool_guid; + uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); @@ -688,6 +705,8 @@ retry: if (!error && zc->zc_objset_stats.dds_type == DMU_OST_ZVOL) error = zvol_get_stats(zc, os); + spa_altroot(dmu_objset_spa(os), zc->zc_root, sizeof (zc->zc_root)); + dmu_objset_close(os); return (error); } @@ -1008,8 +1027,8 @@ zfs_ioc_recvbackup(zfs_cmd_t *zc) fp = getf(fd); if (fp == NULL) return (EBADF); - error = dmu_recvbackup(&zc->zc_begin_record, &zc->zc_cookie, - fp->f_vnode, fp->f_offset); + error = dmu_recvbackup(zc->zc_filename, &zc->zc_begin_record, + &zc->zc_cookie, fp->f_vnode, fp->f_offset); releasef(fd); return (error); } @@ -1053,6 +1072,110 @@ zfs_ioc_sendbackup(zfs_cmd_t *zc) return (error); } +static int +zfs_ioc_inject_fault(zfs_cmd_t *zc) +{ + int id, error; + + error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, + &zc->zc_inject_record); + + if (error == 0) + zc->zc_guid = (uint64_t)id; + + return (error); +} + +static int +zfs_ioc_clear_fault(zfs_cmd_t *zc) +{ + return (zio_clear_fault((int)zc->zc_guid)); +} + +static int +zfs_ioc_inject_list_next(zfs_cmd_t *zc) +{ + int id = (int)zc->zc_guid; + int error; + + error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), + &zc->zc_inject_record); + + zc->zc_guid = id; + + return (error); +} + +static int +zfs_ioc_error_log(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + size_t count = (size_t)zc->zc_config_dst_size; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_config_dst, + &count); + if (error == 0) + zc->zc_config_dst_size = count; + else + zc->zc_config_dst_size = spa_get_errlog_size(spa); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_clear(zfs_cmd_t *zc) +{ + spa_t *spa; + vdev_t *vd; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + spa_config_enter(spa, RW_WRITER, FTAG); + + if (zc->zc_prop_value[0] == '\0') + vd = NULL; + else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { + spa_config_exit(spa, FTAG); + spa_close(spa, FTAG); + return (ENODEV); + } + + vdev_clear(spa, vd); + + spa_config_exit(spa, FTAG); + + spa_close(spa, FTAG); + + return (0); +} + +static int +zfs_ioc_bookmark_name(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_bookmark_name(spa, &zc->zc_bookmark, + zc->zc_prop_name, sizeof (zc->zc_prop_name), zc->zc_prop_value, + sizeof (zc->zc_prop_value), zc->zc_filename, + sizeof (zc->zc_filename)); + + spa_close(spa, FTAG); + + return (error); +} + static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name }, { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name }, @@ -1087,6 +1210,12 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_rename, zfs_secpolicy_write, dataset_name }, { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name }, { zfs_ioc_sendbackup, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name }, + { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name }, + { zfs_ioc_clear, zfs_secpolicy_config, pool_name }, + { zfs_ioc_bookmark_name, zfs_secpolicy_inject, pool_name } }; static int @@ -1279,7 +1408,7 @@ _fini(void) { int error; - if 
(spa_busy() || zfs_busy() || zvol_busy()) + if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) return (EBUSY); if ((error = mod_remove(&modlinkage)) != 0) diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 17771b2e26..68a3e414eb 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -52,6 +52,7 @@ #include <sys/modctl.h> #include <sys/zfs_ioctl.h> #include <sys/zfs_ctldir.h> +#include <sys/bootconf.h> #include <sys/sunddi.h> #include <sys/dnlc.h> @@ -61,8 +62,11 @@ static major_t zfs_major; static minor_t zfs_minor; static kmutex_t zfs_dev_mtx; +extern char zfs_bootpath[BO_MAXOBJNAME]; + static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); +static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); static int zfs_root(vfs_t *vfsp, vnode_t **vpp); static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); @@ -71,6 +75,7 @@ static void zfs_objset_close(zfsvfs_t *zfsvfs); static const fs_operation_def_t zfs_vfsops_template[] = { VFSNAME_MOUNT, zfs_mount, + VFSNAME_MOUNTROOT, zfs_mountroot, VFSNAME_UNMOUNT, zfs_umount, VFSNAME_ROOT, zfs_root, VFSNAME_STATVFS, zfs_statvfs, @@ -150,6 +155,58 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) return (0); } +static int +zfs_create_unique_device(dev_t *dev) +{ + major_t new_major; + + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + minor_t start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + if (zfs_minor >= MAXMIN32) { + /* + * If we're still using the real major + * keep out of /dev/zfs and /dev/zvol minor + * number space. If we're using a getudev()'ed + * major number, we can use all of its minors. + */ + if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) + zfs_minor = ZFS_MIN_MINOR; + else + zfs_minor = 0; + } else { + zfs_minor++; + } + *dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(*dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers for the + * current major number. Create a new major number. + */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique major " + "device number."); + return (-1); + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + return (0); +} + static void atime_changed_cb(void *arg, uint64_t newval) { @@ -271,110 +328,182 @@ acl_inherit_changed_cb(void *arg, uint64_t newval) zfsvfs->z_acl_inherit = newval; } -/*ARGSUSED*/ static int -zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +zfs_refresh_properties(vfs_t *vfsp) { - zfsvfs_t *zfsvfs = NULL; - znode_t *zp = NULL; - vnode_t *vp = NULL; - objset_t *os = NULL; - struct dsl_dataset *ds; - char *osname; - uint64_t readonly, recordsize; - pathname_t spn; - dev_t mount_dev; - major_t new_major; - int mode; - int error = 0; - uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? 
- UIO_SYSSPACE : UIO_USERSPACE; - int canwrite; + zfsvfs_t *zfsvfs = vfsp->vfs_data; - if (mvp->v_type != VDIR) - return (ENOTDIR); + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly_changed_cb(zfsvfs, B_TRUE); + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + if (dmu_objset_is_snapshot(zfsvfs->z_os)) + return (EROFS); + readonly_changed_cb(zfsvfs, B_FALSE); + } - mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_REMOUNT) == 0 && - (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { - mutex_exit(&mvp->v_lock); - return (EBUSY); + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + devices_changed_cb(zfsvfs, B_FALSE); + setuid_changed_cb(zfsvfs, B_FALSE); + } else { + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + devices_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) + devices_changed_cb(zfsvfs, B_TRUE); + + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) + setuid_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) + setuid_changed_cb(zfsvfs, B_TRUE); } - mutex_exit(&mvp->v_lock); - /* - * ZFS does not support passing unparsed data in via MS_DATA. - * Users should use the MS_OPTIONSTR interface; this means - * that all option parsing is already done and the options struct - * can be interrogated. - */ - if ((uap->flags & MS_DATA) && uap->datalen > 0) - return (EINVAL); + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) + exec_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) + exec_changed_cb(zfsvfs, B_TRUE); + + return (0); +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + int do_readonly = FALSE, readonly; + int do_setuid = FALSE, setuid; + int do_exec = FALSE, exec; + int do_devices = FALSE, devices; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; /* - * When doing a remount, we simply refresh our temporary properties - * according to those options set in the current VFS options. + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. 
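+ * (Registering a property callback with dsl_prop_register() fires
+ * the callback immediately with the property's stored value, which
+ * is what overwrites any temporary mount option.)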
*/ - if (uap->flags & MS_REMOUNT) { - zfsvfs = vfsp->vfs_data; - - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) - readonly_changed_cb(zfsvfs, B_TRUE); - else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { - if (dmu_objset_is_snapshot(zfsvfs->z_os)) - return (EROFS); - readonly_changed_cb(zfsvfs, B_FALSE); + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + devices = B_FALSE; + setuid = B_FALSE; + do_devices = B_TRUE; + do_setuid = B_TRUE; + } else { + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { + devices = B_FALSE; + do_devices = B_TRUE; + } else if (vfs_optionisset(vfsp, + MNTOPT_DEVICES, NULL)) { + devices = B_TRUE; + do_devices = B_TRUE; } - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - devices_changed_cb(zfsvfs, B_FALSE); - setuid_changed_cb(zfsvfs, B_FALSE); - } else { - if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) - devices_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) - devices_changed_cb(zfsvfs, B_TRUE); - - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) - setuid_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) - setuid_changed_cb(zfsvfs, B_TRUE); + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; } - - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) - exec_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) - exec_changed_cb(zfsvfs, B_TRUE); - - return (0); + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; } /* - * Get the objset name (the "special" mount argument). + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... */ - if (error = pn_get(uap->spec, fromspace, &spn)) - return (error); + ds = dmu_objset_ds(os); + error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "recordsize", blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "readonly", readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "devices", devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "setuid", setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "exec", exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "snapdir", snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclmode", acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclinherit", acl_inherit_changed_cb, zfsvfs); + if (error) + goto unregister; - osname = spn.pn_path; + /* + * Invoke our callbacks to restore temporary mount options. 
+ */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_devices) + devices_changed_cb(zfsvfs, devices); - if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) - goto out; + return (0); +unregister: /* - * Refuse to mount a filesystem if we are in a local zone and the - * dataset is not visible. + * We may attempt to unregister some callbacks that are not + * registered, but this is OK; it will simply return ENOMSG, + * which we will ignore. */ - if (!INGLOBALZONE(curproc) && - (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { - error = EPERM; - goto out; - } + (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, + zfsvfs); + return (error); + +} + +static int +zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr) +{ + dev_t mount_dev; + uint64_t recordsize, readonly; + int error = 0; + int mode; + zfsvfs_t *zfsvfs; + znode_t *zp = NULL; + + ASSERT(vfsp); + ASSERT(osname); /* * Initialize the zfs-specific filesystem structure. * Should probably make this a kmem cache, shuffle fields, - * and just bzero upto z_hold_mtx[]. + * and just bzero up to z_hold_mtx[]. */ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_vfs = vfsp; @@ -388,63 +517,19 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) offsetof(znode_t, z_link_node)); rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); - /* - * Initialize the generic filesystem structure. - */ + /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; vfsp->vfs_data = NULL; - /* - * Create a unique device for the mount. - */ - do { - ASSERT3U(zfs_minor, <=, MAXMIN32); - minor_t start = zfs_minor; - do { - mutex_enter(&zfs_dev_mtx); - if (zfs_minor >= MAXMIN32) { - /* - * If we're still using the real major number, - * keep out of /dev/zfs and /dev/zvol minor - * number space. If we're using a getudev()'ed - * major number, we can use all of its minors. - */ - if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) - zfs_minor = ZFS_MIN_MINOR; - else - zfs_minor = 0; - } else { - zfs_minor++; - } - mount_dev = makedevice(zfs_major, zfs_minor); - mutex_exit(&zfs_dev_mtx); - } while (vfs_devismounted(mount_dev) && zfs_minor != start); - if (zfs_minor == start) { - /* - * We are using all ~262,000 minor numbers - * for the current major number. Create a - * new major number. 
- */ - if ((new_major = getudev()) == (major_t)-1) { - cmn_err(CE_WARN, - "zfs_mount: Can't get unique" - " major device number."); - goto out; - } - mutex_enter(&zfs_dev_mtx); - zfs_major = new_major; - zfs_minor = 0; - mutex_exit(&zfs_dev_mtx); - } else { - break; - } - /* CONSTANTCONDITION */ - } while (1); - + if (zfs_create_unique_device(&mount_dev) == -1) { + error = ENODEV; + goto out; + } ASSERT(vfs_devismounted(mount_dev) == 0); - if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0) - recordsize = SPA_MAXBLOCKSIZE; + if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, + NULL)) + goto out; vfsp->vfs_dev = mount_dev; vfsp->vfs_fstype = zfsfstype; @@ -452,8 +537,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) vfsp->vfs_flag |= VFS_NOTRUNC; vfsp->vfs_data = zfsvfs; - error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL); - if (error) + if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) goto out; if (readonly) @@ -467,7 +551,6 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); } - os = zfsvfs->z_os; if (error) goto out; @@ -475,16 +558,18 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) if (error = zfs_init_fs(zfsvfs, &zp, cr)) goto out; - if (dmu_objset_is_snapshot(os)) { + /* The call to zfs_init_fs leaves the vnode held; release it here. */ + VN_RELE(ZTOV(zp)); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); zfsvfs->z_issnap = B_TRUE; } else { - int do_readonly = FALSE, readonly; - int do_setuid = FALSE, setuid; - int do_exec = FALSE, exec; - int do_devices = FALSE, devices; + error = zfs_register_callbacks(vfsp); + if (error) + goto out; /* * Start a delete thread running. @@ -494,119 +579,216 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) /* * Parse and replay the intent log. */ - zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector, - (void (*)(void *))zfs_delete_wait_empty); + zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, + zfs_replay_vector, (void (*)(void *))zfs_delete_wait_empty); if (!zil_disable) - zfsvfs->z_log = zil_open(os, zfs_get_data); + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + } - /* - * The act of registering our callbacks will destroy any mount - * options we may have. In order to enable temporary overrides - * of mount options, we stash away the current values and - * restore them after we register the callbacks. 
- */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { - readonly = B_TRUE; - do_readonly = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { - readonly = B_FALSE; - do_readonly = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - devices = B_FALSE; - setuid = B_FALSE; - do_devices = B_TRUE; - do_setuid = B_TRUE; - } else { - if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { - devices = B_FALSE; - do_devices = B_TRUE; - } else if (vfs_optionisset(vfsp, - MNTOPT_DEVICES, NULL)) { - devices = B_TRUE; - do_devices = B_TRUE; - } + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + if (zfsvfs->z_os) + dmu_objset_close(zfsvfs->z_os); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + } else { + atomic_add_32(&zfs_active_fs_count, 1); + } - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { - setuid = B_TRUE; - do_setuid = B_TRUE; - } - } - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { - exec = B_FALSE; - do_exec = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { - exec = B_TRUE; - do_exec = B_TRUE; - } + return (error); - /* - * Register property callbacks. - */ +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + struct dsl_dataset *ds; + + /* + * Unregister properties. + */ + if (!dmu_objset_is_snapshot(os)) { ds = dmu_objset_ds(os); - VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb, + VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb, + VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb, + VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb, + VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb, + VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb, + VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb, + VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb, + VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "aclinherit", + VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); + } +} +static int +zfs_mountroot(vfs_t *vfsp, enum whymountroot why) +{ + int error = 0; + int ret = 0; + static int zfsrootdone = 0; + zfsvfs_t *zfsvfs = NULL; + znode_t *zp = NULL; + vnode_t *vp = NULL; + + ASSERT(vfsp); + + /* + * The filesystem that we mount as root is defined in + * /etc/system using the zfsroot variable. The value defined + * there is copied early in startup code to zfs_bootpath + * (defined in modsysfile.c). + */ + if (why == ROOT_INIT) { + if (zfsrootdone++) + return (EBUSY); /* - * Invoke our callbacks to restore temporary mount options. + * This needs to be done here, so that when we return from + * mountroot, the vfs resource name will be set correctly. 
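+ * (The snprintf below copies zfs_bootpath into rootfs.bo_name,
+ * which the vfs layer then reports as the mounted resource for
+ * the root filesystem.)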
*/ - if (do_readonly) - readonly_changed_cb(zfsvfs, readonly); - if (do_setuid) - setuid_changed_cb(zfsvfs, setuid); - if (do_exec) - exec_changed_cb(zfsvfs, exec); - if (do_devices) - devices_changed_cb(zfsvfs, devices); - } + if (snprintf(rootfs.bo_name, BO_MAXOBJNAME, "%s", zfs_bootpath) + >= BO_MAXOBJNAME) + return (ENAMETOOLONG); - vp = ZTOV(zp); - if (!zfsvfs->z_issnap) - zfsctl_create(zfsvfs); -out: - if (error) { - if (zp) - VN_RELE(vp); + if (error = vfs_lock(vfsp)) + return (error); - if (zfsvfs) { - if (os) - dmu_objset_close(os); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - } - } else { - atomic_add_32(&zfs_active_fs_count, 1); + if (error = zfs_domount(vfsp, zfs_bootpath, CRED())) + goto out; + + zfsvfs = (zfsvfs_t *)vfsp->vfs_data; + ASSERT(zfsvfs); + if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) + goto out; + + vp = ZTOV(zp); + mutex_enter(&vp->v_lock); + vp->v_flag |= VROOT; + mutex_exit(&vp->v_lock); + rootvp = vp; + + /* + * The zfs_zget call above returns with a hold on vp; we release + * it here. + */ VN_RELE(vp); + + /* + * Mount root as read-only initially; it will be remounted + * read/write by /lib/svc/method/fs-usr. + */ + readonly_changed_cb(vfsp->vfs_data, B_TRUE); + vfs_add((struct vnode *)0, vfsp, + (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); +out: + vfs_unlock(vfsp); + ret = (error) ? error : 0; + return (ret); + + } else if (why == ROOT_REMOUNT) { + + readonly_changed_cb(vfsp->vfs_data, B_FALSE); + vfsp->vfs_flag |= VFS_REMOUNT; + return (zfs_refresh_properties(vfsp)); + + } else if (why == ROOT_UNMOUNT) { + zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); + (void) zfs_sync(vfsp, 0, 0); + return (0); + } + + /* + * if "why" is anything other than ROOT_INIT, + * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. + */ + return (ENOTSUP); +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + char *osname; + pathname_t spn; + int error = 0; + uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? + UIO_SYSSPACE : UIO_USERSPACE; + int canwrite; + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_REMOUNT) == 0 && + (uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * ZFS does not support passing unparsed data in via MS_DATA. + * Users should use the MS_OPTIONSTR interface; this means + * that all option parsing is already done and the options struct + * can be interrogated. + */ + if ((uap->flags & MS_DATA) && uap->datalen > 0) + return (EINVAL); + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (uap->flags & MS_REMOUNT) { + return (zfs_refresh_properties(vfsp)); } + /* + * Get the objset name (the "special" mount argument). + */ + if (error = pn_get(uap->spec, fromspace, &spn)) + return (error); + + osname = spn.pn_path; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + goto out; + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. 
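+ * (zone_dataset_visible() also reports, via 'canwrite', whether the
+ * zone may write to the dataset; a dataset that is visible but not
+ * writable from this zone is refused as well.)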
+ */ + if (!INGLOBALZONE(curproc) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = EPERM; + goto out; + } + + error = zfs_domount(vfsp, osname, cr); + +out: pn_free(&spn); return (error); } @@ -739,9 +921,6 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) return (0); } - - zfs_zcache_flush(zfsvfs); - /* * Stop all delete threads. */ @@ -866,7 +1045,6 @@ zfs_objset_close(zfsvfs_t *zfsvfs) zfs_delete_t *zd = &zfsvfs->z_delete_head; znode_t *zp, *nextzp; objset_t *os = zfsvfs->z_os; - struct dsl_dataset *ds; /* * Stop all delete threads. @@ -881,8 +1059,6 @@ zfs_objset_close(zfsvfs_t *zfsvfs) */ rw_enter(&zfsvfs->z_um_lock, RW_WRITER); - zfs_zcache_flush(zfsvfs); - /* * Release all delete in progress znodes. * They will be processed when the file system remounts. @@ -891,7 +1067,7 @@ zfs_objset_close(zfsvfs_t *zfsvfs) while (zp = list_head(&zd->z_znodes)) { list_remove(&zd->z_znodes, zp); zp->z_dbuf_held = 0; - dmu_buf_rele(zp->z_dbuf); + dmu_buf_rele(zp->z_dbuf, NULL); } mutex_exit(&zd->z_mutex); @@ -911,7 +1087,7 @@ zfs_objset_close(zfsvfs_t *zfsvfs) /* dbufs should only be held when force unmounting */ zp->z_dbuf_held = 0; mutex_exit(&zfsvfs->z_znodes_lock); - dmu_buf_rele(zp->z_dbuf); + dmu_buf_rele(zp->z_dbuf, NULL); /* Start again */ mutex_enter(&zfsvfs->z_znodes_lock); nextzp = list_head(&zfsvfs->z_all_znodes); @@ -922,36 +1098,8 @@ zfs_objset_close(zfsvfs_t *zfsvfs) /* * Unregister properties. */ - if (!dmu_objset_is_snapshot(os)) { - ds = dmu_objset_ds(os); - - VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclinherit", - acl_inherit_changed_cb, zfsvfs) == 0); - } + if (!dmu_objset_is_snapshot(os)) + zfs_unregister_callbacks(zfsvfs); /* * Make the dmu drop all its dbuf holds so that zfs_inactive @@ -977,6 +1125,11 @@ zfs_objset_close(zfsvfs_t *zfsvfs) } /* + * Evict all dbufs so that cached znodes will be freed + */ + dmu_objset_evict_dbufs(os); + + /* * Finally close the objset */ dmu_objset_close(os); diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index da5b41101a..2b9da086cc 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -229,6 +229,14 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, case _FIOFFS: return (zfs_sync(vp->v_vfsp, 0, cred)); + /* + * The following two ioctls are used by bfu. Faking them out is + * necessary to avoid bfu errors. 
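+ * (_FIOGDIO and _FIOSDIO get and set UFS-style directio; ZFS has
+ * no equivalent state to report or change, so returning success
+ * is harmless.)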
+ */ + case _FIOGDIO: + case _FIOSDIO: + return (0); + case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: if (ddi_copyin((void *)data, &off, sizeof (off), flag)) @@ -436,12 +444,10 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n = MIN(zfs_read_chunk_size, zp->z_phys->zp_size - uio->uio_loffset); n = MIN(n, cnt); - dbpp = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id, - uio->uio_loffset, n, &numbufs); - if (error = dmu_buf_read_array_canfail(dbpp, numbufs)) { - dmu_buf_rele_array(dbpp, numbufs); + error = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id, + uio->uio_loffset, n, TRUE, FTAG, &numbufs, &dbpp); + if (error) goto out; - } /* * Compute the adjustment to align the dmu buffers * with the uio buffer. @@ -467,7 +473,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) (n < size ? n : size), UIO_READ, uio); } if (error) { - dmu_buf_rele_array(dbpp, numbufs); + dmu_buf_rele_array(dbpp, numbufs, FTAG); goto out; } n -= dbp->db_size; @@ -476,7 +482,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) delta = 0; } } - dmu_buf_rele_array(dbpp, numbufs); + dmu_buf_rele_array(dbpp, numbufs, FTAG); } out: rw_exit(&zp->z_grow_lock); @@ -850,10 +856,10 @@ zfs_get_data(void *arg, lr_write_t *lr) */ if (sizeof (lr_write_t) + dlen <= reclen) { /* immediate write */ rw_enter(&zp->z_grow_lock, RW_READER); - dmu_buf_t *db = dmu_buf_hold(os, lr->lr_foid, off); - dmu_buf_read(db); + dmu_buf_t *db; + VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, off, FTAG, &db)); bcopy((char *)db->db_data + off - db->db_offset, lr + 1, dlen); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); rw_exit(&zp->z_grow_lock); } else { /* @@ -1071,7 +1077,7 @@ top: tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); @@ -1266,7 +1272,7 @@ top: * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, -1); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); if (may_delete_now) dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); @@ -1289,7 +1295,7 @@ top: dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { @@ -1427,8 +1433,8 @@ top: * Add a new entry to the directory. 
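+ * dmu_tx_hold_zap() now takes the add/remove flag and, when known,
+ * the name of the entry being manipulated, e.g.
+ *
+ *	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ *
+ * so the tx code can pre-read the zap blocks the operation will
+ * touch and charge the transaction accordingly.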
*/ tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, 1); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); @@ -1534,9 +1540,9 @@ top: rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { dmu_tx_abort(tx); @@ -2059,8 +2065,7 @@ top: have_grow_lock = TRUE; if (off < zp->z_phys->zp_size) dmu_tx_hold_free(tx, zp->z_id, off, DMU_OBJECT_END); - else if (zp->z_phys->zp_size && - zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz) + else if (zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz) /* we will rewrite this block if we grow */ dmu_tx_hold_write(tx, zp->z_id, 0, zp->z_phys->zp_size); } @@ -2419,17 +2424,13 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ - if (sdzp != tdzp) { - dmu_tx_hold_zap(tx, sdzp->z_id, 1); - dmu_tx_hold_zap(tx, tdzp->z_id, 1); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ - } else { - dmu_tx_hold_zap(tx, sdzp->z_id, 2); - } - if (tzp) { - dmu_tx_hold_bonus(tx, tzp->z_id); /* nlink changes */ - } - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + if (tzp) + dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { dmu_tx_abort(tx); @@ -2532,7 +2533,7 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); @@ -2569,12 +2570,12 @@ top: if (error) goto out; - dbp = dmu_buf_hold(zfsvfs->z_os, zoid, 0); + VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp); + dmu_buf_rele(dbp, FTAG); } zp->z_phys->zp_size = len; @@ -2631,15 +2632,15 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) error = uiomove(zp->z_phys + 1, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); } else { - dmu_buf_t *dbp = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0); - if ((error = dmu_buf_read_canfail(dbp)) != 0) { - dmu_buf_rele(dbp); + dmu_buf_t *dbp; + error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); + if (error) { ZFS_EXIT(zfsvfs); return (error); } error = uiomove(dbp->db_data, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp); + dmu_buf_rele(dbp, FTAG); } ZFS_ACCESSTIME_STAMP(zfsvfs, zp); @@ -2732,7 +2733,7 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { dmu_tx_abort(tx); @@ -2921,8 +2922,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr) B_INVAL, cr); 
} + mutex_enter(&zp->z_lock); vp->v_count = 0; /* count arrives as 1 */ - zfs_znode_free(zp); + if (zp->z_dbuf == NULL) { + mutex_exit(&zp->z_lock); + zfs_znode_free(zp); + } else { + mutex_exit(&zp->z_lock); + } rw_exit(&zfsvfs->z_um_lock); VFS_RELE(zfsvfs->z_vfs); return; @@ -2986,27 +2993,21 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint_t cnt = 1; int error; ZFS_ENTER(zfsvfs); /* - * If file is being mapped, disallow frlock. We set the mapcnt to - * -1 here to signal that we are in the process of setting a lock. - * This prevents a race with zfs_map(). - * XXX - well, sort of; since zfs_map() does not change z_mapcnt, - * we could be in the middle of zfs_map() and still call fs_frlock(). - * Also, we are doing no checking in zfs_addmap() (where z_mapcnt - * *is* manipulated). + * We are following the UFS semantics with respect to mapcnt + * here: If we see that the file is mapped already, then we will + * return an error, but we don't worry about races between this + * function and zfs_map(). */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && - (int)(cnt = atomic_cas_32(&zp->z_mapcnt, 0, -1)) > 0) { + if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { ZFS_EXIT(zfsvfs); return (EAGAIN); } error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr); - ASSERT((cnt != 0) || ((int)atomic_cas_32(&zp->z_mapcnt, -1, 0) == -1)); ZFS_EXIT(zfsvfs); return (error); } @@ -3074,7 +3075,7 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { ASSERT(io_off == cur_pp->p_offset); va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1); - err = dmu_read_canfail(os, oid, io_off, PAGESIZE, va); + err = dmu_read(os, oid, io_off, PAGESIZE, va); ppmapout(va); if (err) { /* On error, toss the entire kluster */ @@ -3241,6 +3242,20 @@ out: return (err); } +/* + * Request a memory map for a section of a file. This code interacts + * with common code and the VM system as follows: + * + * common code calls mmap(), which ends up in smmap_common() + * + * this calls VOP_MAP(), which takes you into (say) zfs + * + * zfs_map() calls as_map(), passing segvn_create() as the callback + * + * segvn_create() creates the new segment and calls VOP_ADDMAP() + * + * zfs_addmap() updates z_mapcnt + */ static int zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) @@ -3269,15 +3284,10 @@ zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, /* * If file is locked, disallow mapping. - * XXX - since we don't modify z_mapcnt here, there is nothing - * to stop a file lock being placed immediately after we complete - * this check. */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { - if (vn_has_flocks(vp) || zp->z_mapcnt == -1) { - ZFS_EXIT(zfsvfs); - return (EAGAIN); - } + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { + ZFS_EXIT(zfsvfs); + return (EAGAIN); } as_rangelock(as); @@ -3318,11 +3328,9 @@ static int zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) { - /* - * XXX - shouldn't we be checking for file locks here? 
- */ - ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0); - atomic_add_32(&VTOZ(vp)->z_mapcnt, btopr(len)); + uint64_t pages = btopr(len); + + atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); return (0); } @@ -3331,8 +3339,10 @@ static int zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) { - atomic_add_32(&VTOZ(vp)->z_mapcnt, -btopr(len)); - ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0); + uint64_t pages = btopr(len); + + ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); + atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 7eb3a2410d..3fd338940e 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -55,251 +54,6 @@ struct kmem_cache *znode_cache = NULL; -/* - * Note that znodes can be on one of 2 states: - * ZCACHE_mru - recently used, currently cached - * ZCACHE_mfu - frequently used, currently cached - * When there are no active references to the znode, they - * are linked onto one of the lists in zcache. These are the - * only znodes that can be evicted. - */ - -typedef struct zcache_state { - list_t list; /* linked list of evictable znodes in state */ - uint64_t lcnt; /* total number of znodes in the linked list */ - uint64_t cnt; /* total number of all znodes in this state */ - uint64_t hits; - kmutex_t mtx; -} zcache_state_t; - -/* The 2 states: */ -static zcache_state_t ZCACHE_mru; -static zcache_state_t ZCACHE_mfu; - -static struct zcache { - zcache_state_t *mru; - zcache_state_t *mfu; - uint64_t p; /* Target size of mru */ - uint64_t c; /* Target size of cache */ - uint64_t c_max; /* Maximum target cache size */ - - /* performance stats */ - uint64_t missed; - uint64_t evicted; - uint64_t skipped; -} zcache; - -void zcache_kmem_reclaim(void); - -#define ZCACHE_MINTIME (hz>>4) /* 62 ms */ - -/* - * Move the supplied znode to the indicated state. The mutex - * for the znode must be held by the caller. 
- */ -static void -zcache_change_state(zcache_state_t *new_state, znode_t *zp) -{ - /* ASSERT(MUTEX_HELD(hash_mtx)); */ - ASSERT(zp->z_active); - - if (zp->z_zcache_state) { - ASSERT3U(zp->z_zcache_state->cnt, >=, 1); - atomic_add_64(&zp->z_zcache_state->cnt, -1); - } - atomic_add_64(&new_state->cnt, 1); - zp->z_zcache_state = new_state; -} - -static void -zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ASSERT(zp->z_phys); - ASSERT(zp->z_dbuf_held); - - zp->z_dbuf_held = 0; - mutex_exit(&zp->z_lock); - dmu_buf_rele(zp->z_dbuf); - mutex_exit(hash_mtx); - VFS_RELE(zfsvfs->z_vfs); -} - -/* - * Evict znodes from list until we've removed the specified number - */ -static void -zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs) -{ - int znodes_evicted = 0; - znode_t *zp, *zp_prev; - kmutex_t *hash_mtx; - - ASSERT(state == zcache.mru || state == zcache.mfu); - - mutex_enter(&state->mtx); - - for (zp = list_tail(&state->list); zp; zp = zp_prev) { - zp_prev = list_prev(&state->list, zp); - if (zfsvfs && zp->z_zfsvfs != zfsvfs) - continue; - hash_mtx = ZFS_OBJ_MUTEX(zp); - if (mutex_tryenter(hash_mtx)) { - mutex_enter(&zp->z_lock); - list_remove(&zp->z_zcache_state->list, zp); - zp->z_zcache_state->lcnt -= 1; - ASSERT3U(zp->z_zcache_state->cnt, >=, 1); - atomic_add_64(&zp->z_zcache_state->cnt, -1); - zp->z_zcache_state = NULL; - zp->z_zcache_access = 0; - /* drops z_lock and hash_mtx */ - zfs_zcache_evict(zp, hash_mtx); - znodes_evicted += 1; - atomic_add_64(&zcache.evicted, 1); - if (znodes_evicted >= cnt) - break; - } else { - atomic_add_64(&zcache.skipped, 1); - } - } - mutex_exit(&state->mtx); - - if (znodes_evicted < cnt) - dprintf("only evicted %lld znodes from %x", - (longlong_t)znodes_evicted, state); -} - -static void -zcache_adjust(void) -{ - uint64_t mrucnt = zcache.mru->lcnt; - uint64_t mfucnt = zcache.mfu->lcnt; - uint64_t p = zcache.p; - uint64_t c = zcache.c; - - if (mrucnt > p) - zcache_evict_state(zcache.mru, mrucnt - p, NULL); - - if (mfucnt > 0 && mrucnt + mfucnt > c) { - int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c); - zcache_evict_state(zcache.mfu, toevict, NULL); - } -} - -/* - * Flush all *evictable* data from the cache. - * NOTE: this will not touch "active" (i.e. referenced) data. - */ -void -zfs_zcache_flush(zfsvfs_t *zfsvfs) -{ - zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs); - zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs); -} - -static void -zcache_try_grow(int64_t cnt) -{ - int64_t size; - /* - * If we're almost to the current target cache size, - * increment the target cache size - */ - size = zcache.mru->lcnt + zcache.mfu->lcnt; - if ((zcache.c - size) <= 1) { - atomic_add_64(&zcache.c, cnt); - if (zcache.c > zcache.c_max) - zcache.c = zcache.c_max; - else if (zcache.p + cnt < zcache.c) - atomic_add_64(&zcache.p, cnt); - } -} - -/* - * This routine is called whenever a znode is accessed. - */ -static void -zcache_access(znode_t *zp, kmutex_t *hash_mtx) -{ - ASSERT(MUTEX_HELD(hash_mtx)); - - if (zp->z_zcache_state == NULL) { - /* - * This znode is not in the cache. - * Add the new znode to the MRU state. - */ - - zcache_try_grow(1); - - ASSERT(zp->z_zcache_access == 0); - zp->z_zcache_access = lbolt; - zcache_change_state(zcache.mru, zp); - mutex_exit(hash_mtx); - - /* - * If we are using less than 2/3 of our total target - * cache size, bump up the target size for the MRU - * list. 
- */ - if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) { - zcache.p = zcache.mru->lcnt + zcache.c/6; - } - - zcache_adjust(); - - atomic_add_64(&zcache.missed, 1); - } else if (zp->z_zcache_state == zcache.mru) { - /* - * This znode has been "accessed" only once so far, - * Move it to the MFU state. - */ - if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) { - /* - * More than 125ms have passed since we - * instantiated this buffer. Move it to the - * most frequently used state. - */ - zp->z_zcache_access = lbolt; - zcache_change_state(zcache.mfu, zp); - } - atomic_add_64(&zcache.mru->hits, 1); - mutex_exit(hash_mtx); - } else { - ASSERT(zp->z_zcache_state == zcache.mfu); - /* - * This buffer has been accessed more than once. - * Keep it in the MFU state. - */ - atomic_add_64(&zcache.mfu->hits, 1); - mutex_exit(hash_mtx); - } -} - -static void -zcache_init(void) -{ - zcache.c = 20; - zcache.c_max = 50; - - zcache.mru = &ZCACHE_mru; - zcache.mfu = &ZCACHE_mfu; - - list_create(&zcache.mru->list, sizeof (znode_t), - offsetof(znode_t, z_zcache_node)); - list_create(&zcache.mfu->list, sizeof (znode_t), - offsetof(znode_t, z_zcache_node)); -} - -static void -zcache_fini(void) -{ - zfs_zcache_flush(NULL); - - list_destroy(&zcache.mru->list); - list_destroy(&zcache.mfu->list); -} - /*ARGSUSED*/ static void znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) @@ -307,9 +61,15 @@ znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) znode_t *zp = user_ptr; vnode_t *vp = ZTOV(zp); + mutex_enter(&zp->z_lock); if (vp->v_count == 0) { + mutex_exit(&zp->z_lock); vn_invalid(vp); zfs_znode_free(zp); + } else { + /* signal force unmount that this znode can be freed */ + zp->z_dbuf = NULL; + mutex_exit(&zp->z_lock); } } @@ -359,15 +119,11 @@ zfs_znode_init(void) znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); - - zcache_init(); } void zfs_znode_fini(void) { - zcache_fini(); - /* * Cleanup vfs & vnode ops */ @@ -488,8 +244,8 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */ - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */ + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ error = dmu_tx_assign(tx, TXG_WAIT); ASSERT3U(error, ==, 0); @@ -497,8 +253,10 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) dmu_tx_commit(tx); } - if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) { - return (EINVAL); + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, + &version); + if (error) { + return (error); } else if (version != ZFS_VERSION) { (void) printf("Mismatched versions: File system " "is version %lld on-disk format, which is " @@ -524,9 +282,9 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) kmem_free(stats, sizeof (dmu_objset_stats_t)); stats = NULL; - if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) { - return (EINVAL); - } + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid); + if (error) + return (error); ASSERT(zoid != 0); zfsvfs->z_root = zoid; @@ -545,9 +303,9 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) return (error); ASSERT3U((*zpp)->z_id, ==, zoid); - if (zap_lookup(os, MASTER_NODE_OBJ, 
ZFS_DELETE_QUEUE, 8, 1, &zoid)) { - return (EINVAL); - } + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid); + if (error) + return (error); zfsvfs->z_dqueue = zoid; @@ -570,7 +328,7 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) * up to the caller to do, in case you don't want to * return the znode */ -znode_t * +static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) { znode_t *zp; @@ -593,8 +351,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) zp->z_blksz = blksz; zp->z_seq = 0x7A4653; - bzero(&zp->z_zcache_node, sizeof (list_node_t)); - mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); @@ -662,9 +418,6 @@ zfs_znode_dmu_init(znode_t *zp) ZTOV(zp)->v_flag |= VROOT; } - zp->z_zcache_state = NULL; - zp->z_zcache_access = 0; - ASSERT(zp->z_dbuf_held == 0); zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); @@ -715,6 +468,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, /* * Create a new DMU object. */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ if (vap->va_type == VDIR) { if (flag & IS_REPLAY) { err = zap_create_claim(zfsvfs->z_os, *oid, @@ -738,7 +497,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } - dbp = dmu_bonus_hold(zfsvfs->z_os, *oid); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp)); dmu_buf_will_dirty(dbp, tx); /* @@ -803,11 +562,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, mutex_enter(hash_mtx); zfs_znode_dmu_init(zp); - zcache_access(zp, hash_mtx); + mutex_exit(hash_mtx); + *zpp = zp; } else { ZTOV(zp)->v_count = 0; - dmu_buf_rele(dbp); + dmu_buf_rele(dbp, NULL); zfs_znode_free(zp); } } @@ -818,25 +578,25 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; + int err; *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - db = dmu_bonus_hold(zfsvfs->z_os, obj_num); - if (db == NULL) { + err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (ENOENT); + return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_ZNODE || doi.doi_bonus_size < sizeof (znode_phys_t)) { - dmu_buf_rele(db); + dmu_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EINVAL); } - dmu_buf_read(db); ASSERT(db->db_object == obj_num); ASSERT(db->db_offset == -1); @@ -849,29 +609,23 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_reap) { - dmu_buf_rele(db); + dmu_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (ENOENT); } else if (zp->z_dbuf_held) { - dmu_buf_rele(db); + dmu_buf_rele(db, NULL); } else { zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); } - if (zp->z_active == 0) { + if (zp->z_active == 0) zp->z_active = 1; - if (list_link_active(&zp->z_zcache_node)) { - mutex_enter(&zp->z_zcache_state->mtx); - list_remove(&zp->z_zcache_state->list, zp); - zp->z_zcache_state->lcnt -= 1; - mutex_exit(&zp->z_zcache_state->mtx); - } - } + VN_HOLD(ZTOV(zp)); mutex_exit(&zp->z_lock); - zcache_access(zp, ZFS_OBJ_MUTEX(zp)); +
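+		/*
+		 * With the znode cache gone, the VN_HOLD() just above is
+		 * what keeps this znode alive for the caller; a typical
+		 * consumer now looks roughly like (obj being the object
+		 * number of interest):
+		 *
+		 *	if (zfs_zget(zfsvfs, obj, &zp) == 0) {
+		 *		... use zp ...
+		 *		VN_RELE(ZTOV(zp));
+		 *	}
+		 */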
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; return (0); } @@ -882,7 +636,7 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); ASSERT3U(zp->z_id, ==, obj_num); zfs_znode_dmu_init(zp); - zcache_access(zp, ZFS_OBJ_MUTEX(zp)); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; return (0); } @@ -899,15 +653,11 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) zp->z_phys->zp_acl.z_acl_extern_obj, tx); ASSERT3U(error, ==, 0); } - if (zp->z_zcache_state) { - ASSERT3U(zp->z_zcache_state->cnt, >=, 1); - atomic_add_64(&zp->z_zcache_state->cnt, -1); - } error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); ASSERT3U(error, ==, 0); zp->z_dbuf_held = 0; ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); - dmu_buf_rele(zp->z_dbuf); + dmu_buf_rele(zp->z_dbuf, NULL); } void @@ -954,9 +704,6 @@ zfs_zinactive(znode_t *zp) if (zp->z_reap) { mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - ASSERT3U(zp->z_zcache_state->cnt, >=, 1); - atomic_add_64(&zp->z_zcache_state->cnt, -1); - zp->z_zcache_state = NULL; /* XATTR files are not put on the delete queue */ if (zp->z_phys->zp_flags & ZFS_XATTR) { zfs_rmnode(zp); @@ -970,23 +717,14 @@ zfs_zinactive(znode_t *zp) VFS_RELE(zfsvfs->z_vfs); return; } + ASSERT(zp->z_phys); + ASSERT(zp->z_dbuf_held); - /* - * If the file system for this znode is no longer mounted, - * evict the znode now, don't put it in the cache. - */ - if (zfsvfs->z_unmounted1) { - zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp)); - return; - } - - /* put znode on evictable list */ - mutex_enter(&zp->z_zcache_state->mtx); - list_insert_head(&zp->z_zcache_state->list, zp); - zp->z_zcache_state->lcnt += 1; - mutex_exit(&zp->z_zcache_state->mtx); + zp->z_dbuf_held = 0; mutex_exit(&zp->z_lock); + dmu_buf_rele(zp->z_dbuf, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + VFS_RELE(zfsvfs->z_vfs); } void @@ -1206,7 +944,8 @@ zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx, len = -1; else if (end > size) len = size - from; - dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx); + VERIFY(0 == dmu_free_range(zp->z_zfsvfs->z_os, + zp->z_id, from, len, tx)); if (!have_grow_lock) rw_exit(&zp->z_grow_lock); @@ -1214,7 +953,6 @@ zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx, return (0); } - void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) { @@ -1229,6 +967,10 @@ zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) /* * First attempt to create master node. */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below).
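+	 * That is why the zap calls that set up the master node below
+	 * can have their return values checked with simple assertions
+	 * rather than real error paths.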
+ */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 14b989fbd3..55040166b4 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -136,11 +136,17 @@ zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf) uint64_t blksz = BP_GET_LSIZE(bp); zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1; zio_cksum_t cksum; + zbookmark_t zb; int error; + zb.zb_objset = bp->blk_cksum.zc_word[2]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[3]; + error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); if (error) { dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ", zilog, bp, error); @@ -551,6 +557,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; uint64_t txg; uint64_t zil_blksz; + zbookmark_t zb; int error; ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb)); @@ -579,11 +586,21 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG, zil_blksz, &ztp->zit_next_blk, txg); if (error) { + /* + * Reinitialise the lwb. + * By returning NULL the caller will call txg_wait_synced(). + */ + mutex_enter(&zilog->zl_lock); + ASSERT(lwb->lwb_state == UNWRITTEN); + lwb->lwb_nused = 0; + lwb->lwb_seq = 0; + mutex_exit(&zilog->zl_lock); txg_rele_to_sync(&lwb->lwb_txgh); return (NULL); } ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg); + ztp->zit_pad = 0; ztp->zit_nused = lwb->lwb_nused; ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum; @@ -617,9 +634,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) * write the old log block */ dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); + + zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3]; + zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED)); + ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb)); return (nlwb); } @@ -674,7 +697,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) lwb = zil_lwb_write_start(zilog, lwb); if (lwb == NULL) return (NULL); - if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) { + ASSERT(lwb->lwb_nused == 0); + if (reclen > ZIL_BLK_DATA_SZ(lwb)) { txg_wait_synced(zilog->zl_dmu_pool, txg); mutex_enter(&zilog->zl_lock); zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq); @@ -1157,10 +1181,17 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * checksum error. We can safely ignore this because * the later write will provide the correct data.
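+		 * The bookmark filled in below names the block being
+		 * replayed (the objset under replay plus the object and
+		 * offset taken from the log record), so a failure here
+		 * is at least attributed to the right logical data.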
*/ + zbookmark_t zb; + + zb.zb_objset = dmu_objset_id(zilog->zl_os); + zb.zb_object = lrw->lr_foid; + zb.zb_level = -1; + zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); + (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); } } diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 1554504a93..b9741ee5c2 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,13 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/spa_impl.h> @@ -35,9 +35,6 @@ #include <sys/zio_compress.h> #include <sys/zio_checksum.h> -static void zio_vdev_io_enter(zio_t *zio); -static void zio_vdev_io_exit(zio_t *zio); - /* * ========================================================================== * I/O priority table @@ -128,6 +125,8 @@ zio_init(void) if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; } + + zio_inject_init(); } void @@ -143,6 +142,8 @@ zio_fini(void) } zio_buf_cache[c] = NULL; } + + zio_inject_fini(); } /* @@ -263,11 +264,12 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, if (pio == NULL) { if (!(flags & ZIO_FLAG_CONFIG_HELD)) - spa_config_enter(zio->io_spa, RW_READER); + spa_config_enter(zio->io_spa, RW_READER, zio); zio->io_root = zio; } else { zio->io_root = pio->io_root; - + if (!(flags & ZIO_FLAG_NOBOOKMARK)) + zio->io_logical = pio->io_logical; mutex_enter(&pio->io_lock); if (stage < ZIO_STAGE_READY) pio->io_children_notready++; @@ -305,7 +307,7 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) zio_t * zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags) + int priority, int flags, zbookmark_t *zb) { zio_t *zio; dva_t *dva; @@ -314,6 +316,9 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); + zio->io_bookmark = *zb; + + zio->io_logical = zio; /* * Work off our copy of the bp so the caller can free it. 
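These hunks add the zbookmark_t plumbing: every logical read, write and rewrite now carries a bookmark naming the data it touches as an (objset, object, level, blkid) tuple and records itself as io_logical, so errors deep in the pipeline can be traced back to user-visible data. A rough sketch of a caller, where os, obj, blkid and bp are illustrative placeholders rather than names from this change:

	zbookmark_t zb;

	zb.zb_objset = dmu_objset_id(os);	/* objset owning the data */
	zb.zb_object = obj;			/* object being read */
	zb.zb_level = 0;			/* indirection level (0 = data) */
	zb.zb_blkid = blkid;			/* block id within the object */

	error = zio_wait(zio_read(NULL, spa, bp, buf, size, NULL, NULL,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));

ZIL blocks, which live outside any object, use zb_level = -1 with the chain checksum words as identity, as in the zil.c hunks above.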
@@ -345,7 +350,8 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, zio_t * zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags) + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb) { zio_t *zio; @@ -359,6 +365,10 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, ZIO_TYPE_WRITE, priority, flags, ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); + zio->io_bookmark = *zb; + + zio->io_logical = zio; + zio->io_checksum = checksum; zio->io_compress = compress; @@ -378,7 +388,8 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, zio_t * zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags) + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb) { zio_t *zio; @@ -387,6 +398,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum, ZIO_TYPE_WRITE, priority, flags, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + zio->io_bookmark = *zb; zio->io_checksum = checksum; zio->io_compress = ZIO_COMPRESS_OFF; @@ -667,8 +679,6 @@ zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) mutex_exit(&zio->io_lock); zio_next_stage(zio); } else { - if (zio->io_stage == ZIO_STAGE_VDEV_IO_START) - zio_vdev_io_exit(zio); zio->io_stalled = stage; mutex_exit(&zio->io_lock); } @@ -683,8 +693,6 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) pio->io_error = zio->io_error; if (--*countp == 0 && pio->io_stalled == stage) { - if (pio->io_stage == ZIO_STAGE_VDEV_IO_START) - zio_vdev_io_enter(pio); pio->io_stalled = 0; mutex_exit(&pio->io_lock); zio_next_stage_async(pio); @@ -748,36 +756,45 @@ zio_done(zio_t *zio) vdev_stat_update(zio); if (zio->io_error) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? bp : &zio->io_bp_copy); - dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n", - zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf, zio->io_error); - } - - if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? bp : &zio->io_bp_copy); - dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n", - "partial write", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf, zio->io_numerrors); - } + /* + * If this I/O is attached to a particular vdev, + * generate an error message describing the I/O failure + * at the block level. We ignore these errors if the + * device is currently unavailable. + */ + if (zio->io_error != ECKSUM && zio->io_vd && + !vdev_is_dead(zio->io_vd)) + zfs_ereport_post(FM_EREPORT_ZFS_IO, + zio->io_spa, zio->io_vd, zio, 0, 0); + + if ((zio->io_error == EIO || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && + zio->io_logical == zio) { + /* + * For root I/O requests, tell the SPA to log the error + * appropriately. Also, generate a logical data + * ereport. + */ + spa_log_error(zio->io_spa, zio); + + zfs_ereport_post(FM_EREPORT_ZFS_DATA, + zio->io_spa, NULL, zio, 0, 0); + } - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? 
bp : &zio->io_bp_copy); - panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d", - zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf, zio->io_error); + /* + * For I/O requests that cannot fail, panic appropriately. + */ + if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { + sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, + bp ? bp : &zio->io_bp_copy); + panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " + "%d", zio->io_error == ECKSUM ? + "bad checksum" : "I/O failure", + zio_type_name[zio->io_type], + vdev_description(vd), + (u_longlong_t)zio->io_offset, + zio, blkbuf, zio->io_error); + } } zio_clear_transform_stack(zio); @@ -807,7 +824,7 @@ zio_done(zio_t *zio) } if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD)) - spa_config_exit(spa); + spa_config_exit(spa, zio); if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); @@ -988,7 +1005,8 @@ zio_read_gang_members(zio_t *zio) zio_nowait(zio_read(zio, zio->io_spa, gbp, (char *)zio->io_data + loff, lsize, NULL, NULL, - zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT)); + zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, + &zio->io_bookmark)); } zio_buf_free(gbh, gbufsize); @@ -1022,7 +1040,8 @@ zio_rewrite_gang_members(zio_t *zio) zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, - NULL, NULL, zio->io_priority, zio->io_flags)); + NULL, NULL, zio->io_priority, zio->io_flags, + &zio->io_bookmark)); } zio_push_transform(zio, gbh, gsize, gbufsize); @@ -1153,7 +1172,8 @@ zio_write_allocate_gang_members(zio_t *zio) zio->io_checksum, zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, zio_write_allocate_gang_member_done, NULL, - zio->io_priority, zio->io_flags)); + zio->io_priority, zio->io_flags, + &zio->io_bookmark)); } else { lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); ASSERT(lsize != SPA_MINBLOCKSIZE); @@ -1263,51 +1283,6 @@ zio_dva_translate(zio_t *zio) * Read and write to physical devices * ========================================================================== */ -static void -zio_vdev_io_enter(zio_t *zio) -{ - vdev_t *tvd = zio->io_vd->vdev_top; - - mutex_enter(&tvd->vdev_io_lock); - ASSERT(zio->io_pending.list_next == NULL); - list_insert_tail(&tvd->vdev_io_pending, zio); - mutex_exit(&tvd->vdev_io_lock); -} - -static void -zio_vdev_io_exit(zio_t *zio) -{ - vdev_t *tvd = zio->io_vd->vdev_top; - - mutex_enter(&tvd->vdev_io_lock); - ASSERT(zio->io_pending.list_next != NULL); - list_remove(&tvd->vdev_io_pending, zio); - if (list_head(&tvd->vdev_io_pending) == NULL) - cv_broadcast(&tvd->vdev_io_cv); - mutex_exit(&tvd->vdev_io_lock); -} - -static void -zio_vdev_io_retry(void *vdarg) -{ - vdev_t *vd = vdarg; - zio_t *zio, *zq; - - ASSERT(vd == vd->vdev_top); - - /* XXPOLICY */ - delay(hz); - - vdev_reopen(vd, &zq); - - while ((zio = zq) != NULL) { - zq = zio->io_retry_next; - zio->io_retry_next = NULL; - dprintf("async retry #%d for I/O to %s offset %llx\n", - zio->io_retries, vdev_description(vd), zio->io_offset); - zio_next_stage_async(zio); - } -} static void zio_vdev_io_setup(zio_t *zio) @@ -1323,8 +1298,6 @@ zio_vdev_io_setup(zio_t *zio) zio->io_offset += VDEV_LABEL_START_SIZE; } - zio_vdev_io_enter(zio); - zio_next_stage(zio); } @@ -1350,7 +1323,7 @@ zio_vdev_io_done(zio_t *zio) } /* XXPOLICY */ -static boolean_t +boolean_t zio_should_retry(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -1363,11 +1336,7 @@ zio_should_retry(zio_t *zio) return 
(B_FALSE); if (zio->io_flags & ZIO_FLAG_DONT_RETRY) return (B_FALSE); - if (zio->io_retries > 300 && - (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL))) - return (B_FALSE); - if (zio->io_retries > 1 && - (zio->io_error == ECKSUM || zio->io_error == ENXIO)) + if (zio->io_retries > 0) return (B_FALSE); return (B_TRUE); @@ -1379,17 +1348,16 @@ zio_vdev_io_assess(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - zio_vdev_io_exit(zio); - ASSERT(zio->io_vsd == NULL); + if (zio_injection_enabled && !zio->io_error) + zio->io_error = zio_handle_fault_injection(zio, EIO); + /* * If the I/O failed, determine whether we should attempt to retry it. */ /* XXPOLICY */ if (zio_should_retry(zio)) { - zio_t *zq; - ASSERT(tvd == vd); ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)); @@ -1405,29 +1373,27 @@ zio_vdev_io_assess(zio_t *zio) zio->io_retries, zio_type_name[zio->io_type], vdev_description(vd), zio->io_offset); - /* - * If this is the first retry, do it immediately. - */ - /* XXPOLICY */ - if (zio->io_retries == 1) { - zio_next_stage_async(zio); - return; - } + zio_next_stage_async(zio); + return; + } + if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && + zio->io_error != ECKSUM) { /* - * This was not the first retry, so go through the - * longer enqueue/delay/vdev_reopen() process. + * Poor man's hotplug support. Even if we're done retrying this + * I/O, try to reopen the vdev to see if it's still attached. + * To avoid excessive thrashing, we only try it once a minute. + * This also has the effect of detecting when missing devices + * have come back, by polling the device once a minute. + * + * We need to do this asynchronously because we can't grab + * all the necessary locks way down here. */ - mutex_enter(&tvd->vdev_io_lock); - ASSERT(zio->io_retry_next == NULL); - zio->io_retry_next = zq = tvd->vdev_io_retry; - tvd->vdev_io_retry = zio; - mutex_exit(&tvd->vdev_io_lock); - if (zq == NULL) - (void) taskq_dispatch( - tvd->vdev_spa->spa_vdev_retry_taskq, - zio_vdev_io_retry, tvd, TQ_SLEEP); - return; + if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { + vd->vdev_last_try = gethrtime(); + tvd->vdev_reopen_wanted = 1; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); + } } zio_next_stage(zio); @@ -1502,10 +1468,9 @@ zio_checksum_verify(zio_t *zio) { if (zio->io_bp != NULL) { zio->io_error = zio_checksum_error(zio); - if (zio->io_error) { - dprintf("bad checksum on vdev %s\n", - vdev_description(zio->io_vd)); - } + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, zio->io_vd, zio, 0, 0); } zio_next_stage(zio); @@ -1660,7 +1625,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, { int error; - spa_config_enter(spa, RW_READER); + spa_config_enter(spa, RW_READER, FTAG); BP_ZERO(bp); @@ -1677,7 +1642,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, bp->blk_birth = txg; } - spa_config_exit(spa); + spa_config_exit(spa, FTAG); return (error); } @@ -1693,9 +1658,9 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) dprintf_bp(bp, "txg %llu: ", txg); - spa_config_enter(spa, RW_READER); + spa_config_enter(spa, RW_READER, FTAG); metaslab_free(spa, BP_IDENTITY(bp), txg); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); } diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index dc31527ce8..d57ab6d525 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ 
b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -170,5 +169,8 @@ zio_checksum_error(zio_t *zio) (actual_cksum.zc_word[3] - zc.zc_word[3])) return (ECKSUM); + if (zio_injection_enabled && !zio->io_error) + return (zio_handle_fault_injection(zio, ECKSUM)); + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c new file mode 100644 index 0000000000..4cada09d83 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -0,0 +1,315 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS fault injection + * + * To handle fault injection, we keep track of a series of zinject_record_t + * structures which describe which logical block(s) should be injected with a + * fault. These are kept in a global list. Each record corresponds to a given + * spa_t and maintains a special hold on the spa_t so that it cannot be deleted + * or exported while the injection record exists. + * + * Device level injection is done using the 'zi_guid' field. If this is set, it + * means that the error is destined for a particular device, not a piece of + * data. + * + * This is a rather poor data structure and algorithm, but we don't expect more + * than a few faults at any one time, so it should be sufficient for our needs. + */ + +#include <sys/arc.h> +#include <sys/zio_impl.h> +#include <sys/zfs_ioctl.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> + +uint32_t zio_injection_enabled; + +typedef struct inject_handler { + int zi_id; + spa_t *zi_spa; + zinject_record_t zi_record; + list_node_t zi_link; +} inject_handler_t; + +static list_t inject_handlers; +static krwlock_t inject_lock; +static int inject_next_id = 1; + +/* + * Returns true if the given record matches the I/O in progress. 
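+ * A record with zi_objset and zi_object both zero matches I/O to
+ * pool-wide metadata (objset 0, the MOS) by object type; any other
+ * record must match the bookmark's objset, object and level exactly,
+ * with zb_blkid falling inside [zi_start, zi_end] and the error code
+ * matching zi_error.  In both cases a nonzero zi_freq makes the fault
+ * fire on only roughly that percentage of matching I/Os.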
+ */ +static boolean_t +zio_match_handler(zbookmark_t *zb, uint64_t type, + zinject_record_t *record, int error) +{ + /* + * Check for a match against the MOS, which is based on type + */ + if (zb->zb_objset == 0 && record->zi_objset == 0 && + record->zi_object == 0) { + if (record->zi_type == DMU_OT_NONE || + type == record->zi_type) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + else + return (B_FALSE); + } + + /* + * Check for an exact match. + */ + if (zb->zb_objset == record->zi_objset && + zb->zb_object == record->zi_object && + zb->zb_level == record->zi_level && + zb->zb_blkid >= record->zi_start && + zb->zb_blkid <= record->zi_end && + error == record->zi_error) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + + return (B_FALSE); +} + +/* + * Determine if the I/O in question should return failure. Returns the errno + * to be returned to the caller. + */ +int +zio_handle_fault_injection(zio_t *zio, int error) +{ + int ret = 0; + inject_handler_t *handler; + + /* + * Ignore I/O not associated with any logical data. + */ + if (zio->io_logical == NULL) + return (0); + + /* + * Currently, we only support fault injection on reads. + */ + if (zio->io_type != ZIO_TYPE_READ) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (zio->io_spa != handler->zi_spa) + continue; + + /* Ignore device errors */ + if (handler->zi_record.zi_guid != 0) + continue; + + /* If this handler matches, return EIO */ + if (zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, + &handler->zi_record, error)) { + ret = error; + break; + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +int +zio_handle_device_injection(vdev_t *vd, int error) +{ + inject_handler_t *handler; + int ret = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + if (handler->zi_record.zi_error == error) { + /* + * For a failed open, pretend like the device + * has gone away. + */ + if (error == ENXIO) + vd->vdev_stat.vs_aux = + VDEV_AUX_OPEN_FAILED; + ret = error; + break; + } + if (handler->zi_record.zi_error == ENXIO) { + ret = EIO; + break; + } + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +/* + * Create a new handler for the given record. We add it to the list, adding + * a reference to the spa_t in the process. We increment zio_injection_enabled, + * which is the switch to trigger all fault injection. + */ +int +zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) +{ + inject_handler_t *handler; + int error; + spa_t *spa; + + /* + * If this is pool-wide metadata, make sure we unload the corresponding + * spa_t, so that the next attempt to load it will trigger the fault. + * We call spa_reset() to unload the pool appropriately. + */ + if (flags & ZINJECT_UNLOAD_SPA) + if ((error = spa_reset(name)) != 0) + return (error); + + if (!(flags & ZINJECT_NULL)) { + /* + * spa_inject_ref() will add an injection reference, which will + * prevent the pool from being removed from the namespace while + * still allowing it to be unloaded. 
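+		 * (A regular pool open would pin the pool and defeat
+		 * the ZINJECT_UNLOAD_SPA handling above.)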
+ */ + if ((spa = spa_inject_addref(name)) == NULL) + return (ENOENT); + + handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); + + rw_enter(&inject_lock, RW_WRITER); + + *id = handler->zi_id = inject_next_id++; + handler->zi_spa = spa; + handler->zi_record = *record; + list_insert_tail(&inject_handlers, handler); + atomic_add_32(&zio_injection_enabled, 1); + + rw_exit(&inject_lock); + } + + /* + * Flush the ARC, so that any attempts to read this data will end up + * going to the ZIO layer. Note that this is a little overkill, but + * we don't have the necessary ARC interfaces to do anything else, and + * fault injection isn't a performance critical path. + */ + if (flags & ZINJECT_FLUSH_ARC) + arc_flush(); + + return (0); +} + +/* + * Returns the next record with an ID greater than that supplied to the + * function. Used to iterate over all handlers in the system. + */ +int +zio_inject_list_next(int *id, char *name, size_t buflen, + zinject_record_t *record) +{ + inject_handler_t *handler; + int ret; + + mutex_enter(&spa_namespace_lock); + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id > *id) + break; + + if (handler) { + *record = handler->zi_record; + *id = handler->zi_id; + (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ret = 0; + } else { + ret = ENOENT; + } + + rw_exit(&inject_lock); + mutex_exit(&spa_namespace_lock); + + return (ret); +} + +/* + * Clear the fault handler with the given identifier, or return ENOENT if none + * exists. + */ +int +zio_clear_fault(int id) +{ + inject_handler_t *handler; + int ret; + + rw_enter(&inject_lock, RW_WRITER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id == id) + break; + + if (handler == NULL) { + ret = ENOENT; + } else { + list_remove(&inject_handlers, handler); + spa_inject_delref(handler->zi_spa); + kmem_free(handler, sizeof (inject_handler_t)); + atomic_add_32(&zio_injection_enabled, -1); + ret = 0; + } + + rw_exit(&inject_lock); + + return (ret); +} + +void +zio_inject_init(void) +{ + list_create(&inject_handlers, sizeof (inject_handler_t), + offsetof(inject_handler_t, zi_link)); +} + +void +zio_inject_fini(void) +{ + list_destroy(&inject_handlers); +} diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index a570d4d971..69fb50c2c3 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
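Taken together, zio_inject_fault(), zio_inject_list_next() and zio_clear_fault() are the kernel half of the injection framework; userland drives them through the ZFS ioctl path, which is not part of this hunk. A hypothetical in-kernel caller, with the pool name and object numbers chosen purely for illustration:

	zinject_record_t record;
	int id;

	bzero(&record, sizeof (record));
	record.zi_objset = 5;		/* objset to target */
	record.zi_object = 4;		/* object within that objset */
	record.zi_start = 0;		/* first block ... */
	record.zi_end = -1ULL;		/* ... through the last block */
	record.zi_error = EIO;		/* error to inject on reads */
	record.zi_freq = 0;		/* 0 => every matching read */

	if (zio_inject_fault("tank", ZINJECT_FLUSH_ARC, &id, &record) == 0) {
		/* matching reads now fail with EIO ... */
		(void) zio_clear_fault(id);	/* ... until cleared */
	}

Flushing the ARC at setup time matters because, as the comment above notes, a cached read would otherwise never reach the ZIO layer where the handlers are consulted.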
@@ -418,6 +417,7 @@ zvol_create_minor(zfs_cmd_t *zc) zvol_size_changed(zv, dev); + /* XXX this should handle the possible i/o error */ VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), "readonly", zvol_readonly_changed_cb, zv) == 0); @@ -500,7 +500,7 @@ zvol_set_volsize(zfs_cmd_t *zc) } tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, 1); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_hold_free(tx, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -511,9 +511,10 @@ zvol_set_volsize(zfs_cmd_t *zc) error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize, tx); - if (error == 0) - dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize, + if (error == 0) { + error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END, tx); + } dmu_tx_commit(tx); @@ -744,7 +745,7 @@ zvol_strategy(buf_t *bp) size = volsize - off; if (bp->b_flags & B_READ) { - error = dmu_read_canfail(os, ZVOL_OBJ, + error = dmu_read(os, ZVOL_OBJ, off, size, addr); } else { dmu_tx_t *tx = dmu_tx_create(os); diff --git a/usr/src/uts/common/krtld/kobj.c b/usr/src/uts/common/krtld/kobj.c index 003022d104..1cdf93e98f 100644 --- a/usr/src/uts/common/krtld/kobj.c +++ b/usr/src/uts/common/krtld/kobj.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -108,6 +107,7 @@ static int kobj_boot_open(char *, int); static int kobj_boot_close(int); static int kobj_boot_seek(int, off_t, off_t); static int kobj_boot_read(int, caddr_t, size_t); +static int kobj_boot_fstat(int, struct bootstat *); static Sym *lookup_one(struct module *, const char *); static void sym_insert(struct module *, char *, symid_t); @@ -3324,8 +3324,8 @@ kobj_open(char *filename) */ cred_t *saved_cred = curthread->t_cred; curthread->t_cred = kcred; - Errno = vn_open(filename, UIO_SYSSPACE, FREAD, 0, &vp, - 0, 0); + Errno = vn_openat(filename, UIO_SYSSPACE, FREAD, 0, &vp, + 0, 0, rootdir); curthread->t_cred = saved_cred; } kobjopen_free(ltp); @@ -3458,6 +3458,47 @@ kobj_close(intptr_t descr) (void) kobj_boot_close((int)descr); } +int +kobj_fstat(intptr_t descr, struct bootstat *buf) +{ + if (buf == NULL) + return (-1); + + if (_modrootloaded) { + vattr_t vattr; + struct vnode *vp = (struct vnode *)descr; + if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0) + return (-1); + + /* + * The vattr and bootstat structures are similar, but not + * identical. We do our best to fill in the bootstat structure + * from the contents of vattr (transferring only the ones that + * are obvious).
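+	 * Fields with no obvious vattr counterpart are left exactly as
+	 * the caller provided them, so only the members filled in below
+	 * may be relied upon.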
+ */ + + buf->st_mode = (uint32_t)vattr.va_mode; + buf->st_nlink = (uint32_t)vattr.va_nlink; + buf->st_uid = (int32_t)vattr.va_uid; + buf->st_gid = (int32_t)vattr.va_gid; + buf->st_rdev = (uint64_t)vattr.va_rdev; + buf->st_size = (uint64_t)vattr.va_size; + buf->st_atim.tv_sec = (int64_t)vattr.va_atime.tv_sec; + buf->st_atim.tv_nsec = (int64_t)vattr.va_atime.tv_nsec; + buf->st_mtim.tv_sec = (int64_t)vattr.va_mtime.tv_sec; + buf->st_mtim.tv_nsec = (int64_t)vattr.va_mtime.tv_nsec; + buf->st_ctim.tv_sec = (int64_t)vattr.va_ctime.tv_sec; + buf->st_ctim.tv_nsec = (int64_t)vattr.va_ctime.tv_nsec; + buf->st_blksize = (int32_t)vattr.va_blksize; + buf->st_blocks = (int64_t)vattr.va_nblocks; + + return (0); + } + + return (kobj_boot_fstat((int)descr, buf)); +} + + struct _buf * kobj_open_file(char *name) { @@ -4097,6 +4138,18 @@ kobj_record_file(char *filename) } #endif /* __x86 */ +static int +kobj_boot_fstat(int fd, struct bootstat *stp) +{ +#if defined(__sparc) + if (!standalone && _ioquiesced) + return (-1); + return (BOP_FSTAT(ops, fd, stp)); +#else + return (BRD_FSTAT(bfs_ops, fd, stp)); +#endif +} + /* * XXX these wrappers should go away when sparc is converted * boot from ramdisk diff --git a/usr/src/uts/common/krtld/kobj_stubs.c b/usr/src/uts/common/krtld/kobj_stubs.c index 3d972194bb..c592fb5317 100644 --- a/usr/src/uts/common/krtld/kobj_stubs.c +++ b/usr/src/uts/common/krtld/kobj_stubs.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -108,6 +107,13 @@ kobj_close(intptr_t descr) /*ARGSUSED*/ int +kobj_fstat(intptr_t descr, struct bootstat *buf) +{ + return (-1); +} + +/*ARGSUSED*/ +int kobj_filbuf(struct _buf *f) { return (-1); diff --git a/usr/src/uts/common/krtld/mapfile b/usr/src/uts/common/krtld/mapfile index 398c6dcf32..cb1f85b04a 100644 --- a/usr/src/uts/common/krtld/mapfile +++ b/usr/src/uts/common/krtld/mapfile @@ -1,13 +1,9 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -22,6 +18,9 @@ # # CDDL HEADER END # +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# #pragma ident "%Z%%M% %I% %E% SMI" # @@ -36,6 +35,7 @@ kobj_export_module; kobj_filbuf; kobj_free; + kobj_fstat; kobj_getelfsym; kobj_getmodinfo; kobj_getpagesize; diff --git a/usr/src/uts/common/os/fm.c b/usr/src/uts/common/os/fm.c index 6ff4626405..43c3acbef0 100644 --- a/usr/src/uts/common/os/fm.c +++ b/usr/src/uts/common/os/fm.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -1070,6 +1069,37 @@ fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, } } +void +fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, + uint64_t vdev_guid) +{ + if (version != ZFS_SCHEME_VERSION0) { + atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); + return; + } + + if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { + atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); + return; + } + + if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { + atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); + return; + } + + if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { + atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); + } + + if (vdev_guid != 0) { + if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { + atomic_add_64( + &erpt_kstat_data.fmri_set_failed.value.ui64, 1); + } + } +} + uint64_t fm_ena_increment(uint64_t ena) { diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c index 7ffcf66d10..0e36f3e2cc 100644 --- a/usr/src/uts/common/os/modsysfile.c +++ b/usr/src/uts/common/os/modsysfile.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
 */

@@ -73,6 +72,7 @@ static vmem_t *mod_sysfile_arena;	/* parser memory */

 char obp_bootpath[BO_MAXOBJNAME]; /* bootpath from obp */
 char svm_bootpath[BO_MAXOBJNAME]; /* bootpath redirected via rootdev */
+char zfs_bootpath[BO_MAXOBJNAME]; /* zfs bootpath, set via zfsroot */

 #if defined(_PSM_MODULES)
@@ -489,6 +489,8 @@ static struct modcmd modcmd[] = {
 	{ "set32",	MOD_SET32 },
 	{ "SET64",	MOD_SET64 },
 	{ "set64",	MOD_SET64 },
+	{ "ZFSROOT",	MOD_ZFSROOT },
+	{ "zfsroot",	MOD_ZFSROOT },
 	{ NULL,		MOD_UNKNOWN }
 };
@@ -528,6 +530,7 @@ do_sysfile_cmd(struct _buf *file, const char *cmd)
 	 */
 	case MOD_ROOTFS:
 	case MOD_SWAPFS:
+	case MOD_ZFSROOT:
 		if ((token = kobj_lex(file, tok1, sizeof (tok1))) == COLON) {
 			token = kobj_lex(file, tok1, sizeof (tok1));
 		} else {
@@ -1520,7 +1523,10 @@ setparams()
 		(void) copystr(sysp->sys_ptr, bootobjp->bo_fstype,
 		    BO_MAXOBJNAME, NULL);
 		break;
-
+	case MOD_ZFSROOT:
+		(void) copystr(sysp->sys_ptr, zfs_bootpath,
+		    BO_MAXOBJNAME, NULL);
+		break;
 	default:
 		break;
 	}
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index fe4a5c82df..2e027b7ba5 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -1741,13 +1740,10 @@ secpolicy_contract_event_choice(const cred_t *cr)
 }

 /*
- * Name:	secpolicy_gart_access
- *
- * Normal:	Verify if the subject has sufficient priveleges to make ioctls
- *		to agpgart device
- *
- * Output:	EPERM - if not privileged
+ * secpolicy_gart_access
  *
+ * Determine if the subject has sufficient privileges to make ioctls to the
+ * agpgart device.
  */
 int
 secpolicy_gart_access(const cred_t *cr)
@@ -1756,13 +1752,10 @@ secpolicy_gart_access(const cred_t *cr)
 }

 /*
- * Name:	secpolicy_gart_map
- *
- * Normal:	Verify if the subject has sufficient privelegs to map aperture
- *		range through agpgart driver
- *
- * Output:	EPERM - if not privileged
+ * secpolicy_gart_map
  *
+ * Determine if the subject has sufficient privileges to map an aperture
+ * range through the agpgart driver.
  */
 int
 secpolicy_gart_map(const cred_t *cr)
@@ -1774,10 +1767,22 @@ secpolicy_gart_map(const cred_t *cr)
 }

 /*
+ * secpolicy_zinject
+ *
+ * Determine if the subject can inject faults in the ZFS fault injection
+ * framework. Requires all privileges.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+	return (secpolicy_require_set(cr, PRIV_FULLSET, NULL));
+}
+
+/*
  * secpolicy_zfs
  *
- * Determine if the user has permission to manipulate ZFS datasets (not pools).
- * Equivalent to the SYS_MOUNT privilege.
+ * Determine if the subject has permission to manipulate ZFS datasets
+ * (not pools). Equivalent to the SYS_MOUNT privilege.
*/ int secpolicy_zfs(const cred_t *cr) diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index f82a933903..516ecc0a5a 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -657,6 +657,9 @@ FMHDRS= \ protocol.h \ util.h +FMFSHDRS= \ + zfs.h + FMIOHDRS= \ ddi.h \ pci.h \ @@ -914,6 +917,7 @@ CHECKHDRS= \ $(TAVORHDRS:%.h=ib/adapters/tavor/%.check) \ $(ISOHDRS:%.h=iso/%.check) \ $(FMHDRS:%.h=fm/%.check) \ + $(FMFSHDRS:%.h=fm/fs/%.check) \ $(FMIOHDRS:%.h=fm/io/%.check) \ $(FSHDRS:%.h=fs/%.check) \ $(LVMHDRS:%.h=lvm/%.check) \ @@ -949,6 +953,7 @@ CHECKHDRS= \ $(ROOTISOHDRS) \ $(ROOTFMHDRS) \ $(ROOTFMIOHDRS) \ + $(ROOTFMFSHDRS) \ $(ROOTFSHDRS) \ $(ROOTIBDHDRS) \ $(ROOTIBHDRS) \ @@ -992,7 +997,8 @@ install_h: \ $(ROOTDCAMHDRS) \ $(ROOTISOHDRS) \ $(ROOTFMHDRS) \ - $(ROOTFMIOHDRS) \ + $(ROOTFMFSHDRS) \ + $(ROOTFMIOHDRS) \ $(ROOTFSHDRS) \ $(ROOTIBDHDRS) \ $(ROOTIBHDRS) \ diff --git a/usr/src/uts/common/sys/Makefile.syshdrs b/usr/src/uts/common/sys/Makefile.syshdrs index cdc3436049..d9c363b48b 100644 --- a/usr/src/uts/common/sys/Makefile.syshdrs +++ b/usr/src/uts/common/sys/Makefile.syshdrs @@ -1,5 +1,5 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -18,10 +18,13 @@ av/%.check: av/%.h fm/%.check: fm/%.h $(DOT_H_CHECK) -fm/cpu/%.check: fm/cpu/%.h +fm/cpu/%.check: fm/cpu/%.h $(DOT_H_CHECK) -fm/io/%.check: fm/io/%.h +fm/fs/%.check: fm/fs/%.h + $(DOT_H_CHECK) + +fm/io/%.check: fm/io/%.h $(DOT_H_CHECK) fs/%.check: fs/%.h @@ -129,6 +132,7 @@ ROOTDIRS= \ $(ROOTDIR)/iso \ $(ROOTDIR)/fm \ $(ROOTDIR)/fm/cpu \ + $(ROOTDIR)/fm/fs \ $(ROOTDIR)/fm/io \ $(ROOTDIR)/fs \ $(ROOTDIR)/ib \ @@ -187,6 +191,7 @@ ROOTISOHDRS= $(ISOHDRS:%=$(ROOTDIR)/iso/%) ROOTFMHDRS= $(FMHDRS:%=$(ROOTDIR)/fm/%) ROOTFMCPUHDRS= $(FMCPUHDRS:%=$(ROOTDIR)/fm/cpu/%) ROOTFMIOHDRS= $(FMIOHDRS:%=$(ROOTDIR)/fm/io/%) +ROOTFMFSHDRS= $(FMFSHDRS:%=$(ROOTDIR)/fm/fs/%) ROOTFSHDRS= $(FSHDRS:%=$(ROOTDIR)/fs/%) diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h new file mode 100644 index 0000000000..aa5c7ee0d7 --- /dev/null +++ b/usr/src/uts/common/sys/fm/fs/zfs.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FM_FS_ZFS_H +#define _SYS_FM_FS_ZFS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_ERROR_CLASS "fs.zfs" + +#define FM_EREPORT_ZFS_CHECKSUM "checksum" +#define FM_EREPORT_ZFS_IO "io" +#define FM_EREPORT_ZFS_DATA "data" +#define FM_EREPORT_ZFS_POOL "zpool" +#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" +#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" +#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" +#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" +#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" +#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" +#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" + +#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" +#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" + +#define FM_RESOURCE_OK "ok" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_FS_ZFS_H */ diff --git a/usr/src/uts/common/sys/fm/protocol.h b/usr/src/uts/common/sys/fm/protocol.h index 89b761ef6c..1afa67f66b 100644 --- a/usr/src/uts/common/sys/fm/protocol.h +++ b/usr/src/uts/common/sys/fm/protocol.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
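The FM_EREPORT_ZFS_* strings above are subclasses only; a full ereport class joins the generic "ereport" prefix, ZFS_ERROR_CLASS, and the subclass with dots. A minimal sketch of that assembly follows; the helper name and buffer handling are illustrative assumptions, not code from this change (FM_EREPORT_CLASS comes from <sys/fm/protocol.h>):

#include <sys/systm.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

/*
 * Hypothetical helper: joins the "ereport" prefix, the "fs.zfs" error
 * class, and a subclass such as FM_EREPORT_ZFS_CHECKSUM, yielding
 * "ereport.fs.zfs.checksum".
 */
static void
zfs_ereport_class(char *buf, size_t len, const char *subclass)
{
	(void) snprintf(buf, len, "%s.%s.%s",
	    FM_EREPORT_CLASS, ZFS_ERROR_CLASS, subclass);
}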
@@ -168,6 +167,7 @@ extern "C" { #define FM_FMRI_SCHEME_MOD "mod" #define FM_FMRI_SCHEME_PKG "pkg" #define FM_FMRI_SCHEME_LEGACY "legacy-hc" +#define FM_FMRI_SCHEME_ZFS "zfs" /* Scheme versions */ #define FMD_SCHEME_VERSION0 0 @@ -187,6 +187,8 @@ extern "C" { #define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 #define LEGACY_SCHEME_VERSION0 0 #define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 +#define ZFS_SCHEME_VERSION0 0 +#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 /* hc scheme member names */ #define FM_FMRI_HC_SERIAL_ID "serial" @@ -253,6 +255,10 @@ extern "C" { #define FM_FMRI_MOD_ID "mod-id" #define FM_FMRI_MOD_DESC "mod-desc" +/* zfs scheme member names */ +#define FM_FMRI_ZFS_POOL "pool" +#define FM_FMRI_ZFS_VDEV "vdev" + extern nv_alloc_t *fm_nva_xcreate(char *, size_t); extern void fm_nva_xdestroy(nv_alloc_t *); @@ -277,6 +283,7 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, const char *, uint64_t); extern void fm_authority_set(nvlist_t *, int, const char *, const char *, const char *, const char *); +extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); extern uint64_t fm_ena_increment(uint64_t); extern uint64_t fm_ena_generate(uint64_t, uchar_t); diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 65425c829c..0fa884dcaa 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -133,6 +133,8 @@ uint64_t zfs_prop_default_numeric(zfs_prop_t); #define ZPOOL_CONFIG_STATS "stats" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_ERRCOUNT "error_count" +#define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -304,9 +306,25 @@ typedef enum zfs_ioc { ZFS_IOC_ROLLBACK, ZFS_IOC_RENAME, ZFS_IOC_RECVBACKUP, - ZFS_IOC_SENDBACKUP + ZFS_IOC_SENDBACKUP, + ZFS_IOC_INJECT_FAULT, + ZFS_IOC_CLEAR_FAULT, + ZFS_IOC_INJECT_LIST_NEXT, + ZFS_IOC_ERROR_LOG, + ZFS_IOC_CLEAR, + ZFS_IOC_BOOKMARK_NAME } zfs_ioc_t; +/* + * Internal SPA load state. Used by FMA diagnosis engine. + */ +typedef enum { + SPA_LOAD_NONE, /* no load in progress */ + SPA_LOAD_OPEN, /* normal open */ + SPA_LOAD_IMPORT, /* import in progress */ + SPA_LOAD_TRYIMPORT /* tryimport in progress */ +} spa_load_state_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/kobj.h b/usr/src/uts/common/sys/kobj.h index 7d2bd0922e..9276aa370f 100644 --- a/usr/src/uts/common/sys/kobj.h +++ b/usr/src/uts/common/sys/kobj.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
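The zfs scheme members and the fm_fmri_zfs_set() prototype above pair with the os/fm.c implementation earlier in this change. A minimal usage sketch, where the wrapper function and GUID arguments are placeholders rather than commit code:

#include <sys/nvpair.h>
#include <sys/fm/protocol.h>

/*
 * Hypothetical wrapper: builds a zfs-scheme FMRI.  Per the os/fm.c
 * implementation above, the result carries version=0, scheme="zfs",
 * pool=<pool_guid>, and, when vdev_guid is nonzero, vdev=<vdev_guid>.
 * The caller frees it via fm_nvlist_destroy(fmri, FM_NVA_FREE).
 */
static nvlist_t *
zfs_fmri_create(uint64_t pool_guid, uint64_t vdev_guid)
{
	nvlist_t *fmri = fm_nvlist_create(NULL);

	fm_fmri_zfs_set(fmri, FM_ZFS_SCHEME_VERSION, pool_guid, vdev_guid);
	return (fmri);
}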
@@ -34,6 +33,7 @@

 #include <sys/machelf.h>
 #include <sys/vmem.h>
 #include <sys/sdt.h>
+#include <sys/bootstat.h>

 #ifdef __cplusplus
 extern "C" {
@@ -162,6 +162,7 @@ extern uintptr_t kobj_getsymvalue(char *, int);
 extern char *kobj_getsymname(uintptr_t, ulong_t *);
 extern char *kobj_searchsym(struct module *, uintptr_t, ulong_t *);

+extern int kobj_fstat(intptr_t, struct bootstat *);
 extern intptr_t kobj_open(char *);
 extern int kobj_path_exists(char *, int);
 extern struct _buf *kobj_open_path(char *, int, int);
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index 9653a58b0e..beabb63818 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -141,6 +140,7 @@ int secpolicy_vnode_setdac(const cred_t *, uid_t);
 int secpolicy_vnode_setid_retain(const cred_t *, boolean_t);
 int secpolicy_vnode_setids_setgids(const cred_t *, gid_t);
 int secpolicy_vnode_stky_modify(const cred_t *);
+int secpolicy_zinject(const cred_t *);
 int secpolicy_zfs(const cred_t *);
 void secpolicy_setid_clear(vattr_t *, cred_t *);
diff --git a/usr/src/uts/common/sys/sysconf.h b/usr/src/uts/common/sys/sysconf.h
index 4594d91287..654436a115 100644
--- a/usr/src/uts/common/sys/sysconf.h
+++ b/usr/src/uts/common/sys/sysconf.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1990-2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */

@@ -72,6 +71,7 @@ struct modcmd {
 #define	MOD_UNKNOWN	9	/* unknown command */
 #define	MOD_SET32	10	/* like MOD_SET but -only- on 32-bit kernel */
 #define	MOD_SET64	11	/* like MOD_SET but -only- on 64-bit kernel */
+#define	MOD_ZFSROOT	12	/* use zfs as the root filesystem */

 /*
  * Commands for mod_sysctl()
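secpolicy_zinject() deliberately demands the full privilege set, a higher bar than the SYS_MOUNT check used for dataset operations. A sketch of how a fault-injection ioctl might be gated; the handler shape is an assumption, and only the policy call comes from this change:

#include <sys/cred.h>
#include <sys/policy.h>

/*
 * Hypothetical gate for the new fault-injection ioctls: returns 0 for
 * fully privileged callers, EPERM otherwise (secpolicy_zinject() maps
 * the failure to EPERM itself via secpolicy_require_set()).
 */
static int
zinject_policy_check(cred_t *cr)
{
	return (secpolicy_zinject(cr));
}

The MOD_ZFSROOT command above is the sysconf.h side of the modsysfile.c parsing earlier in this change: an /etc/system line of the form "zfsroot: <bootpath>" is copied into zfs_bootpath for mountroot (the exact bootpath value format is not spelled out here).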

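Relating back to the spa_load_state_t enum added to sys/fs/zfs.h above: the enum is what the FMA diagnosis engine consumes, presumably via the pool_context ereport payload member defined in sys/fm/fs/zfs.h. A hypothetical pretty-printer for such a consumer:

#include <sys/fs/zfs.h>

/* Hypothetical helper: printable names for spa_load_state_t values. */
static const char *
spa_load_state_name(spa_load_state_t state)
{
	switch (state) {
	case SPA_LOAD_NONE:
		return ("none");
	case SPA_LOAD_OPEN:
		return ("open");
	case SPA_LOAD_IMPORT:
		return ("import");
	case SPA_LOAD_TRYIMPORT:
		return ("tryimport");
	}
	return ("unknown");
}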