60 files changed, 2869 insertions, 2026 deletions
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index f485fe9f7c..057f207bc6 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -437,6 +436,7 @@ struct arc_buf_hdr {
 
 	kmutex_t		b_freeze_lock;
 	zio_cksum_t		*b_freeze_cksum;
+	void			*b_thawed;
 
 	arc_buf_hdr_t		*b_hash_next;
 	arc_buf_t		*b_buf;
@@ -545,8 +545,8 @@ static buf_hash_table_t buf_hash_table;
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define	HDR_LOCK(buf) \
-	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define	HDR_LOCK(hdr) \
+	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
 uint64_t zfs_crc64_table[256];
 
@@ -664,6 +664,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+	hdr->b_dva.dva_word[0] = 0;
+	hdr->b_dva.dva_word[1] = 0;
+	hdr->b_birth = 0;
+	hdr->b_cksum0 = 0;
+}
+
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 {
@@ -797,7 +806,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
 	arc_buf_t *buf = vbuf;
 
 	bzero(buf, sizeof (arc_buf_t));
-	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+	rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
 	return (0);
@@ -826,7 +836,8 @@ buf_dest(void *vbuf, void *unused)
 {
 	arc_buf_t *buf = vbuf;
 
-	rw_destroy(&buf->b_lock);
+	mutex_destroy(&buf->b_evict_lock);
+	rw_destroy(&buf->b_data_lock);
 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
@@ -941,6 +952,11 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
+	kmutex_t *hash_lock;
+
+	hash_lock = HDR_LOCK(buf->b_hdr);
+	mutex_enter(hash_lock);
+
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
 		if (buf->b_hdr->b_state != arc_anon)
 			panic("modifying non-anon buffer!");
@@ -954,18 +970,32 @@ arc_buf_thaw(arc_buf_t *buf)
 		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		buf->b_hdr->b_freeze_cksum = NULL;
 	}
+
+	if (zfs_flags & ZFS_DEBUG_MODIFY) {
+		if (buf->b_hdr->b_thawed)
+			kmem_free(buf->b_hdr->b_thawed, 1);
+		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+	}
+
 	mutex_exit(&buf->b_hdr->b_freeze_lock);
+	mutex_exit(hash_lock);
 }
 
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
+	kmutex_t *hash_lock;
+
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
+	hash_lock = HDR_LOCK(buf->b_hdr);
+	mutex_enter(hash_lock);
+
 	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
 	    buf->b_hdr->b_state == arc_anon);
 	arc_cksum_compute(buf, B_FALSE);
+	mutex_exit(hash_lock);
 }
 
 static void
@@ -1037,7 +1067,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 	ASSERT(new_state != old_state);
 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
-	ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon);
 	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
 
 	from_delta = to_delta = ab->b_datacnt * ab->b_size;
@@ -1059,7 +1088,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 
 			/*
 			 * If prefetching out of the ghost cache,
-			 * we will have a non-null datacnt.
+			 * we will have a non-zero datacnt.
 			 */
 			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
 				/* ghost elements have a ghost size */
@@ -1095,9 +1124,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 	}
 
 	ASSERT(!BUF_EMPTY(ab));
-	if (new_state == arc_anon) {
+	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
 		buf_hash_remove(ab);
-	}
 
 	/* adjust state sizes */
 	if (to_delta)
@@ -1254,7 +1282,6 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr;
 
-	rw_enter(&buf->b_lock, RW_WRITER);
 	ASSERT(buf->b_data != NULL);
 	hdr = buf->b_hdr;
 	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
@@ -1263,7 +1290,6 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 	buf->b_private = NULL;
 
 	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
-	rw_exit(&buf->b_lock);
 }
 
 static arc_buf_t *
@@ -1299,16 +1325,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
 	 * must verify b_data != NULL to know if the add_ref
 	 * was successful.
 	 */
-	rw_enter(&buf->b_lock, RW_READER);
+	mutex_enter(&buf->b_evict_lock);
 	if (buf->b_data == NULL) {
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		return;
 	}
-	hdr = buf->b_hdr;
-	ASSERT(hdr != NULL);
-	hash_lock = HDR_LOCK(hdr);
+	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
-	rw_exit(&buf->b_lock);
+	hdr = buf->b_hdr;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+	mutex_exit(&buf->b_evict_lock);
 
 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 	add_reference(hdr, hash_lock, tag);
@@ -1394,6 +1420,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
 		continue;
 	*bufp = buf->b_next;
+	buf->b_next = NULL;
 
 	ASSERT(buf->b_efunc == NULL);
 
@@ -1442,23 +1469,21 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 
 	if (!BUF_EMPTY(hdr)) {
 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
-		bzero(&hdr->b_dva, sizeof (dva_t));
-		hdr->b_birth = 0;
-		hdr->b_cksum0 = 0;
+		buf_discard_identity(hdr);
 	}
 	while (hdr->b_buf) {
 		arc_buf_t *buf = hdr->b_buf;
 
 		if (buf->b_efunc) {
 			mutex_enter(&arc_eviction_mtx);
-			rw_enter(&buf->b_lock, RW_WRITER);
+			mutex_enter(&buf->b_evict_lock);
 			ASSERT(buf->b_hdr != NULL);
 			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
 			hdr->b_buf = buf->b_next;
 			buf->b_hdr = &arc_eviction_hdr;
 			buf->b_next = arc_eviction_list;
 			arc_eviction_list = buf;
-			rw_exit(&buf->b_lock);
+			mutex_exit(&buf->b_evict_lock);
 			mutex_exit(&arc_eviction_mtx);
 		} else {
 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1468,6 +1493,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		hdr->b_freeze_cksum = NULL;
 	}
+	if (hdr->b_thawed) {
+		kmem_free(hdr->b_thawed, 1);
+		hdr->b_thawed = NULL;
+	}
 
 	ASSERT(!list_link_active(&hdr->b_arc_node));
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1488,6 +1517,9 @@ arc_buf_free(arc_buf_t *buf, void *tag)
 		kmutex_t *hash_lock = HDR_LOCK(hdr);
 
 		mutex_enter(hash_lock);
+		hdr = buf->b_hdr;
+		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
 		(void) remove_reference(hdr, hash_lock, tag);
 		if (hdr->b_datacnt > 1) {
 			arc_buf_destroy(buf, FALSE, TRUE);
@@ -1512,12 +1544,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
 		if (destroy_hdr)
 			arc_hdr_destroy(hdr);
 	} else {
-		if (remove_reference(hdr, NULL, tag) > 0) {
-			ASSERT(HDR_IO_ERROR(hdr));
+		if (remove_reference(hdr, NULL, tag) > 0)
 			arc_buf_destroy(buf, FALSE, TRUE);
-		} else {
+		else
 			arc_hdr_destroy(hdr);
-		}
 	}
 }
 
@@ -1535,6 +1565,8 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 	}
 
 	mutex_enter(hash_lock);
+	hdr = buf->b_hdr;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT(hdr->b_state != arc_anon);
 	ASSERT(buf->b_data != NULL);
 
@@ -1613,7 +1645,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 			ASSERT(ab->b_datacnt > 0);
 			while (ab->b_buf) {
 				arc_buf_t *buf = ab->b_buf;
-				if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+				if (!mutex_tryenter(&buf->b_evict_lock)) {
 					missed += 1;
 					break;
 				}
@@ -1635,9 +1667,9 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
 					buf->b_next = arc_eviction_list;
 					arc_eviction_list = buf;
 					mutex_exit(&arc_eviction_mtx);
-					rw_exit(&buf->b_lock);
+					mutex_exit(&buf->b_evict_lock);
 				} else {
-					rw_exit(&buf->b_lock);
+					mutex_exit(&buf->b_evict_lock);
 					arc_buf_destroy(buf,
 					    buf->b_data == stolen, TRUE);
 				}
@@ -1854,9 +1886,9 @@ arc_do_user_evicts(void)
 	while (arc_eviction_list != NULL) {
 		arc_buf_t *buf = arc_eviction_list;
 		arc_eviction_list = buf->b_next;
-		rw_enter(&buf->b_lock, RW_WRITER);
+		mutex_enter(&buf->b_evict_lock);
 		buf->b_hdr = NULL;
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		mutex_exit(&arc_eviction_mtx);
 
 		if (buf->b_efunc != NULL)
@@ -2438,7 +2470,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
-	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+	if (zio == NULL || zio->io_error == 0)
+		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
@@ -2452,6 +2485,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 		*bufp = NULL;
 	} else {
 		*bufp = buf;
+		ASSERT(buf->b_data);
 	}
 }
 
@@ -2606,13 +2640,22 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
 {
 	int err;
 
+	if (pbuf == NULL) {
+		/*
+		 * XXX This happens from traverse callback funcs, for
+		 * the objset_phys_t block.
+		 */
+		return (arc_read_nolock(pio, spa, bp, done, private, priority,
+		    zio_flags, arc_flags, zb));
+	}
+
 	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
 	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
-	rw_enter(&pbuf->b_lock, RW_READER);
+	rw_enter(&pbuf->b_data_lock, RW_READER);
 
 	err = arc_read_nolock(pio, spa, bp, done, private, priority,
 	    zio_flags, arc_flags, zb);
-	rw_exit(&pbuf->b_lock);
+	rw_exit(&pbuf->b_data_lock);
 
 	return (err);
 }
@@ -2721,9 +2764,7 @@ top:
 			if (exists) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
-				bzero(&hdr->b_dva, sizeof (dva_t));
-				hdr->b_birth = 0;
-				hdr->b_cksum0 = 0;
+				buf_discard_identity(hdr);
 				(void) arc_buf_remove_ref(buf, private);
 				goto top; /* restart the IO request */
 			}
@@ -2901,14 +2942,14 @@ arc_buf_evict(arc_buf_t *buf)
 	kmutex_t *hash_lock;
 	arc_buf_t **bufp;
 
-	rw_enter(&buf->b_lock, RW_WRITER);
+	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 	if (hdr == NULL) {
 		/*
 		 * We are in arc_do_user_evicts().
 		 */
 		ASSERT(buf->b_data == NULL);
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		return (0);
 	} else if (buf->b_data == NULL) {
 		arc_buf_t copy = *buf; /* structure assignment */
@@ -2917,14 +2958,15 @@ arc_buf_evict(arc_buf_t *buf)
 		 * but let arc_do_user_evicts() do the reaping.
 		 */
 		buf->b_efunc = NULL;
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		VERIFY(copy.b_efunc(&copy) == 0);
 		return (1);
 	}
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
+	hdr = buf->b_hdr;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
-	ASSERT(buf->b_hdr == hdr);
 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
 
@@ -2943,6 +2985,7 @@ arc_buf_evict(arc_buf_t *buf)
 		arc_state_t *old_state = hdr->b_state;
 		arc_state_t *evicted_state;
 
+		ASSERT(hdr->b_buf == NULL);
 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
 
 		evicted_state =
@@ -2960,12 +3003,13 @@ arc_buf_evict(arc_buf_t *buf)
 		mutex_exit(&old_state->arcs_mtx);
 	}
 	mutex_exit(hash_lock);
-	rw_exit(&buf->b_lock);
+	mutex_exit(&buf->b_evict_lock);
 
 	VERIFY(buf->b_efunc(buf) == 0);
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 	buf->b_hdr = NULL;
+	buf->b_next = NULL;
 	kmem_cache_free(buf_cache, buf);
 	return (1);
 }
@@ -2980,12 +3024,17 @@ void
 arc_release(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_lock;
+	kmutex_t *hash_lock = NULL;
 	l2arc_buf_hdr_t *l2hdr;
 	uint64_t buf_size;
-	boolean_t released = B_FALSE;
 
-	rw_enter(&buf->b_lock, RW_WRITER);
+	/*
+	 * It would be nice to assert that if it's DMU metadata (level >
+	 * 0 || it's the dnode file), then it must be syncing context.
+	 * But we don't know that information at this level.
+	 */
+
+	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 
 	/* this buffer is not on any list */
@@ -2993,15 +3042,12 @@ arc_release(arc_buf_t *buf, void *tag)
 
 	if (hdr->b_state == arc_anon) {
 		/* this buffer is already released */
-		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
-		ASSERT(BUF_EMPTY(hdr));
 		ASSERT(buf->b_efunc == NULL);
-		arc_buf_thaw(buf);
-		rw_exit(&buf->b_lock);
-		released = B_TRUE;
 	} else {
 		hash_lock = HDR_LOCK(hdr);
 		mutex_enter(hash_lock);
+		hdr = buf->b_hdr;
+		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	}
 
 	l2hdr = hdr->b_l2hdr;
@@ -3011,9 +3057,6 @@ arc_release(arc_buf_t *buf, void *tag)
 		buf_size = hdr->b_size;
 	}
 
-	if (released)
-		goto out;
-
 	/*
 	 * Do we have more than one buf?
 	 */
@@ -3027,14 +3070,14 @@ arc_release(arc_buf_t *buf, void *tag)
 
 		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
 		/*
-		 * Pull the data off of this buf and attach it to
-		 * a new anonymous buf.
+		 * Pull the data off of this hdr and attach it to
+		 * a new anonymous hdr.
 		 */
 		(void) remove_reference(hdr, hash_lock, tag);
 		bufp = &hdr->b_buf;
 		while (*bufp != buf)
 			bufp = &(*bufp)->b_next;
-		*bufp = (*bufp)->b_next;
+		*bufp = buf->b_next;
 		buf->b_next = NULL;
 
 		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
@@ -3062,26 +3105,25 @@ arc_release(arc_buf_t *buf, void *tag)
 		nhdr->b_freeze_cksum = NULL;
 		(void) refcount_add(&nhdr->b_refcnt, tag);
 		buf->b_hdr = nhdr;
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		atomic_add_64(&arc_anon->arcs_size, blksz);
 	} else {
-		rw_exit(&buf->b_lock);
+		mutex_exit(&buf->b_evict_lock);
 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
 		ASSERT(!list_link_active(&hdr->b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-		arc_change_state(arc_anon, hdr, hash_lock);
+		if (hdr->b_state != arc_anon)
+			arc_change_state(arc_anon, hdr, hash_lock);
 		hdr->b_arc_access = 0;
-		mutex_exit(hash_lock);
+		if (hash_lock)
+			mutex_exit(hash_lock);
 
-		bzero(&hdr->b_dva, sizeof (dva_t));
-		hdr->b_birth = 0;
-		hdr->b_cksum0 = 0;
+		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 
-out:
 	if (l2hdr) {
 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
 		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3090,14 +3132,27 @@ out:
 	}
 }
 
+/*
+ * Release this buffer.  If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+    zbookmark_t *zb)
+{
+	arc_release(buf, tag);
+	return (0);
+}
+
 int
 arc_released(arc_buf_t *buf)
 {
 	int released;
 
-	rw_enter(&buf->b_lock, RW_READER);
+	mutex_enter(&buf->b_evict_lock);
 	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
-	rw_exit(&buf->b_lock);
+	mutex_exit(&buf->b_evict_lock);
 	return (released);
 }
 
@@ -3106,9 +3161,9 @@ arc_has_callback(arc_buf_t *buf)
 {
 	int callback;
 
-	rw_enter(&buf->b_lock, RW_READER);
+	mutex_enter(&buf->b_evict_lock);
 	callback = (buf->b_efunc != NULL);
-	rw_exit(&buf->b_lock);
+	mutex_exit(&buf->b_evict_lock);
 	return (callback);
 }
 
@@ -3118,9 +3173,9 @@ arc_referenced(arc_buf_t *buf)
 {
 	int referenced;
 
-	rw_enter(&buf->b_lock, RW_READER);
+	mutex_enter(&buf->b_evict_lock);
 	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
-	rw_exit(&buf->b_lock);
+	mutex_exit(&buf->b_evict_lock);
 	return (referenced);
 }
 #endif
@@ -3173,8 +3228,8 @@ arc_write_done(zio_t *zio)
 	/*
 	 * If the block to be written was all-zero, we may have
 	 * compressed it away.  In this case no write was performed
-	 * so there will be no dva/birth-date/checksum.  The buffer
-	 * must therefor remain anonymous (and uncached).
+	 * so there will be no dva/birth/checksum.  The buffer must
+	 * therefore remain anonymous (and uncached).
 	 */
 	if (!BUF_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
@@ -3278,9 +3333,7 @@ arc_free(spa_t *spa, const blkptr_t *bp)
 			if (HDR_IN_HASH_TABLE(ab))
 				buf_hash_remove(ab);
 			ab->b_arc_access = 0;
-			bzero(&ab->b_dva, sizeof (dva_t));
-			ab->b_birth = 0;
-			ab->b_cksum0 = 0;
+			buf_discard_identity(ab);
 			ab->b_buf->b_efunc = NULL;
 			ab->b_buf->b_private = NULL;
 			mutex_exit(hash_lock);
@@ -3974,11 +4027,11 @@ l2arc_read_done(zio_t *zio)
 	ASSERT(cb != NULL);
 	buf = cb->l2rcb_buf;
 	ASSERT(buf != NULL);
-	hdr = buf->b_hdr;
-	ASSERT(hdr != NULL);
 
-	hash_lock = HDR_LOCK(hdr);
+	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
+	hdr = buf->b_hdr;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
 	 * Check this survived the L2ARC journey.
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
index 830b6c1a42..8f8c5e1fcf 100644
--- a/usr/src/uts/common/fs/zfs/bplist.c
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -175,23 +175,26 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
 		return (err);
 	}
 
-	if (*itorp >= bpl->bpl_phys->bpl_entries) {
-		mutex_exit(&bpl->bpl_lock);
-		return (ENOENT);
-	}
+	do {
+		if (*itorp >= bpl->bpl_phys->bpl_entries) {
+			mutex_exit(&bpl->bpl_lock);
+			return (ENOENT);
+		}
 
-	blk = *itorp >> bpl->bpl_bpshift;
-	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
+		blk = *itorp >> bpl->bpl_bpshift;
+		off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
 
-	err = bplist_cache(bpl, blk);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
+		err = bplist_cache(bpl, blk);
+		if (err) {
+			mutex_exit(&bpl->bpl_lock);
+			return (err);
+		}
+
+		bparray = bpl->bpl_cached_dbuf->db_data;
+		*bp = bparray[off];
+		(*itorp)++;
+	} while (bp->blk_birth == 0);
 
-	bparray = bpl->bpl_cached_dbuf->db_data;
-	*bp = bparray[off];
-	(*itorp)++;
 	mutex_exit(&bpl->bpl_lock);
 	return (0);
 }
@@ -206,8 +209,10 @@ bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
 	ASSERT(!BP_IS_HOLE(bp));
 	mutex_enter(&bpl->bpl_lock);
 	err = bplist_hold(bpl);
-	if (err)
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
 		return (err);
+	}
 
 	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
 	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index c211ff79dd..e1cd431acb 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -363,6 +363,7 @@ dbuf_verify(dmu_buf_impl_t *db)
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+	    (db->db_buf == NULL || db->db_buf->b_data) &&
 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
 		/*
@@ -477,8 +478,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
-	mutex_exit(&db->db_mtx);
-	dbuf_rele(db, NULL);
+	dbuf_rele_and_unlock(db, NULL);
 }
 
 static void
@@ -549,7 +549,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 	else
 		pbuf = db->db_objset->os_phys_buf;
 
-	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+	(void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
@@ -727,7 +727,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 
 	/* free this block */
 	if (!BP_IS_HOLE(bp))
-		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);
+		zio_free(db->db_dnode->dn_objset->os_spa, txg, bp);
 
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	/*
@@ -921,6 +921,26 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 	dnode_willuse_space(db->db_dnode, size-osize, tx);
 }
 
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+	objset_t *os = db->db_dnode->dn_objset;
+	zbookmark_t zb;
+
+	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+	ASSERT(arc_released(os->os_phys_buf) ||
+	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
+	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+	zb.zb_objset = os->os_dsl_dataset ?
+	    os->os_dsl_dataset->ds_object : 0;
+	zb.zb_object = db->db.db_object;
+	zb.zb_level = db->db_level;
+	zb.zb_blkid = db->db_blkid;
+	(void) arc_release_bp(db->db_buf, db,
+	    db->db_blkptr, os->os_spa, &zb);
+}
+
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
@@ -1717,7 +1737,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 			else
 				pbuf = dn->dn_objset->os_phys_buf;
 
-			(void) arc_read(NULL, dn->dn_objset->os_spa,
+			(void) dsl_read(NULL, dn->dn_objset->os_spa,
 			    bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zb);
@@ -2463,7 +2483,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 			if (BP_IS_HOLE(db->db_blkptr)) {
 				arc_buf_thaw(data);
 			} else {
-				arc_release(data, db);
+				dbuf_release_bp(db);
 			}
 		}
 	}
diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c
index 852fd1cdc4..64cbcc1f92 100644
--- a/usr/src/uts/common/fs/zfs/ddt.c
+++ b/usr/src/uts/common/fs/zfs/ddt.c
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -35,6 +34,7 @@
 #include <sys/dsl_pool.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
 
 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
 	&ddt_zap_ops,
@@ -160,7 +160,7 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 	    ddt->ddt_object[type][class], dde));
 }
 
-static int
+int
 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
     ddt_entry_t *dde, dmu_tx_t *tx)
 {
@@ -245,12 +245,13 @@ ddt_bp_create(enum zio_checksum checksum,
 		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
 
 	bp->blk_cksum = ddk->ddk_cksum;
+	bp->blk_fill = 1;
 
 	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
 	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
 	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
 	BP_SET_CHECKSUM(bp, checksum);
-	BP_SET_TYPE(bp, DMU_OT_NONE);
+	BP_SET_TYPE(bp, DMU_OT_DEDUP);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
@@ -996,10 +997,17 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 			ddt_object_create(ddt, ntype, nclass, tx);
 		VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
 
-		if (dp->dp_scrub_func != SCRUB_FUNC_NONE &&
-		    oclass > nclass &&
-		    nclass <= dp->dp_scrub_ddt_class_max)
-			dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde);
+		/*
+		 * If the class changes, the order that we scan this bp
+		 * changes.  If it decreases, we could miss it, so
+		 * scan it right now.  (This covers both class changing
+		 * while we are doing ddt_walk(), and when we are
+		 * traversing.)
+		 */
+		if (nclass < oclass) {
+			dsl_scan_ddt_entry(dp->dp_scan,
+			    ddt->ddt_checksum, dde, tx);
+		}
 	}
 }
 
@@ -1013,7 +1021,6 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 	if (avl_numnodes(&ddt->ddt_tree) == 0)
 		return;
 
-	ASSERT(spa_sync_pass(spa) == 1);
 	ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
 
 	if (spa->spa_ddt_stat_object == 0) {
@@ -1081,6 +1088,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
 					    ddb->ddb_type, ddb->ddb_class,
 					    &ddb->ddb_cursor, dde);
 				}
+				dde->dde_type = ddb->ddb_type;
+				dde->dde_class = ddb->ddb_class;
 				if (error == 0)
 					return (0);
 				if (error != ENOENT)
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 0be72aa4f2..582089b8e8 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -84,7 +84,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	byteswap_uint8_array,	TRUE,	"FUID table"		},
 	{	byteswap_uint64_array,	TRUE,	"FUID table size"	},
 	{	zap_byteswap,		TRUE,	"DSL dataset next clones"},
-	{	zap_byteswap,		TRUE,	"scrub work queue"	},
+	{	zap_byteswap,		TRUE,	"scan work queue"	},
 	{	zap_byteswap,		TRUE,	"ZFS user/group used"	},
 	{	zap_byteswap,		TRUE,	"ZFS user/group quota"	},
 	{	zap_byteswap,		TRUE,	"snapshot refcount tags"},
@@ -93,7 +93,10 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	byteswap_uint8_array,	TRUE,	"System attributes"	},
 	{	zap_byteswap,		TRUE,	"SA master node"	},
 	{	zap_byteswap,		TRUE,	"SA attr registration"	},
-	{	zap_byteswap,		TRUE,	"SA attr layouts"	}, };
+	{	zap_byteswap,		TRUE,	"SA attr layouts"	},
+	{	zap_byteswap,		TRUE,	"scan translations"	},
+	{	byteswap_uint8_array,	FALSE,	"deduplicated block"	},
+};
 
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
@@ -1630,6 +1633,7 @@ byteswap_uint8_array(void *vbuf, size_t size)
 void
 dmu_init(void)
 {
+	zfs_dbgmsg_init();
 	dbuf_init();
 	dnode_init();
 	zfetch_init();
@@ -1649,4 +1653,5 @@ dmu_fini(void)
 	l2arc_fini();
 	xuio_stat_fini();
 	sa_cache_fini();
+	zfs_dbgmsg_fini();
 }
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 210d693051..546cd98b84 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -259,11 +259,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
 		/*
-		 * NB: when bprewrite scrub can change the bp,
+		 * XXX when bprewrite scrub can change the bp,
 		 * and this is called from dmu_objset_open_ds_os, the bp
 		 * could change, and we'll need a lock.
 		 */
-		err = arc_read_nolock(NULL, spa, os->os_rootbp,
+		err = dsl_read_nolock(NULL, spa, os->os_rootbp,
 		    arc_getbuf_func, &os->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 		if (err) {
@@ -628,6 +628,7 @@ struct oscarg {
 	const char *lastname;
 	dmu_objset_type_t type;
 	uint64_t flags;
+	cred_t *cr;
 };
 
 /*ARGSUSED*/
@@ -659,7 +660,7 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	struct oscarg *oa = arg2;
@@ -668,7 +669,7 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dsobj = dsl_dataset_create_sync(dd, oa->lastname,
-	    oa->clone_origin, oa->flags, cr, tx);
+	    oa->clone_origin, oa->flags, oa->cr, tx);
 
 	if (oa->clone_origin == NULL) {
 		dsl_dataset_t *ds;
@@ -684,12 +685,12 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		    ds, bp, oa->type, tx);
 
 		if (oa->userfunc)
-			oa->userfunc(os, oa->userarg, cr, tx);
+			oa->userfunc(os, oa->userarg, oa->cr, tx);
 		dsl_dataset_rele(ds, FTAG);
 	}
 
-	spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
-	    tx, cr, "dataset = %llu", dsobj);
+	spa_history_log_internal(LOG_DS_CREATE, dd->dd_pool->dp_spa,
+	    tx, "dataset = %llu", dsobj);
 }
 
 int
@@ -715,6 +716,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
 	oa.lastname = tail;
 	oa.type = type;
 	oa.flags = flags;
+	oa.cr = CRED();
 
 	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
 	    dmu_objset_create_sync, pdd, &oa, 5);
@@ -742,6 +744,7 @@ dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
 	oa.lastname = tail;
 	oa.clone_origin = clone_origin;
 	oa.flags = flags;
+	oa.cr = CRED();
 
 	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
 	    dmu_objset_create_sync, pdd, &oa, 5);
@@ -795,19 +798,19 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	objset_t *os = arg1;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	struct snaparg *sn = arg2;
 
-	dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+	dsl_dataset_snapshot_sync(ds, sn->snapname, tx);
 
 	if (sn->props) {
 		dsl_props_arg_t pa;
 		pa.pa_props = sn->props;
 		pa.pa_source = ZPROP_SRC_LOCAL;
-		dsl_props_set_sync(ds->ds_prev, &pa, cr, tx);
+		dsl_props_set_sync(ds->ds_prev, &pa, tx);
 	}
 }
 
@@ -1016,11 +1019,11 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 	/*
 	 * Create the root block IO
 	 */
-	arc_release(os->os_phys_buf, &os->os_phys_buf);
-
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+	VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf,
+	    os->os_rootbp, os->os_spa, &zb));
 
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
@@ -1082,7 +1085,7 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg)
 	    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
 }
 
-static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 void
 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
@@ -1649,7 +1652,7 @@ dmu_objset_prefetch(const char *name, void *arg)
 			SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
 			    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
-			(void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
+			(void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds),
 			    &ds->ds_phys->ds_bp, NULL, NULL,
 			    ZIO_PRIORITY_ASYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 86c428b5f2..a675e28b15 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -301,7 +301,7 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
 
 /* ARGSUSED */
 static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct backuparg *ba = arg;
@@ -330,7 +330,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		uint32_t aflags = ARC_WAIT;
 		arc_buf_t *abuf;
 
-		if (arc_read_nolock(NULL, spa, bp,
+		if (dsl_read(NULL, spa, bp, pbuf,
 		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
 			return (EIO);
@@ -361,7 +361,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
 
-		if (arc_read_nolock(NULL, spa, bp,
+		if (dsl_read(NULL, spa, bp, pbuf,
 		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
 			return (EIO);
@@ -504,6 +504,7 @@ struct recvbeginsyncarg {
 	uint64_t dsflags;
 	char clonelastname[MAXNAMELEN];
 	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
+	cred_t *cr;
 };
 
 /* ARGSUSED */
@@ -536,7 +537,7 @@ recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	struct recvbeginsyncarg *rbsa = arg2;
@@ -545,7 +546,7 @@ recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
 	/* Create and open new dataset. */
 	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
-	    rbsa->origin, flags, cr, tx);
+	    rbsa->origin, flags, rbsa->cr, tx);
 	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
 	    B_TRUE, dmu_recv_tag, &rbsa->ds));
 
@@ -554,8 +555,8 @@ recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
 	}
 
-	spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
-	    dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj);
+	spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
+	    dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
 }
 
 /* ARGSUSED */
@@ -630,7 +631,7 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static void
-recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ohds = arg1;
 	struct recvbeginsyncarg *rbsa = arg2;
@@ -641,7 +642,7 @@ recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
 	/* create and open the temporary clone */
 	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
-	    ohds->ds_prev, flags, cr, tx);
+	    ohds->ds_prev, flags, rbsa->cr, tx);
 	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
 
 	/*
@@ -655,8 +656,8 @@ recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
 	rbsa->ds = cds;
 
-	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
-	    dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+	spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
+	    dp->dp_spa, tx, "dataset = %lld", dsobj);
 }
 
 
@@ -701,6 +702,7 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
 	rbsa.type = drrb->drr_type;
 	rbsa.tag = FTAG;
 	rbsa.dsflags = 0;
+	rbsa.cr = CRED();
 	versioninfo = drrb->drr_versioninfo;
 	flags = drrb->drr_flags;
 
@@ -1466,12 +1468,12 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	struct recvendsyncarg *resa = arg2;
 
-	dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
+	dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
 
 	/* set snapshot's creation time and guid */
 	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
index 653c3a2d41..429c76ae11 100644
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -77,7 +76,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
-	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+	(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
 
 	return (0);
 }
@@ -102,7 +101,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
 		    lr->lr_offset / BP_GET_LSIZE(bp));
 
-		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
 		    td->td_arg);
 	}
 	return (0);
@@ -140,7 +139,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
 
 	if (bp->blk_birth == 0) {
-		err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
+		err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
+		    td->td_arg);
 		return (err);
 	}
 
@@ -160,7 +160,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 	}
 
 	if (td->td_flags & TRAVERSE_PRE) {
-		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+		    td->td_arg);
 		if (err)
 			return (err);
 	}
@@ -171,7 +172,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
-		err = arc_read(NULL, td->td_spa, bp, pbuf,
+		err = dsl_read(NULL, td->td_spa, bp, pbuf,
 		    arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err)
@@ -195,7 +196,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
-		err = arc_read(NULL, td->td_spa, bp, pbuf,
+		err = dsl_read(NULL, td->td_spa, bp, pbuf,
 		    arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err)
@@ -217,7 +218,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 		objset_phys_t *osp;
 		dnode_phys_t *dnp;
 
-		err = arc_read_nolock(NULL, td->td_spa, bp,
+		err = dsl_read_nolock(NULL, td->td_spa, bp,
 		    arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err)
@@ -252,8 +253,10 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 	if (buf)
 		(void) arc_buf_remove_ref(buf, &buf);
 
-	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST))
-		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
+		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+		    td->td_arg);
+	}
 
 	return (err != 0 ? err : lasterr);
 }
@@ -275,16 +278,17 @@ traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
 				break;
 			lasterr = err;
 		}
-		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
-			SET_BOOKMARK(&czb, objset,
-			    object, 0, DMU_SPILL_BLKID);
-			err = traverse_visitbp(td, dnp, buf,
-			    (blkptr_t *)&dnp->dn_spill, &czb);
-			if (err) {
-				if (!hard)
-					break;
-				lasterr = err;
-			}
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		SET_BOOKMARK(&czb, objset,
+		    object, 0, DMU_SPILL_BLKID);
+		err = traverse_visitbp(td, dnp, buf,
+		    (blkptr_t *)&dnp->dn_spill, &czb);
+		if (err) {
+			if (!hard)
+				return (err);
+			lasterr = err;
 		}
 	}
 	return (err != 0 ? err : lasterr);
@@ -293,7 +297,8 @@ traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
 /* ARGSUSED */
 static int
 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
+    void *arg)
 {
 	struct prefetch_data *pfd = arg;
 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
@@ -314,7 +319,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	cv_broadcast(&pfd->pd_cv);
 	mutex_exit(&pfd->pd_mtx);
 
-	(void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+	(void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 	    &aflags, zb);
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index 523aad70da..fa5747c7aa 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -227,7 +227,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
 	if (db->db_state != DB_CACHED)
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 
-	arc_release(db->db_buf, db);
+	dbuf_release_bp(db);
 	bp = (blkptr_t *)db->db.db_data;
 
 	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index ee0ccd7d01..c645d4d785 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -38,6 +38,7 @@
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>
 #include <sys/zvol.h>
+#include <sys/dsl_scan.h>
 
 static char *dsl_reaper = "the grim reaper";
 
@@ -80,7 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 	int uncompressed = BP_GET_UCSIZE(bp);
 	int64_t delta;
 
-	dprintf_bp(bp, "born, ds=%p\n", ds);
+	dprintf_bp(bp, "ds=%p", ds);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
@@ -100,6 +101,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 		return;
 	}
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	delta = parent_delta(ds, used);
@@ -150,7 +152,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 		int64_t delta;
 
-		dprintf_bp(bp, "freeing: %s", "");
+		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
 		mutex_enter(&ds->ds_dir->dd_lock);
@@ -191,7 +193,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 			ds->ds_prev->ds_phys->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
-		if (bp->blk_birth > ds->ds_origin_txg) {
+		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
@@ -397,19 +399,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 				    ds->ds_phys->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
-
-			if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
-				dsl_dataset_t *origin;
-
-				err = dsl_dataset_hold_obj(dp,
-				    ds->ds_dir->dd_phys->dd_origin_obj,
-				    FTAG, &origin);
-				if (err == 0) {
-					ds->ds_origin_txg =
-					    origin->ds_phys->ds_creation_txg;
-					dsl_dataset_rele(origin, FTAG);
-				}
-			}
 		} else {
 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 				err = dsl_dataset_get_snapname(ds);
@@ -876,10 +865,6 @@ dsl_snapshot_destroy_one(const char *name, void *arg)
 		struct dsl_ds_destroyarg *dsda;
 
 		dsl_dataset_make_exclusive(ds, da->dstg);
-		if (ds->ds_objset != NULL) {
-			dmu_objset_evict(ds->ds_objset);
-			ds->ds_objset = NULL;
-		}
 		dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
 		dsda->ds = ds;
 		dsda->defer = da->defer;
@@ -989,11 +974,6 @@ dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
 			return (error);
 		dsda->rm_origin = origin;
 		dsl_dataset_make_exclusive(origin, tag);
-
-		if (origin->ds_objset != NULL) {
-			dmu_objset_evict(origin->ds_objset);
-			origin->ds_objset = NULL;
-		}
 	}
 
 	return (0);
@@ -1020,10 +1000,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
 		/* Destroying a snapshot is simpler */
 		dsl_dataset_make_exclusive(ds, tag);
 
-		if (ds->ds_objset != NULL) {
-			dmu_objset_evict(ds->ds_objset);
-			ds->ds_objset = NULL;
-		}
 		dsda.defer = defer;
 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
@@ -1096,24 +1072,10 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
 	if (err)
 		goto out;
 
-	if (ds->ds_objset) {
-		/*
-		 * We need to sync out all in-flight IO before we try
-		 * to evict (the dataset evict func is trying to clear
-		 * the cached entries for this dataset in the ARC).
-		 */
-		txg_wait_synced(dd->dd_pool, 0);
-	}
-
 	/*
 	 * Blow away the dsl_dir + head dataset.
 	 */
 	dsl_dataset_make_exclusive(ds, tag);
-	if (ds->ds_objset) {
-		dmu_objset_evict(ds->ds_objset);
-		ds->ds_objset = NULL;
-	}
-
 	/*
 	 * If we're removing a clone, we might also need to remove its
 	 * origin.
@@ -1220,7 +1182,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 	uint64_t mrs_used;
 	uint64_t dlused, dlcomp, dluncomp;
 
-	ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+	ASSERT(!dsl_dataset_is_snapshot(ds));
 
 	if (ds->ds_phys->ds_prev_snap_obj != 0)
 		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
@@ -1234,21 +1196,11 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 	ds->ds_phys->ds_unique_bytes =
 	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
 
-	if (!DS_UNIQUE_IS_ACCURATE(ds) &&
-	    spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
 	    SPA_VERSION_UNIQUE_ACCURATE)
 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
 
-static uint64_t
-dsl_dataset_unique(dsl_dataset_t *ds)
-{
-	if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
-		dsl_dataset_recalc_head_uniq(ds);
-
-	return (ds->ds_phys->ds_unique_bytes);
-}
-
 struct killarg {
 	dsl_dataset_t *ds;
 	dmu_tx_t *tx;
@@ -1256,7 +1208,7 @@ struct killarg {
 
 /* ARGSUSED */
 static int
-kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct killarg *ka = arg;
@@ -1315,7 +1267,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
@@ -1324,8 +1276,8 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
-	spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
-	    cr, "dataset = %llu", ds->ds_object);
+	spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
+	    "dataset = %llu", ds->ds_object);
 }
 
 static int
@@ -1499,7 +1451,7 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
 }
 
 void
-dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	struct dsl_ds_destroyarg *dsda = arg1;
 	dsl_dataset_t *ds = dsda->ds;
@@ -1531,6 +1483,11 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 	cv_broadcast(&ds->ds_exclusive_cv);
 	mutex_exit(&ds->ds_lock);
 
+	if (ds->ds_objset) {
+		dmu_objset_evict(ds->ds_objset);
+		ds->ds_objset = NULL;
+	}
+
 	/* Remove our reservation */
 	if (ds->ds_reserved != 0) {
 		dsl_prop_setarg_t psa;
@@ -1541,13 +1498,13 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 		    &value);
 		psa.psa_effective_value = 0;	/* predict default value */
 
-		dsl_dataset_set_reservation_sync(ds, &psa, cr, tx);
+		dsl_dataset_set_reservation_sync(ds, &psa, tx);
 		ASSERT3U(ds->ds_reserved, ==, 0);
 	}
 
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
-	dsl_pool_ds_destroyed(ds, tx);
+	dsl_scan_ds_destroyed(ds, tx);
 
 	obj = ds->ds_object;
 
@@ -1596,7 +1553,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 		}
 	}
 
-	if (ds->ds_phys->ds_next_snap_obj != 0) {
+	if (dsl_dataset_is_snapshot(ds)) {
 		blkptr_t bp;
 		zio_t *pio;
 		dsl_dataset_t *ds_next;
@@ -1608,7 +1565,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
 
-		old_unique = dsl_dataset_unique(ds_next);
+		old_unique = ds_next->ds_phys->ds_unique_bytes;
 
 		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
 		ds_next->ds_phys->ds_prev_snap_obj =
@@ -1664,7 +1621,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 		    ds_next->ds_phys->ds_deadlist_obj));
 		ds->ds_phys->ds_deadlist_obj = 0;
 
-		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
+		if (dsl_dataset_is_snapshot(ds_next)) {
 			/*
 			 * Update next's unique to include blocks which
 			 * were previously shared by only this snapshot
@@ -1790,8 +1747,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 		dsl_dataset_rele(ds_prev, FTAG);
 
 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
-	spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
-	    cr, "dataset = %llu", ds->ds_object);
+	spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
+	    "dataset = %llu", ds->ds_object);
 
 	if (ds->ds_phys->ds_next_clones_obj != 0) {
 		uint64_t count;
@@ -1816,7 +1773,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 		struct dsl_ds_destroyarg ndsda = {0};
 
 		ndsda.ds = dsda->rm_origin;
-		dsl_dataset_destroy_sync(&ndsda, tag, cr, tx);
+		dsl_dataset_destroy_sync(&ndsda, tag, tx);
 	}
 }
 
@@ -1833,7 +1790,8 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 	 * owned by the snapshot dataset must be accommodated by space
 	 * outside of the reservation.
 	 */
-	asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+	asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
 	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
 		return (ENOSPC);
 
@@ -1847,7 +1805,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 	return (0);
 }
 
-/* ARGSUSED */
 int
 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@@ -1888,7 +1845,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	const char *snapname = arg2;
@@ -1959,9 +1916,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	 * since our unique space is going to zero.
 	 */
 	if (ds->ds_reserved) {
-		int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+		int64_t delta;
+		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+		delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
-		    add, 0, 0, tx);
+		    delta, 0, 0, tx);
 	}
 
 	bplist_close(&ds->ds_deadlist);
@@ -1987,11 +1946,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	VERIFY(0 == dsl_dataset_get_ref(dp,
 	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 
-	dsl_pool_ds_snapshotted(ds, tx);
+	dsl_scan_ds_snapshotted(ds, tx);
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
-	spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
+	spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx,
 	    "dataset = %llu", dsobj);
 }
 
@@ -2035,7 +1994,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
 	    ds->ds_phys->ds_guid);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
-	    dsl_dataset_unique(ds));
+	    ds->ds_phys->ds_unique_bytes);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
 	    ds->ds_object);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
@@ -2163,8 +2122,7 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
-    cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	const char *newsnapname = arg2;
@@ -2188,8 +2146,8 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx);
 	ASSERT3U(err, ==, 0);
 
-	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
-	    cr, "dataset = %llu", ds->ds_object);
+	spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
+	    "dataset = %llu", ds->ds_object);
 	dsl_dataset_rele(hds, FTAG);
 }
 
@@ -2371,14 +2329,14 @@ struct promotenode {
 
 struct promotearg {
 	list_t shared_snaps, origin_snaps, clone_snaps;
-	dsl_dataset_t *origin_origin, *origin_head;
+	dsl_dataset_t *origin_origin;
 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
 	char *err_ds;
 };
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static boolean_t snaplist_unstable(list_t *l);
 
-/* ARGSUSED */
 static int
 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@@ -2479,19 +2437,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 		/*
 		 * Note, typically this will not be a clone of a clone,
-		 * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
+		 * so dd_origin_txg will be < TXG_INITIAL, so
 		 * these snaplist_space() -> bplist_space_birthrange()
 		 * calls will be fast because they do not have to
 		 * iterate over all bps.
 		 */
 		snap = list_head(&pa->origin_snaps);
 		err = snaplist_space(&pa->shared_snaps,
-		    snap->ds->ds_origin_txg, &pa->cloneusedsnap);
+		    snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
 		if (err)
 			return (err);
 
 		err = snaplist_space(&pa->clone_snaps,
-		    snap->ds->ds_origin_txg, &space);
+		    snap->ds->ds_dir->dd_origin_txg, &space);
 		if (err)
 			return (err);
 		pa->cloneusedsnap += space;
@@ -2510,7 +2468,7 @@ out:
 }
 
 static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
@@ -2554,10 +2512,11 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
 	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
-	hds->ds_origin_txg = origin_head->ds_origin_txg;
+	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
 	dmu_buf_will_dirty(odd->dd_dbuf, tx);
 	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
-	origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;
+	origin_head->ds_dir->dd_origin_txg =
+	    origin_ds->ds_phys->ds_creation_txg;
 
 	/* move snapshots to this dir */
 	for (snap = list_head(&pa->shared_snaps); snap;
@@ -2614,8 +2573,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	origin_ds->ds_phys->ds_unique_bytes = pa->unique;
 
 	/* log history record */
-	spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
-	    cr, "dataset = %llu", hds->ds_object);
+	spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
+	    "dataset = %llu", hds->ds_object);
 
 	dsl_dir_close(odd, FTAG);
 }
@@ -2862,7 +2821,7 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 /* ARGSUSED */
 static void
-dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	struct cloneswaparg *csa = arg1;
 	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
@@ -2937,9 +2896,9 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		 * changing that affects the snapused).
 		 */
 		VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
-		    csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
+		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used));
 		VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
-		    csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
+		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used));
 		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
 		    DD_USED_HEAD, DD_USED_SNAP, tx);
 	}
@@ -2975,7 +2934,7 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
 	    csa->ohds->ds_phys->ds_deadlist_obj));
 
-	dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
+	dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
 }
 
 /*
@@ -3110,24 +3069,24 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	return (0);
 }
 
-extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *);
+extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
 
 void
-dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_prop_setarg_t *psa = arg2;
 	uint64_t effective_value = psa->psa_effective_value;
 
-	dsl_prop_set_sync(ds, psa, cr, tx);
+	dsl_prop_set_sync(ds, psa, tx);
 	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
 
 	if (ds->ds_quota != effective_value) {
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ds->ds_quota = effective_value;
 
-		spa_history_internal_log(LOG_DS_REFQUOTA,
-		    ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ",
+		spa_history_log_internal(LOG_DS_REFQUOTA,
+		    ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ",
 		    (longlong_t)ds->ds_quota, ds->ds_object);
 	}
 }
@@ -3188,7 +3147,9 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 		return (0);
 
 	mutex_enter(&ds->ds_lock);
-	unique = dsl_dataset_unique(ds);
+	if (!DS_UNIQUE_IS_ACCURATE(ds))
+		dsl_dataset_recalc_head_uniq(ds);
+	unique = ds->ds_phys->ds_unique_bytes;
 	mutex_exit(&ds->ds_lock);
 
 	if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
@@ -3205,10 +3166,8 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	return (0);
 }
 
-/* ARGSUSED */
 static void
-dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
-    dmu_tx_t *tx)
+dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_prop_setarg_t *psa = arg2;
@@ -3216,14 +3175,15 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
 	uint64_t unique;
 	int64_t delta;
 
-	dsl_prop_set_sync(ds, psa, cr, tx);
+	dsl_prop_set_sync(ds, psa, tx);
 	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
-	unique = dsl_dataset_unique(ds);
+	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+	unique = ds->ds_phys->ds_unique_bytes;
 	delta = MAX(0, (int64_t)(effective_value - unique)) -
 	    MAX(0, (int64_t)(ds->ds_reserved - unique));
 	ds->ds_reserved = effective_value;
@@ -3232,8 +3192,8 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
 	mutex_exit(&ds->ds_dir->dd_lock);
 
-	spa_history_internal_log(LOG_DS_REFRESERV,
-	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
+	spa_history_log_internal(LOG_DS_REFRESERV,
+	    ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu",
 	    (longlong_t)effective_value, ds->ds_object);
 }
 
@@ -3311,7 +3271,7 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	struct dsl_ds_holdarg *ha = arg2;
@@ -3343,8 +3303,8 @@ dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		    htag, &now, tx));
 	}
 
-	spa_history_internal_log(LOG_DS_USER_HOLD,
-	    dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag,
+	spa_history_log_internal(LOG_DS_USER_HOLD,
+	    dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag,
 	    (int)ha->temphold, ds->ds_object);
 }
 
@@ -3495,10 +3455,6 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
 			 */
 			if (!ra->own)
 				return (EBUSY);
-			if (ds->ds_objset) {
-				dmu_objset_evict(ds->ds_objset);
-				ds->ds_objset = NULL;
-			}
 		}
 		dsda.ds = ds;
 		dsda.releasing = B_TRUE;
@@ -3509,7 +3465,7 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
 }
 
 static void
-dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	struct dsl_ds_releasearg *ra = arg1;
 	dsl_dataset_t *ds = ra->ds;
@@ -3520,6 +3476,11 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 	uint64_t refs;
 	int error;
 
+	if (ds->ds_objset) {
+		dmu_objset_evict(ds->ds_objset);
+		ds->ds_objset = NULL;
+	}
+
 	mutex_enter(&ds->ds_lock);
 	ds->ds_userrefs--;
 	refs = ds->ds_userrefs;
@@ -3536,11 +3497,11 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 		dsda.ds = ds;
 		dsda.releasing = B_TRUE;
 		/* We already did the destroy_check */
-		dsl_dataset_destroy_sync(&dsda, tag, cr, tx);
+		dsl_dataset_destroy_sync(&dsda, tag, tx);
 	}
 
-	spa_history_internal_log(LOG_DS_USER_RELEASE,
-	    dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu",
+	spa_history_log_internal(LOG_DS_USER_RELEASE,
+	    dp->dp_spa, tx, "<%s> %lld dataset = %llu",
 	    ra->htag, (longlong_t)refs, dsobj);
 }
 
diff --git a/usr/src/uts/common/fs/zfs/dsl_deleg.c b/usr/src/uts/common/fs/zfs/dsl_deleg.c
index 04053fdf20..85490c8d5f 100644
--- a/usr/src/uts/common/fs/zfs/dsl_deleg.c
+++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -148,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
 }
 
 static void
-dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	nvlist_t *nvp = arg2;
@@ -183,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
 			VERIFY(zap_update(mos, jumpobj,
 			    perm, 8, 1, &n, tx) == 0);
-			spa_history_internal_log(LOG_DS_PERM_UPDATE,
-			    dd->dd_pool->dp_spa, tx, cr,
+			spa_history_log_internal(LOG_DS_PERM_UPDATE,
+			    dd->dd_pool->dp_spa, tx,
 			    "%s %s dataset = %llu", whokey, perm,
 			    dd->dd_phys->dd_head_dataset_obj);
 		}
@@ -192,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 }
 
 static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	nvlist_t *nvp = arg2;
@@ -215,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 				(void) zap_remove(mos, zapobj, whokey, tx);
 				VERIFY(0 == zap_destroy(mos, jumpobj, tx));
 			}
-			spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
-			    dd->dd_pool->dp_spa, tx, cr,
+			spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE,
+			    dd->dd_pool->dp_spa, tx,
 			    "%s dataset = %llu", whokey,
 			    dd->dd_phys->dd_head_dataset_obj);
 			continue;
@@ -236,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 				VERIFY(0 == zap_destroy(mos,
 				    jumpobj, tx));
 			}
-			spa_history_internal_log(LOG_DS_PERM_REMOVE,
-			    dd->dd_pool->dp_spa, tx, cr,
+			spa_history_log_internal(LOG_DS_PERM_REMOVE,
+			    dd->dd_pool->dp_spa, tx,
 			    "%s %s dataset = %llu", whokey, perm,
 			    dd->dd_phys->dd_head_dataset_obj);
 		}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 0dfb05da2d..ac86da6590 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -40,8 +39,7 @@
 #include "zfs_namecheck.h"
 
 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
-    cred_t *cr, dmu_tx_t *tx);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
 
 
 /* ARGSUSED */
@@ -64,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
 	spa_close(dd->dd_pool->dp_spa, dd);
 
 	/*
-	 * The props callback list should be empty since they hold the
-	 * dir open.
+	 * The props callback list should have been cleaned up by
+	 * objset_evict().
 	 */
 	list_destroy(&dd->dd_prop_cbs);
 	mutex_destroy(&dd->dd_lock);
@@ -136,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
 			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
 		}
 
+		if (dsl_dir_is_clone(dd)) {
+			dmu_buf_t *origin_bonus;
+			dsl_dataset_phys_t *origin_phys;
+
+			/*
+			 * We can't open the origin dataset, because
+			 * that would require opening this dsl_dir.
+			 * Just look at its phys directly instead.
+			 */
+			err = dmu_bonus_hold(dp->dp_meta_objset,
+			    dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+			if (err)
+				goto errout;
+			origin_phys = origin_bonus->db_data;
+			dd->dd_origin_txg =
+			    origin_phys->ds_creation_txg;
+			dmu_buf_rele(origin_bonus, FTAG);
+		}
+
 		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
 		    dsl_dir_evict);
 		if (winner) {
@@ -458,7 +475,7 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 void
-dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_dir_t *dd = ds->ds_dir;
@@ -477,7 +494,7 @@ dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 	    &value);
 	psa.psa_effective_value = 0;	/* predict default value */
 
-	dsl_dir_set_reservation_sync(ds, &psa, cr, tx);
+	dsl_dir_set_reservation_sync(ds, &psa, tx);
 
 	ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
 	ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
@@ -652,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
 	if (used > quota) {
 		/* over quota */
 		myspace = 0;
-
-		/*
-		 * While it's OK to be a little over quota, if
-		 * we think we are using more space than there
-		 * is in the pool (which is already 1.6% more than
-		 * dsl_pool_adjustedsize()), something is very
-		 * wrong.
-		 */
-		ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa));
 	} else {
 		/*
 		 * the lesser of the space provided by our parent and
@@ -1033,18 +1041,17 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	return (err);
 }
 
-extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *);
+extern dsl_syncfunc_t dsl_prop_set_sync;
 
-/* ARGSUSED */
 static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_prop_setarg_t *psa = arg2;
 	uint64_t effective_value = psa->psa_effective_value;
 
-	dsl_prop_set_sync(ds, psa, cr, tx);
+	dsl_prop_set_sync(ds, psa, tx);
 	DSL_PROP_CHECK_PREDICTION(dd, psa);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -1053,8 +1060,8 @@ dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	dd->dd_phys->dd_quota = effective_value;
 	mutex_exit(&dd->dd_lock);
 
-	spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
-	    tx, cr, "%lld dataset = %llu ",
+	spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+	    tx, "%lld dataset = %llu ",
 	    (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
 }
 
@@ -1141,9 +1148,8 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	return (0);
 }
 
-/* ARGSUSED */
 static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_dir_t *dd = ds->ds_dir;
@@ -1152,7 +1158,7 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	uint64_t used;
 	int64_t delta;
 
-	dsl_prop_set_sync(ds, psa, cr, tx);
+	dsl_prop_set_sync(ds, psa, tx);
 	DSL_PROP_CHECK_PREDICTION(dd, psa);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -1170,8 +1176,8 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	}
 	mutex_exit(&dd->dd_lock);
 
-	spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
-	    tx, cr, "%lld dataset = %llu",
+	spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+	    tx, "%lld dataset = %llu",
 	    (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
 }
 
@@ -1240,7 +1246,6 @@ struct renamearg {
 	const char *mynewname;
 };
 
-/*ARGSUSED*/
 static int
 dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@@ -1287,7 +1292,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }
 
 static void
-dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	struct renamearg *ra = arg2;
@@ -1336,8 +1341,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
 	ASSERT3U(err, ==, 0);
 
-	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
-	    tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
+	spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+	    tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
 }
 
 int
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 30a5611365..77aa4af4e3 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -19,14 +19,16 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
@@ -50,7 +52,7 @@ kmutex_t zfs_write_limit_lock;
 
 static pgcnt_t old_physmem = 0;
 
-static int
+int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
 	uint64_t obj;
@@ -88,7 +90,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 	    1, 4, 0);
@@ -150,64 +151,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 	if (err)
 		goto out;
 
-	/* get scrub status */
-	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
-	    &dp->dp_scrub_func);
-	if (err == 0) {
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_queue_obj);
-		if (err)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_min_txg);
-		if (err)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_max_txg);
-		if (err)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
-		    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
-		    &dp->dp_scrub_bookmark);
-		if (err)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
-		    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
-		    &dp->dp_scrub_ddt_bookmark);
-		if (err && err != ENOENT)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_ddt_class_max);
-		if (err && err != ENOENT)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-		    &spa->spa_scrub_errors);
-		if (err)
-			goto out;
-		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
-			/*
-			 * A new-type scrub was in progress on an old
-			 * pool.  Restart from the beginning, since the
-			 * old software may have changed the pool in the
-			 * meantime.
-			 */
-			dsl_pool_scrub_restart(dp);
-		}
-	} else {
-		/*
-		 * It's OK if there is no scrub in progress (and if
-		 * there was an I/O error, ignore it).
-		 */
-		err = 0;
-	}
+	err = dsl_scan_init(dp, txg);
 
 out:
 	rw_exit(&dp->dp_config_rwlock);
@@ -247,9 +191,9 @@ dsl_pool_close(dsl_pool_t *dp)
 
 	arc_flush(dp->dp_spa);
 	txg_fini(dp);
+	dsl_scan_fini(dp);
 	rw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
-	mutex_destroy(&dp->dp_scrub_cancel_lock);
 	taskq_destroy(dp->dp_vnrele_taskq);
 	if (dp->dp_blkstats)
 		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
@@ -275,6 +219,9 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 	ASSERT3U(err, ==, 0);
 
+	/* Initialize scan structures */
+	VERIFY3U(0, ==, dsl_scan_init(dp, txg));
+
 	/* create and open the root dir */
 	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
@@ -318,6 +265,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	uint64_t data_written;
 	int err;
 
+	/*
+	 * We need to copy dp_space_towrite() before doing
+	 * dsl_sync_task_group_sync(), because
+	 * dsl_dataset_snapshot_reserve_space() will increase
+	 * dp_space_towrite but not actually write anything.
+	 */
+	data_written = dp->dp_space_towrite[txg & TXG_MASK];
+
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	dp->dp_read_overhead = 0;
@@ -347,7 +302,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
 	/*
 	 * Sync the datasets again to push out the changes due to
-	 * userquota updates.  This must be done before we process the
+	 * userspace updates.  This must be done before we process the
 	 * sync tasks, because that could cause a snapshot of a dataset
 	 * whose ds_bp will be rewritten when we do this 2nd sync.
 	 */
@@ -383,13 +338,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 		dsl_dir_sync(dd, tx);
 	write_time += gethrtime() - start;
 
-	if (spa_sync_pass(dp->dp_spa) == 1) {
-		dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL,
-		    NULL, ZIO_FLAG_CANFAIL);
-		dsl_pool_scrub_sync(dp, tx);
-		(void) zio_wait(dp->dp_scrub_prefetch_zio_root);
-	}
-
 	start = gethrtime();
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
@@ -407,7 +355,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
 	dmu_tx_commit(tx);
 
-	data_written = dp->dp_space_towrite[txg & TXG_MASK];
 	dp->dp_space_towrite[txg & TXG_MASK] = 0;
 	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
 
@@ -679,7 +626,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 	    NULL, 0, kcred, tx);
 	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
 	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 	    dp, &dp->dp_origin_snap));
 	dsl_dataset_rele(ds, FTAG);
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
index f27305c953..cedd777687 100644
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -260,11 +259,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
 
 	cbr->cbr_func(cbr->cbr_arg, value);
 
-	VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
-	    NULL, cbr, &dd));
 	if (need_rwlock)
 		rw_exit(&dp->dp_config_rwlock);
-	/* Leave dir open until this callback is unregistered */
 	return (0);
 }
 
@@ -464,8 +460,6 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
 	kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
 	kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
 
-	/* Clean up from dsl_prop_register */
-	dsl_dir_close(dd, cbr);
 	return (0);
 }
 
@@ -552,7 +546,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
 }
 
 void
-dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_prop_setarg_t *psa = arg2;
@@ -707,9 +701,9 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		}
 	}
 
-	spa_history_internal_log((source == ZPROP_SRC_NONE ||
+	spa_history_log_internal((source == ZPROP_SRC_NONE ||
 	    source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT :
-	    LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr,
+	    LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx,
 	    "%s=%s dataset = %llu", propname,
 	    (valstr == NULL ? "" : valstr), ds->ds_object);
 
@@ -718,7 +712,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 }
 
 void
-dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dsl_props_arg_t *pa = arg2;
@@ -756,13 +750,13 @@ dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 			psa.psa_numints = 1;
 			psa.psa_value = &intval;
 		}
-		dsl_prop_set_sync(ds, &psa, cr, tx);
+		dsl_prop_set_sync(ds, &psa, tx);
 	}
 }
 
 void
 dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
-    cred_t *cr, dmu_tx_t *tx)
+    dmu_tx_t *tx)
 {
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
@@ -773,7 +767,7 @@ dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
 
 	dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);
 
-	spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+	spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx,
 	    "%s=%llu dataset = %llu", name, (u_longlong_t)val,
 	    dd->dd_phys->dd_head_dataset_obj);
 }
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
new file mode 100644
index 0000000000..f3b401d602
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -0,0 +1,1660 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+static scan_cb_t dsl_scan_defrag_cb;
+static scan_cb_t dsl_scan_scrub_cb;
+static scan_cb_t dsl_scan_remove_cb;
+static dsl_syncfunc_t dsl_scan_cancel_sync;
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+
+int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+
+#define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+extern int zfs_txg_timeout;
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+	NULL,
+	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
+	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
+};
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+	int err;
+	dsl_scan_t *scn;
+	spa_t *spa = dp->dp_spa;
+	uint64_t f;
+
+	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+	scn->scn_dp = dp;
+
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    "scrub_func", sizeof (uint64_t), 1, &f);
+	if (err == 0) {
+		/*
+		 * There was an old-style scrub in progress.  Restart a
+		 * new-style scrub from the beginning.
+		 */
+		scn->scn_restart_txg = txg;
+		zfs_dbgmsg("old-style scrub was in progress; "
+		    "restarting new-style scrub in txg %llu",
+		    scn->scn_restart_txg);
+
+		/*
+		 * Load the queue obj from the old location so that it
+		 * can be freed by dsl_scan_done().
+		 */
+		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    "scrub_queue", sizeof (uint64_t), 1,
+		    &scn->scn_phys.scn_queue_obj);
+	} else {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+		    &scn->scn_phys);
+		if (err == ENOENT)
+			return (0);
+		else if (err)
+			return (err);
+
+		if (scn->scn_phys.scn_state == DSS_SCANNING &&
+		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+			/*
+			 * A new-type scrub was in progress on an old
+			 * pool, and the pool was accessed by old
+			 * software.  Restart from the beginning, since
+			 * the old software may have changed the pool in
+			 * the meantime.
+			 */
+			scn->scn_restart_txg = txg;
+			zfs_dbgmsg("new-style scrub was modified "
+			    "by old software; restarting in txg %llu",
+			    scn->scn_restart_txg);
+		}
+	}
+
+	spa_scan_stat_init(spa);
+	return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+	if (dp->dp_scan) {
+		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+		dp->dp_scan = NULL;
+	}
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg1;
+
+	if (scn->scn_phys.scn_state == DSS_SCANNING)
+		return (EBUSY);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg1;
+	pool_scan_func_t *funcp = arg2;
+	dmu_object_type_t ot = 0;
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+
+	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+	scn->scn_phys.scn_func = *funcp;
+	scn->scn_phys.scn_state = DSS_SCANNING;
+	scn->scn_phys.scn_min_txg = 0;
+	scn->scn_phys.scn_max_txg = tx->tx_txg;
+	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+	scn->scn_phys.scn_start_time = gethrestime_sec();
+	scn->scn_phys.scn_errors = 0;
+	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+	scn->scn_restart_txg = 0;
+	spa_scan_stat_init(spa);
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+		/* rewrite all disk labels */
+		vdev_config_dirty(spa->spa_root_vdev);
+
+		if (vdev_resilver_needed(spa->spa_root_vdev,
+		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+		} else {
+			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+		}
+
+		spa->spa_scrub_started = B_TRUE;
+		/*
+		 * If this is an incremental scrub, limit the DDT scrub phase
+		 * to just the auto-ditto class (for correctness); the rest
+		 * of the scrub should go faster using top-down pruning.
+		 */
+		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+	}
+
+	/* back to the generic stuff */
+
+	if (dp->dp_blkstats == NULL) {
+		dp->dp_blkstats =
+		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+	}
+	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
+	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+		ot = DMU_OT_ZAP_OTHER;
+
+	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+	dsl_scan_sync_state(scn, tx);
+
+	spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
+	    "func=%u mintxg=%llu maxtxg=%llu",
+	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+	static const char *old_names[] = {
+		"scrub_bookmark",
+		"scrub_ddt_bookmark",
+		"scrub_ddt_class_max",
+		"scrub_queue",
+		"scrub_min_txg",
+		"scrub_max_txg",
+		"scrub_func",
+		"scrub_errors",
+		NULL
+	};
+
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+	int i;
+
+	/* Remove any remnants of an old-style scrub. */
+	for (i = 0; old_names[i]; i++) {
+		(void) zap_remove(dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+	}
+
+	if (scn->scn_phys.scn_queue_obj != 0) {
+		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, tx));
+		scn->scn_phys.scn_queue_obj = 0;
+	}
+
+	/*
+	 * If we were "restarted" from a stopped state, don't bother
+	 * with anything else.
+	 */
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (complete)
+		scn->scn_phys.scn_state = DSS_FINISHED;
+	else
+		scn->scn_phys.scn_state = DSS_CANCELED;
+
+	spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
+	    "complete=%u", complete);
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > 0) {
+			cv_wait(&spa->spa_scrub_io_cv,
+			    &spa->spa_scrub_lock);
+		}
+		mutex_exit(&spa->spa_scrub_lock);
+		spa->spa_scrub_started = B_FALSE;
+		spa->spa_scrub_active = B_FALSE;
+
+		/*
+		 * If the scrub/resilver completed, update all DTLs to
+		 * reflect this.  Whether it succeeded or not, vacate
+		 * all temporary scrub DTLs.
+		 */
+		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+		if (complete) {
+			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+		}
+		spa_errlog_rotate(spa);
+
+		/*
+		 * We may have finished replacing a device.
+		 * Let the async thread assess this and handle the detach.
+		 */
+		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+	}
+
+	scn->scn_phys.scn_end_time = gethrestime_sec();
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg1;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return (ENOENT);
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg1;
+
+	dsl_scan_done(scn, B_FALSE, tx);
+	dsl_scan_sync_state(scn, tx);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+	boolean_t complete = B_FALSE;
+	int err;
+
+	err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
+	    dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
+	return (err);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp,
+    const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
+    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+    dmu_tx_t *tx);
+static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+    dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+	zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+	ASSERT(dsl_pool_sync_context(dp));
+	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+}
+
+int
+dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb)
+{
+	return (arc_read(pio, spa, bpp, pbuf, done, private,
+	    priority, zio_flags, arc_flags, zb));
+}
+
+int
+dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb)
+{
+	return (arc_read_nolock(pio, spa, bpp, done, private,
+	    priority, zio_flags, arc_flags, zb));
+}
+
+static boolean_t
+bookmark_is_zero(const zbookmark_t *zb)
+{
+	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+	    zb->zb_level == 0 && zb->zb_blkid == 0);
+}
+
+/* dnp is the dnode for zb1->zb_object */
+static boolean_t
+bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+    const zbookmark_t *zb2)
+{
+	uint64_t zb1nextL0, zb2thisobj;
+
+	ASSERT(zb1->zb_objset == zb2->zb_objset);
+	ASSERT(zb2->zb_level == 0);
+
+	/*
+	 * A bookmark in the deadlist is considered to be after
+	 * everything else.
+	 */
+	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+		return (B_TRUE);
+
+	/* The objset_phys_t isn't before anything. */
+	if (dnp == NULL)
+		return (B_FALSE);
+
+	zb1nextL0 = (zb1->zb_blkid + 1) <<
+	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+	zb2thisobj = zb2->zb_object ? zb2->zb_object :
+	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+		uint64_t nextobj = zb1nextL0 *
+		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+		return (nextobj <= zb2thisobj);
+	}
+
+	if (zb1->zb_object < zb2thisobj)
+		return (B_TRUE);
+	if (zb1->zb_object > zb2thisobj)
+		return (B_FALSE);
+	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+		return (B_FALSE);
+	return (zb1nextL0 <= zb2->zb_blkid);
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+	if (dsl_dataset_is_snapshot(ds))
+		return (MIN(smt, ds->ds_phys->ds_creation_txg));
+	return (smt);
+}
+
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+	    &scn->scn_phys, tx));
+}
+
+static boolean_t
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+{
+	uint64_t elapsed_nanosecs;
+	int mintime;
+
+	/* we never skip user/group accounting objects */
+	if (zb && (int64_t)zb->zb_object < 0)
+		return (B_FALSE);
+
+	if (scn->scn_pausing)
+		return (B_TRUE); /* we're already pausing */
+
+	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+		return (B_FALSE); /* we're resuming */
+
+	/* We only know how to resume from level-0 blocks. */
+	if (zb && zb->zb_level != 0)
+		return (B_FALSE);
+
+	mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+	    (elapsed_nanosecs / MICROSEC > mintime &&
+	    txg_sync_waiting(scn->scn_dp)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa)) {
+		if (zb) {
+			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			scn->scn_phys.scn_bookmark = *zb;
+		}
+		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+		scn->scn_pausing = B_TRUE;
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+typedef struct zil_scan_arg {
+	dsl_pool_t	*zsa_dp;
+	zil_header_t	*zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	zil_scan_arg_t *zsa = arg;
+	dsl_pool_t *dp = zsa->zsa_dp;
+	dsl_scan_t *scn = dp->dp_scan;
+	zil_header_t *zh = zsa->zsa_zh;
+	zbookmark_t zb;
+
+	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		return (0);
+
+	/*
+	 * One block ("stubby") can be allocated a long time ago; we
+	 * want to visit that one because it has been allocated
+	 * (on-disk) even if it hasn't been claimed (even though for
+	 * scrub there's nothing to do to it).
+	 */
+	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+		return (0);
+
+	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	if (lrc->lrc_txtype == TX_WRITE) {
+		zil_scan_arg_t *zsa = arg;
+		dsl_pool_t *dp = zsa->zsa_dp;
+		dsl_scan_t *scn = dp->dp_scan;
+		zil_header_t *zh = zsa->zsa_zh;
+		lr_write_t *lr = (lr_write_t *)lrc;
+		blkptr_t *bp = &lr->lr_blkptr;
+		zbookmark_t zb;
+
+		if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+			return (0);
+
+		/*
+		 * birth can be < claim_txg if this record's txg is
+		 * already txg sync'ed (but this log block contains
+		 * other records that are not synced)
+		 */
+		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+			return (0);
+
+		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+		    lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
+
+		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+	}
+	return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+	uint64_t claim_txg = zh->zh_claim_txg;
+	zil_scan_arg_t zsa = { dp, zh };
+	zilog_t *zilog;
+
+	/*
+	 * We only want to visit blocks that have been claimed but not yet
+	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+	 */
+	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+		return;
+
+	zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+	    claim_txg);
+
+	zil_free(zilog);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+    uint64_t objset, uint64_t object, uint64_t blkid)
+{
+	zbookmark_t czb;
+	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+
+	if (zfs_no_scrub_prefetch)
+		return;
+
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+		return;
+
+	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+	/*
+	 * XXX need to make sure all of these arc_read() prefetches are
+	 * done before setting xlateall (similar to dsl_read())
+	 */
+	(void) arc_read(scn->scn_prefetch_zio_root, scn->scn_dp->dp_spa, bp,
+	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+	    &flags, &czb);
+}
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+    const zbookmark_t *zb)
+{
+	/*
+	 * We never skip over user/group accounting objects (obj<0)
+	 */
+	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+	    (int64_t)zb->zb_object >= 0) {
+		/*
+		 * If we already visited this bp & everything below (in
+		 * a prior txg sync), don't bother doing it again.
+		 */
+		if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+			return (B_TRUE);
+
+		/*
+		 * If we found the block we're trying to resume from, or
+		 * we went past it to a different object, zero it out to
+		 * indicate that it's OK to start checking for pausing
+		 * again.
+		 */
+		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+			dprintf("resuming at %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+		}
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Return nonzero on i/o error.
+ * Return new buf to write out in *bufp.
+ */
+static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, const blkptr_t *bp,
+    const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	int err;
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		uint32_t flags = ARC_WAIT;
+		int i;
+		blkptr_t *cbp;
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+		    arc_getbuf_func, bufp,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+			dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+			    zb->zb_object, zb->zb_blkid * epb + i);
+		}
+		for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+			zbookmark_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			dsl_scan_visitbp(cbp, &czb, dnp,
+			    *bufp, ds, scn, ostype, tx);
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
+		uint32_t flags = ARC_WAIT;
+
+		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+		    arc_getbuf_func, bufp,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		uint32_t flags = ARC_WAIT;
+		dnode_phys_t *cdnp;
+		int i, j;
+		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+		    arc_getbuf_func, bufp,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+			for (j = 0; j < cdnp->dn_nblkptr; j++) {
+				blkptr_t *cbp = &cdnp->dn_blkptr[j];
+				dsl_scan_prefetch(scn, *bufp, cbp,
+				    zb->zb_objset, zb->zb_blkid * epb + i, j);
+			}
+		}
+		for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+			dsl_scan_visitdnode(scn, ds, ostype,
+			    cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+		}
+
+	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		uint32_t flags = ARC_WAIT;
+		objset_phys_t *osp;
+
+		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+		    arc_getbuf_func, bufp,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+
+		osp = (*bufp)->b_data;
+
+		if (DSL_SCAN_IS_SCRUB_RESILVER(scn))
+			dsl_scan_zil(dp, &osp->os_zil_header);
+
+		dsl_scan_visitdnode(scn, ds, osp->os_type,
+		    &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+
+		if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+			/*
+			 * We also always visit user/group accounting
+			 * objects, and never skip them, even if we are
+			 * pausing.  This is necessary so that the space
+			 * deltas from this txg get integrated.
+			 */
+			dsl_scan_visitdnode(scn, ds, osp->os_type,
+			    &osp->os_groupused_dnode, *bufp,
+			    DMU_GROUPUSED_OBJECT, tx);
+			dsl_scan_visitdnode(scn, ds, osp->os_type,
+			    &osp->os_userused_dnode, *bufp,
+			    DMU_USERUSED_OBJECT, tx);
+		}
+	}
+
+	return (0);
+}
+
+static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+    dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+    uint64_t object, dmu_tx_t *tx)
+{
+	int j;
+
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		zbookmark_t czb;
+
+		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+		    dnp->dn_nlevels - 1, j);
+		dsl_scan_visitbp(&dnp->dn_blkptr[j],
+		    &czb, dnp, buf, ds, scn, ostype, tx);
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		zbookmark_t czb;
+		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+		    0, DMU_SPILL_BLKID);
+		dsl_scan_visitbp(&dnp->dn_spill,
+		    &czb, dnp, buf, ds, scn, ostype, tx);
+	}
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
+    dnode_phys_t *dnp, arc_buf_t *pbuf,
+    dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+    dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	arc_buf_t *buf = NULL;
+	blkptr_t bp_toread = *bp;
+
+	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+
+	if (dsl_scan_check_pause(scn, zb))
+		return;
+
+	if (dsl_scan_check_resume(scn, dnp, zb))
+		return;
+
+	if (bp->blk_birth == 0)
+		return;
+
+	scn->scn_visited_this_txg++;
+
+	dprintf_bp(bp,
+	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+	    ds, ds ? ds->ds_object : 0,
+	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+	    pbuf, bp);
+
+	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		return;
+
+	if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
+		/*
+		 * For non-user-accounting blocks, we need to read the
+		 * new bp (from a deleted snapshot, found in
+		 * check_existing_xlation).  If we used the old bp,
+		 * pointers inside this block from before we resumed
+		 * would be untranslated.
+		 *
+		 * For user-accounting blocks, we need to read the old
+		 * bp, because we will apply the entire space delta to
+		 * it (original untranslated -> translations from
+		 * deleted snap -> now).
+		 */
+		bp_toread = *bp;
+	}
+
+	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+	    &buf) != 0)
+		return;
+
+	/*
+	 * If dsl_scan_ddt() has aready visited this block, it will have
+	 * already done any translations or scrubbing, so don't call the
+	 * callback again.
+	 */
+	if (ddt_class_contains(dp->dp_spa,
+	    scn->scn_phys.scn_ddt_class_max, bp)) {
+		ASSERT(buf == NULL);
+		return;
+	}
+
+	/*
+	 * If this block is from the future (after cur_max_txg), then we
+	 * are doing this on behalf of a deleted snapshot, and we will
+	 * revisit the future block on the next pass of this dataset.
+	 * Don't scan it now unless we need to because something
+	 * under it was modified.
+	 */
+	if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+	}
+	if (buf)
+		(void) arc_buf_remove_ref(buf, &buf);
+}
+
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+    dmu_tx_t *tx)
+{
+	zbookmark_t zb;
+
+	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+	dsl_scan_visitbp(bp, &zb, NULL, NULL,
+	    ds, scn, DMU_OST_NONE, tx);
+
+	dprintf_ds(ds, "finished scan%s", "");
+}
+
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+		if (dsl_dataset_is_snapshot(ds)) {
+			/* Note, scn_cur_{min,max}_txg stays the same. */
+			scn->scn_phys.scn_bookmark.zb_objset =
+			    ds->ds_phys->ds_next_snap_obj;
+			zfs_dbgmsg("destroying ds %llu; currently traversing; "
+			    "reset zb_objset to %llu",
+			    (u_longlong_t)ds->ds_object,
+			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+		} else {
+			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+			    ZB_DESTROYED_OBJSET, 0, 0, 0);
+			zfs_dbgmsg("destroying ds %llu; currently traversing; "
+			    "reset bookmark to -1,0,0,0",
+			    (u_longlong_t)ds->ds_object);
+		}
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+		ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+		if (dsl_dataset_is_snapshot(ds)) {
+			/*
+			 * We keep the same mintxg; it could be >
+			 * ds_creation_txg if the previous snapshot was
+			 * deleted too.
+			 */
+			VERIFY(zap_add_int_key(dp->dp_meta_objset,
+			    scn->scn_phys.scn_queue_obj,
+			    ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+			zfs_dbgmsg("destroying ds %llu; in queue; "
+			    "replacing with %llu",
+			    (u_longlong_t)ds->ds_object,
+			    (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+		} else {
+			zfs_dbgmsg("destroying ds %llu; in queue; removing",
+			    (u_longlong_t)ds->ds_object);
+		}
+	} else {
+		zfs_dbgmsg("destroying ds %llu; ignoring",
+		    (u_longlong_t)ds->ds_object);
+	}
+
+	/*
+	 * dsl_scan_sync() should be called after this, and should sync
+	 * out our changed state, but just to be safe, do it here.
+	 */
+	dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset =
+		    ds->ds_phys->ds_prev_snap_obj;
+		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds->ds_object,
+		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj,
+		    ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+		zfs_dbgmsg("snapshotting ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds->ds_object,
+		    (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+	}
+	dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds1->ds_object,
+		    (u_longlong_t)ds2->ds_object);
+	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds2->ds_object,
+		    (u_longlong_t)ds1->ds_object);
+	}
+
+	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds1->ds_object, &mintxg) == 0) {
+		int err;
+
+		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+		err = zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+		VERIFY(err == 0 || err == EEXIST);
+		if (err == EEXIST) {
+			/* Both were there to begin with */
+			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+			    scn->scn_phys.scn_queue_obj,
+			    ds1->ds_object, mintxg, tx));
+		}
+		zfs_dbgmsg("clone_swap ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds1->ds_object,
+		    (u_longlong_t)ds2->ds_object);
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+		ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+		zfs_dbgmsg("clone_swap ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds2->ds_object,
+		    (u_longlong_t)ds1->ds_object);
+	}
+
+	dsl_scan_sync_state(scn, tx);
+}
+
+struct enqueue_clones_arg {
+	dmu_tx_t *tx;
+	uint64_t originobj;
+};
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	struct enqueue_clones_arg *eca = arg;
+	dsl_dataset_t *ds;
+	int err;
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+	if (err)
+		return (err);
+
+	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+			dsl_dataset_t *prev;
+			err = dsl_dataset_hold_obj(dp,
+			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+
+			dsl_dataset_rele(ds, FTAG);
+			if (err)
+				return (err);
+			ds = prev;
+		}
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object,
+		    ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+	}
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	dsl_dataset_t *ds;
+
+	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+	/*
+	 * Iterate over the bps in this ds.
+	 */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+
+	char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+	dsl_dataset_name(ds, dsname);
+	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+	    "pausing=%u",
+	    (longlong_t)dsobj, dsname,
+	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
+	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
+	    (int)scn->scn_pausing);
+	kmem_free(dsname, ZFS_MAXNAMELEN);
+
+	if (scn->scn_pausing)
+		goto out;
+
+	/*
+	 * We've finished this pass over this dataset.
+	 */
+
+	/*
+	 * If we did not completely visit this dataset, do another pass.
+	 */
+	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+		zfs_dbgmsg("incomplete pass; visiting again");
+		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object,
+		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
+		goto out;
+	}
+
+	/*
+	 * Add descendent datasets to work queue.
+	 */
+	if (ds->ds_phys->ds_next_snap_obj != 0) {
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
+		    ds->ds_phys->ds_creation_txg, tx) == 0);
+	}
+	if (ds->ds_phys->ds_num_children > 1) {
+		boolean_t usenext = B_FALSE;
+		if (ds->ds_phys->ds_next_clones_obj != 0) {
+			uint64_t count;
+			/*
+			 * A bug in a previous version of the code could
+			 * cause upgrade_clones_cb() to not set
+			 * ds_next_snap_obj when it should, leading to a
+			 * missing entry.  Therefore we can only use the
+			 * next_clones_obj when its count is correct.
+			 */
+			int err = zap_count(dp->dp_meta_objset,
+			    ds->ds_phys->ds_next_clones_obj, &count);
+			if (err == 0 &&
+			    count == ds->ds_phys->ds_num_children - 1)
+				usenext = B_TRUE;
+		}
+
+		if (usenext) {
+			VERIFY(zap_join_key(dp->dp_meta_objset,
+			    ds->ds_phys->ds_next_clones_obj,
+			    scn->scn_phys.scn_queue_obj,
+			    ds->ds_phys->ds_creation_txg, tx) == 0);
+		} else {
+			struct enqueue_clones_arg eca;
+			eca.tx = tx;
+			eca.originobj = ds->ds_object;
+
+			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+		}
+	}
+
+out:
+	dsl_dataset_rele(ds, FTAG);
+}
+
+/* ARGSUSED */
+static int
+enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	dsl_dataset_t *ds;
+	int err;
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+	if (err)
+		return (err);
+
+	while (ds->ds_phys->ds_prev_snap_obj != 0) {
+		dsl_dataset_t *prev;
+		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+		    FTAG, &prev);
+		if (err) {
+			dsl_dataset_rele(ds, FTAG);
+			return (err);
+		}
+
+		/*
+		 * If this is a clone, we don't need to worry about it for now.
+		 */
+		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_dataset_rele(prev, FTAG);
+			return (0);
+		}
+		dsl_dataset_rele(ds, FTAG);
+		ds = prev;
+	}
+
+	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+	ddt_entry_t dde = { 0 };
+	int error;
+	uint64_t n = 0;
+
+	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+		ddt_t *ddt;
+
+		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+			break;
+		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+		    (longlong_t)ddb->ddb_class,
+		    (longlong_t)ddb->ddb_type,
+		    (longlong_t)ddb->ddb_checksum,
+		    (longlong_t)ddb->ddb_cursor);
+
+		/* There should be no pending changes to the dedup table */
+		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+		n++;
+
+		if (dsl_scan_check_pause(scn, NULL))
+			break;
+	}
+
+	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+	    (int)scn->scn_pausing);
+
+	ASSERT(error == 0 || error == ENOENT);
+	ASSERT(error != ENOENT ||
+	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	const ddt_key_t *ddk = &dde->dde_key;
+	ddt_phys_t *ddp = dde->dde_phys;
+	blkptr_t bp;
+	zbookmark_t zb = { 0 };
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0 ||
+		    ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+			continue;
+		ddt_bp_create(checksum, ddk, ddp, &bp);
+
+		scn->scn_visited_this_txg++;
+		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+	}
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+	    scn->scn_phys.scn_ddt_class_max) {
+		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+		dsl_scan_ddt(scn, tx);
+		if (scn->scn_pausing)
+			return;
+	}
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+		/* First do the MOS & ORIGIN */
+
+		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+		dsl_scan_visit_rootbp(scn, NULL,
+		    &dp->dp_meta_rootbp, tx);
+		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+		if (scn->scn_pausing)
+			return;
+
+		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+		} else {
+			dsl_scan_visitds(scn,
+			    dp->dp_origin_snap->ds_object, tx);
+		}
+		ASSERT(!scn->scn_pausing);
+	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
+	    ZB_DESTROYED_OBJSET) {
+		/*
+		 * If we were paused, continue from here.  Note if the
+		 * ds we were paused on was deleted, the zb_objset may
+		 * be -1, so we will skip this and find a new objset
+		 * below.
+		 */
+		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+		if (scn->scn_pausing)
+			return;
+	}
+
+	/*
+	 * In case we were paused right at the end of the ds, zero the
+	 * bookmark so we don't think that we're still trying to resume.
+	 */
+	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+
+	/* keep pulling things out of the zap-object-as-queue */
+	while (zap_cursor_init(&zc, dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj),
+	    zap_cursor_retrieve(&zc, &za) == 0) {
+		dsl_dataset_t *ds;
+		uint64_t dsobj;
+
+		dsobj = strtonum(za.za_name, NULL);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, dsobj, tx));
+
+		/* Set up min/max txg */
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+		if (za.za_first_integer != 0) {
+			scn->scn_phys.scn_cur_min_txg =
+			    MAX(scn->scn_phys.scn_min_txg,
+			    za.za_first_integer);
+		} else {
+			scn->scn_phys.scn_cur_min_txg =
+			    MAX(scn->scn_phys.scn_min_txg,
+			    ds->ds_phys->ds_prev_snap_txg);
+		}
+		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+		dsl_dataset_rele(ds, FTAG);
+
+		dsl_scan_visitds(scn, dsobj, tx);
+		zap_cursor_fini(&zc);
+		if (scn->scn_pausing)
+			return;
+	}
+	zap_cursor_fini(&zc);
+}
+
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+	spa_t *spa = dp->dp_spa;
+
+	/*
+	 * Check for scn_restart_txg before checking spa_load_state, so
+	 * that we can restart an old-style scan while the pool is being
+	 * imported (see dsl_scan_init).
+	 */
+	if (scn->scn_restart_txg != 0 &&
+	    scn->scn_restart_txg <= tx->tx_txg) {
+		pool_scan_func_t func = POOL_SCAN_SCRUB;
+		dsl_scan_done(scn, B_FALSE, tx);
+		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+			func = POOL_SCAN_RESILVER;
+		zfs_dbgmsg("restarting scan func=%u txg=%llu",
+		    func, tx->tx_txg);
+		dsl_scan_setup_sync(scn, &func, tx);
+	}
+
+	if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE ||
+	    spa_shutting_down(spa) ||
+	    spa_sync_pass(dp->dp_spa) > 1 ||
+	    scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	scn->scn_visited_this_txg = 0;
+	scn->scn_pausing = B_FALSE;
+	scn->scn_sync_start_time = gethrtime();
+	spa->spa_scrub_active = B_TRUE;
+
+	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+	    scn->scn_phys.scn_ddt_class_max) {
+		zfs_dbgmsg("doing scan sync txg %llu; "
+		    "ddt bm=%llu/%llu/%llu/%llx",
+		    (longlong_t)tx->tx_txg,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+	} else {
+		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+		    (longlong_t)tx->tx_txg,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+	}
+
+	scn->scn_prefetch_zio_root = zio_root(dp->dp_spa, NULL,
+	    NULL, ZIO_FLAG_CANFAIL);
+	dsl_scan_visit(scn, tx);
+	(void) zio_wait(scn->scn_prefetch_zio_root);
+	scn->scn_prefetch_zio_root = NULL;
+
+	zfs_dbgmsg("visited %llu blocks in %llums",
+	    (longlong_t)scn->scn_visited_this_txg,
+	    (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+
+	if (!scn->scn_pausing) {
+		/* finished with scan. */
+		zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
+		dsl_scan_done(scn, B_TRUE, tx);
+	}
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > 0) {
+			cv_wait(&spa->spa_scrub_io_cv,
+			    &spa->spa_scrub_lock);
+		}
+		mutex_exit(&spa->spa_scrub_lock);
+	}
+
+	dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * This will start a new scan, or restart an existing one.
+ */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+	if (txg == 0) {
+		dmu_tx_t *tx;
+		tx = dmu_tx_create_dd(dp->dp_mos_dir);
+		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+		txg = dmu_tx_get_txg(tx);
+		dp->dp_scan->scn_restart_txg = txg;
+		dmu_tx_commit(tx);
+	} else {
+		dp->dp_scan->scn_restart_txg = txg;
+	}
+	zfs_dbgmsg("restarting resilver txg=%llu", txg);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+/*
+ * scrub consumers
+ */
+
+static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+	int i;
+
+	/*
+	 * If we resume after a reboot, zab will be NULL; don't record
+	 * incomplete stats in that case.
+	 */
+	if (zab == NULL)
+		return;
+
+	for (i = 0; i < 4; i++) {
+		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+		zfs_blkstat_t *zb = &zab->zab_type[l][t];
+		int equal;
+
+		zb->zb_count++;
+		zb->zb_asize += BP_GET_ASIZE(bp);
+		zb->zb_lsize += BP_GET_LSIZE(bp);
+		zb->zb_psize += BP_GET_PSIZE(bp);
+		zb->zb_gangs += BP_COUNT_GANG(bp);
+
+		switch (BP_GET_NDVAS(bp)) {
+		case 2:
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1]))
+				zb->zb_ditto_2_of_2_samevdev++;
+			break;
+		case 3:
+			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]));
+			if (equal == 1)
+				zb->zb_ditto_2_of_3_samevdev++;
+			else if (equal == 3)
+				zb->zb_ditto_3_of_3_samevdev++;
+			break;
+		}
+	}
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+
+	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	if (zio->io_error && (zio->io_error != ECKSUM ||
+	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+	}
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+    const blkptr_t *bp, const zbookmark_t *zb)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+	size_t size = BP_GET_PSIZE(bp);
+	spa_t *spa = dp->dp_spa;
+	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+	boolean_t needs_io;
+	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+	int zio_priority;
+
+	if (phys_birth <= scn->scn_phys.scn_min_txg ||
+	    phys_birth >= scn->scn_phys.scn_max_txg)
+		return (0);
+
+	count_block(dp->dp_blkstats, bp);
+
+	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+		zio_flags |= ZIO_FLAG_SCRUB;
+		zio_priority = ZIO_PRIORITY_SCRUB;
+		needs_io = B_TRUE;
+	} else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+		zio_flags |= ZIO_FLAG_RESILVER;
+		zio_priority = ZIO_PRIORITY_RESILVER;
+		needs_io = B_FALSE;
+	}
+
+	/* If it's an intent log block, failure is expected. */
+	if (zb->zb_level == ZB_ZIL_LEVEL)
+		zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+		vdev_t *vd = vdev_lookup_top(spa,
+		    DVA_GET_VDEV(&bp->blk_dva[d]));
+
+		/*
+		 * Keep track of how much data we've examined so that
+		 * zpool(1M) status can make useful progress reports.
+		 */
+		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+
+		/* if it's a resilver, this may not be in the target range */
+		if (!needs_io) {
+			if (DVA_GET_GANG(&bp->blk_dva[d])) {
+				/*
+				 * Gang members may be spread across multiple
+				 * vdevs, so the best estimate we have is the
+				 * scrub range, which has already been checked.
+				 * XXX -- it would be better to change our
+				 * allocation policy to ensure that all
+				 * gang members reside on the same vdev.
+				 */
+				needs_io = B_TRUE;
+			} else {
+				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+				    phys_birth, 1);
+			}
+		}
+	}
+
+	if (needs_io && !zfs_no_scrub_io) {
+		void *data = zio_data_buf_alloc(size);
+
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_scrub_inflight++;
+		mutex_exit(&spa->spa_scrub_lock);
+
+		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		    dsl_scan_scrub_done, NULL, zio_priority,
+		    zio_flags, zb));
+	}
+
+	/* do not relocate this block */
+	return (0);
+}
+
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+	spa_t *spa = dp->dp_spa;
+
+	/*
+	 * Purge all vdev caches and probe all devices.  We do this here
+	 * rather than in sync context because this requires a writer lock
+	 * on the spa_config lock, which we can't do from sync context.  The
+	 * spa_scrub_reopen flag indicates that vdev_open() should not
+	 * attempt to start another scrub.
+	 */
+	spa_vdev_state_enter(spa, SCL_NONE);
+	spa->spa_scrub_reopen = B_TRUE;
+	vdev_reopen(spa->spa_root_vdev);
+	spa->spa_scrub_reopen = B_FALSE;
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+
+	return (dsl_sync_task_do(dp, dsl_scan_setup_check,
+	    dsl_scan_setup_sync, dp->dp_scan, &func, 0));
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c
deleted file mode 100644
index b16ff66586..0000000000
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c
+++ /dev/null
@@ -1,1214 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zil_impl.h>
-#include <sys/zio_checksum.h>
-#include <sys/ddt.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-
-typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
-
-static scrub_cb_t dsl_pool_scrub_clean_cb;
-static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
-static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
-    uint64_t objset, uint64_t object);
-
-int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
-int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
-boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
-enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
-
-extern int zfs_txg_timeout;
-
-static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
-	NULL,
-	dsl_pool_scrub_clean_cb
-};
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = arg1;
-	enum scrub_func *funcp = arg2;
-	dmu_object_type_t ot = 0;
-	boolean_t complete = B_FALSE;
-
-	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
-
-	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
-	ASSERT(*funcp > SCRUB_FUNC_NONE);
-	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
-
-	dp->dp_scrub_min_txg = 0;
-	dp->dp_scrub_max_txg = tx->tx_txg;
-	dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;
-
-	if (*funcp == SCRUB_FUNC_CLEAN) {
-		vdev_t *rvd = dp->dp_spa->spa_root_vdev;
-
-		/* rewrite all disk labels */
-		vdev_config_dirty(rvd);
-
-		if (vdev_resilver_needed(rvd,
-		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
-			spa_event_notify(dp->dp_spa, NULL,
-			    ESC_ZFS_RESILVER_START);
-			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
-			    tx->tx_txg);
-		} else {
-			spa_event_notify(dp->dp_spa, NULL,
-			    ESC_ZFS_SCRUB_START);
-		}
-
-		/* zero out the scrub stats in all vdev_stat_t's */
-		vdev_scrub_stat_update(rvd,
-		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
-		    POOL_SCRUB_EVERYTHING, B_FALSE);
-
-		/*
-		 * If this is an incremental scrub, limit the DDT scrub phase
-		 * to just the auto-ditto class (for correctness); the rest
-		 * of the scrub should go faster using top-down pruning.
-		 */
-		if (dp->dp_scrub_min_txg > TXG_INITIAL)
-			dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;
-
-		dp->dp_spa->spa_scrub_started = B_TRUE;
-	}
-
-	/* back to the generic stuff */
-
-	if (dp->dp_blkstats == NULL) {
-		dp->dp_blkstats =
-		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
-	}
-	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
-
-	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
-		ot = DMU_OT_ZAP_OTHER;
-
-	dp->dp_scrub_func = *funcp;
-	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
-	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
-	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
-	dp->dp_scrub_restart = B_FALSE;
-	dp->dp_spa->spa_scrub_errors = 0;
-
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
-	    &dp->dp_scrub_func, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_queue_obj, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_min_txg, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_max_txg, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
-	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
-	    &dp->dp_scrub_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
-	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
-	    &dp->dp_scrub_ddt_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_ddt_class_max, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-	    &dp->dp_spa->spa_scrub_errors, tx));
-
-	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
-	    "func=%u mintxg=%llu maxtxg=%llu",
-	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
-}
-
-int
-dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
-{
-	return (dsl_sync_task_do(dp, NULL,
-	    dsl_pool_scrub_setup_sync, dp, &func, 0));
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = arg1;
-	boolean_t *completep = arg2;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	mutex_enter(&dp->dp_scrub_cancel_lock);
-
-	if (dp->dp_scrub_restart) {
-		dp->dp_scrub_restart = B_FALSE;
-		*completep = B_FALSE;
-	}
-
-	/* XXX this is scrub-clean specific */
-	mutex_enter(&dp->dp_spa->spa_scrub_lock);
-	while (dp->dp_spa->spa_scrub_inflight > 0) {
-		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
-		    &dp->dp_spa->spa_scrub_lock);
-	}
-	mutex_exit(&dp->dp_spa->spa_scrub_lock);
-	dp->dp_spa->spa_scrub_active = B_FALSE;
-
-	dp->dp_scrub_func = SCRUB_FUNC_NONE;
-	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
-	    dp->dp_scrub_queue_obj, tx));
-	dp->dp_scrub_queue_obj = 0;
-	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
-
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_QUEUE, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_MIN_TXG, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_MAX_TXG, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_BOOKMARK, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_FUNC, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_ERRORS, tx));
-
-	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
-	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);
-
-	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
-	    "complete=%u", *completep);
-
-	/* below is scrub-clean specific */
-	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
-	    *completep);
-	/*
-	 * If the scrub/resilver completed, update all DTLs to reflect this.
-	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
-	 */
-	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
-	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
-	dp->dp_spa->spa_scrub_started = B_FALSE;
-	if (*completep)
-		spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
-		    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
-	spa_errlog_rotate(dp->dp_spa);
-
-	/*
-	 * We may have finished replacing a device.
-	 * Let the async thread assess this and handle the detach.
-	 */
-	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
-
-	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
-	mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-int
-dsl_pool_scrub_cancel(dsl_pool_t *dp)
-{
-	boolean_t complete = B_FALSE;
-
-	return (dsl_sync_task_do(dp, NULL,
-	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
-}
-
-void
-dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
-{
-	/*
-	 * This function will be used by bp-rewrite wad to intercept frees.
-	 */
-	zio_free(dp->dp_spa, txg, bpp);
-}
-
-void
-dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
-{
-	ASSERT(dsl_pool_sync_context(dp));
-	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
-}
-
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
-	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-	    zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-	uint64_t zb1nextL0, zb2thisobj;
-
-	ASSERT(zb1->zb_objset == zb2->zb_objset);
-	ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
-	ASSERT(zb2->zb_level == 0);
-
-	/*
-	 * A bookmark in the deadlist is considered to be after
-	 * everything else.
-	 */
-	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-		return (B_TRUE);
-
-	/* The objset_phys_t isn't before anything. */
-	if (dnp == NULL)
-		return (B_FALSE);
-
-	zb1nextL0 = (zb1->zb_blkid + 1) <<
-	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-	zb2thisobj = zb2->zb_object ? zb2->zb_object :
-	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t nextobj = zb1nextL0 *
-		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-		return (nextobj <= zb2thisobj);
-	}
-
-	if (zb1->zb_object < zb2thisobj)
-		return (B_TRUE);
-	if (zb1->zb_object > zb2thisobj)
-		return (B_FALSE);
-	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-		return (B_FALSE);
-	return (zb1nextL0 <= zb2->zb_blkid);
-}
-
-static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
-{
-	uint64_t elapsed_nanosecs;
-	int mintime;
-
-	if (dp->dp_scrub_pausing)
-		return (B_TRUE); /* we're already pausing */
-
-	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
-		return (B_FALSE); /* we're resuming */
-
-	/* We only know how to resume from level-0 blocks. */
-	if (zb != NULL && zb->zb_level != 0)
-		return (B_FALSE);
-
-	mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time_ms :
-	    zfs_scrub_min_time_ms;
-	elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time;
-	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-	    (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) {
-		if (zb) {
-			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
-			    (longlong_t)zb->zb_objset,
-			    (longlong_t)zb->zb_object,
-			    (longlong_t)zb->zb_level,
-			    (longlong_t)zb->zb_blkid);
-			dp->dp_scrub_bookmark = *zb;
-		}
-		if (ddb) {
-			dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
-			    (longlong_t)ddb->ddb_class,
-			    (longlong_t)ddb->ddb_type,
-			    (longlong_t)ddb->ddb_checksum,
-			    (longlong_t)ddb->ddb_cursor);
-			ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
-		}
-		dp->dp_scrub_pausing = B_TRUE;
-		return (B_TRUE);
-	}
-	return (B_FALSE);
-}
-
-typedef struct zil_traverse_arg {
-	dsl_pool_t	*zta_dp;
-	zil_header_t	*zta_zh;
-} zil_traverse_arg_t;
-
-/* ARGSUSED */
-static int
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
-	zil_traverse_arg_t *zta = arg;
-	dsl_pool_t *dp = zta->zta_dp;
-	zil_header_t *zh = zta->zta_zh;
-	zbookmark_t zb;
-
-	if (bp->blk_birth <= dp->dp_scrub_min_txg)
-		return (0);
-
-	/*
-	 * One block ("stubby") can be allocated a long time ago; we
-	 * want to visit that one because it has been allocated
-	 * (on-disk) even if it hasn't been claimed (even though for
-	 * plain scrub there's nothing to do to it).
-	 */
-	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
-		return (0);
-
-	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
-	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
-
-	VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
-	return (0);
-}
-
-/* ARGSUSED */
-static int
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
-	if (lrc->lrc_txtype == TX_WRITE) {
-		zil_traverse_arg_t *zta = arg;
-		dsl_pool_t *dp = zta->zta_dp;
-		zil_header_t *zh = zta->zta_zh;
-		lr_write_t *lr = (lr_write_t *)lrc;
-		blkptr_t *bp = &lr->lr_blkptr;
-		zbookmark_t zb;
-
-		if (bp->blk_birth <= dp->dp_scrub_min_txg)
-			return (0);
-
-		/*
-		 * birth can be < claim_txg if this record's txg is
-		 * already txg sync'ed (but this log block contains
-		 * other records that are not synced)
-		 */
-		if (claim_txg == 0 || bp->blk_birth < claim_txg)
-			return (0);
-
-		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
-		    lr->lr_foid, ZB_ZIL_LEVEL,
-		    lr->lr_offset / BP_GET_LSIZE(bp));
-
-		VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
-	}
-	return (0);
-}
-
-static void
-traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
-{
-	uint64_t claim_txg = zh->zh_claim_txg;
-	zil_traverse_arg_t zta = { dp, zh };
-	zilog_t *zilog;
-
-	/*
-	 * We only want to visit blocks that have been claimed but not yet
-	 * replayed (or, in read-only mode, blocks that *would* be claimed).
-	 */
-	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
-		return;
-
-	zilog = zil_alloc(dp->dp_meta_objset, zh);
-
-	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
-	    claim_txg);
-
-	zil_free(zilog);
-}
-
-static void
-scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset,
-    uint64_t object, uint64_t blkid)
-{
-	zbookmark_t czb;
-	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
-
-	if (zfs_no_scrub_prefetch)
-		return;
-
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg ||
-	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
-		return;
-
-	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
-
-	(void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp,
-	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
-	    &flags, &czb);
-}
-
-static void
-scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
-    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
-{
-	int err;
-	arc_buf_t *buf = NULL;
-
-	if (bp->blk_birth <= dp->dp_scrub_min_txg)
-		return;
-
-	if (scrub_pause(dp, zb, NULL))
-		return;
-
-	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
-		/*
-		 * If we already visited this bp & everything below (in
-		 * a prior txg), don't bother doing it again.
-		 */
-		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
-			return;
-
-		/*
-		 * If we found the block we're trying to resume from, or
-		 * we went past it to a different object, zero it out to
-		 * indicate that it's OK to start checking for pausing
-		 * again.
-		 */
-		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
-		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
-			dprintf("resuming at %llx/%llx/%llx/%llx\n",
-			    (longlong_t)zb->zb_objset,
-			    (longlong_t)zb->zb_object,
-			    (longlong_t)zb->zb_level,
-			    (longlong_t)zb->zb_blkid);
-			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
-		}
-	}
-
-	/*
-	 * If dsl_pool_scrub_ddt() has aready scrubbed this block,
-	 * don't scrub it again.
-	 */
-	if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
-		(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
-
-	if (BP_GET_LEVEL(bp) > 0) {
-		uint32_t flags = ARC_WAIT;
-		int i;
-		blkptr_t *cbp;
-		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
-
-		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
-		    arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err) {
-			mutex_enter(&dp->dp_spa->spa_scrub_lock);
-			dp->dp_spa->spa_scrub_errors++;
-			mutex_exit(&dp->dp_spa->spa_scrub_lock);
-			return;
-		}
-		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
-			scrub_prefetch(dp, buf, cbp, zb->zb_objset,
-			    zb->zb_object, zb->zb_blkid * epb + i);
-		}
-		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
-			zbookmark_t czb;
-
-			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
-			    zb->zb_level - 1,
-			    zb->zb_blkid * epb + i);
-			scrub_visitbp(dp, dnp, buf, cbp, &czb);
-		}
-	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-		uint32_t flags = ARC_WAIT;
-		dnode_phys_t *cdnp;
-		int i, j;
-		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-
-		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
-		    arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err) {
-			mutex_enter(&dp->dp_spa->spa_scrub_lock);
-			dp->dp_spa->spa_scrub_errors++;
-			mutex_exit(&dp->dp_spa->spa_scrub_lock);
-			return;
-		}
-		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
-			for (j = 0; j < cdnp->dn_nblkptr; j++) {
-				blkptr_t *cbp = &cdnp->dn_blkptr[j];
-				scrub_prefetch(dp, buf, cbp, zb->zb_objset,
-				    zb->zb_blkid * epb + i, j);
-			}
-		}
-		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
-			scrub_visitdnode(dp, cdnp, buf, zb->zb_objset,
-			    zb->zb_blkid * epb + i);
-		}
-	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-		uint32_t flags = ARC_WAIT;
-		objset_phys_t *osp;
-
-		err = arc_read_nolock(NULL, dp->dp_spa, bp,
-		    arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err) {
-			mutex_enter(&dp->dp_spa->spa_scrub_lock);
-			dp->dp_spa->spa_scrub_errors++;
-			mutex_exit(&dp->dp_spa->spa_scrub_lock);
-			return;
-		}
-
-		osp = buf->b_data;
-
-		traverse_zil(dp, &osp->os_zil_header);
-
-		scrub_visitdnode(dp, &osp->os_meta_dnode,
-		    buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
-		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-			scrub_visitdnode(dp, &osp->os_userused_dnode,
-			    buf, zb->zb_objset, DMU_USERUSED_OBJECT);
-			scrub_visitdnode(dp, &osp->os_groupused_dnode,
-			    buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
-		}
-	}
-
-	if (buf)
-		(void) arc_buf_remove_ref(buf, &buf);
-}
-
-static void
-scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
-    uint64_t objset, uint64_t object)
-{
-	int j;
-
-	for (j = 0; j < dnp->dn_nblkptr; j++) {
-		zbookmark_t czb;
-
-		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
-		scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
-
-		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
-			zbookmark_t czb;
-			SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
-			scrub_visitbp(dp, dnp, buf, &dnp->dn_spill, &czb);
-		}
-	}
-}
-
-static void
-scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
-{
-	zbookmark_t zb;
-
-	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
-	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-	scrub_visitbp(dp, NULL, NULL, bp, &zb);
-}
-
-void
-dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
-		SET_BOOKMARK(&dp->dp_scrub_bookmark,
-		    ZB_DESTROYED_OBJSET, 0, 0, 0);
-	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds->ds_object, tx) != 0) {
-		return;
-	}
-
-	if (ds->ds_phys->ds_next_snap_obj != 0) {
-		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
-	}
-	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
-}
-
-void
-dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
-
-	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
-		dp->dp_scrub_bookmark.zb_objset =
-		    ds->ds_phys->ds_prev_snap_obj;
-	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds->ds_object, tx) == 0) {
-		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
-	}
-}
-
-void
-dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
-		dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
-	} else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
-		dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
-	}
-
-	if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds1->ds_object, tx) == 0) {
-		int err = zap_add_int(dp->dp_meta_objset,
-		    dp->dp_scrub_queue_obj, ds2->ds_object, tx);
-		VERIFY(err == 0 || err == EEXIST);
-		if (err == EEXIST) {
-			/* Both were there to begin with */
-			VERIFY(0 == zap_add_int(dp->dp_meta_objset,
-			    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
-		}
-	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds2->ds_object, tx) == 0) {
-		VERIFY(0 == zap_add_int(dp->dp_meta_objset,
-		    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
-	}
-}
-
-struct enqueue_clones_arg {
-	dmu_tx_t *tx;
-	uint64_t originobj;
-};
-
-/* ARGSUSED */
-static int
-enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
-	struct enqueue_clones_arg *eca = arg;
-	dsl_dataset_t *ds;
-	int err;
-	dsl_pool_t *dp;
-
-	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
-	if (err)
-		return (err);
-	dp = ds->ds_dir->dd_pool;
-
-	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
-		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
-			dsl_dataset_t *prev;
-			err = dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
-
-			dsl_dataset_rele(ds, FTAG);
-			if (err)
-				return (err);
-			ds = prev;
-		}
-		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-		    ds->ds_object, eca->tx) == 0);
-	}
-	dsl_dataset_rele(ds, FTAG);
-	return (0);
-}
-
-static void
-scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds;
-	uint64_t min_txg_save;
-
-	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-
-	/*
-	 * Iterate over the bps in this ds.
-	 */
-	min_txg_save = dp->dp_scrub_min_txg;
-	dp->dp_scrub_min_txg =
-	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
-	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
-	dp->dp_scrub_min_txg = min_txg_save;
-
-	if (dp->dp_scrub_pausing)
-		goto out;
-
-	/*
-	 * Add descendent datasets to work queue.
-	 */
-	if (ds->ds_phys->ds_next_snap_obj != 0) {
-		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
-	}
-	if (ds->ds_phys->ds_num_children > 1) {
-		boolean_t usenext = B_FALSE;
-		if (ds->ds_phys->ds_next_clones_obj != 0) {
-			uint64_t count;
-			/*
-			 * A bug in a previous version of the code could
-			 * cause upgrade_clones_cb() to not set
-			 * ds_next_snap_obj when it should, leading to a
-			 * missing entry.  Therefore we can only use the
-			 * next_clones_obj when its count is correct.
-			 */
-			int err = zap_count(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj, &count);
-			if (err == 0 &&
-			    count == ds->ds_phys->ds_num_children - 1)
-				usenext = B_TRUE;
-		}
-
-		if (usenext) {
-			VERIFY(zap_join(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj,
-			    dp->dp_scrub_queue_obj, tx) == 0);
-		} else {
-			struct enqueue_clones_arg eca;
-			eca.tx = tx;
-			eca.originobj = ds->ds_object;
-
-			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
-			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
-		}
-	}
-
-out:
-	dsl_dataset_rele(ds, FTAG);
-}
-
-/* ARGSUSED */
-static int
-enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
-	dmu_tx_t *tx = arg;
-	dsl_dataset_t *ds;
-	int err;
-	dsl_pool_t *dp;
-
-	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
-	if (err)
-		return (err);
-
-	dp = ds->ds_dir->dd_pool;
-
-	while (ds->ds_phys->ds_prev_snap_obj != 0) {
-		dsl_dataset_t *prev;
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    FTAG, &prev);
-		if (err) {
-			dsl_dataset_rele(ds, FTAG);
-			return (err);
-		}
-
-		/*
-		 * If this is a clone, we don't need to worry about it for now.
-		 */
-		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
-			dsl_dataset_rele(ds, FTAG);
-			dsl_dataset_rele(prev, FTAG);
-			return (0);
-		}
-		dsl_dataset_rele(ds, FTAG);
-		ds = prev;
-	}
-
-	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds->ds_object, tx) == 0);
-	dsl_dataset_rele(ds, FTAG);
-	return (0);
-}
-
-/*
- * Scrub/dedup interaction.
- *
- * If there are N references to a deduped block, we don't want to scrub it
- * N times -- ideally, we should scrub it exactly once.
- *
- * We leverage the fact that the dde's replication class (enum ddt_class)
- * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
- * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
- *
- * To prevent excess scrubbing, the scrub begins by walking the DDT
- * to find all blocks with refcnt > 1, and scrubs each of these once.
- * Since there are two replication classes which contain blocks with
- * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
- * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
- *
- * There would be nothing more to say if a block's refcnt couldn't change
- * during a scrub, but of course it can so we must account for changes
- * in a block's replication class.
- *
- * Here's an example of what can occur:
- *
- * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
- * when visited during the top-down scrub phase, it will be scrubbed twice.
- * This negates our scrub optimization, but is otherwise harmless.
- *
- * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
- * on each visit during the top-down scrub phase, it will never be scrubbed.
- * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
- * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
- * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
- * while a scrub is in progress, it scrubs the block right then.
- */
-static void
-dsl_pool_scrub_ddt(dsl_pool_t *dp)
-{
-	ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
-	ddt_entry_t dde;
-	int error;
-
-	while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
-		if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
-			return;
-		dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
-		if (scrub_pause(dp, NULL, ddb))
-			return;
-	}
-	ASSERT(error == ENOENT);
-	ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
-}
-
-void
-dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
-    const ddt_entry_t *dde)
-{
-	const ddt_key_t *ddk = &dde->dde_key;
-	const ddt_phys_t *ddp = dde->dde_phys;
-	blkptr_t blk;
-	zbookmark_t zb = { 0 };
-
-	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-		if (ddp->ddp_phys_birth == 0)
-			continue;
-		ddt_bp_create(checksum, ddk, ddp, &blk);
-		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
-	}
-}
-
-void
-dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
-{
-	spa_t *spa = dp->dp_spa;
-	zap_cursor_t zc;
-	zap_attribute_t za;
-	boolean_t complete = B_TRUE;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	/*
-	 * If the pool is not loaded, or is trying to unload, leave it alone.
-	 */
-	if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa))
-		return;
-
-	if (dp->dp_scrub_restart) {
-		enum scrub_func func = dp->dp_scrub_func;
-		dp->dp_scrub_restart = B_FALSE;
-		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
-	}
-
-	if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
-		/*
-		 * We must have resumed after rebooting; reset the vdev
-		 * stats to know that we're doing a scrub (although it
-		 * will think we're just starting now).
-		 */
-		vdev_scrub_stat_update(spa->spa_root_vdev,
-		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
-		    POOL_SCRUB_EVERYTHING, B_FALSE);
-	}
-
-	dp->dp_scrub_pausing = B_FALSE;
-	dp->dp_scrub_start_time = gethrtime();
-	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
-	spa->spa_scrub_active = B_TRUE;
-
-	if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
-		dsl_pool_scrub_ddt(dp);
-		if (dp->dp_scrub_pausing)
-			goto out;
-	}
-
-	if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
-		/* First do the MOS & ORIGIN */
-		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
-		if (dp->dp_scrub_pausing)
-			goto out;
-
-		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
-			VERIFY(0 == dmu_objset_find_spa(spa,
-			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
-		} else {
-			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
-		}
-		ASSERT(!dp->dp_scrub_pausing);
-	} else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
-		/*
-		 * If we were paused, continue from here.  Note if the ds
-		 * we were paused on was destroyed, the zb_objset will be
-		 * ZB_DESTROYED_OBJSET, so we will skip this and find a new
-		 * objset below.
-		 */
-		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
-		if (dp->dp_scrub_pausing)
-			goto out;
-	}
-
-	/*
-	 * In case we were paused right at the end of the ds, zero the
-	 * bookmark so we don't think that we're still trying to resume.
-	 */
-	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
-	/* keep pulling things out of the zap-object-as-queue */
-	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
-	    zap_cursor_retrieve(&zc, &za) == 0) {
-		VERIFY(0 == zap_remove(dp->dp_meta_objset,
-		    dp->dp_scrub_queue_obj, za.za_name, tx));
-		scrub_visitds(dp, za.za_first_integer, tx);
-		if (dp->dp_scrub_pausing)
-			break;
-		zap_cursor_fini(&zc);
-	}
-	zap_cursor_fini(&zc);
-	if (dp->dp_scrub_pausing)
-		goto out;
-
-	/* done. */
-
-	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
-	return;
-out:
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
-	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
-	    &dp->dp_scrub_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
-	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
-	    &dp->dp_scrub_ddt_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_ddt_class_max, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-	    &spa->spa_scrub_errors, tx));
-}
-
-void
-dsl_pool_scrub_restart(dsl_pool_t *dp)
-{
-	mutex_enter(&dp->dp_scrub_cancel_lock);
-	dp->dp_scrub_restart = B_TRUE;
-	mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-/*
- * scrub consumers
- */
-
-static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
-{
-	int i;
-
-	/*
-	 * If we resume after a reboot, zab will be NULL; don't record
-	 * incomplete stats in that case.
-	 */
-	if (zab == NULL)
-		return;
-
-	for (i = 0; i < 4; i++) {
-		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
-		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
-		zfs_blkstat_t *zb = &zab->zab_type[l][t];
-		int equal;
-
-		zb->zb_count++;
-		zb->zb_asize += BP_GET_ASIZE(bp);
-		zb->zb_lsize += BP_GET_LSIZE(bp);
-		zb->zb_psize += BP_GET_PSIZE(bp);
-		zb->zb_gangs += BP_COUNT_GANG(bp);
-
-		switch (BP_GET_NDVAS(bp)) {
-		case 2:
-			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-			    DVA_GET_VDEV(&bp->blk_dva[1]))
-				zb->zb_ditto_2_of_2_samevdev++;
-			break;
-		case 3:
-			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-			    DVA_GET_VDEV(&bp->blk_dva[1])) +
-			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-			    DVA_GET_VDEV(&bp->blk_dva[2])) +
-			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
-			    DVA_GET_VDEV(&bp->blk_dva[2]));
-			if (equal == 1)
-				zb->zb_ditto_2_of_3_samevdev++;
-			else if (equal == 3)
-				zb->zb_ditto_3_of_3_samevdev++;
-			break;
-		}
-	}
-}
-
-static void
-dsl_pool_scrub_clean_done(zio_t *zio)
-{
-	spa_t *spa = zio->io_spa;
-
-	zio_data_buf_free(zio->io_data, zio->io_size);
-
-	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_inflight--;
-	cv_broadcast(&spa->spa_scrub_io_cv);
-
-	if (zio->io_error && (zio->io_error != ECKSUM ||
-	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
-		spa->spa_scrub_errors++;
-	mutex_exit(&spa->spa_scrub_lock);
-}
-
-static int
-dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
-    const blkptr_t *bp, const zbookmark_t *zb)
-{
-	size_t size = BP_GET_PSIZE(bp);
-	spa_t *spa = dp->dp_spa;
-	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
-	boolean_t needs_io;
-	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
-	int zio_priority;
-
-	if (phys_birth <= dp->dp_scrub_min_txg ||
-	    phys_birth >= dp->dp_scrub_max_txg)
-		return (0);
-
-	count_block(dp->dp_blkstats, bp);
-
-	if (dp->dp_scrub_isresilver == 0) {
-		/* It's a scrub */
-		zio_flags |= ZIO_FLAG_SCRUB;
-		zio_priority = ZIO_PRIORITY_SCRUB;
-		needs_io = B_TRUE;
-	} else {
-		/* It's a resilver */
-		zio_flags |= ZIO_FLAG_RESILVER;
-		zio_priority = ZIO_PRIORITY_RESILVER;
-		needs_io = B_FALSE;
-	}
-
-	/* If it's an intent log block, failure is expected. */
-	if (zb->zb_level == ZB_ZIL_LEVEL)
-		zio_flags |= ZIO_FLAG_SPECULATIVE;
-
-	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
-		vdev_t *vd = vdev_lookup_top(spa,
-		    DVA_GET_VDEV(&bp->blk_dva[d]));
-
-		/*
-		 * Keep track of how much data we've examined so that
-		 * zpool(1M) status can make useful progress reports.
-		 */
-		mutex_enter(&vd->vdev_stat_lock);
-		vd->vdev_stat.vs_scrub_examined +=
-		    DVA_GET_ASIZE(&bp->blk_dva[d]);
-		mutex_exit(&vd->vdev_stat_lock);
-
-		/* if it's a resilver, this may not be in the target range */
-		if (!needs_io) {
-			if (DVA_GET_GANG(&bp->blk_dva[d])) {
-				/*
-				 * Gang members may be spread across multiple
-				 * vdevs, so the best estimate we have is the
-				 * scrub range, which has already been checked.
-				 * XXX -- it would be better to change our
-				 * allocation policy to ensure that all
-				 * gang members reside on the same vdev.
-				 */
-				needs_io = B_TRUE;
-			} else {
-				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
-				    phys_birth, 1);
-			}
-		}
-	}
-
-	if (needs_io && !zfs_no_scrub_io) {
-		void *data = zio_data_buf_alloc(size);
-
-		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
-			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-		spa->spa_scrub_inflight++;
-		mutex_exit(&spa->spa_scrub_lock);
-
-		zio_nowait(zio_read(NULL, spa, bp, data, size,
-		    dsl_pool_scrub_clean_done, NULL, zio_priority,
-		    zio_flags, zb));
-	}
-
-	/* do not relocate this block */
-	return (0);
-}
-
-int
-dsl_pool_scrub_clean(dsl_pool_t *dp)
-{
-	spa_t *spa = dp->dp_spa;
-
-	/*
-	 * Purge all vdev caches and probe all devices.  We do this here
-	 * rather than in sync context because this requires a writer lock
-	 * on the spa_config lock, which we can't do from sync context.  The
-	 * spa_scrub_reopen flag indicates that vdev_open() should not
-	 * attempt to start another scrub.
-	 */
-	spa_vdev_state_enter(spa, SCL_NONE);
-	spa->spa_scrub_reopen = B_TRUE;
-	vdev_reopen(spa->spa_root_vdev);
-	spa->spa_scrub_reopen = B_FALSE;
-	(void) spa_vdev_state_exit(spa, NULL, 0);
-
-	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
-}
diff --git a/usr/src/uts/common/fs/zfs/dsl_synctask.c b/usr/src/uts/common/fs/zfs/dsl_synctask.c
index 81c6334cc8..832685b0fc 100644
--- a/usr/src/uts/common/fs/zfs/dsl_synctask.c
+++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -29,7 +28,6 @@
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/metaslab.h>
-#include <sys/cred.h>
 
 #define	DST_AVG_BLKSHIFT 14
 
@@ -49,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp)
 	list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
 	    offsetof(dsl_sync_task_t, dst_node));
 	dstg->dstg_pool = dp;
-	dstg->dstg_cr = CRED();
 
 	return (dstg);
 }
@@ -136,7 +133,6 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
 	uint64_t txg;
 
 	dstg->dstg_nowaiter = B_TRUE;
-	dstg->dstg_cr = NULL; /* it won't be valid by the time we sync */
 	txg = dmu_tx_get_txg(tx);
 	/*
 	 * We don't generally have many sync tasks, so pay the price of
@@ -200,8 +196,7 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
 		 */
 		for (dst = list_head(&dstg->dstg_tasks); dst;
 		    dst = list_next(&dstg->dstg_tasks, dst)) {
-			dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
-			    dstg->dstg_cr, tx);
+			dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
 		}
 	}
 	rw_exit(&dp->dp_config_rwlock);
diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c
index f1b3b23fe2..8358b4ceeb 100644
--- a/usr/src/uts/common/fs/zfs/refcount.c
+++ b/usr/src/uts/common/fs/zfs/refcount.c
@@ -19,12 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index dd0fbccc7e..510472a515 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -59,6 +59,7 @@
 #include <sys/systeminfo.h>
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
 
 #ifdef	_KERNEL
 #include <sys/bootprops.h>
@@ -110,7 +111,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 };
 
-static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
+static dsl_syncfunc_t spa_sync_props;
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
@@ -1105,7 +1106,7 @@ spa_load_spares(spa_t *spa)
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
-		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
+		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
@@ -1231,7 +1232,7 @@ spa_load_l2cache(spa_t *spa)
 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
-		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
+		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
 out:
@@ -1429,7 +1430,7 @@ spa_load_verify_done(zio_t *zio)
 /*ARGSUSED*/
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	if (bp != NULL) {
 		zio_t *rio = arg;
@@ -1780,6 +1781,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	spa->spa_claim_max_txg = spa->spa_first_txg;
+	spa->spa_prev_software_version = ub->ub_software_version;
 
 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error)
@@ -1851,6 +1853,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
+	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+	    &spa->spa_creation_version);
+	if (error != 0 && error != ENOENT)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
@@ -2076,7 +2083,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		/*
 		 * Check all DTLs to see if anything needs resilvering.
 		 */
-		if (vdev_resilver_needed(rvd, NULL, NULL))
+		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+		    vdev_resilver_needed(rvd, NULL, NULL))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 		/*
@@ -2375,7 +2383,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
 			if (spa_spare_exists(guid, &pool, NULL) &&
 			    pool != 0ULL) {
 				VERIFY(nvlist_lookup_uint64_array(
-				    spares[i], ZPOOL_CONFIG_STATS,
+				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
 				    (uint64_t **)&vs, &vsc) == 0);
 				vs->vs_state = VDEV_STATE_CANT_OPEN;
 				vs->vs_aux = VDEV_AUX_SPARED;
@@ -2432,7 +2440,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
 			ASSERT(vd != NULL);
 
 			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
-			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+			    == 0);
 			vdev_get_stats(vd, vs);
 		}
 	}
@@ -2819,6 +2828,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
+	if (zap_add(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+	    sizeof (uint64_t), 1, &version, tx) != 0) {
+		cmn_err(CE_PANIC, "failed to add pool version");
+	}
+
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
@@ -2861,7 +2876,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
-		spa_sync_props(spa, props, CRED(), tx);
+		spa_sync_props(spa, props, tx);
 	}
 
 	dmu_tx_commit(tx);
@@ -3219,8 +3234,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
 		return (error);
 	}
 
-	spa_async_resume(spa);
-
 	/*
 	 * Override any spares and level 2 cache devices as specified by
 	 * the user, as these may have correct device names/devids, etc.
@@ -3271,6 +3284,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	}
 
+	spa_async_resume(spa);
+
 	/*
 	 * It's possible that the pool was expanded while it was exported.
 	 * We kick off an async task to handle this for us.
@@ -3455,7 +3470,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
-			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+			spa->spa_final_txg = spa_last_synced_txg(spa) +
+			    TXG_DEFER_SIZE + 1;
 			vdev_config_dirty(spa->spa_root_vdev);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
@@ -3635,7 +3651,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 {
-	uint64_t txg, open_txg;
+	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
@@ -3771,13 +3787,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 	vdev_config_dirty(tvd);
 
 	/*
-	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
-	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+	 * for any dmu_sync-ed blocks.  It will propagate upward when
+	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
-	open_txg = txg + TXG_CONCURRENT_STATES - 1;
+	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
-	vdev_dtl_dirty(newvd, DTL_MISSING,
-	    TXG_INITIAL, open_txg - TXG_INITIAL + 1);
+	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+	    dtl_max_txg - TXG_INITIAL);
 
 	if (newvd->vdev_isspare) {
 		spa_spare_activate(newvd);
@@ -3793,10 +3810,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 	 */
 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
-	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+	/*
+	 * Restart the resilver
+	 */
+	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+
+	/*
+	 * Commit the config
+	 */
+	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
-	spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL,
-	    CRED(),  "%s vdev=%s %s vdev=%s",
+	spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
+	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
@@ -3804,11 +3829,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
-	/*
-	 * Kick off a resilver to update newvd.
-	 */
-	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
-
 	return (0);
 }
 
@@ -4004,7 +4024,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
-	spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(),
+	spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
@@ -4024,7 +4044,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 				continue;
 			spa_open_ref(spa, FTAG);
 			mutex_exit(&spa_namespace_lock);
-			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+			(void) spa_vdev_remove(spa, unspare_guid,
+			    B_TRUE);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(spa, FTAG);
 		}
@@ -4266,8 +4287,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 		if (vml[c] != NULL) {
 			vdev_split(vml[c]);
 			if (error == 0)
-				spa_history_internal_log(LOG_POOL_VDEV_DETACH,
-				    spa, tx, CRED(), "vdev=%s",
+				spa_history_log_internal(LOG_POOL_VDEV_DETACH,
+				    spa, tx, "vdev=%s",
 				    vml[c]->vdev_path);
 			vdev_free(vml[c]);
 		}
@@ -4283,7 +4304,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
-	spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(),
+	spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
 	    "split new pool %s from pool %s", newname, spa_name(spa));
 
 	kmem_free(vml, children * sizeof (vdev_t *));
@@ -4359,21 +4380,13 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
 }
 
 /*
- * Removing a device from the vdev namespace requires several steps
- * and can take a significant amount of time.  As a result we use
- * the spa_vdev_config_[enter/exit] functions which allow us to
- * grab and release the spa_config_lock while still holding the namespace
- * lock.  During each step the configuration is synced out.
- */
-
-/*
  * Evacuate the device.
  */
-int
+static int
 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 {
-	int error = 0;
 	uint64_t txg;
+	int error = 0;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
@@ -4386,14 +4399,12 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 	 * should no longer have any blocks allocated on it.
 	 */
 	if (vd->vdev_islog) {
-		error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
-		    NULL, DS_FIND_CHILDREN);
+		if (vd->vdev_stat.vs_alloc != 0)
+			error = spa_offline_log(spa);
 	} else {
-		error = ENOTSUP;	/* until we have bp rewrite */
+		error = ENOTSUP;
 	}
 
-	txg_wait_synced(spa_get_dsl(spa), 0);
-
 	if (error)
 		return (error);
 
@@ -4401,6 +4412,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 	 * The evacuation succeeded.  Remove any remaining MOS metadata
 	 * associated with this vdev, and wait for these changes to sync.
 	 */
+	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
 	txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
 	vdev_dirty(vd, 0, NULL, txg);
@@ -4413,7 +4425,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 /*
  * Complete the removal by cleaning up the namespace.
  */
-void
+static void
 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
@@ -4424,6 +4436,12 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vd == vd->vdev_top);
 
+	/*
+	 * Only remove any devices which are empty.
+	 */
+	if (vd->vdev_stat.vs_alloc != 0)
+		return;
+
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	if (list_link_active(&vd->vdev_state_dirty_node))
@@ -4439,15 +4457,19 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
 		vdev_add_child(rvd, vd);
 	}
-	vdev_config_dirty(rvd);
-
-	/*
-	 * Reassess the health of our root vdev.
-	 */
-	vdev_reopen(rvd);
 }
 
 /*
+ * Remove a device from the pool -
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time.  As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock.  During each step the configuration is synced out.
+ */
+
+/*
  * Remove a device from the pool.  Currently, this supports removing only hot
  * spares, slogs, and level 2 ARC devices.
  */
@@ -4690,40 +4712,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 
 /*
  * ==========================================================================
- * SPA Scrubbing
+ * SPA Scanning
  * ==========================================================================
  */
 
 int
-spa_scrub(spa_t *spa, pool_scrub_type_t type)
+spa_scan_stop(spa_t *spa)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+	if (dsl_scan_resilvering(spa->spa_dsl_pool))
+		return (EBUSY);
+	return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
 
-	if ((uint_t)type >= POOL_SCRUB_TYPES)
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
+{
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
 		return (ENOTSUP);
 
 	/*
 	 * If a resilver was requested, but there is no DTL on a
 	 * writeable leaf device, we have nothing to do.
 	 */
-	if (type == POOL_SCRUB_RESILVER &&
+	if (func == POOL_SCAN_RESILVER &&
 	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 		return (0);
 	}
 
-	if (type == POOL_SCRUB_EVERYTHING &&
-	    spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
-	    spa->spa_dsl_pool->dp_scrub_isresilver)
-		return (EBUSY);
-
-	if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
-		return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
-	} else if (type == POOL_SCRUB_NONE) {
-		return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
-	} else {
-		return (EINVAL);
-	}
+	return (dsl_scan(spa->spa_dsl_pool, func));
 }
 
 /*
@@ -4829,8 +4849,8 @@ spa_async_thread(spa_t *spa)
 		 * then log an internal history event.
 		 */
 		if (new_space != old_space) {
-			spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
-			    spa, NULL, CRED(),
+			spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
+			    spa, NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), new_space, new_space - old_space);
 		}
@@ -4874,7 +4894,7 @@ spa_async_thread(spa_t *spa)
 	 * Kick off a resilver.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER)
-		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
+		dsl_resilver_restart(spa->spa_dsl_pool, 0);
 
 	/*
 	 * Let the world know that we're done.
@@ -4920,6 +4940,7 @@ spa_async_dispatch(spa_t *spa)
 void
 spa_async_request(spa_t *spa, int task)
 {
+	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks |= task;
 	mutex_exit(&spa->spa_async_lock);
@@ -5024,7 +5045,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 		for (i = 0; i < sav->sav_count; i++)
 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
-			    B_FALSE, B_FALSE, B_TRUE);
+			    B_FALSE, VDEV_CONFIG_L2CACHE);
 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
 		    sav->sav_count) == 0);
 		for (i = 0; i < sav->sav_count; i++)
@@ -5064,7 +5085,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
  * Set zpool properties.
  */
 static void
-spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	spa_t *spa = arg1;
 	objset_t *mos = spa->spa_meta_objset;
@@ -5176,8 +5197,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		/* log internal history if this is not a zpool create */
 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
 		    tx->tx_txg != TXG_INITIAL) {
-			spa_history_internal_log(LOG_POOL_PROPSET,
-			    spa, tx, cr, "%s %lld %s",
+			spa_history_log_internal(LOG_POOL_PROPSET,
+			    spa, tx, "%s %lld %s",
 			    nvpair_name(elem), intval, spa_name(spa));
 		}
 	}
@@ -5272,13 +5293,17 @@ spa_sync(spa_t *spa, uint64_t txg)
 	}
 
 	/*
-	 * If anything has changed in this txg, push the deferred frees
-	 * from the previous txg.  If not, leave them alone so that we
-	 * don't generate work on an otherwise idle system.
+	 * If anything has changed in this txg, or if someone is waiting
+	 * for this txg to sync (eg, spa_vdev_remove()), push the
+	 * deferred frees from the previous txg.  If not, leave them
+	 * alone so that we don't generate work on an otherwise idle
+	 * system.
 	 */
 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
-	    !txg_list_empty(&dp->dp_sync_tasks, txg))
+	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
+	    ((dp->dp_scan->scn_phys.scn_state == DSS_SCANNING ||
+	    txg_sync_waiting(dp)) && !spa_shutting_down(spa)))
 		spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);
 
 	/*
@@ -5304,11 +5329,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 		}
 
 		ddt_sync(spa, txg);
-
-		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight > 0)
-			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-		mutex_exit(&spa->spa_scrub_lock);
+		dsl_scan_sync(dp, tx);
 
 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 			vdev_sync(vd, txg);
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index 68a40bec89..cdeda3f93c 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -419,7 +418,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 		    split_guid) == 0);
 	}
 
-	nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
+	nvroot = vdev_config_generate(spa, vd, getstats, 0);
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
 	nvlist_free(nvroot);
 
diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c
index 4c834e2d4e..282140b3bd 100644
--- a/usr/src/uts/common/fs/zfs/spa_errlog.c
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -54,36 +53,6 @@
 #include <sys/zap.h>
 #include <sys/zio.h>
 
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexidecimal numbers that don't overflow.
- */
-uint64_t
-strtonum(const char *str, char **nptr)
-{
-	uint64_t val = 0;
-	char c;
-	int digit;
-
-	while ((c = *str) != '\0') {
-		if (c >= '0' && c <= '9')
-			digit = c - '0';
-		else if (c >= 'a' && c <= 'f')
-			digit = 10 + c - 'a';
-		else
-			break;
-
-		val *= 16;
-		val += digit;
-
-		str++;
-	}
-
-	if (nptr)
-		*nptr = (char *)str;
-
-	return (val);
-}
 
 /*
  * Convert a bookmark to a string.
diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c
index 18d4836bc7..212abae5b8 100644
--- a/usr/src/uts/common/fs/zfs/spa_history.c
+++ b/usr/src/uts/common/fs/zfs/spa_history.c
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -33,6 +32,7 @@
 #include <sys/utsname.h>
 #include <sys/cmn_err.h>
 #include <sys/sunddi.h>
+#include "zfs_comutil.h"
 #ifdef _KERNEL
 #include <sys/zone.h>
 #endif
@@ -189,7 +189,7 @@ spa_history_zone()
  */
 /*ARGSUSED*/
 static void
-spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	spa_t		*spa = arg1;
 	history_arg_t	*hap = arg2;
@@ -244,6 +244,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	    hap->ha_log_type == LOG_CMD_NORMAL) {
 		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
 		    history_str) == 0);
+
+		zfs_dbgmsg("command: %s", history_str);
 	} else {
 		VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
 		    hap->ha_event) == 0);
@@ -251,6 +253,11 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		    tx->tx_txg) == 0);
 		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
 		    history_str) == 0);
+
+		zfs_dbgmsg("internal %s pool:%s txg:%llu %s",
+		    zfs_history_event_names[hap->ha_event], spa_name(spa),
+		    (longlong_t)tx->tx_txg, history_str);
+
 	}
 
 	VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
@@ -418,7 +425,7 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
 
 static void
 log_internal(history_internal_events_t event, spa_t *spa,
-    dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx)
+    dmu_tx_t *tx, const char *fmt, va_list adx)
 {
 	history_arg_t *ha;
 
@@ -441,7 +448,7 @@ log_internal(history_internal_events_t event, spa_t *spa,
 	ha->ha_uid = 0;
 
 	if (dmu_tx_is_syncing(tx)) {
-		spa_history_log_sync(spa, ha, cr, tx);
+		spa_history_log_sync(spa, ha, tx);
 	} else {
 		dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
 		    spa_history_log_sync, spa, ha, 0, tx);
@@ -450,8 +457,8 @@ log_internal(history_internal_events_t event, spa_t *spa,
 }
 
 void
-spa_history_internal_log(history_internal_events_t event, spa_t *spa,
-    dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+spa_history_log_internal(history_internal_events_t event, spa_t *spa,
+    dmu_tx_t *tx, const char *fmt, ...)
 {
 	dmu_tx_t *htx = tx;
 	va_list adx;
@@ -466,7 +473,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa,
 	}
 
 	va_start(adx, fmt);
-	log_internal(event, spa, htx, cr, fmt, adx);
+	log_internal(event, spa, htx, fmt, adx);
 	va_end(adx);
 
 	/* if we didn't get a tx from the caller, commit the one we made */
@@ -481,7 +488,7 @@ spa_history_log_version(spa_t *spa, history_internal_events_t event)
 	uint64_t current_vers = spa_version(spa);
 
 	if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
-		spa_history_internal_log(event, spa, NULL, CRED(),
+		spa_history_log_internal(event, spa, NULL,
 		    "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
 		    (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
 		    utsname.nodename, utsname.release, utsname.version,
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index c815cd6113..8faa84a1b0 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -40,6 +40,7 @@
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
@@ -888,10 +889,10 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
 
 	/*
-	 * If the config changed, notify the scrub thread that it must restart.
+	 * If the config changed, notify the scrub that it must restart.
+	 * This will initiate a resilver if needed.
 	 */
 	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
-		dsl_pool_scrub_restart(spa->spa_dsl_pool);
 		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
@@ -1078,7 +1079,6 @@ spa_rename(const char *name, const char *newname)
 	return (0);
 }
 
-
 /*
  * Determine whether a pool with given pool_guid exists.  If device_guid is
  * non-zero, determine whether the pool exists *and* contains a device with the
@@ -1209,6 +1209,37 @@ zfs_panic_recover(const char *fmt, ...)
 }
 
 /*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexidecimal numbers that don't overflow.
+ */
+uint64_t
+strtonum(const char *str, char **nptr)
+{
+	uint64_t val = 0;
+	char c;
+	int digit;
+
+	while ((c = *str) != '\0') {
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'a' && c <= 'f')
+			digit = 10 + c - 'a';
+		else
+			break;
+
+		val *= 16;
+		val += digit;
+
+		str++;
+	}
+
+	if (nptr)
+		*nptr = (char *)str;
+
+	return (val);
+}
+
+/*
  * ==========================================================================
  * Accessor functions
  * ==========================================================================
@@ -1390,6 +1421,12 @@ spa_max_replication(spa_t *spa)
 	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 }
 
+int
+spa_prev_software_version(spa_t *spa)
+{
+	return (spa->spa_prev_software_version);
+}
+
 uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
@@ -1584,3 +1621,45 @@ spa_dedup_checksum(spa_t *spa)
 {
 	return (spa->spa_dedup_checksum);
 }
+
+/*
+ * Reset pool scan stat per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+	/* data not stored on disk */
+	spa->spa_scan_pass_start = gethrestime_sec();
+	spa->spa_scan_pass_exam = 0;
+	vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+	dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+		return (ENOENT);
+	bzero(ps, sizeof (pool_scan_stat_t));
+
+	/* data stored on disk */
+	ps->pss_func = scn->scn_phys.scn_func;
+	ps->pss_start_time = scn->scn_phys.scn_start_time;
+	ps->pss_end_time = scn->scn_phys.scn_end_time;
+	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+	ps->pss_examined = scn->scn_phys.scn_examined;
+	ps->pss_to_process = scn->scn_phys.scn_to_process;
+	ps->pss_processed = scn->scn_phys.scn_processed;
+	ps->pss_errors = scn->scn_phys.scn_errors;
+	ps->pss_state = scn->scn_phys.scn_state;
+
+	/* data not stored on disk */
+	ps->pss_pass_start = spa->spa_scan_pass_start;
+	ps->pss_pass_exam = spa->spa_scan_pass_exam;
+
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index c528fac1a6..f0ad04ff49 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_ARC_H
@@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func;
 struct arc_buf {
 	arc_buf_hdr_t		*b_hdr;
 	arc_buf_t		*b_next;
-	krwlock_t		b_lock;
+	kmutex_t		b_evict_lock;
+	krwlock_t		b_data_lock;
 	void			*b_data;
 	arc_evict_func_t	*b_efunc;
 	void			*b_private;
@@ -92,6 +92,8 @@ void arc_buf_add_ref(arc_buf_t *buf, void *tag);
 int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
 void arc_release(arc_buf_t *buf, void *tag);
+int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+    zbookmark_t *zb);
 int arc_released(arc_buf_t *buf);
 int arc_has_callback(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 54686ba32d..4c05806e3e 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
@@ -278,6 +277,7 @@ void dbuf_evict(dmu_buf_impl_t *db);
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);
diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h
index 5eab6a2fb2..bd446acafa 100644
--- a/usr/src/uts/common/fs/zfs/sys/ddt.h
+++ b/usr/src/uts/common/fs/zfs/sys/ddt.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_DDT_H
@@ -232,6 +231,8 @@ extern int ddt_load(spa_t *spa);
 extern void ddt_unload(spa_t *spa);
 extern void ddt_sync(spa_t *spa, uint64_t txg);
 extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx);
 
 extern const ddt_ops_t ddt_zap_ops;
 
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 9c9369f8a7..7df5d48321 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -118,7 +118,7 @@ typedef enum dmu_object_type {
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
-	DMU_OT_SCRUB_QUEUE,		/* ZAP */
+	DMU_OT_SCAN_QUEUE,		/* ZAP */
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
@@ -128,6 +128,8 @@ typedef enum dmu_object_type {
 	DMU_OT_SA_MASTER_NODE,		/* ZAP */
 	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
 	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
+	DMU_OT_SCAN_XLATE,		/* ZAP */
+	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
 	DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
@@ -220,23 +222,8 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
-
-/* 4x8 zbookmark_t */
-#define	DMU_POOL_SCRUB_BOOKMARK		"scrub_bookmark"
-/* 4x8 ddt_bookmark_t */
-#define	DMU_POOL_SCRUB_DDT_BOOKMARK	"scrub_ddt_bookmark"
-/* 1x8 max_class */
-#define	DMU_POOL_SCRUB_DDT_CLASS_MAX	"scrub_ddt_class_max"
-/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
-#define	DMU_POOL_SCRUB_QUEUE		"scrub_queue"
-/* 1x8 txg */
-#define	DMU_POOL_SCRUB_MIN_TXG		"scrub_min_txg"
-/* 1x8 txg */
-#define	DMU_POOL_SCRUB_MAX_TXG		"scrub_max_txg"
-/* 1x4 enum scrub_func */
-#define	DMU_POOL_SCRUB_FUNC		"scrub_func"
-/* 1x8 count */
-#define	DMU_POOL_SCRUB_ERRORS		"scrub_errors"
+#define	DMU_POOL_CREATION_VERSION	"creation_version"
+#define	DMU_POOL_SCAN			"scan"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index ef1782b74d..5c5119a207 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -46,6 +46,9 @@ struct dmu_tx;
 #define	OBJSET_PHYS_SIZE 2048
 #define	OBJSET_OLD_PHYS_SIZE 1024
 
+#define	OBJSET_BUF_HAS_USERUSED(buf) \
+	(arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
 #define	OBJSET_FLAG_USERACCOUNTING_COMPLETE	(1ULL<<0)
 
 typedef struct objset_phys {
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
index 5b0821253d..844e7f1aeb 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_DMU_TRAVERSE_H
@@ -37,9 +36,11 @@ extern "C" {
 struct dnode_phys;
 struct dsl_dataset;
 struct zilog;
+struct arc_buf;
 
 typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+    struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp,
+    void *arg);
 
 #define	TRAVERSE_PRE			(1<<0)
 #define	TRAVERSE_POST			(1<<1)
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index 6eb7505ea5..a21971679c 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_DATASET_H
@@ -113,7 +112,6 @@ typedef struct dsl_dataset {
 
 	/* only used in syncing context, only valid for non-snapshots: */
 	struct dsl_dataset *ds_prev;
-	uint64_t ds_origin_txg;
 
 	/* has internal locking: */
 	bplist_t ds_deadlist;
@@ -235,8 +233,7 @@ int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t *ref_rsrv);
 int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
     uint64_t quota);
-void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr,
-    dmu_tx_t *tx);
+dsl_syncfunc_t dsl_dataset_set_quota_sync;
 int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
     uint64_t reservation);
 
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
index 14a64e019e..327cda6eec 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_DIR_H
@@ -90,6 +89,7 @@ struct dsl_dir {
 	kmutex_t dd_lock;
 	list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
 	timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+	uint64_t dd_origin_txg;
 
 	/* gross estimate of space used by in-flight tx's */
 	uint64_t dd_tempreserved[TXG_SIZE];
@@ -142,6 +142,7 @@ timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
 /* internal reserved dir name */
 #define	MOS_DIR_NAME "$MOS"
 #define	ORIGIN_DIR_NAME "$ORIGIN"
+#define	XLATION_DIR_NAME "$XLATION"
 
 #ifdef ZFS_DEBUG
 #define	dprintf_dd(dd, fmt, ...) do { \
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index 97541ad2f1..187040e700 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -32,6 +32,7 @@
 #include <sys/zio.h>
 #include <sys/dnode.h>
 #include <sys/ddt.h>
+#include <sys/arc.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -42,12 +43,7 @@ struct dsl_dir;
 struct dsl_dataset;
 struct dsl_pool;
 struct dmu_tx;
-
-enum scrub_func {
-	SCRUB_FUNC_NONE,
-	SCRUB_FUNC_CLEAN,
-	SCRUB_FUNC_NUMFUNCS
-};
+struct dsl_scan;
 
 /* These macros are for indexing into the zfs_all_blkstats_t. */
 #define	DMU_OT_DEFERRED	DMU_OT_NONE
@@ -87,25 +83,13 @@ typedef struct dsl_pool {
 	uint64_t dp_write_limit;
 	uint64_t dp_tmp_userrefs_obj;
 
+	struct dsl_scan *dp_scan;
+
 	/* Uses dp_lock */
 	kmutex_t dp_lock;
 	uint64_t dp_space_towrite[TXG_SIZE];
 	uint64_t dp_tempreserved[TXG_SIZE];
 
-	enum scrub_func dp_scrub_func;
-	uint64_t dp_scrub_queue_obj;
-	uint64_t dp_scrub_min_txg;
-	uint64_t dp_scrub_max_txg;
-	uint64_t dp_scrub_start_time;
-	uint64_t dp_scrub_ddt_class_max;
-	zbookmark_t dp_scrub_bookmark;
-	ddt_bookmark_t dp_scrub_ddt_bookmark;
-	boolean_t dp_scrub_pausing;
-	boolean_t dp_scrub_isresilver;
-	boolean_t dp_scrub_restart;
-	kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
-	zio_t *dp_scrub_prefetch_zio_root;
-
 	/* Has its own locking */
 	tx_state_t dp_tx;
 	txg_list_t dp_dirty_datasets;
@@ -138,20 +122,15 @@ void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
 void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
     const blkptr_t *bpp);
-void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
-    struct dmu_tx *tx);
+int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb);
+int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb);
 void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 
-int dsl_pool_scrub_cancel(dsl_pool_t *dp);
-int dsl_pool_scrub_clean(dsl_pool_t *dp);
-void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_scrub_restart(dsl_pool_t *dp);
-void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
-    const ddt_entry_t *dde);
-
 taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
 
 extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
@@ -159,6 +138,7 @@ extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
 extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
     const char *tag, dmu_tx_t *tx);
 extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
 
 #ifdef	__cplusplus
 }
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
index d8a8ab2d64..a636ad3509 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_PROP_H
@@ -91,7 +90,7 @@ int dsl_prop_set(const char *ddname, const char *propname,
     zprop_source_t source, int intsz, int numints, const void *buf);
 int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
 void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
-    cred_t *cr, dmu_tx_t *tx);
+    dmu_tx_t *tx);
 
 void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
     zprop_source_t source, uint64_t *value);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
new file mode 100644
index 0000000000..c0eaa49567
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_SCAN_H
+#define	_SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t, for byteswap
+ * purposes.
+ */
+typedef struct dsl_scan_phys {
+	uint64_t scn_func; /* pool_scan_func_t */
+	uint64_t scn_state; /* dsl_scan_state_t */
+	uint64_t scn_queue_obj;
+	uint64_t scn_min_txg;
+	uint64_t scn_max_txg;
+	uint64_t scn_cur_min_txg;
+	uint64_t scn_cur_max_txg;
+	uint64_t scn_start_time;
+	uint64_t scn_end_time;
+	uint64_t scn_to_examine; /* total bytes to be scanned */
+	uint64_t scn_examined; /* bytes scanned so far */
+	uint64_t scn_to_process;
+	uint64_t scn_processed;
+	uint64_t scn_errors;	/* scan I/O error count */
+	uint64_t scn_ddt_class_max;
+	ddt_bookmark_t scn_ddt_bookmark;
+	zbookmark_t scn_bookmark;
+	uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define	SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
+
+typedef enum dsl_scan_flags {
+	DSF_VISIT_DS_AGAIN = 1<<0,
+} dsl_scan_flags_t;
+
+typedef struct dsl_scan {
+	struct dsl_pool *scn_dp;
+
+	boolean_t scn_pausing;
+	uint64_t scn_restart_txg;
+	uint64_t scn_sync_start_time;
+	zio_t *scn_prefetch_zio_root;
+
+	/* for debugging / information */
+	uint64_t scn_visited_this_txg;
+
+	dsl_scan_phys_t scn_phys;
+} dsl_scan_t;
+
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+    struct dmu_tx *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h b/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h
index 4995bfe5ac..9126290cdb 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -19,15 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_SYNCTASK_H
 #define	_SYS_DSL_SYNCTASK_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/txg.h>
 #include <sys/zfs_context.h>
 
@@ -38,7 +35,7 @@ extern "C" {
 struct dsl_pool;
 
 typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
 
 typedef struct dsl_sync_task {
 	list_node_t dst_node;
@@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group {
 	txg_node_t dstg_node;
 	list_t dstg_tasks;
 	struct dsl_pool *dstg_pool;
-	cred_t *dstg_cr;
 	uint64_t dstg_txg;
 	int dstg_err;
 	int dstg_space;
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index 5ce6251ddb..583d6303bd 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
index d3fe7b1f89..bc3ade80f1 100644
--- a/usr/src/uts/common/fs/zfs/sys/refcount.h
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -19,15 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_REFCOUNT_H
 #define	_SYS_REFCOUNT_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/inttypes.h>
 #include <sys/list.h>
 #include <sys/zfs_context.h>
@@ -91,6 +88,11 @@ typedef struct refcount {
 	atomic_add_64_nv(&(rc)->rc_count, number)
 #define	refcount_remove_many(rc, number, holder) \
 	atomic_add_64_nv(&(rc)->rc_count, -number)
+#define	refcount_transfer(dst, src) { \
+	uint64_t __tmp = (src)->rc_count; \
+	atomic_add_64(&(src)->rc_count, -__tmp); \
+	atomic_add_64(&(dst)->rc_count, __tmp); \
+}
 
 #define	refcount_init()
 #define	refcount_fini()
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index a26a55cb42..4cc9244cd0 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_SPA_H
@@ -262,7 +261,7 @@ typedef struct blkptr {
 
 #define	BP_GET_UCSIZE(bp) \
 	((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
-	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define	BP_GET_NDVAS(bp)	\
 	(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
@@ -432,6 +431,8 @@ extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 
 #define	SPA_ASYNC_CONFIG_UPDATE	0x01
 #define	SPA_ASYNC_REMOVE	0x02
@@ -439,6 +440,14 @@ extern void spa_inject_delref(spa_t *spa);
 #define	SPA_ASYNC_RESILVER_DONE	0x08
 #define	SPA_ASYNC_RESILVER	0x10
 #define	SPA_ASYNC_AUTOEXPAND	0x20
+#define	SPA_ASYNC_REMOVE_DONE	0x40
+#define	SPA_ASYNC_REMOVE_STOP	0x80
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define	SPA_REMOVE_UNSPARE	0x01
+#define	SPA_REMOVE_DONE		0x02
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@@ -447,6 +456,7 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
@@ -465,14 +475,19 @@ extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
 
-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
-#define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
+/*
+ * DEFERRED_FREE must be large enough that regular blocks are not
+ * deferred.  XXX so can't we change it back to 1?
+ */
+#define	SYNC_PASS_DEFERRED_FREE	2	/* defer frees after this pass */
 #define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
 #define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
 
@@ -577,6 +592,7 @@ extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
 extern int spa_busy(void);
 extern uint8_t spa_get_failmode(spa_t *spa);
 extern boolean_t spa_suspended(spa_t *spa);
@@ -632,8 +648,8 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf,
     history_log_type_t what);
-extern void spa_history_internal_log(history_internal_events_t event,
-    spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_log_internal(history_internal_events_t event,
+    spa_t *spa, dmu_tx_t *tx, const char *fmt, ...);
 extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
 
 /* error handling */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 9daec092b4..f857f733d2 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -150,13 +149,14 @@ struct spa {
 	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
 	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
 	uint64_t	spa_scrub_maxinflight;	/* max in-flight scrub I/Os */
-	uint64_t	spa_scrub_errors;	/* scrub I/O error count */
 	kcondvar_t	spa_scrub_io_cv;	/* scrub I/O completion */
 	uint8_t		spa_scrub_active;	/* active or suspended? */
 	uint8_t		spa_scrub_type;		/* type of scrub we're doing */
 	uint8_t		spa_scrub_finished;	/* indicator to rotate logs */
 	uint8_t		spa_scrub_started;	/* started since last boot */
 	uint8_t		spa_scrub_reopen;	/* scrub doing vdev_reopen */
+	uint64_t	spa_scan_pass_start;	/* start time per pass/reboot */
+	uint64_t	spa_scan_pass_exam;	/* examined bytes per pass */
 	kmutex_t	spa_async_lock;		/* protect async state */
 	kthread_t	*spa_async_thread;	/* thread doing async task */
 	int		spa_async_suspended;	/* async tasks suspended */
@@ -212,7 +212,8 @@ struct spa {
 	uint64_t	spa_did;		/* if procp != p0, did of t1 */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
-
+	uint64_t	spa_creation_version;	/* version at pool creation */
+	uint64_t	spa_prev_software_version;
 	/*
 	 * spa_refcnt & spa_config_lock must be the last elements
 	 * because refcount_t changes size based on compilation options.
diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
index c135df9b10..6ab6aa3135 100644
--- a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_UBERBLOCK_IMPL_H
@@ -52,6 +51,9 @@ struct uberblock {
 	uint64_t	ub_guid_sum;	/* sum of all vdev guids	*/
 	uint64_t	ub_timestamp;	/* UTC time of last sync	*/
 	blkptr_t	ub_rootbp;	/* MOS objset_phys_t		*/
+
+	/* highest SPA_VERSION supported by software that wrote this txg */
+	uint64_t	ub_software_version;
 };
 
 #ifdef	__cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index b37516a984..941f234dc6 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_H
@@ -83,8 +82,7 @@ extern void vdev_split(vdev_t *vd);
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
 extern void vdev_clear_stats(vdev_t *vd);
 extern void vdev_stat_update(zio_t *zio, uint64_t psize);
-extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
-    boolean_t complete);
+extern void vdev_scan_stat_init(vdev_t *vd);
 extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
@@ -126,9 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
 extern void vdev_state_dirty(vdev_t *vd);
 extern void vdev_state_clean(vdev_t *vd);
 
+typedef enum vdev_config_flag {
+	VDEV_CONFIG_SPARE = 1 << 0,
+	VDEV_CONFIG_L2CACHE = 1 << 1,
+	VDEV_CONFIG_REMOVING = 1 << 2
+} vdev_config_flag_t;
+
 extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
 extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
-    boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
+    boolean_t getstats, vdev_config_flag_t flags);
 
 /*
  * Label routines
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index f78ec5084e..2b886bc588 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -151,7 +151,7 @@ struct vdev {
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
 	boolean_t	vdev_remove_wanted; /* async remove wanted?	*/
 	boolean_t	vdev_probe_wanted; /* async probe wanted?	*/
-	boolean_t	vdev_removing;	/* device is being removed?	*/
+	uint64_t	vdev_removing;	/* device is being removed?	*/
 	list_node_t	vdev_config_dirty_node; /* config dirty list	*/
 	list_node_t	vdev_state_dirty_node; /* state dirty list	*/
 	uint64_t	vdev_deflate_ratio; /* deflation ratio (x512)	*/
diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h
index 3b9de2a2f9..b44fb8fbba 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_H
@@ -259,7 +258,6 @@ int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
  */
 int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
 
-
 /*
  * Returns (in name) the name of the entry whose (value & mask)
  * (za_first_integer) is value, or ENOENT if not found.  The string
@@ -276,6 +274,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj,
  */
 int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
 
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    dmu_tx_t *tx);
+
 /*
  * Manipulate entries where the name + value are the "same" (the name is
  * a stringified version of the value).
@@ -286,6 +292,21 @@ int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
 int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
     dmu_tx_t *tx);
 
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t *valuep);
+
+/*
+ * They name is a stringified version of key; increment its value by
+ * delta.  Zero values will be zap_remove()-ed.
+ */
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx);
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx);
+
 struct zap;
 struct zap_leaf;
 typedef struct zap_cursor {
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
index 5aa0efc98d..739a380b75 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_IMPL_H
@@ -67,9 +66,12 @@ typedef struct mzap_ent {
 	avl_node_t mze_node;
 	int mze_chunkid;
 	uint64_t mze_hash;
-	mzap_ent_phys_t mze_phys;
+	uint32_t mze_cd; /* copy from mze_phys->mze_cd */
 } mzap_ent_t;
 
+#define	MZE_PHYS(zap, mze) \
+	(&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
+
 /*
  * The (fat) zap is stored in one object. It is an array of
  * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
index 173b6b195e..3a33636741 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
@@ -19,13 +19,14 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_LEAF_H
 #define	_SYS_ZAP_LEAF_H
 
+#include <sys/zap.h>
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
index 450ac1c81b..50ecf9b362 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
@@ -19,15 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_DEBUG_H
 #define	_SYS_ZFS_DEBUG_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func,
 
 extern void zfs_panic_recover(const char *fmt, ...);
 
+typedef struct zfs_dbgmsg {
+	list_node_t zdm_node;
+	time_t zdm_timestamp;
+	char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+extern void zfs_dbgmsg_init(void);
+extern void zfs_dbgmsg_fini(void);
+extern void zfs_dbgmsg(const char *fmt, ...);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index f5884f2925..991b9f2f59 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -28,6 +27,7 @@
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
 #include <sys/callb.h>
 
 /*
@@ -136,7 +136,7 @@ txg_sync_start(dsl_pool_t *dp)
 	 * 32-bit x86.  This is due in part to nested pools and
 	 * scrub_visitbp() recursion.
 	 */
-	tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread,
+	tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
 	    dp, 0, &p0, TS_RUN, minclsyspri);
 
 	mutex_exit(&tx->tx_sync_lock);
@@ -366,12 +366,12 @@ txg_sync_thread(dsl_pool_t *dp)
 		uint64_t txg;
 
 		/*
-		 * We sync when we're scrubbing, there's someone waiting
+		 * We sync when we're scanning, there's someone waiting
 		 * on us, or the quiesce thread has handed off a txg to
 		 * us, or we have reached our timeout.
 		 */
 		timer = (delta >= timeout ? 0 : timeout - delta);
-		while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
+		while ((dp->dp_scan->scn_phys.scn_state != DSS_SCANNING ||
 		    spa_load_state(spa) != SPA_LOAD_NONE ||
 		    spa_shutting_down(spa)) &&
 		    !tx->tx_exiting && timer > 0 &&
diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c
index 34d7e0c3ac..692cda137f 100644
--- a/usr/src/uts/common/fs/zfs/uberblock.c
+++ b/usr/src/uts/common/fs/zfs/uberblock.c
@@ -19,12 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_impl.h>
@@ -58,6 +55,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
 	ub->ub_txg = txg;
 	ub->ub_guid_sum = rvd->vdev_guid_sum;
 	ub->ub_timestamp = gethrestime_sec();
+	ub->ub_software_version = SPA_VERSION;
 
 	return (ub->ub_rootbp.blk_birth == txg);
 }
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 2b77c45574..a61f29b8e7 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -39,6 +39,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
+#include <sys/dsl_scan.h>
 
 /*
  * Virtual device management.
@@ -486,6 +487,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+		    &vd->vdev_removing);
 	}
 
 	if (parent && !parent->vdev_parent) {
@@ -860,7 +863,12 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
-	if (oldc == 0)
+	/*
+	 * If the vdev is being removed we don't activate
+	 * the metaslabs since we want to ensure that no new
+	 * allocations are performed on this device.
+	 */
+	if (oldc == 0 && !vd->vdev_removing)
 		metaslab_group_activate(vd->vdev_mg);
 
 	if (txg == 0)
@@ -1648,9 +1656,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
+		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (scrub_txg != 0 &&
-		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
+		    (spa->spa_scrub_started ||
+		    (scn && scn->scn_phys.scn_errors == 0))) {
 			/*
 			 * We completed a scrub up to scrub_txg.  If we
 			 * did it without rebooting, then the scrub dtl
@@ -2029,7 +2040,10 @@ vdev_sync(vdev_t *vd, uint64_t txg)
 		dmu_tx_commit(tx);
 	}
 
-	if (vd->vdev_removing)
+	/*
+	 * Remove the metadata associated with this vdev once it's empty.
+	 */
+	if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove(vd, txg);
 
 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
@@ -2403,7 +2417,7 @@ vdev_allocatable(vdev_t *vd)
 	 * we're asking two separate questions about it.
 	 */
 	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
-	    !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing);
+	    !vd->vdev_cant_write && !vd->vdev_ishole);
 }
 
 boolean_t
@@ -2433,7 +2447,6 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 
 	mutex_enter(&vd->vdev_stat_lock);
 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
-	vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 	vs->vs_state = vd->vdev_state;
 	vs->vs_rsize = vdev_get_min_asize(vd);
@@ -2455,7 +2468,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 				vs->vs_ops[t] += cvs->vs_ops[t];
 				vs->vs_bytes[t] += cvs->vs_bytes[t];
 			}
-			vs->vs_scrub_examined += cvs->vs_scrub_examined;
+			cvs->vs_scan_removing = cvd->vdev_removing;
 			mutex_exit(&vd->vdev_stat_lock);
 		}
 	}
@@ -2472,6 +2485,19 @@ vdev_clear_stats(vdev_t *vd)
 }
 
 void
+vdev_scan_stat_init(vdev_t *vd)
+{
+	vdev_stat_t *vs = &vd->vdev_stat;
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_scan_stat_init(vd->vdev_child[c]);
+
+	mutex_enter(&vd->vdev_stat_lock);
+	vs->vs_scan_processed = 0;
+	mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
@@ -2515,8 +2541,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
-			if (flags & ZIO_FLAG_SCRUB_THREAD)
-				vs->vs_scrub_repaired += psize;
+			if (flags & ZIO_FLAG_SCRUB_THREAD) {
+				dsl_scan_phys_t *scn_phys =
+				    &spa->spa_dsl_pool->dp_scan->scn_phys;
+				uint64_t *processed = &scn_phys->scn_processed;
+
+				/* XXX cleanup? */
+				if (vd->vdev_ops->vdev_op_leaf)
+					atomic_add_64(processed, psize);
+				vs->vs_scan_processed += psize;
+			}
+
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
@@ -2602,35 +2637,6 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 	}
 }
 
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
-	vdev_stat_t *vs = &vd->vdev_stat;
-
-	for (int c = 0; c < vd->vdev_children; c++)
-		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
-	mutex_enter(&vd->vdev_stat_lock);
-
-	if (type == POOL_SCRUB_NONE) {
-		/*
-		 * Update completion and end time.  Leave everything else alone
-		 * so we can report what happened during the previous scrub.
-		 */
-		vs->vs_scrub_complete = complete;
-		vs->vs_scrub_end = gethrestime_sec();
-	} else {
-		vs->vs_scrub_type = type;
-		vs->vs_scrub_complete = 0;
-		vs->vs_scrub_examined = 0;
-		vs->vs_scrub_repaired = 0;
-		vs->vs_scrub_start = gethrestime_sec();
-		vs->vs_scrub_end = 0;
-	}
-
-	mutex_exit(&vd->vdev_stat_lock);
-}
-
 /*
  * Update the in-core space usage stats for this vdev, its metaslab class,
  * and the root vdev.
@@ -2730,7 +2736,7 @@ vdev_config_dirty(vdev_t *vd)
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[c]);
-		aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
+		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
 		return;
 	}
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index d11b3df7c6..75ec545345 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -141,6 +140,7 @@
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/zio.h>
+#include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 
 /*
@@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
  */
 nvlist_t *
 vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
-    boolean_t isspare, boolean_t isl2cache)
+    vdev_config_flag_t flags)
 {
 	nvlist_t *nv = NULL;
 
@@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 
 	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
 	    vd->vdev_ops->vdev_op_type) == 0);
-	if (!isspare && !isl2cache)
+	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
 		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
 		    == 0);
 	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
@@ -270,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 	if (vd->vdev_isspare)
 		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
 
-	if (!isspare && !isl2cache && vd == vd->vdev_top) {
+	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+	    vd == vd->vdev_top) {
 		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vd->vdev_ms_array) == 0);
 		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
@@ -281,6 +282,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 		    vd->vdev_asize) == 0);
 		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
 		    vd->vdev_islog) == 0);
+		if (vd->vdev_removing)
+			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+			    vd->vdev_removing) == 0);
 	}
 
 	if (vd->vdev_dtl_smo.smo_object != 0)
@@ -293,28 +297,52 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 
 	if (getstats) {
 		vdev_stat_t vs;
+		pool_scan_stat_t ps;
+
 		vdev_get_stats(vd, &vs);
-		VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+		VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+
+		/* provide either current or previous scan information */
+		if (spa_scan_get_stats(spa, &ps) == 0) {
+			VERIFY(nvlist_add_uint64_array(nv,
+			    ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+			    sizeof (pool_scan_stat_t) / sizeof (uint64_t))
+			    == 0);
+		}
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		nvlist_t **child;
-		int c;
+		int c, idx;
 
 		ASSERT(!vd->vdev_ishole);
 
 		child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
 		    KM_SLEEP);
 
-		for (c = 0; c < vd->vdev_children; c++)
-			child[c] = vdev_config_generate(spa, vd->vdev_child[c],
-			    getstats, isspare, isl2cache);
+		for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+
+			/*
+			 * If we're generating an nvlist of removing
+			 * vdevs then skip over any device which is
+			 * not being removed.
+			 */
+			if ((flags & VDEV_CONFIG_REMOVING) &&
+			    !cvd->vdev_removing)
+				continue;
 
-		VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-		    child, vd->vdev_children) == 0);
+			child[idx++] = vdev_config_generate(spa, cvd,
+			    getstats, flags);
+		}
+
+		if (idx) {
+			VERIFY(nvlist_add_nvlist_array(nv,
+			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+		}
 
-		for (c = 0; c < vd->vdev_children; c++)
+		for (c = 0; c < idx; c++)
 			nvlist_free(child[c]);
 
 		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
@@ -375,12 +403,11 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t *array;
-	uint_t idx;
+	uint_t c, idx;
 
 	array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
 
-	idx = 0;
-	for (int c = 0; c < rvd->vdev_children; c++) {
+	for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (tvd->vdev_ishole)
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 30415c8abb..4b0f5602c1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1604,6 +1603,7 @@ vdev_raidz_io_start(zio_t *zio)
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
+
 /*
  * Report a checksum error for a child of a RAID-Z device.
  */
diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c
index 6be63ef6b0..df5bd46739 100644
--- a/usr/src/uts/common/fs/zfs/zap.c
+++ b/usr/src/uts/common/fs/zfs/zap.c
@@ -978,6 +978,56 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
 }
 
 int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    uint64_t value, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int err;
+
+	for (zap_cursor_init(&zc, os, fromobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    (void) zap_cursor_advance(&zc)) {
+		if (za.za_integer_length != 8 || za.za_num_integers != 1)
+			return (EINVAL);
+		err = zap_add(os, intoobj, za.za_name,
+		    8, 1, &value, tx);
+		if (err)
+			return (err);
+	}
+	zap_cursor_fini(&zc);
+	return (0);
+}
+
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int err;
+
+	for (zap_cursor_init(&zc, os, fromobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    (void) zap_cursor_advance(&zc)) {
+		uint64_t delta = 0;
+
+		if (za.za_integer_length != 8 || za.za_num_integers != 1)
+			return (EINVAL);
+
+		err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
+		if (err != 0 && err != ENOENT)
+			return (err);
+		delta += za.za_first_integer;
+		err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
+		if (err)
+			return (err);
+	}
+	zap_cursor_fini(&zc);
+	return (0);
+}
+
+int
 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
 	char name[20];
@@ -1005,17 +1055,34 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
 }
 
 int
-zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
-    dmu_tx_t *tx)
+zap_add_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 {
 	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx)
+{
 	uint64_t value = 0;
 	int err;
 
 	if (delta == 0)
 		return (0);
 
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 	err = zap_lookup(os, obj, name, 8, 1, &value);
 	if (err != 0 && err != ENOENT)
 		return (err);
@@ -1027,6 +1094,15 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
 	return (err);
 }
 
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_increment(os, obj, name, delta, tx));
+}
 
 /*
  * Routines for iterating over the attributes.
@@ -1101,7 +1177,6 @@ again:
 	return (err);
 }
 
-
 static void
 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
 {
diff --git a/usr/src/uts/common/fs/zfs/zap_leaf.c b/usr/src/uts/common/fs/zfs/zap_leaf.c
index 285d9c5674..19a795db82 100644
--- a/usr/src/uts/common/fs/zfs/zap_leaf.c
+++ b/usr/src/uts/common/fs/zfs/zap_leaf.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -37,6 +36,7 @@
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
+#include <sys/arc.h>
 
 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
 
@@ -538,14 +538,6 @@ zap_entry_update(zap_entry_handle_t *zeh,
 	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
 		return (EAGAIN);
 
-	/*
-	 * We should search other chained leaves (via
-	 * zap_entry_remove,create?) otherwise returning EAGAIN will
-	 * just send us into an infinite loop if we have to chain
-	 * another leaf block, rather than being able to split this
-	 * block.
-	 */
-
 	zap_leaf_array_free(l, &le->le_value_chunk);
 	le->le_value_chunk =
 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
index 2de5812fa2..3fc92ff122 100644
--- a/usr/src/uts/common/fs/zfs/zap_micro.c
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -31,6 +31,7 @@
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
+#include <sys/arc.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
@@ -254,26 +255,26 @@ mze_compare(const void *arg1, const void *arg2)
 		return (+1);
 	if (mze1->mze_hash < mze2->mze_hash)
 		return (-1);
-	if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
+	if (mze1->mze_cd > mze2->mze_cd)
 		return (+1);
-	if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
+	if (mze1->mze_cd < mze2->mze_cd)
 		return (-1);
 	return (0);
 }
 
 static void
-mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
 {
 	mzap_ent_t *mze;
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT(mzep->mze_cd < zap_maxcd(zap));
 
 	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
 	mze->mze_chunkid = chunkid;
 	mze->mze_hash = hash;
-	mze->mze_phys = *mzep;
+	mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+	ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
 	avl_add(&zap->zap_m.zap_avl, mze);
 }
 
@@ -289,14 +290,15 @@ mze_find(zap_name_t *zn)
 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
 
 	mze_tofind.mze_hash = zn->zn_hash;
-	mze_tofind.mze_phys.mze_cd = 0;
+	mze_tofind.mze_cd = 0;
 
 again:
 	mze = avl_find(avl, &mze_tofind, &idx);
 	if (mze == NULL)
 		mze = avl_nearest(avl, idx, AVL_AFTER);
 	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
-		if (zap_match(zn, mze->mze_phys.mze_name))
+		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
 			return (mze);
 	}
 	if (zn->zn_matchtype == MT_BEST) {
@@ -319,12 +321,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	mze_tofind.mze_hash = hash;
-	mze_tofind.mze_phys.mze_cd = 0;
+	mze_tofind.mze_cd = 0;
 
 	cd = 0;
 	for (mze = avl_find(avl, &mze_tofind, &idx);
 	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
-		if (mze->mze_phys.mze_cd != cd)
+		if (mze->mze_cd != cd)
 			break;
 		cd++;
 	}
@@ -408,7 +410,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
 				zap->zap_m.zap_num_entries++;
 				zn = zap_name_alloc(zap, mze->mze_name,
 				    MT_EXACT);
-				mze_insert(zap, i, zn->zn_hash, mze);
+				mze_insert(zap, i, zn->zn_hash);
 				zap_name_free(zn);
 			}
 		}
@@ -727,11 +729,11 @@ again:
 	    other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
 
 		if (zn == NULL) {
-			zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
+			zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
 			    MT_FIRST);
 			allocdzn = B_TRUE;
 		}
-		if (zap_match(zn, other->mze_phys.mze_name)) {
+		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
 			if (allocdzn)
 				zap_name_free(zn);
 			return (B_TRUE);
@@ -793,9 +795,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
 			} else if (integer_size != 8) {
 				err = EINVAL;
 			} else {
-				*(uint64_t *)buf = mze->mze_phys.mze_value;
+				*(uint64_t *)buf =
+				    MZE_PHYS(zap, mze)->mze_value;
 				(void) strlcpy(realname,
-				    mze->mze_phys.mze_name, rn_len);
+				    MZE_PHYS(zap, mze)->mze_name, rn_len);
 				if (ncp) {
 					*ncp = mzap_normalization_conflict(zap,
 					    zn, mze);
@@ -932,7 +935,7 @@ again:
 			if (zap->zap_m.zap_alloc_next ==
 			    zap->zap_m.zap_num_chunks)
 				zap->zap_m.zap_alloc_next = 0;
-			mze_insert(zap, i, zn->zn_hash, mze);
+			mze_insert(zap, i, zn->zn_hash);
 			return;
 		}
 	}
@@ -1017,10 +1020,20 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 {
 	zap_t *zap;
 	mzap_ent_t *mze;
+	uint64_t oldval;
 	const uint64_t *intval = val;
 	zap_name_t *zn;
 	int err;
 
+#ifdef ZFS_DEBUG
+	/*
+	 * If there is an old value, it shouldn't change across the
+	 * lockdir (eg, due to bprewrite's xlation).
+	 */
+	if (integer_size == 8 && num_integers == 1)
+		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
+#endif
+
 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
 	if (err)
 		return (err);
@@ -1044,9 +1057,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
 	} else {
 		mze = mze_find(zn);
 		if (mze != NULL) {
-			mze->mze_phys.mze_value = *intval;
-			zap->zap_m.zap_phys->mz_chunk
-			    [mze->mze_chunkid].mze_value = *intval;
+			ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
+			MZE_PHYS(zap, mze)->mze_value = *intval;
 		} else {
 			mzap_addent(zn, *intval);
 		}
@@ -1245,7 +1257,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 		err = ENOENT;
 
 		mze_tofind.mze_hash = zc->zc_hash;
-		mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+		mze_tofind.mze_cd = zc->zc_cd;
 
 		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
 		if (mze == NULL) {
@@ -1253,18 +1265,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 			    idx, AVL_AFTER);
 		}
 		if (mze) {
-			ASSERT(0 == bcmp(&mze->mze_phys,
-			    &zc->zc_zap->zap_m.zap_phys->mz_chunk
-			    [mze->mze_chunkid], sizeof (mze->mze_phys)));
-
+			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
 			za->za_normalization_conflict =
 			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
 			za->za_integer_length = 8;
 			za->za_num_integers = 1;
-			za->za_first_integer = mze->mze_phys.mze_value;
-			(void) strcpy(za->za_name, mze->mze_phys.mze_name);
+			za->za_first_integer = mzep->mze_value;
+			(void) strcpy(za->za_name, mzep->mze_name);
 			zc->zc_hash = mze->mze_hash;
-			zc->zc_cd = mze->mze_phys.mze_cd;
+			zc->zc_cd = mze->mze_cd;
 			err = 0;
 		} else {
 			zc->zc_hash = -1ULL;
@@ -1313,7 +1323,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
 			goto out;
 		}
 		zc->zc_hash = mze->mze_hash;
-		zc->zc_cd = mze->mze_phys.mze_cd;
+		zc->zc_cd = mze->mze_cd;
 	}
 
 out:
diff --git a/usr/src/uts/common/fs/zfs/zfs_debug.c b/usr/src/uts/common/fs/zfs/zfs_debug.c
new file mode 100644
index 0000000000..d0f411a993
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_debug.c
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+
+void
+zfs_dbgmsg_init(void)
+{
+	list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+	    offsetof(zfs_dbgmsg_t, zdm_node));
+	mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+	zfs_dbgmsg_t *zdm;
+
+	while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) {
+		int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+		kmem_free(zdm, size);
+		zfs_dbgmsg_size -= size;
+	}
+	mutex_destroy(&zfs_dbgmsgs_lock);
+	ASSERT3U(zfs_dbgmsg_size, ==, 0);
+}
+
+/*
+ * Print these messages by running:
+ * 	echo ::zfs_dbgmsg | mdb -k
+ *
+ * Monitor these messages by running:
+ * 	dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ */
+void
+zfs_dbgmsg(const char *fmt, ...)
+{
+	int size;
+	va_list adx;
+	zfs_dbgmsg_t *zdm;
+
+	va_start(adx, fmt);
+	size = vsnprintf(NULL, 0, fmt, adx);
+	va_end(adx);
+
+	/*
+	 * There is one byte of string in sizeof (zfs_dbgmsg_t), used
+	 * for the terminating null.
+	 */
+	zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
+	zdm->zdm_timestamp = gethrestime_sec();
+
+	va_start(adx, fmt);
+	(void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
+	va_end(adx);
+
+	DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
+
+	mutex_enter(&zfs_dbgmsgs_lock);
+	list_insert_tail(&zfs_dbgmsgs, zdm);
+	zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
+	while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
+		zdm = list_remove_head(&zfs_dbgmsgs);
+		size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+		kmem_free(zdm, size);
+		zfs_dbgmsg_size -= size;
+	}
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index d280125152..de5fb1e4ce 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -62,6 +61,7 @@
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_dir.h>
 #include <sys/zvol.h>
+#include <sys/dsl_scan.h>
 #include <sharefs/share.h>
 #include <sys/dmu_objset.h>
 
@@ -117,7 +117,7 @@ void
 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 {
 	const char *newfile;
-	char buf[256];
+	char buf[512];
 	va_list adx;
 
 	/*
@@ -1237,8 +1237,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * zc_name              name of the pool
+ * zc_cookie            scan func (pool_scan_func_t)
+ */
 static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
@@ -1246,7 +1251,10 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc)
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
-	error = spa_scrub(spa, zc->zc_cookie);
+	if (zc->zc_cookie == POOL_SCAN_NONE)
+		error = spa_scan_stop(spa);
+	else
+		error = spa_scan(spa, zc->zc_cookie);
 
 	spa_close(spa, FTAG);
 
@@ -1402,6 +1410,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * zc_name		name of the pool
+ * zc_nvlist_conf	nvlist of devices to remove
+ * zc_cookie		to stop the remove?
+ */
 static int
 zfs_ioc_vdev_remove(zfs_cmd_t *zc)
 {
@@ -4250,7 +4264,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
 	    B_FALSE },
 	{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
 	    B_FALSE },
-	{ zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+	{ zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE,
 	    B_TRUE },
 	{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
 	    B_FALSE },
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index da9163c963..f68dde85f8 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -560,34 +560,6 @@ unregister:
 
 }
 
-static void
-uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
-    int64_t delta, dmu_tx_t *tx)
-{
-	uint64_t used = 0;
-	char buf[32];
-	int err;
-	uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
-
-	if (delta == 0)
-		return;
-
-	(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
-	err = zap_lookup(os, obj, buf, 8, 1, &used);
-
-	ASSERT(err == 0 || err == ENOENT);
-	/* no underflow/overflow */
-	ASSERT(delta > 0 || used >= -delta);
-	ASSERT(delta < 0 || used + delta > used);
-	used += delta;
-	if (used == 0)
-		err = zap_remove(os, obj, buf, tx);
-	else
-		err = zap_update(os, obj, buf, 8, 1, &used, tx);
-	ASSERT(err == 0);
-
-}
-
 static int
 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
     uint64_t *userp, uint64_t *groupp)
@@ -2239,9 +2211,8 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 		sa_register_update_callback(os, zfs_sa_upgrade);
 	}
 
-	spa_history_internal_log(LOG_DS_UPGRADE,
-	    dmu_objset_spa(os), tx, CRED(),
-	    "oldver=%llu newver=%llu dataset = %llu",
+	spa_history_log_internal(LOG_DS_UPGRADE,
+	    dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
 	    zfsvfs->z_version, newvers, dmu_objset_id(os));
 
 	dmu_tx_commit(tx);
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index e4c435341f..4aa4d10b07 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -36,6 +36,7 @@
 #include <sys/dsl_dataset.h>
 #include <sys/vdev.h>
 #include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
 
 /*
  * The zfs intent log (ZIL) saves transaction records of system calls
@@ -179,7 +180,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
-	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+	error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 4e481b16b7..181d07fbdb 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -661,6 +660,9 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 {
 	zio_t *zio;
 
+	dprintf_bp(bp, "freeing in txg %llu, pass %u",
+	    (longlong_t)txg, spa->spa_sync_pass);
+
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
 	ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
@@ -2073,6 +2075,8 @@ zio_ddt_write(zio_t *zio)
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
+ddt_entry_t *freedde; /* for debugging */
+
 static int
 zio_ddt_free(zio_t *zio)
 {
@@ -2086,7 +2090,7 @@ zio_ddt_free(zio_t *zio)
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
-	dde = ddt_lookup(ddt, bp, B_TRUE);
+	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
 	ddp = ddt_phys_select(dde, bp);
 	ddt_phys_decref(ddp);
 	ddt_exit(ddt);
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 3481fb0586..7b4577a9f3 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -249,7 +249,7 @@ struct maparg {
 
 /*ARGSUSED*/
 static int
-zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct maparg *ma = arg;