Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/ztest/ztest.c | 43
-rw-r--r-- | usr/src/uts/common/fs/zfs/arc.c | 102
-rw-r--r-- | usr/src/uts/common/fs/zfs/dbuf.c | 38
-rw-r--r-- | usr/src/uts/common/fs/zfs/dsl_pool.c | 2
-rw-r--r-- | usr/src/uts/common/fs/zfs/metaslab.c | 39
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c | 12
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/arc.h | 3
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h | 1
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/txg.h | 3
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 7
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zio.h | 45
-rw-r--r-- | usr/src/uts/common/fs/zfs/txg.c | 20
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_queue.c | 16
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_debug.c | 7
-rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 30
-rw-r--r-- | usr/src/uts/common/klm/klmops.c | 4
-rw-r--r-- | usr/src/uts/common/klm/nlm_service.c | 23
17 files changed, 250 insertions, 145 deletions
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index c12eb098a3..38dfaaba36 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -808,7 +808,7 @@ static uint64_t ztest_get_ashift(void) { if (ztest_opts.zo_ashift == 0) - return (SPA_MINBLOCKSHIFT + ztest_random(3)); + return (SPA_MINBLOCKSHIFT + ztest_random(5)); return (ztest_opts.zo_ashift); } @@ -967,11 +967,28 @@ ztest_random_spa_version(uint64_t initial_version) return (version); } +/* + * Find the largest ashift used + */ +static uint64_t +ztest_spa_get_ashift() { + uint64_t i; + uint64_t ashift = SPA_MINBLOCKSHIFT; + vdev_t *rvd = ztest_spa->spa_root_vdev; + + for (i = 0; i < rvd->vdev_children; i++) { + ashift = MAX(ashift, rvd->vdev_child[i]->vdev_ashift); + } + return (ashift); +} + static int ztest_random_blocksize(void) { - return (1 << (SPA_MINBLOCKSHIFT + - ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); + // Choose a block size >= the ashift. + uint64_t block_shift = + ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1); + return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int @@ -5765,16 +5782,30 @@ ztest_freeze(void) spa_freeze(spa); /* + * Because it is hard to predict how much space a write will actually + * require beforehand, we leave ourselves some fudge space to write over + * capacity. + */ + uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; + + /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. + * + * Run a random number of times less than zo_maxloops and ensure we do + * not run out of space on the pool. */ while (ztest_random(10) != 0 && - numloops++ < ztest_opts.zo_maxloops) { - ztest_dmu_write_parallel(zd, 0); - ztest_dmu_object_alloc_free(zd, 0); + numloops++ < ztest_opts.zo_maxloops && + metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { + ztest_od_t od; + ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); + ztest_io(zd, od.od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 168355ddda..911cde9717 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -105,7 +105,7 @@ * with the buffer may be evicted prior to the callback. The callback * must be made with *no locks held* (to prevent deadlock). Additionally, * the users of callbacks must ensure that their private data is - * protected from simultaneous callbacks from arc_buf_evict() + * protected from simultaneous callbacks from arc_clear_callback() * and arc_do_user_evicts(). * * Note that the majority of the performance stats are manipulated @@ -1498,8 +1498,12 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) } } +/* + * Free up buf->b_data and if 'remove' is set, then pull the + * arc_buf_t off of the the arc_buf_hdr_t's list and free it. 
+ */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) +arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) { arc_buf_t **bufp; @@ -1550,7 +1554,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) } /* only remove the buf if requested */ - if (!all) + if (!remove) return; /* remove the buf from the hdr list */ @@ -2124,7 +2128,7 @@ arc_do_user_evicts(void) mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) - VERIFY(buf->b_efunc(buf) == 0); + VERIFY0(buf->b_efunc(buf->b_private)); buf->b_efunc = NULL; buf->b_private = NULL; @@ -3263,16 +3267,25 @@ arc_freed(spa_t *spa, const blkptr_t *bp) } /* - * This is used by the DMU to let the ARC know that a buffer is - * being evicted, so the ARC should clean up. If this arc buf - * is not yet in the evicted state, it will be put there. + * Clear the user eviction callback set by arc_set_callback(), first calling + * it if it exists. Because the presence of a callback keeps an arc_buf cached + * clearing the callback may result in the arc_buf being destroyed. However, + * it will not result in the *last* arc_buf being destroyed, hence the data + * will remain cached in the ARC. We make a copy of the arc buffer here so + * that we can process the callback without holding any locks. + * + * It's possible that the callback is already in the process of being cleared + * by another thread. In this case we can not clear the callback. + * + * Returns B_TRUE if the callback was successfully called and cleared. */ -int -arc_buf_evict(arc_buf_t *buf) +boolean_t +arc_clear_callback(arc_buf_t *buf) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - arc_buf_t **bufp; + arc_evict_func_t *efunc = buf->b_efunc; + void *private = buf->b_private; mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; @@ -3282,17 +3295,16 @@ arc_buf_evict(arc_buf_t *buf) */ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); - return (0); + return (B_FALSE); } else if (buf->b_data == NULL) { - arc_buf_t copy = *buf; /* structure assignment */ /* * We are on the eviction list; process this buffer now * but let arc_do_user_evicts() do the reaping. */ buf->b_efunc = NULL; mutex_exit(&buf->b_evict_lock); - VERIFY(copy.b_efunc(©) == 0); - return (1); + VERIFY0(efunc(private)); + return (B_TRUE); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); @@ -3302,48 +3314,21 @@ arc_buf_evict(arc_buf_t *buf) ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); - /* - * Pull this buffer off of the hdr - */ - bufp = &hdr->b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; - - ASSERT(buf->b_data != NULL); - arc_buf_destroy(buf, FALSE, FALSE); - - if (hdr->b_datacnt == 0) { - arc_state_t *old_state = hdr->b_state; - arc_state_t *evicted_state; - - ASSERT(hdr->b_buf == NULL); - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - - evicted_state = - (old_state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; - - mutex_enter(&old_state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); - - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + buf->b_efunc = NULL; + buf->b_private = NULL; - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&old_state->arcs_mtx); + if (hdr->b_datacnt > 1) { + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy(buf, FALSE, TRUE); + } else { + ASSERT(buf == hdr->b_buf); + hdr->b_flags |= ARC_BUF_AVAILABLE; + mutex_exit(&buf->b_evict_lock); } - mutex_exit(hash_lock); - mutex_exit(&buf->b_evict_lock); - VERIFY(buf->b_efunc(buf) == 0); - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_hdr = NULL; - buf->b_next = NULL; - kmem_cache_free(buf_cache, buf); - return (1); + mutex_exit(hash_lock); + VERIFY0(efunc(private)); + return (B_TRUE); } /* @@ -3489,17 +3474,6 @@ arc_released(arc_buf_t *buf) return (released); } -int -arc_has_callback(arc_buf_t *buf) -{ - int callback; - - mutex_enter(&buf->b_evict_lock); - callback = (buf->b_efunc != NULL); - mutex_exit(&buf->b_evict_lock); - return (callback); -} - #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 3480c64ec0..83afd52df4 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -181,8 +181,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) } /* - * Remove an entry from the hash table. This operation will - * fail if there are any existing holds on the db. + * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) @@ -194,7 +193,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) dmu_buf_impl_t *dbf, **dbp; /* - * We musn't hold db_mtx to maintin lock ordering: + * We musn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); @@ -431,7 +430,6 @@ static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); db->db_buf = buf; if (buf != NULL) { ASSERT(buf->b_data != NULL); @@ -1555,12 +1553,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) * when we are not holding the dn_dbufs_mtx, we can't clear the * entry in the dn_dbufs list. We have to wait until dbuf_destroy() * in this case. For callers from the DMU we will usually see: - * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() + * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_buf_evict() + * DMU: dbuf_clear()->arc_clear_callback() * ARC: dbuf_do_evict()->dbuf_destroy() + * + * This routine will dissociate the dbuf from the arc, by calling + * arc_clear_callback(), but will not evict the data from the ARC. 
*/ void dbuf_clear(dmu_buf_impl_t *db) @@ -1568,7 +1569,7 @@ dbuf_clear(dmu_buf_impl_t *db) dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; - int dbuf_gone = FALSE; + boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); @@ -1614,7 +1615,7 @@ dbuf_clear(dmu_buf_impl_t *db) } if (db->db_buf) - dbuf_gone = arc_buf_evict(db->db_buf); + dbuf_gone = arc_clear_callback(db->db_buf); if (!dbuf_gone) mutex_exit(&db->db_mtx); @@ -1782,8 +1783,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, static int dbuf_do_evict(void *private) { - arc_buf_t *buf = private; - dmu_buf_impl_t *db = buf->b_private; + dmu_buf_impl_t *db = private; if (!MUTEX_HELD(&db->db_mtx)) mutex_enter(&db->db_mtx); @@ -2146,11 +2146,23 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * block on-disk. If so, then we simply evict * ourselves. */ - if (!DBUF_IS_CACHEABLE(db) || - arc_buf_eviction_needed(db->db_buf)) + if (!DBUF_IS_CACHEABLE(db)) { + if (db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + spa_t *spa = + dmu_objset_spa(db->db_objset); + blkptr_t bp = *db->db_blkptr; + dbuf_clear(db); + arc_freed(spa, &bp); + } else { + dbuf_clear(db); + } + } else if (arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); - else + } else { mutex_exit(&db->db_mtx); + } } } else { mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 5aef3977b4..27e3c959a4 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 2e2c972387..6e9ae37ebf 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -67,6 +67,21 @@ int zfs_condense_pct = 200; int zfs_condense_never = 0; /* + * Condensing a metaslab is not guaranteed to actually reduce the amount of + * space used on disk. In particular, a space map uses data in increments of + * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the + * same number of blocks after condensing. Since the goal of condensing is to + * reduce the number of IOPs required to read the space map, we only want to + * condense when we can be sure we will reduce the number of blocks used by the + * space map. Unfortunately, we cannot precisely compute whether or not this is + * the case in metaslab_should_condense since we are holding ms_lock. Instead, + * we apply the following heuristic: do not condense a spacemap unless the + * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold + * blocks. + */ +int zfs_metaslab_condense_block_threshold = 4; + +/* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * a free space. Metaslab groups that have more free space than @@ -1284,6 +1299,8 @@ metaslab_group_preload(metaslab_group_t *mg) * times the size than the free space range tree representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). * + * 3. The on-disk size of the space map should actually decrease. 
+ * * Checking the first condition is tricky since we don't want to walk * the entire AVL tree calculating the estimated on-disk size. Instead we * use the size-ordered range tree in the metaslab and calculate the @@ -1294,13 +1311,21 @@ metaslab_group_preload(metaslab_group_t *mg) * To determine the second criterion we use a best-case estimate and assume * each segment can be represented on-disk as a single 64-bit entry. We refer * to this best-case estimate as the space map's minimal form. + * + * Unfortunately, we cannot compute the on-disk size of the space map in this + * context because we cannot accurately compute the effects of compression, etc. + * Instead, we apply the heuristic described in the block comment for + * zfs_metaslab_condense_block_threshold - we only condense if the space used + * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; range_seg_t *rs; - uint64_t size, entries, segsz; + uint64_t size, entries, segsz, object_size, optimal_size, record_size; + dmu_object_info_t doi; + uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); @@ -1327,9 +1352,15 @@ metaslab_should_condense(metaslab_t *msp) entries = size / (MIN(size, SM_RUN_MAX)); segsz = entries * sizeof (uint64_t); - return (segsz <= space_map_length(msp->ms_sm) && - space_map_length(msp->ms_sm) >= (zfs_condense_pct * - sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100); + optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); + object_size = space_map_length(msp->ms_sm); + + dmu_object_info_from_db(sm->sm_dbuf, &doi); + record_size = MAX(doi.doi_data_block_size, vdev_blocksize); + + return (segsz <= object_size && + object_size >= (optimal_size * zfs_condense_pct / 100) && + object_size > zfs_metaslab_condense_block_threshold * record_size); } /* diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 14bd2a0176..0504362726 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -1876,6 +1876,16 @@ spa_writeable(spa_t *spa) return (!!(spa->spa_mode & FWRITE)); } +/* + * Returns true if there is a pending sync task in any of the current + * syncing txg, the current quiescing txg, or the current open txg. 
+ */ +boolean_t +spa_has_pending_synctask(spa_t *spa) +{ + return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks)); +} + int spa_mode(spa_t *spa) { diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 15ef0f58d0..04a20c13ee 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -95,7 +95,6 @@ boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); -int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); boolean_t arc_buf_eviction_needed(arc_buf_t *buf); @@ -114,7 +113,7 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); -int arc_buf_evict(arc_buf_t *buf); +boolean_t arc_clear_callback(arc_buf_t *buf); void arc_flush(spa_t *spa); void arc_tempreserve_clear(uint64_t reserve); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 36b6c93bde..6c7d34cb82 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -780,6 +780,7 @@ extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); +extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_mode(spa_t *spa); extern uint64_t strtonum(const char *str, char **nptr); diff --git a/usr/src/uts/common/fs/zfs/sys/txg.h b/usr/src/uts/common/fs/zfs/sys/txg.h index e96c1fa8aa..77e2dba251 100644 --- a/usr/src/uts/common/fs/zfs/sys/txg.h +++ b/usr/src/uts/common/fs/zfs/sys/txg.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_H @@ -112,6 +112,7 @@ extern boolean_t txg_sync_waiting(struct dsl_pool *dp); extern void txg_list_create(txg_list_t *tl, size_t offset); extern void txg_list_destroy(txg_list_t *tl); extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); +extern boolean_t txg_all_lists_empty(txg_list_t *tl); extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 9961a373ab..0326aa9c03 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -230,8 +230,11 @@ struct vdev { #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) +/* The largest uberblock we support is 8k. 
*/ +#define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ - MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) + MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ + MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 464ecc75fa..248b08501c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -165,19 +165,20 @@ enum zio_flag { ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_SCRUB = 1 << 4, ZIO_FLAG_SCAN_THREAD = 1 << 5, + ZIO_FLAG_PHYSICAL = 1 << 6, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ - ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ - ZIO_FLAG_SPECULATIVE = 1 << 7, - ZIO_FLAG_CONFIG_WRITER = 1 << 8, - ZIO_FLAG_DONT_RETRY = 1 << 9, - ZIO_FLAG_DONT_CACHE = 1 << 10, - ZIO_FLAG_NODATA = 1 << 11, - ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, + ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1 << 8, + ZIO_FLAG_CONFIG_WRITER = 1 << 9, + ZIO_FLAG_DONT_RETRY = 1 << 10, + ZIO_FLAG_DONT_CACHE = 1 << 11, + ZIO_FLAG_NODATA = 1 << 12, + ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -185,27 +186,27 @@ enum zio_flag { /* * Flags inherited by vdev children. */ - ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 14, - ZIO_FLAG_TRYHARD = 1 << 15, - ZIO_FLAG_OPTIONAL = 1 << 16, + ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 15, + ZIO_FLAG_TRYHARD = 1 << 16, + ZIO_FLAG_OPTIONAL = 1 << 17, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ - ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 18, - ZIO_FLAG_IO_BYPASS = 1 << 19, - ZIO_FLAG_IO_REWRITE = 1 << 20, - ZIO_FLAG_RAW = 1 << 21, - ZIO_FLAG_GANG_CHILD = 1 << 22, - ZIO_FLAG_DDT_CHILD = 1 << 23, - ZIO_FLAG_GODFATHER = 1 << 24, - ZIO_FLAG_NOPWRITE = 1 << 25, - ZIO_FLAG_REEXECUTED = 1 << 26, - ZIO_FLAG_DELEGATED = 1 << 27, + ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 19, + ZIO_FLAG_IO_BYPASS = 1 << 20, + ZIO_FLAG_IO_REWRITE = 1 << 21, + ZIO_FLAG_RAW = 1 << 22, + ZIO_FLAG_GANG_CHILD = 1 << 23, + ZIO_FLAG_DDT_CHILD = 1 << 24, + ZIO_FLAG_GODFATHER = 1 << 25, + ZIO_FLAG_NOPWRITE = 1 << 26, + ZIO_FLAG_REEXECUTED = 1 << 27, + ZIO_FLAG_DELEGATED = 1 << 28, }; #define ZIO_FLAG_MUSTSUCCEED 0 diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 5a7b3e85ee..915c9bb4b2 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -721,6 +721,24 @@ txg_list_empty(txg_list_t *tl, uint64_t txg) } /* + * Returns true if all txg lists are empty. + * + * Warning: this is inherently racy (an item could be added immediately after this + * function returns). We don't bother with the lock because it wouldn't change the + * semantics. 
+ */ +boolean_t +txg_all_lists_empty(txg_list_t *tl) +{ + for (int i = 0; i < TXG_SIZE; i++) { + if (!txg_list_empty(tl, i)) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +/* * Add an entry to the list (unless it's already on the list). * Returns B_TRUE if it was actually added. */ diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index da50548bcb..5d02f3e7ed 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -356,14 +356,23 @@ vdev_queue_class_min_active(zio_priority_t p) } static int -vdev_queue_max_async_writes(uint64_t dirty) +vdev_queue_max_async_writes(spa_t *spa) { int writes; + uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint64_t max_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_max_dirty_percent / 100; + /* + * Sync tasks correspond to interactive user actions. To reduce the + * execution time of those actions we push data out as fast as possible. + */ + if (spa_has_pending_synctask(spa)) { + return (zfs_vdev_async_write_max_active); + } + if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); if (dirty > max_bytes) @@ -396,8 +405,7 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes( - spa->spa_dsl_pool->dp_dirty_total)); + return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_max_active); default: diff --git a/usr/src/uts/common/fs/zfs/zfs_debug.c b/usr/src/uts/common/fs/zfs/zfs_debug.c index 85fa7600d9..a9cbe4dfe3 100644 --- a/usr/src/uts/common/fs/zfs/zfs_debug.c +++ b/usr/src/uts/common/fs/zfs/zfs_debug.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -57,7 +57,10 @@ zfs_dbgmsg_fini(void) * echo ::zfs_dbgmsg | mdb -k * * Monitor these messages by running: - * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * + * When used with libzpool, monitor with: + * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' */ void zfs_dbgmsg(const char *fmt, ...) 
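
Reviewer note (not part of the patch): a worked illustration of the condensing heuristic that metaslab_should_condense() gains earlier in this diff. The helper below is a hypothetical, simplified restatement (it omits the segsz <= object_size check); the numbers assume ashift = 12 (4K space-map records), zfs_condense_pct = 200 as in the surrounding context, and the new zfs_metaslab_condense_block_threshold = 4, so the on-disk space map must exceed 16K and be at least twice its minimal size before condensing is attempted.

/*
 * Hypothetical restatement of the new metaslab_should_condense() checks,
 * for illustration only; not taken verbatim from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int
should_condense(uint64_t object_size,	/* space_map_length() */
    uint64_t optimal_size,		/* 8 bytes per range-tree segment */
    uint64_t sm_block_size,		/* doi.doi_data_block_size */
    uint64_t vdev_blocksize,		/* 1 << vdev_ashift */
    int condense_pct,			/* zfs_condense_pct */
    int block_threshold)		/* zfs_metaslab_condense_block_threshold */
{
	uint64_t record_size = MAX(sm_block_size, vdev_blocksize);

	return (object_size >= optimal_size * condense_pct / 100 &&
	    object_size > (uint64_t)block_threshold * record_size);
}

int
main(void)
{
	/* 12K space map, 4K optimal: 3x the optimal size but only 3 blocks -> no. */
	printf("%d\n", should_condense(12 << 10, 4 << 10, 4096, 4096, 200, 4));
	/* 40K space map, 4K optimal: 10x the optimal size and 10 blocks -> yes. */
	printf("%d\n", should_condense(40 << 10, 4 << 10, 4096, 4096, 200, 4));
	return (0);
}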
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 93b5194caa..91be6c7741 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -849,8 +849,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_READ, priority, flags, vd, offset, NULL, - ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, + NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -870,8 +870,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, - ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, + NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -2542,7 +2542,9 @@ zio_vdev_io_start(zio_t *zio) align = 1ULL << vd->vdev_top->vdev_ashift; - if (P2PHASE(zio->io_size, align) != 0) { + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && + P2PHASE(zio->io_size, align) != 0) { + /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); char *abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); @@ -2553,8 +2555,22 @@ zio_vdev_io_start(zio_t *zio) zio_push_transform(zio, abuf, asize, asize, zio_subblock); } - ASSERT(P2PHASE(zio->io_offset, align) == 0); - ASSERT(P2PHASE(zio->io_size, align) == 0); + /* + * If this is not a physical io, make sure that it is properly aligned + * before proceeding. + */ + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { + ASSERT0(P2PHASE(zio->io_offset, align)); + ASSERT0(P2PHASE(zio->io_size, align)); + } else { + /* + * For physical writes, we allow 512b aligned writes and assume + * the device will perform a read-modify-write as necessary. + */ + ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); + ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); + } + VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* diff --git a/usr/src/uts/common/klm/klmops.c b/usr/src/uts/common/klm/klmops.c index a8adbe76c1..2036ead94d 100644 --- a/usr/src/uts/common/klm/klmops.c +++ b/usr/src/uts/common/klm/klmops.c @@ -123,9 +123,7 @@ lm4_shrlock(struct vnode *vp, int cmd, } /* - * Helper for lm_frlock, lm4_frlock, nfs_lockrelease - * After getting a lock from a remote lock manager, - * register the lock locally. + * Helper for nfs_lockrelease. */ void lm_register_lock_locally(struct vnode *vp, struct lm_sysid *ls, diff --git a/usr/src/uts/common/klm/nlm_service.c b/usr/src/uts/common/klm/nlm_service.c index e3e940ca12..d87e80448d 100644 --- a/usr/src/uts/common/klm/nlm_service.c +++ b/usr/src/uts/common/klm/nlm_service.c @@ -167,17 +167,18 @@ nlm_init_flock(struct flock64 *fl, struct nlm4_lock *nl, * This is just like nfs_fhtovp() but without the exportinfo argument. 
*/ static vnode_t * -lm_fhtovp(fhandle_t *fh) +lm_fhtovp(fhandle3_t *fh) { vfs_t *vfsp; vnode_t *vp; int error; - vfsp = getvfs(&fh->fh_fsid); + vfsp = getvfs(&fh->_fh3_fsid); if (vfsp == NULL) return (NULL); - error = VFS_VGET(vfsp, &vp, (fid_t *)&(fh->fh_len)); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + error = VFS_VGET(vfsp, &vp, (fid_t *)&(fh->_fh3_len)); VFS_RELE(vfsp); if (error || vp == NULL) return (NULL); @@ -193,28 +194,26 @@ lm_fhtovp(fhandle_t *fh) static vnode_t * nlm_fh_to_vp(struct netobj *fh) { - fhandle_t *fhp; + fhandle3_t *fhp; /* * Get a vnode pointer for the given NFS file handle. - * Note that it could be an NFSv2 for NFSv3 handle, + * Note that it could be an NFSv2 or NFSv3 handle, * which means the size might vary. (don't copy) */ - if (fh->n_len > MAX_NETOBJ_SZ || fh->n_len < sizeof (*fhp)) + if (fh->n_len < sizeof (fhandle_t)) return (NULL); /* We know this is aligned (kmem_alloc) */ /* LINTED E_BAD_PTR_CAST_ALIGN */ - fhp = (fhandle_t *)fh->n_bytes; + fhp = (fhandle3_t *)fh->n_bytes; /* * See the comment for NFS_FH3MAXDATA in uts/common/nfs/nfs.h for - * converting fhandles. Check the NFSv3 file handle size in case there - * is some unknown compatability issue here, even though we're only - * using the "legacy" fhandle_t struct. The lockmgr is not used for - * NFS v4. + * converting fhandles. Check the NFSv3 file handle size. The lockmgr + * is not used for NFS v4. */ - if (fhp->fh_len > NFS_FH3MAXDATA || fhp->fh_len == 0) + if (fhp->_fh3_len > NFS_FH3MAXDATA || fhp->_fh3_len == 0) return (NULL); return (lm_fhtovp(fhp)); |
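
Reviewer note (not part of the patch): the MAX_UBERBLOCK_SHIFT cap added to vdev_impl.h above lines up with the ztest change at the top of this diff, which now chooses an ashift up to SPA_MINBLOCKSHIFT + 4 = 13. A rough sketch of the arithmetic, assuming UBERBLOCK_SHIFT is 10 (1K minimum uberblock size): capping the per-uberblock size at 8K keeps a useful number of slots in the 128K uberblock ring even on large-ashift vdevs.

/*
 * Illustration only. UBERBLOCK_SHIFT == 10 is an assumption; the other
 * constants are taken from the diff.
 */
#include <stdio.h>

#define	UBERBLOCK_SHIFT		10
#define	MAX_UBERBLOCK_SHIFT	13
#define	VDEV_UBERBLOCK_RING	(128 << 10)

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	for (int ashift = 9; ashift <= 16; ashift++) {
		int old_shift = MAX(ashift, UBERBLOCK_SHIFT);
		int new_shift = MIN(old_shift, MAX_UBERBLOCK_SHIFT);
		printf("ashift %2d: %3d slots without cap, %3d with cap\n",
		    ashift, VDEV_UBERBLOCK_RING >> old_shift,
		    VDEV_UBERBLOCK_RING >> new_shift);
	}
	return (0);
}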