author     Ryan Zezeski <rpz@joyent.com>    2019-11-14 09:39:53 -0700
committer  Ryan Zezeski <rpz@joyent.com>    2019-11-14 09:39:53 -0700
commit     074bf480b3d9701c3c55056fe6105028504135b6 (patch)
tree       9bd8568e7caa13fc1b13146260ba82af1827ebbc
parent     27bc3ef3b6dd5a071a0607d96af5eec24ca5d276 (diff)
parent     43ef85afe5649116d876156ca6eb797e144c9795 (diff)
download   illumos-joyent-cr6990-OS-8027.tar.gz
Merge remote-tracking branch 'origin/master' into cr6990-OS-8027
27 files changed, 1066 insertions, 328 deletions
diff --git a/usr/src/boot/Makefile.version b/usr/src/boot/Makefile.version index 9d40ee8993..a161b24487 100644 --- a/usr/src/boot/Makefile.version +++ b/usr/src/boot/Makefile.version @@ -33,4 +33,4 @@ LOADER_VERSION = 1.1 # Use date like formatting here, YYYY.MM.DD.XX, without leading zeroes. # The version is processed from left to right, the version number can only # be increased. -BOOT_VERSION = $(LOADER_VERSION)-2019.11.04.1 +BOOT_VERSION = $(LOADER_VERSION)-2019.11.05.1 diff --git a/usr/src/boot/lib/libstand/zfs/zfsimpl.c b/usr/src/boot/lib/libstand/zfs/zfsimpl.c index e595273c9b..fba9f1fc59 100644 --- a/usr/src/boot/lib/libstand/zfs/zfsimpl.c +++ b/usr/src/boot/lib/libstand/zfs/zfsimpl.c @@ -1534,71 +1534,104 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offset) } static int -vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) +{ + unsigned int seq1 = 0; + unsigned int seq2 = 0; + int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + + if (cmp != 0) + return (cmp); + + cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); + if (cmp != 0) + return (cmp); + + if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) + seq1 = MMP_SEQ(ub1); + + if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) + seq2 = MMP_SEQ(ub2); + + return (AVL_CMP(seq1, seq2)); +} + +static int +uberblock_verify(uberblock_t *ub) +{ + if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) { + byteswap_uint64_array(ub, sizeof (uberblock_t)); + } + + if (ub->ub_magic != UBERBLOCK_MAGIC || + !SPA_VERSION_IS_SUPPORTED(ub->ub_version)) + return (EINVAL); + + return (0); +} + +static int +vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset, + size_t size) { - vdev_t vtmp; - vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; - vdev_phys_t *tmp_label; - spa_t *spa; - vdev_t *vdev, *top_vdev, *pool_vdev; - off_t off; blkptr_t bp; - const unsigned char *nvlist = NULL; - uint64_t val; - uint64_t guid; - uint64_t best_txg = 0; - uint64_t pool_txg, pool_guid; - const char *pool_name; - const unsigned char *vdevs; - const unsigned char *features; - int i, l, rc, is_newer; - char *upbuf; - const struct uberblock *up; + off_t off; - /* - * Load the vdev label and figure out which - * uberblock is most current. - */ - memset(&vtmp, 0, sizeof(vtmp)); - vtmp.v_phys_read = phys_read; - vtmp.v_read_priv = read_priv; - vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv), - (uint64_t)sizeof (vdev_label_t)); + off = vdev_label_offset(vd->v_psize, l, offset); - /* Test for minimum device size. 
*/ - if (vtmp.v_psize < SPA_MINDEVSIZE) - return (EIO); + BP_ZERO(&bp); + BP_SET_LSIZE(&bp, size); + BP_SET_PSIZE(&bp, size); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + DVA_SET_OFFSET(BP_IDENTITY(&bp), off); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); - tmp_label = zfs_alloc(sizeof (vdev_phys_t)); + return (vdev_read_phys(vd, &bp, buf, off, size)); +} - for (l = 0; l < VDEV_LABELS; l++) { - off = vdev_label_offset(vtmp.v_psize, l, - offsetof(vdev_label_t, vl_vdev_phys)); +static unsigned char * +vdev_label_read_config(vdev_t *vd, uint64_t txg) +{ + vdev_phys_t *label; + uint64_t best_txg = 0; + uint64_t label_txg = 0; + uint64_t asize; + unsigned char *nvl; + size_t nvl_size; + int error; + + label = malloc(sizeof (vdev_phys_t)); + if (label == NULL) + return (NULL); - BP_ZERO(&bp); - BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); - BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - DVA_SET_OFFSET(BP_IDENTITY(&bp), off); - ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4; + nvl = malloc(nvl_size); + if (nvl == NULL) + goto done; - if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0)) - continue; + for (int l = 0; l < VDEV_LABELS; l++) { + const unsigned char *nvlist; - if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR) + if (vdev_label_read(vd, l, label, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t))) continue; - nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4; - if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, - DATA_TYPE_UINT64, NULL, &pool_txg) != 0) + if (label->vp_nvlist[0] != NV_ENCODE_XDR) continue; - if (best_txg <= pool_txg) { - uint64_t asize; + nvlist = (const unsigned char *) label->vp_nvlist + 4; + error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, + DATA_TYPE_UINT64, NULL, &label_txg); + if (error != 0 || label_txg == 0) { + memcpy(nvl, nvlist, nvl_size); + goto done; + } - best_txg = pool_txg; - memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t)); + if (label_txg <= txg && label_txg > best_txg) { + best_txg = label_txg; + memcpy(nvl, nvlist, nvl_size); /* * Use asize from pool config. 
We need this @@ -1606,30 +1639,89 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) */ if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64, NULL, &asize) == 0) { - vtmp.v_psize = asize + + vd->v_psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; } } } - zfs_free(tmp_label, sizeof (vdev_phys_t)); + if (best_txg == 0) { + free(nvl); + nvl = NULL; + } +done: + free(label); + return (nvl); +} + +static void +vdev_uberblock_load(vdev_t *vd, uberblock_t *ub) +{ + uberblock_t *buf; + + buf = malloc(VDEV_UBERBLOCK_SIZE(vd)); + if (buf == NULL) + return; + + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + if (vdev_label_read(vd, l, buf, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd))) + continue; + if (uberblock_verify(buf) != 0) + continue; + + if (vdev_uberblock_compare(buf, ub) > 0) + *ub = *buf; + } + } + free(buf); +} + +static int +vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) +{ + vdev_t vtmp; + spa_t *spa; + vdev_t *vdev, *top_vdev, *pool_vdev; + unsigned char *nvlist; + uint64_t val; + uint64_t guid; + uint64_t pool_txg, pool_guid; + const char *pool_name; + const unsigned char *vdevs; + const unsigned char *features; + int rc, is_newer; - if (best_txg == 0) - return (EIO); + /* + * Load the vdev label and figure out which + * uberblock is most current. + */ + memset(&vtmp, 0, sizeof (vtmp)); + vtmp.v_phys_read = phys_read; + vtmp.v_read_priv = read_priv; + vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv), + (uint64_t)sizeof (vdev_label_t)); - if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) + /* Test for minimum device size. */ + if (vtmp.v_psize < SPA_MINDEVSIZE) return (EIO); - nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4; + nvlist = vdev_label_read_config(&vtmp, UINT64_MAX); + if (nvlist == NULL) + return (EIO); if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64, NULL, &val) != 0) { + free(nvlist); return (EIO); } if (!SPA_VERSION_IS_SUPPORTED(val)) { printf("ZFS: unsupported ZFS version %u (should be %u)\n", (unsigned) val, (unsigned) SPA_VERSION); + free(nvlist); return (EIO); } @@ -1637,16 +1729,19 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ, DATA_TYPE_NVLIST, NULL, &features) == 0 && nvlist_check_features_for_read(features) != 0) { + free(nvlist); return (EIO); } if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64, NULL, &val) != 0) { + free(nvlist); return (EIO); } if (val == POOL_STATE_DESTROYED) { /* We don't boot only from destroyed pools. */ + free(nvlist); return (EIO); } @@ -1660,12 +1755,13 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) * Cache and spare devices end up here - just ignore * them. 
*/ - /*printf("ZFS: can't find pool details\n");*/ + free(nvlist); return (EIO); } if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL, &val) == 0 && val != 0) { + free(nvlist); return (EIO); } @@ -1675,8 +1771,10 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) spa = spa_find_by_guid(pool_guid); if (spa == NULL) { spa = spa_create(pool_guid, pool_name); - if (spa == NULL) + if (spa == NULL) { + free(nvlist); return (ENOMEM); + } } if (pool_txg > spa->spa_txg) { spa->spa_txg = pool_txg; @@ -1693,18 +1791,24 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) */ if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, NULL, &guid) != 0) { + free(nvlist); return (EIO); } vdev = vdev_find(guid); - if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */ + /* Has this vdev already been inited? */ + if (vdev && vdev->v_phys_read) { + free(nvlist); return (EIO); + } if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, NULL, &vdevs)) { + free(nvlist); return (EIO); } rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer); + free(nvlist); if (rc != 0) return (rc); @@ -1714,6 +1818,7 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink) if (top_vdev == pool_vdev) break; + if (!pool_vdev && top_vdev) { top_vdev->spa = spa; STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink); @@ -1748,36 +1853,7 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) * the best uberblock and then we can actually access * the contents of the pool. */ - upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev)); - up = (const struct uberblock *)upbuf; - for (l = 0; l < VDEV_LABELS; l++) { - for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) { - off = vdev_label_offset(vdev->v_psize, l, - VDEV_UBERBLOCK_OFFSET(vdev, i)); - BP_ZERO(&bp); - DVA_SET_OFFSET(&bp.blk_dva[0], off); - BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); - BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); - - if (vdev_read_phys(vdev, &bp, upbuf, off, 0) != 0) - continue; - - if (up->ub_magic != UBERBLOCK_MAGIC) - continue; - if (up->ub_txg < spa->spa_txg) - continue; - if (up->ub_txg > spa->spa_uberblock.ub_txg || - (up->ub_txg == spa->spa_uberblock.ub_txg && - up->ub_timestamp > - spa->spa_uberblock.ub_timestamp)) { - spa->spa_uberblock = *up; - } - } - } - zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev)); + vdev_uberblock_load(vdev, &spa->spa_uberblock); vdev->spa = spa; if (spap != NULL) diff --git a/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h b/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h index 2a71fcb067..8f45983761 100644 --- a/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h +++ b/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h @@ -66,6 +66,14 @@ #define _NOTE(s) +/* + * AVL comparator helpers + */ +#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define AVL_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ @@ -492,8 +500,16 @@ typedef struct zio_gbh { #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) +/* + * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock + * ring when MMP is enabled. 
+ */ +#define MMP_BLOCKS_PER_LABEL 1 + +/* The largest uberblock we support is 8k. */ +#define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ - MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT) + MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ @@ -843,15 +859,88 @@ typedef enum pool_state { */ #define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ #define UBERBLOCK_SHIFT 10 /* up to 1K */ - -struct uberblock { +#define MMP_MAGIC 0xa11cea11 /* all-see-all */ + +#define MMP_INTERVAL_VALID_BIT 0x01 +#define MMP_SEQ_VALID_BIT 0x02 +#define MMP_FAIL_INT_VALID_BIT 0x04 + +#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ + ubp->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_INTERVAL_VALID_BIT)) +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_SEQ_VALID_BIT)) +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_FAIL_INT_VALID_BIT)) + +#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ + >> 8) +#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ + >> 32) +#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ + >> 48) + +typedef struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ uint64_t ub_version; /* SPA_VERSION */ uint64_t ub_txg; /* txg of last sync */ uint64_t ub_guid_sum; /* sum of all vdev guids */ uint64_t ub_timestamp; /* UTC time of last sync */ blkptr_t ub_rootbp; /* MOS objset_phys_t */ -}; + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; + /* Maybe missing in uberblocks we read, but always written */ + uint64_t ub_mmp_magic; + /* + * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off. + * Otherwise, nanosec since last MMP write. + */ + uint64_t ub_mmp_delay; + + /* + * The ub_mmp_config contains the multihost write interval, multihost + * fail intervals, sequence number for sub-second granularity, and + * valid bit mask. This layout is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * This allows a write_interval of (2^24/1000)s, over 4.5 hours + * + * VALID Bits: + * - 0x01 - Write Interval (ms) + * - 0x02 - Sequence number exists + * - 0x04 - Fail Intervals + * - 0xf8 - Reserved + */ + uint64_t ub_mmp_config; + + /* + * ub_checkpoint_txg indicates two things about the current uberblock: + * + * 1] If it is not zero then this uberblock is a checkpoint. If it is + * zero, then this uberblock is not a checkpoint. + * + * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is + * the ub_txg that the uberblock had at the time we moved it to + * the MOS config. + * + * The field is set when we checkpoint the uberblock and continues to + * hold that value even after we've rewound (unlike the ub_txg that + * is reset to a higher value). + * + * Besides checks used to determine whether we are reopening the + * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], + * the value of the field is used to determine which ZIL blocks have + * been allocated according to the ms_sm when we are rewinding to a + * checkpoint. 
Specifically, if blk_birth > ub_checkpoint_txg, then + * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. + */ + uint64_t ub_checkpoint_txg; +} uberblock_t; /* * Flags. diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index 2c32e1a191..7cc12ccf0a 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -3098,25 +3098,25 @@ reference_cb(uintptr_t addr, const void *ignored, void *arg) return (WALK_NEXT); } -typedef struct mdb_refcount { +typedef struct mdb_zfs_refcount { uint64_t rc_count; -} mdb_refcount_t; +} mdb_zfs_refcount_t; -typedef struct mdb_refcount_removed { +typedef struct mdb_zfs_refcount_removed { uint64_t rc_removed_count; -} mdb_refcount_removed_t; +} mdb_zfs_refcount_removed_t; -typedef struct mdb_refcount_tracked { +typedef struct mdb_zfs_refcount_tracked { boolean_t rc_tracked; -} mdb_refcount_tracked_t; +} mdb_zfs_refcount_tracked_t; /* ARGSUSED */ static int -refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +zfs_refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - mdb_refcount_t rc; - mdb_refcount_removed_t rcr; - mdb_refcount_tracked_t rct; + mdb_zfs_refcount_t rc; + mdb_zfs_refcount_removed_t rcr; + mdb_zfs_refcount_tracked_t rct; int off; boolean_t released = B_FALSE; @@ -3128,30 +3128,30 @@ refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) NULL) != argc) return (DCMD_USAGE); - if (mdb_ctf_vread(&rc, "refcount_t", "mdb_refcount_t", addr, + if (mdb_ctf_vread(&rc, "zfs_refcount_t", "mdb_zfs_refcount_t", addr, 0) == -1) return (DCMD_ERR); - if (mdb_ctf_vread(&rcr, "refcount_t", "mdb_refcount_removed_t", addr, - MDB_CTF_VREAD_QUIET) == -1) { - mdb_printf("refcount_t at %p has %llu holds (untracked)\n", + if (mdb_ctf_vread(&rcr, "zfs_refcount_t", "mdb_zfs_refcount_removed_t", + addr, MDB_CTF_VREAD_QUIET) == -1) { + mdb_printf("zfs_refcount_t at %p has %llu holds (untracked)\n", addr, (longlong_t)rc.rc_count); return (DCMD_OK); } - if (mdb_ctf_vread(&rct, "refcount_t", "mdb_refcount_tracked_t", addr, - MDB_CTF_VREAD_QUIET) == -1) { + if (mdb_ctf_vread(&rct, "zfs_refcount_t", "mdb_zfs_refcount_tracked_t", + addr, MDB_CTF_VREAD_QUIET) == -1) { /* If this is an old target, it might be tracked. 
*/ rct.rc_tracked = B_TRUE; } - mdb_printf("refcount_t at %p has %llu current holds, " + mdb_printf("zfs_refcount_t at %p has %llu current holds, " "%llu recently released holds\n", addr, (longlong_t)rc.rc_count, (longlong_t)rcr.rc_removed_count); if (rct.rc_tracked && rc.rc_count > 0) mdb_printf("current holds:\n"); - off = mdb_ctf_offsetof_by_name("refcount_t", "rc_list"); + off = mdb_ctf_offsetof_by_name("zfs_refcount_t", "rc_list"); if (off == -1) return (DCMD_ERR); mdb_pwalk("list", reference_cb, (void*)B_FALSE, addr + off); @@ -3159,7 +3159,7 @@ refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (released && rcr.rc_removed_count > 0) { mdb_printf("released holds:\n"); - off = mdb_ctf_offsetof_by_name("refcount_t", "rc_removed"); + off = mdb_ctf_offsetof_by_name("zfs_refcount_t", "rc_removed"); if (off == -1) return (DCMD_ERR); mdb_pwalk("list", reference_cb, (void*)B_TRUE, addr + off); @@ -3797,12 +3797,12 @@ rrwlock(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } mdb_printf("anonymous references:\n"); - (void) mdb_call_dcmd("refcount", addr + + (void) mdb_call_dcmd("zfs_refcount", addr + mdb_ctf_offsetof_by_name(ZFS_STRUCT "rrwlock", "rr_anon_rcount"), DCMD_ADDRSPEC, 0, NULL); mdb_printf("linked references:\n"); - (void) mdb_call_dcmd("refcount", addr + + (void) mdb_call_dcmd("zfs_refcount", addr + mdb_ctf_offsetof_by_name(ZFS_STRUCT "rrwlock", "rr_linked_rcount"), DCMD_ADDRSPEC, 0, NULL); @@ -4345,9 +4345,9 @@ static const mdb_dcmd_t dcmds[] = { "given a spa_t, print block type stats from last scrub", zfs_blkstats }, { "zfs_params", "", "print zfs tunable parameters", zfs_params }, - { "refcount", ":[-r]\n" + { "zfs_refcount", ":[-r]\n" "\t-r display recently removed references", - "print refcount_t holders", refcount }, + "print zfs_refcount_t holders", zfs_refcount }, { "zap_leaf", "", "print zap_leaf_phys_t", zap_leaf }, { "zfs_aces", ":[-v]", "print all ACEs from a zfs_acl_t", zfs_acl_dump }, diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 13fd33522a..61cfd74df3 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. @@ -901,7 +901,7 @@ dump_metaslab_stats(metaslab_t *msp) /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); - zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); + zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, @@ -928,7 +928,7 @@ dump_metaslab(metaslab_t *msp) if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp, 0)); + VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); metaslab_unload(msp); diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index eb574105a7..654b62db6a 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright (c) 2019, Joyent, Inc. 
* Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. @@ -296,6 +296,7 @@ #include <zfs_fletcher.h> #include <sys/aggsum.h> #include <sys/cityhash.h> +#include <sys/param.h> #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ @@ -1268,6 +1269,20 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); +/* + * The arc_all_memory function is a ZoL enhancement that lives in their OSL + * code. In user-space code, which is used primarily for testing, we return + * half of all memory. + */ +uint64_t +arc_all_memory(void) +{ +#ifdef _KERNEL + return (ptob(physmem)); +#else + return ((sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES)) / 2); +#endif +} /* * We use Cityhash for this. It's fast, and has good hash properties without diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index b950ed26d6..bc6b45ec7f 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -199,28 +199,20 @@ uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; int metaslab_load_pct = 50; /* - * Determines how many txgs a metaslab may remain loaded without having any - * allocations from it. As long as a metaslab continues to be used we will - * keep it loaded. + * These tunables control how long a metaslab will remain loaded after the + * last allocation from it. A metaslab can't be unloaded until at least + * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds + * have elapsed. However, zfs_metaslab_mem_limit may cause it to be + * unloaded sooner. These settings are intended to be generous -- to keep + * metaslabs loaded for a long time, reducing the rate of metaslab loading. */ -int metaslab_unload_delay = TXG_SIZE * 2; - -/* - * Tunables used to reduce metaslab load/unload thrashing when selection - * algorithm is allocating across metaslabs very evenly. In addition to - * tracking when the slab was used for allocation (ms_selected_txg), we also - * track when it was loaded (ms_loaded_txg). If the slab would be unloaded, - * but the load txg is within the window of - * metaslab_unload_delay + metaslab_load_window - * then we ramp up metaslab_unload_delay instead of unloading the metaslab. - */ -int metaslab_load_window = 10; -int metaslab_unload_delay_max = 256; +int metaslab_unload_delay = 32; +int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ /* * Max number of metaslabs per group to preload. */ -int metaslab_preload_limit = SPA_DVAS_PER_BP; +int metaslab_preload_limit = 10; /* * Enable/disable preloading of metaslab. @@ -281,6 +273,19 @@ uint64_t metaslab_trace_max_entries = 5000; */ int max_disabled_ms = 3; +/* + * Maximum percentage of memory to use on storing loaded metaslabs. If loading + * a metaslab would take it over this percentage, the oldest selected metaslab + * is automatically unloaded. + */ +int zfs_metaslab_mem_limit = 25; + +/* + * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. + * To avoid 64-bit overflow, don't set above UINT32_MAX. 
+ */ +unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ + static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -288,6 +293,8 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); +static unsigned int metaslab_idx_func(multilist_t *, void *); +static void metaslab_evict(metaslab_t *, uint64_t); kmem_cache_t *metaslab_alloc_trace_cache; @@ -307,6 +314,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); + mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t), + offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * sizeof (zfs_refcount_t), KM_SLEEP); mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * @@ -333,6 +342,7 @@ metaslab_class_destroy(metaslab_class_t *mc) kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); + multilist_destroy(mc->mc_metaslab_txg_list); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -523,6 +533,51 @@ metaslab_class_expandable_space(metaslab_class_t *mc) return (space); } +void +metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) +{ + multilist_t *ml = mc->mc_metaslab_txg_list; + for (int i = 0; i < multilist_get_num_sublists(ml); i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + metaslab_t *msp = multilist_sublist_head(mls); + multilist_sublist_unlock(mls); + while (msp != NULL) { + mutex_enter(&msp->ms_lock); + + /* + * If the metaslab has been removed from the list + * (which could happen if we were at the memory limit + * and it was evicted during this loop), then we can't + * proceed and we should restart the sublist. + */ + if (!multilist_link_active(&msp->ms_class_txg_node)) { + mutex_exit(&msp->ms_lock); + i--; + break; + } + mls = multilist_sublist_lock(ml, i); + metaslab_t *next_msp = multilist_sublist_next(mls, msp); + multilist_sublist_unlock(mls); + if (txg > + msp->ms_selected_txg + metaslab_unload_delay && + gethrtime() > msp->ms_selected_time + + (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { + metaslab_evict(msp, txg); + } else { + /* + * Once we've hit a metaslab selected too + * recently to evict, we're done evicting for + * now. 
+ */ + mutex_exit(&msp->ms_lock); + break; + } + mutex_exit(&msp->ms_lock); + msp = next_msp; + } + } +} + static int metaslab_compare(const void *x1, const void *x2) { @@ -1002,6 +1057,14 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); + + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + multilist_sublist_unlock(mls); + msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } @@ -1009,8 +1072,10 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); @@ -1211,17 +1276,83 @@ metaslab_rangesize_compare(const void *x1, const void *x2) * Return the maximum contiguous segment within the metaslab. */ uint64_t -metaslab_block_maxsize(metaslab_t *msp) +metaslab_largest_allocatable(metaslab_t *msp) { avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; - if (t == NULL || (rs = avl_last(t)) == NULL) - return (0ULL); + if (t == NULL) + return (0); + rs = avl_last(t); + if (rs == NULL) + return (0); return (rs->rs_end - rs->rs_start); } +/* + * Return the maximum contiguous segment within the unflushed frees of this + * metaslab. + */ +uint64_t +metaslab_largest_unflushed_free(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if (msp->ms_unflushed_frees == NULL) + return (0); + + range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); + if (rs == NULL) + return (0); + + /* + * When a range is freed from the metaslab, that range is added to + * both the unflushed frees and the deferred frees. While the block + * will eventually be usable, if the metaslab were loaded the range + * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE + * txgs had passed. As a result, when attempting to estimate an upper + * bound for the largest currently-usable free segment in the + * metaslab, we need to not consider any ranges currently in the defer + * trees. This algorithm approximates the largest available chunk in + * the largest range in the unflushed_frees tree by taking the first + * chunk. While this may be a poor estimate, it should only remain so + * briefly and should eventually self-correct as frees are no longer + * deferred. Similar logic applies to the ms_freed tree. See + * metaslab_load() for more details. + * + * There are two primary sources of innacuracy in this estimate. Both + * are tolerated for performance reasons. The first source is that we + * only check the largest segment for overlaps. Smaller segments may + * have more favorable overlaps with the other trees, resulting in + * larger usable chunks. Second, we only look at the first chunk in + * the largest segment; there may be other usable chunks in the + * largest segment, but we ignore them. 
+ */ + uint64_t rstart = rs->rs_start; + uint64_t rsize = rs->rs_end - rstart; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, + rsize, &start, &size); + if (found) { + if (rstart == start) + return (0); + rsize = start - rstart; + } + } + + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_freed, rstart, + rsize, &start, &size); + if (found) + rsize = start - rstart; + + return (rsize); +} + static range_seg_t * metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { @@ -1311,7 +1442,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) * If we're running low on space, find a segment based on size, * rather than iterating based on offset. */ - if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || + if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { offset = -1; } else { @@ -1409,7 +1540,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_block_maxsize(msp); + uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, @@ -1479,6 +1610,13 @@ metaslab_flush_wait(metaslab_t *msp) cv_wait(&msp->ms_flush_cv, &msp->ms_lock); } +static unsigned int +metaslab_idx_func(multilist_t *ml, void *arg) +{ + metaslab_t *msp = arg; + return (msp->ms_id % multilist_get_num_sublists(ml)); +} + uint64_t metaslab_allocated_space(metaslab_t *msp) { @@ -1537,6 +1675,8 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } + ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, + msp->ms_allocating_total); ASSERT3U(msp->ms_deferspace, ==, range_tree_space(msp->ms_defer[0]) + @@ -1725,7 +1865,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) msp->ms_weight = 0; msp->ms_fragmentation = 0; - msp->ms_max_size = 0; /* * This function is used for verification purposes. Regardless of @@ -1753,6 +1892,87 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) VERIFY3U(msp->ms_weight, ==, weight); } +/* + * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from + * this class that was used longest ago, and attempt to unload it. We don't + * want to spend too much time in this loop to prevent performance + * degredation, and we expect that most of the time this operation will + * succeed. Between that and the normal unloading processing during txg sync, + * we expect this to keep the metaslab memory usage under control. 
+ */ +static void +metaslab_potentially_evict(metaslab_class_t *mc) +{ +#ifdef _KERNEL + uint64_t allmem = arc_all_memory(); + extern kmem_cache_t *range_seg_cache; + uint64_t inuse = kmem_cache_stat(range_seg_cache, "buf_inuse"); + uint64_t size = kmem_cache_stat(range_seg_cache, "buf_size"); + int tries = 0; + for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && + tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; + tries++) { + unsigned int idx = multilist_get_random_index( + mc->mc_metaslab_txg_list); + multilist_sublist_t *mls = + multilist_sublist_lock(mc->mc_metaslab_txg_list, idx); + metaslab_t *msp = multilist_sublist_head(mls); + multilist_sublist_unlock(mls); + while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < + inuse * size) { + VERIFY3P(mls, ==, multilist_sublist_lock( + mc->mc_metaslab_txg_list, idx)); + ASSERT3U(idx, ==, + metaslab_idx_func(mc->mc_metaslab_txg_list, msp)); + + if (!multilist_link_active(&msp->ms_class_txg_node)) { + multilist_sublist_unlock(mls); + break; + } + metaslab_t *next_msp = multilist_sublist_next(mls, msp); + multilist_sublist_unlock(mls); + /* + * If the metaslab is currently loading there are two + * cases. If it's the metaslab we're evicting, we + * can't continue on or we'll panic when we attempt to + * recursively lock the mutex. If it's another + * metaslab that's loading, it can be safely skipped, + * since we know it's very new and therefore not a + * good eviction candidate. We check later once the + * lock is held that the metaslab is fully loaded + * before actually unloading it. + */ + if (msp->ms_loading) { + msp = next_msp; + inuse = kmem_cache_stat(range_seg_cache, + "buf_inuse"); + continue; + } + /* + * We can't unload metaslabs with no spacemap because + * they're not ready to be unloaded yet. We can't + * unload metaslabs with outstanding allocations + * because doing so could cause the metaslab's weight + * to decrease while it's unloaded, which violates an + * invariant that we use to prevent unnecessary + * loading. We also don't unload metaslabs that are + * currently active because they are high-weight + * metaslabs that are likely to be used in the near + * future. 
+ */ + mutex_enter(&msp->ms_lock); + if (msp->ms_allocator == -1 && msp->ms_sm != NULL && + msp->ms_allocating_total == 0) { + metaslab_unload(msp); + } + mutex_exit(&msp->ms_lock); + msp = next_msp; + inuse = kmem_cache_stat(range_seg_cache, "buf_inuse"); + } + } +#endif +} + static int metaslab_load_impl(metaslab_t *msp) { @@ -1915,18 +2135,21 @@ metaslab_load_impl(metaslab_t *msp) * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; + uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); - msp->ms_max_size = metaslab_block_maxsize(msp); - + msp->ms_max_size = metaslab_largest_allocatable(msp); + ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); + msp->ms_load_time = load_end; if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, " - "loading_time %lld ms", + "loading_time %lld ms, ms_max_size %llu, " + "max size error %llu", spa_syncing_txg(spa), spa_name(spa), msp->ms_group->mg_vd->vdev_id, msp->ms_id, space_map_length(msp->ms_sm), @@ -1935,7 +2158,8 @@ metaslab_load_impl(metaslab_t *msp) range_tree_space(msp->ms_freed), range_tree_space(msp->ms_defer[0]), range_tree_space(msp->ms_defer[1]), - (longlong_t)((load_end - load_start) / 1000000)); + (longlong_t)((load_end - load_start) / 1000000), + msp->ms_max_size, msp->ms_max_size - max_size); } metaslab_verify_space(msp, spa_syncing_txg(spa)); @@ -1944,7 +2168,7 @@ metaslab_load_impl(metaslab_t *msp) } int -metaslab_load(metaslab_t *msp, uint64_t txg) +metaslab_load(metaslab_t *msp) { kstat_t *ksp; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -1988,11 +2212,20 @@ metaslab_load(metaslab_t *msp, uint64_t txg) */ ASSERT(!msp->ms_loaded); + /* + * If we're loading a metaslab in the normal class, consider evicting + * another one to keep our memory usage under the limit defined by the + * zfs_metaslab_mem_limit tunable. + */ + if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == + msp->ms_group->mg_class) { + metaslab_potentially_evict(msp->ms_group->mg_class); + } + int error = metaslab_load_impl(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; - msp->ms_loaded_txg = txg; cv_broadcast(&msp->ms_load_cv); return (error); @@ -2003,14 +2236,29 @@ metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - metaslab_verify_weight_and_frag(msp); + /* + * This can happen if a metaslab is selected for eviction (in + * metaslab_potentially_evict) and then unloaded during spa_sync (via + * metaslab_class_evict_old). 
+ */ + if (!msp->ms_loaded) + return; range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; - msp->ms_loaded_txg = 0; + msp->ms_unload_time = gethrtime(); + msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; - msp->ms_max_size = 0; + + if (msp->ms_group != NULL) { + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + multilist_sublist_unlock(mls); + } /* * We explicitly recalculate the metaslab's weight based on its space @@ -2029,6 +2277,21 @@ metaslab_unload(metaslab_t *msp) } void +metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + msp->ms_selected_txg = txg; + msp->ms_selected_time = gethrtime(); + multilist_sublist_insert_tail(mls, msp); + multilist_sublist_unlock(mls); +} + +void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { @@ -2056,6 +2319,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); + multilist_link_init(&ms->ms_class_txg_node); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -2349,7 +2613,6 @@ metaslab_space_weight(metaslab_t *msp) uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!vd->vdev_removing); /* * The baseline weight is the metaslab's free space. @@ -2568,13 +2831,19 @@ metaslab_segment_weight(metaslab_t *msp) * weights we rely on the entire weight (excluding the weight-type bit). */ boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize) +metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { - if (msp->ms_loaded) { + /* + * If the metaslab is loaded, ms_max_size is definitive and we can use + * the fast check. If it's not, the ms_max_size is a lower bound (once + * set), and we should use the fast check as long as we're not in + * try_hard and it's been less than zfs_metaslab_max_size_cache_sec + * seconds since the metaslab was unloaded. + */ + if (msp->ms_loaded || + (msp->ms_max_size != 0 && !try_hard && gethrtime() < + msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); - } else { - ASSERT0(msp->ms_max_size); - } boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { @@ -2590,6 +2859,7 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize) should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } + return (should_allocate); } @@ -2602,24 +2872,24 @@ metaslab_weight(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); - /* - * If this vdev is in the process of being removed, there is nothing - * for us to do here. - */ - if (vd->vdev_removing) - return (0); - metaslab_set_fragmentation(msp); /* - * Update the maximum size if the metaslab is loaded. This will + * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space - * has been added back into the free tree. 
+ * has been added back into the free tree. If the metaslab is + * unloaded, we check if there's a larger free segment in the + * unflushed frees. This is a lower bound on the largest allocatable + * segment size. Coalescing of adjacent entries may reveal larger + * allocatable segments, but we aren't aware of those until loading + * the space map into a range tree. */ - if (msp->ms_loaded) - msp->ms_max_size = metaslab_block_maxsize(msp); - else - ASSERT0(msp->ms_max_size); + if (msp->ms_loaded) { + msp->ms_max_size = metaslab_largest_allocatable(msp); + } else { + msp->ms_max_size = MAX(msp->ms_max_size, + metaslab_largest_unflushed_free(msp)); + } /* * Segment-based weighting requires space map histogram support. @@ -2638,6 +2908,8 @@ metaslab_weight(metaslab_t *msp) void metaslab_recalculate_weight_and_sort(metaslab_t *msp) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); + /* note: we preserve the mask (e.g. indication of primary, etc..) */ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(msp->ms_group, msp, @@ -2648,16 +2920,23 @@ static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); + /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ - if (activation_weight == METASLAB_WEIGHT_CLAIM) + if (activation_weight == METASLAB_WEIGHT_CLAIM) { + ASSERT0(msp->ms_activation_weight); + msp->ms_activation_weight = msp->ms_weight; + metaslab_group_sort(mg, msp, msp->ms_weight | + activation_weight); return (0); + } + metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? mg->mg_primaries : mg->mg_secondaries); - ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); if (arr[allocator] != NULL) { mutex_exit(&mg->mg_lock); @@ -2668,39 +2947,88 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); + + ASSERT0(msp->ms_activation_weight); + msp->ms_activation_weight = msp->ms_weight; + metaslab_group_sort_impl(mg, msp, + msp->ms_weight | activation_weight); + mutex_exit(&mg->mg_lock); return (0); } static int -metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight, - uint64_t txg) +metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = metaslab_load(msp, txg); - if (error != 0) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); - } - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { - /* - * The metaslab was activated for another allocator - * while we were waiting, we should reselect. - */ + /* + * The current metaslab is already activated for us so there + * is nothing to do. Already activated though, doesn't mean + * that this metaslab is activated for our allocator nor our + * requested activation weight. The metaslab could have started + * as an active one for our allocator but changed allocators + * while we were waiting to grab its ms_lock or we stole it + * [see find_valid_metaslab()]. This means that there is a + * possibility of passivating a metaslab of another allocator + * or from a different activation mask, from this thread. 
+ */ + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + ASSERT(msp->ms_loaded); + return (0); + } + + int error = metaslab_load(msp); + if (error != 0) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + + /* + * When entering metaslab_load() we may have dropped the + * ms_lock because we were loading this metaslab, or we + * were waiting for another thread to load it for us. In + * that scenario, we recheck the weight of the metaslab + * to see if it was activated by another thread. + * + * If the metaslab was activated for another allocator or + * it was activated with a different activation weight (e.g. + * we wanted to make it a primary but it was activated as + * secondary) we return error (EBUSY). + * + * If the metaslab was activated for the same allocator + * and requested activation mask, skip activating it. + */ + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + if (msp->ms_allocator != allocator) return (EBUSY); - } - if ((error = metaslab_activate_allocator(msp->ms_group, msp, - allocator, activation_weight)) != 0) { - return (error); - } - msp->ms_activation_weight = msp->ms_weight; - metaslab_group_sort(msp->ms_group, msp, - msp->ms_weight | activation_weight); + if ((msp->ms_weight & activation_weight) == 0) + return (EBUSY); + + EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), + msp->ms_primary); + return (0); + } + + /* + * If the metaslab has literally 0 space, it will have weight 0. In + * that case, don't bother activating it. This can happen if the + * metaslab had space during find_valid_metaslab, but another thread + * loaded it and used all that space while we were waiting to grab the + * lock. + */ + if (msp->ms_weight == 0) { + ASSERT0(range_tree_space(msp->ms_allocatable)); + return (SET_ERROR(ENOSPC)); } + + if ((error = metaslab_activate_allocator(msp->ms_group, msp, + allocator, activation_weight)) != 0) { + return (error); + } + ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); @@ -2712,6 +3040,8 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(msp->ms_loaded); + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; @@ -2719,15 +3049,16 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); + ASSERT3S(0, <=, msp->ms_allocator); + ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); + if (msp->ms_primary) { - ASSERT3U(0, <=, msp->ms_allocator); - ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mg->mg_primaries[msp->ms_allocator] = NULL; } else { - ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); + ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); mg->mg_secondaries[msp->ms_allocator] = NULL; } msp->ms_allocator = -1; @@ -2749,9 +3080,10 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) range_tree_is_empty(msp->ms_allocatable)); ASSERT0(weight & METASLAB_ACTIVE_MASK); + ASSERT(msp->ms_activation_weight != 0); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); - ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); + ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); } /* @@ -2790,13 +3122,14 @@ static void metaslab_preload(void *arg) { metaslab_t *msp = arg; - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 
+ metaslab_class_t *mc = msp->ms_group->mg_class; + spa_t *spa = mc->mc_spa; ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); - (void) metaslab_load(msp, spa_syncing_txg(spa)); - msp->ms_selected_txg = spa_syncing_txg(spa); + (void) metaslab_load(msp); + metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_lock); } @@ -3249,12 +3582,19 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being - * forced to condense and it's loaded, we need to let it through. + * forced to condense, it's loaded and we're not beyond the final + * dirty txg, we need to let it through. Not condensing beyond the + * final dirty txg prevents an issue where metaslabs that need to be + * condensed but were loaded for other reasons could cause a panic + * here. By only checking the txg in that branch of the conditional, + * we preserve the utility of the VERIFY statements in all other + * cases. */ if (range_tree_is_empty(alloctree) && range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing) && - !(msp->ms_loaded && msp->ms_condense_wanted)) + !(msp->ms_loaded && msp->ms_condense_wanted && + txg <= spa_final_dirty_txg(spa))) return; @@ -3507,6 +3847,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_tx_commit(tx); } +static void +metaslab_evict(metaslab_t *msp, uint64_t txg) +{ + if (!msp->ms_loaded || msp->ms_disabled != 0) + return; + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_allocating[(txg + t) & TXG_MASK])); + } + if (msp->ms_allocator != -1) + metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); + + if (!metaslab_debug_unload) + metaslab_unload(msp); +} + /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. @@ -3553,7 +3910,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, + &msp->ms_unflushed_frees_by_size, + metaslab_rangesize_compare, 0); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } @@ -3658,41 +4017,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If the metaslab is loaded and we've not tried to load or allocate * from it in 'metaslab_unload_delay' txgs, then we normally unload it. - * However, to prevent thrashing, if the metaslab was recently loaded, - * then instead of unloading it, we increase the unload delay (only up - * to the maximum). 
*/ if (msp->ms_loaded && msp->ms_disabled == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { - if (msp->ms_loaded_txg != 0 && msp->ms_loaded_txg + - metaslab_unload_delay + metaslab_load_window >= txg) { - if (metaslab_unload_delay + metaslab_load_window <= - metaslab_unload_delay_max) { - metaslab_unload_delay += metaslab_load_window; - } - DTRACE_PROBE1(zfs__metaslab__delay__unload, - metaslab_t *, msp); - } else { - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - VERIFY0(range_tree_space( - msp->ms_allocating[(txg + t) & TXG_MASK])); - } - if (msp->ms_allocator != -1) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - } - - if (!metaslab_debug_unload) - metaslab_unload(msp); + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_allocating[(txg + t) & TXG_MASK])); } + if (msp->ms_allocator != -1) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } + + if (!metaslab_debug_unload) + metaslab_unload(msp); } ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); - + msp->ms_allocating_total -= msp->ms_allocated_this_txg; msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } @@ -3946,6 +4292,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); + msp->ms_allocating_total += size; metaslab_verify_space(msp, txg); } @@ -3954,7 +4301,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. 
*/ - msp->ms_max_size = metaslab_block_maxsize(msp); + msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } @@ -3972,7 +4319,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, - zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) + boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, + boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -3982,7 +4330,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; @@ -4024,17 +4372,51 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, return (msp); } +void +metaslab_active_mask_verify(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) + return; + + if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); + VERIFY3S(msp->ms_allocator, !=, -1); + VERIFY(msp->ms_primary); + return; + } + + if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); + VERIFY3S(msp->ms_allocator, !=, -1); + VERIFY(!msp->ms_primary); + return; + } + + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + VERIFY3S(msp->ms_allocator, ==, -1); + return; + } +} + /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; - uint64_t activation_weight; - activation_weight = METASLAB_WEIGHT_PRIMARY; + uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { @@ -4075,15 +4457,37 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (activation_weight == METASLAB_WEIGHT_PRIMARY && mg->mg_primaries[allocator] != NULL) { msp = mg->mg_primaries[allocator]; + + /* + * Even though we don't hold the ms_lock for the + * primary metaslab, those fields should not + * change while we hold the mg_lock. Thus is is + * safe to make assertions on them. + */ + ASSERT(msp->ms_primary); + ASSERT3S(msp->ms_allocator, ==, allocator); + ASSERT(msp->ms_loaded); + was_active = B_TRUE; + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mg->mg_secondaries[allocator] != NULL) { msp = mg->mg_secondaries[allocator]; + + /* + * See comment above about the similar assertions + * for the primary metaslab. 
+ */ + ASSERT(!msp->ms_primary); + ASSERT3S(msp->ms_allocator, ==, allocator); + ASSERT(msp->ms_loaded); + was_active = B_TRUE; + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, zal, search, - &was_active); + want_unique, asize, allocator, try_hard, zal, + search, &was_active); } mutex_exit(&mg->mg_lock); @@ -4091,59 +4495,106 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, kmem_free(search, sizeof (*search)); return (-1ULL); } - mutex_enter(&msp->ms_lock); + + metaslab_active_mask_verify(msp); + + /* + * This code is disabled out because of issues with + * tracepoints in non-gpl kernel modules. + */ +#if 0 + DTRACE_PROBE3(ms__activation__attempt, + metaslab_t *, msp, uint64_t, activation_weight, + boolean_t, was_active); +#endif + /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the - * active status first to see if we need to reselect + * active status first to see if we need to set_selected_txg * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { + ASSERT3S(msp->ms_allocator, ==, -1); mutex_exit(&msp->ms_lock); continue; } /* - * If the metaslab is freshly activated for an allocator that - * isn't the one we're allocating from, or if it's a primary and - * we're seeking a secondary (or vice versa), we go back and - * select a new metaslab. + * If the metaslab was activated for another allocator + * while we were waiting in the ms_lock above, or it's + * a primary and we're seeking a secondary (or vice versa), + * we go back and select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { + ASSERT(msp->ms_loaded); + ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || + msp->ms_allocator != -1); mutex_exit(&msp->ms_lock); continue; } + /* + * This metaslab was used for claiming regions allocated + * by the ZIL during pool import. Once these regions are + * claimed we don't need to keep the CLAIM bit set + * anymore. Passivate this metaslab to zero its activation + * mask. + */ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { + ASSERT(msp->ms_loaded); + ASSERT3S(msp->ms_allocator, ==, -1); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } - if (metaslab_activate(msp, allocator, activation_weight, - txg) != 0) { + metaslab_set_selected_txg(msp, txg); + + int activation_error = + metaslab_activate(msp, allocator, activation_weight); + metaslab_active_mask_verify(msp); + + /* + * If the metaslab was activated by another thread for + * another allocator or activation_weight (EBUSY), or it + * failed because another metaslab was assigned as primary + * for this allocator (EEXIST) we continue using this + * metaslab for our allocation, rather than going on to a + * worse metaslab (we waited for that metaslab to be loaded + * after all). + * + * If the activation failed due to an I/O error or ENOSPC we + * skip to the next metaslab. 
+ */ + boolean_t activated; + if (activation_error == 0) { + activated = B_TRUE; + } else if (activation_error == EBUSY || + activation_error == EEXIST) { + activated = B_FALSE; + } else { mutex_exit(&msp->ms_lock); continue; } - - msp->ms_selected_txg = txg; + ASSERT(msp->ms_loaded); /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The - * the metaslab is now loaded so metaslab_should_allocate() can - * accurately determine if the allocation attempt should + * the metaslab is now loaded so metaslab_should_allocate() + * can accurately determine if the allocation attempt should * proceed. */ - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -4151,8 +4602,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, } /* - * If this metaslab is currently condensing then pick again as - * we can't manipulate this metaslab until it's committed + * If this metaslab is currently condensing then pick again + * as we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. @@ -4160,15 +4611,19 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); + if (activated) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); + if (activated) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } mutex_exit(&msp->ms_lock); continue; } @@ -4178,13 +4633,23 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ - metaslab_segment_may_passivate(msp); + if (activated) + metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); /* + * This code is disabled out because of issues with + * tracepoints in non-gpl kernel modules. + */ +#if 0 + DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, + uint64_t, asize); +#endif + + /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab @@ -4205,14 +4670,33 @@ next: * currently available for allocation and is accurate * even within a sync pass. */ + uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - uint64_t weight = metaslab_block_maxsize(msp); + weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); + } else { + weight = metaslab_weight_from_range_tree(msp); + } + + if (activated) { metaslab_passivate(msp, weight); } else { - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); + /* + * For the case where we use the metaslab that is + * active for another allocator we want to make + * sure that we retain the activation mask. 
+ * + * Note that we could attempt to use something like + * metaslab_recalculate_weight_and_sort() that + * retains the activation mask here. That function + * uses metaslab_weight() to set the weight though + * which is not as accurate as the calculations + * above. + */ + weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; + metaslab_group_sort(mg, msp, weight); } + metaslab_active_mask_verify(msp); /* * We have just failed an allocation attempt, check @@ -4220,7 +4704,7 @@ next: * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize)); + ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } @@ -4231,14 +4715,14 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator); + dva, d, allocator, try_hard); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { @@ -4408,7 +4892,7 @@ top: * allow any metaslab to be used (unique=false). */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator); + !try_hard, dva, d, allocator, try_hard); if (offset != -1ULL) { /* @@ -4731,6 +5215,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); + msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); @@ -4836,7 +5321,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM, txg); + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); /* * No need to fail in that case; someone else has activated the * metaslab, but that doesn't preclude us from using it. 
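For illustration, a minimal sketch of the activation pattern the metaslab.c hunks above establish: metaslab_activate() no longer takes a txg, the selected txg is recorded separately with metaslab_set_selected_txg(), and EBUSY or EEXIST from activation means the metaslab is still usable even though this thread did not activate it. The wrapper name example_try_activate() is hypothetical and not part of the commit; only the calls it makes appear in the diff.

/*
 * Sketch only; mirrors the activation handling shown in the hunks above.
 * Returns B_TRUE and sets *activated when the metaslab may be used,
 * B_FALSE when the caller should skip to the next metaslab.
 */
static boolean_t
example_try_activate(metaslab_t *msp, int allocator,
    uint64_t activation_weight, uint64_t txg, boolean_t *activated)
{
	int error;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/* Record recency before activating so eviction sees this use. */
	metaslab_set_selected_txg(msp, txg);

	error = metaslab_activate(msp, allocator, activation_weight);
	metaslab_active_mask_verify(msp);

	if (error == 0) {
		*activated = B_TRUE;	/* this thread owns the activation */
		return (B_TRUE);
	}
	if (error == EBUSY || error == EEXIST) {
		*activated = B_FALSE;	/* active for another thread/allocator */
		return (B_TRUE);
	}
	return (B_FALSE);		/* I/O error or ENOSPC */
}

When *activated comes back B_FALSE the caller must not drop the activation it does not own, which is why the hunks above guard metaslab_passivate() and metaslab_segment_may_passivate() with the activated flag.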
@@ -4862,10 +5347,20 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, range_tree_clear(msp->ms_trim, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (!multilist_link_active(&msp->ms_class_txg_node)) { + msp->ms_selected_txg = txg; + multilist_sublist_insert_head(mls, msp); + } + multilist_sublist_unlock(mls); + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); + msp->ms_allocating_total += size; } mutex_exit(&msp->ms_lock); @@ -5226,7 +5721,7 @@ metaslab_disable(metaslab_t *msp) } void -metaslab_enable(metaslab_t *msp, boolean_t sync) +metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; @@ -5244,6 +5739,8 @@ metaslab_enable(metaslab_t *msp, boolean_t sync) if (--msp->ms_disabled == 0) { mg->mg_ms_disabled--; cv_broadcast(&mg->mg_ms_disabled_cv); + if (unload) + metaslab_unload(msp); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); diff --git a/usr/src/uts/common/fs/zfs/range_tree.c b/usr/src/uts/common/fs/zfs/range_tree.c index 0ce251126b..92726c3f71 100644 --- a/usr/src/uts/common/fs/zfs/range_tree.c +++ b/usr/src/uts/common/fs/zfs/range_tree.c @@ -525,6 +525,36 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) } /* + * Returns the first subset of the given range which overlaps with the range + * tree. Returns true if there is a segment in the range, and false if there + * isn't. + */ +boolean_t +range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize) +{ + range_seg_t rsearch; + rsearch.rs_start = start; + rsearch.rs_end = start + 1; + + avl_index_t where; + range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where); + if (rs != NULL) { + *ostart = start; + *osize = MIN(size, rs->rs_end - start); + return (B_TRUE); + } + + rs = avl_nearest(&rt->rt_root, where, AVL_AFTER); + if (rs == NULL || rs->rs_start > start + size) + return (B_FALSE); + + *ostart = rs->rs_start; + *osize = MIN(start + size, rs->rs_end) - rs->rs_start; + return (B_TRUE); +} + +/* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree. 
*/ diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index c213c860bd..054e773b3f 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -8618,6 +8618,10 @@ spa_sync(spa_t *spa, uint64_t txg) while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); + + metaslab_class_evict_old(spa->spa_normal_class, txg); + metaslab_class_evict_old(spa->spa_log_class, txg); + spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); diff --git a/usr/src/uts/common/fs/zfs/spa_log_spacemap.c b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c index bbb6eda845..e0c369d13c 100644 --- a/usr/src/uts/common/fs/zfs/spa_log_spacemap.c +++ b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c @@ -1191,7 +1191,8 @@ out: metaslab_unflushed_changes_memused(m); if (metaslab_debug_load && m->ms_sm != NULL) { - VERIFY0(metaslab_load(m, spa_syncing_txg(spa))); + VERIFY0(metaslab_load(m)); + metaslab_set_selected_txg(m, 0); } mutex_exit(&m->ms_lock); } diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index f636d3dcf2..1ef3bb79ca 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -236,6 +236,7 @@ void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); +uint64_t arc_all_memory(void); uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index 10705a84bc..069c5ab79a 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -56,7 +56,7 @@ uint64_t metaslab_estimated_condensed_size(metaslab_t *); int metaslab_sort_by_flushed(const void *, const void *); uint64_t metaslab_unflushed_changes_memused(metaslab_t *); -int metaslab_load(metaslab_t *, uint64_t); +int metaslab_load(metaslab_t *); void metaslab_unload(metaslab_t *); boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *); @@ -65,7 +65,7 @@ uint64_t metaslab_allocated_space(metaslab_t *); void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); -uint64_t metaslab_block_maxsize(metaslab_t *); +uint64_t metaslab_largest_allocatable(metaslab_t *); /* * metaslab alloc flags @@ -107,7 +107,7 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *); boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); - +void metaslab_class_evict_old(metaslab_class_t *, uint64_t); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); @@ -130,7 +130,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); -void metaslab_enable(metaslab_t *, boolean_t); +void metaslab_enable(metaslab_t *, boolean_t, boolean_t); +void metaslab_set_selected_txg(metaslab_t *, uint64_t); extern int metaslab_debug_load; diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index 5920b3113c..a413eef490 100644 --- 
a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -36,6 +36,7 @@ #include <sys/vdev.h> #include <sys/txg.h> #include <sys/avl.h> +#include <sys/multilist.h> #ifdef __cplusplus extern "C" { @@ -194,6 +195,12 @@ struct metaslab_class { uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + + /* + * List of all loaded metaslabs in the class, sorted in order of most + * recent use. + */ + multilist_t *mc_metaslab_txg_list; }; /* @@ -387,6 +394,7 @@ struct metaslab { range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; uint64_t ms_allocated_this_txg; + uint64_t ms_allocating_total; /* * The following range trees are accessed only from syncing context. @@ -484,7 +492,13 @@ struct metaslab { * stay cached. */ uint64_t ms_selected_txg; - uint64_t ms_loaded_txg; /* track when metaslab was loaded */ + /* + * ms_load/unload_time can be used for performance monitoring + * (e.g. by dtrace or mdb). + */ + hrtime_t ms_load_time; /* time last loaded */ + hrtime_t ms_unload_time; /* time last unloaded */ + hrtime_t ms_selected_time; /* time last allocated from */ uint64_t ms_max_size; /* maximum allocatable size */ @@ -504,12 +518,17 @@ struct metaslab { * segment sizes. */ avl_tree_t ms_allocatable_by_size; + avl_tree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ + /* + * Node in metaslab class's selected txg list + */ + multilist_node_t ms_class_txg_node; /* * Allocs and frees that are committed to the vdev log spacemap but diff --git a/usr/src/uts/common/fs/zfs/sys/range_tree.h b/usr/src/uts/common/fs/zfs/sys/range_tree.h index d450ff7f16..716aaf3b90 100644 --- a/usr/src/uts/common/fs/zfs/sys/range_tree.h +++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h @@ -88,6 +88,8 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); diff --git a/usr/src/uts/common/fs/zfs/vdev_initialize.c b/usr/src/uts/common/fs/zfs/vdev_initialize.c index af18983c44..2079df133c 100644 --- a/usr/src/uts/common/fs/zfs/vdev_initialize.c +++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -350,7 +350,7 @@ vdev_initialize_calculate_progress(vdev_t *vd) * metaslab. Load it and walk the free tree for more accurate * progress estimation. 
*/ - VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa))); + VERIFY0(metaslab_load(msp)); for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { @@ -474,6 +474,7 @@ vdev_initialize_thread(void *arg) for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + boolean_t unload_when_done = B_FALSE; /* * If we've expanded the top-level vdev or it's our @@ -487,14 +488,16 @@ vdev_initialize_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp, spa_syncing_txg(spa))); + if (!msp->ms_loaded && !msp->ms_loading) + unload_when_done = B_TRUE; + VERIFY0(metaslab_load(msp)); range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, vd); mutex_exit(&msp->ms_lock); error = vdev_initialize_ranges(vd, deadbeef); - metaslab_enable(msp, B_TRUE); + metaslab_enable(msp, B_TRUE, unload_when_done); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); diff --git a/usr/src/uts/common/fs/zfs/vdev_trim.c b/usr/src/uts/common/fs/zfs/vdev_trim.c index a60d11814b..4be11bcb51 100644 --- a/usr/src/uts/common/fs/zfs/vdev_trim.c +++ b/usr/src/uts/common/fs/zfs/vdev_trim.c @@ -622,7 +622,7 @@ vdev_trim_calculate_progress(vdev_t *vd) * metaslab. Load it and walk the free tree for more * accurate progress estimation. */ - VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa))); + VERIFY0(metaslab_load(msp)); for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { @@ -730,7 +730,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) */ if (zfs_flags & ZFS_DEBUG_TRIM) { metaslab_t *msp = ta->trim_msp; - VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa))); + VERIFY0(metaslab_load(msp)); VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY(range_tree_find(msp->ms_allocatable, start, size)); } @@ -842,7 +842,7 @@ vdev_trim_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp, spa_syncing_txg(spa))); + VERIFY0(metaslab_load(msp)); /* * If a partial TRIM was requested skip metaslabs which have @@ -850,7 +850,7 @@ vdev_trim_thread(void *arg) */ if (msp->ms_sm == NULL && vd->vdev_trim_partial) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_calculate_progress(vd); continue; @@ -862,7 +862,7 @@ vdev_trim_thread(void *arg) mutex_exit(&msp->ms_lock); error = vdev_trim_ranges(&ta); - metaslab_enable(msp, B_TRUE); + metaslab_enable(msp, B_TRUE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(ta.trim_tree, NULL, NULL); @@ -1167,7 +1167,7 @@ vdev_autotrim_thread(void *arg) if (msp->ms_sm == NULL || range_tree_is_empty(msp->ms_trim)) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); continue; } @@ -1183,7 +1183,7 @@ vdev_autotrim_thread(void *arg) */ if (msp->ms_disabled > 1) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); continue; } @@ -1291,8 +1291,7 @@ vdev_autotrim_thread(void *arg) */ if (zfs_flags & ZFS_DEBUG_TRIM) { mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp, - spa_syncing_txg(spa))); + VERIFY0(metaslab_load(msp)); 
VERIFY3P(tap[0].trim_msp, ==, msp); range_tree_walk(trim_tree, vdev_trim_range_verify, &tap[0]); @@ -1302,7 +1301,7 @@ vdev_autotrim_thread(void *arg) range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); - metaslab_enable(msp, issued_trim); + metaslab_enable(msp, issued_trim, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 72e18d5305..b24d83496c 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -5706,7 +5706,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) objset_t *os = NULL; int error; - error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 3697d888e7..656c598e53 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -1716,10 +1716,8 @@ mac_srs_create_proto_softrings(int id, uint16_t type, pri_t pri, bzero(&mrf, sizeof (mac_rx_fifo_t)); mrf.mrf_type = MAC_RX_FIFO; mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll; - mrf.mrf_intr_enable = - (mac_intr_enable_t)mac_soft_ring_intr_enable; - mrf.mrf_intr_disable = - (mac_intr_disable_t)mac_soft_ring_intr_disable; + mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable; + mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable; mrf.mrf_flow_priority = pri; softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait, diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index 4655631dc1..c8a16e6fd3 100644 --- a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -494,7 +494,7 @@ done: * Enabling is allow the processing thread to send packets to the * client while disabling does the opposite. 
*/ -void +int mac_soft_ring_intr_enable(void *arg) { mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg; @@ -503,6 +503,7 @@ mac_soft_ring_intr_enable(void *arg) if (ringp->s_ring_first != NULL) mac_soft_ring_worker_wakeup(ringp); mutex_exit(&ringp->s_ring_lock); + return (0); } boolean_t diff --git a/usr/src/uts/common/io/usb/usba/usbai.c b/usr/src/uts/common/io/usb/usba/usbai.c index f6ac391bd8..e1a6b4dfcd 100644 --- a/usr/src/uts/common/io/usb/usba/usbai.c +++ b/usr/src/uts/common/io/usb/usba/usbai.c @@ -1040,7 +1040,7 @@ usb_register_hotplug_cbs(dev_info_t *dip, } } if (ddi_add_event_handler(dip, usba_device->rm_cookie, - (peh_t)disconnect_event_handler, + (peh_t)(uintptr_t)disconnect_event_handler, NULL, &evdata->ev_rm_cb_id) != DDI_SUCCESS) { USB_DPRINTF_L2(DPRINT_MASK_USBAI, usbai_log_handle, "usb_register_hotplug_cbs: add disconnect handler failed"); @@ -1058,7 +1058,7 @@ usb_register_hotplug_cbs(dev_info_t *dip, } } if (ddi_add_event_handler(dip, usba_device->ins_cookie, - (peh_t)reconnect_event_handler, + (peh_t)(uintptr_t)reconnect_event_handler, NULL, &evdata->ev_ins_cb_id) != DDI_SUCCESS) { USB_DPRINTF_L2(DPRINT_MASK_USBAI, usbai_log_handle, "usb_register_hotplug_cbs: add reconnect handler failed"); @@ -1129,7 +1129,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, } } if (ddi_add_event_handler(dip, usba_device->rm_cookie, - (peh_t)usb_evdata->disconnect_event_handler, + (peh_t)(uintptr_t)usb_evdata->disconnect_event_handler, NULL, &evdata->ev_rm_cb_id) != DDI_SUCCESS) { goto fail; @@ -1144,7 +1144,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, } } if (ddi_add_event_handler(dip, usba_device->ins_cookie, - (peh_t)usb_evdata->reconnect_event_handler, + (peh_t)(uintptr_t)usb_evdata->reconnect_event_handler, NULL, &evdata->ev_ins_cb_id) != DDI_SUCCESS) { goto fail; @@ -1159,7 +1159,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, } } if (ddi_add_event_handler(dip, usba_device->resume_cookie, - (peh_t)usb_evdata->post_resume_event_handler, + (peh_t)(uintptr_t)usb_evdata->post_resume_event_handler, NULL, &evdata->ev_resume_cb_id) != DDI_SUCCESS) { goto fail; @@ -1174,7 +1174,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, } } if (ddi_add_event_handler(dip, usba_device->suspend_cookie, - (peh_t)usb_evdata->pre_suspend_event_handler, + (peh_t)(uintptr_t)usb_evdata->pre_suspend_event_handler, NULL, &evdata->ev_suspend_cb_id) != DDI_SUCCESS) { goto fail; diff --git a/usr/src/uts/common/sys/mac_soft_ring.h b/usr/src/uts/common/sys/mac_soft_ring.h index 581e18d06e..5a41899e60 100644 --- a/usr/src/uts/common/sys/mac_soft_ring.h +++ b/usr/src/uts/common/sys/mac_soft_ring.h @@ -691,7 +691,7 @@ extern void mac_srs_update_drv(struct mac_client_impl_s *); extern void mac_update_srs_priority(mac_soft_ring_set_t *, pri_t); extern void mac_client_update_classifier(mac_client_impl_t *, boolean_t); -extern void mac_soft_ring_intr_enable(void *); +extern int mac_soft_ring_intr_enable(void *); extern boolean_t mac_soft_ring_intr_disable(void *); extern mac_soft_ring_t *mac_soft_ring_create(int, clock_t, uint16_t, pri_t, mac_client_impl_t *, mac_soft_ring_set_t *, diff --git a/usr/src/uts/common/xen/io/evtchn_dev.c b/usr/src/uts/common/xen/io/evtchn_dev.c index b4ba63b436..7a8d50eb33 100644 --- a/usr/src/uts/common/xen/io/evtchn_dev.c +++ b/usr/src/uts/common/xen/io/evtchn_dev.c @@ -112,8 +112,8 @@ static int evtchndrv_detach(dev_info_t *, ddi_detach_cmd_t); static struct evtsoftdata *port_user[NR_EVENT_CHANNELS]; static 
kmutex_t port_user_lock; -void -evtchn_device_upcall() +uint_t +evtchn_device_upcall(caddr_t arg __unused, caddr_t arg1 __unused) { struct evtsoftdata *ep; int port; @@ -154,6 +154,7 @@ evtchn_device_upcall() done: mutex_exit(&port_user_lock); + return (DDI_INTR_CLAIMED); } /* ARGSUSED */ diff --git a/usr/src/uts/i86pc/io/immu_intrmap.c b/usr/src/uts/i86pc/io/immu_intrmap.c index ab9f9bcbe7..737eed2efa 100644 --- a/usr/src/uts/i86pc/io/immu_intrmap.c +++ b/usr/src/uts/i86pc/io/immu_intrmap.c @@ -63,7 +63,7 @@ typedef struct intrmap_rte { (p)) typedef enum { - SVT_NO_VERIFY = 0, /* no verification */ + SVT_NO_VERIFY = 0, /* no verification */ SVT_ALL_VERIFY, /* using sid and sq to verify */ SVT_BUS_VERIFY, /* verify #startbus and #endbus */ SVT_RSVD @@ -224,7 +224,7 @@ bitset_find_multi_free(bitset_t *b, uint_t post, uint_t count) } } - return (INTRMAP_IDX_FULL); /* no free index */ + return (INTRMAP_IDX_FULL); /* no free index */ } /* alloc one interrupt remapping table entry */ @@ -495,11 +495,12 @@ intrmap_enable(immu_t *immu) /* * immu_intr_handler() - * the fault event handler for a single immu unit + * the fault event handler for a single immu unit */ -int -immu_intr_handler(immu_t *immu) +uint_t +immu_intr_handler(caddr_t arg, caddr_t arg1 __unused) { + immu_t *immu = (immu_t *)arg; uint32_t status; int index, fault_reg_offset; int max_fault_index; @@ -995,10 +996,10 @@ immu_intr_register(immu_t *immu) "%s-intr-handler", immu->immu_name); (void) add_avintr((void *)NULL, IMMU_INTR_IPL, - (avfunc)(immu_intr_handler), intr_handler_name, irq, + immu_intr_handler, intr_handler_name, irq, (caddr_t)immu, NULL, NULL, NULL); immu_regs_intr_enable(immu, msi_addr, msi_data, uaddr); - (void) immu_intr_handler(immu); + (void) immu_intr_handler((caddr_t)immu, NULL); } diff --git a/usr/src/uts/i86pc/io/immu_regs.c b/usr/src/uts/i86pc/io/immu_regs.c index dc43b0f49a..d6b184416a 100644 --- a/usr/src/uts/i86pc/io/immu_regs.c +++ b/usr/src/uts/i86pc/io/immu_regs.c @@ -253,7 +253,7 @@ gaw2agaw(int gaw) /* * set_immu_agaw() - * calculate agaw for a IOMMU unit + * calculate agaw for a IOMMU unit */ static int set_agaw(immu_t *immu) @@ -481,7 +481,7 @@ immu_regs_resume(immu_t *immu) immu_regs_intr_enable(immu, immu->immu_regs_intr_msi_addr, immu->immu_regs_intr_msi_data, immu->immu_regs_intr_uaddr); - (void) immu_intr_handler(immu); + (void) immu_intr_handler((caddr_t)immu, NULL); immu_regs_intrmap_enable(immu, immu->immu_intrmap_irta_reg); @@ -638,7 +638,7 @@ immu_regs_wbf_flush(immu_t *immu) /* * immu_regs_cpu_flush() - * flush the cpu cache line after CPU memory writes, so + * flush the cpu cache line after CPU memory writes, so * IOMMU can see the writes */ void diff --git a/usr/src/uts/i86pc/sys/immu.h b/usr/src/uts/i86pc/sys/immu.h index 70193d26e6..22ae9ad3bf 100644 --- a/usr/src/uts/i86pc/sys/immu.h +++ b/usr/src/uts/i86pc/sys/immu.h @@ -130,11 +130,11 @@ typedef struct drhd { kmutex_t dr_lock; /* protects the dmar field */ struct immu *dr_immu; dev_info_t *dr_dip; - uint16_t dr_seg; - uint64_t dr_regs; + uint16_t dr_seg; + uint64_t dr_regs; boolean_t dr_include_all; - list_t dr_scope_list; - list_node_t dr_node; + list_t dr_scope_list; + list_node_t dr_node; } drhd_t; typedef struct rmrr { @@ -638,7 +638,7 @@ typedef struct immu { * Enough space to hold the decimal number of any device instance. * Used for device/cache names. 
*/ -#define IMMU_ISTRLEN 11 /* log10(2^31) + 1 */ +#define IMMU_ISTRLEN 11 /* log10(2^31) + 1 */ /* properties that control DVMA */ #define DDI_DVMA_MAPTYPE_ROOTNEX_PROP "immu-dvma-mapping" @@ -677,7 +677,7 @@ typedef struct domain { /* list node for list of domains off immu */ list_node_t dom_immu_node; - mod_hash_t *dom_cookie_hash; + mod_hash_t *dom_cookie_hash; /* topmost device in domain; usually the device itself (non-shared) */ dev_info_t *dom_dip; @@ -944,7 +944,7 @@ void immu_intrmap_destroy(list_t *immu_list); /* registers interrupt handler for IOMMU unit */ void immu_intr_register(immu_t *immu); -int immu_intr_handler(immu_t *immu); +uint_t immu_intr_handler(caddr_t, caddr_t); /* immu_qinv.c interfaces */ diff --git a/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c b/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c index 8bc46f8e3e..a7745fd3f2 100644 --- a/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c +++ b/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c @@ -74,7 +74,7 @@ static gcpu_poll_trace_ctl_t gcpu_xpv_poll_trace_ctl; #define GCPU_XPV_MCH_POLL_NO_REARM NULL static uint_t -gcpu_xpv_virq_intr(void) +gcpu_xpv_virq_intr(caddr_t arg __unused, caddr_t arg1 __unused) { int types[] = { XEN_MC_URGENT, XEN_MC_NONURGENT }; uint64_t fetch_id; @@ -194,7 +194,7 @@ gcpu_mca_poll_start(cmi_hdl_t hdl) */ gcpu_xpv_virq_vect = ec_bind_virq_to_irq(VIRQ_MCA, 0); (void) add_avintr(NULL, gcpu_xpv_virq_level, - (avfunc)gcpu_xpv_virq_intr, "MCA", gcpu_xpv_virq_vect, + gcpu_xpv_virq_intr, "MCA", gcpu_xpv_virq_vect, NULL, NULL, NULL, NULL); } } diff --git a/usr/src/uts/i86xpv/io/psm/xpv_psm.c b/usr/src/uts/i86xpv/io/psm/xpv_psm.c index bc0ab7748d..94308c3f2f 100644 --- a/usr/src/uts/i86xpv/io/psm/xpv_psm.c +++ b/usr/src/uts/i86xpv/io/psm/xpv_psm.c @@ -223,14 +223,13 @@ xen_psm_hrtimeinit(void) } /* xen_psm NMI handler */ -/*ARGSUSED*/ -static void -xen_psm_nmi_intr(caddr_t arg, struct regs *rp) +static uint_t +xen_psm_nmi_intr(caddr_t arg __unused, caddr_t arg1 __unused) { xen_psm_num_nmis++; if (!lock_try(&xen_psm_nmi_lock)) - return; + return (DDI_INTR_UNCLAIMED); if (xen_psm_kmdb_on_nmi && psm_debugger()) { debug_enter("NMI received: entering kmdb\n"); @@ -247,6 +246,7 @@ xen_psm_nmi_intr(caddr_t arg, struct regs *rp) } lock_clear(&xen_psm_nmi_lock); + return (DDI_INTR_CLAIMED); } static void @@ -294,7 +294,7 @@ xen_psm_picinit() /* add nmi handler - least priority nmi handler */ LOCK_INIT_CLEAR(&xen_psm_nmi_lock); - if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr, + if (!psm_add_nmintr(0, xen_psm_nmi_intr, "xVM_psm NMI handler", (caddr_t)NULL)) cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler"); } |
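The interrupt-handler hunks above (evtchn_device_upcall(), immu_intr_handler(), gcpu_xpv_virq_intr(), xen_psm_nmi_intr()) all converge on the common uint_t (*)(caddr_t, caddr_t) shape, returning DDI_INTR_CLAIMED or DDI_INTR_UNCLAIMED, so the (avfunc) casts at the add_avintr() and psm_add_nmintr() call sites can be dropped. A minimal sketch of that pattern follows, assuming a hypothetical driver; the example_* names, soft-state type, IPL, and vector are placeholders, not part of the commit.

/* Hypothetical driver state and helpers, for the sketch only. */
typedef struct example_softc example_softc_t;
extern boolean_t example_intr_pending(example_softc_t *);
extern void example_intr_service(example_softc_t *);

/*
 * Sketch only: a handler in the signature the hunks above use.
 * Unused arguments are tagged __unused instead of relying on ARGSUSED.
 */
static uint_t
example_intr(caddr_t arg, caddr_t arg1 __unused)
{
	example_softc_t *sc = (example_softc_t *)arg;

	if (!example_intr_pending(sc))
		return (DDI_INTR_UNCLAIMED);

	example_intr_service(sc);
	return (DDI_INTR_CLAIMED);
}

static void
example_intr_register(example_softc_t *sc, int ipl, int vect)
{
	/* No (avfunc) cast is needed once the handler has the avintr type. */
	(void) add_avintr(NULL, ipl, example_intr, "example_intr", vect,
	    (caddr_t)sc, NULL, NULL, NULL);
}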