author     Jerry Jelinek <jerry.jelinek@joyent.com>    2019-09-26 12:34:03 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>    2019-09-26 12:34:03 +0000
commit     2b56e6362d6c66c3c0019a24349c436c2cd162ba (patch)
tree       4f35e286b6fc5ed0eda0cd43d33ce54fc15ebaf0
parent     3105c6ff4e5cab926dc4802a7e10eee1f4abbec4 (diff)
parent     814dcd43c3de9925fd6226c256e4d4327841a0e1 (diff)
download   illumos-joyent-2b56e6362d6c66c3c0019a24349c436c2cd162ba.tar.gz
[illumos-gate merge]
commit 814dcd43c3de9925fd6226c256e4d4327841a0e1
11557 Log Spacemap Project
commit c4e4d4102c8a8c2cc936dd971bdafe4ec52fd4cf
11747 zpool iostat -v no longer shows titles for log/bias sections
Conflicts:
usr/src/uts/common/fs/zfs/sys/metaslab.h
usr/src/uts/common/fs/zfs/metaslab.c
42 files changed, 3493 insertions(+), 483 deletions(-)
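
The new observability added by this merge can be exercised roughly as sketched
below. This is an illustrative sketch only: the pool name "tank" and the mdb
target addresses are placeholders, not part of the change.

    # Check the new pool feature (read-only compatible; becomes active as soon
    # as it is enabled and depends on com.delphix:spacemap_v2).
    zpool get feature@log_spacemap tank

    # 'zdb -m' now also prints the pool's log space maps and their obsolete
    # entry statistics.
    zdb -m tank | grep "Log Spacemap object"

    # New mdb dcmds added to the zfs module: per-txg log space map statistics
    # for a spa_t, and a dump of the segments in a range_tree_t.
    #   <spa_addr>::logsm_stats
    #   <range_tree_addr>::range_tree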
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index c536a0d399..2c32e1a191 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -1465,6 +1465,9 @@ spa_print_config(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) typedef struct mdb_range_tree { + struct { + uint64_t avl_numnodes; + } rt_root; uint64_t rt_space; } mdb_range_tree_t; @@ -1486,6 +1489,8 @@ typedef struct mdb_metaslab { uintptr_t ms_freeing; uintptr_t ms_freed; uintptr_t ms_allocatable; + uintptr_t ms_unflushed_frees; + uintptr_t ms_unflushed_allocs; uintptr_t ms_sm; } mdb_metaslab_t; @@ -1501,12 +1506,23 @@ typedef struct mdb_space_map { } mdb_space_map_t; typedef struct mdb_vdev { - uintptr_t vdev_path; - uintptr_t vdev_ms; + uint64_t vdev_id; + uint64_t vdev_state; uintptr_t vdev_ops; + struct { + uint64_t vs_aux; + uint64_t vs_ops[VS_ZIO_TYPES]; + uint64_t vs_bytes[VS_ZIO_TYPES]; + uint64_t vs_read_errors; + uint64_t vs_write_errors; + uint64_t vs_checksum_errors; + } vdev_stat; + uintptr_t vdev_child; + uint64_t vdev_children; uint64_t vdev_ms_count; - uint64_t vdev_id; - vdev_stat_t vdev_stat; + uintptr_t vdev_mg; + uintptr_t vdev_ms; + uintptr_t vdev_path; } mdb_vdev_t; typedef struct mdb_vdev_ops { @@ -1514,37 +1530,31 @@ typedef struct mdb_vdev_ops { } mdb_vdev_ops_t; static int -metaslab_stats(uintptr_t addr, int spa_flags) +metaslab_stats(mdb_vdev_t *vd, int spa_flags) { - mdb_vdev_t vdev; - uintptr_t *vdev_ms; - - if (mdb_ctf_vread(&vdev, "vdev_t", "mdb_vdev_t", - (uintptr_t)addr, 0) == -1) { - mdb_warn("failed to read vdev at %p\n", addr); - return (DCMD_ERR); - } - mdb_inc_indent(4); - mdb_printf("%<u>%-?s %6s %20s %10s %9s%</u>\n", "ADDR", "ID", - "OFFSET", "FREE", "FRAGMENTATION"); + mdb_printf("%<u>%-?s %6s %20s %10s %10s %10s%</u>\n", "ADDR", "ID", + "OFFSET", "FREE", "FRAG", "UCMU"); - vdev_ms = mdb_alloc(vdev.vdev_ms_count * sizeof (void *), + uintptr_t *vdev_ms = mdb_alloc(vd->vdev_ms_count * sizeof (vdev_ms), UM_SLEEP | UM_GC); - if (mdb_vread(vdev_ms, vdev.vdev_ms_count * sizeof (void *), - (uintptr_t)vdev.vdev_ms) == -1) { - mdb_warn("failed to read vdev_ms at %p\n", vdev.vdev_ms); + if (mdb_vread(vdev_ms, vd->vdev_ms_count * sizeof (uintptr_t), + vd->vdev_ms) == -1) { + mdb_warn("failed to read vdev_ms at %p\n", vd->vdev_ms); return (DCMD_ERR); } - for (int m = 0; m < vdev.vdev_ms_count; m++) { + for (int m = 0; m < vd->vdev_ms_count; m++) { mdb_metaslab_t ms; mdb_space_map_t sm = { 0 }; - mdb_space_map_phys_t smp; + mdb_space_map_phys_t smp = { 0 }; + mdb_range_tree_t rt; + uint64_t uallocs, ufrees, raw_free, raw_uchanges_mem; char free[MDB_NICENUM_BUFLEN]; + char uchanges_mem[MDB_NICENUM_BUFLEN]; if (mdb_ctf_vread(&ms, "metaslab_t", "mdb_metaslab_t", - (uintptr_t)vdev_ms[m], 0) == -1) + vdev_ms[m], 0) == -1) return (DCMD_ERR); if (ms.ms_sm != 0 && @@ -1552,25 +1562,40 @@ metaslab_stats(uintptr_t addr, int spa_flags) ms.ms_sm, 0) == -1) return (DCMD_ERR); - if (sm.sm_phys != 0) { + if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", + ms.ms_unflushed_frees, 0) == -1) + return (DCMD_ERR); + ufrees = rt.rt_space; + raw_uchanges_mem = rt.rt_root.avl_numnodes * + mdb_ctf_sizeof_by_name("range_seg_t"); + + if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", + ms.ms_unflushed_allocs, 0) == -1) + return (DCMD_ERR); + uallocs = rt.rt_space; + raw_uchanges_mem += rt.rt_root.avl_numnodes * + mdb_ctf_sizeof_by_name("range_seg_t"); + mdb_nicenum(raw_uchanges_mem, uchanges_mem); + + 
raw_free = ms.ms_size; + if (ms.ms_sm != 0 && sm.sm_phys != 0) { (void) mdb_ctf_vread(&smp, "space_map_phys_t", "mdb_space_map_phys_t", sm.sm_phys, 0); - mdb_nicenum(ms.ms_size - smp.smp_alloc, free); - } else { - (void) mdb_snprintf(free, MDB_NICENUM_BUFLEN, "-"); + raw_free -= smp.smp_alloc; } + raw_free += ufrees - uallocs; + mdb_nicenum(raw_free, free); mdb_printf("%0?p %6llu %20llx %10s ", vdev_ms[m], ms.ms_id, ms.ms_start, free); if (ms.ms_fragmentation == ZFS_FRAG_INVALID) - mdb_printf("%9s\n", "-"); + mdb_printf("%9s ", "-"); else - mdb_printf("%9llu%%\n", ms.ms_fragmentation); - - if ((spa_flags & SPA_FLAG_HISTOGRAMS) && ms.ms_sm != 0) { - if (sm.sm_phys == 0) - continue; + mdb_printf("%9llu%% ", ms.ms_fragmentation); + mdb_printf("%10s\n", uchanges_mem); + if ((spa_flags & SPA_FLAG_HISTOGRAMS) && ms.ms_sm != 0 && + sm.sm_phys != 0) { dump_histogram(smp.smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm.sm_shift); } @@ -1580,21 +1605,56 @@ metaslab_stats(uintptr_t addr, int spa_flags) } static int -metaslab_group_stats(uintptr_t addr, int spa_flags) +metaslab_group_stats(mdb_vdev_t *vd, int spa_flags) { mdb_metaslab_group_t mg; if (mdb_ctf_vread(&mg, "metaslab_group_t", "mdb_metaslab_group_t", - (uintptr_t)addr, 0) == -1) { - mdb_warn("failed to read vdev_mg at %p\n", addr); + vd->vdev_mg, 0) == -1) { + mdb_warn("failed to read vdev_mg at %p\n", vd->vdev_mg); return (DCMD_ERR); } mdb_inc_indent(4); - mdb_printf("%<u>%-?s %15s%</u>\n", "ADDR", "FRAGMENTATION"); + mdb_printf("%<u>%-?s %7s %9s%</u>\n", "ADDR", "FRAG", "UCMU"); + if (mg.mg_fragmentation == ZFS_FRAG_INVALID) - mdb_printf("%0?p %15s\n", addr, "-"); + mdb_printf("%0?p %6s\n", vd->vdev_mg, "-"); else - mdb_printf("%0?p %15llu%%\n", addr, mg.mg_fragmentation); + mdb_printf("%0?p %6llu%%", vd->vdev_mg, mg.mg_fragmentation); + + + uintptr_t *vdev_ms = mdb_alloc(vd->vdev_ms_count * sizeof (vdev_ms), + UM_SLEEP | UM_GC); + if (mdb_vread(vdev_ms, vd->vdev_ms_count * sizeof (uintptr_t), + vd->vdev_ms) == -1) { + mdb_warn("failed to read vdev_ms at %p\n", vd->vdev_ms); + return (DCMD_ERR); + } + + uint64_t raw_uchanges_mem = 0; + char uchanges_mem[MDB_NICENUM_BUFLEN]; + for (int m = 0; m < vd->vdev_ms_count; m++) { + mdb_metaslab_t ms; + mdb_range_tree_t rt; + + if (mdb_ctf_vread(&ms, "metaslab_t", "mdb_metaslab_t", + vdev_ms[m], 0) == -1) + return (DCMD_ERR); + + if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", + ms.ms_unflushed_frees, 0) == -1) + return (DCMD_ERR); + raw_uchanges_mem += + rt.rt_root.avl_numnodes * sizeof (range_seg_t); + + if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", + ms.ms_unflushed_allocs, 0) == -1) + return (DCMD_ERR); + raw_uchanges_mem += + rt.rt_root.avl_numnodes * sizeof (range_seg_t); + } + mdb_nicenum(raw_uchanges_mem, uchanges_mem); + mdb_printf("%10s\n", uchanges_mem); if (spa_flags & SPA_FLAG_HISTOGRAMS) dump_histogram(mg.mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); @@ -1618,33 +1678,28 @@ static int do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, int spa_flags) { - vdev_t vdev; - char desc[MAXNAMELEN]; - int c, children; - uintptr_t *child; - const char *state, *aux; - - if (mdb_vread(&vdev, sizeof (vdev), (uintptr_t)addr) == -1) { - mdb_warn("failed to read vdev_t at %p\n", (uintptr_t)addr); + mdb_vdev_t vd; + if (mdb_ctf_vread(&vd, "vdev_t", "mdb_vdev_t", + (uintptr_t)addr, 0) == -1) return (DCMD_ERR); - } if (flags & DCMD_PIPE_OUT) { mdb_printf("%#lr\n", addr); } else { - if (vdev.vdev_path != NULL) { + char desc[MAXNAMELEN]; + if (vd.vdev_path != 0) { 
if (mdb_readstr(desc, sizeof (desc), - (uintptr_t)vdev.vdev_path) == -1) { + (uintptr_t)vd.vdev_path) == -1) { mdb_warn("failed to read vdev_path at %p\n", - vdev.vdev_path); + vd.vdev_path); return (DCMD_ERR); } - } else if (vdev.vdev_ops != NULL) { + } else if (vd.vdev_ops != 0) { vdev_ops_t ops; if (mdb_vread(&ops, sizeof (ops), - (uintptr_t)vdev.vdev_ops) == -1) { + (uintptr_t)vd.vdev_ops) == -1) { mdb_warn("failed to read vdev_ops at %p\n", - vdev.vdev_ops); + vd.vdev_ops); return (DCMD_ERR); } (void) strcpy(desc, ops.vdev_op_type); @@ -1660,7 +1715,8 @@ do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, mdb_printf("%0?p ", addr); - switch (vdev.vdev_state) { + const char *state, *aux; + switch (vd.vdev_state) { case VDEV_STATE_CLOSED: state = "CLOSED"; break; @@ -1687,7 +1743,7 @@ do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, break; } - switch (vdev.vdev_stat.vs_aux) { + switch (vd.vdev_stat.vs_aux) { case VDEV_AUX_NONE: aux = "-"; break; @@ -1747,7 +1803,6 @@ do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, mdb_printf("%-9s %-12s %*s%s\n", state, aux, depth, "", desc); if (spa_flags & SPA_FLAG_ERRORS) { - vdev_stat_t *vs = &vdev.vdev_stat; int i; mdb_inc_indent(4); @@ -1756,48 +1811,50 @@ do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, "%12s%</u>\n", "READ", "WRITE", "FREE", "CLAIM", "IOCTL"); mdb_printf("OPS "); - for (i = 1; i < ZIO_TYPES; i++) - mdb_printf("%11#llx%s", vs->vs_ops[i], - i == ZIO_TYPES - 1 ? "" : " "); + for (i = 1; i < VS_ZIO_TYPES; i++) + mdb_printf("%11#llx%s", + vd.vdev_stat.vs_ops[i], + i == VS_ZIO_TYPES - 1 ? "" : " "); mdb_printf("\n"); mdb_printf("BYTES "); - for (i = 1; i < ZIO_TYPES; i++) - mdb_printf("%11#llx%s", vs->vs_bytes[i], - i == ZIO_TYPES - 1 ? "" : " "); + for (i = 1; i < VS_ZIO_TYPES; i++) + mdb_printf("%11#llx%s", + vd.vdev_stat.vs_bytes[i], + i == VS_ZIO_TYPES - 1 ? 
"" : " "); mdb_printf("\n"); - mdb_printf("EREAD %10#llx\n", vs->vs_read_errors); - mdb_printf("EWRITE %10#llx\n", vs->vs_write_errors); + mdb_printf("EREAD %10#llx\n", + vd.vdev_stat.vs_read_errors); + mdb_printf("EWRITE %10#llx\n", + vd.vdev_stat.vs_write_errors); mdb_printf("ECKSUM %10#llx\n", - vs->vs_checksum_errors); + vd.vdev_stat.vs_checksum_errors); mdb_dec_indent(4); mdb_printf("\n"); } - if (spa_flags & SPA_FLAG_METASLAB_GROUPS && - vdev.vdev_mg != NULL) { - metaslab_group_stats((uintptr_t)vdev.vdev_mg, - spa_flags); + if ((spa_flags & SPA_FLAG_METASLAB_GROUPS) && + vd.vdev_mg != 0) { + metaslab_group_stats(&vd, spa_flags); } - if (spa_flags & SPA_FLAG_METASLABS && vdev.vdev_ms != NULL) { - metaslab_stats((uintptr_t)addr, spa_flags); + if ((spa_flags & SPA_FLAG_METASLABS) && vd.vdev_ms != 0) { + metaslab_stats(&vd, spa_flags); } } - children = vdev.vdev_children; - + uint64_t children = vd.vdev_children; if (children == 0 || !recursive) return (DCMD_OK); - child = mdb_alloc(children * sizeof (void *), UM_SLEEP | UM_GC); - if (mdb_vread(child, children * sizeof (void *), - (uintptr_t)vdev.vdev_child) == -1) { - mdb_warn("failed to read vdev children at %p", vdev.vdev_child); + uintptr_t *child = mdb_alloc(children * sizeof (child), + UM_SLEEP | UM_GC); + if (mdb_vread(child, children * sizeof (void *), vd.vdev_child) == -1) { + mdb_warn("failed to read vdev children at %p", vd.vdev_child); return (DCMD_ERR); } - for (c = 0; c < children; c++) { + for (uint64_t c = 0; c < children; c++) { if (do_print_vdev(child[c], flags, depth + 2, recursive, spa_flags)) { return (DCMD_ERR); @@ -2111,9 +2168,11 @@ typedef struct space_data { uint64_t ms_checkpointing; uint64_t ms_freeing; uint64_t ms_freed; + uint64_t ms_unflushed_frees; + uint64_t ms_unflushed_allocs; uint64_t ms_allocatable; int64_t ms_deferspace; - uint64_t nowavail; + uint64_t avail; } space_data_t; /* ARGSUSED */ @@ -2125,6 +2184,7 @@ space_cb(uintptr_t addr, const void *unknown, void *arg) mdb_range_tree_t rt; mdb_space_map_t sm = { 0 }; mdb_space_map_phys_t smp = { 0 }; + uint64_t uallocs, ufrees; int i; if (mdb_ctf_vread(&ms, "metaslab_t", "mdb_metaslab_t", @@ -2135,9 +2195,7 @@ space_cb(uintptr_t addr, const void *unknown, void *arg) if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", ms.ms_allocating[i], 0) == -1) return (WALK_ERR); - sd->ms_allocating[i] += rt.rt_space; - } if (mdb_ctf_vread(&rt, "range_tree_t", @@ -2160,6 +2218,18 @@ space_cb(uintptr_t addr, const void *unknown, void *arg) return (WALK_ERR); sd->ms_allocatable += rt.rt_space; + if (mdb_ctf_vread(&rt, "range_tree_t", + "mdb_range_tree_t", ms.ms_unflushed_frees, 0) == -1) + return (WALK_ERR); + sd->ms_unflushed_frees += rt.rt_space; + ufrees = rt.rt_space; + + if (mdb_ctf_vread(&rt, "range_tree_t", + "mdb_range_tree_t", ms.ms_unflushed_allocs, 0) == -1) + return (WALK_ERR); + sd->ms_unflushed_allocs += rt.rt_space; + uallocs = rt.rt_space; + if (ms.ms_sm != 0 && mdb_ctf_vread(&sm, "space_map_t", "mdb_space_map_t", ms.ms_sm, 0) == -1) @@ -2171,7 +2241,7 @@ space_cb(uintptr_t addr, const void *unknown, void *arg) } sd->ms_deferspace += ms.ms_deferspace; - sd->nowavail += sm.sm_size - smp.smp_alloc; + sd->avail += sm.sm_size - smp.smp_alloc + ufrees - uallocs; return (WALK_NEXT); } @@ -2251,12 +2321,16 @@ spa_space(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) sd.ms_freeing >> shift, suffix); mdb_printf("ms_freed = %llu%s\n", sd.ms_freed >> shift, suffix); + mdb_printf("ms_unflushed_frees = %llu%s\n", + sd.ms_unflushed_frees >> 
shift, suffix); + mdb_printf("ms_unflushed_allocs = %llu%s\n", + sd.ms_unflushed_allocs >> shift, suffix); mdb_printf("ms_allocatable = %llu%s\n", sd.ms_allocatable >> shift, suffix); mdb_printf("ms_deferspace = %llu%s\n", sd.ms_deferspace >> shift, suffix); - mdb_printf("current syncing avail = %llu%s\n", - sd.nowavail >> shift, suffix); + mdb_printf("current avail = %llu%s\n", + sd.avail >> shift, suffix); return (DCMD_OK); } @@ -4096,6 +4170,121 @@ out: return (rc); } +typedef struct mdb_range_seg { + uint64_t rs_start; + uint64_t rs_end; +} mdb_range_seg_t; + +/* ARGSUSED */ +static int +range_tree_cb(uintptr_t addr, const void *unknown, void *arg) +{ + mdb_range_seg_t rs; + + if (mdb_ctf_vread(&rs, ZFS_STRUCT "range_seg", "mdb_range_seg_t", + addr, 0) == -1) + return (DCMD_ERR); + + mdb_printf("\t[%llx %llx) (length %llx)\n", + rs.rs_start, rs.rs_end, rs.rs_end - rs.rs_start); + + return (0); +} + +/* ARGSUSED */ +static int +range_tree(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + mdb_range_tree_t rt; + uintptr_t avl_addr; + + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + if (mdb_ctf_vread(&rt, ZFS_STRUCT "range_tree", "mdb_range_tree_t", + addr, 0) == -1) + return (DCMD_ERR); + + mdb_printf("%p: range tree of %llu entries, %llu bytes\n", + addr, rt.rt_root.avl_numnodes, rt.rt_space); + + avl_addr = addr + + mdb_ctf_offsetof_by_name(ZFS_STRUCT "range_tree", "rt_root"); + + if (mdb_pwalk("avl", range_tree_cb, NULL, avl_addr) != 0) { + mdb_warn("can't walk range_tree segments"); + return (DCMD_ERR); + } + return (DCMD_OK); +} + +typedef struct mdb_spa_log_sm { + uint64_t sls_sm_obj; + uint64_t sls_txg; + uint64_t sls_nblocks; + uint64_t sls_mscount; +} mdb_spa_log_sm_t; + +/* ARGSUSED */ +static int +logsm_stats_cb(uintptr_t addr, const void *unknown, void *arg) +{ + mdb_spa_log_sm_t sls; + if (mdb_ctf_vread(&sls, ZFS_STRUCT "spa_log_sm", "mdb_spa_log_sm_t", + addr, 0) == -1) + return (WALK_ERR); + + mdb_printf("%7lld %7lld %7lld %7lld\n", + sls.sls_txg, sls.sls_nblocks, sls.sls_mscount, sls.sls_sm_obj); + + return (WALK_NEXT); +} +typedef struct mdb_log_summary_entry { + uint64_t lse_start; + uint64_t lse_blkcount; + uint64_t lse_mscount; +} mdb_log_summary_entry_t; + +/* ARGSUSED */ +static int +logsm_summary_cb(uintptr_t addr, const void *unknown, void *arg) +{ + mdb_log_summary_entry_t lse; + if (mdb_ctf_vread(&lse, ZFS_STRUCT "log_summary_entry", + "mdb_log_summary_entry_t", addr, 0) == -1) + return (WALK_ERR); + + mdb_printf("%7lld %7lld %7lld\n", + lse.lse_start, lse.lse_blkcount, lse.lse_mscount); + return (WALK_NEXT); +} + +/* ARGSUSED */ +static int +logsm_stats(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + uintptr_t sls_avl_addr = addr + + mdb_ctf_offsetof_by_name(ZFS_STRUCT "spa", "spa_sm_logs_by_txg"); + uintptr_t summary_addr = addr + + mdb_ctf_offsetof_by_name(ZFS_STRUCT "spa", "spa_log_summary"); + + mdb_printf("Log Entries:\n"); + mdb_printf("%7s %7s %7s %7s\n", "txg", "blk", "ms", "obj"); + if (mdb_pwalk("avl", logsm_stats_cb, NULL, sls_avl_addr) != 0) + return (DCMD_ERR); + + mdb_printf("\nSummary Entries:\n"); + mdb_printf("%7s %7s %7s\n", "txg", "blk", "ms"); + if (mdb_pwalk("list", logsm_summary_cb, NULL, summary_addr) != 0) + return (DCMD_ERR); + + return (DCMD_OK); +} + /* * MDB module linkage information: * @@ -4117,6 +4306,8 @@ static const mdb_dcmd_t dcmds[] = { { "abuf_find", "dva_word[0] dva_word[1]", "find arc_buf_hdr_t of a specified DVA", 
abuf_find }, + { "logsm_stats", ":", "print log space map statistics of a spa_t", + logsm_stats}, { "spa", "?[-cevmMh]\n" "\t-c display spa config\n" "\t-e display vdev statistics\n" @@ -4182,6 +4373,8 @@ static const mdb_dcmd_t dcmds[] = { "\t-b display histogram of buffer counts\n", "print a histogram of compressed arc buffer sizes", arc_compression_stats}, + { "range_tree", ":", + "print entries in range_tree_t", range_tree}, { NULL } }; diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index a936c361b5..f56766d81f 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -765,6 +765,12 @@ get_checkpoint_refcount(vdev_t *vd) } static int +get_log_spacemap_refcount(spa_t *spa) +{ + return (avl_numnodes(&spa->spa_sm_logs_by_txg)); +} + +static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; @@ -778,6 +784,7 @@ verify_spacemap_refcounts(spa_t *spa) actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); + actual_refcount += get_log_spacemap_refcount(spa); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " @@ -942,23 +949,46 @@ dump_metaslab(metaslab_t *msp) ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); + + if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", + (u_longlong_t)metaslab_unflushed_txg(msp)); + } } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; - const char *bias_str; + const char *bias_str = ""; + + if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { + bias_str = VDEV_ALLOC_BIAS_LOG; + } else if (alloc_bias == VDEV_BIAS_SPECIAL) { + bias_str = VDEV_ALLOC_BIAS_SPECIAL; + } else if (alloc_bias == VDEV_BIAS_DEDUP) { + bias_str = VDEV_ALLOC_BIAS_DEDUP; + } + + uint64_t ms_flush_data_obj = 0; + if (vd->vdev_top_zap != 0) { + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &ms_flush_data_obj); + if (error != ENOENT) { + ASSERT0(error); + } + } + + (void) printf("\tvdev %10llu %s", + (u_longlong_t)vd->vdev_id, bias_str); - bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ? - VDEV_ALLOC_BIAS_LOG : - (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : - (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : - vd->vdev_islog ? 
"log" : ""; + if (ms_flush_data_obj != 0) { + (void) printf(" ms_unflushed_phys object %llu", + (u_longlong_t)ms_flush_data_obj); + } - (void) printf("\tvdev %10llu %s\n" - "\t%-10s%5llu %-19s %-15s %-12s\n", - (u_longlong_t)vd->vdev_id, bias_str, + (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", @@ -1124,6 +1154,27 @@ dump_metaslabs(spa_t *spa) } static void +dump_log_spacemaps(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + (void) printf("\nLog Space Maps in Pool:\n"); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + (void) printf("Log Spacemap object %llu txg %llu\n", + (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); + dump_spacemap(spa->spa_meta_objset, sm); + space_map_close(sm); + } + (void) printf("\n"); +} + +static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { const ddt_phys_t *ddp = dde->dde_phys; @@ -3153,6 +3204,85 @@ static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; +typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg); + +typedef struct unflushed_iter_cb_arg { + spa_t *uic_spa; + uint64_t uic_txg; + void *uic_arg; + zdb_log_sm_cb_t uic_cb; +} unflushed_iter_cb_arg_t; + +static int +iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) +{ + unflushed_iter_cb_arg_t *uic = arg; + + return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); +} + +static void +iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + unflushed_iter_cb_arg_t uic = { + .uic_spa = spa, + .uic_txg = sls->sls_txg, + .uic_arg = arg, + .uic_cb = cb + }; + + VERIFY0(space_map_iterate(sm, space_map_length(sm), + iterate_through_spacemap_logs_cb, &uic)); + space_map_close(sm); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +/* ARGSUSED */ +static int +load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + spa_vdev_removal_t *svr = arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + + /* skip vdevs we don't care about */ + if (sme->sme_vdev != svr->svr_vdev_id) + return (0); + + vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + ASSERT(vim != NULL); + if (offset >= vdev_indirect_mapping_max_offset(vim)) + return (0); + + if (sme->sme_type == SM_ALLOC) + range_tree_add(svr->svr_allocd_segs, offset, size); + else + range_tree_remove(svr->svr_allocd_segs, offset, size); + + return (0); +} + static void zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { @@ -3242,36 +3372,35 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) vdev_t *vd 
= vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + range_tree_t *allocs = range_tree_create(NULL, NULL); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; - ASSERT0(range_tree_space(svr->svr_allocd_segs)); + ASSERT0(range_tree_space(allocs)); + if (msp->ms_sm != NULL) + VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); + range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); + } + range_tree_destroy(allocs); - if (msp->ms_sm != NULL) { - VERIFY0(space_map_load(msp->ms_sm, - svr->svr_allocd_segs, SM_ALLOC)); + iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); - /* - * Clear everything past what has been synced unless - * it's past the spacemap, because we have not allocated - * mappings for it yet. - */ - uint64_t vim_max_offset = - vdev_indirect_mapping_max_offset(vim); - uint64_t sm_end = msp->ms_sm->sm_start + - msp->ms_sm->sm_size; - if (sm_end > vim_max_offset) - range_tree_clear(svr->svr_allocd_segs, - vim_max_offset, sm_end - vim_max_offset); - } + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for + * it yet. + */ + range_tree_clear(svr->svr_allocd_segs, + vdev_indirect_mapping_max_offset(vim), + vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); - zcb->zcb_removing_size += - range_tree_space(svr->svr_allocd_segs); - range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); - } + zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); + range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); spa_config_exit(spa, SCL_CONFIG, FTAG); } @@ -3438,6 +3567,79 @@ zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) } } +static int +count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + int64_t *ualloc_space = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (sme->sme_type == SM_ALLOC) + *ualloc_space += sme->sme_run; + else + *ualloc_space -= sme->sme_run; + + return (0); +} + +static int64_t +get_unflushed_alloc_space(spa_t *spa) +{ + if (dump_opt['L']) + return (0); + + int64_t ualloc_space = 0; + iterate_through_spacemap_logs(spa, count_unflushed_space_cb, + &ualloc_space); + return (ualloc_space); +} + +static int +load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) +{ + maptype_t *uic_maptype = arg; + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t vdev_id = sme->sme_vdev; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (*uic_maptype == sme->sme_type) + range_tree_add(ms->ms_allocatable, offset, size); + else + range_tree_remove(ms->ms_allocatable, offset, size); + + return (0); +} + +static void +load_unflushed_to_ms_allocatables(spa_t *spa, 
maptype_t maptype) +{ + iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); +} + static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { @@ -3461,7 +3663,7 @@ load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); - metaslab_unload(msp); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the @@ -3478,6 +3680,8 @@ load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) mutex_exit(&msp->ms_lock); } } + + load_unflushed_to_ms_allocatables(spa, maptype); } /* @@ -3492,7 +3696,7 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); - metaslab_unload(msp); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the @@ -3752,7 +3956,6 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } - if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } @@ -3889,7 +4092,8 @@ dump_block_stats(spa_t *spa) total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + - metaslab_class_get_alloc(spa_dedup_class(spa)); + metaslab_class_get_alloc(spa_dedup_class(spa)) + + get_unflushed_alloc_space(spa); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; @@ -4738,11 +4942,25 @@ mos_obj_refd(uint64_t obj) } static void +mos_leak_vdev_top_zap(vdev_t *vd) +{ + uint64_t ms_flush_data_obj; + + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(ms_flush_data_obj); +} + +static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); - mos_obj_refd(vd->vdev_top_zap); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); @@ -4760,11 +4978,34 @@ mos_leak_vdev(vdev_t *vd) mos_obj_refd(space_map_object(ms->ms_sm)); } + if (vd->vdev_top_zap != 0) { + mos_obj_refd(vd->vdev_top_zap); + mos_leak_vdev_top_zap(vd); + } + for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } +static void +mos_leak_log_spacemaps(spa_t *spa) +{ + uint64_t spacemap_zap; + + int error = zap_lookup(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, + sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(spacemap_zap); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) + mos_obj_refd(sls->sls_sm_obj); +} + static int dump_mos_leaks(spa_t *spa) { @@ -4796,6 +5037,10 @@ dump_mos_leaks(spa_t *spa) mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); + if (spa->spa_syncing_log_sm != NULL) + mos_obj_refd(spa->spa_syncing_log_sm->sm_object); + mos_leak_log_spacemaps(spa); + mos_obj_refd(spa->spa_condensing_indirect_phys. scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. 
@@ -4873,6 +5118,81 @@ dump_mos_leaks(spa_t *spa) return (rv); } +typedef struct log_sm_obsolete_stats_arg { + uint64_t lsos_current_txg; + + uint64_t lsos_total_entries; + uint64_t lsos_valid_entries; + + uint64_t lsos_sm_entries; + uint64_t lsos_valid_sm_entries; +} log_sm_obsolete_stats_arg_t; + +static int +log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + log_sm_obsolete_stats_arg_t *lsos = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + if (lsos->lsos_current_txg == 0) { + /* this is the first log */ + lsos->lsos_current_txg = txg; + } else if (lsos->lsos_current_txg < txg) { + /* we just changed log - print stats and reset */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos->lsos_valid_sm_entries, + (u_longlong_t)lsos->lsos_sm_entries, + (u_longlong_t)lsos->lsos_current_txg); + lsos->lsos_valid_sm_entries = 0; + lsos->lsos_sm_entries = 0; + lsos->lsos_current_txg = txg; + } + ASSERT3U(lsos->lsos_current_txg, ==, txg); + + lsos->lsos_sm_entries++; + lsos->lsos_total_entries++; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + lsos->lsos_valid_sm_entries++; + lsos->lsos_valid_entries++; + return (0); +} + +static void +dump_log_spacemap_obsolete_stats(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + log_sm_obsolete_stats_arg_t lsos; + bzero(&lsos, sizeof (lsos)); + + (void) printf("Log Space Map Obsolete Entry Statistics:\n"); + + iterate_through_spacemap_logs(spa, + log_spacemap_obsolete_stats_cb, &lsos); + + /* print stats for latest log */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos.lsos_valid_sm_entries, + (u_longlong_t)lsos.lsos_sm_entries, + (u_longlong_t)lsos.lsos_current_txg); + + (void) printf("%-8llu valid entries out of %-8llu - total\n\n", + (u_longlong_t)lsos.lsos_valid_entries, + (u_longlong_t)lsos.lsos_total_entries); +} + static void dump_zpool(spa_t *spa) { @@ -4902,6 +5222,10 @@ dump_zpool(spa_t *spa) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); + if (dump_opt['d'] > 2 || dump_opt['m']) { + dump_log_spacemaps(spa); + dump_log_spacemap_obsolete_stats(spa); + } if (dump_opt['d'] || dump_opt['i']) { mos_refd_objs = range_tree_create(NULL, NULL); @@ -4962,9 +5286,8 @@ dump_zpool(spa_t *spa) } } - if (rc == 0) { + if (rc == 0) rc = verify_device_removal_feature_counts(spa); - } } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 8e0c103349..52ca88ab7f 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -32,6 +32,7 @@ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com> * Copyright 2019 Joyent, Inc. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
*/ #include <assert.h> @@ -3229,6 +3230,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, * print all other top-level devices */ for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; char *bias = NULL; @@ -3249,6 +3251,17 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; + if (!printed) { + if (!cb->cb_scripted) { + (void) printf( + "%-*s - - - -" + " - -", + cb->cb_namewidth, class_name[n]); + } + printf("\n"); + printed = B_TRUE; + } + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); print_vdev_stats(zhp, vname, oldnv ? diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index f422db3bbc..83922cf376 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -2827,24 +2827,12 @@ vdev_lookup_by_path(vdev_t *vd, const char *path) return (NULL); } -/* - * Find the first available hole which can be used as a top-level. - */ -int -find_vdev_hole(spa_t *spa) +static int +spa_num_top_vdevs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; - int c; - - ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); - - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - - if (cvd->vdev_ishole) - break; - } - return (c); + ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); + return (rvd->vdev_children); } /* @@ -2869,7 +2857,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; /* * If we have slogs then remove them 1/4 of the time. @@ -2974,7 +2962,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, @@ -6895,6 +6883,15 @@ ztest_init(ztest_shared_t *zs) props = make_random_props(); for (int i = 0; i < SPA_FEATURES; i++) { char buf[1024]; + + /* + * 75% chance of using the log space map feature. We want ztest + * to exercise both the code paths that use the log space map + * feature and the ones that don't. 
+ */ + if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) + continue; + (void) snprintf(buf, sizeof (buf), "feature@%s", spa_feature_table[i].fi_uname); VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); diff --git a/usr/src/common/zfs/zfeature_common.c b/usr/src/common/zfs/zfeature_common.c index 78345bbd88..e5d3fc27a0 100644 --- a/usr/src/common/zfs/zfeature_common.c +++ b/usr/src/common/zfs/zfeature_common.c @@ -366,4 +366,15 @@ zpool_feature_init(void) "space/object accounting based on project ID.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, project_quota_deps); + + static const spa_feature_t log_spacemap_deps[] = { + SPA_FEATURE_SPACEMAP_V2, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LOG_SPACEMAP, + "com.delphix:log_spacemap", "log_spacemap", + "Log metaslab changes on a single spacemap and " + "flush them periodically.", + ZFEATURE_FLAG_READONLY_COMPAT, + log_spacemap_deps); } diff --git a/usr/src/common/zfs/zfeature_common.h b/usr/src/common/zfs/zfeature_common.h index ab9ff50ff6..9fc4983228 100644 --- a/usr/src/common/zfs/zfeature_common.h +++ b/usr/src/common/zfs/zfeature_common.h @@ -68,6 +68,7 @@ typedef enum spa_feature { SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_USEROBJ_ACCOUNTING, SPA_FEATURE_PROJECT_QUOTA, + SPA_FEATURE_LOG_SPACEMAP, SPA_FEATURES } spa_feature_t; diff --git a/usr/src/man/man1m/zdb.1m b/usr/src/man/man1m/zdb.1m index ca771c24d7..422fba96d9 100644 --- a/usr/src/man/man1m/zdb.1m +++ b/usr/src/man/man1m/zdb.1m @@ -192,7 +192,8 @@ By default, .Nm verifies that all non-free blocks are referenced, which can be very expensive. .It Fl m -Display the offset, spacemap, and free space of each metaslab. +Display the offset, spacemap, free space of each metaslab, all the log +spacemaps and their obsolete entry statistics. .It Fl mm Also display information about the on-disk free space histogram associated with each metaslab. diff --git a/usr/src/man/man5/zpool-features.5 b/usr/src/man/man5/zpool-features.5 index 21a5369799..38045f80df 100644 --- a/usr/src/man/man5/zpool-features.5 +++ b/usr/src/man/man5/zpool-features.5 @@ -808,5 +808,27 @@ The upgrade process runs in the background and may take a while to complete for the filesystems containing a large number of files. .RE +.sp +.ne 2 +.na +\fBlog_spacemap\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:log_spacemap +READ\-ONLY COMPATIBLE yes +DEPENDENCIES com.delphix:spacemap_v2 +.TE + +This feature improves performance for heavily-fragmented pools, +especially when workloads are heavy in random-writes. +It does so by logging all the metaslab changes on a single spacemap every TXG +instead of scattering multiple writes to all the metaslab spacemaps. + +This feature becomes \fBactive\fR as soon as it is enabled and will never +return to being \fBenabled\fR. 
+.RE + .SH "SEE ALSO" \fBzfs\fR(1M), \fBzpool\fR(1M) diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index 66fc12ff3a..9680204e96 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -124,6 +124,7 @@ dir path=opt/zfs-tests/tests/functional/large_files dir path=opt/zfs-tests/tests/functional/largest_pool dir path=opt/zfs-tests/tests/functional/libzfs dir path=opt/zfs-tests/tests/functional/link_count +dir path=opt/zfs-tests/tests/functional/log_spacemap dir path=opt/zfs-tests/tests/functional/mdb dir path=opt/zfs-tests/tests/functional/migration dir path=opt/zfs-tests/tests/functional/mmap @@ -2563,6 +2564,8 @@ file path=opt/zfs-tests/tests/functional/libzfs/many_fds mode=0555 file path=opt/zfs-tests/tests/functional/link_count/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/link_count/link_count_001 mode=0555 file path=opt/zfs-tests/tests/functional/link_count/setup mode=0555 +file path=opt/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs \ + mode=0555 file path=opt/zfs-tests/tests/functional/mdb/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/mdb/mdb_001_pos mode=0555 file path=opt/zfs-tests/tests/functional/mdb/setup mode=0555 diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run index 14ce6e5bd9..1d8fe09149 100644 --- a/usr/src/test/zfs-tests/runfiles/delphix.run +++ b/usr/src/test/zfs-tests/runfiles/delphix.run @@ -723,3 +723,9 @@ tests = ['zvol_misc_001_neg', 'zvol_misc_002_pos', 'zvol_misc_003_neg', tests = ['many_fds'] pre = post = + +[/opt/zfs-tests/tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run index 6c127881da..3d42388e3f 100644 --- a/usr/src/test/zfs-tests/runfiles/omnios.run +++ b/usr/src/test/zfs-tests/runfiles/omnios.run @@ -727,3 +727,9 @@ tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_003_pos', tests = ['many_fds'] pre = post = + +[/opt/zfs-tests/tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run index 011529f8f1..6f537c7ba8 100644 --- a/usr/src/test/zfs-tests/runfiles/openindiana.run +++ b/usr/src/test/zfs-tests/runfiles/openindiana.run @@ -727,3 +727,9 @@ tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_003_pos', tests = ['many_fds'] pre = post = + +[/opt/zfs-tests/tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/usr/src/test/zfs-tests/runfiles/smartos.run b/usr/src/test/zfs-tests/runfiles/smartos.run index e6bcd4d8d5..f98344bdf8 100644 --- a/usr/src/test/zfs-tests/runfiles/smartos.run +++ b/usr/src/test/zfs-tests/runfiles/smartos.run @@ -626,3 +626,9 @@ tests = ['zvol_misc_001_neg', 'zvol_misc_002_pos', 'zvol_misc_003_neg', tests = ['many_fds'] pre = post = + +[/opt/zfs-tests/tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 03f7fc37fe..aa99c2be00 100644 --- 
a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -85,4 +85,5 @@ typeset -a properties=( "feature@bookmark_v2" "feature@userobj_accounting" "feature@project_quota" + "feature@log_spacemap" ) diff --git a/usr/src/test/zfs-tests/tests/functional/log_spacemap/Makefile b/usr/src/test/zfs-tests/tests/functional/log_spacemap/Makefile new file mode 100644 index 0000000000..afb44c1549 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/log_spacemap/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TARGETDIR = $(ROOTOPTPKG)/tests/functional/log_spacemap + +include $(SRC)/test/zfs-tests/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh b/usr/src/test/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh new file mode 100755 index 0000000000..71a91284e8 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh @@ -0,0 +1,82 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Log spacemaps are generally destroyed at export in order to +# not induce performance overheads at import time. As a result, +# the log spacemap codepaths that read the logs in import times +# are not tested outside of ztest and pools with DEBUG bits doing +# many imports/exports while running the test suite. +# +# This test uses an internal tunable and forces ZFS to keep the +# log spacemaps at export, and then re-imports the pool, thus +# providing explicit testing of those codepaths. It also uses +# another tunable to load all the metaslabs when the pool is +# re-imported so more assertions and verifications will be hit. +# +# STRATEGY: +# 1. Create pool. +# 2. Do a couple of writes to generate some data for spacemap logs. +# 3. Set tunable to keep logs after export. +# 4. Export pool and verify that there are logs with zdb. +# 5. Set tunable to load all metaslabs at import. +# 6. Import pool. +# 7. Reset tunables. 
+# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable32 zfs_keep_log_spacemaps_at_export 0 + log_must set_tunable32 metaslab_debug_load 0 + if poolexists $LOGSM_POOL; then + log_must zpool destroy -f $LOGSM_POOL + fi +} +log_onexit cleanup + +LOGSM_POOL="logsm_import" +TESTDISK="$(echo $DISKS | cut -d' ' -f1)" + +log_must zpool create -o cachefile=none -f $LOGSM_POOL $TESTDISK +log_must zfs create $LOGSM_POOL/fs + +log_must dd if=/dev/urandom of=/$LOGSM_POOL/fs/00 bs=128k count=10 +log_must sync +log_must dd if=/dev/urandom of=/$LOGSM_POOL/fs/00 bs=128k count=10 +log_must sync + +log_must set_tunable32 zfs_keep_log_spacemaps_at_export 1 +log_must zpool export $LOGSM_POOL + +LOGSM_COUNT=$(zdb -m -e $LOGSM_POOL | grep "Log Spacemap object" | wc -l) +if (( LOGSM_COUNT == 0 )); then + log_fail "Pool does not have any log spacemaps after being exported" +fi + +log_must set_tunable32 metaslab_debug_load 1 +log_must zpool import $LOGSM_POOL + +log_pass "Log spacemaps imported with no errors" diff --git a/usr/src/test/zfs-tests/tests/functional/removal/removal_condense_export.ksh b/usr/src/test/zfs-tests/tests/functional/removal/removal_condense_export.ksh index d33b53fe14..7bbf770b4c 100644 --- a/usr/src/test/zfs-tests/tests/functional/removal/removal_condense_export.ksh +++ b/usr/src/test/zfs-tests/tests/functional/removal/removal_condense_export.ksh @@ -37,7 +37,7 @@ function reset default_setup_noexit "$DISKS" "true" log_onexit reset -log_must set_condense_delay 100 +log_must set_condense_delay 500 log_must set_min_bytes 1 log_must zfs set recordsize=512 $TESTPOOL/$TESTFS @@ -75,7 +75,7 @@ log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK log_must zfs remap $TESTPOOL/$TESTFS sync -sleep 5 +sleep 4 sync log_must zpool export $TESTPOOL zdb -e $TESTPOOL | grep 'Condensing indirect vdev' || \ diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index aff3427796..78894e23f2 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1413,6 +1413,7 @@ ZFS_COMMON_OBJS += \ spa_config.o \ spa_errlog.o \ spa_history.o \ + spa_log_spacemap.o \ spa_misc.o \ space_map.o \ space_reftree.o \ diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index fb75ef3630..4c9ce98326 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -1530,7 +1530,7 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the - * meta-dnode needs to set it ouside dnode_sync(). + * meta-dnode needs to set it outside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index c09cec15a5..8564900fc9 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] @@ -737,7 +737,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dp->dp_mos_uncompressed_delta = 0; } - if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) { + if (dmu_objset_is_dirty(mos, txg)) { dsl_pool_sync_mos(dp, tx); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 2231664c33..b950ed26d6 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -46,12 +46,21 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* - * Since we can touch multiple metaslabs (and their respective space maps) - * with each transaction group, we benefit from having a smaller space map + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered - * around the disk. + * around the disk. So a sane default for the space map block size + * is 8~16K. */ -int zfs_metaslab_sm_blksz = (1 << 12); +int zfs_metaslab_sm_blksz_no_log = (1 << 14); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. @@ -98,12 +107,27 @@ int zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their - * fragmenation metric (measured as a percentage) is less than or equal to - * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold - * then it will be skipped unless all metaslab groups within the metaslab - * class have also crossed this threshold. + * fragmenation metric (measured as a percentage) is less than or + * equal to zfs_mg_fragmentation_threshold. If a metaslab group + * exceeds this threshold then it will be skipped unless all metaslab + * groups within the metaslab class have also crossed this threshold. + * + * This tunable was introduced to avoid edge cases where we continue + * allocating from very fragmented disks in our pool while other, less + * fragmented disks, exists. On the other hand, if all disks in the + * pool are uniformly approaching the threshold, the threshold can + * be a speed bump in performance, where we keep switching the disks + * that we allocate from (e.g. we allocate some segments from disk A + * making it bypassing the threshold while freeing segments from disk + * B getting its fragmentation below the threshold). + * + * Empirically, we've seen that our vdev selection for allocations is + * good enough that fragmentation increases uniformly across all vdevs + * the majority of the time. Thus we set the threshold percentage high + * enough to avoid hitting the speed bump on pools that are being pushed + * to the edge. */ -int zfs_mg_fragmentation_threshold = 85; +int zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation @@ -140,6 +164,30 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; int metaslab_df_free_pct = 4; /* + * Maximum distance to search forward from the last offset. 
Without this + * limit, fragmented pools can see >100,000 iterations and + * metaslab_block_picker() becomes the performance limiting factor on + * high-performance storage. + * + * With the default setting of 16MB, we typically see less than 500 + * iterations, even with very fragmented, ashift=9 pools. The maximum number + * of iterations possible is: + * metaslab_df_max_search / (2 * (1<<ashift)) + * With the default setting of 16MB this is 16*1024 (with ashift=9) or + * 2048 (with ashift=12). + */ +int metaslab_df_max_search = 16 * 1024 * 1024; + +/* + * If we are not searching forward (due to metaslab_df_max_search, + * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable + * controls what segment is used. If it is set, we will use the largest free + * segment. If it is not set, we will use a segment of exactly the requested + * size (or larger). + */ +int metaslab_df_use_largest_segment = B_FALSE; + +/* * A metaslab is considered "free" if it contains a contiguous * segment which is greater than metaslab_min_alloc_size. */ @@ -239,6 +287,7 @@ static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); +static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); kmem_cache_t *metaslab_alloc_trace_cache; @@ -513,67 +562,6 @@ metaslab_compare(const void *x1, const void *x2) return (AVL_CMP(m1->ms_start, m2->ms_start)); } -uint64_t -metaslab_allocated_space(metaslab_t *msp) -{ - return (msp->ms_allocated_space); -} - -/* - * Verify that the space accounting on disk matches the in-core range_trees. - */ -static void -metaslab_verify_space(metaslab_t *msp, uint64_t txg) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocating = 0; - uint64_t sm_free_space, msp_free_space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!msp->ms_condensing); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* - * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an - * allocated space map. Calling this in non-syncing context - * does not provide a consistent view of the metaslab since - * we're performing allocations in the future. - */ - if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || - !msp->ms_loaded) - return; - - /* - * Even though the smp_alloc field can get negative (e.g. - * see vdev_checkpoint_sm), that should never be the case - * when it come's to a metaslab's space map. - */ - ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); - - sm_free_space = msp->ms_size - metaslab_allocated_space(msp); - - /* - * Account for future allocations since we would have - * already deducted that space from the ms_allocatable. 
- */ - for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocating += - range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); - } - - ASSERT3U(msp->ms_deferspace, ==, - range_tree_space(msp->ms_defer[0]) + - range_tree_space(msp->ms_defer[1])); - - msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + - msp->ms_deferspace + range_tree_space(msp->ms_freed); - - VERIFY3U(sm_free_space, ==, msp_free_space); -} - /* * ========================================================================== * Metaslab groups @@ -662,6 +650,25 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mutex_exit(&mg->mg_lock); } +int +metaslab_sort_by_flushed(const void *va, const void *vb) +{ + const metaslab_t *a = va; + const metaslab_t *b = vb; + + int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + if (likely(cmp)) + return (cmp); + + uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; + uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; + cmp = AVL_CMP(a_vdev_id, b_vdev_id); + if (cmp) + return (cmp); + + return (AVL_CMP(a->ms_id, b->ms_id)); +} + metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { @@ -676,7 +683,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, - sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); + sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; @@ -909,7 +916,6 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - ASSERT(msp != NULL); /* skip if not active or not a member */ if (msp->ms_sm == NULL || msp->ms_group != mg) @@ -1240,13 +1246,16 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) */ static uint64_t metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) + uint64_t max_search) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); + uint64_t first_found; - while (rs != NULL) { - uint64_t offset = P2ROUNDUP(rs->rs_start, align); + if (rs != NULL) + first_found = rs->rs_start; + while (rs != NULL && rs->rs_start - first_found <= max_search) { + uint64_t offset = rs->rs_start; if (offset + size <= rs->rs_end) { *cursor = offset + size; return (offset); @@ -1254,49 +1263,28 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, rs = AVL_NEXT(t, rs); } - /* - * If we know we've searched the whole map (*cursor == 0), give up. - * Otherwise, reset the cursor to the beginning and try again. - */ - if (*cursor == 0) - return (-1ULL); - *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); -} - -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static uint64_t -metaslab_ff_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. 
- */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_allocatable->rt_root; - - return (metaslab_block_picker(t, cursor, size, align)); + return (-1ULL); } -static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc -}; - /* * ========================================================================== - * Dynamic block allocator - - * Uses the first fit allocation scheme until space get low and then - * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold - * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * Dynamic Fit (df) block allocator + * + * Search for a free chunk of at least this size, starting from the last + * offset (for this alignment of block) looking for up to + * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not + * found within 16MB, then return a free chunk of exactly the requested size (or + * larger). + * + * If it seems like searching from the last offset will be unproductive, skip + * that and just return a free chunk of exactly the requested size (or larger). + * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This + * mechanism is probably not very useful and may be removed in the future. + * + * The behavior when not searching can be changed to return the largest free + * chunk, instead of a free chunk of exactly the requested size, by setting + * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t @@ -1312,28 +1300,42 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &rt->rt_root; - uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, + ASSERT3U(avl_numnodes(&rt->rt_root), ==, avl_numnodes(&msp->ms_allocatable_by_size)); - if (max_size < size) - return (-1ULL); - /* - * If we're running low on space switch to using the size - * sorted AVL tree (best-fit). + * If we're running low on space, find a segment based on size, + * rather than iterating based on offset. */ - if (max_size < metaslab_df_alloc_threshold || + if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { - t = &msp->ms_allocatable_by_size; - *cursor = 0; + offset = -1; + } else { + offset = metaslab_block_picker(&rt->rt_root, + cursor, size, metaslab_df_max_search); } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + if (offset == -1) { + range_seg_t *rs; + if (metaslab_df_use_largest_segment) { + /* use largest free segment */ + rs = avl_last(&msp->ms_allocatable_by_size); + } else { + /* use segment of this size, or next largest */ + rs = metaslab_block_find(&msp->ms_allocatable_by_size, + 0, size); + } + if (rs != NULL && rs->rs_start + size <= rs->rs_end) { + offset = rs->rs_start; + *cursor = offset + size; + } + } + + return (offset); } static metaslab_ops_t metaslab_df_ops = { @@ -1451,6 +1453,101 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; * ========================================================================== */ +/* + * Wait for any in-progress metaslab loads to complete. 
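The comment above describes the new dynamic-fit policy in prose. Below is a minimal sketch of the bounded first-fit step on a plain sorted array; seg_t and bounded_first_fit() are hypothetical stand-ins for range_seg_t and metaslab_block_picker(). A failed search returns a sentinel so the caller can fall back to the size-sorted lookup (exact fit, or the largest segment when metaslab_df_use_largest_segment is set), just as metaslab_df_alloc() does above.

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

typedef struct seg {
        uint64_t start;
        uint64_t end;
} seg_t;

/*
 * First fit starting at *cursor, giving up once the walk has moved more
 * than max_search bytes past the first candidate. Segments are sorted by
 * offset and non-overlapping. Returns UINT64_MAX on failure.
 */
static uint64_t
bounded_first_fit(const seg_t *segs, size_t nsegs, uint64_t *cursor,
    uint64_t size, uint64_t max_search)
{
        size_t i = 0;

        /* Skip segments that end at or before the cursor. */
        while (i < nsegs && segs[i].end <= *cursor)
                i++;
        if (i == nsegs)
                return (UINT64_MAX);

        uint64_t first_found = segs[i].start;
        for (; i < nsegs && segs[i].start - first_found <= max_search; i++) {
                uint64_t offset = segs[i].start > *cursor ?
                    segs[i].start : *cursor;
                if (offset + size <= segs[i].end) {
                        *cursor = offset + size;
                        return (offset);
                }
        }
        return (UINT64_MAX);    /* caller falls back to a size-based lookup */
}

int
main(void)
{
        /* Free segments sorted by offset; sizes 4K, 512B, 1MB. */
        seg_t segs[] = {
                { 0, 4096 },
                { 1 << 20, (1 << 20) + 512 },
                { 64ULL << 20, 65ULL << 20 },
        };
        uint64_t cursor = 0;

        /*
         * An 8K request walks the first two segments and then gives up:
         * the third segment starts 64MB past the first candidate, beyond
         * the 16MB search budget.
         */
        uint64_t off = bounded_first_fit(segs, 3, &cursor, 8192, 16ULL << 20);
        assert(off == UINT64_MAX);
        return (0);
}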
+ */ +void +metaslab_load_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_loading) { + ASSERT(!msp->ms_loaded); + cv_wait(&msp->ms_load_cv, &msp->ms_lock); + } +} + +/* + * Wait for any in-progress flushing to complete. + */ +void +metaslab_flush_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_flushing) + cv_wait(&msp->ms_flush_cv, &msp->ms_lock); +} + +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + +/* + * Verify that the space accounting on disk matches the in-core range_trees. + */ +static void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocating = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + /* + * Even though the smp_alloc field can get negative, + * when it comes to a metaslab's space map, that should + * never be the case. + */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + + ASSERT3U(space_map_allocated(msp->ms_sm), >=, + range_tree_space(msp->ms_unflushed_frees)); + + ASSERT3U(metaslab_allocated_space(msp), ==, + space_map_allocated(msp->ms_sm) + + range_tree_space(msp->ms_unflushed_allocs) - + range_tree_space(msp->ms_unflushed_frees)); + + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); + + /* + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocating += + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); + } + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + + msp->ms_deferspace + range_tree_space(msp->ms_freed); + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + static void metaslab_aux_histograms_clear(metaslab_t *msp) { @@ -1574,7 +1671,15 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; - /* see comment in metaslab_verify_unflushed_changes() */ + /* + * We can end up here from vdev_remove_complete(), in which case we + * cannot do these assertions because we hold spa config locks and + * thus we are not allowed to read from the DMU. + * + * We check if the metaslab group has been removed and if that's + * the case we return immediately as that would mean that we are + * here from the aforementioned code path. + */ if (msp->ms_group == NULL) return; @@ -1648,20 +1753,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) VERIFY3U(msp->ms_weight, ==, weight); } -/* - * Wait for any in-progress metaslab loads to complete. 
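The assertions in metaslab_verify_space() above encode a simple identity: the space map's allocated space plus the unflushed allocs, minus the unflushed frees, gives the metaslab's allocated space, and the complementary free space must match the sum of the in-core trees. A worked example with made-up figures for a 1GB metaslab:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint64_t ms_size = 1ULL << 30;          /* 1GB metaslab */
        uint64_t sm_alloc = 300ULL << 20;       /* flushed to the space map */
        uint64_t unflushed_allocs = 50ULL << 20;
        uint64_t unflushed_frees = 20ULL << 20;

        /* ms_allocated_space mirrors sm_alloc + unflushed allocs - frees */
        uint64_t allocated = sm_alloc + unflushed_allocs - unflushed_frees;
        uint64_t sm_free_space = ms_size - allocated;   /* 694MB */

        /* the same free space, seen from the in-core range trees */
        uint64_t allocatable = 600ULL << 20;
        uint64_t allocating = 64ULL << 20;      /* future TXGs */
        uint64_t deferspace = 24ULL << 20;
        uint64_t freed = 6ULL << 20;
        uint64_t msp_free_space = allocatable + allocating +
            deferspace + freed;                 /* also 694MB */

        assert(sm_free_space == msp_free_space);
        return (0);
}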
- */ -static void -metaslab_load_wait(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - while (msp->ms_loading) { - ASSERT(!msp->ms_loaded); - cv_wait(&msp->ms_load_cv, &msp->ms_lock); - } -} - static int metaslab_load_impl(metaslab_t *msp) { @@ -1676,13 +1767,19 @@ metaslab_load_impl(metaslab_t *msp) * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * - * metaslab_sync() can append to the space map while we are loading. - * Therefore we load only entries that existed when we started the - * load. Additionally, metaslab_sync_done() has to wait for the load - * to complete because there are potential races like metaslab_load() - * loading parts of the space map that are currently being appended - * by metaslab_sync(). If we didn't, the ms_allocatable would have - * entries that metaslab_sync_done() would try to re-add later. + * If we are using the log space maps, metaslab_sync() can't write to + * the metaslab's space map while we are loading as we only write to + * it when we are flushing the metaslab, and that can't happen while + * we are loading it. + * + * If we are not using log space maps though, metaslab_sync() can + * append to the space map while we are loading. Therefore we load + * only entries that existed when we started the load. Additionally, + * metaslab_sync_done() has to wait for the load to complete because + * there are potential races like metaslab_load() loading parts of the + * space map that are currently being appended by metaslab_sync(). If + * we didn't, the ms_allocatable would have entries that + * metaslab_sync_done() would try to re-add later. * * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, @@ -1692,6 +1789,7 @@ metaslab_load_impl(metaslab_t *msp) uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); + hrtime_t load_start = gethrtime(); if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); @@ -1703,18 +1801,37 @@ metaslab_load_impl(metaslab_t *msp) */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); + + if (msp->ms_freed != NULL) { + /* + * If the ms_sm doesn't exist, this means that this + * metaslab hasn't gone through metaslab_sync() and + * thus has never been dirtied. So we shouldn't + * expect any unflushed allocs or frees from previous + * TXGs. + * + * Note: ms_freed and all the other trees except for + * the ms_allocatable, can be NULL at this point only + * if this is a new metaslab of a vdev that just got + * expanded. + */ + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from - * changing the ms_sm and the metaslab's range trees while we are - * about to use them and populate the ms_allocatable. The ms_lock - * is insufficient for this because metaslab_sync() doesn't hold - * the ms_lock while writing the ms_checkpointing tree to disk. + * changing the ms_sm (or log_sm) and the metaslab's range trees + * while we are about to use them and populate the ms_allocatable. + * The ms_lock is insufficient for this because metaslab_sync() doesn't + * hold the ms_lock while writing the ms_checkpointing tree to disk. 
*/ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + ASSERT(!msp->ms_condensing); + ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); @@ -1725,10 +1842,60 @@ metaslab_load_impl(metaslab_t *msp) msp->ms_loaded = B_TRUE; /* - * The ms_allocatable contains the segments that exist in the - * ms_defer trees [see ms_synced_length]. Thus we need to remove - * them from ms_allocatable as they will be added again in + * Apply all the unflushed changes to ms_allocatable right + * away so any manipulations we do below have a clear view + * of what is allocated and what is free. + */ + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_remove, msp->ms_allocatable); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_add, msp->ms_allocatable); + + msp->ms_loaded = B_TRUE; + + ASSERT3P(msp->ms_group, !=, NULL); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + if (spa_syncing_log_sm(spa) != NULL) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LOG_SPACEMAP)); + + /* + * If we use a log space map we add all the segments + * that are in ms_unflushed_frees so they are available + * for allocation. + * + * ms_allocatable needs to contain all free segments + * that are ready for allocations (thus not segments + * from ms_freeing, ms_freed, and the ms_defer trees). + * But if we grab the lock in this code path at a sync + * pass later that 1, then it also contains the + * segments of ms_freed (they were added to it earlier + * in this path through ms_unflushed_frees). So we + * need to remove all the segments that exist in + * ms_freed from ms_allocatable as they will be added + * later in metaslab_sync_done(). + * + * When there's no log space map, the ms_allocatable + * correctly doesn't contain any segments that exist + * in ms_freed [see ms_synced_length]. + */ + range_tree_walk(msp->ms_freed, + range_tree_remove, msp->ms_allocatable); + } + + /* + * If we are not using the log space map, ms_allocatable + * contains the segments that exist in the ms_defer trees + * [see ms_synced_length]. Thus we need to remove them + * from ms_allocatable as they will be added again in * metaslab_sync_done(). + * + * If we are using the log space map, ms_allocatable still + * contains the segments that exist in the ms_defer trees. + * Not because it read them through the ms_sm though. But + * because these segments are part of ms_unflushed_frees + * whose segments we add to ms_allocatable earlier in this + * code path. 
*/ for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], @@ -1753,10 +1920,26 @@ metaslab_load_impl(metaslab_t *msp) ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_block_maxsize(msp); - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + hrtime_t load_end = gethrtime(); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, smp_length %llu, " + "unflushed_allocs %llu, unflushed_frees %llu, " + "freed %llu, defer %llu + %llu, " + "loading_time %lld ms", + spa_syncing_txg(spa), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + space_map_length(msp->ms_sm), + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + range_tree_space(msp->ms_freed), + range_tree_space(msp->ms_defer[0]), + range_tree_space(msp->ms_defer[1]), + (longlong_t)((load_end - load_start) / 1000000)); + } + metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); - return (0); } @@ -1782,8 +1965,32 @@ metaslab_load(metaslab_t *msp, uint64_t txg) atomic_inc_64(&mg_ksp->mg_loads.value.ui64); } + /* + * We set the loading flag BEFORE potentially dropping the lock to + * wait for an ongoing flush (see ms_flushing below). This way other + * threads know that there is already a thread that is loading this + * metaslab. + */ msp->ms_loading = B_TRUE; + + /* + * Wait for any in-progress flushing to finish as we drop the ms_lock + * both here (during space_map_load()) and in metaslab_flush() (when + * we flush our changes to the ms_sm). + */ + if (msp->ms_flushing) + metaslab_flush_wait(msp); + + /* + * In the possibility that we were waiting for the metaslab to be + * flushed (where we temporarily dropped the ms_lock), ensure that + * no one else loaded the metaslab somehow. + */ + ASSERT(!msp->ms_loaded); + int error = metaslab_load_impl(msp); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; msp->ms_loaded_txg = txg; cv_broadcast(&msp->ms_load_cv); @@ -1811,7 +2018,7 @@ metaslab_unload(metaslab_t *msp) * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information - * available in-core, whether it is loaded or not + * available in-core, whether it is loaded or not. 
* * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation @@ -1821,7 +2028,7 @@ metaslab_unload(metaslab_t *msp) metaslab_recalculate_weight_and_sort(msp); } -static void +void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { @@ -1835,8 +2042,8 @@ metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, } int -metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, - metaslab_t **msp) +metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, + uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; @@ -1848,6 +2055,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -1911,17 +2119,6 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_allocated_space(ms), 0, 0); } - /* - * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the space map - * so that we can verify frees. - */ - if (metaslab_debug_load && ms->ms_sm != NULL) { - mutex_enter(&ms->ms_lock); - VERIFY0(metaslab_load(ms, txg)); - mutex_exit(&ms->ms_lock); - } - if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); @@ -1932,11 +2129,42 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, return (0); } +static void +metaslab_fini_flush_data(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + if (metaslab_unflushed_txg(msp) == 0) { + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), + ==, NULL); + return; + } + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); +} + +uint64_t +metaslab_unflushed_changes_memused(metaslab_t *ms) +{ + return ((range_tree_numsegs(ms->ms_unflushed_allocs) + + range_tree_numsegs(ms->ms_unflushed_frees)) * + sizeof (range_seg_t)); +} + void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + + metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); @@ -1946,13 +2174,22 @@ metaslab_fini(metaslab_t *msp) -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); + msp->ms_sm = NULL; metaslab_unload(msp); - range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); + for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } @@ -1972,6 +2209,7 
@@ metaslab_fini(metaslab_t *msp) mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); + cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); @@ -2213,9 +2451,9 @@ metaslab_weight_from_range_tree(metaslab_t *msp) } /* - * Calculate the weight based on the on-disk histogram. This should only - * be called after a sync pass has completely finished since the on-disk - * information is updated in metaslab_sync(). + * Calculate the weight based on the on-disk histogram. Should be applied + * only to unloaded metaslabs (i.e no incoming allocations) in-order to + * give results consistent with the on-disk state */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) @@ -2289,7 +2527,6 @@ metaslab_segment_weight(metaslab_t *msp) } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); - return (weight); } @@ -2323,21 +2560,23 @@ metaslab_segment_weight(metaslab_t *msp) /* * Determine if we should attempt to allocate from this metaslab. If the - * metaslab has a maximum size then we can quickly determine if the desired - * allocation size can be satisfied. Otherwise, if we're using segment-based - * weighting then we can determine the maximum allocation that this metaslab - * can accommodate based on the index encoded in the weight. If we're using - * space-based weights then rely on the entire weight (excluding the weight - * type bit). + * metaslab is loaded, then we can determine if the desired allocation + * can be satisfied by looking at the size of the maximum free segment + * on that metaslab. Otherwise, we make our decision based on the metaslab's + * weight. For segment-based weighting we can determine the maximum + * allocation based on the index encoded in its value. For space-based + * weights we rely on the entire weight (excluding the weight-type bit). */ boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize) { - boolean_t should_allocate; - - if (msp->ms_max_size != 0) + if (msp->ms_loaded) { return (msp->ms_max_size >= asize); + } else { + ASSERT0(msp->ms_max_size); + } + boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the @@ -2599,18 +2838,19 @@ metaslab_group_preload(metaslab_group_t *mg) } /* - * Determine if the space map's on-disk footprint is past our tolerance - * for inefficiency. We would like to use the following criteria to make - * our decision: + * Determine if the space map's on-disk footprint is past our tolerance for + * inefficiency. We would like to use the following criteria to make our + * decision: * - * 1. The size of the space map object should not dramatically increase as a - * result of writing out the free space range tree. + * 1. Do not condense if the size of the space map object would dramatically + * increase as a result of writing out the free space range tree. * - * 2. The minimal on-disk space map representation is zfs_condense_pct/100 - * times the size than the free space range tree representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). + * 2. Condense if the on on-disk space map representation is at least + * zfs_condense_pct/100 times the size of the optimal representation + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * - * 3. The on-disk size of the space map should actually decrease. + * 3. Do not condense if the on-disk size of the space map does not actually + * decrease. 
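The criteria above reduce to the small predicate evaluated at the end of metaslab_should_condense(); a worked example with made-up sizes (the 110% figure matches the zfs_condense_pct example above, the block-threshold multiplier and record size are illustrative):

#include <stdint.h>
#include <stdio.h>

static int
should_condense(uint64_t object_size, uint64_t optimal_size,
    uint64_t record_size, uint64_t condense_pct, uint64_t block_threshold)
{
        /* on-disk size must beat both the percentage and block thresholds */
        return (object_size >= optimal_size * condense_pct / 100 &&
            object_size > block_threshold * record_size);
}

int
main(void)
{
        uint64_t optimal = 1ULL << 20;          /* optimal = 1MB -> cutoff 1.1MB */
        uint64_t record = 128ULL << 10;         /* e.g. a 128K space map block */

        /* 1.0MB on disk: below the 1.1MB cutoff, do not condense (prints 0) */
        printf("1.0MB on disk: %d\n",
            should_condense(1ULL << 20, optimal, record, 110, 4));
        /* 1.5MB on disk: past both thresholds, condense (prints 1) */
        printf("1.5MB on disk: %d\n",
            should_condense((3ULL << 20) / 2, optimal, record, 110, 4));
        return (0);
}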
* * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. @@ -2624,30 +2864,11 @@ metaslab_should_condense(metaslab_t *msp) space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; - uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); - - if (zfs_condense_never != 0) - return (B_FALSE); - - /* - * Allocations and frees in early passes are generally more space - * efficient (in terms of blocks described in space map entries) - * than the ones in later passes (e.g. we don't compress after - * sync pass 5) and condensing a metaslab multiple times in a txg - * could degrade performance. - * - * Thus we prefer condensing each metaslab at most once every txg at - * the earliest sync pass possible. If a metaslab is eligible for - * condensing again after being considered for condensing within the - * same txg, it will hopefully be dirty in the next txg where it will - * be condensed at an earlier pass. - */ - if (msp->ms_condense_checked_txg == current_txg) - return (B_FALSE); - msp->ms_condense_checked_txg = current_txg; + ASSERT(sm != NULL); + ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for @@ -2657,96 +2878,343 @@ metaslab_should_condense(metaslab_t *msp) msp->ms_condense_wanted) return (B_TRUE); - uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); + uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); - dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); - uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. - * The minimized form consists of a small number of allocations followed by - * the entries of the free range tree. + * The minimized form consists of a small number of allocations followed + * by the entries of the free range tree (ms_allocatable). The condensed + * spacemap contains all the entries of previous TXGs (including those in + * the pool-wide log spacemaps; thus this is effectively a superset of + * metaslab_flush()), but this TXG's entries still need to be written. */ static void -metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + ASSERT(msp->ms_sm != NULL); + + /* + * In order to condense the space map, we need to change it so it + * only describes which segments are currently allocated and free. + * + * All the current free space resides in the ms_allocatable, all + * the ms_defer trees, and all the ms_allocating trees. We ignore + * ms_freed because it is empty because we're in sync pass 1. We + * ignore ms_freeing because these changes are not yet reflected + * in the spacemap (they will be written later this txg). 
+ * + * So to truncate the space map to represent all the entries of + * previous TXGs we do the following: + * + * 1] We create a range tree (condense tree) that is 100% allocated. + * 2] We remove from it all segments found in the ms_defer trees + * as those segments are marked as free in the original space + * map. We do the same with the ms_allocating trees for the same + * reason. Removing these segments should be a relatively + * inexpensive operation since we expect these trees to have a + * small number of nodes. + * 3] We vacate any unflushed allocs as they should already exist + * in the condense tree. Then we vacate any unflushed frees as + * they should already be part of ms_allocatable. + * 4] At this point, we would ideally like to remove all segments + * in the ms_allocatable tree from the condense tree. This way + * we would write all the entries of the condense tree as the + * condensed space map, which would only contain allocated + * segments with everything else assumed to be freed. + * + * Doing so can be prohibitively expensive as ms_allocatable can + * be large, and therefore computationally expensive to subtract + * from the condense_tree. Instead we first sync out the + * condense_tree and then the ms_allocatable, in the condensed + * space map. While this is not optimal, it is typically close to + * optimal and more importantly much cheaper to compute. + * + * 5] Finally, as both of the unflushed trees were written to our + * new and condensed metaslab space map, we basically flushed + * all the unflushed changes to disk, thus we call + * metaslab_flush_update(). + */ + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, - msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), + spa->spa_name, space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - /* - * Create an range tree that is 100% allocated. We remove segments - * that have been freed in this txg, any deferred frees that exist, - * and any allocation in the future. Removing segments should be - * a relatively inexpensive operation since we expect these trees to - * have a small number of nodes. - */ condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); - range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); - for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + /* - * We're about to drop the metaslab's lock thus allowing - * other consumers to change it's content. 
Set the - * metaslab's ms_condensing flag to ensure that - * allocations on this metaslab do not occur while we're - * in the middle of committing it to disk. This is only critical - * for ms_allocatable as all other range trees use per txg + * We're about to drop the metaslab's lock thus allowing other + * consumers to change its content. Set the metaslab's ms_condensing + * flag to ensure that allocations on this metaslab do not occur + * while we're in the middle of committing it to disk. This is only + * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); - space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); + uint64_t object = space_map_object(msp->ms_sm); + space_map_truncate(sm, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? + zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); /* - * While we would ideally like to create a space map representation - * that consists only of allocation records, doing so can be - * prohibitively expensive because the in-core free tree can be - * large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we sync out two trees, a cheap - * allocation only tree followed by the in-core free tree. While not - * optimal, this is typically close to optimal, and much cheaper to - * compute. + * space_map_truncate() may have reallocated the spacemap object. + * If so, update the vdev_ms_array. + */ + if (space_map_object(msp->ms_sm) != object) { + object = space_map_object(msp->ms_sm); + dmu_write(spa->spa_meta_objset, + msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &object, tx); + } + + /* + * Note: + * When the log space map feature is enabled, each space map will + * always have ALLOCS followed by FREES for each sync pass. This is + * typically true even when the log space map feature is disabled, + * except from the case where a metaslab goes through metaslab_sync() + * and gets condensed. In that case the metaslab's space map will have + * ALLOCS followed by FREES (due to condensing) followed by ALLOCS + * followed by FREES (due to space_map_write() in metaslab_sync()) for + * sync pass 1. */ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); + msp->ms_condensing = B_FALSE; + metaslab_flush_update(msp, tx); +} + +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc.) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. 
+ */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op. + */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + + VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); + + /* update metaslab's position in our flushing tree */ + uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + /* update metaslab counts of spa_log_sm_t nodes */ + spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_sm_increment_current_mscount(spa); + + /* cleanup obsolete logs if any */ + uint64_t log_blocks_before = spa_log_sm_nblocks(spa); + spa_cleanup_old_sm_logs(spa, tx); + uint64_t log_blocks_after = spa_log_sm_nblocks(spa); + VERIFY3U(log_blocks_after, <=, log_blocks_before); + + /* update log space map summary */ + uint64_t blocks_gone = log_blocks_before - log_blocks_after; + spa_log_summary_add_flushed_metaslab(spa); + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_summary_decrement_blkcount(spa, blocks_gone); +} + +boolean_t +metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); + + /* + * There is nothing wrong with flushing the same metaslab twice, as + * this codepath should work on that case. However, the current + * flushing scheme makes sure to avoid this situation as we would be + * making all these calls without having anything meaningful to write + * to disk. We assert this behavior here. + */ + ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); + + /* + * We can not flush while loading, because then we would + * not load the ms_unflushed_{allocs,frees}. + */ + if (msp->ms_loading) + return (B_FALSE); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + /* + * Metaslab condensing is effectively flushing. Therefore if the + * metaslab can be condensed we can just condense it instead of + * flushing it. + * + * Note that metaslab_condense() does call metaslab_flush_update() + * so we can just return immediately after condensing. We also + * don't need to care about setting ms_flushing or broadcasting + * ms_flush_cv, even if we temporarily drop the ms_lock in + * metaslab_condense(), as the metaslab is already loaded. + */ + if (msp->ms_loaded && metaslab_should_condense(msp)) { + metaslab_group_t *mg = msp->ms_group; + + /* + * For all histogram operations below refer to the + * comments of metaslab_sync() where we follow a + * similar procedure. 
+ */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + metaslab_condense(msp, tx); + + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); + ASSERT(range_tree_is_empty(msp->ms_freed)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defer[t], tx); + } + metaslab_aux_histograms_update(msp); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + + /* + * Since we recreated the histogram (and potentially + * the ms_sm too while condensing) ensure that the + * weight is updated too because we are not guaranteed + * that this metaslab is dirty and will go through + * metaslab_sync_done(). + */ + metaslab_recalculate_weight_and_sort(msp); + return (B_TRUE); + } + + msp->ms_flushing = B_TRUE; + uint64_t sm_len_before = space_map_length(msp->ms_sm); + + mutex_exit(&msp->ms_lock); + space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, + SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + + uint64_t sm_len_after = space_map_length(msp->ms_sm); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " + "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + (sm_len_after - sm_len_before)); + } + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + metaslab_flush_update(msp, tx); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + msp->ms_flushing = B_FALSE; + cv_broadcast(&msp->ms_flush_cv); + return (B_TRUE); } /* @@ -2761,7 +3229,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; - uint64_t object = space_map_object(msp->ms_sm); ASSERT(!vd->vdev_ishole); @@ -2808,25 +3275,53 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - if (msp->ms_sm == NULL) { - uint64_t new_object; + /* + * Generate a log space map if one doesn't exist already. + */ + spa_generate_syncing_log_sm(spa, tx); - new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); + if (msp->ms_sm == NULL) { + uint64_t new_object = space_map_alloc(mos, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 
+ zfs_metaslab_sm_blksz_with_log : + zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &new_object, tx); + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); - ASSERT(msp->ms_sm != NULL); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } + if (metaslab_unflushed_txg(msp) == 0 && + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT(spa_syncing_log_sm(spa) != NULL); + + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa); + + ASSERT(msp->ms_sm != NULL); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } + if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, - vdev_standard_sm_blksz, tx); + zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, @@ -2855,10 +3350,39 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && metaslab_should_condense(msp)) { - metaslab_condense(msp, txg, tx); + if (spa->spa_sync_pass == 1 && msp->ms_loaded && + metaslab_should_condense(msp)) + metaslab_condense(msp, tx); + + /* + * We'll be going to disk to sync our space accounting, thus we + * drop the ms_lock during that time so allocations coming from + * open-context (ZIL) for future TXGs do not block. + */ + mutex_exit(&msp->ms_lock); + space_map_t *log_sm = spa_syncing_log_sm(spa); + if (log_sm != NULL) { + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + + space_map_write(log_sm, alloctree, SM_ALLOC, + vd->vdev_id, tx); + space_map_write(log_sm, msp->ms_freeing, SM_FREE, + vd->vdev_id, tx); + mutex_enter(&msp->ms_lock); + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_remove_xor_add(alloctree, + msp->ms_unflushed_frees, msp->ms_unflushed_allocs); + range_tree_remove_xor_add(msp->ms_freeing, + msp->ms_unflushed_allocs, msp->ms_unflushed_frees); + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(msp); } else { - mutex_exit(&msp->ms_lock); + ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, @@ -2878,7 +3402,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the - * ms_lock while writing to the checkpoint space map. + * ms_lock while writing to the checkpoint space map, for the + * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, @@ -2946,6 +3471,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * and instead will just swap the pointers for freeing and freed. 
* We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. + * + * Keep in mind that even if we are currently using a log spacemap + * we want current frees to end up in the ms_allocatable (but not + * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); @@ -2965,11 +3494,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); - if (object != space_map_object(msp->ms_sm)) { - object = space_map_object(msp->ms_sm); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &object, tx); - } + /* + * Verify that the space map object ID has been recorded in the + * vdev_ms_array. + */ + uint64_t object; + VERIFY0(dmu_read(mos, vd->vdev_ms_array, + msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); + VERIFY3U(object, ==, space_map_object(msp->ms_sm)); + mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } @@ -3010,14 +3543,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defer[t] == NULL); - + ASSERT3P(msp->ms_defer[t], ==, NULL); msp->ms_defer[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); + msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_unflushed_frees, ==, NULL); + msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); @@ -3034,21 +3571,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - range_tree_space(msp->ms_freed); + if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } - metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); - /* - * If there's a metaslab_load() in progress, wait for it to complete - * so that we have a consistent view of the in-core space map. - */ - metaslab_load_wait(msp); + if (spa_syncing_log_sm(spa) == NULL) { + /* + * If there's a metaslab_load() in progress and we don't have + * a log space map, it means that we probably wrote to the + * metaslab's space map. If this is the case, we need to + * make sure that we wait for the load to complete so that we + * have a consistent view at the in-core side of the metaslab. + */ + metaslab_load_wait(msp); + } else { + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + } /* * When auto-trimming is enabled, free ranges which are added to @@ -3383,6 +3927,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; + ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); @@ -4578,12 +5123,23 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) offset, size); } - range_tree_verify_not_present(msp->ms_trim, offset, size); + /* + * Check all segments that currently exist in the freeing pipeline. 
+ * + * It would intuitively make sense to also check the current allocating + * tree since metaslab_unalloc_dva() exists for extents that are + * allocated and freed in the same sync pass withing the same txg. + * Unfortunately there are places (e.g. the ZIL) where we allocate a + * segment but then we free part of it within the same txg + * [see zil_sync()]. Thus, we don't call range_tree_verify() in the + * current allocating tree. + */ range_tree_verify_not_present(msp->ms_freeing, offset, size); range_tree_verify_not_present(msp->ms_checkpointing, offset, size); range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify_not_present(msp->ms_defer[j], offset, size); + range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } @@ -4692,3 +5248,54 @@ metaslab_enable(metaslab_t *msp, boolean_t sync) mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); } + +static void +metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) +{ + vdev_t *vd = ms->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + metaslab_unflushed_phys_t entry = { + .msp_unflushed_txg = metaslab_unflushed_txg(ms), + }; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object); + if (err == ENOENT) { + object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object, tx)); + } else { + VERIFY0(err); + } + + dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, + &entry, tx); +} + +void +metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = ms->ms_group->mg_vd->vdev_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ms->ms_unflushed_txg = txg; + metaslab_update_ondisk_flush_data(ms, tx); +} + +uint64_t +metaslab_unflushed_txg(metaslab_t *ms) +{ + return (ms->ms_unflushed_txg); +} diff --git a/usr/src/uts/common/fs/zfs/range_tree.c b/usr/src/uts/common/fs/zfs/range_tree.c index fc705e3796..0ce251126b 100644 --- a/usr/src/uts/common/fs/zfs/range_tree.c +++ b/usr/src/uts/common/fs/zfs/range_tree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -579,10 +579,10 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { - range_seg_t *rs; - - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) + for (range_seg_t *rs = avl_first(&rt->rt_root); rs; + rs = AVL_NEXT(&rt->rt_root, rs)) { func(arg, rs->rs_start, rs->rs_end - rs->rs_start); + } } range_seg_t * @@ -597,6 +597,12 @@ range_tree_space(range_tree_t *rt) return (rt->rt_space); } +uint64_t +range_tree_numsegs(range_tree_t *rt) +{ + return ((rt == NULL) ? 0 : avl_numnodes(&rt->rt_root)); +} + /* Generic range tree functions for maintaining segments in an AVL tree. 
*/ void rt_avl_create(range_tree_t *rt, void *arg) @@ -668,3 +674,73 @@ range_tree_span(range_tree_t *rt) { return (range_tree_max(rt) - range_tree_min(rt)); } + +/* + * Remove any overlapping ranges between the given segment [start, end) + * from removefrom. Add non-overlapping leftovers to addto. + */ +void +range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + range_tree_t *removefrom, range_tree_t *addto) +{ + avl_index_t where; + range_seg_t starting_rs = { + .rs_start = start, + .rs_end = start + 1 + }; + + range_seg_t *curr = avl_find(&removefrom->rt_root, + &starting_rs, &where); + + if (curr == NULL) + curr = avl_nearest(&removefrom->rt_root, where, AVL_AFTER); + + range_seg_t *next; + for (; curr != NULL; curr = next) { + next = AVL_NEXT(&removefrom->rt_root, curr); + + if (start == end) + return; + VERIFY3U(start, <, end); + + /* there is no overlap */ + if (end <= curr->rs_start) { + range_tree_add(addto, start, end - start); + return; + } + + uint64_t overlap_start = MAX(curr->rs_start, start); + uint64_t overlap_end = MIN(curr->rs_end, end); + uint64_t overlap_size = overlap_end - overlap_start; + ASSERT3S(overlap_size, >, 0); + range_tree_remove(removefrom, overlap_start, overlap_size); + + if (start < overlap_start) + range_tree_add(addto, start, overlap_start - start); + + start = overlap_end; + } + VERIFY3P(curr, ==, NULL); + + if (start != end) { + VERIFY3U(start, <, end); + range_tree_add(addto, start, end - start); + } else { + VERIFY3U(start, ==, end); + } +} + +/* + * For each entry in rt, if it exists in removefrom, remove it + * from removefrom. Otherwise, add it to addto. + */ +void +range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, + range_tree_t *addto) +{ + for (range_seg_t *rs = avl_first(&rt->rt_root); rs; + rs = AVL_NEXT(&rt->rt_root, rs)) { + range_tree_remove_xor_add_segment(rs->rs_start, rs->rs_end, + removefrom, addto); + } +} diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 0afcffad45..32bab905e7 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -1356,19 +1356,90 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (0); } +static boolean_t +spa_should_flush_logs_on_unload(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return (B_FALSE); + + if (!spa_writeable(spa)) + return (B_FALSE); + + if (!spa->spa_sync_on) + return (B_FALSE); + + if (spa_state(spa) != POOL_STATE_EXPORTED) + return (B_FALSE); + + if (zfs_keep_log_spacemaps_at_export) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Opens a transaction that will set the flag that will instruct + * spa_sync to attempt to flush all the metaslabs for that txg. 
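range_tree_remove_xor_add_segment() above walks the removefrom tree and, for each overlapping segment, removes the overlap from removefrom and hands any left-hand leftover to addto before continuing from the end of the overlap. A reduced sketch of that per-segment step against a single removefrom segment, with plain integers standing in for range trees (xor_add_one() is a hypothetical name):

#include <assert.h>
#include <stdint.h>

#define MAX(a, b)       ((a) > (b) ? (a) : (b))
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

static uint64_t
xor_add_one(uint64_t start, uint64_t end, uint64_t rs, uint64_t re,
    uint64_t *left_leftover_len, uint64_t *removed_len)
{
        uint64_t overlap_start = MAX(rs, start);
        uint64_t overlap_end = MIN(re, end);

        assert(overlap_start < overlap_end);    /* caller guarantees overlap */
        *removed_len = overlap_end - overlap_start;
        *left_leftover_len = (start < overlap_start) ?
            overlap_start - start : 0;
        return (overlap_end);   /* new "start" for the rest of [start, end) */
}

int
main(void)
{
        uint64_t leftover, removed;

        /* [10, 50) against a removefrom segment [20, 40): */
        uint64_t next = xor_add_one(10, 50, 20, 40, &leftover, &removed);
        assert(leftover == 10); /* [10, 20) goes to addto */
        assert(removed == 20);  /* [20, 40) is removed from removefrom */
        assert(next == 40);     /* [40, 50) is handled by the next iteration */
        return (0);
}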
+ */ +static void +spa_unload_log_sm_flush_all(spa_t *spa) +{ + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + ASSERT3U(spa->spa_log_flushall_txg, ==, 0); + spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); + + dmu_tx_commit(tx); + txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); +} + +static void +spa_unload_log_sm_metadata(spa_t *spa) +{ + void *cookie = NULL; + spa_log_sm_t *sls; + + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, + &cookie)) != NULL) { + VERIFY0(sls->sls_mscount); + kmem_free(sls, sizeof (spa_log_sm_t)); + } + + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_head(&spa->spa_log_summary)) { + VERIFY0(e->lse_mscount); + list_remove(&spa->spa_log_summary, e); + kmem_free(e, sizeof (log_summary_entry_t)); + } + + spa->spa_unflushed_stats.sus_nblocks = 0; + spa->spa_unflushed_stats.sus_memused = 0; + spa->spa_unflushed_stats.sus_blocklimit = 0; +} + /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { - int i; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_load_note(spa, "UNLOADING"); /* + * If the log space map feature is enabled and the pool is getting + * exported (but not destroyed), we want to spend some time flushing + * as many metaslabs as we can in an attempt to destroy log space + * maps and save import time. + */ + if (spa_should_flush_logs_on_unload(spa)) + spa_unload_log_sm_flush_all(spa); + + /* * Stop async tasks. */ spa_async_suspend(spa); @@ -1389,16 +1460,15 @@ spa_unload(spa_t *spa) } /* - * Even though vdev_free() also calls vdev_metaslab_fini, we need - * to call it earlier, before we wait for async i/o to complete. - * This ensures that there is no async metaslab prefetching, by - * calling taskq_wait(mg_taskq). + * This ensures that there is no async metaslab prefetching + * while we attempt to unload the spa. */ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) - vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, spa); + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; + if (vc->vdev_mg != NULL) + taskq_wait(vc->vdev_mg->mg_taskq); + } } if (spa->spa_mmp.mmp_thread) @@ -1452,13 +1522,14 @@ spa_unload(spa_t *spa) } ddt_unload(spa); + spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); - for (i = 0; i < spa->spa_spares.sav_count; i++) + for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, @@ -1471,7 +1542,7 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } @@ -3584,6 +3655,13 @@ spa_ld_load_vdev_metadata(spa_t *spa) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } + error = spa_ld_log_spacemaps(spa); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
*/ @@ -5870,7 +5948,7 @@ spa_reset(char *pool) int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg, id; + uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -5945,19 +6023,9 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } for (int c = 0; c < vd->vdev_children; c++) { - - /* - * Set the vdev id to the first hole, if one exists. - */ - for (id = 0; id < rvd->vdev_children; id++) { - if (rvd->vdev_child[id]->vdev_ishole) { - vdev_free(rvd->vdev_child[id]); - break; - } - } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); - tvd->vdev_id = id; + tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } @@ -7601,6 +7669,18 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) if (spa_sync_pass(spa) != 1) return; + /* + * Note: + * If the log space map feature is active, we stop deferring + * frees to the next TXG and therefore running this function + * would be considered a no-op as spa_deferred_bpobj should + * not have any entries. + * + * That said we run this function anyway (instead of returning + * immediately) for the edge-case scenario where we just + * activated the log space map feature in this TXG but we have + * deferred frees from the previous TXG. + */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, spa_free_sync_cb, zio, tx), ==, 0); @@ -8193,7 +8273,14 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - if (pass < zfs_sync_pass_deferred_free) { + if (pass < zfs_sync_pass_deferred_free || + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + /* + * If the log space map feature is active we don't + * care about deferred frees and the deferred bpobj + * as the log space map should effectively have the + * same results (i.e. appending only to one object). + */ spa_sync_frees(spa, free_bpl, tx); } else { /* @@ -8210,6 +8297,8 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) svr_sync(spa, tx); spa_sync_upgrades(spa, tx); + spa_flush_metaslabs(spa, tx); + vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) @@ -8456,6 +8545,7 @@ spa_sync(spa_t *spa, uint64_t txg) while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); + spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); @@ -8650,6 +8740,21 @@ spa_has_active_shared_spare(spa_t *spa) return (B_FALSE); } +uint64_t +spa_total_metaslabs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t m = 0; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + if (!vdev_is_concrete(vd)) + continue; + m += vd->vdev_ms_count; + } + return (m); +} + sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { diff --git a/usr/src/uts/common/fs/zfs/spa_log_spacemap.c b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c new file mode 100644 index 0000000000..ffa2c60563 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c @@ -0,0 +1,1285 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + */ + +#include <sys/dmu_objset.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/spa_log_spacemap.h> +#include <sys/vdev_impl.h> +#include <sys/zap.h> + +/* + * Log Space Maps + * + * Log space maps are an optimization in ZFS metadata allocations for pools + * whose workloads are primarily random-writes. Random-write workloads are also + * typically random-free, meaning that they are freeing from locations scattered + * throughout the pool. This means that each TXG we will have to append some + * FREE records to almost every metaslab. With log space maps, we hold their + * changes in memory and log them altogether in one pool-wide space map on-disk + * for persistence. As more blocks are accumulated in the log space maps and + * more unflushed changes are accounted in memory, we flush a selected group + * of metaslabs every TXG to relieve memory pressure and potential overheads + * when loading the pool. Flushing a metaslab to disk relieves memory as we + * flush any unflushed changes from memory to disk (i.e. the metaslab's space + * map) and saves import time by making old log space maps obsolete and + * eventually destroying them. [A log space map is said to be obsolete when all + * its entries have made it to their corresponding metaslab space maps]. + * + * == On disk data structures used == + * + * - The pool has a new feature flag and a new entry in the MOS. The feature + * is activated when we create the first log space map and remains active + * for the lifetime of the pool. The new entry in the MOS Directory [refer + * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value + * pairs are of the form <key: txg, value: log space map object for that txg>. + * This entry is our on-disk reference of the log space maps that exist in + * the pool for each TXG and it is used during import to load all the + * metaslab unflushed changes in memory. To see how this structure is first + * created and later populated refer to spa_generate_syncing_log_sm(). To see + * how it is used during import time refer to spa_ld_log_sm_metadata(). + * + * - Each vdev has a new entry in its vdev_top_zap (see field + * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of + * each metaslab in this vdev. This field is the on-disk counterpart of the + * in-memory field ms_unflushed_txg which tells us from which TXG and onwards + * the metaslab haven't had its changes flushed. During import, we use this + * to ignore any entries in the space map log that are for this metaslab but + * from a TXG before msp_unflushed_txg. At that point, we also populate its + * in-memory counterpart and from there both fields are updated every time + * we flush that metaslab. + * + * - A space map is created every TXG and, during that TXG, it is used to log + * all incoming changes (the log space map). When created, the log space map + * is referenced in memory by spa_syncing_log_sm and its object ID is inserted + * to the space map ZAP mentioned above. 
The log space map is closed at the
+ * end of the TXG and will be destroyed when it becomes fully obsolete. We
+ * know when a log space map has become obsolete by looking at the oldest
+ * (and smallest) ms_unflushed_txg in the pool. If that value is bigger
+ * than the log space map's TXG, then it means that no metaslab is missing
+ * the changes from that log and we can therefore destroy it.
+ * [see spa_cleanup_old_sm_logs()].
+ *
+ * == Important in-memory structures ==
+ *
+ * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
+ * the pool by their ms_unflushed_txg field. It is primarily used for three
+ * reasons. First of all, it is used during flushing where we try to flush
+ * metaslabs in order from the oldest-flushed to the most recently flushed
+ * every TXG. Secondly, it helps us look up the ms_unflushed_txg of the
+ * oldest flushed metaslab to distinguish which log space maps have become
+ * obsolete and which ones are still relevant. Finally, it tells us which
+ * metaslabs have unflushed changes in a pool where this feature was just
+ * enabled, as we don't immediately add all of the pool's metaslabs but we
+ * add them over time as they go through metaslab_sync(). The reason that
+ * we do that is to ease these pools into the behavior of the flushing
+ * algorithm (described later on).
+ *
+ * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
+ * counterpart of the space map ZAP mentioned above. It's an AVL tree whose
+ * nodes represent the log space maps in the pool. This in-memory
+ * representation sorts the log space maps by the TXG in which they were
+ * created (which is also the TXG of their unflushed changes). It also
+ * contains the following extra information for each space map:
+ * [1] The number of metaslabs that were last flushed on that TXG. This is
+ * important because if that counter is zero and this is the oldest
+ * log then it means that it is also obsolete.
+ * [2] The number of blocks of that space map. This field is used by the
+ * block heuristic of our flushing algorithm (described later on).
+ * It represents how many blocks of metadata changes ZFS had to write
+ * to disk for that TXG.
+ *
+ * - The per-spa field spa_log_summary is a list of entries that summarizes
+ * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
+ * AVL tree mentioned above. The reason this exists is that our flushing
+ * algorithm (described later) tries to estimate how many metaslabs to flush
+ * in each TXG by iterating over all the log space maps and looking at their
+ * block counts. Summarizing that information means that we don't have to
+ * iterate through each space map, minimizing the runtime overhead that the
+ * flushing algorithm would otherwise induce in syncing context. In terms of
+ * implementation the log summary is used as a queue:
+ * * we modify or pop entries from its head when we flush metaslabs
+ * * we modify or append entries to its tail when we sync changes.
+ *
+ * - Each metaslab has two new range trees that hold its unflushed changes,
+ * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
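[Editor's note: the queue behavior of the log summary described above can be modeled in isolation. The following is a minimal, illustrative userland sketch and is not part of this commit; the entry_t list, the summary_append()/summary_drop_blocks() helpers, and the fixed 100-block "entry is full" threshold are simplifications standing in for the real log_summary_entry_t, summary_add_data(), spa_log_summary_decrement_blkcount(), and summary_entry_is_full() logic shown later in this file.]

/* Illustrative model of the log summary queue; not part of this commit. */
#include <stdio.h>
#include <stdlib.h>

typedef struct entry {
	unsigned long long start_txg;	/* cf. lse_start */
	unsigned long long mscount;	/* cf. lse_mscount */
	unsigned long long blkcount;	/* cf. lse_blkcount */
	struct entry *next;
} entry_t;

static entry_t *head, *tail;

/* Tail side: account incoming log blocks (and flushed metaslabs) for a TXG. */
static void
summary_append(unsigned long long txg, unsigned long long ms,
    unsigned long long blk)
{
	if (tail == NULL || tail->blkcount >= 100) {	/* toy "entry is full" */
		entry_t *e = calloc(1, sizeof (*e));
		e->start_txg = txg;
		if (tail == NULL)
			head = e;
		else
			tail->next = e;
		tail = e;
	}
	tail->mscount += ms;
	tail->blkcount += blk;
}

/* Head side: drop blocks freed when old log space maps are destroyed. */
static void
summary_drop_blocks(unsigned long long gone)
{
	while (head != NULL && gone > 0) {
		if (head->mscount == 0 && gone >= head->blkcount) {
			entry_t *e = head;	/* entry fully obsolete */
			gone -= e->blkcount;
			head = e->next;
			if (head == NULL)
				tail = NULL;
			free(e);
		} else {
			if (gone > head->blkcount)
				gone = head->blkcount;
			head->blkcount -= gone;	/* partially consumed entry */
			gone = 0;
		}
	}
}

int
main(void)
{
	for (unsigned long long txg = 1; txg <= 5; txg++)
		summary_append(txg, 0, 60);	/* 60 log blocks per TXG */
	summary_drop_blocks(120);		/* two old logs destroyed */
	for (entry_t *e = head; e != NULL; e = e->next)
		printf("summary entry: txg %llu, %llu blocks\n",
		    e->start_txg, e->blkcount);
	return (0);
}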
+ *
+ * == Flushing algorithm ==
+ *
+ * The decision of how many metaslabs to flush on a given TXG is guided by
+ * two heuristics:
+ *
+ * [1] The memory heuristic -
+ * We keep track of the memory used by the unflushed trees from all the
+ * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
+ * stays below a certain threshold which is determined by an arbitrary hard
+ * limit and an arbitrary percentage of the system's memory [see
+ * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
+ * unflushed changes is passing that threshold, we flush metaslabs, which
+ * empties their unflushed range trees, reducing the memory used.
+ *
+ * [2] The block heuristic -
+ * We try to keep the total number of blocks in the log space maps in check
+ * so the log doesn't grow indefinitely and we don't induce a lot of overhead
+ * when loading the pool. At the same time we don't want to flush a lot of
+ * metaslabs too often as this would defeat the purpose of the log space map.
+ * As a result we set a limit on the number of blocks that we think it is
+ * acceptable for the log space maps to have and try not to cross it.
+ * [see sus_blocklimit from spa_unflushed_stats].
+ *
+ * In order to stay below the block limit every TXG we have to estimate how
+ * many metaslabs we need to flush based on the current rate of incoming blocks
+ * and our history of log space map blocks. The main idea here is to answer
+ * the question of how many metaslabs we need to flush in order to get rid of
+ * at least X log space map blocks. We can answer this question by iterating
+ * backwards from the oldest log space map to the newest one and looking at
+ * their metaslab and block counts. At this point the log summary mentioned
+ * above comes in handy as it reduces the amount of data that we have to
+ * iterate over (even though it may reduce the preciseness of our estimates due
+ * to its aggregation of data). So with that in mind, we project the incoming
+ * rate of the current TXG into the future and attempt to approximate how many
+ * metaslabs we would need to flush from now on in order to avoid exceeding our
+ * block limit at different points in the future (granted that we would keep
+ * flushing the same number of metaslabs for every TXG). Then we take the
+ * maximum number from all these estimates to be on the safe side. For the
+ * exact implementation details of the algorithm refer to
+ * spa_estimate_metaslabs_to_flush.
+ */
+
+/*
+ * This is used as the block size for the space maps used for the
+ * log space map feature. These space maps benefit from a bigger
+ * block size as we expect to be writing a lot of data to them at
+ * once.
+ */
+unsigned long zfs_log_sm_blksz = 1ULL << 17;
+
+/*
+ * Percentage of the overall system's memory that ZFS allows to be
+ * used for unflushed changes (e.g. the sum of size of all the nodes
+ * in the unflushed trees).
+ *
+ * Note that this value is calculated over 1000000 for finer granularity
+ * (thus the _ppm suffix; reads as "parts per million"). As an example,
+ * the default of 1000 allows 0.1% of memory to be used.
+ */
+unsigned long zfs_unflushed_max_mem_ppm = 1000;
+
+/*
+ * Specific hard-limit in memory that ZFS allows to be used for
+ * unflushed changes.
+ */
+unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
+
+/*
+ * The following tunable determines the number of blocks that can be used for
+ * the log space maps. It is expressed as a percentage of the total number of
+ * metaslabs in the pool (i.e.
the default of 400 means that the number of log
+ * blocks is capped at 4 times the number of metaslabs).
+ *
+ * This value exists to tune our flushing algorithm, with higher values
+ * flushing metaslabs less often (doing fewer I/Os) per TXG versus lower values
+ * flushing metaslabs more aggressively with the upside of saving overheads
+ * when loading the pool. Another factor in this tradeoff is that flushing
+ * less often can potentially lead to better utilization of the metaslab space
+ * map's block size as we accumulate more changes per flush.
+ *
+ * Given that this tunable indirectly controls the flush rate (metaslabs
+ * flushed per txg), it makes sense to express it as a percentage of the
+ * number of metaslabs in the pool.
+ *
+ * As a rule of thumb we default this tunable to 400% based on the following:
+ *
+ * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
+ * it is reasonable to expect that the amount of obsolete entries changes
+ * linearly from txg to txg (e.g. the oldest log should have the most
+ * obsolete entries, and the most recent one the least). With this we could
+ * say that, at any given time, about half of the entries in the whole space
+ * map log are obsolete. Thus for every two entries for a metaslab in the
+ * log space map, only one of them is valid and actually makes it to the
+ * metaslab's space map.
+ * [factor of 2]
+ * 2] Each entry in the log space map is guaranteed to be two words while
+ * entries in metaslab space maps are generally single-word.
+ * [an extra factor of 2 - 400% overall]
+ * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
+ * account any consolidation of segments from the log space map to the
+ * unflushed range trees nor their history (e.g. a segment being allocated,
+ * then freed, then allocated again means 3 log space map entries but 0
+ * metaslab space map entries). Depending on the workload, we've seen ~1.8
+ * non-obsolete log space map entries per metaslab entry, for a total of
+ * ~600%. Since most of these estimates are workload dependent, though, we
+ * default to 400% to be conservative.
+ *
+ * Thus we could say that even in the worst case of [1] and [2], the factor
+ * should end up being 4.
+ *
+ * That said, regardless of the number of metaslabs in the pool we need to
+ * provide upper and lower bounds for the log block limit.
+ * [see zfs_unflushed_log_block_{min,max}]
+ */
+unsigned long zfs_unflushed_log_block_pct = 400;
+
+/*
+ * If the number of metaslabs is small and our incoming rate is high, we could
+ * get into a situation where we are flushing all our metaslabs every TXG. Thus
+ * we always allow at least this many log blocks.
+ */
+unsigned long zfs_unflushed_log_block_min = 1000;
+
+/*
+ * If the log becomes too big, the import time of the pool can take a hit in
+ * terms of performance. Thus we have a hard limit on the size of the log in
+ * terms of blocks.
+ */
+unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+
+/*
+ * Max # of rows allowed for the log_summary. The tradeoff here is between the
+ * accuracy and stability of the flushing algorithm (longer summary) and its
+ * runtime overhead (a smaller summary is faster to traverse).
+ */
+unsigned long zfs_max_logsm_summary_length = 10;
+
+/*
+ * Tunable that sets the lower bound on the metaslabs to flush every TXG.
+ *
+ * Setting this to 0 has no effect since if the pool is idle we won't even be
+ * creating log space maps and therefore we won't be flushing.
On the other + * hand if the pool has any incoming workload our block heuristic will start + * flushing metaslabs anyway. + * + * The point of this tunable is to be used in extreme cases where we really + * want to flush more metaslabs than our adaptable heuristic plans to flush. + */ +unsigned long zfs_min_metaslabs_to_flush = 1; + +/* + * Tunable that specifies how far in the past do we want to look when trying to + * estimate the incoming log blocks for the current TXG. + * + * Setting this too high may not only increase runtime but also minimize the + * effect of the incoming rates from the most recent TXGs as we take the + * average over all the blocks that we walk + * [see spa_estimate_incoming_log_blocks]. + */ +unsigned long zfs_max_log_walking = 5; + +/* + * This tunable exists solely for testing purposes. It ensures that the log + * spacemaps are not flushed and destroyed during export in order for the + * relevant log spacemap import code paths to be tested (effectively simulating + * a crash). + */ +int zfs_keep_log_spacemaps_at_export = 0; + +static uint64_t +spa_estimate_incoming_log_blocks(spa_t *spa) +{ + ASSERT3U(spa_sync_pass(spa), ==, 1); + uint64_t steps = 0, sum = 0; + + for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); + sls != NULL && steps < zfs_max_log_walking; + sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) { + if (sls->sls_txg == spa_syncing_txg(spa)) { + /* + * skip the log created in this TXG as this would + * make our estimations inaccurate. + */ + continue; + } + sum += sls->sls_nblocks; + steps++; + } + return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0); +} + +uint64_t +spa_log_sm_blocklimit(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_blocklimit); +} + +void +spa_log_sm_set_blocklimit(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT0(spa_log_sm_blocklimit(spa)); + return; + } + + uint64_t calculated_limit = + (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; + spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, + zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); +} + +uint64_t +spa_log_sm_nblocks(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_nblocks); +} + +/* + * Ensure that the in-memory log space map structures and the summary + * have the same block and metaslab counts. 
+ */ +static void +spa_log_summary_verify_counts(spa_t *spa) +{ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0) + return; + + uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed); + + uint64_t ms_in_summary = 0, blk_in_summary = 0; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) { + ms_in_summary += e->lse_mscount; + blk_in_summary += e->lse_blkcount; + } + + uint64_t ms_in_logs = 0, blk_in_logs = 0; + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + ms_in_logs += sls->sls_mscount; + blk_in_logs += sls->sls_nblocks; + } + + VERIFY3U(ms_in_logs, ==, ms_in_summary); + VERIFY3U(ms_in_logs, ==, ms_in_avl); + VERIFY3U(blk_in_logs, ==, blk_in_summary); + VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa)); +} + +static boolean_t +summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) +{ + uint64_t blocks_per_row = MAX(1, + DIV_ROUND_UP(spa_log_sm_blocklimit(spa), + zfs_max_logsm_summary_length)); + + return (blocks_per_row <= e->lse_blkcount); +} + +/* + * Update the log summary information to reflect the fact that a metaslab + * was flushed or destroyed (e.g due to device removal or pool export/destroy). + * + * We typically flush the oldest flushed metaslab so the first (and oldest) + * entry of the summary is updated. However if that metaslab is getting loaded + * we may flush the second oldest one which may be part of an entry later in + * the summary. Moreover, if we call into this function from metaslab_fini() + * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask + * for a txg as an argument so we can locate the appropriate summary entry for + * the metaslab. + */ +void +spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) +{ + /* + * We don't track summary data for read-only pools and this function + * can be called from metaslab_fini(). In that case return immediately. + */ + if (!spa_writeable(spa)) + return; + + log_summary_entry_t *target = NULL; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_next(&spa->spa_log_summary, e)) { + if (e->lse_start > txg) + break; + target = e; + } + + if (target == NULL || target->lse_mscount == 0) { + /* + * We didn't find a summary entry for this metaslab. We must be + * at the teardown of a spa_load() attempt that got an error + * while reading the log space maps. + */ + VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); + return; + } + + target->lse_mscount--; +} + +/* + * Update the log summary information to reflect the fact that we destroyed + * old log space maps. Since we can only destroy the oldest log space maps, + * we decrement the block count of the oldest summary entry and potentially + * destroy it when that count hits 0. + * + * This function is called after a metaslab is flushed and typically that + * metaslab is the oldest flushed, which means that this function will + * typically decrement the block count of the first entry of the summary and + * potentially free it if the block count gets to zero (its metaslab count + * should be zero too at that point). 
+ * + * There are certain scenarios though that don't work exactly like that so we + * need to account for them: + * + * Scenario [1]: It is possible that after we flushed the oldest flushed + * metaslab and we destroyed the oldest log space map, more recent logs had 0 + * metaslabs pointing to them so we got rid of them too. This can happen due + * to metaslabs being destroyed through device removal, or because the oldest + * flushed metaslab was loading but we kept flushing more recently flushed + * metaslabs due to the memory pressure of unflushed changes. Because of that, + * we always iterate from the beginning of the summary and if blocks_gone is + * bigger than the block_count of the current entry we free that entry (we + * expect its metaslab count to be zero), we decrement blocks_gone and on to + * the next entry repeating this procedure until blocks_gone gets decremented + * to 0. Doing this also works for the typical case mentioned above. + * + * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by + * the first (and oldest) entry in the summary. If the first few entries of + * the summary were only accounting metaslabs from a device that was just + * removed, then the current oldest flushed metaslab could be accounted by an + * entry somewhere in the middle of the summary. Moreover flushing that + * metaslab will destroy all the log space maps older than its ms_unflushed_txg + * because they became obsolete after the removal. Thus, iterating as we did + * for scenario [1] works out for this case too. + * + * Scenario [3]: At times we decide to flush all the metaslabs in the pool + * in one TXG (either because we are exporting the pool or because our flushing + * heuristics decided to do so). When that happens all the log space maps get + * destroyed except the one created for the current TXG which doesn't have + * any log blocks yet. As log space maps get destroyed with every metaslab that + * we flush, entries in the summary are also destroyed. This brings a weird + * corner-case when we flush the last metaslab and the log space map of the + * current TXG is in the same summary entry with other log space maps that + * are older. When that happens we are eventually left with this one last + * summary entry whose blocks are gone (blocks_gone equals the entry's block + * count) but its metaslab count is non-zero (because it accounts all the + * metaslabs in the pool as they all got flushed). Under this scenario we can't + * free this last summary entry as it's referencing all the metaslabs in the + * pool and its block count will get incremented at the end of this sync (when + * we close the syncing log space map). Thus we just decrement its current + * block count and leave it alone. In the case that the pool gets exported, + * its metaslab count will be decremented over time as we call metaslab_fini() + * for all the metaslabs in the pool and the entry will be freed at + * spa_unload_log_sm_metadata(). + */ +void +spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) +{ + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_head(&spa->spa_log_summary)) { + if (e->lse_blkcount > blocks_gone) { + /* + * Assert that we stopped at an entry that is not + * obsolete. 
+ */ + ASSERT(e->lse_mscount != 0); + + e->lse_blkcount -= blocks_gone; + blocks_gone = 0; + break; + } else if (e->lse_mscount == 0) { + /* remove obsolete entry */ + blocks_gone -= e->lse_blkcount; + list_remove(&spa->spa_log_summary, e); + kmem_free(e, sizeof (log_summary_entry_t)); + } else { + /* Verify that this is scenario [3] mentioned above. */ + VERIFY3U(blocks_gone, ==, e->lse_blkcount); + + /* + * Assert that this is scenario [3] further by ensuring + * that this is the only entry in the summary. + */ + VERIFY3P(e, ==, list_tail(&spa->spa_log_summary)); + ASSERT3P(e, ==, list_head(&spa->spa_log_summary)); + + blocks_gone = e->lse_blkcount = 0; + break; + } + } + + /* + * Ensure that there is no way we are trying to remove more blocks + * than the # of blocks in the summary. + */ + ASSERT0(blocks_gone); +} + +void +spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg) +{ + spa_log_sm_t target = { .sls_txg = txg }; + spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, + &target, NULL); + + if (sls == NULL) { + /* + * We must be at the teardown of a spa_load() attempt that + * got an error while reading the log space maps. + */ + VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); + return; + } + + ASSERT(sls->sls_mscount > 0); + sls->sls_mscount--; +} + +void +spa_log_sm_increment_current_mscount(spa_t *spa) +{ + spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg); + + ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa)); + last_sls->sls_mscount++; +} + +static void +summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, + uint64_t nblocks) +{ + log_summary_entry_t *e = list_tail(&spa->spa_log_summary); + + if (e == NULL || summary_entry_is_full(spa, e)) { + e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); + e->lse_start = txg; + list_insert_tail(&spa->spa_log_summary, e); + } + + ASSERT3U(e->lse_start, <=, txg); + e->lse_mscount += metaslabs_flushed; + e->lse_blkcount += nblocks; +} + +static void +spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) +{ + summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks); +} + +void +spa_log_summary_add_flushed_metaslab(spa_t *spa) +{ + summary_add_data(spa, spa_syncing_txg(spa), 1, 0); +} + +/* + * This function attempts to estimate how many metaslabs should + * we flush to satisfy our block heuristic for the log spacemap + * for the upcoming TXGs. + * + * Specifically, it first tries to estimate the number of incoming + * blocks in this TXG. Then by projecting that incoming rate to + * future TXGs and using the log summary, it figures out how many + * flushes we would need to do for future TXGs individually to + * stay below our block limit and returns the maximum number of + * flushes from those estimates. + */ +static uint64_t +spa_estimate_metaslabs_to_flush(spa_t *spa) +{ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(spa_log_sm_blocklimit(spa) != 0); + + /* + * This variable contains the incoming rate that will be projected + * and used for our flushing estimates in the future. + */ + uint64_t incoming = spa_estimate_incoming_log_blocks(spa); + + /* + * At any point in time this variable tells us how many + * TXGs in the future we are so we can make our estimations. + */ + uint64_t txgs_in_future = 1; + + /* + * This variable tells us how much room do we have until we hit + * our limit. When it goes negative, it means that we've exceeded + * our limit and we need to flush. 
+ * + * Note that since we start at the first TXG in the future (i.e. + * txgs_in_future starts from 1) we already decrement this + * variable by the incoming rate. + */ + int64_t available_blocks = + spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; + + /* + * This variable tells us the total number of flushes needed to + * keep the log size within the limit when we reach txgs_in_future. + */ + uint64_t total_flushes = 0; + + /* Holds the current maximum of our estimates so far. */ + uint64_t max_flushes_pertxg = + MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed), + zfs_min_metaslabs_to_flush); + + /* + * For our estimations we only look as far in the future + * as the summary allows us. + */ + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) { + + /* + * If there is still room before we exceed our limit + * then keep skipping TXGs accumulating more blocks + * based on the incoming rate until we exceed it. + */ + if (available_blocks >= 0) { + uint64_t skip_txgs = (available_blocks / incoming) + 1; + available_blocks -= (skip_txgs * incoming); + txgs_in_future += skip_txgs; + ASSERT3S(available_blocks, >=, -incoming); + } + + /* + * At this point we're far enough into the future where + * the limit was just exceeded and we flush metaslabs + * based on the current entry in the summary, updating + * our available_blocks. + */ + ASSERT3S(available_blocks, <, 0); + available_blocks += e->lse_blkcount; + total_flushes += e->lse_mscount; + + /* + * Keep the running maximum of the total_flushes that + * we've done so far over the number of TXGs in the + * future that we are. The idea here is to estimate + * the average number of flushes that we should do + * every TXG so that when we are that many TXGs in the + * future we stay under the limit. + */ + max_flushes_pertxg = MAX(max_flushes_pertxg, + DIV_ROUND_UP(total_flushes, txgs_in_future)); + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, + max_flushes_pertxg); + } + return (max_flushes_pertxg); +} + +uint64_t +spa_log_sm_memused(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_memused); +} + +static boolean_t +spa_log_exceeds_memlimit(spa_t *spa) +{ + if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt) + return (B_TRUE); + + uint64_t system_mem_allowed = ((physmem * PAGESIZE) * + zfs_unflushed_max_mem_ppm) / 1000000; + if (spa_log_sm_memused(spa) > system_mem_allowed) + return (B_TRUE); + + return (B_FALSE); +} + +boolean_t +spa_flush_all_logs_requested(spa_t *spa) +{ + return (spa->spa_log_flushall_txg != 0); +} + +void +spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + + if (spa_sync_pass(spa) != 1) + return; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + /* + * If we don't have any metaslabs with unflushed changes + * return immediately. + */ + if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0) + return; + + /* + * During SPA export we leave a few empty TXGs to go by [see + * spa_final_dirty_txg() to understand why]. For this specific + * case, it is important to not flush any metaslabs as that + * would dirty this TXG. + * + * That said, during one of these dirty TXGs that is less or + * equal to spa_final_dirty(), spa_unload() will request that + * we try to flush all the metaslabs for that TXG before + * exporting the pool, thus we ensure that we didn't get a + * request of flushing everything before we attempt to return + * immediately. 
+ */ + if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && + !spa_flush_all_logs_requested(spa)) + return; + + /* + * We need to generate a log space map before flushing because this + * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg) + * for this TXG's flushed metaslab count (aka sls_mscount which is + * manipulated in many ways down the metaslab_flush() codepath). + * + * That is not to say that we may generate a log space map when we + * don't need it. If we are flushing metaslabs, that means that we + * were going to write changes to disk anyway, so even if we were + * not flushing, a log space map would have been created anyway in + * metaslab_sync(). + */ + spa_generate_syncing_log_sm(spa, tx); + + /* + * This variable tells us how many metaslabs we want to flush based + * on the block-heuristic of our flushing algorithm (see block comment + * of log space map feature). We also decrement this as we flush + * metaslabs and attempt to destroy old log space maps. + */ + uint64_t want_to_flush; + if (spa_flush_all_logs_requested(spa)) { + ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); + want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed); + } else { + want_to_flush = spa_estimate_metaslabs_to_flush(spa); + } + + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, + want_to_flush); + + /* Used purely for verification purposes */ + uint64_t visited = 0; + + /* + * Ideally we would only iterate through spa_metaslabs_by_flushed + * using only one variable (curr). We can't do that because + * metaslab_flush() mutates position of curr in the AVL when + * it flushes that metaslab by moving it to the end of the tree. + * Thus we always keep track of the original next node of the + * current node (curr) in another variable (next). + */ + metaslab_t *next = NULL; + for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed); + curr != NULL; curr = next) { + next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr); + + /* + * If this metaslab has been flushed this txg then we've done + * a full circle over the metaslabs. + */ + if (metaslab_unflushed_txg(curr) == txg) + break; + + /* + * If we are done flushing for the block heuristic and the + * unflushed changes don't exceed the memory limit just stop. + */ + if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa)) + break; + + mutex_enter(&curr->ms_sync_lock); + mutex_enter(&curr->ms_lock); + boolean_t flushed = metaslab_flush(curr, tx); + mutex_exit(&curr->ms_lock); + mutex_exit(&curr->ms_sync_lock); + + /* + * If we failed to flush a metaslab (because it was loading), + * then we are done with the block heuristic as it's not + * possible to destroy any log space maps once you've skipped + * a metaslab. In that case we just set our counter to 0 but + * we continue looping in case there is still memory pressure + * due to unflushed changes. Note that, flushing a metaslab + * that is not the oldest flushed in the pool, will never + * destroy any log space maps [see spa_cleanup_old_sm_logs()]. + */ + if (!flushed) { + want_to_flush = 0; + } else if (want_to_flush > 0) { + want_to_flush--; + } + + visited++; + } + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); +} + +/* + * Close the log space map for this TXG and update the block counts + * for the the log's in-memory structure and the summary. 
+ */ +void +spa_sync_close_syncing_log_sm(spa_t *spa) +{ + if (spa_syncing_log_sm(spa) == NULL) + return; + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); + ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa)); + + sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa)); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + + /* + * Note that we can't assert that sls_mscount is not 0, + * because there is the case where the first metaslab + * in spa_metaslabs_by_flushed is loading and we were + * not able to flush any metaslabs the current TXG. + */ + ASSERT(sls->sls_nblocks != 0); + + spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks); + spa_log_summary_verify_counts(spa); + + space_map_close(spa->spa_syncing_log_sm); + spa->spa_syncing_log_sm = NULL; + + /* + * At this point we tried to flush as many metaslabs as we + * can as the pool is getting exported. Reset the "flush all" + * so the last few TXGs before closing the pool can be empty + * (e.g. not dirty). + */ + if (spa_flush_all_logs_requested(spa)) { + ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); + spa->spa_log_flushall_txg = 0; + } +} + +void +spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) +{ + objset_t *mos = spa_meta_objset(spa); + + uint64_t spacemap_zap; + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + return; + } + VERIFY0(error); + + metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed); + uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest); + + /* Free all log space maps older than the oldest_flushed_txg. */ + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls && sls->sls_txg < oldest_flushed_txg; + sls = avl_first(&spa->spa_sm_logs_by_txg)) { + ASSERT0(sls->sls_mscount); + avl_remove(&spa->spa_sm_logs_by_txg, sls); + space_map_free_obj(mos, sls->sls_sm_obj, tx); + VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); + spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; + kmem_free(sls, sizeof (spa_log_sm_t)); + } +} + +static spa_log_sm_t * +spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg) +{ + spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP); + + sls->sls_sm_obj = sm_obj; + sls->sls_txg = txg; + return (sls); +} + +void +spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + objset_t *mos = spa_meta_objset(spa); + + if (spa_syncing_log_sm(spa) != NULL) + return; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + uint64_t spacemap_zap; + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + + error = 0; + spacemap_zap = zap_create(mos, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, + &spacemap_zap, tx)); + spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx); + } + VERIFY0(error); + + uint64_t sm_obj; + ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj), + ==, ENOENT); + sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx); + VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx)); + avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg)); + + /* + * We pass UINT64_MAX as the 
space map's representation size + * and SPA_MINBLOCKSHIFT as the shift, to make the space map + * accept any sorts of segments since there's no real advantage + * to being more restrictive (given that we're already going + * to be using 2-word entries). + */ + VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, + 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + /* + * If the log space map feature was just enabled, the blocklimit + * has not yet been set. + */ + if (spa_log_sm_blocklimit(spa) == 0) + spa_log_sm_set_blocklimit(spa); +} + +/* + * Find all the log space maps stored in the space map ZAP and sort + * them by their TXG in spa_sm_logs_by_txg. + */ +static int +spa_ld_log_sm_metadata(spa_t *spa) +{ + int error; + uint64_t spacemap_zap; + + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + + error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + /* the space map ZAP doesn't exist yet */ + return (0); + } else if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " + "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]", + error); + return (error); + } + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t log_txg = zfs_strtonum(za.za_name, NULL); + spa_log_sm_t *sls = + spa_log_sm_alloc(za.za_first_integer, log_txg); + avl_add(&spa->spa_sm_logs_by_txg, sls); + } + zap_cursor_fini(&zc); + if (error != ENOENT) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " + "zap_cursor_retrieve(spacemap_zap) [error %d]", + error); + return (error); + } + + for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); + m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { + spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) }; + spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, + &target, NULL); + + /* + * At this point if sls is zero it means that a bug occurred + * in ZFS the last time the pool was open or earlier in the + * import code path. In general, we would have placed a + * VERIFY() here or in this case just let the kernel panic + * with NULL pointer dereference when incrementing sls_mscount, + * but since this is the import code path we can be a bit more + * lenient. Thus, for DEBUG bits we always cause a panic, while + * in production we log the error and just fail the import. + */ + ASSERT(sls != NULL); + if (sls == NULL) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug " + "encountered: could not find log spacemap for " + "TXG %ld [error %d]", + metaslab_unflushed_txg(m), ENOENT); + return (ENOENT); + } + sls->sls_mscount++; + } + + return (0); +} + +typedef struct spa_ld_log_sm_arg { + spa_t *slls_spa; + uint64_t slls_txg; +} spa_ld_log_sm_arg_t; + +static int +spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) +{ + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint32_t vdev_id = sme->sme_vdev; + spa_ld_log_sm_arg_t *slls = arg; + spa_t *spa = slls->slls_spa; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* + * If the vdev has been removed (i.e. it is indirect or a hole) + * skip this entry. The contents of this vdev have already moved + * elsewhere. 
+ */ + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(!ms->ms_loaded); + + /* + * If we have already flushed entries for this TXG to this + * metaslab's space map, then ignore it. Note that we flush + * before processing any allocations/frees for that TXG, so + * the metaslab's space map only has entries from *before* + * the unflushed TXG. + */ + if (slls->slls_txg < metaslab_unflushed_txg(ms)) + return (0); + + switch (sme->sme_type) { + case SM_ALLOC: + range_tree_remove_xor_add_segment(offset, offset + size, + ms->ms_unflushed_frees, ms->ms_unflushed_allocs); + break; + case SM_FREE: + range_tree_remove_xor_add_segment(offset, offset + size, + ms->ms_unflushed_allocs, ms->ms_unflushed_frees); + break; + default: + panic("invalid maptype_t"); + break; + } + return (0); +} + +static int +spa_ld_log_sm_data(spa_t *spa) +{ + int error = 0; + + /* + * If we are not going to do any writes there is no need + * to read the log space maps. + */ + if (!spa_writeable(spa)) + return (0); + + ASSERT0(spa->spa_unflushed_stats.sus_nblocks); + ASSERT0(spa->spa_unflushed_stats.sus_memused); + + hrtime_t read_logs_starttime = gethrtime(); + /* this is a no-op when we don't have space map logs */ + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + error = space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " + "space_map_open(obj=%llu) [error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + + struct spa_ld_log_sm_arg vla = { + .slls_spa = spa, + .slls_txg = sls->sls_txg + }; + error = space_map_iterate(sm, space_map_length(sm), + spa_ld_log_sm_cb, &vla); + if (error != 0) { + space_map_close(sm); + spa_load_failed(spa, "spa_ld_log_sm_data(): failed " + "at space_map_iterate(obj=%llu) [error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + + ASSERT0(sls->sls_nblocks); + sls->sls_nblocks = space_map_nblocks(sm); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + summary_add_data(spa, sls->sls_txg, + sls->sls_mscount, sls->sls_nblocks); + + space_map_close(sm); + } + hrtime_t read_logs_endtime = gethrtime(); + spa_load_note(spa, + "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + (u_longlong_t)spa_log_sm_nblocks(spa), + (u_longlong_t)zfs_log_sm_blksz, + (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + +out: + /* + * Now that the metaslabs contain their unflushed changes: + * [1] recalculate their actual allocated space + * [2] recalculate their weights + * [3] sum up the memory usage of their unflushed range trees + * [4] optionally load them, if debug_load is set + * + * Note that even in the case where we get here because of an + * error (e.g. error != 0), we still want to update the fields + * below in order to have a proper teardown in spa_unload(). 
+ */ + for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); + m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { + mutex_enter(&m->ms_lock); + m->ms_allocated_space = space_map_allocated(m->ms_sm) + + range_tree_space(m->ms_unflushed_allocs) - + range_tree_space(m->ms_unflushed_frees); + + vdev_t *vd = m->ms_group->mg_vd; + metaslab_space_update(vd, m->ms_group->mg_class, + range_tree_space(m->ms_unflushed_allocs), 0, 0); + metaslab_space_update(vd, m->ms_group->mg_class, + -range_tree_space(m->ms_unflushed_frees), 0, 0); + + ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK); + metaslab_recalculate_weight_and_sort(m); + + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(m); + + if (metaslab_debug_load && m->ms_sm != NULL) { + VERIFY0(metaslab_load(m)); + } + mutex_exit(&m->ms_lock); + } + + return (error); +} + +static int +spa_ld_unflushed_txgs(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + if (vd->vdev_top_zap == 0) + return (0); + + uint64_t object = 0; + int error = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &object); + if (error == ENOENT) + return (0); + else if (error != 0) { + spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at " + "zap_lookup(vdev_top_zap=%llu) [error %d]", + (u_longlong_t)vd->vdev_top_zap, error); + return (error); + } + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *ms = vd->vdev_ms[m]; + ASSERT(ms != NULL); + + metaslab_unflushed_phys_t entry; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + error = dmu_read(mos, object, + entry_offset, entry_size, &entry, 0); + if (error != 0) { + spa_load_failed(spa, "spa_ld_unflushed_txgs(): " + "failed at dmu_read(obj=%llu) [error %d]", + (u_longlong_t)object, error); + return (error); + } + + ms->ms_unflushed_txg = entry.msp_unflushed_txg; + if (ms->ms_unflushed_txg != 0) { + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, ms); + mutex_exit(&spa->spa_flushed_ms_lock); + } + } + return (0); +} + +/* + * Read all the log space map entries into their respective + * metaslab unflushed trees and keep them sorted by TXG in the + * SPA's metadata. In addition, setup all the metadata for the + * memory and the block heuristics. + */ +int +spa_ld_log_spacemaps(spa_t *spa) +{ + int error; + + spa_log_sm_set_blocklimit(spa); + + for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + error = spa_ld_unflushed_txgs(vd); + if (error != 0) + return (error); + } + + error = spa_ld_log_sm_metadata(spa); + if (error != 0) + return (error); + + /* + * Note: we don't actually expect anything to change at this point + * but we grab the config lock so we don't fail any assertions + * when using vdev_lookup_top(). + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + error = spa_ld_log_sm_data(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + return (error); +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index e4a83406f4..6e7926db3a 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. 
All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -62,7 +62,7 @@ /* * SPA locking * - * There are four basic locks for managing spa_t structures: + * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * @@ -595,6 +595,15 @@ spa_deadman(void *arg) vdev_deadman(spa->spa_root_vdev); } +int +spa_log_sm_sort_by_txg(const void *va, const void *vb) +{ + const spa_log_sm_t *a = va; + const spa_log_sm_t *b = vb; + + return (AVL_CMP(a->sls_txg, b->sls_txg)); +} + /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already @@ -624,6 +633,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -684,6 +694,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, + sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); + avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, + sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); + list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), + offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile @@ -751,7 +767,7 @@ spa_remove(spa_t *spa) spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); @@ -780,6 +796,9 @@ spa_remove(spa_t *spa) kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * sizeof (avl_tree_t)); + avl_destroy(&spa->spa_metaslabs_by_flushed); + avl_destroy(&spa->spa_sm_logs_by_txg); + list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); @@ -811,6 +830,7 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); + mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); @@ -2357,6 +2377,12 @@ spa_missing_tvds_allowed(spa_t *spa) return (spa->spa_missing_tvds_allowed); } +space_map_t * +spa_syncing_log_sm(spa_t *spa) +{ + return (spa->spa_syncing_log_sm); +} + void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c index 71e1e8cabc..01f1d587db 100644 --- a/usr/src/uts/common/fs/zfs/space_map.c +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -23,7 +23,8 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #include <sys/zfs_context.h> @@ -34,6 +35,7 @@ #include <sys/dsl_pool.h> #include <sys/zio.h> #include <sys/space_map.h> +#include <sys/spa_log_spacemap.h> #include <sys/refcount.h> #include <sys/zfeature.h> @@ -1066,3 +1068,11 @@ space_map_length(space_map_t *sm) { return (sm != NULL ? sm->sm_phys->smp_length : 0); } + +uint64_t +space_map_nblocks(space_map_t *sm) +{ + if (sm == NULL) + return (0); + return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz)); +} diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 1001f52864..d38914dd1d 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -388,6 +388,7 @@ typedef struct dmu_buf { #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" +#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" /* * Allocate an object from this objset. The range of object numbers diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index 9cb200eaad..10705a84bc 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -49,8 +49,16 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); +void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *); +void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *); +uint64_t metaslab_unflushed_txg(metaslab_t *); +uint64_t metaslab_estimated_condensed_size(metaslab_t *); +int metaslab_sort_by_flushed(const void *, const void *); +uint64_t metaslab_unflushed_changes_memused(metaslab_t *); + int metaslab_load(metaslab_t *, uint64_t); void metaslab_unload(metaslab_t *); +boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *); uint64_t metaslab_allocated_space(metaslab_t *); @@ -105,6 +113,9 @@ uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); +void metaslab_space_update(vdev_t *, metaslab_class_t *, + int64_t, int64_t, int64_t); + metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); @@ -121,6 +132,8 @@ void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); void metaslab_enable(metaslab_t *, boolean_t); +extern int metaslab_debug_load; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index 97b06e712a..5920b3113c 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -366,7 +366,7 @@ struct metaslab { * write to metaslab data on-disk (i.e flushing entries to * the metaslab's space map). It helps coordinate readers of * the metaslab's space map [see spa_vdev_remove_thread()] - * with writers [see metaslab_sync()]. + * with writers [see metaslab_sync() or metaslab_flush()]. 
* * Note that metaslab_load(), even though a reader, uses * a completely different mechanism to deal with the reading @@ -410,7 +410,6 @@ struct metaslab { boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; - uint64_t ms_condense_checked_txg; /* * The number of consumers which have disabled the metaslab. @@ -423,6 +422,8 @@ struct metaslab { */ boolean_t ms_loaded; boolean_t ms_loading; + kcondvar_t ms_flush_cv; + boolean_t ms_flushing; /* * The following histograms count entries that are in the @@ -508,6 +509,22 @@ struct metaslab { metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ + + /* + * Allocs and frees that are committed to the vdev log spacemap but + * not yet to this metaslab's spacemap. + */ + range_tree_t *ms_unflushed_allocs; + range_tree_t *ms_unflushed_frees; + + /* + * We have flushed entries up to but not including this TXG. In + * other words, all changes from this TXG and onward should not + * be in this metaslab's space map and must be read from the + * log space maps. + */ + uint64_t ms_unflushed_txg; /* updated every time we are done syncing the metaslab's space map */ uint64_t ms_synced_length; @@ -515,6 +532,11 @@ struct metaslab { boolean_t ms_new; }; +typedef struct metaslab_unflushed_phys { + /* on-disk counterpart of ms_unflushed_txg */ + uint64_t msp_unflushed_txg; +} metaslab_unflushed_phys_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/range_tree.h b/usr/src/uts/common/fs/zfs/sys/range_tree.h index 588f41fcb7..d450ff7f16 100644 --- a/usr/src/uts/common/fs/zfs/sys/range_tree.h +++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H @@ -94,6 +94,7 @@ range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize); uint64_t range_tree_space(range_tree_t *rt); +uint64_t range_tree_numsegs(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); @@ -111,6 +112,11 @@ void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); range_seg_t *range_tree_first(range_tree_t *rt); +void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + range_tree_t *removefrom, range_tree_t *addto); +void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, + range_tree_t *addto); + void rt_avl_create(range_tree_t *rt, void *arg); void rt_avl_destroy(range_tree_t *rt, void *arg); void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 653d4ee334..92db9b819b 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -41,6 +41,7 @@ #include <sys/types.h> #include <sys/fs/zfs.h> #include <sys/dmu.h> +#include <sys/space_map.h> #ifdef __cplusplus extern "C" { @@ -1014,6 +1015,7 @@ extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); +extern space_map_t *spa_syncing_log_sm(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); @@ -1065,6 +1067,7 @@ extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); +extern uint64_t spa_total_metaslabs(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern int spa_mode(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 0b2d4a3968..d71971959b 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -34,6 +34,7 @@ #include <sys/spa.h> #include <sys/spa_checkpoint.h> +#include <sys/spa_log_spacemap.h> #include <sys/vdev.h> #include <sys/vdev_removal.h> #include <sys/metaslab.h> @@ -308,6 +309,14 @@ struct spa { spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; + space_map_t *spa_syncing_log_sm; /* current log space map */ + avl_tree_t spa_sm_logs_by_txg; + kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ + avl_tree_t spa_metaslabs_by_flushed; + spa_unflushed_stats_t spa_unflushed_stats; + list_t spa_log_summary; + uint64_t spa_log_flushall_txg; + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa_log_spacemap.h b/usr/src/uts/common/fs/zfs/sys/spa_log_spacemap.h new file mode 100644 index 0000000000..e5b683e5ea --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/spa_log_spacemap.h @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _SYS_SPA_LOG_SPACEMAP_H +#define _SYS_SPA_LOG_SPACEMAP_H + +#include <sys/avl.h> + +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +typedef struct log_summary_entry { + uint64_t lse_start; /* start TXG */ + uint64_t lse_mscount; /* # of metaslabs needed to be flushed */ + uint64_t lse_blkcount; /* blocks held by this entry */ + list_node_t lse_node; +} log_summary_entry_t; + +typedef struct spa_unflushed_stats { + /* used for memory heuristic */ + uint64_t sus_memused; /* current memory used for unflushed trees */ + + /* used for block heuristic */ + uint64_t sus_blocklimit; /* max # of log blocks allowed */ + uint64_t sus_nblocks; /* # of blocks in log space maps currently */ +} spa_unflushed_stats_t; + +typedef struct spa_log_sm { + uint64_t sls_sm_obj; /* space map object ID */ + uint64_t sls_txg; /* txg logged on the space map */ + uint64_t sls_nblocks; /* number of blocks in this log */ + uint64_t sls_mscount; /* # of metaslabs flushed in the log's txg */ + avl_node_t sls_node; /* node in spa_sm_logs_by_txg */ +} spa_log_sm_t; + +int spa_ld_log_spacemaps(spa_t *); + +void spa_generate_syncing_log_sm(spa_t *, dmu_tx_t *); +void spa_flush_metaslabs(spa_t *, dmu_tx_t *); +void spa_sync_close_syncing_log_sm(spa_t *); + +void spa_cleanup_old_sm_logs(spa_t *, dmu_tx_t *); + +uint64_t spa_log_sm_blocklimit(spa_t *); +void spa_log_sm_set_blocklimit(spa_t *); +uint64_t spa_log_sm_nblocks(spa_t *); +uint64_t spa_log_sm_memused(spa_t *); + +void spa_log_sm_decrement_mscount(spa_t *, uint64_t); +void spa_log_sm_increment_current_mscount(spa_t *); + +void spa_log_summary_add_flushed_metaslab(spa_t *); +void spa_log_summary_decrement_mscount(spa_t *, uint64_t); +void spa_log_summary_decrement_blkcount(spa_t *, uint64_t); + +boolean_t spa_flush_all_logs_requested(spa_t *); + +extern int zfs_keep_log_spacemaps_at_export; + +#endif /* _SYS_SPA_LOG_SPACEMAP_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h index 2bce20b48b..5ede2c43e3 100644 --- a/usr/src/uts/common/fs/zfs/sys/space_map.h +++ b/usr/src/uts/common/fs/zfs/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H @@ -72,6 +72,11 @@ typedef struct space_map_phys { * bucket, smp_histogram[i], contains the number of free regions * whose size is: * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) + * + * Note that, if log space map feature is enabled, histograms of + * space maps that belong to metaslabs will take into account any + * unflushed changes for their metaslabs, even though the actual + * space map doesn't have entries for these changes. 
*/ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; } space_map_phys_t; @@ -209,6 +214,8 @@ void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, uint64_t space_map_object(space_map_t *sm); int64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); +uint64_t space_map_entries(space_map_t *sm, range_tree_t *rt); +uint64_t space_map_nblocks(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 9819b85d0c..9caaddf857 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright 2019 Joyent, Inc. */ @@ -525,7 +526,7 @@ extern void vdev_set_min_asize(vdev_t *vd); /* * Global variables */ -extern int vdev_standard_sm_blksz; +extern int zfs_vdev_standard_sm_blksz; /* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h index 2094f8d019..adfbb84de8 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H @@ -61,6 +61,7 @@ extern boolean_t zfs_free_leak_on_eio; #define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) #define ZFS_DEBUG_INDIRECT_REMAP (1 << 9) #define ZFS_DEBUG_TRIM (1 << 11) +#define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) #ifdef ZFS_DEBUG extern void __dprintf(const char *file, const char *func, diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 5be51b7d71..a99e581737 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -265,7 +265,7 @@ txg_sync_stop(dsl_pool_t *dp) ASSERT3U(tx->tx_threads, ==, 2); /* - * We need to ensure that we've vacated the deferred space_maps. + * We need to ensure that we've vacated the deferred metaslab trees. */ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index a4d9415314..142542236c 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -95,14 +95,14 @@ boolean_t vdev_validate_skip = B_FALSE; * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ -int vdev_dtl_sm_blksz = (1 << 12); +int zfs_vdev_dtl_sm_blksz = (1 << 12); /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. 
*/ -int vdev_standard_sm_blksz = (1 << 17); +int zfs_vdev_standard_sm_blksz = (1 << 17); int zfs_ashift_min; @@ -854,6 +854,7 @@ vdev_free(vdev_t *vd) if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); + vd->vdev_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); @@ -1264,6 +1265,13 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); + /* + * Regardless whether this vdev was just added or it is being + * expanded, the metaslab count has changed. Recalculate the + * block limit. + */ + spa_log_sm_set_blocklimit(spa); + return (0); } @@ -2752,7 +2760,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; - new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); + new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, @@ -2766,7 +2774,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); - space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); + space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); @@ -3042,6 +3050,25 @@ vdev_validate_aux(vdev_t *vd) return (0); } +static void +vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) +{ + objset_t *mos = spa_meta_objset(vd->vdev_spa); + + if (vd->vdev_top_zap == 0) + return; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); + if (err == ENOENT) + return; + + VERIFY0(dmu_object_free(mos, object, tx)); + VERIFY0(zap_remove(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); +} + /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. @@ -3069,6 +3096,7 @@ vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); + vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c index 9626589444..493e2b51ed 100644 --- a/usr/src/uts/common/fs/zfs/vdev_indirect.c +++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -818,7 +818,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) if (vdev_obsolete_sm_object(vd) == 0) { uint64_t obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, - vdev_standard_sm_blksz, tx); + zfs_vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, diff --git a/usr/src/uts/common/fs/zfs/vdev_removal.c b/usr/src/uts/common/fs/zfs/vdev_removal.c index 28ce4aba9e..01415b5cd2 100644 --- a/usr/src/uts/common/fs/zfs/vdev_removal.c +++ b/usr/src/uts/common/fs/zfs/vdev_removal.c @@ -1160,6 +1160,7 @@ vdev_remove_complete(spa_t *spa) vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; + spa_log_sm_set_blocklimit(spa); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); @@ -1400,6 +1401,10 @@ spa_vdev_remove_thread(void *arg) VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, svr->svr_allocd_segs); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); @@ -1596,6 +1601,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, svr->svr_allocd_segs); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); @@ -1718,19 +1728,14 @@ vdev_remove_make_hole_and_free(vdev_t *vd) uint64_t id = vd->vdev_id; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); - } + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); vdev_config_dirty(rvd); /* @@ -1792,7 +1797,28 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); + /* + * When the log space map feature is enabled we look at + * the vdev's top_zap to find the on-disk flush data of + * the metaslab we just flushed. Thus, while removing a + * log vdev we make sure to call vdev_metaslab_fini() + * first, which removes all metaslabs of this vdev from + * spa_metaslabs_by_flushed before vdev_remove_empty() + * destroys the top_zap of this log vdev. + * + * This avoids the scenario where we flush a metaslab + * from the log vdev being removed that doesn't have a + * top_zap and end up failing to lookup its on-disk flush + * data. 
+ * + * We don't call metaslab_group_destroy() right away + * though (it will be called in vdev_free() later) as + * during metaslab_sync() of metaslabs from other vdevs + * we may touch the metaslab group of this vdev through + * metaslab_class_histogram_verify() + */ vdev_metaslab_fini(vd); + spa_log_sm_set_blocklimit(spa); spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 9fc8f9a8f2..a932ccb544 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1042,10 +1042,16 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. + * + * Note that we only defer frees after zfs_sync_pass_deferred_free + * when the log space map feature is disabled. [see relevant comment + * in spa_sync_iterate_to_convergence()] */ - if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || + if (BP_IS_GANG(bp) || + BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || - spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { + (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && + !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0))); @@ -1061,7 +1067,6 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); - ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 7d5fcee519..4d81e8d40a 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -695,6 +695,8 @@ typedef struct zpool_load_policy { "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" +#define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ + "com.delphix:ms_unflushed_phys_txgs" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" |
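
To make the space_map_nblocks() helper added to space_map.c concrete: it rounds the space map's on-disk length up to whole blocks using the DIV_ROUND_UP macro introduced in spa_log_spacemap.h. The sketch below is a minimal user-space approximation; toy_space_map_t and toy_space_map_nblocks() are stand-ins for illustration only, not the illumos structures.

#include <stdint.h>
#include <stdio.h>

/* Same rounding rule as the DIV_ROUND_UP added in spa_log_spacemap.h. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

typedef struct {
	uint64_t smp_length;	/* bytes of space map entries written */
	uint64_t sm_blksz;	/* block size backing the space map */
} toy_space_map_t;

static uint64_t
toy_space_map_nblocks(const toy_space_map_t *sm)
{
	if (sm == NULL)
		return (0);
	return (DIV_ROUND_UP(sm->smp_length, sm->sm_blksz));
}

int
main(void)
{
	toy_space_map_t sm = { .smp_length = 130 * 1024, .sm_blksz = 128 * 1024 };

	/* 130K of entries stored in 128K blocks rounds up to 2 blocks. */
	printf("nblocks = %llu\n",
	    (unsigned long long)toy_space_map_nblocks(&sm));
	return (0);
}

This is the quantity the log space map code tracks per log (sls_nblocks) and in aggregate (sus_nblocks) for its block heuristic.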
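
The memory heuristic (sus_memused) has to estimate how much RAM the unflushed trees pin. Given range_tree_numsegs() from range_tree.h and metaslab_unflushed_changes_memused() from metaslab.h, one plausible accounting is segments-times-segment-size; the committed metaslab.c is not part of these hunks, so the arithmetic below is an assumption, and toy_range_seg_t is much smaller than the real range_seg_t.

#include <stdint.h>
#include <stdio.h>

typedef struct toy_range_seg {
	uint64_t rs_start;
	uint64_t rs_end;
} toy_range_seg_t;

typedef struct toy_range_tree {
	uint64_t rt_numsegs;	/* what range_tree_numsegs() would report */
} toy_range_tree_t;

static uint64_t
toy_unflushed_changes_memused(const toy_range_tree_t *unflushed_allocs,
    const toy_range_tree_t *unflushed_frees)
{
	/* Assumed model: each segment costs one segment structure. */
	return ((unflushed_allocs->rt_numsegs + unflushed_frees->rt_numsegs) *
	    sizeof (toy_range_seg_t));
}

int
main(void)
{
	toy_range_tree_t allocs = { .rt_numsegs = 1000 };
	toy_range_tree_t frees = { .rt_numsegs = 250 };

	printf("unflushed changes pin ~%llu bytes\n",
	    (unsigned long long)toy_unflushed_changes_memused(&allocs, &frees));
	return (0);
}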
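
The spa_vdev_remove_thread() and spa_vdev_remove_cancel_sync() hunks show how a metaslab's current allocated set is reconstructed once unflushed changes exist: load the on-disk space map with SM_ALLOC, then walk ms_unflushed_allocs with range_tree_add and ms_unflushed_frees with range_tree_remove. The toy below models that order of operations with a byte-per-unit bitmap standing in for the range trees; it is an illustration of the pattern, not the removal code.

#include <stdio.h>
#include <string.h>

#define NUNITS 16

static void apply_add(char *set, int start, int len)    { memset(set + start, 1, len); }
static void apply_remove(char *set, int start, int len) { memset(set + start, 0, len); }

int
main(void)
{
	char allocd[NUNITS] = { 0 };

	/* 1. What the metaslab's space map on disk says is allocated. */
	apply_add(allocd, 0, 8);

	/* 2. Allocs committed to the log space map but not yet flushed. */
	apply_add(allocd, 10, 2);

	/* 3. Frees committed to the log space map but not yet flushed. */
	apply_remove(allocd, 2, 2);

	for (int i = 0; i < NUNITS; i++)
		printf("%d", allocd[i]);
	printf("\n");	/* prints 1100111100110000 */
	return (0);
}

This is also why the space_map.h comment notes that a metaslab's space map histogram reflects unflushed changes even though the space map itself has no entries for them yet.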
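
Finally, the zio.c hunk changes when a free is deferred to spa_free_bplist: the "past zfs_sync_pass_deferred_free" case now applies only while the log space map feature is inactive. The sketch below restates that predicate with plain booleans; the tunable's default value is not part of this diff, so 2 is used purely as an example, and the function name is hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
free_goes_to_deferred_list(bool is_gang, bool is_dedup,
    uint64_t txg, uint64_t syncing_txg, int sync_pass,
    int sync_pass_deferred_free, bool log_spacemap_active)
{
	return (is_gang || is_dedup || txg != syncing_txg ||
	    (sync_pass >= sync_pass_deferred_free && !log_spacemap_active));
}

int
main(void)
{
	/*
	 * A plain free in a late sync pass of the syncing txg: with the
	 * log space map feature active it is issued immediately; with the
	 * feature inactive it is appended to the deferred list as before.
	 */
	printf("late pass, feature on : deferred=%d\n",
	    free_goes_to_deferred_list(false, false, 100, 100, 5, 2, true));
	printf("late pass, feature off: deferred=%d\n",
	    free_goes_to_deferred_list(false, false, 100, 100, 5, 2, false));
	return (0);
}

That shift is also why the ASSERT on spa_sync_pass() is dropped from zio_free_sync() in the same hunk: immediate frees can now legitimately happen in later passes.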