author     Jerry Jelinek <jerry.jelinek@joyent.com>    2019-09-26 12:34:03 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>    2019-09-26 12:34:03 +0000
commit     2b56e6362d6c66c3c0019a24349c436c2cd162ba (patch)
tree       4f35e286b6fc5ed0eda0cd43d33ce54fc15ebaf0
parent     3105c6ff4e5cab926dc4802a7e10eee1f4abbec4 (diff)
parent     814dcd43c3de9925fd6226c256e4d4327841a0e1 (diff)
download   illumos-joyent-2b56e6362d6c66c3c0019a24349c436c2cd162ba.tar.gz
[illumos-gate merge]
commit 814dcd43c3de9925fd6226c256e4d4327841a0e1
11557 Log Spacemap Project
commit c4e4d4102c8a8c2cc936dd971bdafe4ec52fd4cf
11747 zpool iostat -v no longer shows titles for log/bias sections
Conflicts:
usr/src/uts/common/fs/zfs/sys/metaslab.h
usr/src/uts/common/fs/zfs/metaslab.c
42 files changed, 3493 insertions(+), 483 deletions(-)
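
The new observability added by this merge can be exercised roughly as sketched
below. This is an illustrative sketch only: the pool name "tank" and the mdb
target addresses are placeholders, not part of the change.

    # Check the new pool feature (read-only compatible; becomes active as soon
    # as it is enabled and depends on com.delphix:spacemap_v2).
    zpool get feature@log_spacemap tank

    # 'zdb -m' now also prints the pool's log space maps and their obsolete
    # entry statistics.
    zdb -m tank | grep "Log Spacemap object"

    # New mdb dcmds added to the zfs module: per-txg log space map statistics
    # for a spa_t, and a dump of the segments in a range_tree_t.
    #   <spa_addr>::logsm_stats
    #   <range_tree_addr>::range_tree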
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index c536a0d399..2c32e1a191 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -1465,6 +1465,9 @@ spa_print_config(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) typedef struct mdb_range_tree { + struct { + uint64_t avl_numnodes; + } rt_root; uint64_t rt_space; } mdb_range_tree_t; @@ -1486,6 +1489,8 @@ typedef struct mdb_metaslab { uintptr_t ms_freeing; uintptr_t ms_freed; uintptr_t ms_allocatable; + uintptr_t ms_unflushed_frees; + uintptr_t ms_unflushed_allocs; uintptr_t ms_sm; } mdb_metaslab_t; @@ -1501,12 +1506,23 @@ typedef struct mdb_space_map { } mdb_space_map_t; typedef struct mdb_vdev { - uintptr_t vdev_path; - uintptr_t vdev_ms; + uint64_t vdev_id; + uint64_t vdev_state; uintptr_t vdev_ops; + struct { + uint64_t vs_aux; + uint64_t vs_ops[VS_ZIO_TYPES]; + uint64_t vs_bytes[VS_ZIO_TYPES]; + uint64_t vs_read_errors; + uint64_t vs_write_errors; + uint64_t vs_checksum_errors; + } vdev_stat; + uintptr_t vdev_child; + uint64_t vdev_children; uint64_t vdev_ms_count; - uint64_t vdev_id; - vdev_stat_t vdev_stat; + uintptr_t vdev_mg; + uintptr_t vdev_ms; + uintptr_t vdev_path; } mdb_vdev_t; typedef struct mdb_vdev_ops { @@ -1514,37 +1530,31 @@ typedef struct mdb_vdev_ops { } mdb_vdev_ops_t; static int -metaslab_stats(uintptr_t addr, int spa_flags) +metaslab_stats(mdb_vdev_t *vd, int spa_flags) { - mdb_vdev_t vdev; - uintptr_t *vdev_ms; - - if (mdb_ctf_vread(&vdev, "vdev_t", "mdb_vdev_t", - (uintptr_t)addr, 0) == -1) { - mdb_warn("failed to read vdev at %p\n", addr); - return (DCMD_ERR); - } - mdb_inc_indent(4); - mdb_printf("%<u>%-?s %6s %20s %10s %9s%</u>\n", "ADDR", "ID", - "OFFSET", "FREE", "FRAGMENTATION"); + mdb_printf("%<u>%-?s %6s %20s %10s %10s %10s%</u>\n", "ADDR", "ID", + "OFFSET", "FREE", "FRAG", "UCMU"); - vdev_ms = mdb_alloc(vdev.vdev_ms_count * sizeof (void *), + uintptr_t *vdev_ms = mdb_alloc(vd->vdev_ms_count * sizeof (vdev_ms), UM_SLEEP | UM_GC); - if (mdb_vread(vdev_ms, vdev.vdev_ms_count * sizeof (void *), - (uintptr_t)vdev.vdev_ms) == -1) { - mdb_warn("failed to read vdev_ms at %p\n", vdev.vdev_ms); + if (mdb_vread(vdev_ms, vd->vdev_ms_count * sizeof (uintptr_t), + vd->vdev_ms) == -1) { + mdb_warn("failed to read vdev_ms at %p\n", vd->vdev_ms); return (DCMD_ERR); } - for (int m = 0; m < vdev.vdev_ms_count; m++) { + for (int m = 0; m < vd->vdev_ms_count; m++) { mdb_metaslab_t ms; mdb_space_map_t sm = { 0 }; - mdb_space_map_phys_t smp; + mdb_space_map_phys_t smp = { 0 }; + mdb_range_tree_t rt; + uint64_t uallocs, ufrees, raw_free, raw_uchanges_mem; char free[MDB_NICENUM_BUFLEN]; + char uchanges_mem[MDB_NICENUM_BUFLEN]; if (mdb_ctf_vread(&ms, "metaslab_t", "mdb_metaslab_t", - (uintptr_t)vdev_ms[m], 0) == -1) + vdev_ms[m], 0) == -1) return (DCMD_ERR); if (ms.ms_sm != 0 && @@ -1552,25 +1562,40 @@ metaslab_stats(uintptr_t addr, int spa_flags) ms.ms_sm, 0) == -1) return (DCMD_ERR); - if (sm.sm_phys != 0) { + if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", + ms.ms_unflushed_frees, 0) == -1) + return (DCMD_ERR); + ufrees = rt.rt_space; + raw_uchanges_mem = rt.rt_root.avl_numnodes * + mdb_ctf_sizeof_by_name("range_seg_t"); + + if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", + ms.ms_unflushed_allocs, 0) == -1) + return (DCMD_ERR); + uallocs = rt.rt_space; + raw_uchanges_mem += rt.rt_root.avl_numnodes * + mdb_ctf_sizeof_by_name("range_seg_t"); + mdb_nicenum(raw_uchanges_mem, uchanges_mem); + + 
raw_free = ms.ms_size; + if (ms.ms_sm != 0 && sm.sm_phys != 0) { (void) mdb_ctf_vread(&smp, "space_map_phys_t", "mdb_space_map_phys_t", sm.sm_phys, 0); - mdb_nicenum(ms.ms_size - smp.smp_alloc, free); - } else { - (void) mdb_snprintf(free, MDB_NICENUM_BUFLEN, "-"); + raw_free -= smp.smp_alloc; } + raw_free += ufrees - uallocs; + mdb_nicenum(raw_free, free); mdb_printf("%0?p %6llu %20llx %10s ", vdev_ms[m], ms.ms_id, ms.ms_start, free); if (ms.ms_fragmentation == ZFS_FRAG_INVALID) - mdb_printf("%9s\n", "-"); + mdb_printf("%9s ", "-"); else - mdb_printf("%9llu%%\n", ms.ms_fragmentation); - - if ((spa_flags & SPA_FLAG_HISTOGRAMS) && ms.ms_sm != 0) { - if (sm.sm_phys == 0) - continue; + mdb_printf("%9llu%% ", ms.ms_fragmentation); + mdb_printf("%10s\n", uchanges_mem); + if ((spa_flags & SPA_FLAG_HISTOGRAMS) && ms.ms_sm != 0 && + sm.sm_phys != 0) { dump_histogram(smp.smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm.sm_shift); } @@ -1580,21 +1605,56 @@ metaslab_stats(uintptr_t addr, int spa_flags) } static int -metaslab_group_stats(uintptr_t addr, int spa_flags) +metaslab_group_stats(mdb_vdev_t *vd, int spa_flags) { mdb_metaslab_group_t mg; if (mdb_ctf_vread(&mg, "metaslab_group_t", "mdb_metaslab_group_t", - (uintptr_t)addr, 0) == -1) { - mdb_warn("failed to read vdev_mg at %p\n", addr); + vd->vdev_mg, 0) == -1) { + mdb_warn("failed to read vdev_mg at %p\n", vd->vdev_mg); return (DCMD_ERR); } mdb_inc_indent(4); - mdb_printf("%<u>%-?s %15s%</u>\n", "ADDR", "FRAGMENTATION"); + mdb_printf("%<u>%-?s %7s %9s%</u>\n", "ADDR", "FRAG", "UCMU"); + if (mg.mg_fragmentation == ZFS_FRAG_INVALID) - mdb_printf("%0?p %15s\n", addr, "-"); + mdb_printf("%0?p %6s\n", vd->vdev_mg, "-"); else - mdb_printf("%0?p %15llu%%\n", addr, mg.mg_fragmentation); + mdb_printf("%0?p %6llu%%", vd->vdev_mg, mg.mg_fragmentation); + + + uintptr_t *vdev_ms = mdb_alloc(vd->vdev_ms_count * sizeof (vdev_ms), + UM_SLEEP | UM_GC); + if (mdb_vread(vdev_ms, vd->vdev_ms_count * sizeof (uintptr_t), + vd->vdev_ms) == -1) { + mdb_warn("failed to read vdev_ms at %p\n", vd->vdev_ms); + return (DCMD_ERR); + } + + uint64_t raw_uchanges_mem = 0; + char uchanges_mem[MDB_NICENUM_BUFLEN]; + for (int m = 0; m < vd->vdev_ms_count; m++) { + mdb_metaslab_t ms; + mdb_range_tree_t rt; + + if (mdb_ctf_vread(&ms, "metaslab_t", "mdb_metaslab_t", + vdev_ms[m], 0) == -1) + return (DCMD_ERR); + + if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", + ms.ms_unflushed_frees, 0) == -1) + return (DCMD_ERR); + raw_uchanges_mem += + rt.rt_root.avl_numnodes * sizeof (range_seg_t); + + if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", + ms.ms_unflushed_allocs, 0) == -1) + return (DCMD_ERR); + raw_uchanges_mem += + rt.rt_root.avl_numnodes * sizeof (range_seg_t); + } + mdb_nicenum(raw_uchanges_mem, uchanges_mem); + mdb_printf("%10s\n", uchanges_mem); if (spa_flags & SPA_FLAG_HISTOGRAMS) dump_histogram(mg.mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); @@ -1618,33 +1678,28 @@ static int do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, int spa_flags) { - vdev_t vdev; - char desc[MAXNAMELEN]; - int c, children; - uintptr_t *child; - const char *state, *aux; - - if (mdb_vread(&vdev, sizeof (vdev), (uintptr_t)addr) == -1) { - mdb_warn("failed to read vdev_t at %p\n", (uintptr_t)addr); + mdb_vdev_t vd; + if (mdb_ctf_vread(&vd, "vdev_t", "mdb_vdev_t", + (uintptr_t)addr, 0) == -1) return (DCMD_ERR); - } if (flags & DCMD_PIPE_OUT) { mdb_printf("%#lr\n", addr); } else { - if (vdev.vdev_path != NULL) { + char desc[MAXNAMELEN]; + if (vd.vdev_path != 0) { 
if (mdb_readstr(desc, sizeof (desc), - (uintptr_t)vdev.vdev_path) == -1) { + (uintptr_t)vd.vdev_path) == -1) { mdb_warn("failed to read vdev_path at %p\n", - vdev.vdev_path); + vd.vdev_path); return (DCMD_ERR); } - } else if (vdev.vdev_ops != NULL) { + } else if (vd.vdev_ops != 0) { vdev_ops_t ops; if (mdb_vread(&ops, sizeof (ops), - (uintptr_t)vdev.vdev_ops) == -1) { + (uintptr_t)vd.vdev_ops) == -1) { mdb_warn("failed to read vdev_ops at %p\n", - vdev.vdev_ops); + vd.vdev_ops); return (DCMD_ERR); } (void) strcpy(desc, ops.vdev_op_type); @@ -1660,7 +1715,8 @@ do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, mdb_printf("%0?p ", addr); - switch (vdev.vdev_state) { + const char *state, *aux; + switch (vd.vdev_state) { case VDEV_STATE_CLOSED: state = "CLOSED"; break; @@ -1687,7 +1743,7 @@ do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, break; } - switch (vdev.vdev_stat.vs_aux) { + switch (vd.vdev_stat.vs_aux) { case VDEV_AUX_NONE: aux = "-"; break; @@ -1747,7 +1803,6 @@ do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, mdb_printf("%-9s %-12s %*s%s\n", state, aux, depth, "", desc); if (spa_flags & SPA_FLAG_ERRORS) { - vdev_stat_t *vs = &vdev.vdev_stat; int i; mdb_inc_indent(4); @@ -1756,48 +1811,50 @@ do_print_vdev(uintptr_t addr, int flags, int depth, boolean_t recursive, "%12s%</u>\n", "READ", "WRITE", "FREE", "CLAIM", "IOCTL"); mdb_printf("OPS "); - for (i = 1; i < ZIO_TYPES; i++) - mdb_printf("%11#llx%s", vs->vs_ops[i], - i == ZIO_TYPES - 1 ? "" : " "); + for (i = 1; i < VS_ZIO_TYPES; i++) + mdb_printf("%11#llx%s", + vd.vdev_stat.vs_ops[i], + i == VS_ZIO_TYPES - 1 ? "" : " "); mdb_printf("\n"); mdb_printf("BYTES "); - for (i = 1; i < ZIO_TYPES; i++) - mdb_printf("%11#llx%s", vs->vs_bytes[i], - i == ZIO_TYPES - 1 ? "" : " "); + for (i = 1; i < VS_ZIO_TYPES; i++) + mdb_printf("%11#llx%s", + vd.vdev_stat.vs_bytes[i], + i == VS_ZIO_TYPES - 1 ? 
"" : " "); mdb_printf("\n"); - mdb_printf("EREAD %10#llx\n", vs->vs_read_errors); - mdb_printf("EWRITE %10#llx\n", vs->vs_write_errors); + mdb_printf("EREAD %10#llx\n", + vd.vdev_stat.vs_read_errors); + mdb_printf("EWRITE %10#llx\n", + vd.vdev_stat.vs_write_errors); mdb_printf("ECKSUM %10#llx\n", - vs->vs_checksum_errors); + vd.vdev_stat.vs_checksum_errors); mdb_dec_indent(4); mdb_printf("\n"); } - if (spa_flags & SPA_FLAG_METASLAB_GROUPS && - vdev.vdev_mg != NULL) { - metaslab_group_stats((uintptr_t)vdev.vdev_mg, - spa_flags); + if ((spa_flags & SPA_FLAG_METASLAB_GROUPS) && + vd.vdev_mg != 0) { + metaslab_group_stats(&vd, spa_flags); } - if (spa_flags & SPA_FLAG_METASLABS && vdev.vdev_ms != NULL) { - metaslab_stats((uintptr_t)addr, spa_flags); + if ((spa_flags & SPA_FLAG_METASLABS) && vd.vdev_ms != 0) { + metaslab_stats(&vd, spa_flags); } } - children = vdev.vdev_children; - + uint64_t children = vd.vdev_children; if (children == 0 || !recursive) return (DCMD_OK); - child = mdb_alloc(children * sizeof (void *), UM_SLEEP | UM_GC); - if (mdb_vread(child, children * sizeof (void *), - (uintptr_t)vdev.vdev_child) == -1) { - mdb_warn("failed to read vdev children at %p", vdev.vdev_child); + uintptr_t *child = mdb_alloc(children * sizeof (child), + UM_SLEEP | UM_GC); + if (mdb_vread(child, children * sizeof (void *), vd.vdev_child) == -1) { + mdb_warn("failed to read vdev children at %p", vd.vdev_child); return (DCMD_ERR); } - for (c = 0; c < children; c++) { + for (uint64_t c = 0; c < children; c++) { if (do_print_vdev(child[c], flags, depth + 2, recursive, spa_flags)) { return (DCMD_ERR); @@ -2111,9 +2168,11 @@ typedef struct space_data { uint64_t ms_checkpointing; uint64_t ms_freeing; uint64_t ms_freed; + uint64_t ms_unflushed_frees; + uint64_t ms_unflushed_allocs; uint64_t ms_allocatable; int64_t ms_deferspace; - uint64_t nowavail; + uint64_t avail; } space_data_t; /* ARGSUSED */ @@ -2125,6 +2184,7 @@ space_cb(uintptr_t addr, const void *unknown, void *arg) mdb_range_tree_t rt; mdb_space_map_t sm = { 0 }; mdb_space_map_phys_t smp = { 0 }; + uint64_t uallocs, ufrees; int i; if (mdb_ctf_vread(&ms, "metaslab_t", "mdb_metaslab_t", @@ -2135,9 +2195,7 @@ space_cb(uintptr_t addr, const void *unknown, void *arg) if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", ms.ms_allocating[i], 0) == -1) return (WALK_ERR); - sd->ms_allocating[i] += rt.rt_space; - } if (mdb_ctf_vread(&rt, "range_tree_t", @@ -2160,6 +2218,18 @@ space_cb(uintptr_t addr, const void *unknown, void *arg) return (WALK_ERR); sd->ms_allocatable += rt.rt_space; + if (mdb_ctf_vread(&rt, "range_tree_t", + "mdb_range_tree_t", ms.ms_unflushed_frees, 0) == -1) + return (WALK_ERR); + sd->ms_unflushed_frees += rt.rt_space; + ufrees = rt.rt_space; + + if (mdb_ctf_vread(&rt, "range_tree_t", + "mdb_range_tree_t", ms.ms_unflushed_allocs, 0) == -1) + return (WALK_ERR); + sd->ms_unflushed_allocs += rt.rt_space; + uallocs = rt.rt_space; + if (ms.ms_sm != 0 && mdb_ctf_vread(&sm, "space_map_t", "mdb_space_map_t", ms.ms_sm, 0) == -1) @@ -2171,7 +2241,7 @@ space_cb(uintptr_t addr, const void *unknown, void *arg) } sd->ms_deferspace += ms.ms_deferspace; - sd->nowavail += sm.sm_size - smp.smp_alloc; + sd->avail += sm.sm_size - smp.smp_alloc + ufrees - uallocs; return (WALK_NEXT); } @@ -2251,12 +2321,16 @@ spa_space(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) sd.ms_freeing >> shift, suffix); mdb_printf("ms_freed = %llu%s\n", sd.ms_freed >> shift, suffix); + mdb_printf("ms_unflushed_frees = %llu%s\n", + sd.ms_unflushed_frees >> 
shift, suffix); + mdb_printf("ms_unflushed_allocs = %llu%s\n", + sd.ms_unflushed_allocs >> shift, suffix); mdb_printf("ms_allocatable = %llu%s\n", sd.ms_allocatable >> shift, suffix); mdb_printf("ms_deferspace = %llu%s\n", sd.ms_deferspace >> shift, suffix); - mdb_printf("current syncing avail = %llu%s\n", - sd.nowavail >> shift, suffix); + mdb_printf("current avail = %llu%s\n", + sd.avail >> shift, suffix); return (DCMD_OK); } @@ -4096,6 +4170,121 @@ out: return (rc); } +typedef struct mdb_range_seg { + uint64_t rs_start; + uint64_t rs_end; +} mdb_range_seg_t; + +/* ARGSUSED */ +static int +range_tree_cb(uintptr_t addr, const void *unknown, void *arg) +{ + mdb_range_seg_t rs; + + if (mdb_ctf_vread(&rs, ZFS_STRUCT "range_seg", "mdb_range_seg_t", + addr, 0) == -1) + return (DCMD_ERR); + + mdb_printf("\t[%llx %llx) (length %llx)\n", + rs.rs_start, rs.rs_end, rs.rs_end - rs.rs_start); + + return (0); +} + +/* ARGSUSED */ +static int +range_tree(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + mdb_range_tree_t rt; + uintptr_t avl_addr; + + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + if (mdb_ctf_vread(&rt, ZFS_STRUCT "range_tree", "mdb_range_tree_t", + addr, 0) == -1) + return (DCMD_ERR); + + mdb_printf("%p: range tree of %llu entries, %llu bytes\n", + addr, rt.rt_root.avl_numnodes, rt.rt_space); + + avl_addr = addr + + mdb_ctf_offsetof_by_name(ZFS_STRUCT "range_tree", "rt_root"); + + if (mdb_pwalk("avl", range_tree_cb, NULL, avl_addr) != 0) { + mdb_warn("can't walk range_tree segments"); + return (DCMD_ERR); + } + return (DCMD_OK); +} + +typedef struct mdb_spa_log_sm { + uint64_t sls_sm_obj; + uint64_t sls_txg; + uint64_t sls_nblocks; + uint64_t sls_mscount; +} mdb_spa_log_sm_t; + +/* ARGSUSED */ +static int +logsm_stats_cb(uintptr_t addr, const void *unknown, void *arg) +{ + mdb_spa_log_sm_t sls; + if (mdb_ctf_vread(&sls, ZFS_STRUCT "spa_log_sm", "mdb_spa_log_sm_t", + addr, 0) == -1) + return (WALK_ERR); + + mdb_printf("%7lld %7lld %7lld %7lld\n", + sls.sls_txg, sls.sls_nblocks, sls.sls_mscount, sls.sls_sm_obj); + + return (WALK_NEXT); +} +typedef struct mdb_log_summary_entry { + uint64_t lse_start; + uint64_t lse_blkcount; + uint64_t lse_mscount; +} mdb_log_summary_entry_t; + +/* ARGSUSED */ +static int +logsm_summary_cb(uintptr_t addr, const void *unknown, void *arg) +{ + mdb_log_summary_entry_t lse; + if (mdb_ctf_vread(&lse, ZFS_STRUCT "log_summary_entry", + "mdb_log_summary_entry_t", addr, 0) == -1) + return (WALK_ERR); + + mdb_printf("%7lld %7lld %7lld\n", + lse.lse_start, lse.lse_blkcount, lse.lse_mscount); + return (WALK_NEXT); +} + +/* ARGSUSED */ +static int +logsm_stats(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + uintptr_t sls_avl_addr = addr + + mdb_ctf_offsetof_by_name(ZFS_STRUCT "spa", "spa_sm_logs_by_txg"); + uintptr_t summary_addr = addr + + mdb_ctf_offsetof_by_name(ZFS_STRUCT "spa", "spa_log_summary"); + + mdb_printf("Log Entries:\n"); + mdb_printf("%7s %7s %7s %7s\n", "txg", "blk", "ms", "obj"); + if (mdb_pwalk("avl", logsm_stats_cb, NULL, sls_avl_addr) != 0) + return (DCMD_ERR); + + mdb_printf("\nSummary Entries:\n"); + mdb_printf("%7s %7s %7s\n", "txg", "blk", "ms"); + if (mdb_pwalk("list", logsm_summary_cb, NULL, summary_addr) != 0) + return (DCMD_ERR); + + return (DCMD_OK); +} + /* * MDB module linkage information: * @@ -4117,6 +4306,8 @@ static const mdb_dcmd_t dcmds[] = { { "abuf_find", "dva_word[0] dva_word[1]", "find arc_buf_hdr_t of a specified DVA", 
abuf_find }, + { "logsm_stats", ":", "print log space map statistics of a spa_t", + logsm_stats}, { "spa", "?[-cevmMh]\n" "\t-c display spa config\n" "\t-e display vdev statistics\n" @@ -4182,6 +4373,8 @@ static const mdb_dcmd_t dcmds[] = { "\t-b display histogram of buffer counts\n", "print a histogram of compressed arc buffer sizes", arc_compression_stats}, + { "range_tree", ":", + "print entries in range_tree_t", range_tree}, { NULL } }; diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index a936c361b5..f56766d81f 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -765,6 +765,12 @@ get_checkpoint_refcount(vdev_t *vd) } static int +get_log_spacemap_refcount(spa_t *spa) +{ + return (avl_numnodes(&spa->spa_sm_logs_by_txg)); +} + +static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; @@ -778,6 +784,7 @@ verify_spacemap_refcounts(spa_t *spa) actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); + actual_refcount += get_log_spacemap_refcount(spa); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " @@ -942,23 +949,46 @@ dump_metaslab(metaslab_t *msp) ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); + + if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", + (u_longlong_t)metaslab_unflushed_txg(msp)); + } } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; - const char *bias_str; + const char *bias_str = ""; + + if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { + bias_str = VDEV_ALLOC_BIAS_LOG; + } else if (alloc_bias == VDEV_BIAS_SPECIAL) { + bias_str = VDEV_ALLOC_BIAS_SPECIAL; + } else if (alloc_bias == VDEV_BIAS_DEDUP) { + bias_str = VDEV_ALLOC_BIAS_DEDUP; + } + + uint64_t ms_flush_data_obj = 0; + if (vd->vdev_top_zap != 0) { + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &ms_flush_data_obj); + if (error != ENOENT) { + ASSERT0(error); + } + } + + (void) printf("\tvdev %10llu %s", + (u_longlong_t)vd->vdev_id, bias_str); - bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ? - VDEV_ALLOC_BIAS_LOG : - (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : - (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : - vd->vdev_islog ? 
"log" : ""; + if (ms_flush_data_obj != 0) { + (void) printf(" ms_unflushed_phys object %llu", + (u_longlong_t)ms_flush_data_obj); + } - (void) printf("\tvdev %10llu %s\n" - "\t%-10s%5llu %-19s %-15s %-12s\n", - (u_longlong_t)vd->vdev_id, bias_str, + (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", @@ -1124,6 +1154,27 @@ dump_metaslabs(spa_t *spa) } static void +dump_log_spacemaps(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + (void) printf("\nLog Space Maps in Pool:\n"); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + (void) printf("Log Spacemap object %llu txg %llu\n", + (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); + dump_spacemap(spa->spa_meta_objset, sm); + space_map_close(sm); + } + (void) printf("\n"); +} + +static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { const ddt_phys_t *ddp = dde->dde_phys; @@ -3153,6 +3204,85 @@ static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; +typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg); + +typedef struct unflushed_iter_cb_arg { + spa_t *uic_spa; + uint64_t uic_txg; + void *uic_arg; + zdb_log_sm_cb_t uic_cb; +} unflushed_iter_cb_arg_t; + +static int +iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) +{ + unflushed_iter_cb_arg_t *uic = arg; + + return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); +} + +static void +iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + unflushed_iter_cb_arg_t uic = { + .uic_spa = spa, + .uic_txg = sls->sls_txg, + .uic_arg = arg, + .uic_cb = cb + }; + + VERIFY0(space_map_iterate(sm, space_map_length(sm), + iterate_through_spacemap_logs_cb, &uic)); + space_map_close(sm); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +/* ARGSUSED */ +static int +load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + spa_vdev_removal_t *svr = arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + + /* skip vdevs we don't care about */ + if (sme->sme_vdev != svr->svr_vdev_id) + return (0); + + vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + ASSERT(vim != NULL); + if (offset >= vdev_indirect_mapping_max_offset(vim)) + return (0); + + if (sme->sme_type == SM_ALLOC) + range_tree_add(svr->svr_allocd_segs, offset, size); + else + range_tree_remove(svr->svr_allocd_segs, offset, size); + + return (0); +} + static void zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) { @@ -3242,36 +3372,35 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) vdev_t *vd 
= vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + range_tree_t *allocs = range_tree_create(NULL, NULL); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; - ASSERT0(range_tree_space(svr->svr_allocd_segs)); + ASSERT0(range_tree_space(allocs)); + if (msp->ms_sm != NULL) + VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); + range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); + } + range_tree_destroy(allocs); - if (msp->ms_sm != NULL) { - VERIFY0(space_map_load(msp->ms_sm, - svr->svr_allocd_segs, SM_ALLOC)); + iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); - /* - * Clear everything past what has been synced unless - * it's past the spacemap, because we have not allocated - * mappings for it yet. - */ - uint64_t vim_max_offset = - vdev_indirect_mapping_max_offset(vim); - uint64_t sm_end = msp->ms_sm->sm_start + - msp->ms_sm->sm_size; - if (sm_end > vim_max_offset) - range_tree_clear(svr->svr_allocd_segs, - vim_max_offset, sm_end - vim_max_offset); - } + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for + * it yet. + */ + range_tree_clear(svr->svr_allocd_segs, + vdev_indirect_mapping_max_offset(vim), + vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); - zcb->zcb_removing_size += - range_tree_space(svr->svr_allocd_segs); - range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); - } + zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); + range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); spa_config_exit(spa, SCL_CONFIG, FTAG); } @@ -3438,6 +3567,79 @@ zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) } } +static int +count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + int64_t *ualloc_space = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (sme->sme_type == SM_ALLOC) + *ualloc_space += sme->sme_run; + else + *ualloc_space -= sme->sme_run; + + return (0); +} + +static int64_t +get_unflushed_alloc_space(spa_t *spa) +{ + if (dump_opt['L']) + return (0); + + int64_t ualloc_space = 0; + iterate_through_spacemap_logs(spa, count_unflushed_space_cb, + &ualloc_space); + return (ualloc_space); +} + +static int +load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) +{ + maptype_t *uic_maptype = arg; + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t vdev_id = sme->sme_vdev; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (*uic_maptype == sme->sme_type) + range_tree_add(ms->ms_allocatable, offset, size); + else + range_tree_remove(ms->ms_allocatable, offset, size); + + return (0); +} + +static void +load_unflushed_to_ms_allocatables(spa_t *spa, 
maptype_t maptype) +{ + iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); +} + static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { @@ -3461,7 +3663,7 @@ load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); - metaslab_unload(msp); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the @@ -3478,6 +3680,8 @@ load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) mutex_exit(&msp->ms_lock); } } + + load_unflushed_to_ms_allocatables(spa, maptype); } /* @@ -3492,7 +3696,7 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); - metaslab_unload(msp); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the @@ -3752,7 +3956,6 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } - if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } @@ -3889,7 +4092,8 @@ dump_block_stats(spa_t *spa) total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + - metaslab_class_get_alloc(spa_dedup_class(spa)); + metaslab_class_get_alloc(spa_dedup_class(spa)) + + get_unflushed_alloc_space(spa); total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; @@ -4738,11 +4942,25 @@ mos_obj_refd(uint64_t obj) } static void +mos_leak_vdev_top_zap(vdev_t *vd) +{ + uint64_t ms_flush_data_obj; + + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(ms_flush_data_obj); +} + +static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); - mos_obj_refd(vd->vdev_top_zap); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); @@ -4760,11 +4978,34 @@ mos_leak_vdev(vdev_t *vd) mos_obj_refd(space_map_object(ms->ms_sm)); } + if (vd->vdev_top_zap != 0) { + mos_obj_refd(vd->vdev_top_zap); + mos_leak_vdev_top_zap(vd); + } + for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } +static void +mos_leak_log_spacemaps(spa_t *spa) +{ + uint64_t spacemap_zap; + + int error = zap_lookup(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, + sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(spacemap_zap); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) + mos_obj_refd(sls->sls_sm_obj); +} + static int dump_mos_leaks(spa_t *spa) { @@ -4796,6 +5037,10 @@ dump_mos_leaks(spa_t *spa) mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); + if (spa->spa_syncing_log_sm != NULL) + mos_obj_refd(spa->spa_syncing_log_sm->sm_object); + mos_leak_log_spacemaps(spa); + mos_obj_refd(spa->spa_condensing_indirect_phys. scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. 
@@ -4873,6 +5118,81 @@ dump_mos_leaks(spa_t *spa) return (rv); } +typedef struct log_sm_obsolete_stats_arg { + uint64_t lsos_current_txg; + + uint64_t lsos_total_entries; + uint64_t lsos_valid_entries; + + uint64_t lsos_sm_entries; + uint64_t lsos_valid_sm_entries; +} log_sm_obsolete_stats_arg_t; + +static int +log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + log_sm_obsolete_stats_arg_t *lsos = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + if (lsos->lsos_current_txg == 0) { + /* this is the first log */ + lsos->lsos_current_txg = txg; + } else if (lsos->lsos_current_txg < txg) { + /* we just changed log - print stats and reset */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos->lsos_valid_sm_entries, + (u_longlong_t)lsos->lsos_sm_entries, + (u_longlong_t)lsos->lsos_current_txg); + lsos->lsos_valid_sm_entries = 0; + lsos->lsos_sm_entries = 0; + lsos->lsos_current_txg = txg; + } + ASSERT3U(lsos->lsos_current_txg, ==, txg); + + lsos->lsos_sm_entries++; + lsos->lsos_total_entries++; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + lsos->lsos_valid_sm_entries++; + lsos->lsos_valid_entries++; + return (0); +} + +static void +dump_log_spacemap_obsolete_stats(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + log_sm_obsolete_stats_arg_t lsos; + bzero(&lsos, sizeof (lsos)); + + (void) printf("Log Space Map Obsolete Entry Statistics:\n"); + + iterate_through_spacemap_logs(spa, + log_spacemap_obsolete_stats_cb, &lsos); + + /* print stats for latest log */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos.lsos_valid_sm_entries, + (u_longlong_t)lsos.lsos_sm_entries, + (u_longlong_t)lsos.lsos_current_txg); + + (void) printf("%-8llu valid entries out of %-8llu - total\n\n", + (u_longlong_t)lsos.lsos_valid_entries, + (u_longlong_t)lsos.lsos_total_entries); +} + static void dump_zpool(spa_t *spa) { @@ -4902,6 +5222,10 @@ dump_zpool(spa_t *spa) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa); + if (dump_opt['d'] > 2 || dump_opt['m']) { + dump_log_spacemaps(spa); + dump_log_spacemap_obsolete_stats(spa); + } if (dump_opt['d'] || dump_opt['i']) { mos_refd_objs = range_tree_create(NULL, NULL); @@ -4962,9 +5286,8 @@ dump_zpool(spa_t *spa) } } - if (rc == 0) { + if (rc == 0) rc = verify_device_removal_feature_counts(spa); - } } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 8e0c103349..52ca88ab7f 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -32,6 +32,7 @@ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com> * Copyright 2019 Joyent, Inc. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
*/ #include <assert.h> @@ -3229,6 +3230,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, * print all other top-level devices */ for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; char *bias = NULL; @@ -3249,6 +3251,17 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; + if (!printed) { + if (!cb->cb_scripted) { + (void) printf( + "%-*s - - - -" + " - -", + cb->cb_namewidth, class_name[n]); + } + printf("\n"); + printed = B_TRUE; + } + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); print_vdev_stats(zhp, vname, oldnv ? diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index f422db3bbc..83922cf376 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -2827,24 +2827,12 @@ vdev_lookup_by_path(vdev_t *vd, const char *path) return (NULL); } -/* - * Find the first available hole which can be used as a top-level. - */ -int -find_vdev_hole(spa_t *spa) +static int +spa_num_top_vdevs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; - int c; - - ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); - - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - - if (cvd->vdev_ishole) - break; - } - return (c); + ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); + return (rvd->vdev_children); } /* @@ -2869,7 +2857,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; /* * If we have slogs then remove them 1/4 of the time. @@ -2974,7 +2962,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, @@ -6895,6 +6883,15 @@ ztest_init(ztest_shared_t *zs) props = make_random_props(); for (int i = 0; i < SPA_FEATURES; i++) { char buf[1024]; + + /* + * 75% chance of using the log space map feature. We want ztest + * to exercise both the code paths that use the log space map + * feature and the ones that don't. 
+ */ + if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) + continue; + (void) snprintf(buf, sizeof (buf), "feature@%s", spa_feature_table[i].fi_uname); VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); diff --git a/usr/src/common/zfs/zfeature_common.c b/usr/src/common/zfs/zfeature_common.c index 78345bbd88..e5d3fc27a0 100644 --- a/usr/src/common/zfs/zfeature_common.c +++ b/usr/src/common/zfs/zfeature_common.c @@ -366,4 +366,15 @@ zpool_feature_init(void) "space/object accounting based on project ID.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, project_quota_deps); + + static const spa_feature_t log_spacemap_deps[] = { + SPA_FEATURE_SPACEMAP_V2, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LOG_SPACEMAP, + "com.delphix:log_spacemap", "log_spacemap", + "Log metaslab changes on a single spacemap and " + "flush them periodically.", + ZFEATURE_FLAG_READONLY_COMPAT, + log_spacemap_deps); } diff --git a/usr/src/common/zfs/zfeature_common.h b/usr/src/common/zfs/zfeature_common.h index ab9ff50ff6..9fc4983228 100644 --- a/usr/src/common/zfs/zfeature_common.h +++ b/usr/src/common/zfs/zfeature_common.h @@ -68,6 +68,7 @@ typedef enum spa_feature { SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_USEROBJ_ACCOUNTING, SPA_FEATURE_PROJECT_QUOTA, + SPA_FEATURE_LOG_SPACEMAP, SPA_FEATURES } spa_feature_t; diff --git a/usr/src/man/man1m/zdb.1m b/usr/src/man/man1m/zdb.1m index ca771c24d7..422fba96d9 100644 --- a/usr/src/man/man1m/zdb.1m +++ b/usr/src/man/man1m/zdb.1m @@ -192,7 +192,8 @@ By default, .Nm verifies that all non-free blocks are referenced, which can be very expensive. .It Fl m -Display the offset, spacemap, and free space of each metaslab. +Display the offset, spacemap, free space of each metaslab, all the log +spacemaps and their obsolete entry statistics. .It Fl mm Also display information about the on-disk free space histogram associated with each metaslab. diff --git a/usr/src/man/man5/zpool-features.5 b/usr/src/man/man5/zpool-features.5 index 21a5369799..38045f80df 100644 --- a/usr/src/man/man5/zpool-features.5 +++ b/usr/src/man/man5/zpool-features.5 @@ -808,5 +808,27 @@ The upgrade process runs in the background and may take a while to complete for the filesystems containing a large number of files. .RE +.sp +.ne 2 +.na +\fBlog_spacemap\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:log_spacemap +READ\-ONLY COMPATIBLE yes +DEPENDENCIES com.delphix:spacemap_v2 +.TE + +This feature improves performance for heavily-fragmented pools, +especially when workloads are heavy in random-writes. +It does so by logging all the metaslab changes on a single spacemap every TXG +instead of scattering multiple writes to all the metaslab spacemaps. + +This feature becomes \fBactive\fR as soon as it is enabled and will never +return to being \fBenabled\fR. 
+.RE + .SH "SEE ALSO" \fBzfs\fR(1M), \fBzpool\fR(1M) diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index 66fc12ff3a..9680204e96 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -124,6 +124,7 @@ dir path=opt/zfs-tests/tests/functional/large_files dir path=opt/zfs-tests/tests/functional/largest_pool dir path=opt/zfs-tests/tests/functional/libzfs dir path=opt/zfs-tests/tests/functional/link_count +dir path=opt/zfs-tests/tests/functional/log_spacemap dir path=opt/zfs-tests/tests/functional/mdb dir path=opt/zfs-tests/tests/functional/migration dir path=opt/zfs-tests/tests/functional/mmap @@ -2563,6 +2564,8 @@ file path=opt/zfs-tests/tests/functional/libzfs/many_fds mode=0555 file path=opt/zfs-tests/tests/functional/link_count/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/link_count/link_count_001 mode=0555 file path=opt/zfs-tests/tests/functional/link_count/setup mode=0555 +file path=opt/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs \ + mode=0555 file path=opt/zfs-tests/tests/functional/mdb/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/mdb/mdb_001_pos mode=0555 file path=opt/zfs-tests/tests/functional/mdb/setup mode=0555 diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run index 14ce6e5bd9..1d8fe09149 100644 --- a/usr/src/test/zfs-tests/runfiles/delphix.run +++ b/usr/src/test/zfs-tests/runfiles/delphix.run @@ -723,3 +723,9 @@ tests = ['zvol_misc_001_neg', 'zvol_misc_002_pos', 'zvol_misc_003_neg', tests = ['many_fds'] pre = post = + +[/opt/zfs-tests/tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run index 6c127881da..3d42388e3f 100644 --- a/usr/src/test/zfs-tests/runfiles/omnios.run +++ b/usr/src/test/zfs-tests/runfiles/omnios.run @@ -727,3 +727,9 @@ tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_003_pos', tests = ['many_fds'] pre = post = + +[/opt/zfs-tests/tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run index 011529f8f1..6f537c7ba8 100644 --- a/usr/src/test/zfs-tests/runfiles/openindiana.run +++ b/usr/src/test/zfs-tests/runfiles/openindiana.run @@ -727,3 +727,9 @@ tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_003_pos', tests = ['many_fds'] pre = post = + +[/opt/zfs-tests/tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/usr/src/test/zfs-tests/runfiles/smartos.run b/usr/src/test/zfs-tests/runfiles/smartos.run index e6bcd4d8d5..f98344bdf8 100644 --- a/usr/src/test/zfs-tests/runfiles/smartos.run +++ b/usr/src/test/zfs-tests/runfiles/smartos.run @@ -626,3 +626,9 @@ tests = ['zvol_misc_001_neg', 'zvol_misc_002_pos', 'zvol_misc_003_neg', tests = ['many_fds'] pre = post = + +[/opt/zfs-tests/tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 03f7fc37fe..aa99c2be00 100644 --- 
a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -85,4 +85,5 @@ typeset -a properties=( "feature@bookmark_v2" "feature@userobj_accounting" "feature@project_quota" + "feature@log_spacemap" ) diff --git a/usr/src/test/zfs-tests/tests/functional/log_spacemap/Makefile b/usr/src/test/zfs-tests/tests/functional/log_spacemap/Makefile new file mode 100644 index 0000000000..afb44c1549 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/log_spacemap/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TARGETDIR = $(ROOTOPTPKG)/tests/functional/log_spacemap + +include $(SRC)/test/zfs-tests/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh b/usr/src/test/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh new file mode 100755 index 0000000000..71a91284e8 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh @@ -0,0 +1,82 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright 2019 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Log spacemaps are generally destroyed at export in order to +# not induce performance overheads at import time. As a result, +# the log spacemap codepaths that read the logs in import times +# are not tested outside of ztest and pools with DEBUG bits doing +# many imports/exports while running the test suite. +# +# This test uses an internal tunable and forces ZFS to keep the +# log spacemaps at export, and then re-imports the pool, thus +# providing explicit testing of those codepaths. It also uses +# another tunable to load all the metaslabs when the pool is +# re-imported so more assertions and verifications will be hit. +# +# STRATEGY: +# 1. Create pool. +# 2. Do a couple of writes to generate some data for spacemap logs. +# 3. Set tunable to keep logs after export. +# 4. Export pool and verify that there are logs with zdb. +# 5. Set tunable to load all metaslabs at import. +# 6. Import pool. +# 7. Reset tunables. 
+# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable32 zfs_keep_log_spacemaps_at_export 0 + log_must set_tunable32 metaslab_debug_load 0 + if poolexists $LOGSM_POOL; then + log_must zpool destroy -f $LOGSM_POOL + fi +} +log_onexit cleanup + +LOGSM_POOL="logsm_import" +TESTDISK="$(echo $DISKS | cut -d' ' -f1)" + +log_must zpool create -o cachefile=none -f $LOGSM_POOL $TESTDISK +log_must zfs create $LOGSM_POOL/fs + +log_must dd if=/dev/urandom of=/$LOGSM_POOL/fs/00 bs=128k count=10 +log_must sync +log_must dd if=/dev/urandom of=/$LOGSM_POOL/fs/00 bs=128k count=10 +log_must sync + +log_must set_tunable32 zfs_keep_log_spacemaps_at_export 1 +log_must zpool export $LOGSM_POOL + +LOGSM_COUNT=$(zdb -m -e $LOGSM_POOL | grep "Log Spacemap object" | wc -l) +if (( LOGSM_COUNT == 0 )); then + log_fail "Pool does not have any log spacemaps after being exported" +fi + +log_must set_tunable32 metaslab_debug_load 1 +log_must zpool import $LOGSM_POOL + +log_pass "Log spacemaps imported with no errors" diff --git a/usr/src/test/zfs-tests/tests/functional/removal/removal_condense_export.ksh b/usr/src/test/zfs-tests/tests/functional/removal/removal_condense_export.ksh index d33b53fe14..7bbf770b4c 100644 --- a/usr/src/test/zfs-tests/tests/functional/removal/removal_condense_export.ksh +++ b/usr/src/test/zfs-tests/tests/functional/removal/removal_condense_export.ksh @@ -37,7 +37,7 @@ function reset default_setup_noexit "$DISKS" "true" log_onexit reset -log_must set_condense_delay 100 +log_must set_condense_delay 500 log_must set_min_bytes 1 log_must zfs set recordsize=512 $TESTPOOL/$TESTFS @@ -75,7 +75,7 @@ log_mustnot vdevs_in_pool $TESTPOOL $REMOVEDISK log_must zfs remap $TESTPOOL/$TESTFS sync -sleep 5 +sleep 4 sync log_must zpool export $TESTPOOL zdb -e $TESTPOOL | grep 'Condensing indirect vdev' || \ diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index aff3427796..78894e23f2 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1413,6 +1413,7 @@ ZFS_COMMON_OBJS += \ spa_config.o \ spa_errlog.o \ spa_history.o \ + spa_log_spacemap.o \ spa_misc.o \ space_map.o \ space_reftree.o \ diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index fb75ef3630..4c9ce98326 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -1530,7 +1530,7 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the - * meta-dnode needs to set it ouside dnode_sync(). + * meta-dnode needs to set it outside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index c09cec15a5..8564900fc9 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] @@ -737,7 +737,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dp->dp_mos_uncompressed_delta = 0; } - if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) { + if (dmu_objset_is_dirty(mos, txg)) { dsl_pool_sync_mos(dp, tx); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 2231664c33..b950ed26d6 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -46,12 +46,21 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* - * Since we can touch multiple metaslabs (and their respective space maps) - * with each transaction group, we benefit from having a smaller space map + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered - * around the disk. + * around the disk. So a sane default for the space map block size + * is 8~16K. */ -int zfs_metaslab_sm_blksz = (1 << 12); +int zfs_metaslab_sm_blksz_no_log = (1 << 14); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. @@ -98,12 +107,27 @@ int zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their - * fragmenation metric (measured as a percentage) is less than or equal to - * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold - * then it will be skipped unless all metaslab groups within the metaslab - * class have also crossed this threshold. + * fragmenation metric (measured as a percentage) is less than or + * equal to zfs_mg_fragmentation_threshold. If a metaslab group + * exceeds this threshold then it will be skipped unless all metaslab + * groups within the metaslab class have also crossed this threshold. + * + * This tunable was introduced to avoid edge cases where we continue + * allocating from very fragmented disks in our pool while other, less + * fragmented disks, exists. On the other hand, if all disks in the + * pool are uniformly approaching the threshold, the threshold can + * be a speed bump in performance, where we keep switching the disks + * that we allocate from (e.g. we allocate some segments from disk A + * making it bypassing the threshold while freeing segments from disk + * B getting its fragmentation below the threshold). + * + * Empirically, we've seen that our vdev selection for allocations is + * good enough that fragmentation increases uniformly across all vdevs + * the majority of the time. Thus we set the threshold percentage high + * enough to avoid hitting the speed bump on pools that are being pushed + * to the edge. */ -int zfs_mg_fragmentation_threshold = 85; +int zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation @@ -140,6 +164,30 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; int metaslab_df_free_pct = 4; /* + * Maximum distance to search forward from the last offset. 
Without this + * limit, fragmented pools can see >100,000 iterations and + * metaslab_block_picker() becomes the performance limiting factor on + * high-performance storage. + * + * With the default setting of 16MB, we typically see less than 500 + * iterations, even with very fragmented, ashift=9 pools. The maximum number + * of iterations possible is: + * metaslab_df_max_search / (2 * (1<<ashift)) + * With the default setting of 16MB this is 16*1024 (with ashift=9) or + * 2048 (with ashift=12). + */ +int metaslab_df_max_search = 16 * 1024 * 1024; + +/* + * If we are not searching forward (due to metaslab_df_max_search, + * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable + * controls what segment is used. If it is set, we will use the largest free + * segment. If it is not set, we will use a segment of exactly the requested + * size (or larger). + */ +int metaslab_df_use_largest_segment = B_FALSE; + +/* * A metaslab is considered "free" if it contains a contiguous * segment which is greater than metaslab_min_alloc_size. */ @@ -239,6 +287,7 @@ static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); +static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); kmem_cache_t *metaslab_alloc_trace_cache; @@ -513,67 +562,6 @@ metaslab_compare(const void *x1, const void *x2) return (AVL_CMP(m1->ms_start, m2->ms_start)); } -uint64_t -metaslab_allocated_space(metaslab_t *msp) -{ - return (msp->ms_allocated_space); -} - -/* - * Verify that the space accounting on disk matches the in-core range_trees. - */ -static void -metaslab_verify_space(metaslab_t *msp, uint64_t txg) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocating = 0; - uint64_t sm_free_space, msp_free_space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!msp->ms_condensing); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* - * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an - * allocated space map. Calling this in non-syncing context - * does not provide a consistent view of the metaslab since - * we're performing allocations in the future. - */ - if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || - !msp->ms_loaded) - return; - - /* - * Even though the smp_alloc field can get negative (e.g. - * see vdev_checkpoint_sm), that should never be the case - * when it come's to a metaslab's space map. - */ - ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); - - sm_free_space = msp->ms_size - metaslab_allocated_space(msp); - - /* - * Account for future allocations since we would have - * already deducted that space from the ms_allocatable. 
- */ - for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocating += - range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); - } - - ASSERT3U(msp->ms_deferspace, ==, - range_tree_space(msp->ms_defer[0]) + - range_tree_space(msp->ms_defer[1])); - - msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + - msp->ms_deferspace + range_tree_space(msp->ms_freed); - - VERIFY3U(sm_free_space, ==, msp_free_space); -} - /* * ========================================================================== * Metaslab groups @@ -662,6 +650,25 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mutex_exit(&mg->mg_lock); } +int +metaslab_sort_by_flushed(const void *va, const void *vb) +{ + const metaslab_t *a = va; + const metaslab_t *b = vb; + + int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + if (likely(cmp)) + return (cmp); + + uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; + uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; + cmp = AVL_CMP(a_vdev_id, b_vdev_id); + if (cmp) + return (cmp); + + return (AVL_CMP(a->ms_id, b->ms_id)); +} + metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { @@ -676,7 +683,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, - sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); + sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; @@ -909,7 +916,6 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - ASSERT(msp != NULL); /* skip if not active or not a member */ if (msp->ms_sm == NULL || msp->ms_group != mg) @@ -1240,13 +1246,16 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) */ static uint64_t metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) + uint64_t max_search) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); + uint64_t first_found; - while (rs != NULL) { - uint64_t offset = P2ROUNDUP(rs->rs_start, align); + if (rs != NULL) + first_found = rs->rs_start; + while (rs != NULL && rs->rs_start - first_found <= max_search) { + uint64_t offset = rs->rs_start; if (offset + size <= rs->rs_end) { *cursor = offset + size; return (offset); @@ -1254,49 +1263,28 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, rs = AVL_NEXT(t, rs); } - /* - * If we know we've searched the whole map (*cursor == 0), give up. - * Otherwise, reset the cursor to the beginning and try again. - */ - if (*cursor == 0) - return (-1ULL); - *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); -} - -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static uint64_t -metaslab_ff_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. 
- */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_allocatable->rt_root; - - return (metaslab_block_picker(t, cursor, size, align)); + return (-1ULL); } -static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc -}; - /* * ========================================================================== - * Dynamic block allocator - - * Uses the first fit allocation scheme until space get low and then - * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold - * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * Dynamic Fit (df) block allocator + * + * Search for a free chunk of at least this size, starting from the last + * offset (for this alignment of block) looking for up to + * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not + * found within 16MB, then return a free chunk of exactly the requested size (or + * larger). + * + * If it seems like searching from the last offset will be unproductive, skip + * that and just return a free chunk of exactly the requested size (or larger). + * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This + * mechanism is probably not very useful and may be removed in the future. + * + * The behavior when not searching can be changed to return the largest free + * chunk, instead of a free chunk of exactly the requested size, by setting + * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t @@ -1312,28 +1300,42 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &rt->rt_root; - uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, + ASSERT3U(avl_numnodes(&rt->rt_root), ==, avl_numnodes(&msp->ms_allocatable_by_size)); - if (max_size < size) - return (-1ULL); - /* - * If we're running low on space switch to using the size - * sorted AVL tree (best-fit). + * If we're running low on space, find a segment based on size, + * rather than iterating based on offset. */ - if (max_size < metaslab_df_alloc_threshold || + if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { - t = &msp->ms_allocatable_by_size; - *cursor = 0; + offset = -1; + } else { + offset = metaslab_block_picker(&rt->rt_root, + cursor, size, metaslab_df_max_search); } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + if (offset == -1) { + range_seg_t *rs; + if (metaslab_df_use_largest_segment) { + /* use largest free segment */ + rs = avl_last(&msp->ms_allocatable_by_size); + } else { + /* use segment of this size, or next largest */ + rs = metaslab_block_find(&msp->ms_allocatable_by_size, + 0, size); + } + if (rs != NULL && rs->rs_start + size <= rs->rs_end) { + offset = rs->rs_start; + *cursor = offset + size; + } + } + + return (offset); } static metaslab_ops_t metaslab_df_ops = { @@ -1451,6 +1453,101 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; * ========================================================================== */ +/* + * Wait for any in-progress metaslab loads to complete. 
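The comment above describes the new dynamic-fit policy in prose. Below is a minimal sketch of the bounded first-fit step on a plain sorted array; seg_t and bounded_first_fit() are hypothetical stand-ins for range_seg_t and metaslab_block_picker(). A failed search returns a sentinel so the caller can fall back to the size-sorted lookup (exact fit, or the largest segment when metaslab_df_use_largest_segment is set), just as metaslab_df_alloc() does above.

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

typedef struct seg {
        uint64_t start;
        uint64_t end;
} seg_t;

/*
 * First fit starting at *cursor, giving up once the walk has moved more
 * than max_search bytes past the first candidate. Segments are sorted by
 * offset and non-overlapping. Returns UINT64_MAX on failure.
 */
static uint64_t
bounded_first_fit(const seg_t *segs, size_t nsegs, uint64_t *cursor,
    uint64_t size, uint64_t max_search)
{
        size_t i = 0;

        /* Skip segments that end at or before the cursor. */
        while (i < nsegs && segs[i].end <= *cursor)
                i++;
        if (i == nsegs)
                return (UINT64_MAX);

        uint64_t first_found = segs[i].start;
        for (; i < nsegs && segs[i].start - first_found <= max_search; i++) {
                uint64_t offset = segs[i].start > *cursor ?
                    segs[i].start : *cursor;
                if (offset + size <= segs[i].end) {
                        *cursor = offset + size;
                        return (offset);
                }
        }
        return (UINT64_MAX);    /* caller falls back to a size-based lookup */
}

int
main(void)
{
        /* Free segments sorted by offset; sizes 4K, 512B, 1MB. */
        seg_t segs[] = {
                { 0, 4096 },
                { 1 << 20, (1 << 20) + 512 },
                { 64ULL << 20, 65ULL << 20 },
        };
        uint64_t cursor = 0;

        /*
         * An 8K request walks the first two segments and then gives up:
         * the third segment starts 64MB past the first candidate, beyond
         * the 16MB search budget.
         */
        uint64_t off = bounded_first_fit(segs, 3, &cursor, 8192, 16ULL << 20);
        assert(off == UINT64_MAX);
        return (0);
}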
+ */ +void +metaslab_load_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_loading) { + ASSERT(!msp->ms_loaded); + cv_wait(&msp->ms_load_cv, &msp->ms_lock); + } +} + +/* + * Wait for any in-progress flushing to complete. + */ +void +metaslab_flush_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_flushing) + cv_wait(&msp->ms_flush_cv, &msp->ms_lock); +} + +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + +/* + * Verify that the space accounting on disk matches the in-core range_trees. + */ +static void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocating = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + /* + * Even though the smp_alloc field can get negative, + * when it comes to a metaslab's space map, that should + * never be the case. + */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + + ASSERT3U(space_map_allocated(msp->ms_sm), >=, + range_tree_space(msp->ms_unflushed_frees)); + + ASSERT3U(metaslab_allocated_space(msp), ==, + space_map_allocated(msp->ms_sm) + + range_tree_space(msp->ms_unflushed_allocs) - + range_tree_space(msp->ms_unflushed_frees)); + + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); + + /* + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocating += + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); + } + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + + msp->ms_deferspace + range_tree_space(msp->ms_freed); + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + static void metaslab_aux_histograms_clear(metaslab_t *msp) { @@ -1574,7 +1671,15 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; - /* see comment in metaslab_verify_unflushed_changes() */ + /* + * We can end up here from vdev_remove_complete(), in which case we + * cannot do these assertions because we hold spa config locks and + * thus we are not allowed to read from the DMU. + * + * We check if the metaslab group has been removed and if that's + * the case we return immediately as that would mean that we are + * here from the aforementioned code path. + */ if (msp->ms_group == NULL) return; @@ -1648,20 +1753,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) VERIFY3U(msp->ms_weight, ==, weight); } -/* - * Wait for any in-progress metaslab loads to complete. 
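The assertions in metaslab_verify_space() above encode a simple identity: the space map's allocated space plus the unflushed allocs, minus the unflushed frees, gives the metaslab's allocated space, and the complementary free space must match the sum of the in-core trees. A worked example with made-up figures for a 1GB metaslab:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint64_t ms_size = 1ULL << 30;          /* 1GB metaslab */
        uint64_t sm_alloc = 300ULL << 20;       /* flushed to the space map */
        uint64_t unflushed_allocs = 50ULL << 20;
        uint64_t unflushed_frees = 20ULL << 20;

        /* ms_allocated_space mirrors sm_alloc + unflushed allocs - frees */
        uint64_t allocated = sm_alloc + unflushed_allocs - unflushed_frees;
        uint64_t sm_free_space = ms_size - allocated;   /* 694MB */

        /* the same free space, seen from the in-core range trees */
        uint64_t allocatable = 600ULL << 20;
        uint64_t allocating = 64ULL << 20;      /* future TXGs */
        uint64_t deferspace = 24ULL << 20;
        uint64_t freed = 6ULL << 20;
        uint64_t msp_free_space = allocatable + allocating +
            deferspace + freed;                 /* also 694MB */

        assert(sm_free_space == msp_free_space);
        return (0);
}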
- */ -static void -metaslab_load_wait(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - while (msp->ms_loading) { - ASSERT(!msp->ms_loaded); - cv_wait(&msp->ms_load_cv, &msp->ms_lock); - } -} - static int metaslab_load_impl(metaslab_t *msp) { @@ -1676,13 +1767,19 @@ metaslab_load_impl(metaslab_t *msp) * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * - * metaslab_sync() can append to the space map while we are loading. - * Therefore we load only entries that existed when we started the - * load. Additionally, metaslab_sync_done() has to wait for the load - * to complete because there are potential races like metaslab_load() - * loading parts of the space map that are currently being appended - * by metaslab_sync(). If we didn't, the ms_allocatable would have - * entries that metaslab_sync_done() would try to re-add later. + * If we are using the log space maps, metaslab_sync() can't write to + * the metaslab's space map while we are loading as we only write to + * it when we are flushing the metaslab, and that can't happen while + * we are loading it. + * + * If we are not using log space maps though, metaslab_sync() can + * append to the space map while we are loading. Therefore we load + * only entries that existed when we started the load. Additionally, + * metaslab_sync_done() has to wait for the load to complete because + * there are potential races like metaslab_load() loading parts of the + * space map that are currently being appended by metaslab_sync(). If + * we didn't, the ms_allocatable would have entries that + * metaslab_sync_done() would try to re-add later. * * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, @@ -1692,6 +1789,7 @@ metaslab_load_impl(metaslab_t *msp) uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); + hrtime_t load_start = gethrtime(); if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); @@ -1703,18 +1801,37 @@ metaslab_load_impl(metaslab_t *msp) */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); + + if (msp->ms_freed != NULL) { + /* + * If the ms_sm doesn't exist, this means that this + * metaslab hasn't gone through metaslab_sync() and + * thus has never been dirtied. So we shouldn't + * expect any unflushed allocs or frees from previous + * TXGs. + * + * Note: ms_freed and all the other trees except for + * the ms_allocatable, can be NULL at this point only + * if this is a new metaslab of a vdev that just got + * expanded. + */ + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from - * changing the ms_sm and the metaslab's range trees while we are - * about to use them and populate the ms_allocatable. The ms_lock - * is insufficient for this because metaslab_sync() doesn't hold - * the ms_lock while writing the ms_checkpointing tree to disk. + * changing the ms_sm (or log_sm) and the metaslab's range trees + * while we are about to use them and populate the ms_allocatable. + * The ms_lock is insufficient for this because metaslab_sync() doesn't + * hold the ms_lock while writing the ms_checkpointing tree to disk. 
*/ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + ASSERT(!msp->ms_condensing); + ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); @@ -1725,10 +1842,60 @@ metaslab_load_impl(metaslab_t *msp) msp->ms_loaded = B_TRUE; /* - * The ms_allocatable contains the segments that exist in the - * ms_defer trees [see ms_synced_length]. Thus we need to remove - * them from ms_allocatable as they will be added again in + * Apply all the unflushed changes to ms_allocatable right + * away so any manipulations we do below have a clear view + * of what is allocated and what is free. + */ + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_remove, msp->ms_allocatable); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_add, msp->ms_allocatable); + + msp->ms_loaded = B_TRUE; + + ASSERT3P(msp->ms_group, !=, NULL); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + if (spa_syncing_log_sm(spa) != NULL) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LOG_SPACEMAP)); + + /* + * If we use a log space map we add all the segments + * that are in ms_unflushed_frees so they are available + * for allocation. + * + * ms_allocatable needs to contain all free segments + * that are ready for allocations (thus not segments + * from ms_freeing, ms_freed, and the ms_defer trees). + * But if we grab the lock in this code path at a sync + * pass later that 1, then it also contains the + * segments of ms_freed (they were added to it earlier + * in this path through ms_unflushed_frees). So we + * need to remove all the segments that exist in + * ms_freed from ms_allocatable as they will be added + * later in metaslab_sync_done(). + * + * When there's no log space map, the ms_allocatable + * correctly doesn't contain any segments that exist + * in ms_freed [see ms_synced_length]. + */ + range_tree_walk(msp->ms_freed, + range_tree_remove, msp->ms_allocatable); + } + + /* + * If we are not using the log space map, ms_allocatable + * contains the segments that exist in the ms_defer trees + * [see ms_synced_length]. Thus we need to remove them + * from ms_allocatable as they will be added again in * metaslab_sync_done(). + * + * If we are using the log space map, ms_allocatable still + * contains the segments that exist in the ms_defer trees. + * Not because it read them through the ms_sm though. But + * because these segments are part of ms_unflushed_frees + * whose segments we add to ms_allocatable earlier in this + * code path. 
*/ for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], @@ -1753,10 +1920,26 @@ metaslab_load_impl(metaslab_t *msp) ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_block_maxsize(msp); - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + hrtime_t load_end = gethrtime(); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, smp_length %llu, " + "unflushed_allocs %llu, unflushed_frees %llu, " + "freed %llu, defer %llu + %llu, " + "loading_time %lld ms", + spa_syncing_txg(spa), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + space_map_length(msp->ms_sm), + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + range_tree_space(msp->ms_freed), + range_tree_space(msp->ms_defer[0]), + range_tree_space(msp->ms_defer[1]), + (longlong_t)((load_end - load_start) / 1000000)); + } + metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); - return (0); } @@ -1782,8 +1965,32 @@ metaslab_load(metaslab_t *msp, uint64_t txg) atomic_inc_64(&mg_ksp->mg_loads.value.ui64); } + /* + * We set the loading flag BEFORE potentially dropping the lock to + * wait for an ongoing flush (see ms_flushing below). This way other + * threads know that there is already a thread that is loading this + * metaslab. + */ msp->ms_loading = B_TRUE; + + /* + * Wait for any in-progress flushing to finish as we drop the ms_lock + * both here (during space_map_load()) and in metaslab_flush() (when + * we flush our changes to the ms_sm). + */ + if (msp->ms_flushing) + metaslab_flush_wait(msp); + + /* + * In the possibility that we were waiting for the metaslab to be + * flushed (where we temporarily dropped the ms_lock), ensure that + * no one else loaded the metaslab somehow. + */ + ASSERT(!msp->ms_loaded); + int error = metaslab_load_impl(msp); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; msp->ms_loaded_txg = txg; cv_broadcast(&msp->ms_load_cv); @@ -1811,7 +2018,7 @@ metaslab_unload(metaslab_t *msp) * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information - * available in-core, whether it is loaded or not + * available in-core, whether it is loaded or not. 
* * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation @@ -1821,7 +2028,7 @@ metaslab_unload(metaslab_t *msp) metaslab_recalculate_weight_and_sort(msp); } -static void +void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { @@ -1835,8 +2042,8 @@ metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, } int -metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, - metaslab_t **msp) +metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, + uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; @@ -1848,6 +2055,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -1911,17 +2119,6 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_allocated_space(ms), 0, 0); } - /* - * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the space map - * so that we can verify frees. - */ - if (metaslab_debug_load && ms->ms_sm != NULL) { - mutex_enter(&ms->ms_lock); - VERIFY0(metaslab_load(ms, txg)); - mutex_exit(&ms->ms_lock); - } - if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); @@ -1932,11 +2129,42 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, return (0); } +static void +metaslab_fini_flush_data(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + if (metaslab_unflushed_txg(msp) == 0) { + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), + ==, NULL); + return; + } + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); +} + +uint64_t +metaslab_unflushed_changes_memused(metaslab_t *ms) +{ + return ((range_tree_numsegs(ms->ms_unflushed_allocs) + + range_tree_numsegs(ms->ms_unflushed_frees)) * + sizeof (range_seg_t)); +} + void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + + metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); @@ -1946,13 +2174,22 @@ metaslab_fini(metaslab_t *msp) -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); + msp->ms_sm = NULL; metaslab_unload(msp); - range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); + for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } @@ -1972,6 +2209,7 
@@ metaslab_fini(metaslab_t *msp) mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); + cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); @@ -2213,9 +2451,9 @@ metaslab_weight_from_range_tree(metaslab_t *msp) } /* - * Calculate the weight based on the on-disk histogram. This should only - * be called after a sync pass has completely finished since the on-disk - * information is updated in metaslab_sync(). + * Calculate the weight based on the on-disk histogram. Should be applied + * only to unloaded metaslabs (i.e no incoming allocations) in-order to + * give results consistent with the on-disk state */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) @@ -2289,7 +2527,6 @@ metaslab_segment_weight(metaslab_t *msp) } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); - return (weight); } @@ -2323,21 +2560,23 @@ metaslab_segment_weight(metaslab_t *msp) /* * Determine if we should attempt to allocate from this metaslab. If the - * metaslab has a maximum size then we can quickly determine if the desired - * allocation size can be satisfied. Otherwise, if we're using segment-based - * weighting then we can determine the maximum allocation that this metaslab - * can accommodate based on the index encoded in the weight. If we're using - * space-based weights then rely on the entire weight (excluding the weight - * type bit). + * metaslab is loaded, then we can determine if the desired allocation + * can be satisfied by looking at the size of the maximum free segment + * on that metaslab. Otherwise, we make our decision based on the metaslab's + * weight. For segment-based weighting we can determine the maximum + * allocation based on the index encoded in its value. For space-based + * weights we rely on the entire weight (excluding the weight-type bit). */ boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize) { - boolean_t should_allocate; - - if (msp->ms_max_size != 0) + if (msp->ms_loaded) { return (msp->ms_max_size >= asize); + } else { + ASSERT0(msp->ms_max_size); + } + boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the @@ -2599,18 +2838,19 @@ metaslab_group_preload(metaslab_group_t *mg) } /* - * Determine if the space map's on-disk footprint is past our tolerance - * for inefficiency. We would like to use the following criteria to make - * our decision: + * Determine if the space map's on-disk footprint is past our tolerance for + * inefficiency. We would like to use the following criteria to make our + * decision: * - * 1. The size of the space map object should not dramatically increase as a - * result of writing out the free space range tree. + * 1. Do not condense if the size of the space map object would dramatically + * increase as a result of writing out the free space range tree. * - * 2. The minimal on-disk space map representation is zfs_condense_pct/100 - * times the size than the free space range tree representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). + * 2. Condense if the on on-disk space map representation is at least + * zfs_condense_pct/100 times the size of the optimal representation + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * - * 3. The on-disk size of the space map should actually decrease. + * 3. Do not condense if the on-disk size of the space map does not actually + * decrease. 
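The criteria above reduce to the small predicate evaluated at the end of metaslab_should_condense(); a worked example with made-up sizes (the 110% figure matches the zfs_condense_pct example above, the block-threshold multiplier and record size are illustrative):

#include <stdint.h>
#include <stdio.h>

static int
should_condense(uint64_t object_size, uint64_t optimal_size,
    uint64_t record_size, uint64_t condense_pct, uint64_t block_threshold)
{
        /* on-disk size must beat both the percentage and block thresholds */
        return (object_size >= optimal_size * condense_pct / 100 &&
            object_size > block_threshold * record_size);
}

int
main(void)
{
        uint64_t optimal = 1ULL << 20;          /* optimal = 1MB -> cutoff 1.1MB */
        uint64_t record = 128ULL << 10;         /* e.g. a 128K space map block */

        /* 1.0MB on disk: below the 1.1MB cutoff, do not condense (prints 0) */
        printf("1.0MB on disk: %d\n",
            should_condense(1ULL << 20, optimal, record, 110, 4));
        /* 1.5MB on disk: past both thresholds, condense (prints 1) */
        printf("1.5MB on disk: %d\n",
            should_condense((3ULL << 20) / 2, optimal, record, 110, 4));
        return (0);
}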
* * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. @@ -2624,30 +2864,11 @@ metaslab_should_condense(metaslab_t *msp) space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; - uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); - - if (zfs_condense_never != 0) - return (B_FALSE); - - /* - * Allocations and frees in early passes are generally more space - * efficient (in terms of blocks described in space map entries) - * than the ones in later passes (e.g. we don't compress after - * sync pass 5) and condensing a metaslab multiple times in a txg - * could degrade performance. - * - * Thus we prefer condensing each metaslab at most once every txg at - * the earliest sync pass possible. If a metaslab is eligible for - * condensing again after being considered for condensing within the - * same txg, it will hopefully be dirty in the next txg where it will - * be condensed at an earlier pass. - */ - if (msp->ms_condense_checked_txg == current_txg) - return (B_FALSE); - msp->ms_condense_checked_txg = current_txg; + ASSERT(sm != NULL); + ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for @@ -2657,96 +2878,343 @@ metaslab_should_condense(metaslab_t *msp) msp->ms_condense_wanted) return (B_TRUE); - uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); + uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); - dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); - uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. - * The minimized form consists of a small number of allocations followed by - * the entries of the free range tree. + * The minimized form consists of a small number of allocations followed + * by the entries of the free range tree (ms_allocatable). The condensed + * spacemap contains all the entries of previous TXGs (including those in + * the pool-wide log spacemaps; thus this is effectively a superset of + * metaslab_flush()), but this TXG's entries still need to be written. */ static void -metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + ASSERT(msp->ms_sm != NULL); + + /* + * In order to condense the space map, we need to change it so it + * only describes which segments are currently allocated and free. + * + * All the current free space resides in the ms_allocatable, all + * the ms_defer trees, and all the ms_allocating trees. We ignore + * ms_freed because it is empty because we're in sync pass 1. We + * ignore ms_freeing because these changes are not yet reflected + * in the spacemap (they will be written later this txg). 
+ * + * So to truncate the space map to represent all the entries of + * previous TXGs we do the following: + * + * 1] We create a range tree (condense tree) that is 100% allocated. + * 2] We remove from it all segments found in the ms_defer trees + * as those segments are marked as free in the original space + * map. We do the same with the ms_allocating trees for the same + * reason. Removing these segments should be a relatively + * inexpensive operation since we expect these trees to have a + * small number of nodes. + * 3] We vacate any unflushed allocs as they should already exist + * in the condense tree. Then we vacate any unflushed frees as + * they should already be part of ms_allocatable. + * 4] At this point, we would ideally like to remove all segments + * in the ms_allocatable tree from the condense tree. This way + * we would write all the entries of the condense tree as the + * condensed space map, which would only contain allocated + * segments with everything else assumed to be freed. + * + * Doing so can be prohibitively expensive as ms_allocatable can + * be large, and therefore computationally expensive to subtract + * from the condense_tree. Instead we first sync out the + * condense_tree and then the ms_allocatable, in the condensed + * space map. While this is not optimal, it is typically close to + * optimal and more importantly much cheaper to compute. + * + * 5] Finally, as both of the unflushed trees were written to our + * new and condensed metaslab space map, we basically flushed + * all the unflushed changes to disk, thus we call + * metaslab_flush_update(). + */ + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, - msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), + spa->spa_name, space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - /* - * Create an range tree that is 100% allocated. We remove segments - * that have been freed in this txg, any deferred frees that exist, - * and any allocation in the future. Removing segments should be - * a relatively inexpensive operation since we expect these trees to - * have a small number of nodes. - */ condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); - range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); - for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + /* - * We're about to drop the metaslab's lock thus allowing - * other consumers to change it's content. 
Set the - * metaslab's ms_condensing flag to ensure that - * allocations on this metaslab do not occur while we're - * in the middle of committing it to disk. This is only critical - * for ms_allocatable as all other range trees use per txg + * We're about to drop the metaslab's lock thus allowing other + * consumers to change its content. Set the metaslab's ms_condensing + * flag to ensure that allocations on this metaslab do not occur + * while we're in the middle of committing it to disk. This is only + * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); - space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); + uint64_t object = space_map_object(msp->ms_sm); + space_map_truncate(sm, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? + zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); /* - * While we would ideally like to create a space map representation - * that consists only of allocation records, doing so can be - * prohibitively expensive because the in-core free tree can be - * large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we sync out two trees, a cheap - * allocation only tree followed by the in-core free tree. While not - * optimal, this is typically close to optimal, and much cheaper to - * compute. + * space_map_truncate() may have reallocated the spacemap object. + * If so, update the vdev_ms_array. + */ + if (space_map_object(msp->ms_sm) != object) { + object = space_map_object(msp->ms_sm); + dmu_write(spa->spa_meta_objset, + msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &object, tx); + } + + /* + * Note: + * When the log space map feature is enabled, each space map will + * always have ALLOCS followed by FREES for each sync pass. This is + * typically true even when the log space map feature is disabled, + * except from the case where a metaslab goes through metaslab_sync() + * and gets condensed. In that case the metaslab's space map will have + * ALLOCS followed by FREES (due to condensing) followed by ALLOCS + * followed by FREES (due to space_map_write() in metaslab_sync()) for + * sync pass 1. */ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); + msp->ms_condensing = B_FALSE; + metaslab_flush_update(msp, tx); +} + +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc.) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. 
+ */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op. + */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + + VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); + + /* update metaslab's position in our flushing tree */ + uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + /* update metaslab counts of spa_log_sm_t nodes */ + spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_sm_increment_current_mscount(spa); + + /* cleanup obsolete logs if any */ + uint64_t log_blocks_before = spa_log_sm_nblocks(spa); + spa_cleanup_old_sm_logs(spa, tx); + uint64_t log_blocks_after = spa_log_sm_nblocks(spa); + VERIFY3U(log_blocks_after, <=, log_blocks_before); + + /* update log space map summary */ + uint64_t blocks_gone = log_blocks_before - log_blocks_after; + spa_log_summary_add_flushed_metaslab(spa); + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); + spa_log_summary_decrement_blkcount(spa, blocks_gone); +} + +boolean_t +metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + ASSERT(msp->ms_sm != NULL); + ASSERT(metaslab_unflushed_txg(msp) != 0); + ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); + + /* + * There is nothing wrong with flushing the same metaslab twice, as + * this codepath should work on that case. However, the current + * flushing scheme makes sure to avoid this situation as we would be + * making all these calls without having anything meaningful to write + * to disk. We assert this behavior here. + */ + ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); + + /* + * We can not flush while loading, because then we would + * not load the ms_unflushed_{allocs,frees}. + */ + if (msp->ms_loading) + return (B_FALSE); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + /* + * Metaslab condensing is effectively flushing. Therefore if the + * metaslab can be condensed we can just condense it instead of + * flushing it. + * + * Note that metaslab_condense() does call metaslab_flush_update() + * so we can just return immediately after condensing. We also + * don't need to care about setting ms_flushing or broadcasting + * ms_flush_cv, even if we temporarily drop the ms_lock in + * metaslab_condense(), as the metaslab is already loaded. + */ + if (msp->ms_loaded && metaslab_should_condense(msp)) { + metaslab_group_t *mg = msp->ms_group; + + /* + * For all histogram operations below refer to the + * comments of metaslab_sync() where we follow a + * similar procedure. 
+ */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + metaslab_condense(msp, tx); + + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); + ASSERT(range_tree_is_empty(msp->ms_freed)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defer[t], tx); + } + metaslab_aux_histograms_update(msp); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + + /* + * Since we recreated the histogram (and potentially + * the ms_sm too while condensing) ensure that the + * weight is updated too because we are not guaranteed + * that this metaslab is dirty and will go through + * metaslab_sync_done(). + */ + metaslab_recalculate_weight_and_sort(msp); + return (B_TRUE); + } + + msp->ms_flushing = B_TRUE; + uint64_t sm_len_before = space_map_length(msp->ms_sm); + + mutex_exit(&msp->ms_lock); + space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, + SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + + uint64_t sm_len_after = space_map_length(msp->ms_sm); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " + "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + (sm_len_after - sm_len_before)); + } + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + metaslab_flush_update(msp, tx); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + msp->ms_flushing = B_FALSE; + cv_broadcast(&msp->ms_flush_cv); + return (B_TRUE); } /* @@ -2761,7 +3229,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; - uint64_t object = space_map_object(msp->ms_sm); ASSERT(!vd->vdev_ishole); @@ -2808,25 +3275,53 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - if (msp->ms_sm == NULL) { - uint64_t new_object; + /* + * Generate a log space map if one doesn't exist already. + */ + spa_generate_syncing_log_sm(spa, tx); - new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); + if (msp->ms_sm == NULL) { + uint64_t new_object = space_map_alloc(mos, + spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? 
+ zfs_metaslab_sm_blksz_with_log : + zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &new_object, tx); + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); - ASSERT(msp->ms_sm != NULL); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } + if (metaslab_unflushed_txg(msp) == 0 && + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT(spa_syncing_log_sm(spa) != NULL); + + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa); + + ASSERT(msp->ms_sm != NULL); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } + if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, - vdev_standard_sm_blksz, tx); + zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, @@ -2855,10 +3350,39 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); - if (msp->ms_loaded && metaslab_should_condense(msp)) { - metaslab_condense(msp, txg, tx); + if (spa->spa_sync_pass == 1 && msp->ms_loaded && + metaslab_should_condense(msp)) + metaslab_condense(msp, tx); + + /* + * We'll be going to disk to sync our space accounting, thus we + * drop the ms_lock during that time so allocations coming from + * open-context (ZIL) for future TXGs do not block. + */ + mutex_exit(&msp->ms_lock); + space_map_t *log_sm = spa_syncing_log_sm(spa); + if (log_sm != NULL) { + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + + space_map_write(log_sm, alloctree, SM_ALLOC, + vd->vdev_id, tx); + space_map_write(log_sm, msp->ms_freeing, SM_FREE, + vd->vdev_id, tx); + mutex_enter(&msp->ms_lock); + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_remove_xor_add(alloctree, + msp->ms_unflushed_frees, msp->ms_unflushed_allocs); + range_tree_remove_xor_add(msp->ms_freeing, + msp->ms_unflushed_allocs, msp->ms_unflushed_frees); + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(msp); } else { - mutex_exit(&msp->ms_lock); + ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, @@ -2878,7 +3402,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the - * ms_lock while writing to the checkpoint space map. + * ms_lock while writing to the checkpoint space map, for the + * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, @@ -2946,6 +3471,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * and instead will just swap the pointers for freeing and freed. 
* We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. + * + * Keep in mind that even if we are currently using a log spacemap + * we want current frees to end up in the ms_allocatable (but not + * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); @@ -2965,11 +3494,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); - if (object != space_map_object(msp->ms_sm)) { - object = space_map_object(msp->ms_sm); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &object, tx); - } + /* + * Verify that the space map object ID has been recorded in the + * vdev_ms_array. + */ + uint64_t object; + VERIFY0(dmu_read(mos, vd->vdev_ms_array, + msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); + VERIFY3U(object, ==, space_map_object(msp->ms_sm)); + mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } @@ -3010,14 +3543,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defer[t] == NULL); - + ASSERT3P(msp->ms_defer[t], ==, NULL); msp->ms_defer[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); + msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); + ASSERT3P(msp->ms_unflushed_frees, ==, NULL); + msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); @@ -3034,21 +3571,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - range_tree_space(msp->ms_freed); + if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } - metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); - /* - * If there's a metaslab_load() in progress, wait for it to complete - * so that we have a consistent view of the in-core space map. - */ - metaslab_load_wait(msp); + if (spa_syncing_log_sm(spa) == NULL) { + /* + * If there's a metaslab_load() in progress and we don't have + * a log space map, it means that we probably wrote to the + * metaslab's space map. If this is the case, we need to + * make sure that we wait for the load to complete so that we + * have a consistent view at the in-core side of the metaslab. + */ + metaslab_load_wait(msp); + } else { + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + } /* * When auto-trimming is enabled, free ranges which are added to @@ -3383,6 +3927,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; + ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); @@ -4578,12 +5123,23 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) offset, size); } - range_tree_verify_not_present(msp->ms_trim, offset, size); + /* + * Check all segments that currently exist in the freeing pipeline. 
+ * + * It would intuitively make sense to also check the current allocating + * tree since metaslab_unalloc_dva() exists for extents that are + * allocated and freed in the same sync pass withing the same txg. + * Unfortunately there are places (e.g. the ZIL) where we allocate a + * segment but then we free part of it within the same txg + * [see zil_sync()]. Thus, we don't call range_tree_verify() in the + * current allocating tree. + */ range_tree_verify_not_present(msp->ms_freeing, offset, size); range_tree_verify_not_present(msp->ms_checkpointing, offset, size); range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify_not_present(msp->ms_defer[j], offset, size); + range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } @@ -4692,3 +5248,54 @@ metaslab_enable(metaslab_t *msp, boolean_t sync) mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); } + +static void +metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) +{ + vdev_t *vd = ms->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + metaslab_unflushed_phys_t entry = { + .msp_unflushed_txg = metaslab_unflushed_txg(ms), + }; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object); + if (err == ENOENT) { + object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object, tx)); + } else { + VERIFY0(err); + } + + dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, + &entry, tx); +} + +void +metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = ms->ms_group->mg_vd->vdev_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ms->ms_unflushed_txg = txg; + metaslab_update_ondisk_flush_data(ms, tx); +} + +uint64_t +metaslab_unflushed_txg(metaslab_t *ms) +{ + return (ms->ms_unflushed_txg); +} diff --git a/usr/src/uts/common/fs/zfs/range_tree.c b/usr/src/uts/common/fs/zfs/range_tree.c index fc705e3796..0ce251126b 100644 --- a/usr/src/uts/common/fs/zfs/range_tree.c +++ b/usr/src/uts/common/fs/zfs/range_tree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -579,10 +579,10 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { - range_seg_t *rs; - - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) + for (range_seg_t *rs = avl_first(&rt->rt_root); rs; + rs = AVL_NEXT(&rt->rt_root, rs)) { func(arg, rs->rs_start, rs->rs_end - rs->rs_start); + } } range_seg_t * @@ -597,6 +597,12 @@ range_tree_space(range_tree_t *rt) return (rt->rt_space); } +uint64_t +range_tree_numsegs(range_tree_t *rt) +{ + return ((rt == NULL) ? 0 : avl_numnodes(&rt->rt_root)); +} + /* Generic range tree functions for maintaining segments in an AVL tree. 
*/ void rt_avl_create(range_tree_t *rt, void *arg) @@ -668,3 +674,73 @@ range_tree_span(range_tree_t *rt) { return (range_tree_max(rt) - range_tree_min(rt)); } + +/* + * Remove any overlapping ranges between the given segment [start, end) + * from removefrom. Add non-overlapping leftovers to addto. + */ +void +range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + range_tree_t *removefrom, range_tree_t *addto) +{ + avl_index_t where; + range_seg_t starting_rs = { + .rs_start = start, + .rs_end = start + 1 + }; + + range_seg_t *curr = avl_find(&removefrom->rt_root, + &starting_rs, &where); + + if (curr == NULL) + curr = avl_nearest(&removefrom->rt_root, where, AVL_AFTER); + + range_seg_t *next; + for (; curr != NULL; curr = next) { + next = AVL_NEXT(&removefrom->rt_root, curr); + + if (start == end) + return; + VERIFY3U(start, <, end); + + /* there is no overlap */ + if (end <= curr->rs_start) { + range_tree_add(addto, start, end - start); + return; + } + + uint64_t overlap_start = MAX(curr->rs_start, start); + uint64_t overlap_end = MIN(curr->rs_end, end); + uint64_t overlap_size = overlap_end - overlap_start; + ASSERT3S(overlap_size, >, 0); + range_tree_remove(removefrom, overlap_start, overlap_size); + + if (start < overlap_start) + range_tree_add(addto, start, overlap_start - start); + + start = overlap_end; + } + VERIFY3P(curr, ==, NULL); + + if (start != end) { + VERIFY3U(start, <, end); + range_tree_add(addto, start, end - start); + } else { + VERIFY3U(start, ==, end); + } +} + +/* + * For each entry in rt, if it exists in removefrom, remove it + * from removefrom. Otherwise, add it to addto. + */ +void +range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, + range_tree_t *addto) +{ + for (range_seg_t *rs = avl_first(&rt->rt_root); rs; + rs = AVL_NEXT(&rt->rt_root, rs)) { + range_tree_remove_xor_add_segment(rs->rs_start, rs->rs_end, + removefrom, addto); + } +} diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 0afcffad45..32bab905e7 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -1356,19 +1356,90 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (0); } +static boolean_t +spa_should_flush_logs_on_unload(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return (B_FALSE); + + if (!spa_writeable(spa)) + return (B_FALSE); + + if (!spa->spa_sync_on) + return (B_FALSE); + + if (spa_state(spa) != POOL_STATE_EXPORTED) + return (B_FALSE); + + if (zfs_keep_log_spacemaps_at_export) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Opens a transaction that will set the flag that will instruct + * spa_sync to attempt to flush all the metaslabs for that txg. 
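range_tree_remove_xor_add_segment() above walks the removefrom tree and, for each overlapping segment, removes the overlap from removefrom and hands any left-hand leftover to addto before continuing from the end of the overlap. A reduced sketch of that per-segment step against a single removefrom segment, with plain integers standing in for range trees (xor_add_one() is a hypothetical name):

#include <assert.h>
#include <stdint.h>

#define MAX(a, b)       ((a) > (b) ? (a) : (b))
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

static uint64_t
xor_add_one(uint64_t start, uint64_t end, uint64_t rs, uint64_t re,
    uint64_t *left_leftover_len, uint64_t *removed_len)
{
        uint64_t overlap_start = MAX(rs, start);
        uint64_t overlap_end = MIN(re, end);

        assert(overlap_start < overlap_end);    /* caller guarantees overlap */
        *removed_len = overlap_end - overlap_start;
        *left_leftover_len = (start < overlap_start) ?
            overlap_start - start : 0;
        return (overlap_end);   /* new "start" for the rest of [start, end) */
}

int
main(void)
{
        uint64_t leftover, removed;

        /* [10, 50) against a removefrom segment [20, 40): */
        uint64_t next = xor_add_one(10, 50, 20, 40, &leftover, &removed);
        assert(leftover == 10); /* [10, 20) goes to addto */
        assert(removed == 20);  /* [20, 40) is removed from removefrom */
        assert(next == 40);     /* [40, 50) is handled by the next iteration */
        return (0);
}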
+ */ +static void +spa_unload_log_sm_flush_all(spa_t *spa) +{ + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + ASSERT3U(spa->spa_log_flushall_txg, ==, 0); + spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); + + dmu_tx_commit(tx); + txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); +} + +static void +spa_unload_log_sm_metadata(spa_t *spa) +{ + void *cookie = NULL; + spa_log_sm_t *sls; + + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, + &cookie)) != NULL) { + VERIFY0(sls->sls_mscount); + kmem_free(sls, sizeof (spa_log_sm_t)); + } + + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_head(&spa->spa_log_summary)) { + VERIFY0(e->lse_mscount); + list_remove(&spa->spa_log_summary, e); + kmem_free(e, sizeof (log_summary_entry_t)); + } + + spa->spa_unflushed_stats.sus_nblocks = 0; + spa->spa_unflushed_stats.sus_memused = 0; + spa->spa_unflushed_stats.sus_blocklimit = 0; +} + /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { - int i; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_load_note(spa, "UNLOADING"); /* + * If the log space map feature is enabled and the pool is getting + * exported (but not destroyed), we want to spend some time flushing + * as many metaslabs as we can in an attempt to destroy log space + * maps and save import time. + */ + if (spa_should_flush_logs_on_unload(spa)) + spa_unload_log_sm_flush_all(spa); + + /* * Stop async tasks. */ spa_async_suspend(spa); @@ -1389,16 +1460,15 @@ spa_unload(spa_t *spa) } /* - * Even though vdev_free() also calls vdev_metaslab_fini, we need - * to call it earlier, before we wait for async i/o to complete. - * This ensures that there is no async metaslab prefetching, by - * calling taskq_wait(mg_taskq). + * This ensures that there is no async metaslab prefetching + * while we attempt to unload the spa. */ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) - vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, spa); + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; + if (vc->vdev_mg != NULL) + taskq_wait(vc->vdev_mg->mg_taskq); + } } if (spa->spa_mmp.mmp_thread) @@ -1452,13 +1522,14 @@ spa_unload(spa_t *spa) } ddt_unload(spa); + spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); - for (i = 0; i < spa->spa_spares.sav_count; i++) + for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, @@ -1471,7 +1542,7 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } @@ -3584,6 +3655,13 @@ spa_ld_load_vdev_metadata(spa_t *spa) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } + error = spa_ld_log_spacemaps(spa); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
*/ @@ -5870,7 +5948,7 @@ spa_reset(char *pool) int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg, id; + uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -5945,19 +6023,9 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } for (int c = 0; c < vd->vdev_children; c++) { - - /* - * Set the vdev id to the first hole, if one exists. - */ - for (id = 0; id < rvd->vdev_children; id++) { - if (rvd->vdev_child[id]->vdev_ishole) { - vdev_free(rvd->vdev_child[id]); - break; - } - } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); - tvd->vdev_id = id; + tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } @@ -7601,6 +7669,18 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) if (spa_sync_pass(spa) != 1) return; + /* + * Note: + * If the log space map feature is active, we stop deferring + * frees to the next TXG and therefore running this function + * would be considered a no-op as spa_deferred_bpobj should + * not have any entries. + * + * That said we run this function anyway (instead of returning + * immediately) for the edge-case scenario where we just + * activated the log space map feature in this TXG but we have + * deferred frees from the previous TXG. + */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, spa_free_sync_cb, zio, tx), ==, 0); @@ -8193,7 +8273,14 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - if (pass < zfs_sync_pass_deferred_free) { + if (pass < zfs_sync_pass_deferred_free || + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + /* + * If the log space map feature is active we don't + * care about deferred frees and the deferred bpobj + * as the log space map should effectively have the + * same results (i.e. appending only to one object). + */ spa_sync_frees(spa, free_bpl, tx); } else { /* @@ -8210,6 +8297,8 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) svr_sync(spa, tx); spa_sync_upgrades(spa, tx); + spa_flush_metaslabs(spa, tx); + vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) @@ -8456,6 +8545,7 @@ spa_sync(spa_t *spa, uint64_t txg) while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); + spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); @@ -8650,6 +8740,21 @@ spa_has_active_shared_spare(spa_t *spa) return (B_FALSE); } +uint64_t +spa_total_metaslabs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t m = 0; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + if (!vdev_is_concrete(vd)) + continue; + m += vd->vdev_ms_count; + } + return (m); +} + sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { diff --git a/usr/src/uts/common/fs/zfs/spa_log_spacemap.c b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c new file mode 100644 index 0000000000..ffa2c60563 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c @@ -0,0 +1,1285 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + */ + +#include <sys/dmu_objset.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/spa_log_spacemap.h> +#include <sys/vdev_impl.h> +#include <sys/zap.h> + +/* + * Log Space Maps + * + * Log space maps are an optimization in ZFS metadata allocations for pools + * whose workloads are primarily random-writes. Random-write workloads are also + * typically random-free, meaning that they are freeing from locations scattered + * throughout the pool. This means that each TXG we will have to append some + * FREE records to almost every metaslab. With log space maps, we hold their + * changes in memory and log them altogether in one pool-wide space map on-disk + * for persistence. As more blocks are accumulated in the log space maps and + * more unflushed changes are accounted in memory, we flush a selected group + * of metaslabs every TXG to relieve memory pressure and potential overheads + * when loading the pool. Flushing a metaslab to disk relieves memory as we + * flush any unflushed changes from memory to disk (i.e. the metaslab's space + * map) and saves import time by making old log space maps obsolete and + * eventually destroying them. [A log space map is said to be obsolete when all + * its entries have made it to their corresponding metaslab space maps]. + * + * == On disk data structures used == + * + * - The pool has a new feature flag and a new entry in the MOS. The feature + * is activated when we create the first log space map and remains active + * for the lifetime of the pool. The new entry in the MOS Directory [refer + * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value + * pairs are of the form <key: txg, value: log space map object for that txg>. + * This entry is our on-disk reference of the log space maps that exist in + * the pool for each TXG and it is used during import to load all the + * metaslab unflushed changes in memory. To see how this structure is first + * created and later populated refer to spa_generate_syncing_log_sm(). To see + * how it is used during import time refer to spa_ld_log_sm_metadata(). + * + * - Each vdev has a new entry in its vdev_top_zap (see field + * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of + * each metaslab in this vdev. This field is the on-disk counterpart of the + * in-memory field ms_unflushed_txg which tells us from which TXG and onwards + * the metaslab haven't had its changes flushed. During import, we use this + * to ignore any entries in the space map log that are for this metaslab but + * from a TXG before msp_unflushed_txg. At that point, we also populate its + * in-memory counterpart and from there both fields are updated every time + * we flush that metaslab. + * + * - A space map is created every TXG and, during that TXG, it is used to log + * all incoming changes (the log space map). When created, the log space map + * is referenced in memory by spa_syncing_log_sm and its object ID is inserted + * to the space map ZAP mentioned above. 
The log space map is closed at the
+ * end of the TXG and will be destroyed when it becomes fully obsolete. We
+ * know when a log space map has become obsolete by looking at the oldest
+ * (and smallest) ms_unflushed_txg in the pool. If that value is bigger
+ * than the log space map's TXG, then it means that no metaslab is missing
+ * the changes from that log and we can therefore destroy it.
+ * [see spa_cleanup_old_sm_logs()].
+ *
+ * == Important in-memory structures ==
+ *
+ * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
+ * the pool by their ms_unflushed_txg field. It is primarily used for three
+ * reasons. First of all, it is used during flushing where we try to flush
+ * metaslabs in order from the oldest-flushed to the most recently flushed
+ * every TXG. Secondly, it helps us look up the ms_unflushed_txg of the
+ * oldest flushed metaslab to distinguish which log space maps have become
+ * obsolete and which ones are still relevant. Finally, it tells us which
+ * metaslabs have unflushed changes in a pool where this feature was just
+ * enabled, as we don't immediately add all of the pool's metaslabs but we
+ * add them over time as they go through metaslab_sync(). The reason that
+ * we do that is to ease these pools into the behavior of the flushing
+ * algorithm (described later on).
+ *
+ * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
+ * counterpart of the space map ZAP mentioned above. It's an AVL tree whose
+ * nodes represent the log space maps in the pool. This in-memory
+ * representation sorts the log space maps by the TXG in which they were
+ * created (which is also the TXG of their unflushed changes). It also
+ * contains the following extra information for each space map:
+ * [1] The number of metaslabs that were last flushed on that TXG. This is
+ * important because if that counter is zero and this is the oldest
+ * log then it means that it is also obsolete.
+ * [2] The number of blocks of that space map. This field is used by the
+ * block heuristic of our flushing algorithm (described later on).
+ * It represents how many blocks of metadata changes ZFS had to write
+ * to disk for that TXG.
+ *
+ * - The per-spa field spa_log_summary is a list of entries that summarizes
+ * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
+ * AVL tree mentioned above. The reason this exists is that our flushing
+ * algorithm (described later) tries to estimate how many metaslabs to flush
+ * in each TXG by iterating over all the log space maps and looking at their
+ * block counts. Summarizing that information means that we don't have to
+ * iterate through each space map, minimizing the runtime overhead that the
+ * flushing algorithm would otherwise induce in syncing context. In terms of
+ * implementation the log summary is used as a queue:
+ * * we modify or pop entries from its head when we flush metaslabs
+ * * we modify or append entries to its tail when we sync changes.
+ *
+ * - Each metaslab has two new range trees that hold its unflushed changes,
+ * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
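[Editor's note: the queue behavior of the log summary described above can be modeled in isolation. The following is a minimal, illustrative userland sketch and is not part of this commit; the entry_t list, the summary_append()/summary_drop_blocks() helpers, and the fixed 100-block "entry is full" threshold are simplifications standing in for the real log_summary_entry_t, summary_add_data(), spa_log_summary_decrement_blkcount(), and summary_entry_is_full() logic shown later in this file.]

/* Illustrative model of the log summary queue; not part of this commit. */
#include <stdio.h>
#include <stdlib.h>

typedef struct entry {
	unsigned long long start_txg;	/* cf. lse_start */
	unsigned long long mscount;	/* cf. lse_mscount */
	unsigned long long blkcount;	/* cf. lse_blkcount */
	struct entry *next;
} entry_t;

static entry_t *head, *tail;

/* Tail side: account incoming log blocks (and flushed metaslabs) for a TXG. */
static void
summary_append(unsigned long long txg, unsigned long long ms,
    unsigned long long blk)
{
	if (tail == NULL || tail->blkcount >= 100) {	/* toy "entry is full" */
		entry_t *e = calloc(1, sizeof (*e));
		e->start_txg = txg;
		if (tail == NULL)
			head = e;
		else
			tail->next = e;
		tail = e;
	}
	tail->mscount += ms;
	tail->blkcount += blk;
}

/* Head side: drop blocks freed when old log space maps are destroyed. */
static void
summary_drop_blocks(unsigned long long gone)
{
	while (head != NULL && gone > 0) {
		if (head->mscount == 0 && gone >= head->blkcount) {
			entry_t *e = head;	/* entry fully obsolete */
			gone -= e->blkcount;
			head = e->next;
			if (head == NULL)
				tail = NULL;
			free(e);
		} else {
			if (gone > head->blkcount)
				gone = head->blkcount;
			head->blkcount -= gone;	/* partially consumed entry */
			gone = 0;
		}
	}
}

int
main(void)
{
	for (unsigned long long txg = 1; txg <= 5; txg++)
		summary_append(txg, 0, 60);	/* 60 log blocks per TXG */
	summary_drop_blocks(120);		/* two old logs destroyed */
	for (entry_t *e = head; e != NULL; e = e->next)
		printf("summary entry: txg %llu, %llu blocks\n",
		    e->start_txg, e->blkcount);
	return (0);
}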
+ *
+ * == Flushing algorithm ==
+ *
+ * The decision of how many metaslabs to flush on a given TXG is guided by
+ * two heuristics:
+ *
+ * [1] The memory heuristic -
+ * We keep track of the memory used by the unflushed trees from all the
+ * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
+ * stays below a certain threshold which is determined by an arbitrary hard
+ * limit and an arbitrary percentage of the system's memory [see
+ * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
+ * unflushed changes is passing that threshold, we flush metaslabs, which
+ * empties their unflushed range trees, reducing the memory used.
+ *
+ * [2] The block heuristic -
+ * We try to keep the total number of blocks in the log space maps in check
+ * so the log doesn't grow indefinitely and we don't induce a lot of overhead
+ * when loading the pool. At the same time we don't want to flush a lot of
+ * metaslabs too often as this would defeat the purpose of the log space map.
+ * As a result we set a limit on the number of blocks that we think it is
+ * acceptable for the log space maps to have and try not to cross it.
+ * [see sus_blocklimit from spa_unflushed_stats].
+ *
+ * In order to stay below the block limit every TXG we have to estimate how
+ * many metaslabs we need to flush based on the current rate of incoming blocks
+ * and our history of log space map blocks. The main idea here is to answer
+ * the question of how many metaslabs we need to flush in order to get rid of
+ * at least X log space map blocks. We can answer this question by iterating
+ * backwards from the oldest log space map to the newest one and looking at
+ * their metaslab and block counts. At this point the log summary mentioned
+ * above comes in handy as it reduces the amount of data that we have to
+ * iterate over (even though it may reduce the preciseness of our estimates due
+ * to its aggregation of data). So with that in mind, we project the incoming
+ * rate of the current TXG into the future and attempt to approximate how many
+ * metaslabs we would need to flush from now on in order to avoid exceeding our
+ * block limit at different points in the future (granted that we would keep
+ * flushing the same number of metaslabs for every TXG). Then we take the
+ * maximum number from all these estimates to be on the safe side. For the
+ * exact implementation details of the algorithm refer to
+ * spa_estimate_metaslabs_to_flush.
+ */
+
+/*
+ * This is used as the block size for the space maps used for the
+ * log space map feature. These space maps benefit from a bigger
+ * block size as we expect to be writing a lot of data to them at
+ * once.
+ */
+unsigned long zfs_log_sm_blksz = 1ULL << 17;
+
+/*
+ * Percentage of the overall system's memory that ZFS allows to be
+ * used for unflushed changes (e.g. the sum of size of all the nodes
+ * in the unflushed trees).
+ *
+ * Note that this value is calculated over 1000000 for finer granularity
+ * (thus the _ppm suffix; reads as "parts per million"). As an example,
+ * the default of 1000 allows 0.1% of memory to be used.
+ */
+unsigned long zfs_unflushed_max_mem_ppm = 1000;
+
+/*
+ * Specific hard-limit in memory that ZFS allows to be used for
+ * unflushed changes.
+ */
+unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
+
+/*
+ * The following tunable determines the number of blocks that can be used for
+ * the log space maps. It is expressed as a percentage of the total number of
+ * metaslabs in the pool (i.e.
the default of 400 means that the number of log
+ * blocks is capped at 4 times the number of metaslabs).
+ *
+ * This value exists to tune our flushing algorithm, with higher values
+ * flushing metaslabs less often (doing fewer I/Os) per TXG versus lower values
+ * flushing metaslabs more aggressively with the upside of saving overheads
+ * when loading the pool. Another factor in this tradeoff is that flushing
+ * less often can potentially lead to better utilization of the metaslab space
+ * map's block size as we accumulate more changes per flush.
+ *
+ * Given that this tunable indirectly controls the flush rate (metaslabs
+ * flushed per txg), it makes sense to express it as a percentage of the
+ * number of metaslabs in the pool.
+ *
+ * As a rule of thumb we default this tunable to 400% based on the following:
+ *
+ * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
+ * it is reasonable to expect that the amount of obsolete entries changes
+ * linearly from txg to txg (e.g. the oldest log should have the most
+ * obsolete entries, and the most recent one the least). With this we could
+ * say that, at any given time, about half of the entries in the whole space
+ * map log are obsolete. Thus for every two entries for a metaslab in the
+ * log space map, only one of them is valid and actually makes it to the
+ * metaslab's space map.
+ * [factor of 2]
+ * 2] Each entry in the log space map is guaranteed to be two words while
+ * entries in metaslab space maps are generally single-word.
+ * [an extra factor of 2 - 400% overall]
+ * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
+ * account any consolidation of segments from the log space map to the
+ * unflushed range trees nor their history (e.g. a segment being allocated,
+ * then freed, then allocated again means 3 log space map entries but 0
+ * metaslab space map entries). Depending on the workload, we've seen ~1.8
+ * non-obsolete log space map entries per metaslab entry, for a total of
+ * ~600%. Since most of these estimates are workload dependent, though, we
+ * default to 400% to be conservative.
+ *
+ * Thus we could say that even in the worst case of [1] and [2], the factor
+ * should end up being 4.
+ *
+ * That said, regardless of the number of metaslabs in the pool we need to
+ * provide upper and lower bounds for the log block limit.
+ * [see zfs_unflushed_log_block_{min,max}]
+ */
+unsigned long zfs_unflushed_log_block_pct = 400;
+
+/*
+ * If the number of metaslabs is small and our incoming rate is high, we could
+ * get into a situation where we are flushing all our metaslabs every TXG. Thus
+ * we always allow at least this many log blocks.
+ */
+unsigned long zfs_unflushed_log_block_min = 1000;
+
+/*
+ * If the log becomes too big, the import time of the pool can take a hit in
+ * terms of performance. Thus we have a hard limit on the size of the log in
+ * terms of blocks.
+ */
+unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+
+/*
+ * Max # of rows allowed for the log_summary. The tradeoff here is between the
+ * accuracy and stability of the flushing algorithm (longer summary) and its
+ * runtime overhead (a smaller summary is faster to traverse).
+ */
+unsigned long zfs_max_logsm_summary_length = 10;
+
+/*
+ * Tunable that sets the lower bound on the metaslabs to flush every TXG.
+ *
+ * Setting this to 0 has no effect since if the pool is idle we won't even be
+ * creating log space maps and therefore we won't be flushing.
On the other + * hand if the pool has any incoming workload our block heuristic will start + * flushing metaslabs anyway. + * + * The point of this tunable is to be used in extreme cases where we really + * want to flush more metaslabs than our adaptable heuristic plans to flush. + */ +unsigned long zfs_min_metaslabs_to_flush = 1; + +/* + * Tunable that specifies how far in the past do we want to look when trying to + * estimate the incoming log blocks for the current TXG. + * + * Setting this too high may not only increase runtime but also minimize the + * effect of the incoming rates from the most recent TXGs as we take the + * average over all the blocks that we walk + * [see spa_estimate_incoming_log_blocks]. + */ +unsigned long zfs_max_log_walking = 5; + +/* + * This tunable exists solely for testing purposes. It ensures that the log + * spacemaps are not flushed and destroyed during export in order for the + * relevant log spacemap import code paths to be tested (effectively simulating + * a crash). + */ +int zfs_keep_log_spacemaps_at_export = 0; + +static uint64_t +spa_estimate_incoming_log_blocks(spa_t *spa) +{ + ASSERT3U(spa_sync_pass(spa), ==, 1); + uint64_t steps = 0, sum = 0; + + for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); + sls != NULL && steps < zfs_max_log_walking; + sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) { + if (sls->sls_txg == spa_syncing_txg(spa)) { + /* + * skip the log created in this TXG as this would + * make our estimations inaccurate. + */ + continue; + } + sum += sls->sls_nblocks; + steps++; + } + return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0); +} + +uint64_t +spa_log_sm_blocklimit(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_blocklimit); +} + +void +spa_log_sm_set_blocklimit(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT0(spa_log_sm_blocklimit(spa)); + return; + } + + uint64_t calculated_limit = + (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; + spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, + zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); +} + +uint64_t +spa_log_sm_nblocks(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_nblocks); +} + +/* + * Ensure that the in-memory log space map structures and the summary + * have the same block and metaslab counts. 
+ */ +static void +spa_log_summary_verify_counts(spa_t *spa) +{ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0) + return; + + uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed); + + uint64_t ms_in_summary = 0, blk_in_summary = 0; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) { + ms_in_summary += e->lse_mscount; + blk_in_summary += e->lse_blkcount; + } + + uint64_t ms_in_logs = 0, blk_in_logs = 0; + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + ms_in_logs += sls->sls_mscount; + blk_in_logs += sls->sls_nblocks; + } + + VERIFY3U(ms_in_logs, ==, ms_in_summary); + VERIFY3U(ms_in_logs, ==, ms_in_avl); + VERIFY3U(blk_in_logs, ==, blk_in_summary); + VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa)); +} + +static boolean_t +summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) +{ + uint64_t blocks_per_row = MAX(1, + DIV_ROUND_UP(spa_log_sm_blocklimit(spa), + zfs_max_logsm_summary_length)); + + return (blocks_per_row <= e->lse_blkcount); +} + +/* + * Update the log summary information to reflect the fact that a metaslab + * was flushed or destroyed (e.g due to device removal or pool export/destroy). + * + * We typically flush the oldest flushed metaslab so the first (and oldest) + * entry of the summary is updated. However if that metaslab is getting loaded + * we may flush the second oldest one which may be part of an entry later in + * the summary. Moreover, if we call into this function from metaslab_fini() + * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask + * for a txg as an argument so we can locate the appropriate summary entry for + * the metaslab. + */ +void +spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) +{ + /* + * We don't track summary data for read-only pools and this function + * can be called from metaslab_fini(). In that case return immediately. + */ + if (!spa_writeable(spa)) + return; + + log_summary_entry_t *target = NULL; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_next(&spa->spa_log_summary, e)) { + if (e->lse_start > txg) + break; + target = e; + } + + if (target == NULL || target->lse_mscount == 0) { + /* + * We didn't find a summary entry for this metaslab. We must be + * at the teardown of a spa_load() attempt that got an error + * while reading the log space maps. + */ + VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); + return; + } + + target->lse_mscount--; +} + +/* + * Update the log summary information to reflect the fact that we destroyed + * old log space maps. Since we can only destroy the oldest log space maps, + * we decrement the block count of the oldest summary entry and potentially + * destroy it when that count hits 0. + * + * This function is called after a metaslab is flushed and typically that + * metaslab is the oldest flushed, which means that this function will + * typically decrement the block count of the first entry of the summary and + * potentially free it if the block count gets to zero (its metaslab count + * should be zero too at that point). 
+ * + * There are certain scenarios though that don't work exactly like that so we + * need to account for them: + * + * Scenario [1]: It is possible that after we flushed the oldest flushed + * metaslab and we destroyed the oldest log space map, more recent logs had 0 + * metaslabs pointing to them so we got rid of them too. This can happen due + * to metaslabs being destroyed through device removal, or because the oldest + * flushed metaslab was loading but we kept flushing more recently flushed + * metaslabs due to the memory pressure of unflushed changes. Because of that, + * we always iterate from the beginning of the summary and if blocks_gone is + * bigger than the block_count of the current entry we free that entry (we + * expect its metaslab count to be zero), we decrement blocks_gone and on to + * the next entry repeating this procedure until blocks_gone gets decremented + * to 0. Doing this also works for the typical case mentioned above. + * + * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by + * the first (and oldest) entry in the summary. If the first few entries of + * the summary were only accounting metaslabs from a device that was just + * removed, then the current oldest flushed metaslab could be accounted by an + * entry somewhere in the middle of the summary. Moreover flushing that + * metaslab will destroy all the log space maps older than its ms_unflushed_txg + * because they became obsolete after the removal. Thus, iterating as we did + * for scenario [1] works out for this case too. + * + * Scenario [3]: At times we decide to flush all the metaslabs in the pool + * in one TXG (either because we are exporting the pool or because our flushing + * heuristics decided to do so). When that happens all the log space maps get + * destroyed except the one created for the current TXG which doesn't have + * any log blocks yet. As log space maps get destroyed with every metaslab that + * we flush, entries in the summary are also destroyed. This brings a weird + * corner-case when we flush the last metaslab and the log space map of the + * current TXG is in the same summary entry with other log space maps that + * are older. When that happens we are eventually left with this one last + * summary entry whose blocks are gone (blocks_gone equals the entry's block + * count) but its metaslab count is non-zero (because it accounts all the + * metaslabs in the pool as they all got flushed). Under this scenario we can't + * free this last summary entry as it's referencing all the metaslabs in the + * pool and its block count will get incremented at the end of this sync (when + * we close the syncing log space map). Thus we just decrement its current + * block count and leave it alone. In the case that the pool gets exported, + * its metaslab count will be decremented over time as we call metaslab_fini() + * for all the metaslabs in the pool and the entry will be freed at + * spa_unload_log_sm_metadata(). + */ +void +spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) +{ + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_head(&spa->spa_log_summary)) { + if (e->lse_blkcount > blocks_gone) { + /* + * Assert that we stopped at an entry that is not + * obsolete. 
+ */ + ASSERT(e->lse_mscount != 0); + + e->lse_blkcount -= blocks_gone; + blocks_gone = 0; + break; + } else if (e->lse_mscount == 0) { + /* remove obsolete entry */ + blocks_gone -= e->lse_blkcount; + list_remove(&spa->spa_log_summary, e); + kmem_free(e, sizeof (log_summary_entry_t)); + } else { + /* Verify that this is scenario [3] mentioned above. */ + VERIFY3U(blocks_gone, ==, e->lse_blkcount); + + /* + * Assert that this is scenario [3] further by ensuring + * that this is the only entry in the summary. + */ + VERIFY3P(e, ==, list_tail(&spa->spa_log_summary)); + ASSERT3P(e, ==, list_head(&spa->spa_log_summary)); + + blocks_gone = e->lse_blkcount = 0; + break; + } + } + + /* + * Ensure that there is no way we are trying to remove more blocks + * than the # of blocks in the summary. + */ + ASSERT0(blocks_gone); +} + +void +spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg) +{ + spa_log_sm_t target = { .sls_txg = txg }; + spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, + &target, NULL); + + if (sls == NULL) { + /* + * We must be at the teardown of a spa_load() attempt that + * got an error while reading the log space maps. + */ + VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); + return; + } + + ASSERT(sls->sls_mscount > 0); + sls->sls_mscount--; +} + +void +spa_log_sm_increment_current_mscount(spa_t *spa) +{ + spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg); + + ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa)); + last_sls->sls_mscount++; +} + +static void +summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, + uint64_t nblocks) +{ + log_summary_entry_t *e = list_tail(&spa->spa_log_summary); + + if (e == NULL || summary_entry_is_full(spa, e)) { + e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); + e->lse_start = txg; + list_insert_tail(&spa->spa_log_summary, e); + } + + ASSERT3U(e->lse_start, <=, txg); + e->lse_mscount += metaslabs_flushed; + e->lse_blkcount += nblocks; +} + +static void +spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) +{ + summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks); +} + +void +spa_log_summary_add_flushed_metaslab(spa_t *spa) +{ + summary_add_data(spa, spa_syncing_txg(spa), 1, 0); +} + +/* + * This function attempts to estimate how many metaslabs should + * we flush to satisfy our block heuristic for the log spacemap + * for the upcoming TXGs. + * + * Specifically, it first tries to estimate the number of incoming + * blocks in this TXG. Then by projecting that incoming rate to + * future TXGs and using the log summary, it figures out how many + * flushes we would need to do for future TXGs individually to + * stay below our block limit and returns the maximum number of + * flushes from those estimates. + */ +static uint64_t +spa_estimate_metaslabs_to_flush(spa_t *spa) +{ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(spa_log_sm_blocklimit(spa) != 0); + + /* + * This variable contains the incoming rate that will be projected + * and used for our flushing estimates in the future. + */ + uint64_t incoming = spa_estimate_incoming_log_blocks(spa); + + /* + * At any point in time this variable tells us how many + * TXGs in the future we are so we can make our estimations. + */ + uint64_t txgs_in_future = 1; + + /* + * This variable tells us how much room do we have until we hit + * our limit. When it goes negative, it means that we've exceeded + * our limit and we need to flush. 
+ * + * Note that since we start at the first TXG in the future (i.e. + * txgs_in_future starts from 1) we already decrement this + * variable by the incoming rate. + */ + int64_t available_blocks = + spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; + + /* + * This variable tells us the total number of flushes needed to + * keep the log size within the limit when we reach txgs_in_future. + */ + uint64_t total_flushes = 0; + + /* Holds the current maximum of our estimates so far. */ + uint64_t max_flushes_pertxg = + MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed), + zfs_min_metaslabs_to_flush); + + /* + * For our estimations we only look as far in the future + * as the summary allows us. + */ + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) { + + /* + * If there is still room before we exceed our limit + * then keep skipping TXGs accumulating more blocks + * based on the incoming rate until we exceed it. + */ + if (available_blocks >= 0) { + uint64_t skip_txgs = (available_blocks / incoming) + 1; + available_blocks -= (skip_txgs * incoming); + txgs_in_future += skip_txgs; + ASSERT3S(available_blocks, >=, -incoming); + } + + /* + * At this point we're far enough into the future where + * the limit was just exceeded and we flush metaslabs + * based on the current entry in the summary, updating + * our available_blocks. + */ + ASSERT3S(available_blocks, <, 0); + available_blocks += e->lse_blkcount; + total_flushes += e->lse_mscount; + + /* + * Keep the running maximum of the total_flushes that + * we've done so far over the number of TXGs in the + * future that we are. The idea here is to estimate + * the average number of flushes that we should do + * every TXG so that when we are that many TXGs in the + * future we stay under the limit. + */ + max_flushes_pertxg = MAX(max_flushes_pertxg, + DIV_ROUND_UP(total_flushes, txgs_in_future)); + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, + max_flushes_pertxg); + } + return (max_flushes_pertxg); +} + +uint64_t +spa_log_sm_memused(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_memused); +} + +static boolean_t +spa_log_exceeds_memlimit(spa_t *spa) +{ + if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt) + return (B_TRUE); + + uint64_t system_mem_allowed = ((physmem * PAGESIZE) * + zfs_unflushed_max_mem_ppm) / 1000000; + if (spa_log_sm_memused(spa) > system_mem_allowed) + return (B_TRUE); + + return (B_FALSE); +} + +boolean_t +spa_flush_all_logs_requested(spa_t *spa) +{ + return (spa->spa_log_flushall_txg != 0); +} + +void +spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + + if (spa_sync_pass(spa) != 1) + return; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + /* + * If we don't have any metaslabs with unflushed changes + * return immediately. + */ + if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0) + return; + + /* + * During SPA export we leave a few empty TXGs to go by [see + * spa_final_dirty_txg() to understand why]. For this specific + * case, it is important to not flush any metaslabs as that + * would dirty this TXG. + * + * That said, during one of these dirty TXGs that is less or + * equal to spa_final_dirty(), spa_unload() will request that + * we try to flush all the metaslabs for that TXG before + * exporting the pool, thus we ensure that we didn't get a + * request of flushing everything before we attempt to return + * immediately. 
+ */ + if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && + !spa_flush_all_logs_requested(spa)) + return; + + /* + * We need to generate a log space map before flushing because this + * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg) + * for this TXG's flushed metaslab count (aka sls_mscount which is + * manipulated in many ways down the metaslab_flush() codepath). + * + * That is not to say that we may generate a log space map when we + * don't need it. If we are flushing metaslabs, that means that we + * were going to write changes to disk anyway, so even if we were + * not flushing, a log space map would have been created anyway in + * metaslab_sync(). + */ + spa_generate_syncing_log_sm(spa, tx); + + /* + * This variable tells us how many metaslabs we want to flush based + * on the block-heuristic of our flushing algorithm (see block comment + * of log space map feature). We also decrement this as we flush + * metaslabs and attempt to destroy old log space maps. + */ + uint64_t want_to_flush; + if (spa_flush_all_logs_requested(spa)) { + ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); + want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed); + } else { + want_to_flush = spa_estimate_metaslabs_to_flush(spa); + } + + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, + want_to_flush); + + /* Used purely for verification purposes */ + uint64_t visited = 0; + + /* + * Ideally we would only iterate through spa_metaslabs_by_flushed + * using only one variable (curr). We can't do that because + * metaslab_flush() mutates position of curr in the AVL when + * it flushes that metaslab by moving it to the end of the tree. + * Thus we always keep track of the original next node of the + * current node (curr) in another variable (next). + */ + metaslab_t *next = NULL; + for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed); + curr != NULL; curr = next) { + next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr); + + /* + * If this metaslab has been flushed this txg then we've done + * a full circle over the metaslabs. + */ + if (metaslab_unflushed_txg(curr) == txg) + break; + + /* + * If we are done flushing for the block heuristic and the + * unflushed changes don't exceed the memory limit just stop. + */ + if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa)) + break; + + mutex_enter(&curr->ms_sync_lock); + mutex_enter(&curr->ms_lock); + boolean_t flushed = metaslab_flush(curr, tx); + mutex_exit(&curr->ms_lock); + mutex_exit(&curr->ms_sync_lock); + + /* + * If we failed to flush a metaslab (because it was loading), + * then we are done with the block heuristic as it's not + * possible to destroy any log space maps once you've skipped + * a metaslab. In that case we just set our counter to 0 but + * we continue looping in case there is still memory pressure + * due to unflushed changes. Note that, flushing a metaslab + * that is not the oldest flushed in the pool, will never + * destroy any log space maps [see spa_cleanup_old_sm_logs()]. + */ + if (!flushed) { + want_to_flush = 0; + } else if (want_to_flush > 0) { + want_to_flush--; + } + + visited++; + } + ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); +} + +/* + * Close the log space map for this TXG and update the block counts + * for the the log's in-memory structure and the summary. 
+ */ +void +spa_sync_close_syncing_log_sm(spa_t *spa) +{ + if (spa_syncing_log_sm(spa) == NULL) + return; + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); + ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa)); + + sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa)); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + + /* + * Note that we can't assert that sls_mscount is not 0, + * because there is the case where the first metaslab + * in spa_metaslabs_by_flushed is loading and we were + * not able to flush any metaslabs the current TXG. + */ + ASSERT(sls->sls_nblocks != 0); + + spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks); + spa_log_summary_verify_counts(spa); + + space_map_close(spa->spa_syncing_log_sm); + spa->spa_syncing_log_sm = NULL; + + /* + * At this point we tried to flush as many metaslabs as we + * can as the pool is getting exported. Reset the "flush all" + * so the last few TXGs before closing the pool can be empty + * (e.g. not dirty). + */ + if (spa_flush_all_logs_requested(spa)) { + ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); + spa->spa_log_flushall_txg = 0; + } +} + +void +spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) +{ + objset_t *mos = spa_meta_objset(spa); + + uint64_t spacemap_zap; + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + return; + } + VERIFY0(error); + + metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed); + uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest); + + /* Free all log space maps older than the oldest_flushed_txg. */ + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls && sls->sls_txg < oldest_flushed_txg; + sls = avl_first(&spa->spa_sm_logs_by_txg)) { + ASSERT0(sls->sls_mscount); + avl_remove(&spa->spa_sm_logs_by_txg, sls); + space_map_free_obj(mos, sls->sls_sm_obj, tx); + VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); + spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; + kmem_free(sls, sizeof (spa_log_sm_t)); + } +} + +static spa_log_sm_t * +spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg) +{ + spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP); + + sls->sls_sm_obj = sm_obj; + sls->sls_txg = txg; + return (sls); +} + +void +spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + objset_t *mos = spa_meta_objset(spa); + + if (spa_syncing_log_sm(spa) != NULL) + return; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + uint64_t spacemap_zap; + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + + error = 0; + spacemap_zap = zap_create(mos, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, + &spacemap_zap, tx)); + spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx); + } + VERIFY0(error); + + uint64_t sm_obj; + ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj), + ==, ENOENT); + sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx); + VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx)); + avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg)); + + /* + * We pass UINT64_MAX as the 
space map's representation size + * and SPA_MINBLOCKSHIFT as the shift, to make the space map + * accept any sorts of segments since there's no real advantage + * to being more restrictive (given that we're already going + * to be using 2-word entries). + */ + VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, + 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + /* + * If the log space map feature was just enabled, the blocklimit + * has not yet been set. + */ + if (spa_log_sm_blocklimit(spa) == 0) + spa_log_sm_set_blocklimit(spa); +} + +/* + * Find all the log space maps stored in the space map ZAP and sort + * them by their TXG in spa_sm_logs_by_txg. + */ +static int +spa_ld_log_sm_metadata(spa_t *spa) +{ + int error; + uint64_t spacemap_zap; + + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + + error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + /* the space map ZAP doesn't exist yet */ + return (0); + } else if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " + "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]", + error); + return (error); + } + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t log_txg = zfs_strtonum(za.za_name, NULL); + spa_log_sm_t *sls = + spa_log_sm_alloc(za.za_first_integer, log_txg); + avl_add(&spa->spa_sm_logs_by_txg, sls); + } + zap_cursor_fini(&zc); + if (error != ENOENT) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " + "zap_cursor_retrieve(spacemap_zap) [error %d]", + error); + return (error); + } + + for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); + m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { + spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) }; + spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, + &target, NULL); + + /* + * At this point if sls is zero it means that a bug occurred + * in ZFS the last time the pool was open or earlier in the + * import code path. In general, we would have placed a + * VERIFY() here or in this case just let the kernel panic + * with NULL pointer dereference when incrementing sls_mscount, + * but since this is the import code path we can be a bit more + * lenient. Thus, for DEBUG bits we always cause a panic, while + * in production we log the error and just fail the import. + */ + ASSERT(sls != NULL); + if (sls == NULL) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug " + "encountered: could not find log spacemap for " + "TXG %ld [error %d]", + metaslab_unflushed_txg(m), ENOENT); + return (ENOENT); + } + sls->sls_mscount++; + } + + return (0); +} + +typedef struct spa_ld_log_sm_arg { + spa_t *slls_spa; + uint64_t slls_txg; +} spa_ld_log_sm_arg_t; + +static int +spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) +{ + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint32_t vdev_id = sme->sme_vdev; + spa_ld_log_sm_arg_t *slls = arg; + spa_t *spa = slls->slls_spa; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* + * If the vdev has been removed (i.e. it is indirect or a hole) + * skip this entry. The contents of this vdev have already moved + * elsewhere. 
+ */ + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(!ms->ms_loaded); + + /* + * If we have already flushed entries for this TXG to this + * metaslab's space map, then ignore it. Note that we flush + * before processing any allocations/frees for that TXG, so + * the metaslab's space map only has entries from *before* + * the unflushed TXG. + */ + if (slls->slls_txg < metaslab_unflushed_txg(ms)) + return (0); + + switch (sme->sme_type) { + case SM_ALLOC: + range_tree_remove_xor_add_segment(offset, offset + size, + ms->ms_unflushed_frees, ms->ms_unflushed_allocs); + break; + case SM_FREE: + range_tree_remove_xor_add_segment(offset, offset + size, + ms->ms_unflushed_allocs, ms->ms_unflushed_frees); + break; + default: + panic("invalid maptype_t"); + break; + } + return (0); +} + +static int +spa_ld_log_sm_data(spa_t *spa) +{ + int error = 0; + + /* + * If we are not going to do any writes there is no need + * to read the log space maps. + */ + if (!spa_writeable(spa)) + return (0); + + ASSERT0(spa->spa_unflushed_stats.sus_nblocks); + ASSERT0(spa->spa_unflushed_stats.sus_memused); + + hrtime_t read_logs_starttime = gethrtime(); + /* this is a no-op when we don't have space map logs */ + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + error = space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " + "space_map_open(obj=%llu) [error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + + struct spa_ld_log_sm_arg vla = { + .slls_spa = spa, + .slls_txg = sls->sls_txg + }; + error = space_map_iterate(sm, space_map_length(sm), + spa_ld_log_sm_cb, &vla); + if (error != 0) { + space_map_close(sm); + spa_load_failed(spa, "spa_ld_log_sm_data(): failed " + "at space_map_iterate(obj=%llu) [error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + + ASSERT0(sls->sls_nblocks); + sls->sls_nblocks = space_map_nblocks(sm); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + summary_add_data(spa, sls->sls_txg, + sls->sls_mscount, sls->sls_nblocks); + + space_map_close(sm); + } + hrtime_t read_logs_endtime = gethrtime(); + spa_load_note(spa, + "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + (u_longlong_t)spa_log_sm_nblocks(spa), + (u_longlong_t)zfs_log_sm_blksz, + (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + +out: + /* + * Now that the metaslabs contain their unflushed changes: + * [1] recalculate their actual allocated space + * [2] recalculate their weights + * [3] sum up the memory usage of their unflushed range trees + * [4] optionally load them, if debug_load is set + * + * Note that even in the case where we get here because of an + * error (e.g. error != 0), we still want to update the fields + * below in order to have a proper teardown in spa_unload(). 
+ */ + for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); + m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { + mutex_enter(&m->ms_lock); + m->ms_allocated_space = space_map_allocated(m->ms_sm) + + range_tree_space(m->ms_unflushed_allocs) - + range_tree_space(m->ms_unflushed_frees); + + vdev_t *vd = m->ms_group->mg_vd; + metaslab_space_update(vd, m->ms_group->mg_class, + range_tree_space(m->ms_unflushed_allocs), 0, 0); + metaslab_space_update(vd, m->ms_group->mg_class, + -range_tree_space(m->ms_unflushed_frees), 0, 0); + + ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK); + metaslab_recalculate_weight_and_sort(m); + + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(m); + + if (metaslab_debug_load && m->ms_sm != NULL) { + VERIFY0(metaslab_load(m)); + } + mutex_exit(&m->ms_lock); + } + + return (error); +} + +static int +spa_ld_unflushed_txgs(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + if (vd->vdev_top_zap == 0) + return (0); + + uint64_t object = 0; + int error = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &object); + if (error == ENOENT) + return (0); + else if (error != 0) { + spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at " + "zap_lookup(vdev_top_zap=%llu) [error %d]", + (u_longlong_t)vd->vdev_top_zap, error); + return (error); + } + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *ms = vd->vdev_ms[m]; + ASSERT(ms != NULL); + + metaslab_unflushed_phys_t entry; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + error = dmu_read(mos, object, + entry_offset, entry_size, &entry, 0); + if (error != 0) { + spa_load_failed(spa, "spa_ld_unflushed_txgs(): " + "failed at dmu_read(obj=%llu) [error %d]", + (u_longlong_t)object, error); + return (error); + } + + ms->ms_unflushed_txg = entry.msp_unflushed_txg; + if (ms->ms_unflushed_txg != 0) { + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, ms); + mutex_exit(&spa->spa_flushed_ms_lock); + } + } + return (0); +} + +/* + * Read all the log space map entries into their respective + * metaslab unflushed trees and keep them sorted by TXG in the + * SPA's metadata. In addition, setup all the metadata for the + * memory and the block heuristics. + */ +int +spa_ld_log_spacemaps(spa_t *spa) +{ + int error; + + spa_log_sm_set_blocklimit(spa); + + for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + error = spa_ld_unflushed_txgs(vd); + if (error != 0) + return (error); + } + + error = spa_ld_log_sm_metadata(spa); + if (error != 0) + return (error); + + /* + * Note: we don't actually expect anything to change at this point + * but we grab the config lock so we don't fail any assertions + * when using vdev_lookup_top(). + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + error = spa_ld_log_sm_data(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + return (error); +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index e4a83406f4..6e7926db3a 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. 
All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -62,7 +62,7 @@ /* * SPA locking * - * There are four basic locks for managing spa_t structures: + * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * @@ -595,6 +595,15 @@ spa_deadman(void *arg) vdev_deadman(spa->spa_root_vdev); } +int +spa_log_sm_sort_by_txg(const void *va, const void *vb) +{ + const spa_log_sm_t *a = va; + const spa_log_sm_t *b = vb; + + return (AVL_CMP(a->sls_txg, b->sls_txg)); +} + /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already @@ -624,6 +633,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -684,6 +694,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, + sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); + avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, + sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); + list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), + offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile @@ -751,7 +767,7 @@ spa_remove(spa_t *spa) spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); @@ -780,6 +796,9 @@ spa_remove(spa_t *spa) kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * sizeof (avl_tree_t)); + avl_destroy(&spa->spa_metaslabs_by_flushed); + avl_destroy(&spa->spa_sm_logs_by_txg); + list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); @@ -811,6 +830,7 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); + mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); @@ -2357,6 +2377,12 @@ spa_missing_tvds_allowed(spa_t *spa) return (spa->spa_missing_tvds_allowed); } +space_map_t * +spa_syncing_log_sm(spa_t *spa) +{ + return (spa->spa_syncing_log_sm); +} + void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c index 71e1e8cabc..01f1d587db 100644 --- a/usr/src/uts/common/fs/zfs/space_map.c +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -23,7 +23,8 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #include <sys/zfs_context.h> @@ -34,6 +35,7 @@ #include <sys/dsl_pool.h> #include <sys/zio.h> #include <sys/space_map.h> +#include <sys/spa_log_spacemap.h> #include <sys/refcount.h> #include <sys/zfeature.h> @@ -1066,3 +1068,11 @@ space_map_length(space_map_t *sm) { return (sm != NULL ? sm->sm_phys->smp_length : 0); } + +uint64_t +space_map_nblocks(space_map_t *sm) +{ + if (sm == NULL) + return (0); + return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz)); +} diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 1001f52864..d38914dd1d 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -388,6 +388,7 @@ typedef struct dmu_buf { #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" +#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" /* * Allocate an object from this objset. The range of object numbers diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index 9cb200eaad..10705a84bc 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -49,8 +49,16 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); +void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *); +void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *); +uint64_t metaslab_unflushed_txg(metaslab_t *); +uint64_t metaslab_estimated_condensed_size(metaslab_t *); +int metaslab_sort_by_flushed(const void *, const void *); +uint64_t metaslab_unflushed_changes_memused(metaslab_t *); + int metaslab_load(metaslab_t *, uint64_t); void metaslab_unload(metaslab_t *); +boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *); uint64_t metaslab_allocated_space(metaslab_t *); @@ -105,6 +113,9 @@ uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); +void metaslab_space_update(vdev_t *, metaslab_class_t *, + int64_t, int64_t, int64_t); + metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); @@ -121,6 +132,8 @@ void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); void metaslab_enable(metaslab_t *, boolean_t); +extern int metaslab_debug_load; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index 97b06e712a..5920b3113c 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -366,7 +366,7 @@ struct metaslab { * write to metaslab data on-disk (i.e flushing entries to * the metaslab's space map). It helps coordinate readers of * the metaslab's space map [see spa_vdev_remove_thread()] - * with writers [see metaslab_sync()]. + * with writers [see metaslab_sync() or metaslab_flush()]. 
* * Note that metaslab_load(), even though a reader, uses * a completely different mechanism to deal with the reading @@ -410,7 +410,6 @@ struct metaslab { boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; - uint64_t ms_condense_checked_txg; /* * The number of consumers which have disabled the metaslab. @@ -423,6 +422,8 @@ struct metaslab { */ boolean_t ms_loaded; boolean_t ms_loading; + kcondvar_t ms_flush_cv; + boolean_t ms_flushing; /* * The following histograms count entries that are in the @@ -508,6 +509,22 @@ struct metaslab { metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ + + /* + * Allocs and frees that are committed to the vdev log spacemap but + * not yet to this metaslab's spacemap. + */ + range_tree_t *ms_unflushed_allocs; + range_tree_t *ms_unflushed_frees; + + /* + * We have flushed entries up to but not including this TXG. In + * other words, all changes from this TXG and onward should not + * be in this metaslab's space map and must be read from the + * log space maps. + */ + uint64_t ms_unflushed_txg; /* updated every time we are done syncing the metaslab's space map */ uint64_t ms_synced_length; @@ -515,6 +532,11 @@ struct metaslab { boolean_t ms_new; }; +typedef struct metaslab_unflushed_phys { + /* on-disk counterpart of ms_unflushed_txg */ + uint64_t msp_unflushed_txg; +} metaslab_unflushed_phys_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/range_tree.h b/usr/src/uts/common/fs/zfs/sys/range_tree.h index 588f41fcb7..d450ff7f16 100644 --- a/usr/src/uts/common/fs/zfs/sys/range_tree.h +++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_RANGE_TREE_H @@ -94,6 +94,7 @@ range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize); uint64_t range_tree_space(range_tree_t *rt); +uint64_t range_tree_numsegs(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); @@ -111,6 +112,11 @@ void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); range_seg_t *range_tree_first(range_tree_t *rt); +void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + range_tree_t *removefrom, range_tree_t *addto); +void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, + range_tree_t *addto); + void rt_avl_create(range_tree_t *rt, void *arg); void rt_avl_destroy(range_tree_t *rt, void *arg); void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 653d4ee334..92db9b819b 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -41,6 +41,7 @@ #include <sys/types.h> #include <sys/fs/zfs.h> #include <sys/dmu.h> +#include <sys/space_map.h> #ifdef __cplusplus extern "C" { @@ -1014,6 +1015,7 @@ extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); +extern space_map_t *spa_syncing_log_sm(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); @@ -1065,6 +1067,7 @@ extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); +extern uint64_t spa_total_metaslabs(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern int spa_mode(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 0b2d4a3968..d71971959b 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -34,6 +34,7 @@ #include <sys/spa.h> #include <sys/spa_checkpoint.h> +#include <sys/spa_log_spacemap.h> #include <sys/vdev.h> #include <sys/vdev_removal.h> #include <sys/metaslab.h> @@ -308,6 +309,14 @@ struct spa { spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; + space_map_t *spa_syncing_log_sm; /* current log space map */ + avl_tree_t spa_sm_logs_by_txg; + kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ + avl_tree_t spa_metaslabs_by_flushed; + spa_unflushed_stats_t spa_unflushed_stats; + list_t spa_log_summary; + uint64_t spa_log_flushall_txg; + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa_log_spacemap.h b/usr/src/uts/common/fs/zfs/sys/spa_log_spacemap.h new file mode 100644 index 0000000000..e5b683e5ea --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/spa_log_spacemap.h @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _SYS_SPA_LOG_SPACEMAP_H +#define _SYS_SPA_LOG_SPACEMAP_H + +#include <sys/avl.h> + +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +typedef struct log_summary_entry { + uint64_t lse_start; /* start TXG */ + uint64_t lse_mscount; /* # of metaslabs needed to be flushed */ + uint64_t lse_blkcount; /* blocks held by this entry */ + list_node_t lse_node; +} log_summary_entry_t; + +typedef struct spa_unflushed_stats { + /* used for memory heuristic */ + uint64_t sus_memused; /* current memory used for unflushed trees */ + + /* used for block heuristic */ + uint64_t sus_blocklimit; /* max # of log blocks allowed */ + uint64_t sus_nblocks; /* # of blocks in log space maps currently */ +} spa_unflushed_stats_t; + +typedef struct spa_log_sm { + uint64_t sls_sm_obj; /* space map object ID */ + uint64_t sls_txg; /* txg logged on the space map */ + uint64_t sls_nblocks; /* number of blocks in this log */ + uint64_t sls_mscount; /* # of metaslabs flushed in the log's txg */ + avl_node_t sls_node; /* node in spa_sm_logs_by_txg */ +} spa_log_sm_t; + +int spa_ld_log_spacemaps(spa_t *); + +void spa_generate_syncing_log_sm(spa_t *, dmu_tx_t *); +void spa_flush_metaslabs(spa_t *, dmu_tx_t *); +void spa_sync_close_syncing_log_sm(spa_t *); + +void spa_cleanup_old_sm_logs(spa_t *, dmu_tx_t *); + +uint64_t spa_log_sm_blocklimit(spa_t *); +void spa_log_sm_set_blocklimit(spa_t *); +uint64_t spa_log_sm_nblocks(spa_t *); +uint64_t spa_log_sm_memused(spa_t *); + +void spa_log_sm_decrement_mscount(spa_t *, uint64_t); +void spa_log_sm_increment_current_mscount(spa_t *); + +void spa_log_summary_add_flushed_metaslab(spa_t *); +void spa_log_summary_decrement_mscount(spa_t *, uint64_t); +void spa_log_summary_decrement_blkcount(spa_t *, uint64_t); + +boolean_t spa_flush_all_logs_requested(spa_t *); + +extern int zfs_keep_log_spacemaps_at_export; + +#endif /* _SYS_SPA_LOG_SPACEMAP_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h index 2bce20b48b..5ede2c43e3 100644 --- a/usr/src/uts/common/fs/zfs/sys/space_map.h +++ b/usr/src/uts/common/fs/zfs/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H @@ -72,6 +72,11 @@ typedef struct space_map_phys { * bucket, smp_histogram[i], contains the number of free regions * whose size is: * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) + * + * Note that, if log space map feature is enabled, histograms of + * space maps that belong to metaslabs will take into account any + * unflushed changes for their metaslabs, even though the actual + * space map doesn't have entries for these changes. 
*/ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; } space_map_phys_t; @@ -209,6 +214,8 @@ void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, uint64_t space_map_object(space_map_t *sm); int64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); +uint64_t space_map_entries(space_map_t *sm, range_tree_t *rt); +uint64_t space_map_nblocks(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 9819b85d0c..9caaddf857 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright 2019 Joyent, Inc. */ @@ -525,7 +526,7 @@ extern void vdev_set_min_asize(vdev_t *vd); /* * Global variables */ -extern int vdev_standard_sm_blksz; +extern int zfs_vdev_standard_sm_blksz; /* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h index 2094f8d019..adfbb84de8 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H @@ -61,6 +61,7 @@ extern boolean_t zfs_free_leak_on_eio; #define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) #define ZFS_DEBUG_INDIRECT_REMAP (1 << 9) #define ZFS_DEBUG_TRIM (1 << 11) +#define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) #ifdef ZFS_DEBUG extern void __dprintf(const char *file, const char *func, diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 5be51b7d71..a99e581737 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -265,7 +265,7 @@ txg_sync_stop(dsl_pool_t *dp) ASSERT3U(tx->tx_threads, ==, 2); /* - * We need to ensure that we've vacated the deferred space_maps. + * We need to ensure that we've vacated the deferred metaslab trees. */ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index a4d9415314..142542236c 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -95,14 +95,14 @@ boolean_t vdev_validate_skip = B_FALSE; * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ -int vdev_dtl_sm_blksz = (1 << 12); +int zfs_vdev_dtl_sm_blksz = (1 << 12); /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. 
*/ -int vdev_standard_sm_blksz = (1 << 17); +int zfs_vdev_standard_sm_blksz = (1 << 17); int zfs_ashift_min; @@ -854,6 +854,7 @@ vdev_free(vdev_t *vd) if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); + vd->vdev_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); @@ -1264,6 +1265,13 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); + /* + * Regardless whether this vdev was just added or it is being + * expanded, the metaslab count has changed. Recalculate the + * block limit. + */ + spa_log_sm_set_blocklimit(spa); + return (0); } @@ -2752,7 +2760,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; - new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); + new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, @@ -2766,7 +2774,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); - space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); + space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); @@ -3042,6 +3050,25 @@ vdev_validate_aux(vdev_t *vd) return (0); } +static void +vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) +{ + objset_t *mos = spa_meta_objset(vd->vdev_spa); + + if (vd->vdev_top_zap == 0) + return; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); + if (err == ENOENT) + return; + + VERIFY0(dmu_object_free(mos, object, tx)); + VERIFY0(zap_remove(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); +} + /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. @@ -3069,6 +3096,7 @@ vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); + vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c index 9626589444..493e2b51ed 100644 --- a/usr/src/uts/common/fs/zfs/vdev_indirect.c +++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -818,7 +818,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) if (vdev_obsolete_sm_object(vd) == 0) { uint64_t obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, - vdev_standard_sm_blksz, tx); + zfs_vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, diff --git a/usr/src/uts/common/fs/zfs/vdev_removal.c b/usr/src/uts/common/fs/zfs/vdev_removal.c index 28ce4aba9e..01415b5cd2 100644 --- a/usr/src/uts/common/fs/zfs/vdev_removal.c +++ b/usr/src/uts/common/fs/zfs/vdev_removal.c @@ -1160,6 +1160,7 @@ vdev_remove_complete(spa_t *spa) vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; + spa_log_sm_set_blocklimit(spa); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); @@ -1400,6 +1401,10 @@ spa_vdev_remove_thread(void *arg) VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, svr->svr_allocd_segs); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); @@ -1596,6 +1601,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, svr->svr_allocd_segs); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); @@ -1718,19 +1728,14 @@ vdev_remove_make_hole_and_free(vdev_t *vd) uint64_t id = vd->vdev_id; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); - } + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); vdev_config_dirty(rvd); /* @@ -1792,7 +1797,28 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); + /* + * When the log space map feature is enabled we look at + * the vdev's top_zap to find the on-disk flush data of + * the metaslab we just flushed. Thus, while removing a + * log vdev we make sure to call vdev_metaslab_fini() + * first, which removes all metaslabs of this vdev from + * spa_metaslabs_by_flushed before vdev_remove_empty() + * destroys the top_zap of this log vdev. + * + * This avoids the scenario where we flush a metaslab + * from the log vdev being removed that doesn't have a + * top_zap and end up failing to lookup its on-disk flush + * data. 
+ * + * We don't call metaslab_group_destroy() right away + * though (it will be called in vdev_free() later) as + * during metaslab_sync() of metaslabs from other vdevs + * we may touch the metaslab group of this vdev through + * metaslab_class_histogram_verify() + */ vdev_metaslab_fini(vd); + spa_log_sm_set_blocklimit(spa); spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 9fc8f9a8f2..a932ccb544 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1042,10 +1042,16 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. + * + * Note that we only defer frees after zfs_sync_pass_deferred_free + * when the log space map feature is disabled. [see relevant comment + * in spa_sync_iterate_to_convergence()] */ - if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || + if (BP_IS_GANG(bp) || + BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || - spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { + (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && + !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0))); @@ -1061,7 +1067,6 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); - ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 7d5fcee519..4d81e8d40a 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -695,6 +695,8 @@ typedef struct zpool_load_policy { "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" +#define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ + "com.delphix:ms_unflushed_phys_txgs" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" |
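
To make the space_map_nblocks() helper added to space_map.c concrete: it rounds the space map's on-disk length up to whole blocks using the DIV_ROUND_UP macro introduced in spa_log_spacemap.h. The sketch below is a minimal user-space approximation; toy_space_map_t and toy_space_map_nblocks() are stand-ins for illustration only, not the illumos structures.

#include <stdint.h>
#include <stdio.h>

/* Same rounding rule as the DIV_ROUND_UP added in spa_log_spacemap.h. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

typedef struct {
	uint64_t smp_length;	/* bytes of space map entries written */
	uint64_t sm_blksz;	/* block size backing the space map */
} toy_space_map_t;

static uint64_t
toy_space_map_nblocks(const toy_space_map_t *sm)
{
	if (sm == NULL)
		return (0);
	return (DIV_ROUND_UP(sm->smp_length, sm->sm_blksz));
}

int
main(void)
{
	toy_space_map_t sm = { .smp_length = 130 * 1024, .sm_blksz = 128 * 1024 };

	/* 130K of entries stored in 128K blocks rounds up to 2 blocks. */
	printf("nblocks = %llu\n",
	    (unsigned long long)toy_space_map_nblocks(&sm));
	return (0);
}

This is the quantity the log space map code tracks per log (sls_nblocks) and in aggregate (sus_nblocks) for its block heuristic.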
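
The memory heuristic (sus_memused) has to estimate how much RAM the unflushed trees pin. Given range_tree_numsegs() from range_tree.h and metaslab_unflushed_changes_memused() from metaslab.h, one plausible accounting is segments-times-segment-size; the committed metaslab.c is not part of these hunks, so the arithmetic below is an assumption, and toy_range_seg_t is much smaller than the real range_seg_t.

#include <stdint.h>
#include <stdio.h>

typedef struct toy_range_seg {
	uint64_t rs_start;
	uint64_t rs_end;
} toy_range_seg_t;

typedef struct toy_range_tree {
	uint64_t rt_numsegs;	/* what range_tree_numsegs() would report */
} toy_range_tree_t;

static uint64_t
toy_unflushed_changes_memused(const toy_range_tree_t *unflushed_allocs,
    const toy_range_tree_t *unflushed_frees)
{
	/* Assumed model: each segment costs one segment structure. */
	return ((unflushed_allocs->rt_numsegs + unflushed_frees->rt_numsegs) *
	    sizeof (toy_range_seg_t));
}

int
main(void)
{
	toy_range_tree_t allocs = { .rt_numsegs = 1000 };
	toy_range_tree_t frees = { .rt_numsegs = 250 };

	printf("unflushed changes pin ~%llu bytes\n",
	    (unsigned long long)toy_unflushed_changes_memused(&allocs, &frees));
	return (0);
}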
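
The spa_vdev_remove_thread() and spa_vdev_remove_cancel_sync() hunks show how a metaslab's current allocated set is reconstructed once unflushed changes exist: load the on-disk space map with SM_ALLOC, then walk ms_unflushed_allocs with range_tree_add and ms_unflushed_frees with range_tree_remove. The toy below models that order of operations with a byte-per-unit bitmap standing in for the range trees; it is an illustration of the pattern, not the removal code.

#include <stdio.h>
#include <string.h>

#define NUNITS 16

static void apply_add(char *set, int start, int len)    { memset(set + start, 1, len); }
static void apply_remove(char *set, int start, int len) { memset(set + start, 0, len); }

int
main(void)
{
	char allocd[NUNITS] = { 0 };

	/* 1. What the metaslab's space map on disk says is allocated. */
	apply_add(allocd, 0, 8);

	/* 2. Allocs committed to the log space map but not yet flushed. */
	apply_add(allocd, 10, 2);

	/* 3. Frees committed to the log space map but not yet flushed. */
	apply_remove(allocd, 2, 2);

	for (int i = 0; i < NUNITS; i++)
		printf("%d", allocd[i]);
	printf("\n");	/* prints 1100111100110000 */
	return (0);
}

This is also why the space_map.h comment notes that a metaslab's space map histogram reflects unflushed changes even though the space map itself has no entries for them yet.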
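
Finally, the zio.c hunk changes when a free is deferred to spa_free_bplist: the "past zfs_sync_pass_deferred_free" case now applies only while the log space map feature is inactive. The sketch below restates that predicate with plain booleans; the tunable's default value is not part of this diff, so 2 is used purely as an example, and the function name is hypothetical.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
free_goes_to_deferred_list(bool is_gang, bool is_dedup,
    uint64_t txg, uint64_t syncing_txg, int sync_pass,
    int sync_pass_deferred_free, bool log_spacemap_active)
{
	return (is_gang || is_dedup || txg != syncing_txg ||
	    (sync_pass >= sync_pass_deferred_free && !log_spacemap_active));
}

int
main(void)
{
	/*
	 * A plain free in a late sync pass of the syncing txg: with the
	 * log space map feature active it is issued immediately; with the
	 * feature inactive it is appended to the deferred list as before.
	 */
	printf("late pass, feature on : deferred=%d\n",
	    free_goes_to_deferred_list(false, false, 100, 100, 5, 2, true));
	printf("late pass, feature off: deferred=%d\n",
	    free_goes_to_deferred_list(false, false, 100, 100, 5, 2, false));
	return (0);
}

That shift is also why the ASSERT on spa_sync_pass() is dropped from zio_free_sync() in the same hunk: immediate frees can now legitimately happen in later passes.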