diff options
author | Lin Ling <Lin.Ling@Sun.COM> | 2010-05-03 14:54:08 -0700 |
---|---|---|
committer | Lin Ling <Lin.Ling@Sun.COM> | 2010-05-03 14:54:08 -0700 |
commit | 3f9d6ad73e45c6823b409f93b0c8d4f62861d2d5 (patch) | |
tree | 195b2b1fa9e897a41897e12fed9b0c6e58d8107e | |
parent | 3113f7cee6785cfe8d9e78c535cf9e2a79283275 (diff) | |
download | illumos-joyent-3f9d6ad73e45c6823b409f93b0c8d4f62861d2d5.tar.gz |
6675946 'zpool status' should show the progress of resilvering for individual disk.
6683750 scrub -s have to wait until resilver completed?
6841252 Resilvering not restartable - causing an excess reboot delay
6855073 spa scrub stats (eg %done) are reset on reboot
6891824 7410 NAS head "continually resilvering" following HDD replacement
6899970 scrub/resilver percent complete reporting in zpool status can be overly optimistic
6940889 add interval (count) args to zpool list
6944623 dbuf_read_done() locking performance improvement
6946760 mutex problem in bplist_enqueue()
6391915 RFE: provide interval arg to zpool status to monitor resilvering
6946512 want zfs_send() to pass back debug info
6943992 'zpool scrub' should not restart the existing scrub silently
6878281 zpool should store the time of last scrub/resilver and other zpool status info in pool properties.
6935158 Assertion failed: used <= spa_get_dspace(dd->dd_pool->dp_spa)
6944388 dsl_dataset_snapshot_reserve_space() causes dp_write_limit=max
--HG--
rename : usr/src/uts/common/fs/zfs/dsl_scrub.c => usr/src/uts/common/fs/zfs/dsl_scan.c
85 files changed, 3522 insertions, 2379 deletions
diff --git a/usr/src/cmd/availdevs/availdevs.c b/usr/src/cmd/availdevs/availdevs.c index 1332a4f2bb..7ecec0a05c 100644 --- a/usr/src/cmd/availdevs/availdevs.c +++ b/usr/src/cmd/availdevs/availdevs.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "availdevs.h" @@ -134,7 +133,7 @@ add_pool_to_xml(nvlist_t *config, void *data) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) || nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &devices) || nvlist_lookup_uint64_array( - devices, ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &n)) { + devices, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &n)) { return (-1); } diff --git a/usr/src/cmd/fm/schemes/zfs/scheme.c b/usr/src/cmd/fm/schemes/zfs/scheme.c index ffa8ebf7f5..c0922f4d89 100644 --- a/usr/src/cmd/fm/schemes/zfs/scheme.c +++ b/usr/src/cmd/fm/schemes/zfs/scheme.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <fm/fmd_fmri.h> @@ -214,7 +213,7 @@ fmd_fmri_unusable(nvlist_t *nvl) vdev_stat_t *vs; uint_t c; - (void) nvlist_lookup_uint64_array(vd, ZPOOL_CONFIG_STATS, + (void) nvlist_lookup_uint64_array(vd, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c); ret = (vs->vs_state < VDEV_STATE_DEGRADED); diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index e722007d72..e7bb40809b 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -129,6 +129,16 @@ getrefcount(uintptr_t addr, mdb_ctf_id_t *id, return (GETMEMBID(addr + off, &rc_id, rc_count, *rc)); } +static boolean_t +strisprint(const char *cp) +{ + for (; *cp; cp++) { + if (!isprint(*cp)) + return (B_FALSE); + } + return (B_TRUE); +} + static int verbose; static int @@ -624,8 +634,10 @@ zap_leaf(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) zlc->l_entry.le_hash); break; case ZAP_CHUNK_ARRAY: - mdb_printf(" %u: array \"%s\"\n", - i, zlc->l_array.la_array); + mdb_printf(" %u: array", i); + if (strisprint((char *)zlc->l_array.la_array)) + mdb_printf(" \"%s\"", zlc->l_array.la_array); + mdb_printf("\n"); if (verbose) { int j; mdb_printf(" "); @@ -811,6 +823,77 @@ abuf_find(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } +/* ARGSUSED */ +static int +dbgmsg_cb(uintptr_t addr, const void *unknown, void *arg) +{ + static mdb_ctf_id_t id; + static boolean_t gotid; + static ulong_t off; + + int *verbosep = arg; + time_t timestamp; + char buf[1024]; + + if (!gotid) { + if (mdb_ctf_lookup_by_name("struct zfs_dbgmsg", &id) == -1) { + mdb_warn("couldn't find struct zfs_dbgmsg"); + return (WALK_ERR); + } + gotid = TRUE; + if (mdb_ctf_offsetof(id, "zdm_msg", &off) == -1) { + mdb_warn("couldn't find zdm_msg"); + return (WALK_ERR); + } + off /= 8; + } + + + if (GETMEMBID(addr, &id, zdm_timestamp, timestamp)) { + return (WALK_ERR); + } + + if (mdb_readstr(buf, sizeof (buf), addr + off) == 
-1) { + mdb_warn("failed to read zdm_msg at %p\n", addr + off); + return (DCMD_ERR); + } + + if (*verbosep) + mdb_printf("%Y ", timestamp); + + mdb_printf("%s\n", buf); + + if (*verbosep) + (void) mdb_call_dcmd("whatis", addr, DCMD_ADDRSPEC, 0, NULL); + + return (WALK_NEXT); +} + +/* ARGSUSED */ +static int +dbgmsg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + GElf_Sym sym; + int verbose = FALSE; + + if (mdb_getopts(argc, argv, + 'v', MDB_OPT_SETBITS, TRUE, &verbose, + NULL) != argc) + return (DCMD_USAGE); + + if (mdb_lookup_by_name("zfs_dbgmsgs", &sym)) { + mdb_warn("can't find zfs_dbgmsgs"); + return (DCMD_ERR); + } + + if (mdb_pwalk("list", dbgmsg_cb, &verbose, sym.st_value) != 0) { + mdb_warn("can't walk zfs_dbgmsgs"); + return (DCMD_ERR); + } + + return (DCMD_OK); +} + /*ARGSUSED*/ static int arc_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) @@ -2195,7 +2278,7 @@ reference_cb(uintptr_t addr, const void *ignored, void *arg) uintptr_t ref_holder; uintptr_t ref_removed; uint64_t ref_number; - boolean_t holder_is_str; + boolean_t holder_is_str = B_FALSE; char holder_str[128]; boolean_t removed = (boolean_t)arg; @@ -2212,18 +2295,8 @@ reference_cb(uintptr_t addr, const void *ignored, void *arg) GETMEMBID(addr, &ref_id, ref_number, ref_number)) return (WALK_ERR); - if (mdb_readstr(holder_str, sizeof (holder_str), ref_holder) != -1) { - char *cp; - holder_is_str = B_TRUE; - for (cp = holder_str; *cp; cp++) { - if (!isprint(*cp)) { - holder_is_str = B_FALSE; - break; - } - } - } else { - holder_is_str = B_FALSE; - } + if (mdb_readstr(holder_str, sizeof (holder_str), ref_holder) != -1) + holder_is_str = strisprint(holder_str); if (removed) mdb_printf("removed "); @@ -2940,6 +3013,8 @@ static const mdb_dcmd_t dcmds[] = { sa_attr_table}, { "sa_attr", ": attr_id", "print SA attribute address when given sa_handle_t", sa_attr_print}, + { "zfs_dbgmsg", ":[-v]", + "print zfs debug log", dbgmsg}, { NULL } }; diff --git 
a/usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c b/usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c index 6a57828a24..4c12778ad1 100644 --- a/usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c +++ b/usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c @@ -610,7 +610,7 @@ ndmpd_zfs_backup_send_read(ndmpd_zfs_args_t *ndmpd_zfs_args) } err = zfs_send(zhp, fromsnap, ndmpd_zfs_args->nz_snapname, flags, - ndmpd_zfs_args->nz_pipe_fd[PIPE_ZFS], NULL, NULL); + ndmpd_zfs_args->nz_pipe_fd[PIPE_ZFS], NULL, NULL, NULL); if (err && !session->ns_data.dd_abort) NDMPD_ZFS_LOG_ZERR(ndmpd_zfs_args, "zfs_send: %d", err); diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 03f7b40ab0..7178537a06 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1146,7 +1145,7 @@ const struct ioc { "zfs_cmd_t" }, { (uint_t)ZFS_IOC_POOL_TRYIMPORT, "ZFS_IOC_POOL_TRYIMPORT", "zfs_cmd_t" }, - { (uint_t)ZFS_IOC_POOL_SCRUB, "ZFS_IOC_POOL_SCRUB", + { (uint_t)ZFS_IOC_POOL_SCAN, "ZFS_IOC_POOL_SCAN", "zfs_cmd_t" }, { (uint_t)ZFS_IOC_POOL_FREEZE, "ZFS_IOC_POOL_FREEZE", "zfs_cmd_t" }, diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 61e79d0e84..2d36cf5488 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -150,6 +150,7 @@ usage(void) "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -p <path> -- use one or more with " "-e to specify path to vdev dir\n"); + (void) fprintf(stderr, " -P print numbers parsable\n"); (void) fprintf(stderr, " -t <txg> -- highest txg to use when " "searching for uberblocks\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. 
-bb) " @@ -196,6 +197,15 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) nvlist_free(nv); } +static void +zdb_nicenum(uint64_t num, char *buf) +{ + if (dump_opt['P']) + (void) sprintf(buf, "%llu", (longlong_t)num); + else + nicenum(num, buf); +} + const char dump_zap_stars[] = "****************************************"; const int dump_zap_width = sizeof (dump_zap_stars) - 1; @@ -490,7 +500,7 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) */ alloc = 0; for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) { - VERIFY(0 == dmu_read(os, smo->smo_object, offset, + VERIFY3U(0, ==, dmu_read(os, smo->smo_object, offset, sizeof (entry), &entry, DMU_READ_PREFETCH)); if (SM_DEBUG_DECODE(entry)) { (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", @@ -525,12 +535,12 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) static void dump_metaslab_stats(metaslab_t *msp) { - char maxbuf[5]; + char maxbuf[32]; space_map_t *sm = &msp->ms_map; avl_tree_t *t = sm->sm_pp_root; int free_pct = sm->sm_space * 100 / sm->sm_size; - nicenum(space_map_maxsize(sm), maxbuf); + zdb_nicenum(space_map_maxsize(sm), maxbuf); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, @@ -544,9 +554,9 @@ dump_metaslab(metaslab_t *msp) spa_t *spa = vd->vdev_spa; space_map_t *sm = &msp->ms_map; space_map_obj_t *smo = &msp->ms_smo; - char freebuf[5]; + char freebuf[32]; - nicenum(sm->sm_size - smo->smo_alloc, freebuf); + zdb_nicenum(sm->sm_size - smo->smo_alloc, freebuf); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", @@ -855,7 +865,7 @@ dump_history(spa_t *spa) (void) snprintf(internalstr, sizeof (internalstr), "[internal %s txg:%lld] %s", - hist_event_table[ievent], txg, + zfs_history_event_names[ievent], txg, intstr); cmd = internalstr; } @@ -966,6 +976,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, ZIO_PRIORITY_ASYNC_READ, 
ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); + ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; @@ -1015,7 +1026,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dir_phys_t *dd = data; time_t crtime; - char nice[6]; + char nice[32]; if (dd == NULL) return; @@ -1032,15 +1043,15 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); - nicenum(dd->dd_used_bytes, nice); + zdb_nicenum(dd->dd_used_bytes, nice); (void) printf("\t\tused_bytes = %s\n", nice); - nicenum(dd->dd_compressed_bytes, nice); + zdb_nicenum(dd->dd_compressed_bytes, nice); (void) printf("\t\tcompressed_bytes = %s\n", nice); - nicenum(dd->dd_uncompressed_bytes, nice); + zdb_nicenum(dd->dd_uncompressed_bytes, nice); (void) printf("\t\tuncompressed_bytes = %s\n", nice); - nicenum(dd->dd_quota, nice); + zdb_nicenum(dd->dd_quota, nice); (void) printf("\t\tquota = %s\n", nice); - nicenum(dd->dd_reserved, nice); + zdb_nicenum(dd->dd_reserved, nice); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); @@ -1050,7 +1061,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) (u_longlong_t)dd->dd_flags); #define DO(which) \ - nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \ + zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); @@ -1066,7 +1077,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dataset_phys_t *ds = data; time_t crtime; - char used[6], compressed[6], uncompressed[6], unique[6]; + char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; if (ds == NULL) @@ -1074,10 +1085,10 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, 
size_t size) ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; - nicenum(ds->ds_used_bytes, used); - nicenum(ds->ds_compressed_bytes, compressed); - nicenum(ds->ds_uncompressed_bytes, uncompressed); - nicenum(ds->ds_unique_bytes, unique); + zdb_nicenum(ds->ds_used_bytes, used); + zdb_nicenum(ds->ds_compressed_bytes, compressed); + zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed); + zdb_nicenum(ds->ds_unique_bytes, unique); sprintf_blkptr(blkbuf, &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", @@ -1122,9 +1133,9 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) bplist_t bpl = { 0 }; blkptr_t blk, *bp = &blk; uint64_t itor = 0; - char bytes[6]; - char comp[6]; - char uncomp[6]; + char bytes[32]; + char comp[32]; + char uncomp[32]; if (dump_opt['d'] < 3) return; @@ -1137,10 +1148,10 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) return; } - nicenum(bpl.bpl_phys->bpl_bytes, bytes); + zdb_nicenum(bpl.bpl_phys->bpl_bytes, bytes); if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) { - nicenum(bpl.bpl_phys->bpl_comp, comp); - nicenum(bpl.bpl_phys->bpl_uncomp, uncomp); + zdb_nicenum(bpl.bpl_phys->bpl_comp, comp); + zdb_nicenum(bpl.bpl_phys->bpl_uncomp, uncomp); (void) printf("\n %s: %llu entries, %s (%s/%s comp)\n", name, (u_longlong_t)bpl.bpl_phys->bpl_entries, bytes, comp, uncomp); @@ -1391,6 +1402,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA attribute layouts */ + dump_zap, /* DSL scrub translations */ + dump_none, /* fake dedup BP */ dump_unknown, /* Unknown type, must be last */ }; @@ -1402,7 +1415,8 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) dnode_t *dn; void *bonus = NULL; size_t bsize = 0; - char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], fill[7]; + char iblk[32], dblk[32], lsize[32], asize[32], fill[32]; + char bonus_size[32]; char aux[50]; int error; @@ 
-1426,11 +1440,11 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } dmu_object_info_from_dnode(dn, &doi); - nicenum(doi.doi_metadata_block_size, iblk); - nicenum(doi.doi_data_block_size, dblk); - nicenum(doi.doi_max_offset, lsize); - nicenum(doi.doi_physical_blocks_512 << 9, asize); - nicenum(doi.doi_bonus_size, bonus_size); + zdb_nicenum(doi.doi_metadata_block_size, iblk); + zdb_nicenum(doi.doi_data_block_size, dblk); + zdb_nicenum(doi.doi_max_offset, lsize); + zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize); + zdb_nicenum(doi.doi_bonus_size, bonus_size); (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); @@ -1492,7 +1506,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } for (;;) { - char segsize[6]; + char segsize[32]; error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) @@ -1500,7 +1514,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); - nicenum(end - start, segsize); + zdb_nicenum(end - start, segsize); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); @@ -1523,7 +1537,7 @@ dump_dir(objset_t *os) dmu_objset_stats_t dds; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; - char numbuf[8]; + char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[MAXNAMELEN]; char *type = "UNKNOWN"; @@ -1547,7 +1561,7 @@ dump_dir(objset_t *os) ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill); - nicenum(refdbytes, numbuf); + zdb_nicenum(refdbytes, numbuf); if (verbosity >= 4) { (void) sprintf(blkbuf, ", rootbp "); @@ -1905,8 +1919,9 @@ zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp, bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } +/* ARGSUSED */ static int 
-zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, +zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; @@ -2222,7 +2237,8 @@ dump_block_stats(spa_t *spa) "\t avg\t comp\t%%Total\tType\n"); for (t = 0; t <= ZDB_OT_TOTAL; t++) { - char csize[6], lsize[6], psize[6], asize[6], avg[6]; + char csize[32], lsize[32], psize[32], asize[32]; + char avg[32]; char *typename; if (t < DMU_OT_NUMTYPES) @@ -2258,11 +2274,11 @@ dump_block_stats(spa_t *spa) zcb.zcb_type[ZB_TOTAL][t].zb_asize) continue; - nicenum(zb->zb_count, csize); - nicenum(zb->zb_lsize, lsize); - nicenum(zb->zb_psize, psize); - nicenum(zb->zb_asize, asize); - nicenum(zb->zb_asize / zb->zb_count, avg); + zdb_nicenum(zb->zb_count, csize); + zdb_nicenum(zb->zb_lsize, lsize); + zdb_nicenum(zb->zb_psize, psize); + zdb_nicenum(zb->zb_asize, asize); + zdb_nicenum(zb->zb_asize / zb->zb_count, avg); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", @@ -2302,7 +2318,7 @@ typedef struct zdb_ddt_entry { /* ARGSUSED */ static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { avl_tree_t *t = arg; avl_index_t where; @@ -2897,7 +2913,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:")) != -1) { + while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) { switch (c) { case 'b': case 'c': @@ -2920,6 +2936,7 @@ main(int argc, char **argv) case 'L': case 'X': case 'e': + case 'P': dump_opt[c]++; break; case 'v': @@ -2970,7 +2987,7 @@ main(int argc, char **argv) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && !strchr("elAFLRSX", c)) + if (dump_all && !strchr("elAFLRSXP", c)) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; diff --git 
a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c index 66f99ccfbf..6176a102d3 100644 --- a/usr/src/cmd/zfs/zfs_main.c +++ b/usr/src/cmd/zfs/zfs_main.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <assert.h> @@ -2573,6 +2572,8 @@ zfs_do_send(int argc, char **argv) zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; + nvlist_t *dbgnv; + boolean_t extraverbose = B_FALSE; /* check options */ while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) { @@ -2595,6 +2596,8 @@ zfs_do_send(int argc, char **argv) flags.props = B_TRUE; break; case 'v': + if (flags.verbose) + extraverbose = B_TRUE; flags.verbose = B_TRUE; break; case 'D': @@ -2679,7 +2682,19 @@ zfs_do_send(int argc, char **argv) if (flags.replicate && fromname == NULL) flags.doall = B_TRUE; - err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0); + err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0, + extraverbose ? &dbgnv : NULL); + + if (extraverbose) { + /* + * dump_nvlist prints to stdout, but that's been + * redirected to a file. Make it print to stderr + * instead. + */ + (void) dup2(STDERR_FILENO, STDOUT_FILENO); + dump_nvlist(dbgnv, 0); + nvlist_free(dbgnv); + } zfs_close(zhp); return (err != 0); diff --git a/usr/src/cmd/zpool/Makefile b/usr/src/cmd/zpool/Makefile index 728fdbe03b..0bf7b02767 100644 --- a/usr/src/cmd/zpool/Makefile +++ b/usr/src/cmd/zpool/Makefile @@ -19,8 +19,7 @@ # CDDL HEADER END # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
# PROG= zpool diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index a31ee80255..c663cea5a1 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <assert.h> @@ -42,7 +41,6 @@ #include <pwd.h> #include <zone.h> #include <sys/fs/zfs.h> - #include <sys/stat.h> #include <libzfs.h> @@ -215,7 +213,7 @@ get_usage(zpool_help_t idx) { "[count]]\n")); case HELP_LIST: return (gettext("\tlist [-H] [-o property[,...]] " - "[pool] ...\n")); + "[-T d|u] [pool] ... [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-t] <pool> <device> ...\n")); case HELP_ONLINE: @@ -228,7 +226,8 @@ get_usage(zpool_help_t idx) { case HELP_SCRUB: return (gettext("\tscrub [-s] <pool> ...\n")); case HELP_STATUS: - return (gettext("\tstatus [-vx] [pool] ...\n")); + return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval " + "[count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" @@ -519,11 +518,10 @@ zpool_do_add(int argc, char **argv) } /* - * zpool remove <pool> <vdev> ... + * zpool remove <pool> <vdev> ... * - * Removes the given vdev from the pool. Currently, this only supports removing - * spares and cache devices from the pool. Eventually, we'll want to support - * removing leaf vdevs (as an alias for 'detach') as well as toplevel vdevs. + * Removes the given vdev from the pool. Currently, this supports removing + * spares, cache, and log devices from the pool. 
*/ int zpool_do_remove(int argc, char **argv) @@ -1044,20 +1042,21 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, { nvlist_t **child; uint_t c, children; + pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; - char rbuf[6], wbuf[6], cbuf[6], repaired[7]; + char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t cb; char *state; - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &c) == 0); - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* @@ -1147,14 +1146,16 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, (void) printf(gettext("corrupted data")); break; } - } else if (vs->vs_scrub_repaired != 0 && children == 0) { - /* - * Report bytes resilvered/repaired on leaf devices. - */ - zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired)); - (void) printf(gettext(" %s %s"), repaired, - (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ? - "resilvered" : "repaired"); + } + + (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c); + + if (ps && ps->pss_state == DSS_SCANNING && + vs->vs_scan_processed != 0 && children == 0) { + (void) printf(gettext(" (%s)"), + (ps->pss_func == POOL_SCAN_RESILVER) ? 
+ "resilvering" : "repairing"); } (void) printf("\n"); @@ -1194,7 +1195,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) strcmp(type, VDEV_TYPE_HOLE) == 0) return; - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", namewidth - depth, name); @@ -1333,7 +1334,7 @@ show_import(nvlist_t *config) verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); @@ -1400,6 +1401,11 @@ show_import(nvlist_t *config) "read.\n")); break; + case ZPOOL_STATUS_RESILVERING: + (void) printf(gettext("status: One or more devices were being " + "resilvered.\n")); + break; + default: /* * No other status can be seen when importing pools. @@ -1990,13 +1996,13 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, char *vname; if (oldnv != NULL) { - verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_STATS, - (uint64_t **)&oldvs, &c) == 0); + verify(nvlist_lookup_uint64_array(oldnv, + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } - verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); if (strlen(name) + depth > cb->cb_namewidth) @@ -2046,6 +2052,12 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, return; for (c = 0; c < children; c++) { + uint64_t ishole = B_FALSE; + + if (nvlist_lookup_uint64(newchild[c], + ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) + continue; + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); print_vdev_stats(zhp, vname, oldnv ? 
oldchild[c] : NULL, newchild[c], cb, depth + 2); @@ -2157,55 +2169,14 @@ get_namewidth(zpool_handle_t *zhp, void *data) } /* - * zpool iostat [-T d|u] [-v] [pool] ... [interval [count]] - * - * -T Display a timestamp in date(1) or Unix format - * -v Display statistics for individual vdevs - * - * This command can be tricky because we want to be able to deal with pool - * creation/destruction as well as vdev configuration changes. The bulk of this - * processing is handled by the pool_list_* routines in zpool_iter.c. We rely - * on pool_list_update() to detect the addition of new pools. Configuration - * changes are all handled within libzfs. + * Parse the input string, get the 'interval' and 'count' value if there is one. */ -int -zpool_do_iostat(int argc, char **argv) +static void +get_interval_count(int *argcp, char **argv, unsigned long *iv, + unsigned long *cnt) { - int c; - int ret; - int npools; unsigned long interval = 0, count = 0; - zpool_list_t *list; - boolean_t verbose = B_FALSE; - iostat_cbdata_t cb; - - /* check options */ - while ((c = getopt(argc, argv, "T:v")) != -1) { - switch (c) { - case 'T': - if (optarg) { - if (*optarg == 'u') - timestamp_fmt = UDATE; - else if (*optarg == 'd') - timestamp_fmt = DDATE; - else - usage(B_FALSE); - } else { - usage(B_FALSE); - } - break; - case 'v': - verbose = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; + int argc = *argcp, errno; /* * Determine if the last argument is an integer or a pool name @@ -2222,7 +2193,6 @@ zpool_do_iostat(int argc, char **argv) "cannot be zero\n")); usage(B_FALSE); } - /* * Ignore the last parameter */ @@ -2239,7 +2209,7 @@ zpool_do_iostat(int argc, char **argv) /* * If the last argument is also an integer, then we have both a count - * and an integer. + * and an interval. 
*/ if (argc > 0 && isdigit(argv[argc - 1][0])) { char *end; @@ -2264,6 +2234,66 @@ zpool_do_iostat(int argc, char **argv) } } + *iv = interval; + *cnt = count; + *argcp = argc; +} + +static void +get_timestamp_arg(char c) +{ + if (c == 'u') + timestamp_fmt = UDATE; + else if (c == 'd') + timestamp_fmt = DDATE; + else + usage(B_FALSE); +} + +/* + * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]] + * + * -v Display statistics for individual vdevs + * -T Display a timestamp in date(1) or Unix format + * + * This command can be tricky because we want to be able to deal with pool + * creation/destruction as well as vdev configuration changes. The bulk of this + * processing is handled by the pool_list_* routines in zpool_iter.c. We rely + * on pool_list_update() to detect the addition of new pools. Configuration + * changes are all handled within libzfs. + */ +int +zpool_do_iostat(int argc, char **argv) +{ + int c; + int ret; + int npools; + unsigned long interval = 0, count = 0; + zpool_list_t *list; + boolean_t verbose = B_FALSE; + iostat_cbdata_t cb; + + /* check options */ + while ((c = getopt(argc, argv, "T:v")) != -1) { + switch (c) { + case 'T': + get_timestamp_arg(*optarg); + break; + case 'v': + verbose = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + /* * Construct the list of all interesting pools. */ @@ -2464,12 +2494,13 @@ list_callback(zpool_handle_t *zhp, void *data) } /* - * zpool list [-H] [-o prop[,prop]*] [pool] ... + * zpool list [-H] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of properties to display. 
Defaults to * "name,size,allocated,free,capacity,health,altroot" + * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. @@ -2483,9 +2514,10 @@ zpool_do_list(int argc, char **argv) static char default_props[] = "name,size,allocated,free,capacity,dedupratio,health,altroot"; char *props = default_props; + unsigned long interval = 0, count = 0; /* check options */ - while ((c = getopt(argc, argv, ":Ho:")) != -1) { + while ((c = getopt(argc, argv, ":Ho:T:")) != -1) { switch (c) { case 'H': cb.cb_scripted = B_TRUE; @@ -2493,6 +2525,9 @@ zpool_do_list(int argc, char **argv) case 'o': props = optarg; break; + case 'T': + get_timestamp_arg(*optarg); + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -2508,21 +2543,37 @@ zpool_do_list(int argc, char **argv) argc -= optind; argv += optind; + get_interval_count(&argc, argv, &interval, &count); + if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); cb.cb_first = B_TRUE; - ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, - list_callback, &cb); + for (;;) { - zprop_free_list(cb.cb_proplist); + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); - if (argc == 0 && cb.cb_first && !cb.cb_scripted) { - (void) printf(gettext("no pools available\n")); - return (0); + ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, + list_callback, &cb); + + if (argc == 0 && cb.cb_first && !cb.cb_scripted) { + (void) printf(gettext("no pools available\n")); + zprop_free_list(cb.cb_proplist); + return (0); + } + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) sleep(interval); } + zprop_free_list(cb.cb_proplist); return (ret); } @@ -3106,7 +3157,7 @@ scrub_callback(zpool_handle_t *zhp, void *data) return (1); } - err = zpool_scrub(zhp, cb->cb_type); + err = 
zpool_scan(zhp, cb->cb_type); return (err != 0); } @@ -3122,13 +3173,13 @@ zpool_do_scrub(int argc, char **argv) int c; scrub_cbdata_t cb; - cb.cb_type = POOL_SCRUB_EVERYTHING; + cb.cb_type = POOL_SCAN_SCRUB; /* check options */ while ((c = getopt(argc, argv, "s")) != -1) { switch (c) { case 's': - cb.cb_type = POOL_SCRUB_NONE; + cb.cb_type = POOL_SCAN_NONE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -3163,62 +3214,103 @@ typedef struct status_cbdata { * Print out detailed scrub status. */ void -print_scrub_status(nvlist_t *nvroot) +print_scan_status(pool_scan_stat_t *ps) { - vdev_stat_t *vs; - uint_t vsc; - time_t start, end, now; + time_t start, end; + uint64_t elapsed, mins_left; + uint64_t pass_exam, examined, total; + uint_t rate; double fraction_done; - uint64_t examined, total, minutes_left, minutes_taken; - char *scrub_type; + char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &vsc) == 0); + (void) printf(gettext(" scan: ")); - /* - * If there's never been a scrub, there's not much to say. - */ - if (vs->vs_scrub_end == 0 && vs->vs_scrub_type == POOL_SCRUB_NONE) { + /* If there's never been a scan, there's not much to say. */ + if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || + ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } - scrub_type = (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ? - "resilver" : "scrub"; - - start = vs->vs_scrub_start; - end = vs->vs_scrub_end; - now = time(NULL); - examined = vs->vs_scrub_examined; - total = vs->vs_alloc; - - if (end != 0) { - minutes_taken = (uint64_t)((end - start) / 60); + start = ps->pss_start_time; + end = ps->pss_end_time; + zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); - (void) printf(gettext("%s %s after %lluh%um with %llu errors " - "on %s"), - scrub_type, vs->vs_scrub_complete ? 
"completed" : "stopped", + assert(ps->pss_func == POOL_SCAN_SCRUB || + ps->pss_func == POOL_SCAN_RESILVER); + /* + * Scan is finished or canceled. + */ + if (ps->pss_state == DSS_FINISHED) { + uint64_t minutes_taken = (end - start) / 60; + char *fmt; + + if (ps->pss_func == POOL_SCAN_SCRUB) { + fmt = gettext("scrub repaired %s in %lluh%um with " + "%llu errors on %s"); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + fmt = gettext("resilvered %s in %lluh%um with " + "%llu errors on %s"); + } + /* LINTED */ + (void) printf(fmt, processed_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), - (u_longlong_t)vs->vs_scrub_errors, ctime(&end)); + (u_longlong_t)ps->pss_errors, + ctime((time_t *)&end)); + return; + } else if (ps->pss_state == DSS_CANCELED) { + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub canceled on %s"), + ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver canceled on %s"), + ctime(&end)); + } return; } - if (examined == 0) - examined = 1; - if (examined > total) - total = examined; + assert(ps->pss_state == DSS_SCANNING); + /* + * Scan is in progress. + */ + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub in progress since %s"), + ctime(&start)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver in progress since %s"), + ctime(&start)); + } + + examined = ps->pss_examined ? 
ps->pss_examined : 1; + total = ps->pss_to_examine; fraction_done = (double)examined / total; - minutes_left = (uint64_t)((now - start) * - (1 - fraction_done) / fraction_done / 60); - minutes_taken = (uint64_t)((now - start) / 60); - - (void) printf(gettext("%s in progress for %lluh%um, %.2f%% done, " - "%lluh%um to go\n"), - scrub_type, (u_longlong_t)(minutes_taken / 60), - (uint_t)(minutes_taken % 60), 100 * fraction_done, - (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60)); + + /* elapsed time for this pass */ + elapsed = time(NULL) - ps->pss_pass_start; + elapsed = elapsed ? elapsed : 1; + pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; + rate = pass_exam / elapsed; + rate = rate ? rate : 1; + mins_left = ((total - examined) / rate) / 60; + + zfs_nicenum(examined, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + (void) printf(gettext(" %s scanned out of %s at " + "%s/s, %lluh%um to go\n"), examined_buf, total_buf, rate_buf, + (u_longlong_t)(mins_left / 60), + (uint_t)(mins_left % 60)); + + if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext(" %s resilvered, %.2f%% done\n"), + processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext(" %s repaired, %.2f%% done\n"), + processed_buf, 100 * fraction_done); + } } static void @@ -3378,7 +3470,7 @@ status_callback(zpool_handle_t *zhp, void *data) verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); @@ -3451,7 +3543,6 @@ status_callback(zpool_handle_t *zhp, void *data) "replace'.\n")); break; - case ZPOOL_STATUS_RESILVERING: (void) printf(gettext("status: One or more devices is " "currently being 
resilvered. The pool will\n\tcontinue " @@ -3549,10 +3640,11 @@ status_callback(zpool_handle_t *zhp, void *data) uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; + pool_scan_stat_t *ps = NULL; - - (void) printf(gettext(" scrub: ")); - print_scrub_status(nvroot); + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); + print_scan_status(ps); namewidth = max_width(zhp, nvroot, 0, 0); if (namewidth < 10) @@ -3620,11 +3712,12 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-vx] [pool] ... + * zpool status [-vx] [-T d|u] [pool] ... [interval [count]] * * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) + * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. */ @@ -3633,10 +3726,11 @@ zpool_do_status(int argc, char **argv) { int c; int ret; + unsigned long interval = 0, count = 0; status_cbdata_t cb = { 0 }; /* check options */ - while ((c = getopt(argc, argv, "vxD")) != -1) { + while ((c = getopt(argc, argv, "vxDT:")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; @@ -3647,6 +3741,9 @@ zpool_do_status(int argc, char **argv) case 'D': cb.cb_dedup_stats = B_TRUE; break; + case 'T': + get_timestamp_arg(*optarg); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3657,19 +3754,38 @@ zpool_do_status(int argc, char **argv) argc -= optind; argv += optind; - cb.cb_first = B_TRUE; + get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; - ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb); + cb.cb_first = B_TRUE; - if (argc == 0 && cb.cb_count == 0) - (void) printf(gettext("no pools available\n")); - else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) - (void) printf(gettext("all pools are healthy\n")); + for (;;) { + if (timestamp_fmt != NODATE) + 
print_timestamp(timestamp_fmt); - return (ret); + ret = for_each_pool(argc, argv, B_TRUE, NULL, + status_callback, &cb); + + if (argc == 0 && cb.cb_count == 0) + (void) printf(gettext("no pools available\n")); + else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) + (void) printf(gettext("all pools are healthy\n")); + + if (ret != 0) + return (ret); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) sleep(interval); + } + + return (0); } typedef struct upgrade_cbdata { @@ -3890,6 +4006,7 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); + (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" @@ -3993,7 +4110,7 @@ get_history_one(zpool_handle_t *zhp, void *data) (void) snprintf(internalstr, sizeof (internalstr), "[internal %s txg:%lld] %s", - hist_event_table[ievent], txg, + zfs_history_event_names[ievent], txg, pathstr); cmdstr = internalstr; } diff --git a/usr/src/cmd/zpool/zpool_util.h b/usr/src/cmd/zpool/zpool_util.h index a18b8b705f..134c730fcf 100644 --- a/usr/src/cmd/zpool/zpool_util.h +++ b/usr/src/cmd/zpool/zpool_util.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef ZPOOL_UTIL_H @@ -45,7 +44,7 @@ uint_t num_logs(nvlist_t *nv); */ nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, - boolean_t isreplace, boolean_t dryrun, int argc, char **argv); + boolean_t replacing, boolean_t dryrun, int argc, char **argv); nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, splitflags_t flags, int argc, char **argv); diff --git a/usr/src/cmd/zpool/zpool_vdev.c b/usr/src/cmd/zpool/zpool_vdev.c index 3c725d232c..53c2e60b7d 100644 --- a/usr/src/cmd/zpool/zpool_vdev.c +++ b/usr/src/cmd/zpool/zpool_vdev.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -1004,8 +1003,8 @@ is_spare(nvlist_t *config, const char *path) return (B_FALSE); } free(name); - (void) close(fd); + verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); nvlist_free(label); @@ -1029,8 +1028,8 @@ is_spare(nvlist_t *config, const char *path) * the majority of this task. */ static int -check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, - int isspare) +check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, + boolean_t replacing, boolean_t isspare) { nvlist_t **child; uint_t c, children; @@ -1051,13 +1050,14 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, * hot spare within the same pool. If so, we allow it * regardless of what libdiskmgt or zpool_in_use() says. 
*/ - if (isreplacing) { + if (replacing) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk) == 0 && wholedisk) (void) snprintf(buf, sizeof (buf), "%ss0", path); else (void) strlcpy(buf, path, sizeof (buf)); + if (is_spare(config, buf)) return (0); } @@ -1073,21 +1073,21 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, for (c = 0; c < children; c++) if ((ret = check_in_use(config, child[c], force, - isreplacing, B_FALSE)) != 0) + replacing, B_FALSE)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) if ((ret = check_in_use(config, child[c], force, - isreplacing, B_TRUE)) != 0) + replacing, B_TRUE)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) if ((ret = check_in_use(config, child[c], force, - isreplacing, B_FALSE)) != 0) + replacing, B_FALSE)) != 0) return (ret); return (0); @@ -1419,7 +1419,7 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, */ nvlist_t * make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, - boolean_t isreplacing, boolean_t dryrun, int argc, char **argv) + boolean_t replacing, boolean_t dryrun, int argc, char **argv) { nvlist_t *newroot; nvlist_t *poolconfig = NULL; @@ -1442,8 +1442,7 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, * uses (such as a dedicated dump device) that even '-f' cannot * override. 
*/ - if (check_in_use(poolconfig, newroot, force, isreplacing, - B_FALSE) != 0) { + if (check_in_use(poolconfig, newroot, force, replacing, B_FALSE) != 0) { nvlist_free(newroot); return (NULL); } diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index eea3aa0d39..e0fabd7234 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -93,6 +93,7 @@ #include <sys/metaslab_impl.h> #include <sys/dsl_prop.h> #include <sys/dsl_dataset.h> +#include <sys/dsl_scan.h> #include <sys/refcount.h> #include <stdio.h> #include <stdio_ext.h> @@ -284,9 +285,9 @@ ztest_info_t ztest_info[] = { { ztest_spa_rename, 1, &zopt_rarely }, { ztest_scrub, 1, &zopt_rarely }, { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, - { ztest_vdev_attach_detach, 1, &zopt_rarely }, + { ztest_vdev_attach_detach, 1, &zopt_rarely }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, - { ztest_vdev_add_remove, 1, &zopt_vdevtime }, + { ztest_vdev_add_remove, 1, &zopt_vdevtime }, { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, }; @@ -4662,9 +4663,9 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id) ztest_shared_t *zs = ztest_shared; spa_t *spa = zs->zs_spa; - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); + (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); + (void) spa_scan(spa, POOL_SCAN_SCRUB); } /* @@ -4817,7 +4818,7 @@ ztest_spa_import_export(char *oldname, char *newname) * Kick off a scrub to tickle scrub/export races. */ if (ztest_random(2) == 0) - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); + (void) spa_scan(spa, POOL_SCAN_SCRUB); pool_guid = spa_guid(spa); spa_close(spa, FTAG); diff --git a/usr/src/common/zfs/zfs_comutil.c b/usr/src/common/zfs/zfs_comutil.c index 8ab194e44c..ed9b67ea3b 100644 --- a/usr/src/common/zfs/zfs_comutil.c +++ b/usr/src/common/zfs/zfs_comutil.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -157,3 +156,47 @@ zfs_spa_version_map(int zpl_version) return (version); } + +const char *zfs_history_event_names[LOG_END] = { + "invalid event", + "pool create", + "vdev add", + "pool remove", + "pool destroy", + "pool export", + "pool import", + "vdev attach", + "vdev replace", + "vdev detach", + "vdev online", + "vdev offline", + "vdev upgrade", + "pool clear", + "pool scrub", + "pool property set", + "create", + "clone", + "destroy", + "destroy_begin_sync", + "inherit", + "property set", + "quota set", + "permission update", + "permission remove", + "permission who remove", + "promote", + "receive", + "rename", + "reservation set", + "replay_inc_sync", + "replay_full_sync", + "rollback", + "snapshot", + "filesystem version upgrade", + "refquota set", + "refreservation set", + "pool scrub done", + "user hold", + "user release", + "pool split", +}; diff --git a/usr/src/common/zfs/zfs_comutil.h b/usr/src/common/zfs/zfs_comutil.h index f6949387f1..61327f9aa9 100644 --- a/usr/src/common/zfs/zfs_comutil.h +++ b/usr/src/common/zfs/zfs_comutil.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _ZFS_COMUTIL_H @@ -38,6 +37,7 @@ extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *); extern int zfs_zpl_version_map(int spa_version); extern int zfs_spa_version_map(int zpl_version); +extern const char *zfs_history_event_names[LOG_END]; #ifdef __cplusplus } diff --git a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h index c0887d5b1d..de2632a71a 100644 --- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h +++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h @@ -17,8 +17,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_H @@ -27,7 +26,7 @@ /* * On-disk version number. */ -#define SPA_VERSION 24ULL +#define SPA_VERSION 25ULL /* * The following are configuration names used in the nvlist describing a pool's diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h index 7a8d3d769a..6f7fed62c4 100644 --- a/usr/src/lib/libzfs/common/libzfs.h +++ b/usr/src/lib/libzfs/common/libzfs.h @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _LIBZFS_H @@ -119,6 +118,8 @@ enum { EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ + EZFS_SCRUBBING, /* currently scrubbing */ + EZFS_NO_SCRUB, /* no active scrub */ EZFS_UNKNOWN }; @@ -224,7 +225,7 @@ typedef struct splitflags { /* * Functions to manipulate pool and vdev state */ -extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t); +extern int zpool_scan(zpool_handle_t *, pool_scan_func_t); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_vdev_online(zpool_handle_t *, const char *, int, @@ -354,7 +355,7 @@ extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, */ struct zfs_cmd; -extern const char *hist_event_table[LOG_END]; +extern const char *zfs_history_event_names[LOG_END]; extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, boolean_t verbose); @@ -526,8 +527,9 @@ typedef struct sendflags { typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); -extern int zfs_send(zfs_handle_t *, const char *, const char *, - sendflags_t, int, snapfilter_cb_t, void *); +extern int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, + sendflags_t flags, int outfd, snapfilter_cb_t filter_func, + void *cb_arg, nvlist_t **debugnvp); extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c index b212cdeddf..c35d6ab451 100644 --- a/usr/src/lib/libzfs/common/libzfs_pool.c +++ b/usr/src/lib/libzfs/common/libzfs_pool.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <ctype.h> @@ -43,50 +42,6 @@ #include "libzfs_impl.h" #include "zfs_comutil.h" -const char *hist_event_table[LOG_END] = { - "invalid event", - "pool create", - "vdev add", - "pool remove", - "pool destroy", - "pool export", - "pool import", - "vdev attach", - "vdev replace", - "vdev detach", - "vdev online", - "vdev offline", - "vdev upgrade", - "pool clear", - "pool scrub", - "pool property set", - "create", - "clone", - "destroy", - "destroy_begin_sync", - "inherit", - "property set", - "quota set", - "permission update", - "permission remove", - "permission who remove", - "promote", - "receive", - "rename", - "reservation set", - "replay_inc_sync", - "replay_full_sync", - "rollback", - "snapshot", - "filesystem version upgrade", - "refquota set", - "refreservation set", - "pool scrub done", - "user hold", - "user release", - "pool split", -}; - static int read_efi_label(nvlist_t *config, diskaddr_t *sb); #if defined(__i386) || defined(__amd64) @@ -334,7 +289,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); (void) strlcpy(buf, zpool_state_to_name(intval, vs->vs_aux), len); @@ -1558,28 +1514,51 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, } /* - * Scrub the pool. + * Scan the pool. 
*/ int -zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type) +zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_cookie = type; + zc.zc_cookie = func; - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCRUB, &zc) == 0) + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 || + (errno == ENOENT && func != POOL_SCAN_NONE)) return (0); - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name); + if (func == POOL_SCAN_SCRUB) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name); + } else if (func == POOL_SCAN_NONE) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), + zc.zc_name); + } else { + assert(!"unexpected result"); + } - if (errno == EBUSY) - return (zfs_error(hdl, EZFS_RESILVERING, msg)); - else + if (errno == EBUSY) { + nvlist_t *nvroot; + pool_scan_stat_t *ps = NULL; + uint_t psc; + + verify(nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); + if (ps && ps->pss_func == POOL_SCAN_SCRUB) + return (zfs_error(hdl, EZFS_SCRUBBING, msg)); + else + return (zfs_error(hdl, EZFS_RESILVERING, msg)); + } else if (errno == ENOENT) { + return (zfs_error(hdl, EZFS_NO_SCRUB, msg)); + } else { return (zpool_standard_error(hdl, errno, msg)); + } } /* @@ -2987,7 +2966,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, * open a misbehaving device, which can have undesirable * effects. 
*/ - if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) != 0 || vs->vs_state >= VDEV_STATE_DEGRADED) && zhp != NULL && diff --git a/usr/src/lib/libzfs/common/libzfs_sendrecv.c b/usr/src/lib/libzfs/common/libzfs_sendrecv.c index 95031653eb..672e004ef5 100644 --- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c +++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c @@ -852,6 +852,7 @@ typedef struct send_dump_data { avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; + nvlist_t *debugnv; } send_dump_data_t; /* @@ -860,10 +861,11 @@ typedef struct send_dump_data { */ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, - int outfd, boolean_t enoent_ok, boolean_t *got_enoent) + int outfd, boolean_t enoent_ok, boolean_t *got_enoent, nvlist_t *debugnv) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap == NULL || fromsnap[0] == '\0' || !fromorigin); @@ -876,11 +878,24 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, *got_enoent = B_FALSE; + VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); + if (fromsnap && fromsnap[0] != '\0') { + VERIFY(0 == nvlist_add_string(thisdbg, + "fromsnap", fromsnap)); + } + if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); + VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); + if (debugnv) { + VERIFY(0 == nvlist_add_nvlist(debugnv, + zhp->zfs_name, thisdbg)); + } + nvlist_free(thisdbg); + switch (errno) { case EXDEV: @@ -920,6 +935,10 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, } } + if (debugnv) + VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); + nvlist_free(thisdbg); + return (0); } @@ -1000,7 
+1019,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate), - sdd->outfd, B_TRUE, &got_enoent); + sdd->outfd, B_TRUE, &got_enoent, sdd->debugnv); if (got_enoent) err = 0; @@ -1176,7 +1195,7 @@ again: int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t flags, int outfd, snapfilter_cb_t filter_func, - void *cb_arg) + void *cb_arg, nvlist_t **debugnvp) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; @@ -1276,7 +1295,10 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); err = nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0); - nvlist_free(hdrnv); + if (debugnvp) + *debugnvp = hdrnv; + else + nvlist_free(hdrnv); if (err) { fsavl_destroy(fsavl); nvlist_free(fss); @@ -1351,6 +1373,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.verbose = flags.verbose; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; + if (debugnvp) + sdd.debugnv = *debugnvp; err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); diff --git a/usr/src/lib/libzfs/common/libzfs_status.c b/usr/src/lib/libzfs/common/libzfs_status.c index c4f907733f..24725ec044 100644 --- a/usr/src/lib/libzfs/common/libzfs_status.c +++ b/usr/src/lib/libzfs/common/libzfs_status.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -138,7 +137,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) if (find_vdev_problem(child[c], func)) return (B_TRUE); } else { - verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (func(vs->vs_state, vs->vs_aux, @@ -173,7 +172,8 @@ check_status(nvlist_t *config, boolean_t isimport) { nvlist_t *nvroot; vdev_stat_t *vs; - uint_t vsc; + pool_scan_stat_t *ps = NULL; + uint_t vsc, psc; uint64_t nerr; uint64_t version; uint64_t stateval; @@ -184,15 +184,24 @@ check_status(nvlist_t *config, boolean_t isimport) &version) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &stateval) == 0); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + + /* + * Currently resilvering a vdev + */ + (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &psc); + if (ps && ps->pss_func == POOL_SCAN_RESILVER && + ps->pss_state == DSS_SCANNING) + return (ZPOOL_STATUS_RESILVERING); /* * Pool last accessed by another system. 
*/ + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (hostid != 0 && (unsigned long)hostid != gethostid() && stateval == POOL_STATE_ACTIVE) return (ZPOOL_STATUS_HOSTID_MISMATCH); @@ -289,12 +298,6 @@ check_status(nvlist_t *config, boolean_t isimport) return (ZPOOL_STATUS_REMOVED_DEV); /* - * Currently resilvering - */ - if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER) - return (ZPOOL_STATUS_RESILVERING); - - /* * Outdated, but usable, version */ if (version < SPA_VERSION) diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c index 98b56ff79a..2e73f76ea5 100644 --- a/usr/src/lib/libzfs/common/libzfs_util.c +++ b/usr/src/lib/libzfs/common/libzfs_util.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -215,6 +214,11 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_POSTSPLIT_ONLINE: return (dgettext(TEXT_DOMAIN, "disk was split from this pool " "into a new one")); + case EZFS_SCRUBBING: + return (dgettext(TEXT_DOMAIN, "currently scrubbing; " + "use 'zpool scrub -s' to cancel current scrub")); + case EZFS_NO_SCRUB: + return (dgettext(TEXT_DOMAIN, "there is no active scrub")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: diff --git a/usr/src/lib/libzfs/common/mapfile-vers b/usr/src/lib/libzfs/common/mapfile-vers index 376f3ed985..dc68ed9bc2 100644 --- a/usr/src/lib/libzfs/common/mapfile-vers +++ b/usr/src/lib/libzfs/common/mapfile-vers @@ -19,8 +19,7 @@ # CDDL HEADER END # # -# Copyright 2010 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
# # @@ -45,7 +44,6 @@ SUNWprivate_1.1 { fletcher_4_byteswap; fletcher_4_incremental_native; fletcher_4_incremental_byteswap; - hist_event_table; libzfs_errno; libzfs_error_action; libzfs_error_description; @@ -73,6 +71,7 @@ SUNWprivate_1.1 { zfs_get_pool_handle; zfs_get_user_props; zfs_get_type; + zfs_history_event_names; zfs_hold; zfs_hold_range; zfs_is_mounted; @@ -195,7 +194,7 @@ SUNWprivate_1.1 { zpool_prop_values; zpool_read_label; zpool_refresh_stats; - zpool_scrub; + zpool_scan; zpool_search_import; zpool_set_history_str; zpool_set_prop; diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c index fce227ffd3..65739f294c 100644 --- a/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c +++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "libzfs_jni_util.h" @@ -1055,7 +1054,7 @@ populate_DeviceStatsBean(JNIEnv *env, nvlist_t *vdev, vdev_stat_t *vs; int result = nvlist_lookup_uint64_array( - vdev, ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &c); + vdev, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c); if (result != 0) { zjni_throw_exception(env, "could not retrieve virtual device statistics"); diff --git a/usr/src/lib/libzpool/common/util.c b/usr/src/lib/libzpool/common/util.c index 781edb6e8a..9b99531fd1 100644 --- a/usr/src/lib/libzpool/common/util.c +++ b/usr/src/lib/libzpool/common/util.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <assert.h> @@ -90,7 +89,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) if (is_log) prefix = "log "; - if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0) vs = &v0; diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 301a3ab217..abbecd9a88 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1343,7 +1343,7 @@ ZFS_COMMON_OBJS += \ dmu_zfetch.o \ dsl_deleg.o \ dsl_prop.o \ - dsl_scrub.o \ + dsl_scan.o \ gzip.o \ lzjb.o \ metaslab.o \ @@ -1372,6 +1372,7 @@ ZFS_COMMON_OBJS += \ zap_leaf.o \ zap_micro.o \ zfs_byteswap.o \ + zfs_debug.o \ zfs_fm.o \ zfs_fuid.o \ zfs_sa.o \ diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index f485fe9f7c..057f207bc6 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -437,6 +436,7 @@ struct arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; + void *b_thawed; arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; @@ -545,8 +545,8 @@ static buf_hash_table_t buf_hash_table; (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(buf) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; @@ -664,6 +664,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; + hdr->b_cksum0 = 0; +} + static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { @@ -797,7 +806,8 @@ buf_cons(void *vbuf, void *unused, int kmflag) arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); - rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -826,7 +836,8 @@ buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; - rw_destroy(&buf->b_lock); + mutex_destroy(&buf->b_evict_lock); + rw_destroy(&buf->b_data_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } @@ -941,6 +952,11 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) void arc_buf_thaw(arc_buf_t *buf) { + kmutex_t *hash_lock; + + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); @@ 
-954,18 +970,32 @@ arc_buf_thaw(arc_buf_t *buf) kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } + + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_thawed) + kmem_free(buf->b_hdr->b_thawed, 1); + buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(hash_lock); } void arc_buf_freeze(arc_buf_t *buf) { + kmutex_t *hash_lock; + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); + mutex_exit(hash_lock); } static void @@ -1037,7 +1067,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(new_state != old_state); ASSERT(refcnt == 0 || ab->b_datacnt > 0); ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon); ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); from_delta = to_delta = ab->b_datacnt * ab->b_size; @@ -1059,7 +1088,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) /* * If prefetching out of the ghost cache, - * we will have a non-null datacnt. + * we will have a non-zero datacnt. 
*/ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { /* ghost elements have a ghost size */ @@ -1095,9 +1124,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon) { + if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) buf_hash_remove(ab); - } /* adjust state sizes */ if (to_delta) @@ -1254,7 +1282,6 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; - rw_enter(&buf->b_lock, RW_WRITER); ASSERT(buf->b_data != NULL); hdr = buf->b_hdr; (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); @@ -1263,7 +1290,6 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) buf->b_private = NULL; atomic_add_64(&arc_loaned_bytes, hdr->b_size); - rw_exit(&buf->b_lock); } static arc_buf_t * @@ -1299,16 +1325,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) * must verify b_data != NULL to know if the add_ref * was successful. */ - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); if (buf->b_data == NULL) { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return; } - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); + hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); - rw_exit(&buf->b_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); @@ -1394,6 +1420,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; + buf->b_next = NULL; ASSERT(buf->b_efunc == NULL); @@ -1442,23 +1469,21 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); } while (hdr->b_buf) { arc_buf_t *buf = hdr->b_buf; if (buf->b_efunc) { 
mutex_enter(&arc_eviction_mtx); - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); @@ -1468,6 +1493,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } + if (hdr->b_thawed) { + kmem_free(hdr->b_thawed, 1); + hdr->b_thawed = NULL; + } ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1488,6 +1517,9 @@ arc_buf_free(arc_buf_t *buf, void *tag) kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); @@ -1512,12 +1544,10 @@ arc_buf_free(arc_buf_t *buf, void *tag) if (destroy_hdr) arc_hdr_destroy(hdr); } else { - if (remove_reference(hdr, NULL, tag) > 0) { - ASSERT(HDR_IO_ERROR(hdr)); + if (remove_reference(hdr, NULL, tag) > 0) arc_buf_destroy(buf, FALSE, TRUE); - } else { + else arc_hdr_destroy(hdr); - } } } @@ -1535,6 +1565,8 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) } mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT(hdr->b_state != arc_anon); ASSERT(buf->b_data != NULL); @@ -1613,7 +1645,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; - if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } @@ -1635,9 +1667,9 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, buf->b_next = arc_eviction_list; 
arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); } else { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } @@ -1854,9 +1886,9 @@ arc_do_user_evicts(void) while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) @@ -2438,7 +2470,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { - bcopy(buf->b_data, arg, buf->b_hdr->b_size); + if (zio == NULL || zio->io_error == 0) + bcopy(buf->b_data, arg, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg) == 1); } @@ -2452,6 +2485,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) *bufp = NULL; } else { *bufp = buf; + ASSERT(buf->b_data); } } @@ -2606,13 +2640,22 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, { int err; + if (pbuf == NULL) { + /* + * XXX This happens from traverse callback funcs, for + * the objset_phys_t block. 
+ */ + return (arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb)); + } + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); - rw_enter(&pbuf->b_lock, RW_READER); + rw_enter(&pbuf->b_data_lock, RW_READER); err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); - rw_exit(&pbuf->b_lock); + rw_exit(&pbuf->b_data_lock); return (err); } @@ -2721,9 +2764,7 @@ top: if (exists) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } @@ -2901,14 +2942,14 @@ arc_buf_evict(arc_buf_t *buf) kmutex_t *hash_lock; arc_buf_t **bufp; - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (0); } else if (buf->b_data == NULL) { arc_buf_t copy = *buf; /* structure assignment */ @@ -2917,14 +2958,15 @@ arc_buf_evict(arc_buf_t *buf) * but let arc_do_user_evicts() do the reaping. 
*/ buf->b_efunc = NULL; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); VERIFY(copy.b_efunc(©) == 0); return (1); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(buf->b_hdr == hdr); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); @@ -2943,6 +2985,7 @@ arc_buf_evict(arc_buf_t *buf) arc_state_t *old_state = hdr->b_state; arc_state_t *evicted_state; + ASSERT(hdr->b_buf == NULL); ASSERT(refcount_is_zero(&hdr->b_refcnt)); evicted_state = @@ -2960,12 +3003,13 @@ arc_buf_evict(arc_buf_t *buf) mutex_exit(&old_state->arcs_mtx); } mutex_exit(hash_lock); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; buf->b_private = NULL; buf->b_hdr = NULL; + buf->b_next = NULL; kmem_cache_free(buf_cache, buf); return (1); } @@ -2980,12 +3024,17 @@ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; + kmutex_t *hash_lock = NULL; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; - boolean_t released = B_FALSE; - rw_enter(&buf->b_lock, RW_WRITER); + /* + * It would be nice to assert that if it's DMU metadata (level > + * 0 || it's the dnode file), then it must be syncing context. + * But we don't know that information at this level. 
+ */ + + mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; /* this buffer is not on any list */ @@ -2993,15 +3042,12 @@ arc_release(arc_buf_t *buf, void *tag) if (hdr->b_state == arc_anon) { /* this buffer is already released */ - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); - ASSERT(BUF_EMPTY(hdr)); ASSERT(buf->b_efunc == NULL); - arc_buf_thaw(buf); - rw_exit(&buf->b_lock); - released = B_TRUE; } else { hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); } l2hdr = hdr->b_l2hdr; @@ -3011,9 +3057,6 @@ arc_release(arc_buf_t *buf, void *tag) buf_size = hdr->b_size; } - if (released) - goto out; - /* * Do we have more than one buf? */ @@ -3027,14 +3070,14 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* - * Pull the data off of this buf and attach it to - * a new anonymous buf. + * Pull the data off of this hdr and attach it to + * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; - *bufp = (*bufp)->b_next; + *bufp = buf->b_next; buf->b_next = NULL; ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); @@ -3062,26 +3105,25 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - arc_change_state(arc_anon, hdr, hash_lock); + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; - mutex_exit(hash_lock); + if (hash_lock) + mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); arc_buf_thaw(buf); 
} buf->b_efunc = NULL; buf->b_private = NULL; -out: if (l2hdr) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); @@ -3090,14 +3132,27 @@ out: } } +/* + * Release this buffer. If it does not match the provided BP, fill it + * with that block's contents. + */ +/* ARGSUSED */ +int +arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb) +{ + arc_release(buf, tag); + return (0); +} + int arc_released(arc_buf_t *buf) { int released; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (released); } @@ -3106,9 +3161,9 @@ arc_has_callback(arc_buf_t *buf) { int callback; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); callback = (buf->b_efunc != NULL); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (callback); } @@ -3118,9 +3173,9 @@ arc_referenced(arc_buf_t *buf) { int referenced; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_refcnt)); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (referenced); } #endif @@ -3173,8 +3228,8 @@ arc_write_done(zio_t *zio) /* * If the block to be written was all-zero, we may have * compressed it away. In this case no write was performed - * so there will be no dva/birth-date/checksum. The buffer - * must therefor remain anonymous (and uncached). + * so there will be no dva/birth/checksum. The buffer must + * therefore remain anonymous (and uncached). 
*/ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; @@ -3278,9 +3333,7 @@ arc_free(spa_t *spa, const blkptr_t *bp) if (HDR_IN_HASH_TABLE(ab)) buf_hash_remove(ab); ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; + buf_discard_identity(ab); ab->b_buf->b_efunc = NULL; ab->b_buf->b_private = NULL; mutex_exit(hash_lock); @@ -3974,11 +4027,11 @@ l2arc_read_done(zio_t *zio) ASSERT(cb != NULL); buf = cb->l2rcb_buf; ASSERT(buf != NULL); - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); + hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * Check this survived the L2ARC journey. diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c index 830b6c1a42..8f8c5e1fcf 100644 --- a/usr/src/uts/common/fs/zfs/bplist.c +++ b/usr/src/uts/common/fs/zfs/bplist.c @@ -175,23 +175,26 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) return (err); } - if (*itorp >= bpl->bpl_phys->bpl_entries) { - mutex_exit(&bpl->bpl_lock); - return (ENOENT); - } + do { + if (*itorp >= bpl->bpl_phys->bpl_entries) { + mutex_exit(&bpl->bpl_lock); + return (ENOENT); + } - blk = *itorp >> bpl->bpl_bpshift; - off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); + blk = *itorp >> bpl->bpl_bpshift; + off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } + err = bplist_cache(bpl, blk); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + bparray = bpl->bpl_cached_dbuf->db_data; + *bp = bparray[off]; + (*itorp)++; + } while (bp->blk_birth == 0); - bparray = bpl->bpl_cached_dbuf->db_data; - *bp = bparray[off]; - (*itorp)++; mutex_exit(&bpl->bpl_lock); return (0); } @@ -206,8 +209,10 @@ bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); mutex_enter(&bpl->bpl_lock); err = bplist_hold(bpl); - if (err) + 
if (err) { + mutex_exit(&bpl->bpl_lock); return (err); + } blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index c211ff79dd..e1cd431acb 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -363,6 +363,7 @@ dbuf_verify(dmu_buf_impl_t *db) } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* @@ -477,8 +478,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - dbuf_rele(db, NULL); + dbuf_rele_and_unlock(db, NULL); } static void @@ -549,7 +549,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) else pbuf = db->db_objset->os_phys_buf; - (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, + (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -727,7 +727,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* free this block */ if (!BP_IS_HOLE(bp)) - dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp); + zio_free(db->db_dnode->dn_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; /* @@ -921,6 +921,26 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dnode_willuse_space(db->db_dnode, size-osize, tx); } +void +dbuf_release_bp(dmu_buf_impl_t *db) +{ + objset_t *os = db->db_dnode->dn_objset; + zbookmark_t zb; + + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(arc_released(os->os_phys_buf) || + list_link_active(&os->os_dsl_dataset->ds_synced_link)); + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + + zb.zb_objset = os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + (void) arc_release_bp(db->db_buf, db, + db->db_blkptr, os->os_spa, &zb); +} + dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { @@ -1717,7 +1737,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) else pbuf = dn->dn_objset->os_phys_buf; - (void) arc_read(NULL, dn->dn_objset->os_spa, + (void) dsl_read(NULL, dn->dn_objset->os_spa, bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); @@ -2463,7 +2483,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { - arc_release(data, db); + dbuf_release_bp(db); } } } diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c index 852fd1cdc4..64cbcc1f92 100644 --- a/usr/src/uts/common/fs/zfs/ddt.c +++ b/usr/src/uts/common/fs/zfs/ddt.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -35,6 +34,7 @@ #include <sys/dsl_pool.h> #include <sys/zio_checksum.h> #include <sys/zio_compress.h> +#include <sys/dsl_scan.h> static const ddt_ops_t *ddt_ops[DDT_TYPES] = { &ddt_zap_ops, @@ -160,7 +160,7 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt->ddt_object[type][class], dde)); } -static int +int ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx) { @@ -245,12 +245,13 @@ ddt_bp_create(enum zio_checksum checksum, ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); bp->blk_cksum = ddk->ddk_cksum; + bp->blk_fill = 1; BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); BP_SET_CHECKSUM(bp, checksum); - BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_TYPE(bp, DMU_OT_DEDUP); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); @@ -996,10 +997,17 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_object_create(ddt, ntype, nclass, tx); VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); - if (dp->dp_scrub_func != SCRUB_FUNC_NONE && - oclass > nclass && - nclass <= dp->dp_scrub_ddt_class_max) - dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde); + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. (This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) 
+ */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } } } @@ -1013,7 +1021,6 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) if (avl_numnodes(&ddt->ddt_tree) == 0) return; - ASSERT(spa_sync_pass(spa) == 1); ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { @@ -1081,6 +1088,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) ddb->ddb_type, ddb->ddb_class, &ddb->ddb_cursor, dde); } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; if (error == 0) return (0); if (error != ENOENT) diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 0be72aa4f2..582089b8e8 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -84,7 +84,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "FUID table" }, { byteswap_uint64_array, TRUE, "FUID table size" }, { zap_byteswap, TRUE, "DSL dataset next clones"}, - { zap_byteswap, TRUE, "scrub work queue" }, + { zap_byteswap, TRUE, "scan work queue" }, { zap_byteswap, TRUE, "ZFS user/group used" }, { zap_byteswap, TRUE, "ZFS user/group quota" }, { zap_byteswap, TRUE, "snapshot refcount tags"}, @@ -93,7 +93,10 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "System attributes" }, { zap_byteswap, TRUE, "SA master node" }, { zap_byteswap, TRUE, "SA attr registration" }, - { zap_byteswap, TRUE, "SA attr layouts" }, }; + { zap_byteswap, TRUE, "SA attr layouts" }, + { zap_byteswap, TRUE, "scan translations" }, + { byteswap_uint8_array, FALSE, "deduplicated block" }, +}; int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, @@ -1630,6 +1633,7 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { + zfs_dbgmsg_init(); dbuf_init(); dnode_init(); zfetch_init(); @@ -1649,4 +1653,5 @@ dmu_fini(void) l2arc_fini(); xuio_stat_fini(); sa_cache_fini(); + 
zfs_dbgmsg_fini(); } diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 210d693051..546cd98b84 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -259,11 +259,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dprintf_bp(os->os_rootbp, "reading %s", ""); /* - * NB: when bprewrite scrub can change the bp, + * XXX when bprewrite scrub can change the bp, * and this is called from dmu_objset_open_ds_os, the bp * could change, and we'll need a lock. */ - err = arc_read_nolock(NULL, spa, os->os_rootbp, + err = dsl_read_nolock(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { @@ -628,6 +628,7 @@ struct oscarg { const char *lastname; dmu_objset_type_t type; uint64_t flags; + cred_t *cr; }; /*ARGSUSED*/ @@ -659,7 +660,7 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; @@ -668,7 +669,7 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_origin, oa->flags, cr, tx); + oa->clone_origin, oa->flags, oa->cr, tx); if (oa->clone_origin == NULL) { dsl_dataset_t *ds; @@ -684,12 +685,12 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(os, oa->userarg, cr, tx); + oa->userfunc(os, oa->userarg, oa->cr, tx); dsl_dataset_rele(ds, FTAG); } - spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, - tx, cr, "dataset = %llu", dsobj); + spa_history_log_internal(LOG_DS_CREATE, dd->dd_pool->dp_spa, + tx, "dataset = %llu", dsobj); } int @@ -715,6 +716,7 @@ dmu_objset_create(const char *name, 
dmu_objset_type_t type, uint64_t flags, oa.lastname = tail; oa.type = type; oa.flags = flags; + oa.cr = CRED(); err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); @@ -742,6 +744,7 @@ dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) oa.lastname = tail; oa.clone_origin = clone_origin; oa.flags = flags; + oa.cr = CRED(); err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); @@ -795,19 +798,19 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) { objset_t *os = arg1; dsl_dataset_t *ds = os->os_dsl_dataset; struct snaparg *sn = arg2; - dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx); + dsl_dataset_snapshot_sync(ds, sn->snapname, tx); if (sn->props) { dsl_props_arg_t pa; pa.pa_props = sn->props; pa.pa_source = ZPROP_SRC_LOCAL; - dsl_props_set_sync(ds->ds_prev, &pa, cr, tx); + dsl_props_set_sync(ds->ds_prev, &pa, tx); } } @@ -1016,11 +1019,11 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) /* * Create the root block IO */ - arc_release(os->os_phys_buf, &os->os_phys_buf); - SET_BOOKMARK(&zb, os->os_dsl_dataset ? 
os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf, + os->os_rootbp, os->os_spa, &zb)); dmu_write_policy(os, NULL, 0, 0, &zp); @@ -1082,7 +1085,7 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg) !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); } -static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; +objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) @@ -1649,7 +1652,7 @@ dmu_objset_prefetch(const char *name, void *arg) SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), + (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds), &ds->ds_phys->ds_bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 86c428b5f2..a675e28b15 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -301,7 +301,7 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) /* ARGSUSED */ static int -backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, +backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct backuparg *ba = arg; @@ -330,7 +330,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; - if (arc_read_nolock(NULL, spa, bp, + if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); @@ -361,7 +361,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); - if (arc_read_nolock(NULL, spa, bp, + if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 
ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); @@ -504,6 +504,7 @@ struct recvbeginsyncarg { uint64_t dsflags; char clonelastname[MAXNAMELEN]; dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ + cred_t *cr; }; /* ARGSUSED */ @@ -536,7 +537,7 @@ recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -545,7 +546,7 @@ recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* Create and open new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, - rbsa->origin, flags, cr, tx); + rbsa->origin, flags, rbsa->cr, tx); VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, B_TRUE, dmu_recv_tag, &rbsa->ds)); @@ -554,8 +555,8 @@ recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); } - spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, - dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj); + spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, + dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); } /* ARGSUSED */ @@ -630,7 +631,7 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ohds = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -641,7 +642,7 @@ recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* create and open the temporary clone */ dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, - ohds->ds_prev, flags, cr, tx); + ohds->ds_prev, flags, rbsa->cr, tx); VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); /* @@ -655,8 +656,8 @@ recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) rbsa->ds = cds; - 
spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - dp->dp_spa, tx, cr, "dataset = %lld", dsobj); + spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, "dataset = %lld", dsobj); } @@ -701,6 +702,7 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, rbsa.type = drrb->drr_type; rbsa.tag = FTAG; rbsa.dsflags = 0; + rbsa.cr = CRED(); versioninfo = drrb->drr_versioninfo; flags = drrb->drr_flags; @@ -1466,12 +1468,12 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvendsyncarg *resa = arg2; - dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index 653c3a2d41..429c76ae11 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -77,7 +76,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); + (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); return (0); } @@ -102,7 +101,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, + (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); } return (0); @@ -140,7 +139,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, boolean_t hard = td->td_flags & TRAVERSE_HARD; if (bp->blk_birth == 0) { - err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, + td->td_arg); return (err); } @@ -160,7 +160,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, } if (td->td_flags & TRAVERSE_PRE) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + td->td_arg); if (err) return (err); } @@ -171,7 +172,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - err = arc_read(NULL, td->td_spa, bp, pbuf, + err = dsl_read(NULL, td->td_spa, bp, pbuf, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) @@ -195,7 +196,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - err = arc_read(NULL, td->td_spa, bp, pbuf, + err = dsl_read(NULL, td->td_spa, bp, pbuf, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) @@ -217,7 +218,7 @@ 
traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, objset_phys_t *osp; dnode_phys_t *dnp; - err = arc_read_nolock(NULL, td->td_spa, bp, + err = dsl_read_nolock(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) @@ -252,8 +253,10 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, if (buf) (void) arc_buf_remove_ref(buf, &buf); - if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { + err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + td->td_arg); + } return (err != 0 ? err : lasterr); } @@ -275,16 +278,17 @@ traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, break; lasterr = err; } - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - SET_BOOKMARK(&czb, objset, - object, 0, DMU_SPILL_BLKID); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_spill, &czb); - if (err) { - if (!hard) - break; - lasterr = err; - } + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, + object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_spill, &czb); + if (err) { + if (!hard) + return (err); + lasterr = err; } } return (err != 0 ? 
err : lasterr); @@ -293,7 +297,8 @@ traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, + void *arg) { struct prefetch_data *pfd = arg; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; @@ -314,7 +319,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); - (void) arc_read_nolock(NULL, spa, bp, NULL, NULL, + (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 523aad70da..fa5747c7aa 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -227,7 +227,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - arc_release(db->db_buf, db); + dbuf_release_bp(db); bp = (blkptr_t *)db->db.db_data; epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index ee0ccd7d01..c645d4d785 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -38,6 +38,7 @@ #include <sys/spa.h> #include <sys/zfs_znode.h> #include <sys/zvol.h> +#include <sys/dsl_scan.h> static char *dsl_reaper = "the grim reaper"; @@ -80,7 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; - dprintf_bp(bp, "born, ds=%p\n", ds); + dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ @@ -100,6 +101,7 @@ 
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); @@ -150,7 +152,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int64_t delta; - dprintf_bp(bp, "freeing: %s", ""); + dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); @@ -191,7 +193,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ds->ds_prev->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (bp->blk_birth > ds->ds_origin_txg) { + if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -397,19 +399,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev); } - - if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { - dsl_dataset_t *origin; - - err = dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, - FTAG, &origin); - if (err == 0) { - ds->ds_origin_txg = - origin->ds_phys->ds_creation_txg; - dsl_dataset_rele(origin, FTAG); - } - } } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); @@ -876,10 +865,6 @@ dsl_snapshot_destroy_one(const char *name, void *arg) struct dsl_ds_destroyarg *dsda; dsl_dataset_make_exclusive(ds, da->dstg); - if (ds->ds_objset != NULL) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); dsda->ds = ds; dsda->defer = da->defer; @@ -989,11 +974,6 @@ dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) return (error); dsda->rm_origin = origin; dsl_dataset_make_exclusive(origin, tag); - - if (origin->ds_objset != NULL) { - dmu_objset_evict(origin->ds_objset); - 
origin->ds_objset = NULL; - } } return (0); @@ -1020,10 +1000,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) /* Destroying a snapshot is simpler */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_objset != NULL) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, @@ -1096,24 +1072,10 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) if (err) goto out; - if (ds->ds_objset) { - /* - * We need to sync out all in-flight IO before we try - * to evict (the dataset evict func is trying to clear - * the cached entries for this dataset in the ARC). - */ - txg_wait_synced(dd->dd_pool, 0); - } - /* * Blow away the dsl_dir + head dataset. */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - /* * If we're removing a clone, we might also need to remove its * origin. 
@@ -1220,7 +1182,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; - ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); + ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; @@ -1234,21 +1196,11 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ds->ds_phys->ds_unique_bytes = ds->ds_phys->ds_used_bytes - (mrs_used - dlused); - if (!DS_UNIQUE_IS_ACCURATE(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) >= + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } -static uint64_t -dsl_dataset_unique(dsl_dataset_t *ds) -{ - if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) - dsl_dataset_recalc_head_uniq(ds); - - return (ds->ds_phys->ds_unique_bytes); -} - struct killarg { dsl_dataset_t *ds; dmu_tx_t *tx; @@ -1256,7 +1208,7 @@ struct killarg { /* ARGSUSED */ static int -kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; @@ -1315,7 +1267,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_pool_t *dp = ds->ds_dir->dd_pool; @@ -1324,8 +1276,8 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, + "dataset = %llu", ds->ds_object); } static int @@ -1499,7 +1451,7 
@@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) } void -dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_destroyarg *dsda = arg1; dsl_dataset_t *ds = dsda->ds; @@ -1531,6 +1483,11 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) cv_broadcast(&ds->ds_exclusive_cv); mutex_exit(&ds->ds_lock); + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + /* Remove our reservation */ if (ds->ds_reserved != 0) { dsl_prop_setarg_t psa; @@ -1541,13 +1498,13 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) &value); psa.psa_effective_value = 0; /* predict default value */ - dsl_dataset_set_reservation_sync(ds, &psa, cr, tx); + dsl_dataset_set_reservation_sync(ds, &psa, tx); ASSERT3U(ds->ds_reserved, ==, 0); } ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - dsl_pool_ds_destroyed(ds, tx); + dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; @@ -1596,7 +1553,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) } } - if (ds->ds_phys->ds_next_snap_obj != 0) { + if (dsl_dataset_is_snapshot(ds)) { blkptr_t bp; zio_t *pio; dsl_dataset_t *ds_next; @@ -1608,7 +1565,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); - old_unique = dsl_dataset_unique(ds_next); + old_unique = ds_next->ds_phys->ds_unique_bytes; dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = @@ -1664,7 +1621,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) ds_next->ds_phys->ds_deadlist_obj)); ds->ds_phys->ds_deadlist_obj = 0; - if (ds_next->ds_phys->ds_next_snap_obj != 0) { + if (dsl_dataset_is_snapshot(ds_next)) { /* * Update next's unique to include blocks which * were previously shared by only 
this snapshot @@ -1790,8 +1747,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dsl_dataset_rele(ds_prev, FTAG); spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, + "dataset = %llu", ds->ds_object); if (ds->ds_phys->ds_next_clones_obj != 0) { uint64_t count; @@ -1816,7 +1773,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) struct dsl_ds_destroyarg ndsda = {0}; ndsda.ds = dsda->rm_origin; - dsl_dataset_destroy_sync(&ndsda, tag, cr, tx); + dsl_dataset_destroy_sync(&ndsda, tag, tx); } } @@ -1833,7 +1790,8 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) * owned by the snapshot dataset must be accommodated by space * outside of the reservation. */ - asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); + asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) return (ENOSPC); @@ -1847,7 +1805,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ int dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -1888,7 +1845,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *snapname = arg2; @@ -1959,9 +1916,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) * since our unique space is going to zero. 
*/ if (ds->ds_reserved) { - int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + int64_t delta; + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, - add, 0, 0, tx); + delta, 0, 0, tx); } bplist_close(&ds->ds_deadlist); @@ -1987,11 +1946,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); - dsl_pool_ds_snapshotted(ds, tx); + dsl_scan_ds_snapshotted(ds, tx); dsl_dir_snap_cmtime_update(ds->ds_dir); - spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, "dataset = %llu", dsobj); } @@ -2035,7 +1994,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, - dsl_dataset_unique(ds)); + ds->ds_phys->ds_unique_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ds->ds_object); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, @@ -2163,8 +2122,7 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, - cred_t *cr, dmu_tx_t *tx) +dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *newsnapname = arg2; @@ -2188,8 +2146,8 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, ds->ds_snapname, 8, 1, &ds->ds_object, tx); ASSERT3U(err, ==, 0); - spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, + "dataset = %llu", ds->ds_object); dsl_dataset_rele(hds, FTAG); } @@ -2371,14 +2329,14 @@ struct promotenode { struct promotearg { list_t shared_snaps, origin_snaps, clone_snaps; - dsl_dataset_t *origin_origin, 
*origin_head; + dsl_dataset_t *origin_origin; uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); +static boolean_t snaplist_unstable(list_t *l); -/* ARGSUSED */ static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -2479,19 +2437,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * Note, typically this will not be a clone of a clone, - * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so + * so dd_origin_txg will be < TXG_INITIAL, so * these snaplist_space() -> bplist_space_birthrange() * calls will be fast because they do not have to * iterate over all bps. */ snap = list_head(&pa->origin_snaps); err = snaplist_space(&pa->shared_snaps, - snap->ds->ds_origin_txg, &pa->cloneusedsnap); + snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); if (err) return (err); err = snaplist_space(&pa->clone_snaps, - snap->ds->ds_origin_txg, &space); + snap->ds->ds_dir->dd_origin_txg, &space); if (err) return (err); pa->cloneusedsnap += space; @@ -2510,7 +2468,7 @@ out: } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; @@ -2554,10 +2512,11 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; - hds->ds_origin_txg = origin_head->ds_origin_txg; + dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); odd->dd_phys->dd_origin_obj = origin_ds->ds_object; - origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; + origin_head->ds_dir->dd_origin_txg = + origin_ds->ds_phys->ds_creation_txg; /* move snapshots to this dir */ for (snap = 
list_head(&pa->shared_snaps); snap; @@ -2614,8 +2573,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) origin_ds->ds_phys->ds_unique_bytes = pa->unique; /* log history record */ - spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, - cr, "dataset = %llu", hds->ds_object); + spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, + "dataset = %llu", hds->ds_object); dsl_dir_close(odd, FTAG); } @@ -2862,7 +2821,7 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) { struct cloneswaparg *csa = arg1; dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; @@ -2937,9 +2896,9 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) * changing that affects the snapused). */ VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); + csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used)); VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, - csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); + csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used)); dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -2975,7 +2934,7 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, csa->ohds->ds_phys->ds_deadlist_obj)); - dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx); + dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); } /* @@ -3110,24 +3069,24 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } -extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); +extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); void -dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t 
*cr, dmu_tx_t *tx) +dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; - dsl_prop_set_sync(ds, psa, cr, tx); + dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); if (ds->ds_quota != effective_value) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_quota = effective_value; - spa_history_internal_log(LOG_DS_REFQUOTA, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", + spa_history_log_internal(LOG_DS_REFQUOTA, + ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ", (longlong_t)ds->ds_quota, ds->ds_object); } } @@ -3188,7 +3147,9 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); mutex_enter(&ds->ds_lock); - unique = dsl_dataset_unique(ds); + if (!DS_UNIQUE_IS_ACCURATE(ds)) + dsl_dataset_recalc_head_uniq(ds); + unique = ds->ds_phys->ds_unique_bytes; mutex_exit(&ds->ds_lock); if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { @@ -3205,10 +3166,8 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void -dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, - dmu_tx_t *tx) +dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; @@ -3216,14 +3175,15 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, uint64_t unique; int64_t delta; - dsl_prop_set_sync(ds, psa, cr, tx); + dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); - unique = dsl_dataset_unique(ds); + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + unique = ds->ds_phys->ds_unique_bytes; delta = MAX(0, (int64_t)(effective_value - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); ds->ds_reserved = effective_value; @@ 
-3232,8 +3192,8 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); - spa_history_internal_log(LOG_DS_REFRESERV, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", + spa_history_log_internal(LOG_DS_REFRESERV, + ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu", (longlong_t)effective_value, ds->ds_object); } @@ -3311,7 +3271,7 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct dsl_ds_holdarg *ha = arg2; @@ -3343,8 +3303,8 @@ dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) htag, &now, tx)); } - spa_history_internal_log(LOG_DS_USER_HOLD, - dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag, + spa_history_log_internal(LOG_DS_USER_HOLD, + dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, (int)ha->temphold, ds->ds_object); } @@ -3495,10 +3455,6 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) */ if (!ra->own) return (EBUSY); - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } } dsda.ds = ds; dsda.releasing = B_TRUE; @@ -3509,7 +3465,7 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) } static void -dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_releasearg *ra = arg1; dsl_dataset_t *ds = ra->ds; @@ -3520,6 +3476,11 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) uint64_t refs; int error; + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + mutex_enter(&ds->ds_lock); ds->ds_userrefs--; refs = ds->ds_userrefs; @@ -3536,11 +3497,11 @@ 
dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dsda.ds = ds; dsda.releasing = B_TRUE; /* We already did the destroy_check */ - dsl_dataset_destroy_sync(&dsda, tag, cr, tx); + dsl_dataset_destroy_sync(&dsda, tag, tx); } - spa_history_internal_log(LOG_DS_USER_RELEASE, - dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu", + spa_history_log_internal(LOG_DS_USER_RELEASE, + dp->dp_spa, tx, "<%s> %lld dataset = %llu", ra->htag, (longlong_t)refs, dsobj); } diff --git a/usr/src/uts/common/fs/zfs/dsl_deleg.c b/usr/src/uts/common/fs/zfs/dsl_deleg.c index 04053fdf20..85490c8d5f 100644 --- a/usr/src/uts/common/fs/zfs/dsl_deleg.c +++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -148,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) } static void -dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; nvlist_t *nvp = arg2; @@ -183,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(zap_update(mos, jumpobj, perm, 8, 1, &n, tx) == 0); - spa_history_internal_log(LOG_DS_PERM_UPDATE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_UPDATE, + dd->dd_pool->dp_spa, tx, "%s %s dataset = %llu", whokey, perm, dd->dd_phys->dd_head_dataset_obj); } @@ -192,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } static void -dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; nvlist_t *nvp = arg2; @@ -215,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) (void) zap_remove(mos, zapobj, whokey, tx); VERIFY(0 == 
zap_destroy(mos, jumpobj, tx)); } - spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE, + dd->dd_pool->dp_spa, tx, "%s dataset = %llu", whokey, dd->dd_phys->dd_head_dataset_obj); continue; @@ -236,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } - spa_history_internal_log(LOG_DS_PERM_REMOVE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_REMOVE, + dd->dd_pool->dp_spa, tx, "%s %s dataset = %llu", whokey, perm, dd->dd_phys->dd_head_dataset_obj); } diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 0dfb05da2d..ac86da6590 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/dmu.h> @@ -40,8 +39,7 @@ #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, - cred_t *cr, dmu_tx_t *tx); +static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); /* ARGSUSED */ @@ -64,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) spa_close(dd->dd_pool->dp_spa, dd); /* - * The props callback list should be empty since they hold the - * dir open. + * The props callback list should have been cleaned up by + * objset_evict(). 
*/ list_destroy(&dd->dd_prop_cbs); mutex_destroy(&dd->dd_lock); @@ -136,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } + if (dsl_dir_is_clone(dd)) { + dmu_buf_t *origin_bonus; + dsl_dataset_phys_t *origin_phys; + + /* + * We can't open the origin dataset, because + * that would require opening this dsl_dir. + * Just look at its phys directly instead. + */ + err = dmu_bonus_hold(dp->dp_meta_objset, + dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); + if (err) + goto errout; + origin_phys = origin_bonus->db_data; + dd->dd_origin_txg = + origin_phys->ds_creation_txg; + dmu_buf_rele(origin_bonus, FTAG); + } + winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, dsl_dir_evict); if (winner) { @@ -458,7 +475,7 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; @@ -477,7 +494,7 @@ dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) &value); psa.psa_effective_value = 0; /* predict default value */ - dsl_dir_set_reservation_sync(ds, &psa, cr, tx); + dsl_dir_set_reservation_sync(ds, &psa, tx); ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); @@ -652,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd, if (used > quota) { /* over quota */ myspace = 0; - - /* - * While it's OK to be a little over quota, if - * we think we are using more space than there - * is in the pool (which is already 1.6% more than - * dsl_pool_adjustedsize()), something is very - * wrong. 
- */ - ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa)); } else { /* * the lesser of the space provided by our parent and @@ -1033,18 +1041,17 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) return (err); } -extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); +extern dsl_syncfunc_t dsl_prop_set_sync; -/* ARGSUSED */ static void -dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; - dsl_prop_set_sync(ds, psa, cr, tx); + dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(dd, psa); dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -1053,8 +1060,8 @@ dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dd->dd_phys->dd_quota = effective_value; mutex_exit(&dd->dd_lock); - spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu ", + spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa, + tx, "%lld dataset = %llu ", (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } @@ -1141,9 +1148,8 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void -dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; @@ -1152,7 +1158,7 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) uint64_t used; int64_t delta; - dsl_prop_set_sync(ds, psa, cr, tx); + dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(dd, psa); dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -1170,8 +1176,8 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } mutex_exit(&dd->dd_lock); - spa_history_internal_log(LOG_DS_RESERVATION, 
dd->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu", + spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, + tx, "%lld dataset = %llu", (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } @@ -1240,7 +1246,6 @@ struct renamearg { const char *mynewname; }; -/*ARGSUSED*/ static int dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -1287,7 +1292,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct renamearg *ra = arg2; @@ -1336,8 +1341,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dd->dd_myname, 8, 1, &dd->dd_object, tx); ASSERT3U(err, ==, 0); - spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, - tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); + spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, + tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); } int diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 30a5611365..77aa4af4e3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -19,14 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <sys/dsl_pool.h> #include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> #include <sys/dsl_dir.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_scan.h> +#include <sys/dnode.h> #include <sys/dmu_tx.h> #include <sys/dmu_objset.h> #include <sys/arc.h> @@ -50,7 +52,7 @@ kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; -static int +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; @@ -88,7 +90,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) offsetof(dsl_dataset_t, ds_synced_link)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 1, 4, 0); @@ -150,64 +151,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - /* get scrub status */ - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, - &dp->dp_scrub_func); - if (err == 0) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, - &dp->dp_scrub_queue_obj); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_min_txg); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_max_txg); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark); - if (err && err != ENOENT) - goto out; - err = 
zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max); - if (err && err != ENOENT) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &spa->spa_scrub_errors); - if (err) - goto out; - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { - /* - * A new-type scrub was in progress on an old - * pool. Restart from the beginning, since the - * old software may have changed the pool in the - * meantime. - */ - dsl_pool_scrub_restart(dp); - } - } else { - /* - * It's OK if there is no scrub in progress (and if - * there was an I/O error, ignore it). - */ - err = 0; - } + err = dsl_scan_init(dp, txg); out: rw_exit(&dp->dp_config_rwlock); @@ -247,9 +191,9 @@ dsl_pool_close(dsl_pool_t *dp) arc_flush(dp->dp_spa); txg_fini(dp); + dsl_scan_fini(dp); rw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); - mutex_destroy(&dp->dp_scrub_cancel_lock); taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); @@ -275,6 +219,9 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); ASSERT3U(err, ==, 0); + /* Initialize scan structures */ + VERIFY3U(0, ==, dsl_scan_init(dp, txg)); + /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, @@ -318,6 +265,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) uint64_t data_written; int err; + /* + * We need to copy dp_space_towrite() before doing + * dsl_sync_task_group_sync(), because + * dsl_dataset_snapshot_reserve_space() will increase + * dp_space_towrite but not actually write anything. 
+ */ + data_written = dp->dp_space_towrite[txg & TXG_MASK]; + tx = dmu_tx_create_assigned(dp, txg); dp->dp_read_overhead = 0; @@ -347,7 +302,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) /* * Sync the datasets again to push out the changes due to - * userquota updates. This must be done before we process the + * userspace updates. This must be done before we process the * sync tasks, because that could cause a snapshot of a dataset * whose ds_bp will be rewritten when we do this 2nd sync. */ @@ -383,13 +338,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dir_sync(dd, tx); write_time += gethrtime() - start; - if (spa_sync_pass(dp->dp_spa) == 1) { - dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - dsl_pool_scrub_sync(dp, tx); - (void) zio_wait(dp->dp_scrub_prefetch_zio_root); - } - start = gethrtime(); if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { @@ -407,7 +355,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dmu_tx_commit(tx); - data_written = dp->dp_space_towrite[txg & TXG_MASK]; dp->dp_space_towrite[txg & TXG_MASK] = 0; ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); @@ -679,7 +626,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, tx); VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); + dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c index f27305c953..cedd777687 100644 --- a/usr/src/uts/common/fs/zfs/dsl_prop.c +++ b/usr/src/uts/common/fs/zfs/dsl_prop.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -260,11 +259,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, cbr->cbr_func(cbr->cbr_arg, value); - VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, - NULL, cbr, &dd)); if (need_rwlock) rw_exit(&dp->dp_config_rwlock); - /* Leave dir open until this callback is unregistered */ return (0); } @@ -464,8 +460,6 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); - /* Clean up from dsl_prop_register */ - dsl_dir_close(dd, cbr); return (0); } @@ -552,7 +546,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, } void -dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; @@ -707,9 +701,9 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } } - spa_history_internal_log((source == ZPROP_SRC_NONE || + spa_history_log_internal((source == ZPROP_SRC_NONE || source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : - LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, + LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, "%s=%s dataset = %llu", propname, (valstr == NULL ? 
"" : valstr), ds->ds_object); @@ -718,7 +712,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } void -dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_props_arg_t *pa = arg2; @@ -756,13 +750,13 @@ dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) psa.psa_numints = 1; psa.psa_value = &intval; } - dsl_prop_set_sync(ds, &psa, cr, tx); + dsl_prop_set_sync(ds, &psa, tx); } } void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - cred_t *cr, dmu_tx_t *tx) + dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t zapobj = dd->dd_phys->dd_props_zapobj; @@ -773,7 +767,7 @@ dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); - spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, "%s=%llu dataset = %llu", name, (u_longlong_t)val, dd->dd_phys->dd_head_dataset_obj); } diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c new file mode 100644 index 0000000000..f3b401d602 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -0,0 +1,1660 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/dsl_scan.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/dnode.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zil_impl.h> +#include <sys/zio_checksum.h> +#include <sys/ddt.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#ifdef _KERNEL +#include <sys/zfs_vfsops.h> +#endif + +typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); + +static scan_cb_t dsl_scan_defrag_cb; +static scan_cb_t dsl_scan_scrub_cb; +static scan_cb_t dsl_scan_remove_cb; +static dsl_syncfunc_t dsl_scan_cancel_sync; +static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); + +int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ +boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */ +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ + +#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ + (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) + +extern int zfs_txg_timeout; + +/* the order has to match pool_scan_type */ +static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { + NULL, + 
dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ + dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ +}; + +int +dsl_scan_init(dsl_pool_t *dp, uint64_t txg) +{ + int err; + dsl_scan_t *scn; + spa_t *spa = dp->dp_spa; + uint64_t f; + + scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); + scn->scn_dp = dp; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_func", sizeof (uint64_t), 1, &f); + if (err == 0) { + /* + * There was an old-style scrub in progress. Restart a + * new-style scrub from the beginning. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("old-style scrub was in progress; " + "restarting new-style scrub in txg %llu", + scn->scn_restart_txg); + + /* + * Load the queue obj from the old location so that it + * can be freed by dsl_scan_done(). + */ + (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_queue", sizeof (uint64_t), 1, + &scn->scn_phys.scn_queue_obj); + } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys); + if (err == ENOENT) + return (0); + else if (err) + return (err); + + if (scn->scn_phys.scn_state == DSS_SCANNING && + spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { + /* + * A new-type scrub was in progress on an old + * pool, and the pool was accessed by old + * software. Restart from the beginning, since + * the old software may have changed the pool in + * the meantime. 
+ */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("new-style scrub was modified " + "by old software; restarting in txg %llu", + scn->scn_restart_txg); + } + } + + spa_scan_stat_init(spa); + return (0); +} + +void +dsl_scan_fini(dsl_pool_t *dp) +{ + if (dp->dp_scan) { + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); + dp->dp_scan = NULL; + } +} + +/* ARGSUSED */ +static int +dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + if (scn->scn_phys.scn_state == DSS_SCANNING) + return (EBUSY); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + pool_scan_func_t *funcp = arg2; + dmu_object_type_t ot = 0; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = *funcp; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_restart_txg = 0; + spa_scan_stat_init(spa); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; + + /* rewrite all disk labels */ + vdev_config_dirty(spa->spa_root_vdev); + + if (vdev_resilver_needed(spa->spa_root_vdev, + &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); + } else { + spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); + } + + spa->spa_scrub_started = B_TRUE; + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go 
faster using top-down pruning. + */ + if (scn->scn_phys.scn_min_txg > TXG_INITIAL) + scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + + } + + /* back to the generic stuff */ + + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, + ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); + + dsl_scan_sync_state(scn, tx); + + spa_history_log_internal(LOG_POOL_SCAN, spa, tx, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); +} + +/* ARGSUSED */ +static void +dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + static const char *old_names[] = { + "scrub_bookmark", + "scrub_ddt_bookmark", + "scrub_ddt_class_max", + "scrub_queue", + "scrub_min_txg", + "scrub_max_txg", + "scrub_func", + "scrub_errors", + NULL + }; + + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + int i; + + /* Remove any remnants of an old-style scrub. */ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } + + /* + * If we were "restarted" from a stopped state, don't bother + * with anything else. 
+ */ + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (complete) + scn->scn_phys.scn_state = DSS_FINISHED; + else + scn->scn_phys.scn_state = DSS_CANCELED; + + spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx, + "complete=%u", complete); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + spa->spa_scrub_started = B_FALSE; + spa->spa_scrub_active = B_FALSE; + + /* + * If the scrub/resilver completed, update all DTLs to + * reflect this. Whether it succeeded or not, vacate + * all temporary scrub DTLs. + */ + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); + if (complete) { + spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + } + spa_errlog_rotate(spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. 
+ */ + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + } + + scn->scn_phys.scn_end_time = gethrestime_sec(); +} + +/* ARGSUSED */ +static int +dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return (ENOENT); + return (0); +} + +/* ARGSUSED */ +static void +dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + dsl_scan_done(scn, B_FALSE, tx); + dsl_scan_sync_state(scn, tx); +} + +int +dsl_scan_cancel(dsl_pool_t *dp) +{ + boolean_t complete = B_FALSE; + int err; + + err = dsl_sync_task_do(dp, dsl_scan_cancel_check, + dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); + return (err); +} + +static void dsl_scan_visitbp(blkptr_t *bp, + const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, + dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, + dmu_tx_t *tx); +static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, + dmu_objset_type_t ostype, + dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx); + +void +dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) +{ + zio_free(dp->dp_spa, txg, bp); +} + +void +dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) +{ + ASSERT(dsl_pool_sync_context(dp)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); +} + +int +dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + return (arc_read(pio, spa, bpp, pbuf, done, private, + priority, zio_flags, arc_flags, zb)); +} + +int +dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + return (arc_read_nolock(pio, spa, bpp, done, private, + priority, zio_flags, arc_flags, zb)); +} + +static boolean_t 
+bookmark_is_zero(const zbookmark_t *zb) +{ + return (zb->zb_objset == 0 && zb->zb_object == 0 && + zb->zb_level == 0 && zb->zb_blkid == 0); +} + +/* dnp is the dnode for zb1->zb_object */ +static boolean_t +bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + +static uint64_t +dsl_scan_ds_maxtxg(dsl_dataset_t *ds) +{ + uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; + if (dsl_dataset_is_snapshot(ds)) + return (MIN(smt, ds->ds_phys->ds_creation_txg)); + return (smt); +} + +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); +} + +static boolean_t +dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) +{ + uint64_t elapsed_nanosecs; + int mintime; + + /* we never skip user/group accounting objects */ + if (zb && (int64_t)zb->zb_object < 0) + return (B_FALSE); + + if 
(scn->scn_pausing) + return (B_TRUE); /* we're already pausing */ + + if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb && zb->zb_level != 0) + return (B_FALSE); + + mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scan_min_time_ms; + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > mintime && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + scn->scn_phys.scn_bookmark = *zb; + } + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + scn->scn_pausing = B_TRUE; + return (B_TRUE); + } + return (B_FALSE); +} + +typedef struct zil_scan_arg { + dsl_pool_t *zsa_dp; + zil_header_t *zsa_zh; +} zil_scan_arg_t; + +/* ARGSUSED */ +static int +dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + zbookmark_t zb; + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * One block ("stubby") can be allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * scrub there's nothing to do to it). 
+ */ + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + return (0); +} + +/* ARGSUSED */ +static int +dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + } + return (0); +} + +static void +dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_scan_arg_t zsa = { dp, zh }; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). 
+ */ + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, + claim_txg); + + zil_free(zilog); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, + uint64_t objset, uint64_t object, uint64_t blkid) +{ + zbookmark_t czb; + uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + + if (zfs_no_scrub_prefetch) + return; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + return; + + SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + + /* + * XXX need to make sure all of these arc_read() prefetches are + * done before setting xlateall (similar to dsl_read()) + */ + (void) arc_read(scn->scn_prefetch_zio_root, scn->scn_dp->dp_spa, bp, + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &flags, &czb); +} + +static boolean_t +dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, + const zbookmark_t *zb) +{ + /* + * We never skip over user/group accounting objects (obj<0) + */ + if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && + (int64_t)zb->zb_object >= 0) { + /* + * If we already visited this bp & everything below (in + * a prior txg sync), don't bother doing it again. + */ + if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + return (B_TRUE); + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for pausing + * again. 
+ */ + if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); + } + } + return (B_FALSE); +} + +/* + * Return nonzero on i/o error. + * Return new buf to write out in *bufp. + */ +static int +dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) +{ + dsl_pool_t *dp = scn->scn_dp; + int err; + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { + dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset, + zb->zb_object, zb->zb_blkid * epb + i); + } + for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + dsl_scan_visitbp(cbp, &czb, dnp, + *bufp, ds, scn, ostype, tx); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) { + uint32_t flags = ARC_WAIT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + dnode_phys_t *cdnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 
+ if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { + for (j = 0; j < cdnp->dn_nblkptr; j++) { + blkptr_t *cbp = &cdnp->dn_blkptr[j]; + dsl_scan_prefetch(scn, *bufp, cbp, + zb->zb_objset, zb->zb_blkid * epb + i, j); + } + } + for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { + dsl_scan_visitdnode(scn, ds, ostype, + cdnp, *bufp, zb->zb_blkid * epb + i, tx); + } + + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + + osp = (*bufp)->b_data; + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) + dsl_scan_zil(dp, &osp->os_zil_header); + + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx); + + if (OBJSET_BUF_HAS_USERUSED(*bufp)) { + /* + * We also always visit user/group accounting + * objects, and never skip them, even if we are + * pausing. This is necessary so that the space + * deltas from this txg get integrated. + */ + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_groupused_dnode, *bufp, + DMU_GROUPUSED_OBJECT, tx); + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_userused_dnode, *bufp, + DMU_USERUSED_OBJECT, tx); + } + } + + return (0); +} + +static void +dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, + dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t object, dmu_tx_t *tx) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + dnp->dn_nlevels - 1, j); + dsl_scan_visitbp(&dnp->dn_blkptr[j], + &czb, dnp, buf, ds, scn, ostype, tx); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zbookmark_t czb; + SET_BOOKMARK(&czb, ds ? 
ds->ds_object : 0, object, + 0, DMU_SPILL_BLKID); + dsl_scan_visitbp(&dnp->dn_spill, + &czb, dnp, buf, ds, scn, ostype, tx); + } +} + +/* + * The arguments are in this order because mdb can only print the + * first 5; we want them to be useful. + */ +static void +dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, + dnode_phys_t *dnp, arc_buf_t *pbuf, + dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + arc_buf_t *buf = NULL; + blkptr_t bp_toread = *bp; + + /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ + + if (dsl_scan_check_pause(scn, zb)) + return; + + if (dsl_scan_check_resume(scn, dnp, zb)) + return; + + if (bp->blk_birth == 0) + return; + + scn->scn_visited_this_txg++; + + dprintf_bp(bp, + "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p", + ds, ds ? ds->ds_object : 0, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + pbuf, bp); + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return; + + if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { + /* + * For non-user-accounting blocks, we need to read the + * new bp (from a deleted snapshot, found in + * check_existing_xlation). If we used the old bp, + * pointers inside this block from before we resumed + * would be untranslated. + * + * For user-accounting blocks, we need to read the old + * bp, because we will apply the entire space delta to + * it (original untranslated -> translations from + * deleted snap -> now). + */ + bp_toread = *bp; + } + + if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, + &buf) != 0) + return; + + /* + * If dsl_scan_ddt() has already visited this block, it will have + * already done any translations or scrubbing, so don't call the + * callback again. 
+ */ + if (ddt_class_contains(dp->dp_spa, + scn->scn_phys.scn_ddt_class_max, bp)) { + ASSERT(buf == NULL); + return; + } + + /* + * If this block is from the future (after cur_max_txg), then we + * are doing this on behalf of a deleted snapshot, and we will + * revisit the future block on the next pass of this dataset. + * Don't scan it now unless we need to because something + * under it was modified. + */ + if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) { + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + } + if (buf) + (void) arc_buf_remove_ref(buf, &buf); +} + +static void +dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, + dmu_tx_t *tx) +{ + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + dsl_scan_visitbp(bp, &zb, NULL, NULL, + ds, scn, DMU_OST_NONE, tx); + + dprintf_ds(ds, "finished scan%s", ""); +} + +void +dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + if (dsl_dataset_is_snapshot(ds)) { + /* Note, scn_cur_{min,max}_txg stays the same. 
*/ + scn->scn_phys.scn_bookmark.zb_objset = + ds->ds_phys->ds_next_snap_obj; + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; + } else { + SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset bookmark to -1,0,0,0", + (u_longlong_t)ds->ds_object); + } + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + if (dsl_dataset_is_snapshot(ds)) { + /* + * We keep the same mintxg; it could be > + * ds_creation_txg if the previous snapshot was + * deleted too. + */ + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("destroying ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + } else { + zfs_dbgmsg("destroying ds %llu; in queue; removing", + (u_longlong_t)ds->ds_object); + } + } else { + zfs_dbgmsg("destroying ds %llu; ignoring", + (u_longlong_t)ds->ds_object); + } + + /* + * dsl_scan_sync() should be called after this, and should sync + * out our changed state, but just to be safe, do it here. 
+ */ + dsl_scan_sync_state(scn, tx); +} + +void +dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = + ds->ds_phys->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("snapshotting ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + } + dsl_scan_sync_state(scn, tx); +} + +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + if 
(zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds1->ds_object, &mintxg) == 0) { + int err; + + ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); + err = zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); + VERIFY(err == 0 || err == EEXIST); + if (err == EEXIST) { + /* Both were there to begin with */ + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds1->ds_object, mintxg, tx)); + } + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { + ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + dsl_scan_sync_state(scn, tx); +} + +struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; +}; + +/* ARGSUSED */ +static int +enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, 
+ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + /* + * Iterate over the bps in this ds. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx); + + char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " + "pausing=%u", + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_cur_max_txg, + (int)scn->scn_pausing); + kmem_free(dsname, ZFS_MAXNAMELEN); + + if (scn->scn_pausing) + goto out; + + /* + * We've finished this pass over this dataset. + */ + + /* + * If we did not completely visit this dataset, do another pass. + */ + if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { + zfs_dbgmsg("incomplete pass; visiting again"); + scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + scn->scn_phys.scn_cur_max_txg, tx) == 0); + goto out; + } + + /* + * Add descendent datasets to work queue. 
+ */ + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj, + ds->ds_phys->ds_creation_txg, tx) == 0); + } + if (ds->ds_phys->ds_num_children > 1) { + boolean_t usenext = B_FALSE; + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct. + */ + int err = zap_count(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, &count); + if (err == 0 && + count == ds->ds_phys->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + VERIFY(zap_join_key(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_creation_txg, tx) == 0); + } else { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, + NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. 
+ */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. + * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can so we must account for changes + * in a block's replication class. + * + * Here's an example of what can occur: + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed. 
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to + * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. + */ +static void +dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) +{ + ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; + ddt_entry_t dde = { 0 }; + int error; + uint64_t n = 0; + + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { + ddt_t *ddt; + + if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) + break; + dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + + /* There should be no pending changes to the dedup table */ + ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; + ASSERT(avl_first(&ddt->ddt_tree) == NULL); + + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); + n++; + + if (dsl_scan_check_pause(scn, NULL)) + break; + } + + zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", + (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_pausing); + + ASSERT(error == 0 || error == ENOENT); + ASSERT(error != ENOENT || + ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); +} + +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_t zb = { 0 }; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + +static void 
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + zap_cursor_t zc; + zap_attribute_t za; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_ddt(scn, tx); + if (scn->scn_pausing) + return; + } + + if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { + /* First do the MOS & ORIGIN */ + + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_visit_rootbp(scn, NULL, + &dp->dp_meta_rootbp, tx); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); + if (scn->scn_pausing) + return; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + dsl_scan_visitds(scn, + dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!scn->scn_pausing); + } else if (scn->scn_phys.scn_bookmark.zb_objset != + ZB_DESTROYED_OBJSET) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset may + * be -1, so we will skip this and find a new objset + * below. + */ + dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); + if (scn->scn_pausing) + return; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. 
+ */ + bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + dsl_dataset_t *ds; + uint64_t dsobj; + + dsobj = strtonum(za.za_name, NULL); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, dsobj, tx)); + + /* Set up min/max txg */ + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (za.za_first_integer != 0) { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + za.za_first_integer); + } else { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + ds->ds_phys->ds_prev_snap_txg); + } + scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); + dsl_dataset_rele(ds, FTAG); + + dsl_scan_visitds(scn, dsobj, tx); + zap_cursor_fini(&zc); + if (scn->scn_pausing) + return; + } + zap_cursor_fini(&zc); +} + +void +dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + + /* + * Check for scn_restart_txg before checking spa_load_state, so + * that we can restart an old-style scan while the pool is being + * imported (see dsl_scan_init). 
+ */ + if (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_done(scn, B_FALSE, tx); + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + func = POOL_SCAN_RESILVER; + zfs_dbgmsg("restarting scan func=%u txg=%llu", + func, tx->tx_txg); + dsl_scan_setup_sync(scn, &func, tx); + } + + if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE || + spa_shutting_down(spa) || + spa_sync_pass(dp->dp_spa) > 1 || + scn->scn_phys.scn_state != DSS_SCANNING) + return; + + scn->scn_visited_this_txg = 0; + scn->scn_pausing = B_FALSE; + scn->scn_sync_start_time = gethrtime(); + spa->spa_scrub_active = B_TRUE; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); + } else { + zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, + (longlong_t)scn->scn_phys.scn_bookmark.zb_object, + (longlong_t)scn->scn_phys.scn_bookmark.zb_level, + (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); + } + + scn->scn_prefetch_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_scan_visit(scn, tx); + (void) zio_wait(scn->scn_prefetch_zio_root); + scn->scn_prefetch_zio_root = NULL; + + zfs_dbgmsg("visited %llu blocks in %llums", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); + + if (!scn->scn_pausing) { + 
/* finished with scan. */ + zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg); + dsl_scan_done(scn, B_TRUE, tx); + } + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + } + + dsl_scan_sync_state(scn, tx); +} + +/* + * This will start a new scan, or restart an existing one. + */ +void +dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +{ + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", txg); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +/* + * scrub consumers + */ + +static void +count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ + int i; + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } +} + +static void +dsl_scan_scrub_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + } + mutex_exit(&spa->spa_scrub_lock); +} + +static int +dsl_scan_scrub_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_t *zb) +{ + dsl_scan_t *scn = dp->dp_scan; + size_t size = BP_GET_PSIZE(bp); + spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + int zio_priority; + + if (phys_birth <= scn->scn_phys.scn_min_txg || + phys_birth >= scn->scn_phys.scn_max_txg) + return (0); + + count_block(dp->dp_blkstats, bp); + + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); + if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { + zio_flags |= ZIO_FLAG_SCRUB; + zio_priority = ZIO_PRIORITY_SCRUB; + needs_io = B_TRUE; + } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + zio_flags |= ZIO_FLAG_RESILVER; + zio_priority = 
ZIO_PRIORITY_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. + */ + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + phys_birth, 1); + } + } + } + + if (needs_io && !zfs_no_scrub_io) { + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_scan_scrub_done, NULL, zio_priority, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); +} + +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. 
+ */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + return (dsl_sync_task_do(dp, dsl_scan_setup_check, + dsl_scan_setup_sync, dp->dp_scan, &func, 0)); +} diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c deleted file mode 100644 index b16ff66586..0000000000 --- a/usr/src/uts/common/fs/zfs/dsl_scrub.c +++ /dev/null @@ -1,1214 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#include <sys/dsl_pool.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_synctask.h> -#include <sys/dnode.h> -#include <sys/dmu_tx.h> -#include <sys/dmu_objset.h> -#include <sys/arc.h> -#include <sys/zap.h> -#include <sys/zio.h> -#include <sys/zfs_context.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_znode.h> -#include <sys/spa_impl.h> -#include <sys/vdev_impl.h> -#include <sys/zil_impl.h> -#include <sys/zio_checksum.h> -#include <sys/ddt.h> -#include <sys/sa.h> -#include <sys/sa_impl.h> - -typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); - -static scrub_cb_t dsl_pool_scrub_clean_cb; -static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; -static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, - uint64_t objset, uint64_t object); - -int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ -int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ -boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ -enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; - -extern int zfs_txg_timeout; - -static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { - NULL, - dsl_pool_scrub_clean_cb -}; - -/* ARGSUSED */ -static void -dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = arg1; - enum scrub_func *funcp = arg2; - dmu_object_type_t ot = 0; - boolean_t complete = B_FALSE; - - dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); - - ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); - ASSERT(*funcp > SCRUB_FUNC_NONE); - ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); - - dp->dp_scrub_min_txg = 0; - dp->dp_scrub_max_txg = tx->tx_txg; - dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; - - if (*funcp == SCRUB_FUNC_CLEAN) { - vdev_t *rvd = dp->dp_spa->spa_root_vdev; - - /* rewrite all disk labels */ - 
vdev_config_dirty(rvd); - - if (vdev_resilver_needed(rvd, - &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { - spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_RESILVER_START); - dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, - tx->tx_txg); - } else { - spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_SCRUB_START); - } - - /* zero out the scrub stats in all vdev_stat_t's */ - vdev_scrub_stat_update(rvd, - dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : - POOL_SCRUB_EVERYTHING, B_FALSE); - - /* - * If this is an incremental scrub, limit the DDT scrub phase - * to just the auto-ditto class (for correctness); the rest - * of the scrub should go faster using top-down pruning. - */ - if (dp->dp_scrub_min_txg > TXG_INITIAL) - dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; - - dp->dp_spa->spa_scrub_started = B_TRUE; - } - - /* back to the generic stuff */ - - if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); - } - bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); - - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) - ot = DMU_OT_ZAP_OTHER; - - dp->dp_scrub_func = *funcp; - dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, - ot ? 
ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); - dp->dp_scrub_restart = B_FALSE; - dp->dp_spa->spa_scrub_errors = 0; - - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, - &dp->dp_scrub_func, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, - &dp->dp_scrub_queue_obj, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_min_txg, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_max_txg, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &dp->dp_spa->spa_scrub_errors, tx)); - - spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, - "func=%u mintxg=%llu maxtxg=%llu", - *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); -} - -int -dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) -{ - return (dsl_sync_task_do(dp, NULL, - dsl_pool_scrub_setup_sync, dp, &func, 0)); -} - -/* ARGSUSED */ -static void -dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp 
= arg1; - boolean_t *completep = arg2; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - mutex_enter(&dp->dp_scrub_cancel_lock); - - if (dp->dp_scrub_restart) { - dp->dp_scrub_restart = B_FALSE; - *completep = B_FALSE; - } - - /* XXX this is scrub-clean specific */ - mutex_enter(&dp->dp_spa->spa_scrub_lock); - while (dp->dp_spa->spa_scrub_inflight > 0) { - cv_wait(&dp->dp_spa->spa_scrub_io_cv, - &dp->dp_spa->spa_scrub_lock); - } - mutex_exit(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_active = B_FALSE; - - dp->dp_scrub_func = SCRUB_FUNC_NONE; - VERIFY(0 == dmu_object_free(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, tx)); - dp->dp_scrub_queue_obj = 0; - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); - - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, tx)); - - (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, tx); - (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); - - spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, - "complete=%u", *completep); - - /* below is scrub-clean specific */ - vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, - *completep); - /* - * If the scrub/resilver completed, update all DTLs to reflect this. - * Whether it succeeded or not, vacate all temporary scrub DTLs. 
- */ - vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, - *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); - dp->dp_spa->spa_scrub_started = B_FALSE; - if (*completep) - spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); - spa_errlog_rotate(dp->dp_spa); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); - - dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; - mutex_exit(&dp->dp_scrub_cancel_lock); -} - -int -dsl_pool_scrub_cancel(dsl_pool_t *dp) -{ - boolean_t complete = B_FALSE; - - return (dsl_sync_task_do(dp, NULL, - dsl_pool_scrub_cancel_sync, dp, &complete, 3)); -} - -void -dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) -{ - /* - * This function will be used by bp-rewrite wad to intercept frees. - */ - zio_free(dp->dp_spa, txg, bpp); -} - -void -dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) -{ - ASSERT(dsl_pool_sync_context(dp)); - zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); -} - -static boolean_t -bookmark_is_zero(const zbookmark_t *zb) -{ - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); -} - -/* dnp is the dnode for zb1->zb_object */ -static boolean_t -bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. 
*/ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); -} - -static boolean_t -scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) -{ - uint64_t elapsed_nanosecs; - int mintime; - - if (dp->dp_scrub_pausing) - return (B_TRUE); /* we're already pausing */ - - if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) - return (B_FALSE); /* we're resuming */ - - /* We only know how to resume from level-0 blocks. */ - if (zb != NULL && zb->zb_level != 0) - return (B_FALSE); - - mintime = dp->dp_scrub_isresilver ? 
zfs_resilver_min_time_ms : - zfs_scrub_min_time_ms; - elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time; - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) { - if (zb) { - dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - dp->dp_scrub_bookmark = *zb; - } - if (ddb) { - dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)ddb->ddb_class, - (longlong_t)ddb->ddb_type, - (longlong_t)ddb->ddb_checksum, - (longlong_t)ddb->ddb_cursor); - ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); - } - dp->dp_scrub_pausing = B_TRUE; - return (B_TRUE); - } - return (B_FALSE); -} - -typedef struct zil_traverse_arg { - dsl_pool_t *zta_dp; - zil_header_t *zta_zh; -} zil_traverse_arg_t; - -/* ARGSUSED */ -static int -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zil_traverse_arg_t *zta = arg; - dsl_pool_t *dp = zta->zta_dp; - zil_header_t *zh = zta->zta_zh; - zbookmark_t zb; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return (0); - - /* - * One block ("stubby") can be allocated a long time ago; we - * want to visit that one because it has been allocated - * (on-disk) even if it hasn't been claimed (even though for - * plain scrub there's nothing to do to it). 
- */ - if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); - return (0); -} - -/* ARGSUSED */ -static int -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - if (lrc->lrc_txtype == TX_WRITE) { - zil_traverse_arg_t *zta = arg; - dsl_pool_t *dp = zta->zta_dp; - zil_header_t *zh = zta->zta_zh; - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_t zb; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return (0); - - /* - * birth can be < claim_txg if this record's txg is - * already txg sync'ed (but this log block contains - * other records that are not synced) - */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - lr->lr_foid, ZB_ZIL_LEVEL, - lr->lr_offset / BP_GET_LSIZE(bp)); - - VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); - } - return (0); -} - -static void -traverse_zil(dsl_pool_t *dp, zil_header_t *zh) -{ - uint64_t claim_txg = zh->zh_claim_txg; - zil_traverse_arg_t zta = { dp, zh }; - zilog_t *zilog; - - /* - * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). 
- */ - if (claim_txg == 0 && spa_writeable(dp->dp_spa)) - return; - - zilog = zil_alloc(dp->dp_meta_objset, zh); - - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, - claim_txg); - - zil_free(zilog); -} - -static void -scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, - uint64_t object, uint64_t blkid) -{ - zbookmark_t czb; - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; - - if (zfs_no_scrub_prefetch) - return; - - if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) - return; - - SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); - - (void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp, - buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &flags, &czb); -} - -static void -scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, - arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) -{ - int err; - arc_buf_t *buf = NULL; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; - - if (scrub_pause(dp, zb, NULL)) - return; - - if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { - /* - * If we already visited this bp & everything below (in - * a prior txg), don't bother doing it again. - */ - if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) - return; - - /* - * If we found the block we're trying to resume from, or - * we went past it to a different object, zero it out to - * indicate that it's OK to start checking for pausing - * again. - */ - if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || - zb->zb_object > dp->dp_scrub_bookmark.zb_object) { - dprintf("resuming at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); - } - } - - /* - * If dsl_pool_scrub_ddt() has aready scrubbed this block, - * don't scrub it again. 
- */ - if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) - (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); - - if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - - err = arc_read(NULL, dp->dp_spa, bp, pbuf, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - scrub_prefetch(dp, buf, cbp, zb->zb_objset, - zb->zb_object, zb->zb_blkid * epb + i); - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - scrub_visitbp(dp, dnp, buf, cbp, &czb); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; - dnode_phys_t *cdnp; - int i, j; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - - err = arc_read(NULL, dp->dp_spa, bp, pbuf, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { - for (j = 0; j < cdnp->dn_nblkptr; j++) { - blkptr_t *cbp = &cdnp->dn_blkptr[j]; - scrub_prefetch(dp, buf, cbp, zb->zb_objset, - zb->zb_blkid * epb + i, j); - } - } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { - scrub_visitdnode(dp, cdnp, buf, zb->zb_objset, - zb->zb_blkid * epb + i); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; - objset_phys_t *osp; - - err = arc_read_nolock(NULL, dp->dp_spa, bp, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - 
dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - - osp = buf->b_data; - - traverse_zil(dp, &osp->os_zil_header); - - scrub_visitdnode(dp, &osp->os_meta_dnode, - buf, zb->zb_objset, DMU_META_DNODE_OBJECT); - if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - scrub_visitdnode(dp, &osp->os_userused_dnode, - buf, zb->zb_objset, DMU_USERUSED_OBJECT); - scrub_visitdnode(dp, &osp->os_groupused_dnode, - buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); - } - } - - if (buf) - (void) arc_buf_remove_ref(buf, &buf); -} - -static void -scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, - uint64_t objset, uint64_t object) -{ - int j; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - zbookmark_t czb; - SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - scrub_visitbp(dp, dnp, buf, &dnp->dn_spill, &czb); - } - } -} - -static void -scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) -{ - zbookmark_t zb; - - SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - scrub_visitbp(dp, NULL, NULL, bp, &zb); -} - -void -dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - SET_BOOKMARK(&dp->dp_scrub_bookmark, - ZB_DESTROYED_OBJSET, 0, 0, 0); - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) != 0) { - return; - } - - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_next_snap_obj, tx) == 0); - } - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); -} - -void -dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); - - if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - dp->dp_scrub_bookmark.zb_objset = - ds->ds_phys->ds_prev_snap_obj; - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) == 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_prev_snap_obj, tx) == 0); - } -} - -void -dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { - dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; - } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { - dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; - } - - if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds1->ds_object, tx) == 0) { - int err = zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds2->ds_object, tx); - VERIFY(err == 0 || err == EEXIST); - if (err == EEXIST) { - /* Both were there 
to begin with */ - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds1->ds_object, tx)); - } - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds2->ds_object, tx) == 0) { - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds1->ds_object, tx)); - } -} - -struct enqueue_clones_arg { - dmu_tx_t *tx; - uint64_t originobj; -}; - -/* ARGSUSED */ -static int -enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - struct enqueue_clones_arg *eca = arg; - dsl_dataset_t *ds; - int err; - dsl_pool_t *dp; - - err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); - if (err) - return (err); - dp = ds->ds_dir->dd_pool; - - if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { - while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - ds = prev; - } - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, eca->tx) == 0); - } - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) -{ - dsl_dataset_t *ds; - uint64_t min_txg_save; - - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - - /* - * Iterate over the bps in this ds. - */ - min_txg_save = dp->dp_scrub_min_txg; - dp->dp_scrub_min_txg = - MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); - scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); - dp->dp_scrub_min_txg = min_txg_save; - - if (dp->dp_scrub_pausing) - goto out; - - /* - * Add descendent datasets to work queue. 
- */ - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_next_snap_obj, tx) == 0); - } - if (ds->ds_phys->ds_num_children > 1) { - boolean_t usenext = B_FALSE; - if (ds->ds_phys->ds_next_clones_obj != 0) { - uint64_t count; - /* - * A bug in a previous version of the code could - * cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a - * missing entry. Therefore we can only use the - * next_clones_obj when its count is correct. - */ - int err = zap_count(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, &count); - if (err == 0 && - count == ds->ds_phys->ds_num_children - 1) - usenext = B_TRUE; - } - - if (usenext) { - VERIFY(zap_join(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - dp->dp_scrub_queue_obj, tx) == 0); - } else { - struct enqueue_clones_arg eca; - eca.tx = tx; - eca.originobj = ds->ds_object; - - (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, - NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); - } - } - -out: - dsl_dataset_rele(ds, FTAG); -} - -/* ARGSUSED */ -static int -enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - dmu_tx_t *tx = arg; - dsl_dataset_t *ds; - int err; - dsl_pool_t *dp; - - err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); - if (err) - return (err); - - dp = ds->ds_dir->dd_pool; - - while (ds->ds_phys->ds_prev_snap_obj != 0) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - FTAG, &prev); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - /* - * If this is a clone, we don't need to worry about it for now. 
- */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_rele(ds, FTAG); - dsl_dataset_rele(prev, FTAG); - return (0); - } - dsl_dataset_rele(ds, FTAG); - ds = prev; - } - - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) == 0); - dsl_dataset_rele(ds, FTAG); - return (0); -} - -/* - * Scrub/dedup interaction. - * - * If there are N references to a deduped block, we don't want to scrub it - * N times -- ideally, we should scrub it exactly once. - * - * We leverage the fact that the dde's replication class (enum ddt_class) - * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest - * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. - * - * To prevent excess scrubbing, the scrub begins by walking the DDT - * to find all blocks with refcnt > 1, and scrubs each of these once. - * Since there are two replication classes which contain blocks with - * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. - * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. - * - * There would be nothing more to say if a block's refcnt couldn't change - * during a scrub, but of course it can so we must account for changes - * in a block's replication class. - * - * Here's an example of what can occur: - * - * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 - * when visited during the top-down scrub phase, it will be scrubbed twice. - * This negates our scrub optimization, but is otherwise harmless. - * - * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 - * on each visit during the top-down scrub phase, it will never be scrubbed. 
- * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's - * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to - * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 - * while a scrub is in progress, it scrubs the block right then. - */ -static void -dsl_pool_scrub_ddt(dsl_pool_t *dp) -{ - ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; - ddt_entry_t dde; - int error; - - while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { - if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) - return; - dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); - if (scrub_pause(dp, NULL, ddb)) - return; - } - ASSERT(error == ENOENT); - ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); -} - -void -dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, - const ddt_entry_t *dde) -{ - const ddt_key_t *ddk = &dde->dde_key; - const ddt_phys_t *ddp = dde->dde_phys; - blkptr_t blk; - zbookmark_t zb = { 0 }; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(checksum, ddk, ddp, &blk); - scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); - } -} - -void -dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) -{ - spa_t *spa = dp->dp_spa; - zap_cursor_t zc; - zap_attribute_t za; - boolean_t complete = B_TRUE; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - /* - * If the pool is not loaded, or is trying to unload, leave it alone. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa)) - return; - - if (dp->dp_scrub_restart) { - enum scrub_func func = dp->dp_scrub_func; - dp->dp_scrub_restart = B_FALSE; - dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); - } - - if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { - /* - * We must have resumed after rebooting; reset the vdev - * stats to know that we're doing a scrub (although it - * will think we're just starting now). 
- */ - vdev_scrub_stat_update(spa->spa_root_vdev, - dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : - POOL_SCRUB_EVERYTHING, B_FALSE); - } - - dp->dp_scrub_pausing = B_FALSE; - dp->dp_scrub_start_time = gethrtime(); - dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); - spa->spa_scrub_active = B_TRUE; - - if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { - dsl_pool_scrub_ddt(dp); - if (dp->dp_scrub_pausing) - goto out; - } - - if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { - /* First do the MOS & ORIGIN */ - scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); - if (dp->dp_scrub_pausing) - goto out; - - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(spa, - NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); - } else { - scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); - } - ASSERT(!dp->dp_scrub_pausing); - } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { - /* - * If we were paused, continue from here. Note if the ds - * we were paused on was destroyed, the zb_objset will be - * ZB_DESTROYED_OBJSET, so we will skip this and find a new - * objset below. - */ - scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); - if (dp->dp_scrub_pausing) - goto out; - } - - /* - * In case we were paused right at the end of the ds, zero the - * bookmark so we don't think that we're still trying to resume. - */ - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - - /* keep pulling things out of the zap-object-as-queue */ - while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), - zap_cursor_retrieve(&zc, &za) == 0) { - VERIFY(0 == zap_remove(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, za.za_name, tx)); - scrub_visitds(dp, za.za_first_integer, tx); - if (dp->dp_scrub_pausing) - break; - zap_cursor_fini(&zc); - } - zap_cursor_fini(&zc); - if (dp->dp_scrub_pausing) - goto out; - - /* done. 
*/ - - dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); - return; -out: - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &spa->spa_scrub_errors, tx)); -} - -void -dsl_pool_scrub_restart(dsl_pool_t *dp) -{ - mutex_enter(&dp->dp_scrub_cancel_lock); - dp->dp_scrub_restart = B_TRUE; - mutex_exit(&dp->dp_scrub_cancel_lock); -} - -/* - * scrub consumers - */ - -static void -count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) -{ - int i; - - /* - * If we resume after a reboot, zab will be NULL; don't record - * incomplete stats in that case. - */ - if (zab == NULL) - return; - - for (i = 0; i < 4; i++) { - int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; - int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; - zfs_blkstat_t *zb = &zab->zab_type[l][t]; - int equal; - - zb->zb_count++; - zb->zb_asize += BP_GET_ASIZE(bp); - zb->zb_lsize += BP_GET_LSIZE(bp); - zb->zb_psize += BP_GET_PSIZE(bp); - zb->zb_gangs += BP_COUNT_GANG(bp); - - switch (BP_GET_NDVAS(bp)) { - case 2: - if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) - zb->zb_ditto_2_of_2_samevdev++; - break; - case 3: - equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + - (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[2])) + - (DVA_GET_VDEV(&bp->blk_dva[1]) == - DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal == 1) - zb->zb_ditto_2_of_3_samevdev++; - else if (equal == 3) - zb->zb_ditto_3_of_3_samevdev++; - break; - } - } -} - -static void -dsl_pool_scrub_clean_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - zio_data_buf_free(zio->io_data, zio->io_size); - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; - cv_broadcast(&spa->spa_scrub_io_cv); - - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) - spa->spa_scrub_errors++; - mutex_exit(&spa->spa_scrub_lock); -} - -static int -dsl_pool_scrub_clean_cb(dsl_pool_t *dp, - const blkptr_t *bp, const zbookmark_t *zb) -{ - size_t size = BP_GET_PSIZE(bp); - spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); - boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int zio_priority; - - if (phys_birth <= dp->dp_scrub_min_txg || - phys_birth >= dp->dp_scrub_max_txg) - return (0); - - count_block(dp->dp_blkstats, bp); - - if (dp->dp_scrub_isresilver == 0) { - /* It's a scrub */ - zio_flags |= ZIO_FLAG_SCRUB; - zio_priority = ZIO_PRIORITY_SCRUB; - needs_io = B_TRUE; - } else { - /* It's a resilver */ - zio_flags |= ZIO_FLAG_RESILVER; - zio_priority = ZIO_PRIORITY_RESILVER; - needs_io = B_FALSE; - } - - /* If it's an intent log block, failure is expected. 
*/ - if (zb->zb_level == ZB_ZIL_LEVEL) - zio_flags |= ZIO_FLAG_SPECULATIVE; - - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. - */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_examined += - DVA_GET_ASIZE(&bp->blk_dva[d]); - mutex_exit(&vd->vdev_stat_lock); - - /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. - */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - phys_birth, 1); - } - } - } - - if (needs_io && !zfs_no_scrub_io) { - void *data = zio_data_buf_alloc(size); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; - mutex_exit(&spa->spa_scrub_lock); - - zio_nowait(zio_read(NULL, spa, bp, data, size, - dsl_pool_scrub_clean_done, NULL, zio_priority, - zio_flags, zb)); - } - - /* do not relocate this block */ - return (0); -} - -int -dsl_pool_scrub_clean(dsl_pool_t *dp) -{ - spa_t *spa = dp->dp_spa; - - /* - * Purge all vdev caches and probe all devices. We do this here - * rather than in sync context because this requires a writer lock - * on the spa_config lock, which we can't do from sync context. The - * spa_scrub_reopen flag indicates that vdev_open() should not - * attempt to start another scrub. 
- */ - spa_vdev_state_enter(spa, SCL_NONE); - spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - (void) spa_vdev_state_exit(spa, NULL, 0); - - return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); -} diff --git a/usr/src/uts/common/fs/zfs/dsl_synctask.c b/usr/src/uts/common/fs/zfs/dsl_synctask.c index 81c6334cc8..832685b0fc 100644 --- a/usr/src/uts/common/fs/zfs/dsl_synctask.c +++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/dmu.h> @@ -29,7 +28,6 @@ #include <sys/dsl_dir.h> #include <sys/dsl_synctask.h> #include <sys/metaslab.h> -#include <sys/cred.h> #define DST_AVG_BLKSHIFT 14 @@ -49,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp) list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), offsetof(dsl_sync_task_t, dst_node)); dstg->dstg_pool = dp; - dstg->dstg_cr = CRED(); return (dstg); } @@ -136,7 +133,6 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) uint64_t txg; dstg->dstg_nowaiter = B_TRUE; - dstg->dstg_cr = NULL; /* it won't be valid by the time we sync */ txg = dmu_tx_get_txg(tx); /* * We don't generally have many sync tasks, so pay the price of @@ -200,8 +196,7 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) */ for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, - dstg->dstg_cr, tx); + dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); } } rw_exit(&dp->dp_config_rwlock); diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c index f1b3b23fe2..8358b4ceeb 100644 --- a/usr/src/uts/common/fs/zfs/refcount.c +++ b/usr/src/uts/common/fs/zfs/refcount.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * 
Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/refcount.h> diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index dd0fbccc7e..510472a515 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -59,6 +59,7 @@ #include <sys/systeminfo.h> #include <sys/spa_boot.h> #include <sys/zfs_ioctl.h> +#include <sys/dsl_scan.h> #ifdef _KERNEL #include <sys/bootprops.h> @@ -110,7 +111,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; -static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); +static dsl_syncfunc_t spa_sync_props; static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, @@ -1105,7 +1106,7 @@ spa_load_spares(spa_t *spa) KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, - spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); + spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) @@ -1231,7 +1232,7 @@ spa_load_l2cache(spa_t *spa) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, - sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); + sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: @@ -1429,7 +1430,7 @@ spa_load_verify_done(zio_t *zio) /*ARGSUSED*/ static int 
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { if (bp != NULL) { zio_t *rio = arg; @@ -1780,6 +1781,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) @@ -1851,6 +1853,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, + &spa->spa_creation_version); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* * Load the persistent error log. If we have an older pool, this will * not be present. @@ -2076,7 +2083,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Check all DTLs to see if anything needs resilvering. 
*/ - if (vdev_resilver_needed(rvd, NULL, NULL)) + if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(rvd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* @@ -2375,7 +2383,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config) if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( - spares[i], ZPOOL_CONFIG_STATS, + spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; @@ -2432,7 +2440,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], - ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); vdev_get_stats(vd, vs); } } @@ -2819,6 +2828,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, + sizeof (uint64_t), 1, &version, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool version"); + } + /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; @@ -2861,7 +2876,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(spa, props, CRED(), tx); + spa_sync_props(spa, props, tx); } dmu_tx_commit(tx); @@ -3219,8 +3234,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) return (error); } - spa_async_resume(spa); - /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. 
@@ -3271,6 +3284,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } + spa_async_resume(spa); + /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. @@ -3455,7 +3470,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; - spa->spa_final_txg = spa_last_synced_txg(spa) + 1; + spa->spa_final_txg = spa_last_synced_txg(spa) + + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } @@ -3635,7 +3651,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { - uint64_t txg, open_txg; + uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; @@ -3771,13 +3787,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_config_dirty(tvd); /* - * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate - * upward when spa_vdev_exit() calls vdev_dtl_reassess(). + * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account + * for any dmu_sync-ed blocks. It will propagate upward when + * spa_vdev_exit() calls vdev_dtl_reassess(). 
*/ - open_txg = txg + TXG_CONCURRENT_STATES - 1; + dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, open_txg - TXG_INITIAL + 1); + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -3793,10 +3810,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_dirty(tvd, VDD_DTL, newvd, txg); - (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + /* + * Restart the resilver + */ + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + + /* + * Commit the config + */ + (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); - spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, - CRED(), "%s vdev=%s %s vdev=%s", + spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, + "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); @@ -3804,11 +3829,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) spa_strfree(oldvdpath); spa_strfree(newvdpath); - /* - * Kick off a resilver to update newvd. 
- */ - VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); - return (0); } @@ -4004,7 +4024,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) error = spa_vdev_exit(spa, vd, txg, 0); - spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), + spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, "vdev=%s", vdpath); spa_strfree(vdpath); @@ -4024,7 +4044,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + (void) spa_vdev_remove(spa, unspare_guid, + B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } @@ -4266,8 +4287,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, if (vml[c] != NULL) { vdev_split(vml[c]); if (error == 0) - spa_history_internal_log(LOG_POOL_VDEV_DETACH, - spa, tx, CRED(), "vdev=%s", + spa_history_log_internal(LOG_POOL_VDEV_DETACH, + spa, tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } @@ -4283,7 +4304,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ - spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), + spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, "split new pool %s from pool %s", newname, spa_name(spa)); kmem_free(vml, children * sizeof (vdev_t *)); @@ -4359,21 +4380,13 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, } /* - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. - */ - -/* * Evacuate the device. 
*/ -int +static int spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) { - int error = 0; uint64_t txg; + int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); @@ -4386,14 +4399,12 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) * should no longer have any blocks allocated on it. */ if (vd->vdev_islog) { - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN); + if (vd->vdev_stat.vs_alloc != 0) + error = spa_offline_log(spa); } else { - error = ENOTSUP; /* until we have bp rewrite */ + error = ENOTSUP; } - txg_wait_synced(spa_get_dsl(spa), 0); - if (error) return (error); @@ -4401,6 +4412,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ + ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); txg = spa_vdev_config_enter(spa); vd->vdev_removing = B_TRUE; vdev_dirty(vd, 0, NULL, txg); @@ -4413,7 +4425,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) /* * Complete the removal by cleaning up the namespace. */ -void +static void spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; @@ -4424,6 +4436,12 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vd == vd->vdev_top); + /* + * Only remove any devices which are empty. + */ + if (vd->vdev_stat.vs_alloc != 0) + return; + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) @@ -4439,15 +4457,19 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); } - vdev_config_dirty(rvd); - - /* - * Reassess the health of our root vdev. 
- */ - vdev_reopen(rvd); } /* + * Remove a device from the pool - + * + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. + */ + +/* * Remove a device from the pool. Currently, this supports removing only hot * spares, slogs, and level 2 ARC devices. */ @@ -4690,40 +4712,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) /* * ========================================================================== - * SPA Scrubbing + * SPA Scanning * ========================================================================== */ int -spa_scrub(spa_t *spa, pool_scrub_type_t type) +spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + if (dsl_scan_resilvering(spa->spa_dsl_pool)) + return (EBUSY); + return (dsl_scan_cancel(spa->spa_dsl_pool)); +} - if ((uint_t)type >= POOL_SCRUB_TYPES) +int +spa_scan(spa_t *spa, pool_scan_func_t func) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + + if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (ENOTSUP); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. 
*/ - if (type == POOL_SCRUB_RESILVER && + if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } - if (type == POOL_SCRUB_EVERYTHING && - spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && - spa->spa_dsl_pool->dp_scrub_isresilver) - return (EBUSY); - - if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { - return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); - } else if (type == POOL_SCRUB_NONE) { - return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); - } else { - return (EINVAL); - } + return (dsl_scan(spa->spa_dsl_pool, func)); } /* @@ -4829,8 +4849,8 @@ spa_async_thread(spa_t *spa) * then log an internal history event. */ if (new_space != old_space) { - spa_history_internal_log(LOG_POOL_VDEV_ONLINE, - spa, NULL, CRED(), + spa_history_log_internal(LOG_POOL_VDEV_ONLINE, + spa, NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), new_space, new_space - old_space); } @@ -4874,7 +4894,7 @@ spa_async_thread(spa_t *spa) * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); + dsl_resilver_restart(spa->spa_dsl_pool, 0); /* * Let the world know that we're done. 
@@ -4920,6 +4940,7 @@ spa_async_dispatch(spa_t *spa) void spa_async_request(spa_t *spa, int task) { + zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); @@ -5024,7 +5045,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], - B_FALSE, B_FALSE, B_TRUE); + B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) @@ -5064,7 +5085,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) * Set zpool properties. */ static void -spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; @@ -5176,8 +5197,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* log internal history if this is not a zpool create */ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && tx->tx_txg != TXG_INITIAL) { - spa_history_internal_log(LOG_POOL_PROPSET, - spa, tx, cr, "%s %lld %s", + spa_history_log_internal(LOG_POOL_PROPSET, + spa, tx, "%s %lld %s", nvpair_name(elem), intval, spa_name(spa)); } } @@ -5272,13 +5293,17 @@ spa_sync(spa_t *spa, uint64_t txg) } /* - * If anything has changed in this txg, push the deferred frees - * from the previous txg. If not, leave them alone so that we - * don't generate work on an otherwise idle system. + * If anything has changed in this txg, or if someone is waiting + * for this txg to sync (eg, spa_vdev_remove()), push the + * deferred frees from the previous txg. If not, leave them + * alone so that we don't generate work on an otherwise idle + * system. 
*/ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || !txg_list_empty(&dp->dp_dirty_dirs, txg) || - !txg_list_empty(&dp->dp_sync_tasks, txg)) + !txg_list_empty(&dp->dp_sync_tasks, txg) || + ((dp->dp_scan->scn_phys.scn_state == DSS_SCANNING || + txg_sync_waiting(dp)) && !spa_shutting_down(spa))) spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); /* @@ -5304,11 +5329,7 @@ spa_sync(spa_t *spa, uint64_t txg) } ddt_sync(spa, txg); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - mutex_exit(&spa->spa_scrub_lock); + dsl_scan_sync(dp, tx); while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) vdev_sync(vd, txg); diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index 68a40bec89..cdeda3f93c 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/spa.h> @@ -419,7 +418,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) split_guid) == 0); } - nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); + nvroot = vdev_config_generate(spa, vd, getstats, 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c index 4c834e2d4e..282140b3bd 100644 --- a/usr/src/uts/common/fs/zfs/spa_errlog.c +++ b/usr/src/uts/common/fs/zfs/spa_errlog.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -54,36 +53,6 @@ #include <sys/zap.h> #include <sys/zio.h> -/* - * This is a stripped-down version of strtoull, suitable only for converting - * lowercase hexidecimal numbers that don't overflow. - */ -uint64_t -strtonum(const char *str, char **nptr) -{ - uint64_t val = 0; - char c; - int digit; - - while ((c = *str) != '\0') { - if (c >= '0' && c <= '9') - digit = c - '0'; - else if (c >= 'a' && c <= 'f') - digit = 10 + c - 'a'; - else - break; - - val *= 16; - val += digit; - - str++; - } - - if (nptr) - *nptr = (char *)str; - - return (val); -} /* * Convert a bookmark to a string. diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c index 18d4836bc7..212abae5b8 100644 --- a/usr/src/uts/common/fs/zfs/spa_history.c +++ b/usr/src/uts/common/fs/zfs/spa_history.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <sys/spa.h> @@ -33,6 +32,7 @@ #include <sys/utsname.h> #include <sys/cmn_err.h> #include <sys/sunddi.h> +#include "zfs_comutil.h" #ifdef _KERNEL #include <sys/zone.h> #endif @@ -189,7 +189,7 @@ spa_history_zone() */ /*ARGSUSED*/ static void -spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; history_arg_t *hap = arg2; @@ -244,6 +244,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) hap->ha_log_type == LOG_CMD_NORMAL) { VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0); + + zfs_dbgmsg("command: %s", history_str); } else { VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, hap->ha_event) == 0); @@ -251,6 +253,11 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) tx->tx_txg) == 0); VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, history_str) == 0); + + zfs_dbgmsg("internal %s pool:%s txg:%llu %s", + zfs_history_event_names[hap->ha_event], spa_name(spa), + (longlong_t)tx->tx_txg, history_str); + } VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); @@ -418,7 +425,7 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) static void log_internal(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx) + dmu_tx_t *tx, const char *fmt, va_list adx) { history_arg_t *ha; @@ -441,7 +448,7 @@ log_internal(history_internal_events_t event, spa_t *spa, ha->ha_uid = 0; if (dmu_tx_is_syncing(tx)) { - spa_history_log_sync(spa, ha, cr, tx); + spa_history_log_sync(spa, ha, tx); } else { dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, spa_history_log_sync, spa, ha, 0, tx); @@ -450,8 +457,8 @@ log_internal(history_internal_events_t event, spa_t *spa, } void -spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) 
+spa_history_log_internal(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, const char *fmt, ...) { dmu_tx_t *htx = tx; va_list adx; @@ -466,7 +473,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, } va_start(adx, fmt); - log_internal(event, spa, htx, cr, fmt, adx); + log_internal(event, spa, htx, fmt, adx); va_end(adx); /* if we didn't get a tx from the caller, commit the one we made */ @@ -481,7 +488,7 @@ spa_history_log_version(spa_t *spa, history_internal_events_t event) uint64_t current_vers = spa_version(spa); if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { - spa_history_internal_log(event, spa, NULL, CRED(), + spa_history_log_internal(event, spa, NULL, "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, utsname.nodename, utsname.release, utsname.version, diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index c815cd6113..8faa84a1b0 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -40,6 +40,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_dir.h> #include <sys/dsl_prop.h> +#include <sys/dsl_scan.h> #include <sys/fs/zfs.h> #include <sys/metaslab_impl.h> #include <sys/arc.h> @@ -888,10 +889,10 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); /* - * If the config changed, notify the scrub thread that it must restart. + * If the config changed, notify the scrub that it must restart. + * This will initiate a resilver if needed. */ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { - dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; spa->spa_config_generation++; } @@ -1078,7 +1079,6 @@ spa_rename(const char *name, const char *newname) return (0); } - /* * Determine whether a pool with given pool_guid exists. 
If device_guid is * non-zero, determine whether the pool exists *and* contains a device with the @@ -1209,6 +1209,37 @@ zfs_panic_recover(const char *fmt, ...) } /* + * This is a stripped-down version of strtoull, suitable only for converting + * lowercase hexidecimal numbers that don't overflow. + */ +uint64_t +strtonum(const char *str, char **nptr) +{ + uint64_t val = 0; + char c; + int digit; + + while ((c = *str) != '\0') { + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (c >= 'a' && c <= 'f') + digit = 10 + c - 'a'; + else + break; + + val *= 16; + val += digit; + + str++; + } + + if (nptr) + *nptr = (char *)str; + + return (val); +} + +/* * ========================================================================== * Accessor functions * ========================================================================== @@ -1390,6 +1421,12 @@ spa_max_replication(spa_t *spa) return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } +int +spa_prev_software_version(spa_t *spa) +{ + return (spa->spa_prev_software_version); +} + uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { @@ -1584,3 +1621,45 @@ spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } + +/* + * Reset pool scan stat per scan pass (or reboot). + */ +void +spa_scan_stat_init(spa_t *spa) +{ + /* data not stored on disk */ + spa->spa_scan_pass_start = gethrestime_sec(); + spa->spa_scan_pass_exam = 0; + vdev_scan_stat_init(spa->spa_root_vdev); +} + +/* + * Get scan stats for zpool status reports + */ +int +spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) +{ + dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; + + if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + return (ENOENT); + bzero(ps, sizeof (pool_scan_stat_t)); + + /* data stored on disk */ + ps->pss_func = scn->scn_phys.scn_func; + ps->pss_start_time = scn->scn_phys.scn_start_time; + ps->pss_end_time = scn->scn_phys.scn_end_time; + ps->pss_to_examine = scn->scn_phys.scn_to_examine; + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_processed = scn->scn_phys.scn_processed; + ps->pss_errors = scn->scn_phys.scn_errors; + ps->pss_state = scn->scn_phys.scn_state; + + /* data not stored on disk */ + ps->pss_pass_start = spa->spa_scan_pass_start; + ps->pss_pass_exam = spa->spa_scan_pass_exam; + + return (0); +} diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index c528fac1a6..f0ad04ff49 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_ARC_H @@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; - krwlock_t b_lock; + kmutex_t b_evict_lock; + krwlock_t b_data_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; @@ -92,6 +92,8 @@ void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); +int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb); int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 54686ba32d..4c05806e3e 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DBUF_H @@ -278,6 +277,7 @@ void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, dmu_tx_t *tx); +void dbuf_release_bp(dmu_buf_impl_t *db); void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h index 5eab6a2fb2..bd446acafa 100644 --- a/usr/src/uts/common/fs/zfs/sys/ddt.h +++ b/usr/src/uts/common/fs/zfs/sys/ddt.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_DDT_H @@ -232,6 +231,8 @@ extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); +extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx); extern const ddt_ops_t ddt_zap_ops; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 9c9369f8a7..7df5d48321 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -118,7 +118,7 @@ typedef enum dmu_object_type { DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ - DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ @@ -128,6 +128,8 @@ typedef enum dmu_object_type { DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ + DMU_OT_SCAN_XLATE, /* ZAP */ + DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -220,23 +222,8 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" - -/* 4x8 zbookmark_t */ -#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" -/* 4x8 ddt_bookmark_t */ -#define DMU_POOL_SCRUB_DDT_BOOKMARK "scrub_ddt_bookmark" -/* 1x8 max_class */ -#define DMU_POOL_SCRUB_DDT_CLASS_MAX "scrub_ddt_class_max" -/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ -#define DMU_POOL_SCRUB_QUEUE "scrub_queue" -/* 1x8 txg */ -#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg" -/* 1x8 txg */ -#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg" -/* 1x4 enum scrub_func */ -#define DMU_POOL_SCRUB_FUNC "scrub_func" -/* 1x8 count */ -#define DMU_POOL_SCRUB_ERRORS 
"scrub_errors" +#define DMU_POOL_CREATION_VERSION "creation_version" +#define DMU_POOL_SCAN "scan" /* * Allocate an object from this objset. The range of object numbers diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index ef1782b74d..5c5119a207 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -46,6 +46,9 @@ struct dmu_tx; #define OBJSET_PHYS_SIZE 2048 #define OBJSET_OLD_PHYS_SIZE 1024 +#define OBJSET_BUF_HAS_USERUSED(buf) \ + (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) + #define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h index 5b0821253d..844e7f1aeb 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -37,9 +36,11 @@ extern "C" { struct dnode_phys; struct dsl_dataset; struct zilog; +struct arc_buf; typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); + struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp, + void *arg); #define TRAVERSE_PRE (1<<0) #define TRAVERSE_POST (1<<1) diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index 6eb7505ea5..a21971679c 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. */ #ifndef _SYS_DSL_DATASET_H @@ -113,7 +112,6 @@ typedef struct dsl_dataset { /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; - uint64_t ds_origin_txg; /* has internal locking: */ bplist_t ds_deadlist; @@ -235,8 +233,7 @@ int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t *ref_rsrv); int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota); -void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, - dmu_tx_t *tx); +dsl_syncfunc_t dsl_dataset_set_quota_sync; int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, uint64_t reservation); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h index 14a64e019e..327cda6eec 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_DIR_H @@ -90,6 +89,7 @@ struct dsl_dir { kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ + uint64_t dd_origin_txg; /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; @@ -142,6 +142,7 @@ timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" #define ORIGIN_DIR_NAME "$ORIGIN" +#define XLATION_DIR_NAME "$XLATION" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) 
do { \ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h index 97541ad2f1..187040e700 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -32,6 +32,7 @@ #include <sys/zio.h> #include <sys/dnode.h> #include <sys/ddt.h> +#include <sys/arc.h> #ifdef __cplusplus extern "C" { @@ -42,12 +43,7 @@ struct dsl_dir; struct dsl_dataset; struct dsl_pool; struct dmu_tx; - -enum scrub_func { - SCRUB_FUNC_NONE, - SCRUB_FUNC_CLEAN, - SCRUB_FUNC_NUMFUNCS -}; +struct dsl_scan; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE @@ -87,25 +83,13 @@ typedef struct dsl_pool { uint64_t dp_write_limit; uint64_t dp_tmp_userrefs_obj; + struct dsl_scan *dp_scan; + /* Uses dp_lock */ kmutex_t dp_lock; uint64_t dp_space_towrite[TXG_SIZE]; uint64_t dp_tempreserved[TXG_SIZE]; - enum scrub_func dp_scrub_func; - uint64_t dp_scrub_queue_obj; - uint64_t dp_scrub_min_txg; - uint64_t dp_scrub_max_txg; - uint64_t dp_scrub_start_time; - uint64_t dp_scrub_ddt_class_max; - zbookmark_t dp_scrub_bookmark; - ddt_bookmark_t dp_scrub_ddt_bookmark; - boolean_t dp_scrub_pausing; - boolean_t dp_scrub_isresilver; - boolean_t dp_scrub_restart; - kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ - zio_t *dp_scrub_prefetch_zio_root; - /* Has its own locking */ tx_state_t dp_tx; txg_list_t dp_dirty_datasets; @@ -138,20 +122,15 @@ void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); -void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, - struct dmu_tx *tx); +int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, 
arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); +int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); -int dsl_pool_scrub_cancel(dsl_pool_t *dp); -int dsl_pool_scrub_clean(dsl_pool_t *dp); -void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_scrub_restart(dsl_pool_t *dp); -void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, - const ddt_entry_t *dde); - taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, @@ -159,6 +138,7 @@ extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx); extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h index d8a8ab2d64..a636ad3509 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_DSL_PROP_H @@ -91,7 +90,7 @@ int dsl_prop_set(const char *ddname, const char *propname, zprop_source_t source, int intsz, int numints, const void *buf); int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - cred_t *cr, dmu_tx_t *tx); + dmu_tx_t *tx); void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, zprop_source_t source, uint64_t *value); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h new file mode 100644 index 0000000000..c0eaa49567 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_DSL_SCAN_H +#define _SYS_DSL_SCAN_H + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/bplist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +/* + * All members of this structure must be uint64_t, for byteswap + * purposes. + */ +typedef struct dsl_scan_phys { + uint64_t scn_func; /* pool_scan_func_t */ + uint64_t scn_state; /* dsl_scan_state_t */ + uint64_t scn_queue_obj; + uint64_t scn_min_txg; + uint64_t scn_max_txg; + uint64_t scn_cur_min_txg; + uint64_t scn_cur_max_txg; + uint64_t scn_start_time; + uint64_t scn_end_time; + uint64_t scn_to_examine; /* total bytes to be scanned */ + uint64_t scn_examined; /* bytes scanned so far */ + uint64_t scn_to_process; + uint64_t scn_processed; + uint64_t scn_errors; /* scan I/O error count */ + uint64_t scn_ddt_class_max; + ddt_bookmark_t scn_ddt_bookmark; + zbookmark_t scn_bookmark; + uint64_t scn_flags; /* dsl_scan_flags_t */ +} dsl_scan_phys_t; + +#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) + +typedef enum dsl_scan_flags { + DSF_VISIT_DS_AGAIN = 1<<0, +} dsl_scan_flags_t; + +typedef struct dsl_scan { + struct dsl_pool *scn_dp; + + boolean_t scn_pausing; + uint64_t scn_restart_txg; + uint64_t scn_sync_start_time; + zio_t *scn_prefetch_zio_root; + + /* for debugging / information */ + uint64_t scn_visited_this_txg; + + dsl_scan_phys_t scn_phys; +} dsl_scan_t; + +int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +void dsl_scan_fini(struct dsl_pool *dp); +void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); +int dsl_scan_cancel(struct dsl_pool *); +int dsl_scan(struct dsl_pool *, pool_scan_func_t); +void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); +boolean_t dsl_scan_resilvering(struct dsl_pool *dp); +boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); +void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + 
ddt_entry_t *dde, dmu_tx_t *tx); +void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, + struct dmu_tx *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SCAN_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h b/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h index 4995bfe5ac..9126290cdb 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_SYNCTASK_H #define _SYS_DSL_SYNCTASK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/txg.h> #include <sys/zfs_context.h> @@ -38,7 +35,7 @@ extern "C" { struct dsl_pool; typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); typedef struct dsl_sync_task { list_node_t dst_node; @@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group { txg_node_t dstg_node; list_t dstg_tasks; struct dsl_pool *dstg_pool; - cred_t *dstg_cr; uint64_t dstg_txg; int dstg_err; int dstg_space; diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index 5ce6251ddb..583d6303bd 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_METASLAB_H diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h index d3fe7b1f89..bc3ade80f1 100644 --- a/usr/src/uts/common/fs/zfs/sys/refcount.h +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_REFCOUNT_H #define _SYS_REFCOUNT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/inttypes.h> #include <sys/list.h> #include <sys/zfs_context.h> @@ -91,6 +88,11 @@ typedef struct refcount { atomic_add_64_nv(&(rc)->rc_count, number) #define refcount_remove_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, -number) +#define refcount_transfer(dst, src) { \ + uint64_t __tmp = (src)->rc_count; \ + atomic_add_64(&(src)->rc_count, -__tmp); \ + atomic_add_64(&(dst)->rc_count, __tmp); \ +} #define refcount_init() #define refcount_fini() diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index a26a55cb42..4cc9244cd0 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SPA_H @@ -262,7 +261,7 @@ typedef struct blkptr { #define BP_GET_UCSIZE(bp) \ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? 
\ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ @@ -432,6 +431,8 @@ extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); +extern void spa_scan_stat_init(spa_t *spa); +extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 @@ -439,6 +440,14 @@ extern void spa_inject_delref(spa_t *spa); #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 +#define SPA_ASYNC_REMOVE_DONE 0x40 +#define SPA_ASYNC_REMOVE_STOP 0x80 + +/* + * Controls the behavior of spa_vdev_remove(). + */ +#define SPA_REMOVE_UNSPARE 0x01 +#define SPA_REMOVE_DONE 0x02 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); @@ -447,6 +456,7 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, @@ -465,14 +475,19 @@ extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); -/* scrubbing */ -extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); +/* scanning */ +extern int spa_scan(spa_t *spa, pool_scan_func_t func); +extern int spa_scan_stop(spa_t *spa); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); 
-#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ +/* + * DEFERRED_FREE must be large enough that regular blocks are not + * deferred. XXX so can't we change it back to 1? + */ +#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */ #define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ #define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ @@ -577,6 +592,7 @@ extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern int spa_max_replication(spa_t *spa); +extern int spa_prev_software_version(spa_t *spa); extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t spa_suspended(spa_t *spa); @@ -632,8 +648,8 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, history_log_type_t what); -extern void spa_history_internal_log(history_internal_events_t event, - spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_log_internal(history_internal_events_t event, + spa_t *spa, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); /* error handling */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 9daec092b4..f857f733d2 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_SPA_IMPL_H @@ -150,13 +149,14 @@ struct spa { kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ - uint64_t spa_scrub_errors; /* scrub I/O error count */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ + uint64_t spa_scan_pass_start; /* start time per pass/reboot */ + uint64_t spa_scan_pass_exam; /* examined bytes per pass */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ @@ -212,7 +212,8 @@ struct spa { uint64_t spa_did; /* if procp != p0, did of t1 */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ - + uint64_t spa_creation_version; /* version at pool creation */ + uint64_t spa_prev_software_version; /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h index c135df9b10..6ab6aa3135 100644 --- a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_UBERBLOCK_IMPL_H @@ -52,6 +51,9 @@ struct uberblock { uint64_t ub_guid_sum; /* sum of all vdev guids */ uint64_t ub_timestamp; /* UTC time of last sync */ blkptr_t ub_rootbp; /* MOS objset_phys_t */ + + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; }; #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index b37516a984..941f234dc6 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_VDEV_H @@ -83,8 +82,7 @@ extern void vdev_split(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); -extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, - boolean_t complete); +extern void vdev_scan_stat_init(vdev_t *vd); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); @@ -126,9 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); +typedef enum vdev_config_flag { + VDEV_CONFIG_SPARE = 1 << 0, + VDEV_CONFIG_L2CACHE = 1 << 1, + VDEV_CONFIG_REMOVING = 1 << 2 +} vdev_config_flag_t; + extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, boolean_t isspare, boolean_t isl2cache); + boolean_t getstats, vdev_config_flag_t flags); /* * Label routines diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 
f78ec5084e..2b886bc588 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -151,7 +151,7 @@ struct vdev { txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ - boolean_t vdev_removing; /* device is being removed? */ + uint64_t vdev_removing; /* device is being removed? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h index 3b9de2a2f9..b44fb8fbba 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap.h +++ b/usr/src/uts/common/fs/zfs/sys/zap.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_H @@ -259,7 +258,6 @@ int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, */ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); - /* * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string @@ -276,6 +274,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj, */ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); +/* Same as zap_join, but set the values to 'value'. */ +int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx); + +/* Same as zap_join, but add together any duplicated entries. */ +int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx); + /* * Manipulate entries where the name + value are the "same" (the name is * a stringified version of the value). 
@@ -286,6 +292,21 @@ int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx); +/* Here the key is an int and the value is a different int. */ +int zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t *valuep); + +/* + * The name is a stringified version of key; increment its value by + * delta. Zero values will be zap_remove()-ed. + */ +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); +int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx); + struct zap; struct zap_leaf; typedef struct zap_cursor { diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h index 5aa0efc98d..739a380b75 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_IMPL_H @@ -67,9 +66,12 @@ typedef struct mzap_ent { avl_node_t mze_node; int mze_chunkid; uint64_t mze_hash; - mzap_ent_phys_t mze_phys; + uint32_t mze_cd; /* copy from mze_phys->mze_cd */ } mzap_ent_t; +#define MZE_PHYS(zap, mze) \ + (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid]) + /* * The (fat) zap is stored in one object. It is an array of * 1<<FZAP_BLOCK_SHIFT byte blocks.
The layout looks like one of: diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h index 173b6b195e..3a33636741 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h +++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h @@ -19,13 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_LEAF_H #define _SYS_ZAP_LEAF_H +#include <sys/zap.h> + #ifdef __cplusplus extern "C" { #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h index 450ac1c81b..50ecf9b362 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H #define _SYS_ZFS_DEBUG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func, extern void zfs_panic_recover(const char *fmt, ...); +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +extern void zfs_dbgmsg_init(void); +extern void zfs_dbgmsg_fini(void); +extern void zfs_dbgmsg(const char *fmt, ...); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index f5884f2925..991b9f2f59 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -28,6 +27,7 @@ #include <sys/dmu_impl.h> #include <sys/dmu_tx.h> #include <sys/dsl_pool.h> +#include <sys/dsl_scan.h> #include <sys/callb.h> /* @@ -136,7 +136,7 @@ txg_sync_start(dsl_pool_t *dp) * 32-bit x86. This is due in part to nested pools and * scrub_visitbp() recursion. */ - tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread, + tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, dp, 0, &p0, TS_RUN, minclsyspri); mutex_exit(&tx->tx_sync_lock); @@ -366,12 +366,12 @@ txg_sync_thread(dsl_pool_t *dp) uint64_t txg; /* - * We sync when we're scrubbing, there's someone waiting + * We sync when we're scanning, there's someone waiting * on us, or the quiesce thread has handed off a txg to * us, or we have reached our timeout. */ timer = (delta >= timeout ? 0 : timeout - delta); - while ((dp->dp_scrub_func == SCRUB_FUNC_NONE || + while ((dp->dp_scan->scn_phys.scn_state != DSS_SCANNING || spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa)) && !tx->tx_exiting && timer > 0 && diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c index 34d7e0c3ac..692cda137f 100644 --- a/usr/src/uts/common/fs/zfs/uberblock.c +++ b/usr/src/uts/common/fs/zfs/uberblock.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/uberblock_impl.h> #include <sys/vdev_impl.h> @@ -58,6 +55,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) ub->ub_txg = txg; ub->ub_guid_sum = rvd->vdev_guid_sum; ub->ub_timestamp = gethrestime_sec(); + ub->ub_software_version = SPA_VERSION; return (ub->ub_rootbp.blk_birth == txg); } diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 2b77c45574..a61f29b8e7 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -39,6 +39,7 @@ #include <sys/fs/zfs.h> #include <sys/arc.h> #include <sys/zil.h> +#include <sys/dsl_scan.h> /* * Virtual device management. @@ -486,6 +487,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, + &vd->vdev_removing); } if (parent && !parent->vdev_parent) { @@ -860,7 +863,12 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); - if (oldc == 0) + /* + * If the vdev is being removed we don't activate + * the metaslabs since we want to ensure that no new + * allocations are performed on this device. + */ + if (oldc == 0 && !vd->vdev_removing) metaslab_group_activate(vd->vdev_mg); if (txg == 0) @@ -1648,9 +1656,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) return; if (vd->vdev_ops->vdev_op_leaf) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && - (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { + (spa->spa_scrub_started || + (scn && scn->scn_phys.scn_errors == 0))) { /* * We completed a scrub up to scrub_txg. 
If we * did it without rebooting, then the scrub dtl @@ -2029,7 +2040,10 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } - if (vd->vdev_removing) + /* + * Remove the metadata associated with this vdev once it's empty. + */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove(vd, txg); while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { @@ -2403,7 +2417,7 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing); + !vd->vdev_cant_write && !vd->vdev_ishole); } boolean_t @@ -2433,7 +2447,6 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); @@ -2455,7 +2468,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } - vs->vs_scrub_examined += cvs->vs_scrub_examined; + cvs->vs_scan_removing = cvd->vdev_removing; mutex_exit(&vd->vdev_stat_lock); } } @@ -2472,6 +2485,19 @@ vdev_clear_stats(vdev_t *vd) } void +vdev_scan_stat_init(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_scan_stat_init(vd->vdev_child[c]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_scan_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; @@ -2515,8 +2541,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += psize; + if (flags & ZIO_FLAG_SCRUB_THREAD) { + dsl_scan_phys_t *scn_phys = + &spa->spa_dsl_pool->dp_scan->scn_phys; + 
uint64_t *processed = &scn_phys->scn_processed; + + /* XXX cleanup? */ + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(processed, psize); + vs->vs_scan_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -2602,35 +2637,6 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } -void -vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) -{ - vdev_stat_t *vs = &vd->vdev_stat; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_scrub_stat_update(vd->vdev_child[c], type, complete); - - mutex_enter(&vd->vdev_stat_lock); - - if (type == POOL_SCRUB_NONE) { - /* - * Update completion and end time. Leave everything else alone - * so we can report what happened during the previous scrub. - */ - vs->vs_scrub_complete = complete; - vs->vs_scrub_end = gethrestime_sec(); - } else { - vs->vs_scrub_type = type; - vs->vs_scrub_complete = 0; - vs->vs_scrub_examined = 0; - vs->vs_scrub_repaired = 0; - vs->vs_scrub_start = gethrestime_sec(); - vs->vs_scrub_end = 0; - } - - mutex_exit(&vd->vdev_stat_lock); -} - /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. @@ -2730,7 +2736,7 @@ vdev_config_dirty(vdev_t *vd) * sketchy, but it will work. */ nvlist_free(aux[c]); - aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index d11b3df7c6..75ec545345 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -141,6 +140,7 @@ #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/zio.h> +#include <sys/dsl_scan.h> #include <sys/fs/zfs.h> /* @@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - boolean_t isspare, boolean_t isl2cache) + vdev_config_flag_t flags) { nvlist_t *nv = NULL; @@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); - if (!isspare && !isl2cache) + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); @@ -270,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); - if (!isspare && !isl2cache && vd == vd->vdev_top) { + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && + vd == vd->vdev_top) { VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -281,6 +282,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_asize) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog) == 0); + if (vd->vdev_removing) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, + vd->vdev_removing) == 0); } if (vd->vdev_dtl_smo.smo_object != 0) @@ -293,28 +297,52 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (getstats) { vdev_stat_t vs; + pool_scan_stat_t ps; + vdev_get_stats(vd, &vs); - VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS, + VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); + + /* provide either current or previous scan 
information */ + if (spa_scan_get_stats(spa, &ps) == 0) { + VERIFY(nvlist_add_uint64_array(nv, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)) + == 0); + } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; - int c; + int c, idx; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - for (c = 0; c < vd->vdev_children; c++) - child[c] = vdev_config_generate(spa, vd->vdev_child[c], - getstats, isspare, isl2cache); + for (c = 0, idx = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + /* + * If we're generating an nvlist of removing + * vdevs then skip over any device which is + * not being removed. + */ + if ((flags & VDEV_CONFIG_REMOVING) && + !cvd->vdev_removing) + continue; - VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, vd->vdev_children) == 0); + child[idx++] = vdev_config_generate(spa, cvd, + getstats, flags); + } + + if (idx) { + VERIFY(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx) == 0); + } - for (c = 0; c < vd->vdev_children; c++) + for (c = 0; c < idx; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); @@ -375,12 +403,11 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) { vdev_t *rvd = spa->spa_root_vdev; uint64_t *array; - uint_t idx; + uint_t c, idx; array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); - idx = 0; - for (int c = 0; c < rvd->vdev_children; c++) { + for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ishole) diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 30415c8abb..4b0f5602c1 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -1604,6 +1603,7 @@ vdev_raidz_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } + /* * Report a checksum error for a child of a RAID-Z device. */ diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c index 6be63ef6b0..df5bd46739 100644 --- a/usr/src/uts/common/fs/zfs/zap.c +++ b/usr/src/uts/common/fs/zfs/zap.c @@ -978,6 +978,56 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) } int +zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + err = zap_add(os, intoobj, za.za_name, + 8, 1, &value, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int +zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + uint64_t delta = 0; + + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + + err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); + if (err != 0 && err != ENOENT) + return (err); + delta += za.za_first_integer; + err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; @@ -1005,17 +1055,34 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) } int -zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx) +zap_add_int_key(objset_t *os, 
uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) { char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_lookup(os, obj, name, 8, 1, valuep)); +} + +int +zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx) +{ uint64_t value = 0; int err; if (delta == 0) return (0); - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); @@ -1027,6 +1094,15 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, return (err); } +int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_increment(os, obj, name, delta, tx)); +} /* * Routines for iterating over the attributes. @@ -1101,7 +1177,6 @@ again: return (err); } - static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { diff --git a/usr/src/uts/common/fs/zfs/zap_leaf.c b/usr/src/uts/common/fs/zfs/zap_leaf.c index 285d9c5674..19a795db82 100644 --- a/usr/src/uts/common/fs/zfs/zap_leaf.c +++ b/usr/src/uts/common/fs/zfs/zap_leaf.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -37,6 +36,7 @@ #include <sys/zap.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> +#include <sys/arc.h> static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); @@ -538,14 +538,6 @@ zap_entry_update(zap_entry_handle_t *zeh, if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) return (EAGAIN); - /* - * We should search other chained leaves (via - * zap_entry_remove,create?) otherwise returning EAGAIN will - * just send us into an infinite loop if we have to chain - * another leaf block, rather than being able to split this - * block. - */ - zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c index 2de5812fa2..3fc92ff122 100644 --- a/usr/src/uts/common/fs/zfs/zap_micro.c +++ b/usr/src/uts/common/fs/zfs/zap_micro.c @@ -31,6 +31,7 @@ #include <sys/zap_impl.h> #include <sys/zap_leaf.h> #include <sys/avl.h> +#include <sys/arc.h> #ifdef _KERNEL #include <sys/sunddi.h> @@ -254,26 +255,26 @@ mze_compare(const void *arg1, const void *arg2) return (+1); if (mze1->mze_hash < mze2->mze_hash) return (-1); - if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) + if (mze1->mze_cd > mze2->mze_cd) return (+1); - if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) + if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) +mze_insert(zap_t *zap, int chunkid, uint64_t hash) { mzap_ent_t *mze; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(mzep->mze_cd < zap_maxcd(zap)); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; - mze->mze_phys = *mzep; + mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; + ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); avl_add(&zap->zap_m.zap_avl, mze); } @@ -289,14 +290,15 @@ mze_find(zap_name_t *zn) 
ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); mze_tofind.mze_hash = zn->zn_hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { - if (zap_match(zn, mze->mze_phys.mze_name)) + ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); + if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } if (zn->zn_matchtype == MT_BEST) { @@ -319,12 +321,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; cd = 0; for (mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (mze->mze_phys.mze_cd != cd) + if (mze->mze_cd != cd) break; cd++; } @@ -408,7 +410,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_m.zap_num_entries++; zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); - mze_insert(zap, i, zn->zn_hash, mze); + mze_insert(zap, i, zn->zn_hash); zap_name_free(zn); } } @@ -727,11 +729,11 @@ again: other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { - zn = zap_name_alloc(zap, mze->mze_phys.mze_name, + zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_FIRST); allocdzn = B_TRUE; } - if (zap_match(zn, other->mze_phys.mze_name)) { + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); @@ -793,9 +795,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, } else if (integer_size != 8) { err = EINVAL; } else { - *(uint64_t *)buf = mze->mze_phys.mze_value; + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; (void) strlcpy(realname, - mze->mze_phys.mze_name, rn_len); + MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); @@ -932,7 +935,7 @@ again: if 
(zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; - mze_insert(zap, i, zn->zn_hash, mze); + mze_insert(zap, i, zn->zn_hash); return; } } @@ -1017,10 +1020,20 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, { zap_t *zap; mzap_ent_t *mze; + uint64_t oldval; const uint64_t *intval = val; zap_name_t *zn; int err; +#ifdef ZFS_DEBUG + /* + * If there is an old value, it shouldn't change across the + * lockdir (eg, due to bprewrite's xlation). + */ + if (integer_size == 8 && num_integers == 1) + (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); +#endif + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); @@ -1044,9 +1057,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } else { mze = mze_find(zn); if (mze != NULL) { - mze->mze_phys.mze_value = *intval; - zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid].mze_value = *intval; + ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); + MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } @@ -1245,7 +1257,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) err = ENOENT; mze_tofind.mze_hash = zc->zc_hash; - mze_tofind.mze_phys.mze_cd = zc->zc_cd; + mze_tofind.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { @@ -1253,18 +1265,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) idx, AVL_AFTER); } if (mze) { - ASSERT(0 == bcmp(&mze->mze_phys, - &zc->zc_zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid], sizeof (mze->mze_phys))); - + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; - za->za_first_integer = mze->mze_phys.mze_value; - (void) strcpy(za->za_name, mze->mze_phys.mze_name); + za->za_first_integer = mzep->mze_value; + (void) 
strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; + zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; @@ -1313,7 +1323,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) goto out; } zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; + zc->zc_cd = mze->mze_cd; } out: diff --git a/usr/src/uts/common/fs/zfs/zfs_debug.c b/usr/src/uts/common/fs/zfs/zfs_debug.c new file mode 100644 index 0000000000..d0f411a993 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_debug.c @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include <sys/zfs_context.h> + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */ + +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +zfs_dbgmsg_fini(void) +{ + zfs_dbgmsg_t *zdm; + + while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { + int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_destroy(&zfs_dbgmsgs_lock); + ASSERT3U(zfs_dbgmsg_size, ==, 0); +} + +/* + * Print these messages by running: + * echo ::zfs_dbgmsg | mdb -k + * + * Monitor these messages by running: + * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + */ +void +zfs_dbgmsg(const char *fmt, ...) +{ + int size; + va_list adx; + zfs_dbgmsg_t *zdm; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx); + va_end(adx); + + /* + * There is one byte of string in sizeof (zfs_dbgmsg_t), used + * for the terminating null. 
+ */ + zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); + zdm->zdm_timestamp = gethrestime_sec(); + + va_start(adx, fmt); + (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); + va_end(adx); + + DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; + while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { + zdm = list_remove_head(&zfs_dbgmsgs); + size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_exit(&zfs_dbgmsgs_lock); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index d280125152..de5fb1e4ce 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/types.h> @@ -62,6 +61,7 @@ #include <sys/zfs_ctldir.h> #include <sys/zfs_dir.h> #include <sys/zvol.h> +#include <sys/dsl_scan.h> #include <sharefs/share.h> #include <sys/dmu_objset.h> @@ -117,7 +117,7 @@ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) 
{ const char *newfile; - char buf[256]; + char buf[512]; va_list adx; /* @@ -1237,8 +1237,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_cookie scan func (pool_scan_func_t) + */ static int -zfs_ioc_pool_scrub(zfs_cmd_t *zc) +zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; @@ -1246,7 +1251,10 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_scrub(spa, zc->zc_cookie); + if (zc->zc_cookie == POOL_SCAN_NONE) + error = spa_scan_stop(spa); + else + error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); @@ -1402,6 +1410,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_nvlist_conf nvlist of devices to remove + * zc_cookie to stop the remove? + */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { @@ -4250,7 +4264,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { B_FALSE }, { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE, + { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_TRUE }, { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, B_FALSE }, diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index da9163c963..f68dde85f8 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -560,34 +560,6 @@ unregister: } -static void -uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, - int64_t delta, dmu_tx_t *tx) -{ - uint64_t used = 0; - char buf[32]; - int err; - uint64_t obj = isgroup ? 
DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; - - if (delta == 0) - return; - - (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); - err = zap_lookup(os, obj, buf, 8, 1, &used); - - ASSERT(err == 0 || err == ENOENT); - /* no underflow/overflow */ - ASSERT(delta > 0 || used >= -delta); - ASSERT(delta < 0 || used + delta > used); - used += delta; - if (used == 0) - err = zap_remove(os, obj, buf, tx); - else - err = zap_update(os, obj, buf, 8, 1, &used, tx); - ASSERT(err == 0); - -} - static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) @@ -2239,9 +2211,8 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) sa_register_update_callback(os, zfs_sa_upgrade); } - spa_history_internal_log(LOG_DS_UPGRADE, - dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", + spa_history_log_internal(LOG_DS_UPGRADE, + dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", zfsvfs->z_version, newvers, dmu_objset_id(os)); dmu_tx_commit(tx); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index e4c435341f..4aa4d10b07 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -36,6 +36,7 @@ #include <sys/dsl_dataset.h> #include <sys/vdev.h> #include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> /* * The zfs intent log (ZIL) saves transaction records of system calls @@ -179,7 +180,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 4e481b16b7..181d07fbdb 100644 --- 
a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -661,6 +660,9 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; + dprintf_bp(bp, "freeing in txg %llu, pass %u", + (longlong_t)txg, spa->spa_sync_pass); + ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); @@ -2073,6 +2075,8 @@ zio_ddt_write(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } +ddt_entry_t *freedde; /* for debugging */ + static int zio_ddt_free(zio_t *zio) { @@ -2086,7 +2090,7 @@ zio_ddt_free(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + freedde = dde = ddt_lookup(ddt, bp, B_TRUE); ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); ddt_exit(ddt); diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 3481fb0586..7b4577a9f3 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -249,7 +249,7 @@ struct maparg { /*ARGSUSED*/ static int -zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, +zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct maparg *ma = arg; diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index a5d9122246..54dd8abec7 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -333,14 +333,15 @@ typedef enum { #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL +#define SPA_VERSION_25 25ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. 
Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_24 -#define SPA_VERSION_STRING "24" +#define SPA_VERSION SPA_VERSION_25 +#define SPA_VERSION_STRING "25" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -386,6 +387,7 @@ typedef enum { #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 +#define SPA_VERSION_SCAN SPA_VERSION_25 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -450,7 +452,8 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" -#define ZPOOL_CONFIG_STATS "stats" +#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" @@ -473,6 +476,7 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" +#define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ @@ -580,14 +584,14 @@ typedef enum pool_state { } pool_state_t; /* - * Scrub types. + * Scan Functions. */ -typedef enum pool_scrub_type { - POOL_SCRUB_NONE, - POOL_SCRUB_RESILVER, - POOL_SCRUB_EVERYTHING, - POOL_SCRUB_TYPES -} pool_scrub_type_t; +typedef enum pool_scan_func { + POOL_SCAN_NONE, + POOL_SCAN_SCRUB, + POOL_SCAN_RESILVER, + POOL_SCAN_FUNCS +} pool_scan_func_t; /* * ZIO types. Needed to interpret vdev statistics below. 
@@ -603,6 +607,36 @@ typedef enum zio_type { } zio_type_t; /* + * Pool statistics. Note: all fields should be 64-bit because this + * is passed between kernel and userland as an nvlist uint64 array. + */ +typedef struct pool_scan_stat { + /* values stored on disk */ + uint64_t pss_func; /* pool_scan_func_t */ + uint64_t pss_state; /* dsl_scan_state_t */ + uint64_t pss_start_time; /* scan start time */ + uint64_t pss_end_time; /* scan end time */ + uint64_t pss_to_examine; /* total bytes to scan */ + uint64_t pss_examined; /* total examined bytes */ + uint64_t pss_to_process; /* total bytes to process */ + uint64_t pss_processed; /* total processed bytes */ + uint64_t pss_errors; /* scan errors */ + + /* values not stored on disk */ + uint64_t pss_pass_exam; /* examined bytes per scan pass */ + uint64_t pss_pass_start; /* start time of a scan pass */ +} pool_scan_stat_t; + +typedef enum dsl_scan_state { + DSS_NONE, + DSS_SCANNING, + DSS_FINISHED, + DSS_CANCELED, + DSS_NUM_STATES +} dsl_scan_state_t; + + +/* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ @@ -620,13 +654,8 @@ typedef struct vdev_stat { uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_self_healed; /* self-healed bytes */ - uint64_t vs_scrub_type; /* pool_scrub_type_t */ - uint64_t vs_scrub_complete; /* completed? */ - uint64_t vs_scrub_examined; /* bytes examined; top */ - uint64_t vs_scrub_repaired; /* bytes repaired; leaf */ - uint64_t vs_scrub_errors; /* errors during scrub */ - uint64_t vs_scrub_start; /* UTC scrub start time */ - uint64_t vs_scrub_end; /* UTC scrub end time */ + uint64_t vs_scan_removing; /* removing? 
*/ + uint64_t vs_scan_processed; /* scan processed bytes */ } vdev_stat_t; /* @@ -682,7 +711,7 @@ typedef enum zfs_ioc { ZFS_IOC_POOL_CONFIGS, ZFS_IOC_POOL_STATS, ZFS_IOC_POOL_TRYIMPORT, - ZFS_IOC_POOL_SCRUB, + ZFS_IOC_POOL_SCAN, ZFS_IOC_POOL_FREEZE, ZFS_IOC_POOL_UPGRADE, ZFS_IOC_POOL_GET_HISTORY, @@ -820,7 +849,7 @@ typedef enum history_internal_events { LOG_POOL_VDEV_OFFLINE, LOG_POOL_UPGRADE, LOG_POOL_CLEAR, - LOG_POOL_SCRUB, + LOG_POOL_SCAN, LOG_POOL_PROPSET, LOG_DS_CREATE, LOG_DS_CLONE, @@ -843,7 +872,7 @@ typedef enum history_internal_events { LOG_DS_UPGRADE, LOG_DS_REFQUOTA, LOG_DS_REFRESERV, - LOG_POOL_SCRUB_DONE, + LOG_POOL_SCAN_DONE, LOG_DS_USER_HOLD, LOG_DS_USER_RELEASE, LOG_POOL_SPLIT, |