summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLin Ling <Lin.Ling@Sun.COM>2010-05-03 14:54:08 -0700
committerLin Ling <Lin.Ling@Sun.COM>2010-05-03 14:54:08 -0700
commit3f9d6ad73e45c6823b409f93b0c8d4f62861d2d5 (patch)
tree195b2b1fa9e897a41897e12fed9b0c6e58d8107e
parent3113f7cee6785cfe8d9e78c535cf9e2a79283275 (diff)
downloadillumos-joyent-3f9d6ad73e45c6823b409f93b0c8d4f62861d2d5.tar.gz
6675946 'zpool status' should show the progress of resilvering for individual disk.
6683750 scrub -s have to wait until resilver completed?
6841252 Resilvering not restartable - causing an excess reboot delay
6855073 spa scrub stats (eg %done) are reset on reboot
6891824 7410 NAS head "continually resilvering" following HDD replacement
6899970 scrub/resilver percent complete reporting in zpool status can be overly optimistic
6940889 add interval (count) args to zpool list
6944623 dbuf_read_done() locking performance improvement
6946760 mutex problem in bplist_enqueue()
6391915 RFE: provide interval arg to zpool status to monitor resilvering
6946512 want zfs_send() to pass back debug info
6943992 'zpool scrub' should not restart the existing scrub silently
6878281 zpool should store the time of last scrub/resilver and other zpool status info in pool properties.
6935158 Assertion failed: used <= spa_get_dspace(dd->dd_pool->dp_spa)
6944388 dsl_dataset_snapshot_reserve_space() causes dp_write_limit=max
--HG--
rename : usr/src/uts/common/fs/zfs/dsl_scrub.c => usr/src/uts/common/fs/zfs/dsl_scan.c
-rw-r--r--usr/src/cmd/availdevs/availdevs.c5
-rw-r--r--usr/src/cmd/fm/schemes/zfs/scheme.c5
-rw-r--r--usr/src/cmd/mdb/common/modules/zfs/zfs.c105
-rw-r--r--usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c2
-rw-r--r--usr/src/cmd/truss/codes.c5
-rw-r--r--usr/src/cmd/zdb/zdb.c105
-rw-r--r--usr/src/cmd/zfs/zfs_main.c21
-rw-r--r--usr/src/cmd/zpool/Makefile3
-rw-r--r--usr/src/cmd/zpool/zpool_main.c397
-rw-r--r--usr/src/cmd/zpool/zpool_util.h5
-rw-r--r--usr/src/cmd/zpool/zpool_vdev.c23
-rw-r--r--usr/src/cmd/ztest/ztest.c11
-rw-r--r--usr/src/common/zfs/zfs_comutil.c47
-rw-r--r--usr/src/common/zfs/zfs_comutil.h4
-rw-r--r--usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h5
-rw-r--r--usr/src/lib/libzfs/common/libzfs.h14
-rw-r--r--usr/src/lib/libzfs/common/libzfs_pool.c93
-rw-r--r--usr/src/lib/libzfs/common/libzfs_sendrecv.c32
-rw-r--r--usr/src/lib/libzfs/common/libzfs_status.c27
-rw-r--r--usr/src/lib/libzfs/common/libzfs_util.c8
-rw-r--r--usr/src/lib/libzfs/common/mapfile-vers7
-rw-r--r--usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c5
-rw-r--r--usr/src/lib/libzpool/common/util.c5
-rw-r--r--usr/src/uts/common/Makefile.files3
-rw-r--r--usr/src/uts/common/fs/zfs/arc.c211
-rw-r--r--usr/src/uts/common/fs/zfs/bplist.c35
-rw-r--r--usr/src/uts/common/fs/zfs/dbuf.c32
-rw-r--r--usr/src/uts/common/fs/zfs/ddt.c27
-rw-r--r--usr/src/uts/common/fs/zfs/dmu.c9
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_objset.c31
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_send.c28
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_traverse.c51
-rw-r--r--usr/src/uts/common/fs/zfs/dnode_sync.c2
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dataset.c197
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_deleg.c19
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dir.c69
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_pool.c93
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_prop.c22
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_scan.c1660
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_scrub.c1214
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_synctask.c9
-rw-r--r--usr/src/uts/common/fs/zfs/refcount.c5
-rw-r--r--usr/src/uts/common/fs/zfs/spa.c189
-rw-r--r--usr/src/uts/common/fs/zfs/spa_config.c5
-rw-r--r--usr/src/uts/common/fs/zfs/spa_errlog.c33
-rw-r--r--usr/src/uts/common/fs/zfs/spa_history.c25
-rw-r--r--usr/src/uts/common/fs/zfs/spa_misc.c85
-rw-r--r--usr/src/uts/common/fs/zfs/sys/arc.h8
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dbuf.h4
-rw-r--r--usr/src/uts/common/fs/zfs/sys/ddt.h5
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu.h23
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_objset.h3
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_traverse.h7
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_dataset.h7
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_dir.h5
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_pool.h42
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_prop.h5
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_scan.h107
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_synctask.h8
-rw-r--r--usr/src/uts/common/fs/zfs/sys/metaslab.h3
-rw-r--r--usr/src/uts/common/fs/zfs/sys/refcount.h10
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa.h32
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa_impl.h9
-rw-r--r--usr/src/uts/common/fs/zfs/sys/uberblock_impl.h6
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev.h14
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_impl.h2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap.h27
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap_impl.h8
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap_leaf.h5
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_debug.h15
-rw-r--r--usr/src/uts/common/fs/zfs/txg.c10
-rw-r--r--usr/src/uts/common/fs/zfs/uberblock.c6
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c82
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_label.c59
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_raidz.c4
-rw-r--r--usr/src/uts/common/fs/zfs/zap.c83
-rw-r--r--usr/src/uts/common/fs/zfs/zap_leaf.c12
-rw-r--r--usr/src/uts/common/fs/zfs/zap_micro.c64
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_debug.c95
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_ioctl.c26
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_vfsops.c33
-rw-r--r--usr/src/uts/common/fs/zfs/zil.c3
-rw-r--r--usr/src/uts/common/fs/zfs/zio.c10
-rw-r--r--usr/src/uts/common/fs/zfs/zvol.c2
-rw-r--r--usr/src/uts/common/sys/fs/zfs.h69
85 files changed, 3522 insertions, 2379 deletions
diff --git a/usr/src/cmd/availdevs/availdevs.c b/usr/src/cmd/availdevs/availdevs.c
index 1332a4f2bb..7ecec0a05c 100644
--- a/usr/src/cmd/availdevs/availdevs.c
+++ b/usr/src/cmd/availdevs/availdevs.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include "availdevs.h"
@@ -134,7 +133,7 @@ add_pool_to_xml(nvlist_t *config, void *data)
nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) ||
nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &devices) ||
nvlist_lookup_uint64_array(
- devices, ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &n)) {
+ devices, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &n)) {
return (-1);
}
diff --git a/usr/src/cmd/fm/schemes/zfs/scheme.c b/usr/src/cmd/fm/schemes/zfs/scheme.c
index ffa8ebf7f5..c0922f4d89 100644
--- a/usr/src/cmd/fm/schemes/zfs/scheme.c
+++ b/usr/src/cmd/fm/schemes/zfs/scheme.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <fm/fmd_fmri.h>
@@ -214,7 +213,7 @@ fmd_fmri_unusable(nvlist_t *nvl)
vdev_stat_t *vs;
uint_t c;
- (void) nvlist_lookup_uint64_array(vd, ZPOOL_CONFIG_STATS,
+ (void) nvlist_lookup_uint64_array(vd, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c);
ret = (vs->vs_state < VDEV_STATE_DEGRADED);
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index e722007d72..e7bb40809b 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -129,6 +129,16 @@ getrefcount(uintptr_t addr, mdb_ctf_id_t *id,
return (GETMEMBID(addr + off, &rc_id, rc_count, *rc));
}
+/*
+ * Return B_TRUE if every character of the NUL-terminated string 'cp' is
+ * printable according to isprint(), B_FALSE otherwise.  An empty string
+ * yields B_TRUE.
+ */
+static boolean_t
+strisprint(const char *cp)
+{
+	for (; *cp != '\0'; cp++) {
+		/*
+		 * The <ctype.h> classifiers are only defined for EOF and
+		 * values representable as unsigned char; a plain char with
+		 * the high bit set may be negative, so convert first.
+		 */
+		if (!isprint((unsigned char)*cp))
+			return (B_FALSE);
+	}
+	return (B_TRUE);
+}
+
static int verbose;
static int
@@ -624,8 +634,10 @@ zap_leaf(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
zlc->l_entry.le_hash);
break;
case ZAP_CHUNK_ARRAY:
- mdb_printf(" %u: array \"%s\"\n",
- i, zlc->l_array.la_array);
+ mdb_printf(" %u: array", i);
+ if (strisprint((char *)zlc->l_array.la_array))
+ mdb_printf(" \"%s\"", zlc->l_array.la_array);
+ mdb_printf("\n");
if (verbose) {
int j;
mdb_printf(" ");
@@ -811,6 +823,77 @@ abuf_find(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_OK);
}
+/*
+ * Walk callback used by dbgmsg(): print one zfs debug-log entry.
+ *
+ * addr    - target address of a struct zfs_dbgmsg
+ * unknown - unused
+ * arg     - pointer to an int; when nonzero each message is prefixed with
+ *           its timestamp and followed by ::whatis output for 'addr'
+ *
+ * Returns WALK_NEXT on success and WALK_ERR on any lookup or read failure.
+ */
+/* ARGSUSED */
+static int
+dbgmsg_cb(uintptr_t addr, const void *unknown, void *arg)
+{
+	/* CTF type id and zdm_msg byte offset, looked up once and cached. */
+	static mdb_ctf_id_t id;
+	static boolean_t gotid;
+	static ulong_t off;
+
+	int *verbosep = arg;
+	time_t timestamp;
+	char buf[1024];
+
+	if (!gotid) {
+		ulong_t bitoff;
+
+		if (mdb_ctf_lookup_by_name("struct zfs_dbgmsg", &id) == -1) {
+			mdb_warn("couldn't find struct zfs_dbgmsg");
+			return (WALK_ERR);
+		}
+		if (mdb_ctf_offsetof(id, "zdm_msg", &bitoff) == -1) {
+			mdb_warn("couldn't find zdm_msg");
+			return (WALK_ERR);
+		}
+		/* mdb_ctf_offsetof() reports bits; we need a byte offset. */
+		off = bitoff / 8;
+		/*
+		 * Mark the cache valid only once both lookups succeeded, so
+		 * a failed lookup is retried instead of leaving a stale 'off'.
+		 */
+		gotid = B_TRUE;
+	}
+
+	if (GETMEMBID(addr, &id, zdm_timestamp, timestamp)) {
+		return (WALK_ERR);
+	}
+
+	if (mdb_readstr(buf, sizeof (buf), addr + off) == -1) {
+		mdb_warn("failed to read zdm_msg at %p\n", addr + off);
+		/* This is a walk callback: report a walk status, not DCMD_*. */
+		return (WALK_ERR);
+	}
+
+	if (*verbosep)
+		mdb_printf("%Y ", timestamp);
+
+	mdb_printf("%s\n", buf);
+
+	if (*verbosep)
+		(void) mdb_call_dcmd("whatis", addr, DCMD_ADDRSPEC, 0, NULL);
+
+	return (WALK_NEXT);
+}
+
+/*
+ * ::zfs_dbgmsg [-v]
+ *
+ * Print the zfs debug log by walking the list found at the "zfs_dbgmsgs"
+ * symbol and handing each element to dbgmsg_cb().  With -v, dbgmsg_cb()
+ * also prints each entry's timestamp and ::whatis output.  'addr' and
+ * 'flags' are not used.
+ *
+ * Returns DCMD_USAGE on bad options, DCMD_ERR if the symbol lookup or the
+ * walk fails, and DCMD_OK otherwise.
+ */
+/* ARGSUSED */
+static int
+dbgmsg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	GElf_Sym list_sym;
+	int want_verbose = FALSE;
+	int nparsed;
+
+	nparsed = mdb_getopts(argc, argv,
+	    'v', MDB_OPT_SETBITS, TRUE, &want_verbose,
+	    NULL);
+	if (nparsed != argc)
+		return (DCMD_USAGE);
+
+	if (mdb_lookup_by_name("zfs_dbgmsgs", &list_sym) != 0) {
+		mdb_warn("can't find zfs_dbgmsgs");
+		return (DCMD_ERR);
+	}
+
+	if (mdb_pwalk("list", dbgmsg_cb, &want_verbose,
+	    list_sym.st_value) != 0) {
+		mdb_warn("can't walk zfs_dbgmsgs");
+		return (DCMD_ERR);
+	}
+
+	return (DCMD_OK);
+}
+
/*ARGSUSED*/
static int
arc_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
@@ -2195,7 +2278,7 @@ reference_cb(uintptr_t addr, const void *ignored, void *arg)
uintptr_t ref_holder;
uintptr_t ref_removed;
uint64_t ref_number;
- boolean_t holder_is_str;
+ boolean_t holder_is_str = B_FALSE;
char holder_str[128];
boolean_t removed = (boolean_t)arg;
@@ -2212,18 +2295,8 @@ reference_cb(uintptr_t addr, const void *ignored, void *arg)
GETMEMBID(addr, &ref_id, ref_number, ref_number))
return (WALK_ERR);
- if (mdb_readstr(holder_str, sizeof (holder_str), ref_holder) != -1) {
- char *cp;
- holder_is_str = B_TRUE;
- for (cp = holder_str; *cp; cp++) {
- if (!isprint(*cp)) {
- holder_is_str = B_FALSE;
- break;
- }
- }
- } else {
- holder_is_str = B_FALSE;
- }
+ if (mdb_readstr(holder_str, sizeof (holder_str), ref_holder) != -1)
+ holder_is_str = strisprint(holder_str);
if (removed)
mdb_printf("removed ");
@@ -2940,6 +3013,8 @@ static const mdb_dcmd_t dcmds[] = {
sa_attr_table},
{ "sa_attr", ": attr_id",
"print SA attribute address when given sa_handle_t", sa_attr_print},
+ { "zfs_dbgmsg", ":[-v]",
+ "print zfs debug log", dbgmsg},
{ NULL }
};
diff --git a/usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c b/usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c
index 6a57828a24..4c12778ad1 100644
--- a/usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c
+++ b/usr/src/cmd/ndmpd/ndmp/ndmpd_zfs.c
@@ -610,7 +610,7 @@ ndmpd_zfs_backup_send_read(ndmpd_zfs_args_t *ndmpd_zfs_args)
}
err = zfs_send(zhp, fromsnap, ndmpd_zfs_args->nz_snapname, flags,
- ndmpd_zfs_args->nz_pipe_fd[PIPE_ZFS], NULL, NULL);
+ ndmpd_zfs_args->nz_pipe_fd[PIPE_ZFS], NULL, NULL, NULL);
if (err && !session->ns_data.dd_abort)
NDMPD_ZFS_LOG_ZERR(ndmpd_zfs_args, "zfs_send: %d", err);
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c
index 03f7b40ab0..7178537a06 100644
--- a/usr/src/cmd/truss/codes.c
+++ b/usr/src/cmd/truss/codes.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -1146,7 +1145,7 @@ const struct ioc {
"zfs_cmd_t" },
{ (uint_t)ZFS_IOC_POOL_TRYIMPORT, "ZFS_IOC_POOL_TRYIMPORT",
"zfs_cmd_t" },
- { (uint_t)ZFS_IOC_POOL_SCRUB, "ZFS_IOC_POOL_SCRUB",
+ { (uint_t)ZFS_IOC_POOL_SCAN, "ZFS_IOC_POOL_SCAN",
"zfs_cmd_t" },
{ (uint_t)ZFS_IOC_POOL_FREEZE, "ZFS_IOC_POOL_FREEZE",
"zfs_cmd_t" },
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 61e79d0e84..2d36cf5488 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -150,6 +150,7 @@ usage(void)
"has altroot/not in a cachefile\n");
(void) fprintf(stderr, " -p <path> -- use one or more with "
"-e to specify path to vdev dir\n");
+ (void) fprintf(stderr, " -P print numbers parsable\n");
(void) fprintf(stderr, " -t <txg> -- highest txg to use when "
"searching for uberblocks\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
@@ -196,6 +197,15 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
nvlist_free(nv);
}
+/*
+ * Format 'num' into 'buf' for display.  When dump_opt['P'] is nonzero
+ * (the -P option) the exact decimal value is written; otherwise the
+ * formatting is delegated to nicenum().
+ *
+ * 'buf' is written without a length check: the exact form needs up to
+ * 21 bytes (20 digits of a 64-bit value plus the terminating NUL).
+ */
+static void
+zdb_nicenum(uint64_t num, char *buf)
+{
+	if (dump_opt['P']) {
+		/* %llu takes an unsigned argument, as elsewhere in zdb. */
+		(void) sprintf(buf, "%llu", (u_longlong_t)num);
+	} else {
+		nicenum(num, buf);
+	}
+}
+
const char dump_zap_stars[] = "****************************************";
const int dump_zap_width = sizeof (dump_zap_stars) - 1;
@@ -490,7 +500,7 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
*/
alloc = 0;
for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
- VERIFY(0 == dmu_read(os, smo->smo_object, offset,
+ VERIFY3U(0, ==, dmu_read(os, smo->smo_object, offset,
sizeof (entry), &entry, DMU_READ_PREFETCH));
if (SM_DEBUG_DECODE(entry)) {
(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
@@ -525,12 +535,12 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
static void
dump_metaslab_stats(metaslab_t *msp)
{
- char maxbuf[5];
+ char maxbuf[32];
space_map_t *sm = &msp->ms_map;
avl_tree_t *t = sm->sm_pp_root;
int free_pct = sm->sm_space * 100 / sm->sm_size;
- nicenum(space_map_maxsize(sm), maxbuf);
+ zdb_nicenum(space_map_maxsize(sm), maxbuf);
(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
"segments", avl_numnodes(t), "maxsize", maxbuf,
@@ -544,9 +554,9 @@ dump_metaslab(metaslab_t *msp)
spa_t *spa = vd->vdev_spa;
space_map_t *sm = &msp->ms_map;
space_map_obj_t *smo = &msp->ms_smo;
- char freebuf[5];
+ char freebuf[32];
- nicenum(sm->sm_size - smo->smo_alloc, freebuf);
+ zdb_nicenum(sm->sm_size - smo->smo_alloc, freebuf);
(void) printf(
"\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
@@ -855,7 +865,7 @@ dump_history(spa_t *spa)
(void) snprintf(internalstr,
sizeof (internalstr),
"[internal %s txg:%lld] %s",
- hist_event_table[ievent], txg,
+ zfs_history_event_names[ievent], txg,
intstr);
cmd = internalstr;
}
@@ -966,6 +976,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
return (err);
+ ASSERT(buf->b_data);
/* recursively visit blocks below this */
cbp = buf->b_data;
@@ -1015,7 +1026,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
{
dsl_dir_phys_t *dd = data;
time_t crtime;
- char nice[6];
+ char nice[32];
if (dd == NULL)
return;
@@ -1032,15 +1043,15 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
(u_longlong_t)dd->dd_origin_obj);
(void) printf("\t\tchild_dir_zapobj = %llu\n",
(u_longlong_t)dd->dd_child_dir_zapobj);
- nicenum(dd->dd_used_bytes, nice);
+ zdb_nicenum(dd->dd_used_bytes, nice);
(void) printf("\t\tused_bytes = %s\n", nice);
- nicenum(dd->dd_compressed_bytes, nice);
+ zdb_nicenum(dd->dd_compressed_bytes, nice);
(void) printf("\t\tcompressed_bytes = %s\n", nice);
- nicenum(dd->dd_uncompressed_bytes, nice);
+ zdb_nicenum(dd->dd_uncompressed_bytes, nice);
(void) printf("\t\tuncompressed_bytes = %s\n", nice);
- nicenum(dd->dd_quota, nice);
+ zdb_nicenum(dd->dd_quota, nice);
(void) printf("\t\tquota = %s\n", nice);
- nicenum(dd->dd_reserved, nice);
+ zdb_nicenum(dd->dd_reserved, nice);
(void) printf("\t\treserved = %s\n", nice);
(void) printf("\t\tprops_zapobj = %llu\n",
(u_longlong_t)dd->dd_props_zapobj);
@@ -1050,7 +1061,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
(u_longlong_t)dd->dd_flags);
#define DO(which) \
- nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
+ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
DO(HEAD);
DO(SNAP);
@@ -1066,7 +1077,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
{
dsl_dataset_phys_t *ds = data;
time_t crtime;
- char used[6], compressed[6], uncompressed[6], unique[6];
+ char used[32], compressed[32], uncompressed[32], unique[32];
char blkbuf[BP_SPRINTF_LEN];
if (ds == NULL)
@@ -1074,10 +1085,10 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
ASSERT(size == sizeof (*ds));
crtime = ds->ds_creation_time;
- nicenum(ds->ds_used_bytes, used);
- nicenum(ds->ds_compressed_bytes, compressed);
- nicenum(ds->ds_uncompressed_bytes, uncompressed);
- nicenum(ds->ds_unique_bytes, unique);
+ zdb_nicenum(ds->ds_used_bytes, used);
+ zdb_nicenum(ds->ds_compressed_bytes, compressed);
+ zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
+ zdb_nicenum(ds->ds_unique_bytes, unique);
sprintf_blkptr(blkbuf, &ds->ds_bp);
(void) printf("\t\tdir_obj = %llu\n",
@@ -1122,9 +1133,9 @@ dump_bplist(objset_t *mos, uint64_t object, char *name)
bplist_t bpl = { 0 };
blkptr_t blk, *bp = &blk;
uint64_t itor = 0;
- char bytes[6];
- char comp[6];
- char uncomp[6];
+ char bytes[32];
+ char comp[32];
+ char uncomp[32];
if (dump_opt['d'] < 3)
return;
@@ -1137,10 +1148,10 @@ dump_bplist(objset_t *mos, uint64_t object, char *name)
return;
}
- nicenum(bpl.bpl_phys->bpl_bytes, bytes);
+ zdb_nicenum(bpl.bpl_phys->bpl_bytes, bytes);
if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) {
- nicenum(bpl.bpl_phys->bpl_comp, comp);
- nicenum(bpl.bpl_phys->bpl_uncomp, uncomp);
+ zdb_nicenum(bpl.bpl_phys->bpl_comp, comp);
+ zdb_nicenum(bpl.bpl_phys->bpl_uncomp, uncomp);
(void) printf("\n %s: %llu entries, %s (%s/%s comp)\n",
name, (u_longlong_t)bpl.bpl_phys->bpl_entries,
bytes, comp, uncomp);
@@ -1391,6 +1402,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
dump_zap, /* SA Master Node */
dump_sa_attrs, /* SA attribute registration */
dump_sa_layouts, /* SA attribute layouts */
+ dump_zap, /* DSL scrub translations */
+ dump_none, /* fake dedup BP */
dump_unknown, /* Unknown type, must be last */
};
@@ -1402,7 +1415,8 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
dnode_t *dn;
void *bonus = NULL;
size_t bsize = 0;
- char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], fill[7];
+ char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
+ char bonus_size[32];
char aux[50];
int error;
@@ -1426,11 +1440,11 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
}
dmu_object_info_from_dnode(dn, &doi);
- nicenum(doi.doi_metadata_block_size, iblk);
- nicenum(doi.doi_data_block_size, dblk);
- nicenum(doi.doi_max_offset, lsize);
- nicenum(doi.doi_physical_blocks_512 << 9, asize);
- nicenum(doi.doi_bonus_size, bonus_size);
+ zdb_nicenum(doi.doi_metadata_block_size, iblk);
+ zdb_nicenum(doi.doi_data_block_size, dblk);
+ zdb_nicenum(doi.doi_max_offset, lsize);
+ zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
+ zdb_nicenum(doi.doi_bonus_size, bonus_size);
(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
doi.doi_max_offset);
@@ -1492,7 +1506,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
}
for (;;) {
- char segsize[6];
+ char segsize[32];
error = dnode_next_offset(dn,
0, &start, minlvl, blkfill, 0);
if (error)
@@ -1500,7 +1514,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
end = start;
error = dnode_next_offset(dn,
DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
- nicenum(end - start, segsize);
+ zdb_nicenum(end - start, segsize);
(void) printf("\t\tsegment [%016llx, %016llx)"
" size %5s\n", (u_longlong_t)start,
(u_longlong_t)end, segsize);
@@ -1523,7 +1537,7 @@ dump_dir(objset_t *os)
dmu_objset_stats_t dds;
uint64_t object, object_count;
uint64_t refdbytes, usedobjs, scratch;
- char numbuf[8];
+ char numbuf[32];
char blkbuf[BP_SPRINTF_LEN + 20];
char osname[MAXNAMELEN];
char *type = "UNKNOWN";
@@ -1547,7 +1561,7 @@ dump_dir(objset_t *os)
ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill);
- nicenum(refdbytes, numbuf);
+ zdb_nicenum(refdbytes, numbuf);
if (verbosity >= 4) {
(void) sprintf(blkbuf, ", rootbp ");
@@ -1905,8 +1919,9 @@ zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp,
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
+/* ARGSUSED */
static int
-zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
zdb_cb_t *zcb = arg;
@@ -2222,7 +2237,8 @@ dump_block_stats(spa_t *spa)
"\t avg\t comp\t%%Total\tType\n");
for (t = 0; t <= ZDB_OT_TOTAL; t++) {
- char csize[6], lsize[6], psize[6], asize[6], avg[6];
+ char csize[32], lsize[32], psize[32], asize[32];
+ char avg[32];
char *typename;
if (t < DMU_OT_NUMTYPES)
@@ -2258,11 +2274,11 @@ dump_block_stats(spa_t *spa)
zcb.zcb_type[ZB_TOTAL][t].zb_asize)
continue;
- nicenum(zb->zb_count, csize);
- nicenum(zb->zb_lsize, lsize);
- nicenum(zb->zb_psize, psize);
- nicenum(zb->zb_asize, asize);
- nicenum(zb->zb_asize / zb->zb_count, avg);
+ zdb_nicenum(zb->zb_count, csize);
+ zdb_nicenum(zb->zb_lsize, lsize);
+ zdb_nicenum(zb->zb_psize, psize);
+ zdb_nicenum(zb->zb_asize, asize);
+ zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
"\t%5.2f\t%6.2f\t",
@@ -2302,7 +2318,7 @@ typedef struct zdb_ddt_entry {
/* ARGSUSED */
static int
zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
avl_tree_t *t = arg;
avl_index_t where;
@@ -2897,7 +2913,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:")) != -1) {
+ while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) {
switch (c) {
case 'b':
case 'c':
@@ -2920,6 +2936,7 @@ main(int argc, char **argv)
case 'L':
case 'X':
case 'e':
+ case 'P':
dump_opt[c]++;
break;
case 'v':
@@ -2970,7 +2987,7 @@ main(int argc, char **argv)
verbose = MAX(verbose, 1);
for (c = 0; c < 256; c++) {
- if (dump_all && !strchr("elAFLRSX", c))
+ if (dump_all && !strchr("elAFLRSXP", c))
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c
index 66f99ccfbf..6176a102d3 100644
--- a/usr/src/cmd/zfs/zfs_main.c
+++ b/usr/src/cmd/zfs/zfs_main.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <assert.h>
@@ -2573,6 +2572,8 @@ zfs_do_send(int argc, char **argv)
zfs_handle_t *zhp;
sendflags_t flags = { 0 };
int c, err;
+ nvlist_t *dbgnv;
+ boolean_t extraverbose = B_FALSE;
/* check options */
while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) {
@@ -2595,6 +2596,8 @@ zfs_do_send(int argc, char **argv)
flags.props = B_TRUE;
break;
case 'v':
+ if (flags.verbose)
+ extraverbose = B_TRUE;
flags.verbose = B_TRUE;
break;
case 'D':
@@ -2679,7 +2682,19 @@ zfs_do_send(int argc, char **argv)
if (flags.replicate && fromname == NULL)
flags.doall = B_TRUE;
- err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0);
+ err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0,
+ extraverbose ? &dbgnv : NULL);
+
+ if (extraverbose) {
+ /*
+ * dump_nvlist prints to stdout, but that's been
+ * redirected to a file. Make it print to stderr
+ * instead.
+ */
+ (void) dup2(STDERR_FILENO, STDOUT_FILENO);
+ dump_nvlist(dbgnv, 0);
+ nvlist_free(dbgnv);
+ }
zfs_close(zhp);
return (err != 0);
diff --git a/usr/src/cmd/zpool/Makefile b/usr/src/cmd/zpool/Makefile
index 728fdbe03b..0bf7b02767 100644
--- a/usr/src/cmd/zpool/Makefile
+++ b/usr/src/cmd/zpool/Makefile
@@ -19,8 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
#
PROG= zpool
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index a31ee80255..c663cea5a1 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <assert.h>
@@ -42,7 +41,6 @@
#include <pwd.h>
#include <zone.h>
#include <sys/fs/zfs.h>
-
#include <sys/stat.h>
#include <libzfs.h>
@@ -215,7 +213,7 @@ get_usage(zpool_help_t idx) {
"[count]]\n"));
case HELP_LIST:
return (gettext("\tlist [-H] [-o property[,...]] "
- "[pool] ...\n"));
+ "[-T d|u] [pool] ... [interval [count]]\n"));
case HELP_OFFLINE:
return (gettext("\toffline [-t] <pool> <device> ...\n"));
case HELP_ONLINE:
@@ -228,7 +226,8 @@ get_usage(zpool_help_t idx) {
case HELP_SCRUB:
return (gettext("\tscrub [-s] <pool> ...\n"));
case HELP_STATUS:
- return (gettext("\tstatus [-vx] [pool] ...\n"));
+ return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval "
+ "[count]]\n"));
case HELP_UPGRADE:
return (gettext("\tupgrade\n"
"\tupgrade -v\n"
@@ -519,11 +518,10 @@ zpool_do_add(int argc, char **argv)
}
/*
- * zpool remove <pool> <vdev> ...
+ * zpool remove <pool> <vdev> ...
*
- * Removes the given vdev from the pool. Currently, this only supports removing
- * spares and cache devices from the pool. Eventually, we'll want to support
- * removing leaf vdevs (as an alias for 'detach') as well as toplevel vdevs.
+ * Removes the given vdev from the pool. Currently, this supports removing
+ * spares, cache, and log devices from the pool.
*/
int
zpool_do_remove(int argc, char **argv)
@@ -1044,20 +1042,21 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
{
nvlist_t **child;
uint_t c, children;
+ pool_scan_stat_t *ps = NULL;
vdev_stat_t *vs;
- char rbuf[6], wbuf[6], cbuf[6], repaired[7];
+ char rbuf[6], wbuf[6], cbuf[6];
char *vname;
uint64_t notpresent;
spare_cbdata_t cb;
char *state;
- verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
- (uint64_t **)&vs, &c) == 0);
-
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
children = 0;
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c) == 0);
+
state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
if (isspare) {
/*
@@ -1147,14 +1146,16 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
(void) printf(gettext("corrupted data"));
break;
}
- } else if (vs->vs_scrub_repaired != 0 && children == 0) {
- /*
- * Report bytes resilvered/repaired on leaf devices.
- */
- zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
- (void) printf(gettext(" %s %s"), repaired,
- (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
- "resilvered" : "repaired");
+ }
+
+ (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
+ (uint64_t **)&ps, &c);
+
+ if (ps && ps->pss_state == DSS_SCANNING &&
+ vs->vs_scan_processed != 0 && children == 0) {
+ (void) printf(gettext(" (%s)"),
+ (ps->pss_func == POOL_SCAN_RESILVER) ?
+ "resilvering" : "repairing");
}
(void) printf("\n");
@@ -1194,7 +1195,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
strcmp(type, VDEV_TYPE_HOLE) == 0)
return;
- verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
(void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
@@ -1333,7 +1334,7 @@ show_import(nvlist_t *config)
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
@@ -1400,6 +1401,11 @@ show_import(nvlist_t *config)
"read.\n"));
break;
+ case ZPOOL_STATUS_RESILVERING:
+ (void) printf(gettext("status: One or more devices were being "
+ "resilvered.\n"));
+ break;
+
default:
/*
* No other status can be seen when importing pools.
@@ -1990,13 +1996,13 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
char *vname;
if (oldnv != NULL) {
- verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_STATS,
- (uint64_t **)&oldvs, &c) == 0);
+ verify(nvlist_lookup_uint64_array(oldnv,
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
} else {
oldvs = &zerovs;
}
- verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&newvs, &c) == 0);
if (strlen(name) + depth > cb->cb_namewidth)
@@ -2046,6 +2052,12 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
return;
for (c = 0; c < children; c++) {
+ uint64_t ishole = B_FALSE;
+
+ if (nvlist_lookup_uint64(newchild[c],
+ ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
+ continue;
+
vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE);
print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
newchild[c], cb, depth + 2);
@@ -2157,55 +2169,14 @@ get_namewidth(zpool_handle_t *zhp, void *data)
}
/*
- * zpool iostat [-T d|u] [-v] [pool] ... [interval [count]]
- *
- * -T Display a timestamp in date(1) or Unix format
- * -v Display statistics for individual vdevs
- *
- * This command can be tricky because we want to be able to deal with pool
- * creation/destruction as well as vdev configuration changes. The bulk of this
- * processing is handled by the pool_list_* routines in zpool_iter.c. We rely
- * on pool_list_update() to detect the addition of new pools. Configuration
- * changes are all handled within libzfs.
+ * Parse the input string, get the 'interval' and 'count' value if there is one.
*/
-int
-zpool_do_iostat(int argc, char **argv)
+static void
+get_interval_count(int *argcp, char **argv, unsigned long *iv,
+ unsigned long *cnt)
{
- int c;
- int ret;
- int npools;
unsigned long interval = 0, count = 0;
- zpool_list_t *list;
- boolean_t verbose = B_FALSE;
- iostat_cbdata_t cb;
-
- /* check options */
- while ((c = getopt(argc, argv, "T:v")) != -1) {
- switch (c) {
- case 'T':
- if (optarg) {
- if (*optarg == 'u')
- timestamp_fmt = UDATE;
- else if (*optarg == 'd')
- timestamp_fmt = DDATE;
- else
- usage(B_FALSE);
- } else {
- usage(B_FALSE);
- }
- break;
- case 'v':
- verbose = B_TRUE;
- break;
- case '?':
- (void) fprintf(stderr, gettext("invalid option '%c'\n"),
- optopt);
- usage(B_FALSE);
- }
- }
-
- argc -= optind;
- argv += optind;
+ int argc = *argcp, errno;
/*
* Determine if the last argument is an integer or a pool name
@@ -2222,7 +2193,6 @@ zpool_do_iostat(int argc, char **argv)
"cannot be zero\n"));
usage(B_FALSE);
}
-
/*
* Ignore the last parameter
*/
@@ -2239,7 +2209,7 @@ zpool_do_iostat(int argc, char **argv)
/*
* If the last argument is also an integer, then we have both a count
- * and an integer.
+ * and an interval.
*/
if (argc > 0 && isdigit(argv[argc - 1][0])) {
char *end;
@@ -2264,6 +2234,66 @@ zpool_do_iostat(int argc, char **argv)
}
}
+ *iv = interval;
+ *cnt = count;
+ *argcp = argc;
+}
+
+static void
+get_timestamp_arg(char c)
+{
+ if (c == 'u')
+ timestamp_fmt = UDATE;
+ else if (c == 'd')
+ timestamp_fmt = DDATE;
+ else
+ usage(B_FALSE);
+}
+
+/*
+ * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]]
+ *
+ * -v Display statistics for individual vdevs
+ * -T Display a timestamp in date(1) or Unix format
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes. The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c. We rely
+ * on pool_list_update() to detect the addition of new pools. Configuration
+ * changes are all handled within libzfs.
+ */
+int
+zpool_do_iostat(int argc, char **argv)
+{
+ int c;
+ int ret;
+ int npools;
+ unsigned long interval = 0, count = 0;
+ zpool_list_t *list;
+ boolean_t verbose = B_FALSE;
+ iostat_cbdata_t cb;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "T:v")) != -1) {
+ switch (c) {
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
+ case 'v':
+ verbose = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ get_interval_count(&argc, argv, &interval, &count);
+
/*
* Construct the list of all interesting pools.
*/
@@ -2464,12 +2494,13 @@ list_callback(zpool_handle_t *zhp, void *data)
}
/*
- * zpool list [-H] [-o prop[,prop]*] [pool] ...
+ * zpool list [-H] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
*
* -H Scripted mode. Don't display headers, and separate properties
* by a single tab.
* -o List of properties to display. Defaults to
* "name,size,allocated,free,capacity,health,altroot"
+ * -T Display a timestamp in date(1) or Unix format
*
* List all pools in the system, whether or not they're healthy. Output space
* statistics for each one, as well as health status summary.
@@ -2483,9 +2514,10 @@ zpool_do_list(int argc, char **argv)
static char default_props[] =
"name,size,allocated,free,capacity,dedupratio,health,altroot";
char *props = default_props;
+ unsigned long interval = 0, count = 0;
/* check options */
- while ((c = getopt(argc, argv, ":Ho:")) != -1) {
+ while ((c = getopt(argc, argv, ":Ho:T:")) != -1) {
switch (c) {
case 'H':
cb.cb_scripted = B_TRUE;
@@ -2493,6 +2525,9 @@ zpool_do_list(int argc, char **argv)
case 'o':
props = optarg;
break;
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@@ -2508,21 +2543,37 @@ zpool_do_list(int argc, char **argv)
argc -= optind;
argv += optind;
+ get_interval_count(&argc, argv, &interval, &count);
+
if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
usage(B_FALSE);
cb.cb_first = B_TRUE;
- ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
- list_callback, &cb);
+ for (;;) {
- zprop_free_list(cb.cb_proplist);
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
- if (argc == 0 && cb.cb_first && !cb.cb_scripted) {
- (void) printf(gettext("no pools available\n"));
- return (0);
+ ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
+ list_callback, &cb);
+
+ if (argc == 0 && cb.cb_first && !cb.cb_scripted) {
+ (void) printf(gettext("no pools available\n"));
+ zprop_free_list(cb.cb_proplist);
+ return (0);
+ }
+
+ if (interval == 0)
+ break;
+
+ if (count != 0 && --count == 0)
+ break;
+
+ (void) sleep(interval);
}
+ zprop_free_list(cb.cb_proplist);
return (ret);
}
@@ -3106,7 +3157,7 @@ scrub_callback(zpool_handle_t *zhp, void *data)
return (1);
}
- err = zpool_scrub(zhp, cb->cb_type);
+ err = zpool_scan(zhp, cb->cb_type);
return (err != 0);
}
@@ -3122,13 +3173,13 @@ zpool_do_scrub(int argc, char **argv)
int c;
scrub_cbdata_t cb;
- cb.cb_type = POOL_SCRUB_EVERYTHING;
+ cb.cb_type = POOL_SCAN_SCRUB;
/* check options */
while ((c = getopt(argc, argv, "s")) != -1) {
switch (c) {
case 's':
- cb.cb_type = POOL_SCRUB_NONE;
+ cb.cb_type = POOL_SCAN_NONE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -3163,62 +3214,103 @@ typedef struct status_cbdata {
* Print out detailed scrub status.
*/
void
-print_scrub_status(nvlist_t *nvroot)
+print_scan_status(pool_scan_stat_t *ps)
{
- vdev_stat_t *vs;
- uint_t vsc;
- time_t start, end, now;
+ time_t start, end;
+ uint64_t elapsed, mins_left;
+ uint64_t pass_exam, examined, total;
+ uint_t rate;
double fraction_done;
- uint64_t examined, total, minutes_left, minutes_taken;
- char *scrub_type;
+ char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
- (uint64_t **)&vs, &vsc) == 0);
+ (void) printf(gettext(" scan: "));
- /*
- * If there's never been a scrub, there's not much to say.
- */
- if (vs->vs_scrub_end == 0 && vs->vs_scrub_type == POOL_SCRUB_NONE) {
+ /* If there's never been a scan, there's not much to say. */
+ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE ||
+ ps->pss_func >= POOL_SCAN_FUNCS) {
(void) printf(gettext("none requested\n"));
return;
}
- scrub_type = (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
- "resilver" : "scrub";
-
- start = vs->vs_scrub_start;
- end = vs->vs_scrub_end;
- now = time(NULL);
- examined = vs->vs_scrub_examined;
- total = vs->vs_alloc;
-
- if (end != 0) {
- minutes_taken = (uint64_t)((end - start) / 60);
+ start = ps->pss_start_time;
+ end = ps->pss_end_time;
+ zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
- (void) printf(gettext("%s %s after %lluh%um with %llu errors "
- "on %s"),
- scrub_type, vs->vs_scrub_complete ? "completed" : "stopped",
+ assert(ps->pss_func == POOL_SCAN_SCRUB ||
+ ps->pss_func == POOL_SCAN_RESILVER);
+ /*
+ * Scan is finished or canceled.
+ */
+ if (ps->pss_state == DSS_FINISHED) {
+ uint64_t minutes_taken = (end - start) / 60;
+ char *fmt;
+
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ fmt = gettext("scrub repaired %s in %lluh%um with "
+ "%llu errors on %s");
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ fmt = gettext("resilvered %s in %lluh%um with "
+ "%llu errors on %s");
+ }
+ /* LINTED */
+ (void) printf(fmt, processed_buf,
(u_longlong_t)(minutes_taken / 60),
(uint_t)(minutes_taken % 60),
- (u_longlong_t)vs->vs_scrub_errors, ctime(&end));
+ (u_longlong_t)ps->pss_errors,
+ ctime((time_t *)&end));
+ return;
+ } else if (ps->pss_state == DSS_CANCELED) {
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("scrub canceled on %s"),
+ ctime(&end));
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext("resilver canceled on %s"),
+ ctime(&end));
+ }
return;
}
- if (examined == 0)
- examined = 1;
- if (examined > total)
- total = examined;
+ assert(ps->pss_state == DSS_SCANNING);
+ /*
+ * Scan is in progress.
+ */
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("scrub in progress since %s"),
+ ctime(&start));
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext("resilver in progress since %s"),
+ ctime(&start));
+ }
+
+ examined = ps->pss_examined ? ps->pss_examined : 1;
+ total = ps->pss_to_examine;
fraction_done = (double)examined / total;
- minutes_left = (uint64_t)((now - start) *
- (1 - fraction_done) / fraction_done / 60);
- minutes_taken = (uint64_t)((now - start) / 60);
-
- (void) printf(gettext("%s in progress for %lluh%um, %.2f%% done, "
- "%lluh%um to go\n"),
- scrub_type, (u_longlong_t)(minutes_taken / 60),
- (uint_t)(minutes_taken % 60), 100 * fraction_done,
- (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60));
+
+ /* elapsed time for this pass */
+ elapsed = time(NULL) - ps->pss_pass_start;
+ elapsed = elapsed ? elapsed : 1;
+ pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
+ rate = pass_exam / elapsed;
+ rate = rate ? rate : 1;
+ mins_left = ((total - examined) / rate) / 60;
+
+ zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
+ zfs_nicenum(total, total_buf, sizeof (total_buf));
+ zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+ (void) printf(gettext(" %s scanned out of %s at "
+ "%s/s, %lluh%um to go\n"), examined_buf, total_buf, rate_buf,
+ (u_longlong_t)(mins_left / 60),
+ (uint_t)(mins_left % 60));
+
+ if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext(" %s resilvered, %.2f%% done\n"),
+ processed_buf, 100 * fraction_done);
+ } else if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext(" %s repaired, %.2f%% done\n"),
+ processed_buf, 100 * fraction_done);
+ }
}
static void
@@ -3378,7 +3470,7 @@ status_callback(zpool_handle_t *zhp, void *data)
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
@@ -3451,7 +3543,6 @@ status_callback(zpool_handle_t *zhp, void *data)
"replace'.\n"));
break;
-
case ZPOOL_STATUS_RESILVERING:
(void) printf(gettext("status: One or more devices is "
"currently being resilvered. The pool will\n\tcontinue "
@@ -3549,10 +3640,11 @@ status_callback(zpool_handle_t *zhp, void *data)
uint64_t nerr;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
+ pool_scan_stat_t *ps = NULL;
-
- (void) printf(gettext(" scrub: "));
- print_scrub_status(nvroot);
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
+ print_scan_status(ps);
namewidth = max_width(zhp, nvroot, 0, 0);
if (namewidth < 10)
@@ -3620,11 +3712,12 @@ status_callback(zpool_handle_t *zhp, void *data)
}
/*
- * zpool status [-vx] [pool] ...
+ * zpool status [-vx] [-T d|u] [pool] ... [interval [count]]
*
* -v Display complete error logs
* -x Display only pools with potential problems
* -D Display dedup status (undocumented)
+ * -T Display a timestamp in date(1) or Unix format
*
* Describes the health status of all pools or some subset.
*/
@@ -3633,10 +3726,11 @@ zpool_do_status(int argc, char **argv)
{
int c;
int ret;
+ unsigned long interval = 0, count = 0;
status_cbdata_t cb = { 0 };
/* check options */
- while ((c = getopt(argc, argv, "vxD")) != -1) {
+ while ((c = getopt(argc, argv, "vxDT:")) != -1) {
switch (c) {
case 'v':
cb.cb_verbose = B_TRUE;
@@ -3647,6 +3741,9 @@ zpool_do_status(int argc, char **argv)
case 'D':
cb.cb_dedup_stats = B_TRUE;
break;
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@@ -3657,19 +3754,38 @@ zpool_do_status(int argc, char **argv)
argc -= optind;
argv += optind;
- cb.cb_first = B_TRUE;
+ get_interval_count(&argc, argv, &interval, &count);
if (argc == 0)
cb.cb_allpools = B_TRUE;
- ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb);
+ cb.cb_first = B_TRUE;
- if (argc == 0 && cb.cb_count == 0)
- (void) printf(gettext("no pools available\n"));
- else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
- (void) printf(gettext("all pools are healthy\n"));
+ for (;;) {
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
- return (ret);
+ ret = for_each_pool(argc, argv, B_TRUE, NULL,
+ status_callback, &cb);
+
+ if (argc == 0 && cb.cb_count == 0)
+ (void) printf(gettext("no pools available\n"));
+ else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
+ (void) printf(gettext("all pools are healthy\n"));
+
+ if (ret != 0)
+ return (ret);
+
+ if (interval == 0)
+ break;
+
+ if (count != 0 && --count == 0)
+ break;
+
+ (void) sleep(interval);
+ }
+
+ return (0);
}
typedef struct upgrade_cbdata {
@@ -3890,6 +4006,7 @@ zpool_do_upgrade(int argc, char **argv)
(void) printf(gettext(" 22 Received properties\n"));
(void) printf(gettext(" 23 Slim ZIL\n"));
(void) printf(gettext(" 24 System attributes\n"));
+ (void) printf(gettext(" 25 Improved scrub stats\n"));
(void) printf(gettext("\nFor more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
@@ -3993,7 +4110,7 @@ get_history_one(zpool_handle_t *zhp, void *data)
(void) snprintf(internalstr,
sizeof (internalstr),
"[internal %s txg:%lld] %s",
- hist_event_table[ievent], txg,
+ zfs_history_event_names[ievent], txg,
pathstr);
cmdstr = internalstr;
}
diff --git a/usr/src/cmd/zpool/zpool_util.h b/usr/src/cmd/zpool/zpool_util.h
index a18b8b705f..134c730fcf 100644
--- a/usr/src/cmd/zpool/zpool_util.h
+++ b/usr/src/cmd/zpool/zpool_util.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef ZPOOL_UTIL_H
@@ -45,7 +44,7 @@ uint_t num_logs(nvlist_t *nv);
*/
nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
- boolean_t isreplace, boolean_t dryrun, int argc, char **argv);
+ boolean_t replacing, boolean_t dryrun, int argc, char **argv);
nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
nvlist_t *props, splitflags_t flags, int argc, char **argv);
diff --git a/usr/src/cmd/zpool/zpool_vdev.c b/usr/src/cmd/zpool/zpool_vdev.c
index 3c725d232c..53c2e60b7d 100644
--- a/usr/src/cmd/zpool/zpool_vdev.c
+++ b/usr/src/cmd/zpool/zpool_vdev.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -1004,8 +1003,8 @@ is_spare(nvlist_t *config, const char *path)
return (B_FALSE);
}
free(name);
-
(void) close(fd);
+
verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
nvlist_free(label);
@@ -1029,8 +1028,8 @@ is_spare(nvlist_t *config, const char *path)
* the majority of this task.
*/
static int
-check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
- int isspare)
+check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
+ boolean_t replacing, boolean_t isspare)
{
nvlist_t **child;
uint_t c, children;
@@ -1051,13 +1050,14 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
* hot spare within the same pool. If so, we allow it
* regardless of what libdiskmgt or zpool_in_use() says.
*/
- if (isreplacing) {
+ if (replacing) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk) == 0 && wholedisk)
(void) snprintf(buf, sizeof (buf), "%ss0",
path);
else
(void) strlcpy(buf, path, sizeof (buf));
+
if (is_spare(config, buf))
return (0);
}
@@ -1073,21 +1073,21 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
for (c = 0; c < children; c++)
if ((ret = check_in_use(config, child[c], force,
- isreplacing, B_FALSE)) != 0)
+ replacing, B_FALSE)) != 0)
return (ret);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0)
for (c = 0; c < children; c++)
if ((ret = check_in_use(config, child[c], force,
- isreplacing, B_TRUE)) != 0)
+ replacing, B_TRUE)) != 0)
return (ret);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0)
for (c = 0; c < children; c++)
if ((ret = check_in_use(config, child[c], force,
- isreplacing, B_FALSE)) != 0)
+ replacing, B_FALSE)) != 0)
return (ret);
return (0);
@@ -1419,7 +1419,7 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
*/
nvlist_t *
make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
- boolean_t isreplacing, boolean_t dryrun, int argc, char **argv)
+ boolean_t replacing, boolean_t dryrun, int argc, char **argv)
{
nvlist_t *newroot;
nvlist_t *poolconfig = NULL;
@@ -1442,8 +1442,7 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
* uses (such as a dedicated dump device) that even '-f' cannot
* override.
*/
- if (check_in_use(poolconfig, newroot, force, isreplacing,
- B_FALSE) != 0) {
+ if (check_in_use(poolconfig, newroot, force, replacing, B_FALSE) != 0) {
nvlist_free(newroot);
return (NULL);
}
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index eea3aa0d39..e0fabd7234 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -93,6 +93,7 @@
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
+#include <sys/dsl_scan.h>
#include <sys/refcount.h>
#include <stdio.h>
#include <stdio_ext.h>
@@ -284,9 +285,9 @@ ztest_info_t ztest_info[] = {
{ ztest_spa_rename, 1, &zopt_rarely },
{ ztest_scrub, 1, &zopt_rarely },
{ ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
- { ztest_vdev_attach_detach, 1, &zopt_rarely },
+ { ztest_vdev_attach_detach, 1, &zopt_rarely },
{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
- { ztest_vdev_add_remove, 1, &zopt_vdevtime },
+ { ztest_vdev_add_remove, 1, &zopt_vdevtime },
{ ztest_vdev_aux_add_remove, 1, &zopt_vdevtime },
};
@@ -4662,9 +4663,9 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id)
ztest_shared_t *zs = ztest_shared;
spa_t *spa = zs->zs_spa;
- (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
- (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
}
/*
@@ -4817,7 +4818,7 @@ ztest_spa_import_export(char *oldname, char *newname)
* Kick off a scrub to tickle scrub/export races.
*/
if (ztest_random(2) == 0)
- (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
pool_guid = spa_guid(spa);
spa_close(spa, FTAG);
diff --git a/usr/src/common/zfs/zfs_comutil.c b/usr/src/common/zfs/zfs_comutil.c
index 8ab194e44c..ed9b67ea3b 100644
--- a/usr/src/common/zfs/zfs_comutil.c
+++ b/usr/src/common/zfs/zfs_comutil.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -157,3 +156,47 @@ zfs_spa_version_map(int zpl_version)
return (version);
}
+
+const char *zfs_history_event_names[LOG_END] = {
+ "invalid event",
+ "pool create",
+ "vdev add",
+ "pool remove",
+ "pool destroy",
+ "pool export",
+ "pool import",
+ "vdev attach",
+ "vdev replace",
+ "vdev detach",
+ "vdev online",
+ "vdev offline",
+ "vdev upgrade",
+ "pool clear",
+ "pool scrub",
+ "pool property set",
+ "create",
+ "clone",
+ "destroy",
+ "destroy_begin_sync",
+ "inherit",
+ "property set",
+ "quota set",
+ "permission update",
+ "permission remove",
+ "permission who remove",
+ "promote",
+ "receive",
+ "rename",
+ "reservation set",
+ "replay_inc_sync",
+ "replay_full_sync",
+ "rollback",
+ "snapshot",
+ "filesystem version upgrade",
+ "refquota set",
+ "refreservation set",
+ "pool scrub done",
+ "user hold",
+ "user release",
+ "pool split",
+};
diff --git a/usr/src/common/zfs/zfs_comutil.h b/usr/src/common/zfs/zfs_comutil.h
index f6949387f1..61327f9aa9 100644
--- a/usr/src/common/zfs/zfs_comutil.h
+++ b/usr/src/common/zfs/zfs_comutil.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _ZFS_COMUTIL_H
@@ -38,6 +37,7 @@ extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *);
extern int zfs_zpl_version_map(int spa_version);
extern int zfs_spa_version_map(int zpl_version);
+extern const char *zfs_history_event_names[LOG_END];
#ifdef __cplusplus
}
diff --git a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
index c0887d5b1d..de2632a71a 100644
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
@@ -17,8 +17,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_H
@@ -27,7 +26,7 @@
/*
* On-disk version number.
*/
-#define SPA_VERSION 24ULL
+#define SPA_VERSION 25ULL
/*
* The following are configuration names used in the nvlist describing a pool's
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index 7a8d3d769a..6f7fed62c4 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _LIBZFS_H
@@ -119,6 +118,8 @@ enum {
EZFS_PIPEFAILED, /* pipe create failed */
EZFS_THREADCREATEFAILED, /* thread create failed */
EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */
+ EZFS_SCRUBBING, /* currently scrubbing */
+ EZFS_NO_SCRUB, /* no active scrub */
EZFS_UNKNOWN
};
@@ -224,7 +225,7 @@ typedef struct splitflags {
/*
* Functions to manipulate pool and vdev state
*/
-extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
+extern int zpool_scan(zpool_handle_t *, pool_scan_func_t);
extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
@@ -354,7 +355,7 @@ extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *,
*/
struct zfs_cmd;
-extern const char *hist_event_table[LOG_END];
+extern const char *zfs_history_event_names[LOG_END];
extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
boolean_t verbose);
@@ -526,8 +527,9 @@ typedef struct sendflags {
typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
-extern int zfs_send(zfs_handle_t *, const char *, const char *,
- sendflags_t, int, snapfilter_cb_t, void *);
+extern int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
+ sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
+ void *cb_arg, nvlist_t **debugnvp);
extern int zfs_promote(zfs_handle_t *);
extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index b212cdeddf..c35d6ab451 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <ctype.h>
@@ -43,50 +42,6 @@
#include "libzfs_impl.h"
#include "zfs_comutil.h"
-const char *hist_event_table[LOG_END] = {
- "invalid event",
- "pool create",
- "vdev add",
- "pool remove",
- "pool destroy",
- "pool export",
- "pool import",
- "vdev attach",
- "vdev replace",
- "vdev detach",
- "vdev online",
- "vdev offline",
- "vdev upgrade",
- "pool clear",
- "pool scrub",
- "pool property set",
- "create",
- "clone",
- "destroy",
- "destroy_begin_sync",
- "inherit",
- "property set",
- "quota set",
- "permission update",
- "permission remove",
- "permission who remove",
- "promote",
- "receive",
- "rename",
- "reservation set",
- "replay_inc_sync",
- "replay_full_sync",
- "rollback",
- "snapshot",
- "filesystem version upgrade",
- "refquota set",
- "refreservation set",
- "pool scrub done",
- "user hold",
- "user release",
- "pool split",
-};
-
static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
#if defined(__i386) || defined(__amd64)
@@ -334,7 +289,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
verify(nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
(void) strlcpy(buf, zpool_state_to_name(intval,
vs->vs_aux), len);
@@ -1558,28 +1514,51 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
}
/*
- * Scrub the pool.
+ * Scan the pool.
*/
int
-zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
+zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- zc.zc_cookie = type;
+ zc.zc_cookie = func;
- if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCRUB, &zc) == 0)
+ if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
+ (errno == ENOENT && func != POOL_SCAN_NONE))
return (0);
- (void) snprintf(msg, sizeof (msg),
- dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+ if (func == POOL_SCAN_SCRUB) {
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+ } else if (func == POOL_SCAN_NONE) {
+ (void) snprintf(msg, sizeof (msg),
+ dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
+ zc.zc_name);
+ } else {
+ assert(!"unexpected result");
+ }
- if (errno == EBUSY)
- return (zfs_error(hdl, EZFS_RESILVERING, msg));
- else
+ if (errno == EBUSY) {
+ nvlist_t *nvroot;
+ pool_scan_stat_t *ps = NULL;
+ uint_t psc;
+
+ verify(nvlist_lookup_nvlist(zhp->zpool_config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
+ if (ps && ps->pss_func == POOL_SCAN_SCRUB)
+ return (zfs_error(hdl, EZFS_SCRUBBING, msg));
+ else
+ return (zfs_error(hdl, EZFS_RESILVERING, msg));
+ } else if (errno == ENOENT) {
+ return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
+ } else {
return (zpool_standard_error(hdl, errno, msg));
+ }
}
/*
@@ -2987,7 +2966,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
* open a misbehaving device, which can have undesirable
* effects.
*/
- if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) != 0 ||
vs->vs_state >= VDEV_STATE_DEGRADED) &&
zhp != NULL &&
diff --git a/usr/src/lib/libzfs/common/libzfs_sendrecv.c b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
index 95031653eb..672e004ef5 100644
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
@@ -852,6 +852,7 @@ typedef struct send_dump_data {
avl_tree_t *fsavl;
snapfilter_cb_t *filter_cb;
void *filter_cb_arg;
+ nvlist_t *debugnv;
} send_dump_data_t;
/*
@@ -860,10 +861,11 @@ typedef struct send_dump_data {
*/
static int
dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
- int outfd, boolean_t enoent_ok, boolean_t *got_enoent)
+ int outfd, boolean_t enoent_ok, boolean_t *got_enoent, nvlist_t *debugnv)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
+ nvlist_t *thisdbg;
assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
assert(fromsnap == NULL || fromsnap[0] == '\0' || !fromorigin);
@@ -876,11 +878,24 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
*got_enoent = B_FALSE;
+ VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
+ if (fromsnap && fromsnap[0] != '\0') {
+ VERIFY(0 == nvlist_add_string(thisdbg,
+ "fromsnap", fromsnap));
+ }
+
if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) {
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"warning: cannot send '%s'"), zhp->zfs_name);
+ VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno));
+ if (debugnv) {
+ VERIFY(0 == nvlist_add_nvlist(debugnv,
+ zhp->zfs_name, thisdbg));
+ }
+ nvlist_free(thisdbg);
+
switch (errno) {
case EXDEV:
@@ -920,6 +935,10 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
}
}
+ if (debugnv)
+ VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
+ nvlist_free(thisdbg);
+
return (0);
}
@@ -1000,7 +1019,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
err = dump_ioctl(zhp, sdd->prevsnap,
sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate),
- sdd->outfd, B_TRUE, &got_enoent);
+ sdd->outfd, B_TRUE, &got_enoent, sdd->debugnv);
if (got_enoent)
err = 0;
@@ -1176,7 +1195,7 @@ again:
int
zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
- void *cb_arg)
+ void *cb_arg, nvlist_t **debugnvp)
{
char errbuf[1024];
send_dump_data_t sdd = { 0 };
@@ -1276,7 +1295,10 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
err = nvlist_pack(hdrnv, &packbuf, &buflen,
NV_ENCODE_XDR, 0);
- nvlist_free(hdrnv);
+ if (debugnvp)
+ *debugnvp = hdrnv;
+ else
+ nvlist_free(hdrnv);
if (err) {
fsavl_destroy(fsavl);
nvlist_free(fss);
@@ -1351,6 +1373,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
sdd.verbose = flags.verbose;
sdd.filter_cb = filter_func;
sdd.filter_cb_arg = cb_arg;
+ if (debugnvp)
+ sdd.debugnv = *debugnvp;
err = dump_filesystems(zhp, &sdd);
fsavl_destroy(fsavl);
nvlist_free(fss);
diff --git a/usr/src/lib/libzfs/common/libzfs_status.c b/usr/src/lib/libzfs/common/libzfs_status.c
index c4f907733f..24725ec044 100644
--- a/usr/src/lib/libzfs/common/libzfs_status.c
+++ b/usr/src/lib/libzfs/common/libzfs_status.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -138,7 +137,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
if (find_vdev_problem(child[c], func))
return (B_TRUE);
} else {
- verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
if (func(vs->vs_state, vs->vs_aux,
@@ -173,7 +172,8 @@ check_status(nvlist_t *config, boolean_t isimport)
{
nvlist_t *nvroot;
vdev_stat_t *vs;
- uint_t vsc;
+ pool_scan_stat_t *ps = NULL;
+ uint_t vsc, psc;
uint64_t nerr;
uint64_t version;
uint64_t stateval;
@@ -184,15 +184,24 @@ check_status(nvlist_t *config, boolean_t isimport)
&version) == 0);
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
&stateval) == 0);
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
+
+ /*
+ * Currently resilvering a vdev
+ */
+ (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+ (uint64_t **)&ps, &psc);
+ if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+ ps->pss_state == DSS_SCANNING)
+ return (ZPOOL_STATUS_RESILVERING);
/*
* Pool last accessed by another system.
*/
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
if (hostid != 0 && (unsigned long)hostid != gethostid() &&
stateval == POOL_STATE_ACTIVE)
return (ZPOOL_STATUS_HOSTID_MISMATCH);
@@ -289,12 +298,6 @@ check_status(nvlist_t *config, boolean_t isimport)
return (ZPOOL_STATUS_REMOVED_DEV);
/*
- * Currently resilvering
- */
- if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
- return (ZPOOL_STATUS_RESILVERING);
-
- /*
* Outdated, but usable, version
*/
if (version < SPA_VERSION)
diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c
index 98b56ff79a..2e73f76ea5 100644
--- a/usr/src/lib/libzfs/common/libzfs_util.c
+++ b/usr/src/lib/libzfs/common/libzfs_util.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -215,6 +214,11 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_POSTSPLIT_ONLINE:
return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
"into a new one"));
+ case EZFS_SCRUBBING:
+ return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
+ "use 'zpool scrub -s' to cancel current scrub"));
+ case EZFS_NO_SCRUB:
+ return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
diff --git a/usr/src/lib/libzfs/common/mapfile-vers b/usr/src/lib/libzfs/common/mapfile-vers
index 376f3ed985..dc68ed9bc2 100644
--- a/usr/src/lib/libzfs/common/mapfile-vers
+++ b/usr/src/lib/libzfs/common/mapfile-vers
@@ -19,8 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2010 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
#
#
@@ -45,7 +44,6 @@ SUNWprivate_1.1 {
fletcher_4_byteswap;
fletcher_4_incremental_native;
fletcher_4_incremental_byteswap;
- hist_event_table;
libzfs_errno;
libzfs_error_action;
libzfs_error_description;
@@ -73,6 +71,7 @@ SUNWprivate_1.1 {
zfs_get_pool_handle;
zfs_get_user_props;
zfs_get_type;
+ zfs_history_event_names;
zfs_hold;
zfs_hold_range;
zfs_is_mounted;
@@ -195,7 +194,7 @@ SUNWprivate_1.1 {
zpool_prop_values;
zpool_read_label;
zpool_refresh_stats;
- zpool_scrub;
+ zpool_scan;
zpool_search_import;
zpool_set_history_str;
zpool_set_prop;
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c
index fce227ffd3..65739f294c 100644
--- a/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include "libzfs_jni_util.h"
@@ -1055,7 +1054,7 @@ populate_DeviceStatsBean(JNIEnv *env, nvlist_t *vdev,
vdev_stat_t *vs;
int result = nvlist_lookup_uint64_array(
- vdev, ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &c);
+ vdev, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c);
if (result != 0) {
zjni_throw_exception(env,
"could not retrieve virtual device statistics");
diff --git a/usr/src/lib/libzpool/common/util.c b/usr/src/lib/libzpool/common/util.c
index 781edb6e8a..9b99531fd1 100644
--- a/usr/src/lib/libzpool/common/util.c
+++ b/usr/src/lib/libzpool/common/util.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <assert.h>
@@ -90,7 +89,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
if (is_log)
prefix = "log ";
- if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) != 0)
vs = &v0;
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 301a3ab217..abbecd9a88 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1343,7 +1343,7 @@ ZFS_COMMON_OBJS += \
dmu_zfetch.o \
dsl_deleg.o \
dsl_prop.o \
- dsl_scrub.o \
+ dsl_scan.o \
gzip.o \
lzjb.o \
metaslab.o \
@@ -1372,6 +1372,7 @@ ZFS_COMMON_OBJS += \
zap_leaf.o \
zap_micro.o \
zfs_byteswap.o \
+ zfs_debug.o \
zfs_fm.o \
zfs_fuid.o \
zfs_sa.o \
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index f485fe9f7c..057f207bc6 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -437,6 +436,7 @@ struct arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
+ void *b_thawed;
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
@@ -545,8 +545,8 @@ static buf_hash_table_t buf_hash_table;
(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define HDR_LOCK(buf) \
- (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define HDR_LOCK(hdr) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
uint64_t zfs_crc64_table[256];
@@ -664,6 +664,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+ hdr->b_dva.dva_word[0] = 0;
+ hdr->b_dva.dva_word[1] = 0;
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+}
+
static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
@@ -797,7 +806,8 @@ buf_cons(void *vbuf, void *unused, int kmflag)
arc_buf_t *buf = vbuf;
bzero(buf, sizeof (arc_buf_t));
- rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
@@ -826,7 +836,8 @@ buf_dest(void *vbuf, void *unused)
{
arc_buf_t *buf = vbuf;
- rw_destroy(&buf->b_lock);
+ mutex_destroy(&buf->b_evict_lock);
+ rw_destroy(&buf->b_data_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
@@ -941,6 +952,11 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
void
arc_buf_thaw(arc_buf_t *buf)
{
+ kmutex_t *hash_lock;
+
+ hash_lock = HDR_LOCK(buf->b_hdr);
+ mutex_enter(hash_lock);
+
if (zfs_flags & ZFS_DEBUG_MODIFY) {
if (buf->b_hdr->b_state != arc_anon)
panic("modifying non-anon buffer!");
@@ -954,18 +970,32 @@ arc_buf_thaw(arc_buf_t *buf)
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
buf->b_hdr->b_freeze_cksum = NULL;
}
+
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_thawed)
+ kmem_free(buf->b_hdr->b_thawed, 1);
+ buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ }
+
mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(hash_lock);
}
void
arc_buf_freeze(arc_buf_t *buf)
{
+ kmutex_t *hash_lock;
+
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ hash_lock = HDR_LOCK(buf->b_hdr);
+ mutex_enter(hash_lock);
+
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
buf->b_hdr->b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
+ mutex_exit(hash_lock);
}
static void
@@ -1037,7 +1067,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(new_state != old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
- ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon);
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
from_delta = to_delta = ab->b_datacnt * ab->b_size;
@@ -1059,7 +1088,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
/*
* If prefetching out of the ghost cache,
- * we will have a non-null datacnt.
+ * we will have a non-zero datacnt.
*/
if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
/* ghost elements have a ghost size */
@@ -1095,9 +1124,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon) {
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
buf_hash_remove(ab);
- }
/* adjust state sizes */
if (to_delta)
@@ -1254,7 +1282,6 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr;
- rw_enter(&buf->b_lock, RW_WRITER);
ASSERT(buf->b_data != NULL);
hdr = buf->b_hdr;
(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
@@ -1263,7 +1290,6 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
buf->b_private = NULL;
atomic_add_64(&arc_loaned_bytes, hdr->b_size);
- rw_exit(&buf->b_lock);
}
static arc_buf_t *
@@ -1299,16 +1325,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
* must verify b_data != NULL to know if the add_ref
* was successful.
*/
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
if (buf->b_data == NULL) {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return;
}
- hdr = buf->b_hdr;
- ASSERT(hdr != NULL);
- hash_lock = HDR_LOCK(hdr);
+ hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
- rw_exit(&buf->b_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ mutex_exit(&buf->b_evict_lock);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
add_reference(hdr, hash_lock, tag);
@@ -1394,6 +1420,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
continue;
*bufp = buf->b_next;
+ buf->b_next = NULL;
ASSERT(buf->b_efunc == NULL);
@@ -1442,23 +1469,21 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
if (!BUF_EMPTY(hdr)) {
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
}
while (hdr->b_buf) {
arc_buf_t *buf = hdr->b_buf;
if (buf->b_efunc) {
mutex_enter(&arc_eviction_mtx);
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
ASSERT(buf->b_hdr != NULL);
arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
hdr->b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
mutex_exit(&arc_eviction_mtx);
} else {
arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
@@ -1468,6 +1493,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
+ if (hdr->b_thawed) {
+ kmem_free(hdr->b_thawed, 1);
+ hdr->b_thawed = NULL;
+ }
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1488,6 +1517,9 @@ arc_buf_free(arc_buf_t *buf, void *tag)
kmutex_t *hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
(void) remove_reference(hdr, hash_lock, tag);
if (hdr->b_datacnt > 1) {
arc_buf_destroy(buf, FALSE, TRUE);
@@ -1512,12 +1544,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
if (destroy_hdr)
arc_hdr_destroy(hdr);
} else {
- if (remove_reference(hdr, NULL, tag) > 0) {
- ASSERT(HDR_IO_ERROR(hdr));
+ if (remove_reference(hdr, NULL, tag) > 0)
arc_buf_destroy(buf, FALSE, TRUE);
- } else {
+ else
arc_hdr_destroy(hdr);
- }
}
}
@@ -1535,6 +1565,8 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
}
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT(hdr->b_state != arc_anon);
ASSERT(buf->b_data != NULL);
@@ -1613,7 +1645,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
arc_buf_t *buf = ab->b_buf;
- if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
missed += 1;
break;
}
@@ -1635,9 +1667,9 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&arc_eviction_mtx);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
} else {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
arc_buf_destroy(buf,
buf->b_data == stolen, TRUE);
}
@@ -1854,9 +1886,9 @@ arc_do_user_evicts(void)
while (arc_eviction_list != NULL) {
arc_buf_t *buf = arc_eviction_list;
arc_eviction_list = buf->b_next;
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
buf->b_hdr = NULL;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
mutex_exit(&arc_eviction_mtx);
if (buf->b_efunc != NULL)
@@ -2438,7 +2470,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
- bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+ if (zio == NULL || zio->io_error == 0)
+ bcopy(buf->b_data, arg, buf->b_hdr->b_size);
VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
@@ -2452,6 +2485,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
*bufp = NULL;
} else {
*bufp = buf;
+ ASSERT(buf->b_data);
}
}
@@ -2606,13 +2640,22 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
{
int err;
+ if (pbuf == NULL) {
+ /*
+ * XXX This happens from traverse callback funcs, for
+ * the objset_phys_t block.
+ */
+ return (arc_read_nolock(pio, spa, bp, done, private, priority,
+ zio_flags, arc_flags, zb));
+ }
+
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
- rw_enter(&pbuf->b_lock, RW_READER);
+ rw_enter(&pbuf->b_data_lock, RW_READER);
err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb);
- rw_exit(&pbuf->b_lock);
+ rw_exit(&pbuf->b_data_lock);
return (err);
}
@@ -2721,9 +2764,7 @@ top:
if (exists) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
(void) arc_buf_remove_ref(buf, private);
goto top; /* restart the IO request */
}
@@ -2901,14 +2942,14 @@ arc_buf_evict(arc_buf_t *buf)
kmutex_t *hash_lock;
arc_buf_t **bufp;
- rw_enter(&buf->b_lock, RW_WRITER);
+ mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
if (hdr == NULL) {
/*
* We are in arc_do_user_evicts().
*/
ASSERT(buf->b_data == NULL);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (0);
} else if (buf->b_data == NULL) {
arc_buf_t copy = *buf; /* structure assignment */
@@ -2917,14 +2958,15 @@ arc_buf_evict(arc_buf_t *buf)
* but let arc_do_user_evicts() do the reaping.
*/
buf->b_efunc = NULL;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
VERIFY(copy.b_efunc(&copy) == 0);
return (1);
}
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT(buf->b_hdr == hdr);
ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
@@ -2943,6 +2985,7 @@ arc_buf_evict(arc_buf_t *buf)
arc_state_t *old_state = hdr->b_state;
arc_state_t *evicted_state;
+ ASSERT(hdr->b_buf == NULL);
ASSERT(refcount_is_zero(&hdr->b_refcnt));
evicted_state =
@@ -2960,12 +3003,13 @@ arc_buf_evict(arc_buf_t *buf)
mutex_exit(&old_state->arcs_mtx);
}
mutex_exit(hash_lock);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
VERIFY(buf->b_efunc(buf) == 0);
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_hdr = NULL;
+ buf->b_next = NULL;
kmem_cache_free(buf_cache, buf);
return (1);
}
@@ -2980,12 +3024,17 @@ void
arc_release(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
+ kmutex_t *hash_lock = NULL;
l2arc_buf_hdr_t *l2hdr;
uint64_t buf_size;
- boolean_t released = B_FALSE;
- rw_enter(&buf->b_lock, RW_WRITER);
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
/* this buffer is not on any list */
@@ -2993,15 +3042,12 @@ arc_release(arc_buf_t *buf, void *tag)
if (hdr->b_state == arc_anon) {
/* this buffer is already released */
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
- ASSERT(BUF_EMPTY(hdr));
ASSERT(buf->b_efunc == NULL);
- arc_buf_thaw(buf);
- rw_exit(&buf->b_lock);
- released = B_TRUE;
} else {
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
}
l2hdr = hdr->b_l2hdr;
@@ -3011,9 +3057,6 @@ arc_release(arc_buf_t *buf, void *tag)
buf_size = hdr->b_size;
}
- if (released)
- goto out;
-
/*
* Do we have more than one buf?
*/
@@ -3027,14 +3070,14 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
/*
- * Pull the data off of this buf and attach it to
- * a new anonymous buf.
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr.
*/
(void) remove_reference(hdr, hash_lock, tag);
bufp = &hdr->b_buf;
while (*bufp != buf)
bufp = &(*bufp)->b_next;
- *bufp = (*bufp)->b_next;
+ *bufp = buf->b_next;
buf->b_next = NULL;
ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
@@ -3062,26 +3105,25 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_freeze_cksum = NULL;
(void) refcount_add(&nhdr->b_refcnt, tag);
buf->b_hdr = nhdr;
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
} else {
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- arc_change_state(arc_anon, hdr, hash_lock);
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
- mutex_exit(hash_lock);
+ if (hash_lock)
+ mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
+ buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
buf->b_efunc = NULL;
buf->b_private = NULL;
-out:
if (l2hdr) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@@ -3090,14 +3132,27 @@ out:
}
}
+/*
+ * Release this buffer. If it does not match the provided BP, fill it
+ * with that block's contents.
+ */
+/* ARGSUSED */
+int
+arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb)
+{
+ arc_release(buf, tag);
+ return (0);
+}
+
int
arc_released(arc_buf_t *buf)
{
int released;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (released);
}
@@ -3106,9 +3161,9 @@ arc_has_callback(arc_buf_t *buf)
{
int callback;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
callback = (buf->b_efunc != NULL);
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (callback);
}
@@ -3118,9 +3173,9 @@ arc_referenced(arc_buf_t *buf)
{
int referenced;
- rw_enter(&buf->b_lock, RW_READER);
+ mutex_enter(&buf->b_evict_lock);
referenced = (refcount_count(&buf->b_hdr->b_refcnt));
- rw_exit(&buf->b_lock);
+ mutex_exit(&buf->b_evict_lock);
return (referenced);
}
#endif
@@ -3173,8 +3228,8 @@ arc_write_done(zio_t *zio)
/*
* If the block to be written was all-zero, we may have
* compressed it away. In this case no write was performed
- * so there will be no dva/birth-date/checksum. The buffer
- * must therefor remain anonymous (and uncached).
+ * so there will be no dva/birth/checksum. The buffer must
+ * therefore remain anonymous (and uncached).
*/
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
@@ -3278,9 +3333,7 @@ arc_free(spa_t *spa, const blkptr_t *bp)
if (HDR_IN_HASH_TABLE(ab))
buf_hash_remove(ab);
ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
+ buf_discard_identity(ab);
ab->b_buf->b_efunc = NULL;
ab->b_buf->b_private = NULL;
mutex_exit(hash_lock);
@@ -3974,11 +4027,11 @@ l2arc_read_done(zio_t *zio)
ASSERT(cb != NULL);
buf = cb->l2rcb_buf;
ASSERT(buf != NULL);
- hdr = buf->b_hdr;
- ASSERT(hdr != NULL);
- hash_lock = HDR_LOCK(hdr);
+ hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
+ hdr = buf->b_hdr;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
/*
* Check this survived the L2ARC journey.
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
index 830b6c1a42..8f8c5e1fcf 100644
--- a/usr/src/uts/common/fs/zfs/bplist.c
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -175,23 +175,26 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
return (err);
}
- if (*itorp >= bpl->bpl_phys->bpl_entries) {
- mutex_exit(&bpl->bpl_lock);
- return (ENOENT);
- }
+ do {
+ if (*itorp >= bpl->bpl_phys->bpl_entries) {
+ mutex_exit(&bpl->bpl_lock);
+ return (ENOENT);
+ }
- blk = *itorp >> bpl->bpl_bpshift;
- off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
+ blk = *itorp >> bpl->bpl_bpshift;
+ off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ bparray = bpl->bpl_cached_dbuf->db_data;
+ *bp = bparray[off];
+ (*itorp)++;
+ } while (bp->blk_birth == 0);
- bparray = bpl->bpl_cached_dbuf->db_data;
- *bp = bparray[off];
- (*itorp)++;
mutex_exit(&bpl->bpl_lock);
return (0);
}
@@ -206,8 +209,10 @@ bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp));
mutex_enter(&bpl->bpl_lock);
err = bplist_hold(bpl);
- if (err)
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
return (err);
+ }
blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index c211ff79dd..e1cd431acb 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -363,6 +363,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
}
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+ (db->db_buf == NULL || db->db_buf->b_data) &&
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
db->db_state != DB_FILL && !dn->dn_free_txg) {
/*
@@ -477,8 +478,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, NULL);
+ dbuf_rele_and_unlock(db, NULL);
}
static void
@@ -549,7 +549,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
else
pbuf = db->db_objset->os_phys_buf;
- (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+ (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -727,7 +727,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
/* free this block */
if (!BP_IS_HOLE(bp))
- dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);
+ zio_free(db->db_dnode->dn_objset->os_spa, txg, bp);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
/*
@@ -921,6 +921,26 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
dnode_willuse_space(db->db_dnode, size-osize, tx);
}
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+ objset_t *os = db->db_dnode->dn_objset;
+ zbookmark_t zb;
+
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(arc_released(os->os_phys_buf) ||
+ list_link_active(&os->os_dsl_dataset->ds_synced_link));
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+ zb.zb_objset = os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ (void) arc_release_bp(db->db_buf, db,
+ db->db_blkptr, os->os_spa, &zb);
+}
+
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
@@ -1717,7 +1737,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
else
pbuf = dn->dn_objset->os_phys_buf;
- (void) arc_read(NULL, dn->dn_objset->os_spa,
+ (void) dsl_read(NULL, dn->dn_objset->os_spa,
bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
@@ -2463,7 +2483,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
if (BP_IS_HOLE(db->db_blkptr)) {
arc_buf_thaw(data);
} else {
- arc_release(data, db);
+ dbuf_release_bp(db);
}
}
}
diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c
index 852fd1cdc4..64cbcc1f92 100644
--- a/usr/src/uts/common/fs/zfs/ddt.c
+++ b/usr/src/uts/common/fs/zfs/ddt.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -35,6 +34,7 @@
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
&ddt_zap_ops,
@@ -160,7 +160,7 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt->ddt_object[type][class], dde));
}
-static int
+int
ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt_entry_t *dde, dmu_tx_t *tx)
{
@@ -245,12 +245,13 @@ ddt_bp_create(enum zio_checksum checksum,
ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
bp->blk_cksum = ddk->ddk_cksum;
+ bp->blk_fill = 1;
BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
BP_SET_CHECKSUM(bp, checksum);
- BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_TYPE(bp, DMU_OT_DEDUP);
BP_SET_LEVEL(bp, 0);
BP_SET_DEDUP(bp, 0);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
@@ -996,10 +997,17 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
ddt_object_create(ddt, ntype, nclass, tx);
VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
- if (dp->dp_scrub_func != SCRUB_FUNC_NONE &&
- oclass > nclass &&
- nclass <= dp->dp_scrub_ddt_class_max)
- dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde);
+ /*
+ * If the class changes, the order that we scan this bp
+ * changes. If it decreases, we could miss it, so
+ * scan it right now. (This covers both class changing
+ * while we are doing ddt_walk(), and when we are
+ * traversing.)
+ */
+ if (nclass < oclass) {
+ dsl_scan_ddt_entry(dp->dp_scan,
+ ddt->ddt_checksum, dde, tx);
+ }
}
}
@@ -1013,7 +1021,6 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
if (avl_numnodes(&ddt->ddt_tree) == 0)
return;
- ASSERT(spa_sync_pass(spa) == 1);
ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
if (spa->spa_ddt_stat_object == 0) {
@@ -1081,6 +1088,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
ddb->ddb_type, ddb->ddb_class,
&ddb->ddb_cursor, dde);
}
+ dde->dde_type = ddb->ddb_type;
+ dde->dde_class = ddb->ddb_class;
if (error == 0)
return (0);
if (error != ENOENT)
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 0be72aa4f2..582089b8e8 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -84,7 +84,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "FUID table" },
{ byteswap_uint64_array, TRUE, "FUID table size" },
{ zap_byteswap, TRUE, "DSL dataset next clones"},
- { zap_byteswap, TRUE, "scrub work queue" },
+ { zap_byteswap, TRUE, "scan work queue" },
{ zap_byteswap, TRUE, "ZFS user/group used" },
{ zap_byteswap, TRUE, "ZFS user/group quota" },
{ zap_byteswap, TRUE, "snapshot refcount tags"},
@@ -93,7 +93,10 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "System attributes" },
{ zap_byteswap, TRUE, "SA master node" },
{ zap_byteswap, TRUE, "SA attr registration" },
- { zap_byteswap, TRUE, "SA attr layouts" }, };
+ { zap_byteswap, TRUE, "SA attr layouts" },
+ { zap_byteswap, TRUE, "scan translations" },
+ { byteswap_uint8_array, FALSE, "deduplicated block" },
+};
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
@@ -1630,6 +1633,7 @@ byteswap_uint8_array(void *vbuf, size_t size)
void
dmu_init(void)
{
+ zfs_dbgmsg_init();
dbuf_init();
dnode_init();
zfetch_init();
@@ -1649,4 +1653,5 @@ dmu_fini(void)
l2arc_fini();
xuio_stat_fini();
sa_cache_fini();
+ zfs_dbgmsg_fini();
}
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 210d693051..546cd98b84 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -259,11 +259,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dprintf_bp(os->os_rootbp, "reading %s", "");
/*
- * NB: when bprewrite scrub can change the bp,
+ * XXX when bprewrite scrub can change the bp,
* and this is called from dmu_objset_open_ds_os, the bp
* could change, and we'll need a lock.
*/
- err = arc_read_nolock(NULL, spa, os->os_rootbp,
+ err = dsl_read_nolock(NULL, spa, os->os_rootbp,
arc_getbuf_func, &os->os_phys_buf,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err) {
@@ -628,6 +628,7 @@ struct oscarg {
const char *lastname;
dmu_objset_type_t type;
uint64_t flags;
+ cred_t *cr;
};
/*ARGSUSED*/
@@ -659,7 +660,7 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct oscarg *oa = arg2;
@@ -668,7 +669,7 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
dsobj = dsl_dataset_create_sync(dd, oa->lastname,
- oa->clone_origin, oa->flags, cr, tx);
+ oa->clone_origin, oa->flags, oa->cr, tx);
if (oa->clone_origin == NULL) {
dsl_dataset_t *ds;
@@ -684,12 +685,12 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
ds, bp, oa->type, tx);
if (oa->userfunc)
- oa->userfunc(os, oa->userarg, cr, tx);
+ oa->userfunc(os, oa->userarg, oa->cr, tx);
dsl_dataset_rele(ds, FTAG);
}
- spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
- tx, cr, "dataset = %llu", dsobj);
+ spa_history_log_internal(LOG_DS_CREATE, dd->dd_pool->dp_spa,
+ tx, "dataset = %llu", dsobj);
}
int
@@ -715,6 +716,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
oa.lastname = tail;
oa.type = type;
oa.flags = flags;
+ oa.cr = CRED();
err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
dmu_objset_create_sync, pdd, &oa, 5);
@@ -742,6 +744,7 @@ dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
oa.lastname = tail;
oa.clone_origin = clone_origin;
oa.flags = flags;
+ oa.cr = CRED();
err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
dmu_objset_create_sync, pdd, &oa, 5);
@@ -795,19 +798,19 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
objset_t *os = arg1;
dsl_dataset_t *ds = os->os_dsl_dataset;
struct snaparg *sn = arg2;
- dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+ dsl_dataset_snapshot_sync(ds, sn->snapname, tx);
if (sn->props) {
dsl_props_arg_t pa;
pa.pa_props = sn->props;
pa.pa_source = ZPROP_SRC_LOCAL;
- dsl_props_set_sync(ds->ds_prev, &pa, cr, tx);
+ dsl_props_set_sync(ds->ds_prev, &pa, tx);
}
}
@@ -1016,11 +1019,11 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
/*
* Create the root block IO
*/
- arc_release(os->os_phys_buf, &os->os_phys_buf);
-
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf,
+ os->os_rootbp, os->os_spa, &zb));
dmu_write_policy(os, NULL, 0, 0, &zp);
@@ -1082,7 +1085,7 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg)
!list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
}
-static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
void
dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
@@ -1649,7 +1652,7 @@ dmu_objset_prefetch(const char *name, void *arg)
SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
+ (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds),
&ds->ds_phys->ds_bp, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 86c428b5f2..a675e28b15 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -301,7 +301,7 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
/* ARGSUSED */
static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct backuparg *ba = arg;
@@ -330,7 +330,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
- if (arc_read_nolock(NULL, spa, bp,
+ if (dsl_read(NULL, spa, bp, pbuf,
arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
return (EIO);
@@ -361,7 +361,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
- if (arc_read_nolock(NULL, spa, bp,
+ if (dsl_read(NULL, spa, bp, pbuf,
arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
return (EIO);
@@ -504,6 +504,7 @@ struct recvbeginsyncarg {
uint64_t dsflags;
char clonelastname[MAXNAMELEN];
dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
+ cred_t *cr;
};
/* ARGSUSED */
@@ -536,7 +537,7 @@ recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct recvbeginsyncarg *rbsa = arg2;
@@ -545,7 +546,7 @@ recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
/* Create and open new dataset. */
dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
- rbsa->origin, flags, cr, tx);
+ rbsa->origin, flags, rbsa->cr, tx);
VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
B_TRUE, dmu_recv_tag, &rbsa->ds));
@@ -554,8 +555,8 @@ recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
}
- spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
- dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj);
+ spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
+ dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
}
/* ARGSUSED */
@@ -630,7 +631,7 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* ARGSUSED */
static void
-recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ohds = arg1;
struct recvbeginsyncarg *rbsa = arg2;
@@ -641,7 +642,7 @@ recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
/* create and open the temporary clone */
dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
- ohds->ds_prev, flags, cr, tx);
+ ohds->ds_prev, flags, rbsa->cr, tx);
VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
/*
@@ -655,8 +656,8 @@ recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
rbsa->ds = cds;
- spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
- dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+ spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
+ dp->dp_spa, tx, "dataset = %lld", dsobj);
}
@@ -701,6 +702,7 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
rbsa.type = drrb->drr_type;
rbsa.tag = FTAG;
rbsa.dsflags = 0;
+ rbsa.cr = CRED();
versioninfo = drrb->drr_versioninfo;
flags = drrb->drr_flags;
@@ -1466,12 +1468,12 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
struct recvendsyncarg *resa = arg2;
- dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
+ dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
/* set snapshot's creation time and guid */
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
index 653c3a2d41..429c76ae11 100644
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -77,7 +76,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
- (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+ (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
return (0);
}
@@ -102,7 +101,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp));
- (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+ (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
td->td_arg);
}
return (0);
@@ -140,7 +139,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
boolean_t hard = td->td_flags & TRAVERSE_HARD;
if (bp->blk_birth == 0) {
- err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
+ td->td_arg);
return (err);
}
@@ -160,7 +160,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
}
if (td->td_flags & TRAVERSE_PRE) {
- err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+ td->td_arg);
if (err)
return (err);
}
@@ -171,7 +172,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
- err = arc_read(NULL, td->td_spa, bp, pbuf,
+ err = dsl_read(NULL, td->td_spa, bp, pbuf,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
@@ -195,7 +196,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
- err = arc_read(NULL, td->td_spa, bp, pbuf,
+ err = dsl_read(NULL, td->td_spa, bp, pbuf,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
@@ -217,7 +218,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
objset_phys_t *osp;
dnode_phys_t *dnp;
- err = arc_read_nolock(NULL, td->td_spa, bp,
+ err = dsl_read_nolock(NULL, td->td_spa, bp,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err)
@@ -252,8 +253,10 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
if (buf)
(void) arc_buf_remove_ref(buf, &buf);
- if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST))
- err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+ if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
+ err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+ td->td_arg);
+ }
return (err != 0 ? err : lasterr);
}
@@ -275,16 +278,17 @@ traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
break;
lasterr = err;
}
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- SET_BOOKMARK(&czb, objset,
- object, 0, DMU_SPILL_BLKID);
- err = traverse_visitbp(td, dnp, buf,
- (blkptr_t *)&dnp->dn_spill, &czb);
- if (err) {
- if (!hard)
- break;
- lasterr = err;
- }
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ SET_BOOKMARK(&czb, objset,
+ object, 0, DMU_SPILL_BLKID);
+ err = traverse_visitbp(td, dnp, buf,
+ (blkptr_t *)&dnp->dn_spill, &czb);
+ if (err) {
+ if (!hard)
+ return (err);
+ lasterr = err;
}
}
return (err != 0 ? err : lasterr);
@@ -293,7 +297,8 @@ traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
+ void *arg)
{
struct prefetch_data *pfd = arg;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
@@ -314,7 +319,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
cv_broadcast(&pfd->pd_cv);
mutex_exit(&pfd->pd_mtx);
- (void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+ (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, zb);
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index 523aad70da..fa5747c7aa 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -227,7 +227,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
if (db->db_state != DB_CACHED)
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- arc_release(db->db_buf, db);
+ dbuf_release_bp(db);
bp = (blkptr_t *)db->db.db_data;
epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index ee0ccd7d01..c645d4d785 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -38,6 +38,7 @@
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
static char *dsl_reaper = "the grim reaper";
@@ -80,7 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
int uncompressed = BP_GET_UCSIZE(bp);
int64_t delta;
- dprintf_bp(bp, "born, ds=%p\n", ds);
+ dprintf_bp(bp, "ds=%p", ds);
ASSERT(dmu_tx_is_syncing(tx));
/* It could have been compressed away to nothing */
@@ -100,6 +101,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
return;
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
@@ -150,7 +152,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
int64_t delta;
- dprintf_bp(bp, "freeing: %s", "");
+ dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
dsl_free(tx->tx_pool, tx->tx_txg, bp);
mutex_enter(&ds->ds_dir->dd_lock);
@@ -191,7 +193,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
ds->ds_prev->ds_phys->ds_unique_bytes += used;
mutex_exit(&ds->ds_prev->ds_lock);
}
- if (bp->blk_birth > ds->ds_origin_txg) {
+ if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
dsl_dir_transfer_space(ds->ds_dir, used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
@@ -397,19 +399,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
ds->ds_phys->ds_prev_snap_obj,
ds, &ds->ds_prev);
}
-
- if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
- dsl_dataset_t *origin;
-
- err = dsl_dataset_hold_obj(dp,
- ds->ds_dir->dd_phys->dd_origin_obj,
- FTAG, &origin);
- if (err == 0) {
- ds->ds_origin_txg =
- origin->ds_phys->ds_creation_txg;
- dsl_dataset_rele(origin, FTAG);
- }
- }
} else {
if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
err = dsl_dataset_get_snapname(ds);
@@ -876,10 +865,6 @@ dsl_snapshot_destroy_one(const char *name, void *arg)
struct dsl_ds_destroyarg *dsda;
dsl_dataset_make_exclusive(ds, da->dstg);
- if (ds->ds_objset != NULL) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
dsda->ds = ds;
dsda->defer = da->defer;
@@ -989,11 +974,6 @@ dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
return (error);
dsda->rm_origin = origin;
dsl_dataset_make_exclusive(origin, tag);
-
- if (origin->ds_objset != NULL) {
- dmu_objset_evict(origin->ds_objset);
- origin->ds_objset = NULL;
- }
}
return (0);
@@ -1020,10 +1000,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
/* Destroying a snapshot is simpler */
dsl_dataset_make_exclusive(ds, tag);
- if (ds->ds_objset != NULL) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
dsda.defer = defer;
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
@@ -1096,24 +1072,10 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
if (err)
goto out;
- if (ds->ds_objset) {
- /*
- * We need to sync out all in-flight IO before we try
- * to evict (the dataset evict func is trying to clear
- * the cached entries for this dataset in the ARC).
- */
- txg_wait_synced(dd->dd_pool, 0);
- }
-
/*
* Blow away the dsl_dir + head dataset.
*/
dsl_dataset_make_exclusive(ds, tag);
- if (ds->ds_objset) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
-
/*
* If we're removing a clone, we might also need to remove its
* origin.
@@ -1220,7 +1182,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
uint64_t mrs_used;
uint64_t dlused, dlcomp, dluncomp;
- ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+ ASSERT(!dsl_dataset_is_snapshot(ds));
if (ds->ds_phys->ds_prev_snap_obj != 0)
mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
@@ -1234,21 +1196,11 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
ds->ds_phys->ds_unique_bytes =
ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
- if (!DS_UNIQUE_IS_ACCURATE(ds) &&
- spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
SPA_VERSION_UNIQUE_ACCURATE)
ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
-static uint64_t
-dsl_dataset_unique(dsl_dataset_t *ds)
-{
- if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
- dsl_dataset_recalc_head_uniq(ds);
-
- return (ds->ds_phys->ds_unique_bytes);
-}
-
struct killarg {
dsl_dataset_t *ds;
dmu_tx_t *tx;
@@ -1256,7 +1208,7 @@ struct killarg {
/* ARGSUSED */
static int
-kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct killarg *ka = arg;
@@ -1315,7 +1267,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* ARGSUSED */
static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
@@ -1324,8 +1276,8 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
- spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
}
static int
@@ -1499,7 +1451,7 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
}
void
-dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
struct dsl_ds_destroyarg *dsda = arg1;
dsl_dataset_t *ds = dsda->ds;
@@ -1531,6 +1483,11 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
cv_broadcast(&ds->ds_exclusive_cv);
mutex_exit(&ds->ds_lock);
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
/* Remove our reservation */
if (ds->ds_reserved != 0) {
dsl_prop_setarg_t psa;
@@ -1541,13 +1498,13 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
&value);
psa.psa_effective_value = 0; /* predict default value */
- dsl_dataset_set_reservation_sync(ds, &psa, cr, tx);
+ dsl_dataset_set_reservation_sync(ds, &psa, tx);
ASSERT3U(ds->ds_reserved, ==, 0);
}
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
- dsl_pool_ds_destroyed(ds, tx);
+ dsl_scan_ds_destroyed(ds, tx);
obj = ds->ds_object;
@@ -1596,7 +1553,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
}
}
- if (ds->ds_phys->ds_next_snap_obj != 0) {
+ if (dsl_dataset_is_snapshot(ds)) {
blkptr_t bp;
zio_t *pio;
dsl_dataset_t *ds_next;
@@ -1608,7 +1565,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
- old_unique = dsl_dataset_unique(ds_next);
+ old_unique = ds_next->ds_phys->ds_unique_bytes;
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
ds_next->ds_phys->ds_prev_snap_obj =
@@ -1664,7 +1621,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
ds_next->ds_phys->ds_deadlist_obj));
ds->ds_phys->ds_deadlist_obj = 0;
- if (ds_next->ds_phys->ds_next_snap_obj != 0) {
+ if (dsl_dataset_is_snapshot(ds_next)) {
/*
* Update next's unique to include blocks which
* were previously shared by only this snapshot
@@ -1790,8 +1747,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
dsl_dataset_rele(ds_prev, FTAG);
spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
- spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
if (ds->ds_phys->ds_next_clones_obj != 0) {
uint64_t count;
@@ -1816,7 +1773,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
struct dsl_ds_destroyarg ndsda = {0};
ndsda.ds = dsda->rm_origin;
- dsl_dataset_destroy_sync(&ndsda, tag, cr, tx);
+ dsl_dataset_destroy_sync(&ndsda, tag, tx);
}
}
@@ -1833,7 +1790,8 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
* owned by the snapshot dataset must be accommodated by space
* outside of the reservation.
*/
- asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+ asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
return (ENOSPC);
@@ -1847,7 +1805,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
return (0);
}
-/* ARGSUSED */
int
dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1888,7 +1845,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
const char *snapname = arg2;
@@ -1959,9 +1916,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* since our unique space is going to zero.
*/
if (ds->ds_reserved) {
- int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+ int64_t delta;
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
- add, 0, 0, tx);
+ delta, 0, 0, tx);
}
bplist_close(&ds->ds_deadlist);
@@ -1987,11 +1946,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == dsl_dataset_get_ref(dp,
ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
- dsl_pool_ds_snapshotted(ds, tx);
+ dsl_scan_ds_snapshotted(ds, tx);
dsl_dir_snap_cmtime_update(ds->ds_dir);
- spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx,
"dataset = %llu", dsobj);
}
@@ -2035,7 +1994,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
ds->ds_phys->ds_guid);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
- dsl_dataset_unique(ds));
+ ds->ds_phys->ds_unique_bytes);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
ds->ds_object);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
@@ -2163,8 +2122,7 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
- cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
const char *newsnapname = arg2;
@@ -2188,8 +2146,8 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
ds->ds_snapname, 8, 1, &ds->ds_object, tx);
ASSERT3U(err, ==, 0);
- spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
- cr, "dataset = %llu", ds->ds_object);
+ spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
+ "dataset = %llu", ds->ds_object);
dsl_dataset_rele(hds, FTAG);
}
@@ -2371,14 +2329,14 @@ struct promotenode {
struct promotearg {
list_t shared_snaps, origin_snaps, clone_snaps;
- dsl_dataset_t *origin_origin, *origin_head;
+ dsl_dataset_t *origin_origin;
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
char *err_ds;
};
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static boolean_t snaplist_unstable(list_t *l);
-/* ARGSUSED */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -2479,19 +2437,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
/*
* Note, typically this will not be a clone of a clone,
- * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
+ * so dd_origin_txg will be < TXG_INITIAL, so
* these snaplist_space() -> bplist_space_birthrange()
* calls will be fast because they do not have to
* iterate over all bps.
*/
snap = list_head(&pa->origin_snaps);
err = snaplist_space(&pa->shared_snaps,
- snap->ds->ds_origin_txg, &pa->cloneusedsnap);
+ snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
if (err)
return (err);
err = snaplist_space(&pa->clone_snaps,
- snap->ds->ds_origin_txg, &space);
+ snap->ds->ds_dir->dd_origin_txg, &space);
if (err)
return (err);
pa->cloneusedsnap += space;
@@ -2510,7 +2468,7 @@ out:
}
static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *hds = arg1;
struct promotearg *pa = arg2;
@@ -2554,10 +2512,11 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dmu_buf_will_dirty(dd->dd_dbuf, tx);
ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
- hds->ds_origin_txg = origin_head->ds_origin_txg;
+ dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
dmu_buf_will_dirty(odd->dd_dbuf, tx);
odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
- origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;
+ origin_head->ds_dir->dd_origin_txg =
+ origin_ds->ds_phys->ds_creation_txg;
/* move snapshots to this dir */
for (snap = list_head(&pa->shared_snaps); snap;
@@ -2614,8 +2573,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
origin_ds->ds_phys->ds_unique_bytes = pa->unique;
/* log history record */
- spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
- cr, "dataset = %llu", hds->ds_object);
+ spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
+ "dataset = %llu", hds->ds_object);
dsl_dir_close(odd, FTAG);
}
@@ -2862,7 +2821,7 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* ARGSUSED */
static void
-dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
struct cloneswaparg *csa = arg1;
dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
@@ -2937,9 +2896,9 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
* changing that affects the snapused).
*/
VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
- csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
+ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used));
VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
- csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
+ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used));
dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
@@ -2975,7 +2934,7 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
csa->ohds->ds_phys->ds_deadlist_obj));
- dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
+ dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
}
/*
@@ -3110,24 +3069,24 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
}
-extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *);
+extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
void
-dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_prop_setarg_t *psa = arg2;
uint64_t effective_value = psa->psa_effective_value;
- dsl_prop_set_sync(ds, psa, cr, tx);
+ dsl_prop_set_sync(ds, psa, tx);
DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
if (ds->ds_quota != effective_value) {
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_quota = effective_value;
- spa_history_internal_log(LOG_DS_REFQUOTA,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ",
+ spa_history_log_internal(LOG_DS_REFQUOTA,
+ ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ",
(longlong_t)ds->ds_quota, ds->ds_object);
}
}
@@ -3188,7 +3147,9 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
mutex_enter(&ds->ds_lock);
- unique = dsl_dataset_unique(ds);
+ if (!DS_UNIQUE_IS_ACCURATE(ds))
+ dsl_dataset_recalc_head_uniq(ds);
+ unique = ds->ds_phys->ds_unique_bytes;
mutex_exit(&ds->ds_lock);
if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
@@ -3205,10 +3166,8 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
}
-/* ARGSUSED */
static void
-dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
- dmu_tx_t *tx)
+dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_prop_setarg_t *psa = arg2;
@@ -3216,14 +3175,15 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
uint64_t unique;
int64_t delta;
- dsl_prop_set_sync(ds, psa, cr, tx);
+ dsl_prop_set_sync(ds, psa, tx);
DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
- unique = dsl_dataset_unique(ds);
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ unique = ds->ds_phys->ds_unique_bytes;
delta = MAX(0, (int64_t)(effective_value - unique)) -
MAX(0, (int64_t)(ds->ds_reserved - unique));
ds->ds_reserved = effective_value;
@@ -3232,8 +3192,8 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
mutex_exit(&ds->ds_dir->dd_lock);
- spa_history_internal_log(LOG_DS_REFRESERV,
- ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
+ spa_history_log_internal(LOG_DS_REFRESERV,
+ ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu",
(longlong_t)effective_value, ds->ds_object);
}
@@ -3311,7 +3271,7 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
struct dsl_ds_holdarg *ha = arg2;
@@ -3343,8 +3303,8 @@ dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
htag, &now, tx));
}
- spa_history_internal_log(LOG_DS_USER_HOLD,
- dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag,
+ spa_history_log_internal(LOG_DS_USER_HOLD,
+ dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag,
(int)ha->temphold, ds->ds_object);
}
@@ -3495,10 +3455,6 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
*/
if (!ra->own)
return (EBUSY);
- if (ds->ds_objset) {
- dmu_objset_evict(ds->ds_objset);
- ds->ds_objset = NULL;
- }
}
dsda.ds = ds;
dsda.releasing = B_TRUE;
@@ -3509,7 +3465,7 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
}
static void
-dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
struct dsl_ds_releasearg *ra = arg1;
dsl_dataset_t *ds = ra->ds;
@@ -3520,6 +3476,11 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
uint64_t refs;
int error;
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
mutex_enter(&ds->ds_lock);
ds->ds_userrefs--;
refs = ds->ds_userrefs;
@@ -3536,11 +3497,11 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
dsda.ds = ds;
dsda.releasing = B_TRUE;
/* We already did the destroy_check */
- dsl_dataset_destroy_sync(&dsda, tag, cr, tx);
+ dsl_dataset_destroy_sync(&dsda, tag, tx);
}
- spa_history_internal_log(LOG_DS_USER_RELEASE,
- dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu",
+ spa_history_log_internal(LOG_DS_USER_RELEASE,
+ dp->dp_spa, tx, "<%s> %lld dataset = %llu",
ra->htag, (longlong_t)refs, dsobj);
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_deleg.c b/usr/src/uts/common/fs/zfs/dsl_deleg.c
index 04053fdf20..85490c8d5f 100644
--- a/usr/src/uts/common/fs/zfs/dsl_deleg.c
+++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -148,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
}
static void
-dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
nvlist_t *nvp = arg2;
@@ -183,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(zap_update(mos, jumpobj,
perm, 8, 1, &n, tx) == 0);
- spa_history_internal_log(LOG_DS_PERM_UPDATE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_UPDATE,
+ dd->dd_pool->dp_spa, tx,
"%s %s dataset = %llu", whokey, perm,
dd->dd_phys->dd_head_dataset_obj);
}
@@ -192,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
nvlist_t *nvp = arg2;
@@ -215,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
(void) zap_remove(mos, zapobj, whokey, tx);
VERIFY(0 == zap_destroy(mos, jumpobj, tx));
}
- spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE,
+ dd->dd_pool->dp_spa, tx,
"%s dataset = %llu", whokey,
dd->dd_phys->dd_head_dataset_obj);
continue;
@@ -236,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
VERIFY(0 == zap_destroy(mos,
jumpobj, tx));
}
- spa_history_internal_log(LOG_DS_PERM_REMOVE,
- dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PERM_REMOVE,
+ dd->dd_pool->dp_spa, tx,
"%s %s dataset = %llu", whokey, perm,
dd->dd_phys->dd_head_dataset_obj);
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 0dfb05da2d..ac86da6590 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -40,8 +39,7 @@
#include "zfs_namecheck.h"
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
- cred_t *cr, dmu_tx_t *tx);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
/* ARGSUSED */
@@ -64,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
spa_close(dd->dd_pool->dp_spa, dd);
/*
- * The props callback list should be empty since they hold the
- * dir open.
+ * The props callback list should have been cleaned up by
+ * objset_evict().
*/
list_destroy(&dd->dd_prop_cbs);
mutex_destroy(&dd->dd_lock);
@@ -136,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
}
+ if (dsl_dir_is_clone(dd)) {
+ dmu_buf_t *origin_bonus;
+ dsl_dataset_phys_t *origin_phys;
+
+ /*
+ * We can't open the origin dataset, because
+ * that would require opening this dsl_dir.
+ * Just look at its phys directly instead.
+ */
+ err = dmu_bonus_hold(dp->dp_meta_objset,
+ dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+ if (err)
+ goto errout;
+ origin_phys = origin_bonus->db_data;
+ dd->dd_origin_txg =
+ origin_phys->ds_creation_txg;
+ dmu_buf_rele(origin_bonus, FTAG);
+ }
+
winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
dsl_dir_evict);
if (winner) {
@@ -458,7 +475,7 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
void
-dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_dir_t *dd = ds->ds_dir;
@@ -477,7 +494,7 @@ dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
&value);
psa.psa_effective_value = 0; /* predict default value */
- dsl_dir_set_reservation_sync(ds, &psa, cr, tx);
+ dsl_dir_set_reservation_sync(ds, &psa, tx);
ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
@@ -652,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
if (used > quota) {
/* over quota */
myspace = 0;
-
- /*
- * While it's OK to be a little over quota, if
- * we think we are using more space than there
- * is in the pool (which is already 1.6% more than
- * dsl_pool_adjustedsize()), something is very
- * wrong.
- */
- ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa));
} else {
/*
* the lesser of the space provided by our parent and
@@ -1033,18 +1041,17 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (err);
}
-extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *);
+extern dsl_syncfunc_t dsl_prop_set_sync;
-/* ARGSUSED */
static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_dir_t *dd = ds->ds_dir;
dsl_prop_setarg_t *psa = arg2;
uint64_t effective_value = psa->psa_effective_value;
- dsl_prop_set_sync(ds, psa, cr, tx);
+ dsl_prop_set_sync(ds, psa, tx);
DSL_PROP_CHECK_PREDICTION(dd, psa);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -1053,8 +1060,8 @@ dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dd->dd_phys->dd_quota = effective_value;
mutex_exit(&dd->dd_lock);
- spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
- tx, cr, "%lld dataset = %llu ",
+ spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+ tx, "%lld dataset = %llu ",
(longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
}
@@ -1141,9 +1148,8 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (0);
}
-/* ARGSUSED */
static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_dir_t *dd = ds->ds_dir;
@@ -1152,7 +1158,7 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
uint64_t used;
int64_t delta;
- dsl_prop_set_sync(ds, psa, cr, tx);
+ dsl_prop_set_sync(ds, psa, tx);
DSL_PROP_CHECK_PREDICTION(dd, psa);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -1170,8 +1176,8 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
mutex_exit(&dd->dd_lock);
- spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
- tx, cr, "%lld dataset = %llu",
+ spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+ tx, "%lld dataset = %llu",
(longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
}
@@ -1240,7 +1246,6 @@ struct renamearg {
const char *mynewname;
};
-/*ARGSUSED*/
static int
dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1287,7 +1292,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
}
static void
-dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dir_t *dd = arg1;
struct renamearg *ra = arg2;
@@ -1336,8 +1341,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dd->dd_myname, 8, 1, &dd->dd_object, tx);
ASSERT3U(err, ==, 0);
- spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
- tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
+ spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+ tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
}
int
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 30a5611365..77aa4af4e3 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -19,14 +19,16 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
@@ -50,7 +52,7 @@ kmutex_t zfs_write_limit_lock;
static pgcnt_t old_physmem = 0;
-static int
+int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
uint64_t obj;
@@ -88,7 +90,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
offsetof(dsl_dataset_t, ds_synced_link));
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
1, 4, 0);
@@ -150,64 +151,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
if (err)
goto out;
- /* get scrub status */
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
- &dp->dp_scrub_func);
- if (err == 0) {
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
- &dp->dp_scrub_queue_obj);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_min_txg);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_max_txg);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
- sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
- &dp->dp_scrub_bookmark);
- if (err)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
- sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
- &dp->dp_scrub_ddt_bookmark);
- if (err && err != ENOENT)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
- &dp->dp_scrub_ddt_class_max);
- if (err && err != ENOENT)
- goto out;
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &spa->spa_scrub_errors);
- if (err)
- goto out;
- if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
- /*
- * A new-type scrub was in progress on an old
- * pool. Restart from the beginning, since the
- * old software may have changed the pool in the
- * meantime.
- */
- dsl_pool_scrub_restart(dp);
- }
- } else {
- /*
- * It's OK if there is no scrub in progress (and if
- * there was an I/O error, ignore it).
- */
- err = 0;
- }
+ err = dsl_scan_init(dp, txg);
out:
rw_exit(&dp->dp_config_rwlock);
@@ -247,9 +191,9 @@ dsl_pool_close(dsl_pool_t *dp)
arc_flush(dp->dp_spa);
txg_fini(dp);
+ dsl_scan_fini(dp);
rw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
- mutex_destroy(&dp->dp_scrub_cancel_lock);
taskq_destroy(dp->dp_vnrele_taskq);
if (dp->dp_blkstats)
kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
@@ -275,6 +219,9 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
ASSERT3U(err, ==, 0);
+ /* Initialize scan structures */
+ VERIFY3U(0, ==, dsl_scan_init(dp, txg));
+
/* create and open the root dir */
dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
@@ -318,6 +265,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
uint64_t data_written;
int err;
+ /*
+ * We need to copy dp_space_towrite[] before doing
+ * dsl_sync_task_group_sync(), because
+ * dsl_dataset_snapshot_reserve_space() will increase
+ * dp_space_towrite but not actually write anything.
+ */
+ data_written = dp->dp_space_towrite[txg & TXG_MASK];
+
tx = dmu_tx_create_assigned(dp, txg);
dp->dp_read_overhead = 0;
@@ -347,7 +302,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
/*
* Sync the datasets again to push out the changes due to
- * userquota updates. This must be done before we process the
+ * userspace updates. This must be done before we process the
* sync tasks, because that could cause a snapshot of a dataset
* whose ds_bp will be rewritten when we do this 2nd sync.
*/
@@ -383,13 +338,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dsl_dir_sync(dd, tx);
write_time += gethrtime() - start;
- if (spa_sync_pass(dp->dp_spa) == 1) {
- dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL,
- NULL, ZIO_FLAG_CANFAIL);
- dsl_pool_scrub_sync(dp, tx);
- (void) zio_wait(dp->dp_scrub_prefetch_zio_root);
- }
-
start = gethrtime();
if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
@@ -407,7 +355,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dmu_tx_commit(tx);
- data_written = dp->dp_space_towrite[txg & TXG_MASK];
dp->dp_space_towrite[txg & TXG_MASK] = 0;
ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
@@ -679,7 +626,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
NULL, 0, kcred, tx);
VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+ dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
dp, &dp->dp_origin_snap));
dsl_dataset_rele(ds, FTAG);
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
index f27305c953..cedd777687 100644
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -260,11 +259,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
cbr->cbr_func(cbr->cbr_arg, value);
- VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
- NULL, cbr, &dd));
if (need_rwlock)
rw_exit(&dp->dp_config_rwlock);
- /* Leave dir open until this callback is unregistered */
return (0);
}
@@ -464,8 +460,6 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
- /* Clean up from dsl_prop_register */
- dsl_dir_close(dd, cbr);
return (0);
}
@@ -552,7 +546,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
}
void
-dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_prop_setarg_t *psa = arg2;
@@ -707,9 +701,9 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
}
- spa_history_internal_log((source == ZPROP_SRC_NONE ||
+ spa_history_log_internal((source == ZPROP_SRC_NONE ||
source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT :
- LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr,
+ LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx,
"%s=%s dataset = %llu", propname,
(valstr == NULL ? "" : valstr), ds->ds_object);
@@ -718,7 +712,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
}
void
-dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dsl_props_arg_t *pa = arg2;
@@ -756,13 +750,13 @@ dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
psa.psa_numints = 1;
psa.psa_value = &intval;
}
- dsl_prop_set_sync(ds, &psa, cr, tx);
+ dsl_prop_set_sync(ds, &psa, tx);
}
}
void
dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
- cred_t *cr, dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
@@ -773,7 +767,7 @@ dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);
- spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+ spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx,
"%s=%llu dataset = %llu", name, (u_longlong_t)val,
dd->dd_phys->dd_head_dataset_obj);
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
new file mode 100644
index 0000000000..f3b401d602
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -0,0 +1,1660 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+static scan_cb_t dsl_scan_defrag_cb;
+static scan_cb_t dsl_scan_scrub_cb;
+static scan_cb_t dsl_scan_remove_cb;
+static dsl_syncfunc_t dsl_scan_cancel_sync;
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+
+int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+
+#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+extern int zfs_txg_timeout;
+
+/* the order has to match pool_scan_func_t */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+ NULL,
+ dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
+ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
+};
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+ int err;
+ dsl_scan_t *scn;
+ spa_t *spa = dp->dp_spa;
+ uint64_t f;
+
+ scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+ scn->scn_dp = dp;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_func", sizeof (uint64_t), 1, &f);
+ if (err == 0) {
+ /*
+ * There was an old-style scrub in progress. Restart a
+ * new-style scrub from the beginning.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("old-style scrub was in progress; "
+ "restarting new-style scrub in txg %llu",
+ scn->scn_restart_txg);
+
+ /*
+ * Load the queue obj from the old location so that it
+ * can be freed by dsl_scan_done().
+ */
+ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_queue", sizeof (uint64_t), 1,
+ &scn->scn_phys.scn_queue_obj);
+ } else {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys);
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING &&
+ spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+ /*
+ * A new-type scrub was in progress on an old
+ * pool, and the pool was accessed by old
+ * software. Restart from the beginning, since
+ * the old software may have changed the pool in
+ * the meantime.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("new-style scrub was modified "
+ "by old software; restarting in txg %llu",
+ scn->scn_restart_txg);
+ }
+ }
+
+ spa_scan_stat_init(spa);
+ return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+ if (dp->dp_scan) {
+ kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+ dp->dp_scan = NULL;
+ }
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING)
+ return (EBUSY);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+ pool_scan_func_t *funcp = arg2;
+ dmu_object_type_t ot = 0;
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+ bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ scn->scn_phys.scn_func = *funcp;
+ scn->scn_phys.scn_state = DSS_SCANNING;
+ scn->scn_phys.scn_min_txg = 0;
+ scn->scn_phys.scn_max_txg = tx->tx_txg;
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+ scn->scn_phys.scn_start_time = gethrestime_sec();
+ scn->scn_phys.scn_errors = 0;
+ scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ scn->scn_restart_txg = 0;
+ spa_scan_stat_init(spa);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+ /* rewrite all disk labels */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ if (vdev_resilver_needed(spa->spa_root_vdev,
+ &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+ spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+ } else {
+ spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+ }
+
+ spa->spa_scrub_started = B_TRUE;
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+ }
+
+ /* back to the generic stuff */
+
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ }
+ bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
+ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+ ot = DMU_OT_ZAP_OTHER;
+
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+ ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+ dsl_scan_sync_state(scn, tx);
+
+ spa_history_log_internal(LOG_POOL_SCAN, spa, tx,
+ "func=%u mintxg=%llu maxtxg=%llu",
+ *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+ static const char *old_names[] = {
+ "scrub_bookmark",
+ "scrub_ddt_bookmark",
+ "scrub_ddt_class_max",
+ "scrub_queue",
+ "scrub_min_txg",
+ "scrub_max_txg",
+ "scrub_func",
+ "scrub_errors",
+ NULL
+ };
+
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ int i;
+
+ /* Remove any remnants of an old-style scrub. */
+ for (i = 0; old_names[i]; i++) {
+ (void) zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = 0;
+ }
+
+ /*
+ * If we were "restarted" from a stopped state, don't bother
+ * with anything else.
+ */
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (complete)
+ scn->scn_phys.scn_state = DSS_FINISHED;
+ else
+ scn->scn_phys.scn_state = DSS_CANCELED;
+
+ spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx,
+ "complete=%u", complete);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight > 0) {
+ cv_wait(&spa->spa_scrub_io_cv,
+ &spa->spa_scrub_lock);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+ spa->spa_scrub_started = B_FALSE;
+ spa->spa_scrub_active = B_FALSE;
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to
+ * reflect this. Whether it succeeded or not, vacate
+ * all temporary scrub DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+ if (complete) {
+ spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+ ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+ }
+ spa_errlog_rotate(spa);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ }
+
+ scn->scn_phys.scn_end_time = gethrestime_sec();
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return (ENOENT);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg1;
+
+ dsl_scan_done(scn, B_FALSE, tx);
+ dsl_scan_sync_state(scn, tx);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+ boolean_t complete = B_FALSE;
+ int err;
+
+ err = dsl_sync_task_do(dp, dsl_scan_cancel_check,
+ dsl_scan_cancel_sync, dp->dp_scan, &complete, 3);
+ return (err);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp,
+ const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
+ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+ dmu_tx_t *tx);
+static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+ zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+ ASSERT(dsl_pool_sync_context(dp));
+ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+}
+
+int
+dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ return (arc_read(pio, spa, bpp, pbuf, done, private,
+ priority, zio_flags, arc_flags, zb));
+}
+
+int
+dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb)
+{
+ return (arc_read_nolock(pio, spa, bpp, done, private,
+ priority, zio_flags, arc_flags, zb));
+}
+
+static boolean_t
+bookmark_is_zero(const zbookmark_t *zb)
+{
+ return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+ zb->zb_level == 0 && zb->zb_blkid == 0);
+}
+
+/* dnp is the dnode for zb1->zb_object */
+static boolean_t
+bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
+ const zbookmark_t *zb2)
+{
+ uint64_t zb1nextL0, zb2thisobj;
+
+ ASSERT(zb1->zb_objset == zb2->zb_objset);
+ ASSERT(zb2->zb_level == 0);
+
+ /*
+ * A bookmark in the deadlist is considered to be after
+ * everything else.
+ */
+ if (zb2->zb_object == DMU_DEADLIST_OBJECT)
+ return (B_TRUE);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
+ return (B_FALSE);
+
+ zb1nextL0 = (zb1->zb_blkid + 1) <<
+ ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+ zb2thisobj = zb2->zb_object ? zb2->zb_object :
+ zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+ if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+ uint64_t nextobj = zb1nextL0 *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+ return (nextobj <= zb2thisobj);
+ }
+
+ if (zb1->zb_object < zb2thisobj)
+ return (B_TRUE);
+ if (zb1->zb_object > zb2thisobj)
+ return (B_FALSE);
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+ return (B_FALSE);
+ return (zb1nextL0 <= zb2->zb_blkid);
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (dsl_dataset_is_snapshot(ds))
+ return (MIN(smt, ds->ds_phys->ds_creation_txg));
+ return (smt);
+}
+
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys, tx));
+}
+
+static boolean_t
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+{
+ uint64_t elapsed_nanosecs;
+ int mintime;
+
+ /* we never pause on user/group accounting objects (obj<0) */
+ if (zb && (int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ if (scn->scn_pausing)
+ return (B_TRUE); /* we're already pausing */
+
+ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+ return (B_FALSE); /* we're resuming */
+
+ /* We only know how to resume from level-0 blocks. */
+ if (zb && zb->zb_level != 0)
+ return (B_FALSE);
+
+ mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (elapsed_nanosecs / MICROSEC > mintime &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa)) {
+ if (zb) {
+ dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ scn->scn_phys.scn_bookmark = *zb;
+ }
+ dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+ scn->scn_pausing = B_TRUE;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+typedef struct zil_scan_arg {
+ dsl_pool_t *zsa_dp;
+ zil_header_t *zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * One block ("stubby") can be allocated a long time ago; we
+ * want to visit that one because it has been allocated
+ * (on-disk) even if it hasn't been claimed (even though for
+ * scrub there's nothing to do to it).
+ */
+ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_t zb;
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * birth can be < claim_txg if this record's txg is
+ * already txg sync'ed (but this log block contains
+ * other records that are not synced)
+ */
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ }
+ return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zil_scan_arg_t zsa = { dp, zh };
+ zilog_t *zilog;
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+ return;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+ claim_txg);
+
+ zil_free(zilog);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+ uint64_t objset, uint64_t object, uint64_t blkid)
+{
+ zbookmark_t czb;
+ uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+
+ if (zfs_no_scrub_prefetch)
+ return;
+
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+ return;
+
+ SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+ /*
+ * XXX need to make sure all of these arc_read() prefetches are
+ * done before setting xlateall (similar to dsl_read())
+ */
+ (void) arc_read(scn->scn_prefetch_zio_root, scn->scn_dp->dp_spa, bp,
+ buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+ &flags, &czb);
+}
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+ const zbookmark_t *zb)
+{
+ /*
+ * We never skip over user/group accounting objects (obj<0)
+ */
+ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+ (int64_t)zb->zb_object >= 0) {
+ /*
+ * If we already visited this bp & everything below (in
+ * a prior txg sync), don't bother doing it again.
+ */
+ if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ return (B_TRUE);
+
+ /*
+ * If we found the block we're trying to resume from, or
+ * we went past it to a different object, zero it out to
+ * indicate that it's OK to start checking for pausing
+ * again.
+ */
+ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Read the block that bp points to, when it is a kind of block that has
+ * children to scan, and visit those children:
+ * - level > 0 (indirect block): prefetch, then visit, each child blkptr;
+ * - DMU_OT_USERGROUP_USED: the block is read, nothing is visited;
+ * - DMU_OT_DNODE: prefetch every dnode's blkptrs, then visit each dnode;
+ * - DMU_OT_OBJSET: scan the ZIL for scrub/resilver, then visit the meta
+ * dnode and, when present, the group/user accounting dnodes.
+ * For any other block no read is issued and *bufp is left untouched.
+ *
+ * Return nonzero on i/o error (scn_phys.scn_errors is incremented first).
+ * Return new buf to write out in *bufp.
+ */
+static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ int err;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_WAIT;
+ int i;
+ blkptr_t *cbp;
+ /* number of child block pointers iterated over below */
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ /* Pass 1: issue a prefetch for every child before visiting any. */
+ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+ zb->zb_object, zb->zb_blkid * epb + i);
+ }
+ /* Pass 2: visit each child, one level down from this block. */
+ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
+ zbookmark_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ dsl_scan_visitbp(cbp, &czb, dnp,
+ *bufp, ds, scn, ostype, tx);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
+ uint32_t flags = ARC_WAIT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ uint32_t flags = ARC_WAIT;
+ dnode_phys_t *cdnp;
+ int i, j;
+ /* number of dnode_phys_t entries iterated over below */
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ /*
+ * Prefetch every blkptr of every dnode in this block; the
+ * dnode's object number is zb_blkid * epb + i and j is the
+ * blkid passed for its j-th blkptr.
+ */
+ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ for (j = 0; j < cdnp->dn_nblkptr; j++) {
+ blkptr_t *cbp = &cdnp->dn_blkptr[j];
+ dsl_scan_prefetch(scn, *bufp, cbp,
+ zb->zb_objset, zb->zb_blkid * epb + i, j);
+ }
+ }
+ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ dsl_scan_visitdnode(scn, ds, ostype,
+ cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+ }
+
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t flags = ARC_WAIT;
+ objset_phys_t *osp;
+
+ err = arc_read_nolock(NULL, dp->dp_spa, bp,
+ arc_getbuf_func, bufp,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+
+ osp = (*bufp)->b_data;
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn))
+ dsl_scan_zil(dp, &osp->os_zil_header);
+
+ /* From here on the objset's own os_type is used, not ostype. */
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+
+ if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+ /*
+ * We also always visit user/group accounting
+ * objects, and never skip them, even if we are
+ * pausing. This is necessary so that the space
+ * deltas from this txg get integrated.
+ */
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_groupused_dnode, *bufp,
+ DMU_GROUPUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_userused_dnode, *bufp,
+ DMU_USERUSED_OBJECT, tx);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Visit every block pointer that hangs off one dnode: each of its
+ * dn_nblkptr top-level pointers (bookmarked at level dn_nlevels - 1) and,
+ * when DNODE_FLAG_SPILL_BLKPTR is set, its spill pointer (bookmarked at
+ * level 0, blkid DMU_SPILL_BLKID).  The bookmark's objset is ds's object
+ * number, or 0 when ds is NULL.
+ */
+static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+    dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+    uint64_t object, dmu_tx_t *tx)
+{
+	uint64_t dsobj = (ds != NULL) ? ds->ds_object : 0;
+	int idx;
+
+	for (idx = 0; idx < dnp->dn_nblkptr; idx++) {
+		zbookmark_t child_zb;
+
+		SET_BOOKMARK(&child_zb, dsobj, object,
+		    dnp->dn_nlevels - 1, idx);
+		dsl_scan_visitbp(&dnp->dn_blkptr[idx],
+		    &child_zb, dnp, buf, ds, scn, ostype, tx);
+	}
+
+	/* Nothing more to do unless the dnode carries a spill block. */
+	if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
+		return;
+
+	{
+		zbookmark_t spill_zb;
+
+		SET_BOOKMARK(&spill_zb, dsobj, object,
+		    0, DMU_SPILL_BLKID);
+		dsl_scan_visitbp(&dnp->dn_spill,
+		    &spill_zb, dnp, buf, ds, scn, ostype, tx);
+	}
+}
+
+/*
+ * Visit one block pointer: apply the pause, resume and birth-txg filters,
+ * recurse into the block's children through dsl_scan_recurse(), and then,
+ * unless ddt_class_contains() says the DDT pass covers this block, hand bp
+ * to the active scan function (scan_funcs[scn_phys.scn_func]) when it was
+ * born no later than scn_cur_max_txg.
+ *
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
+ dnode_phys_t *dnp, arc_buf_t *pbuf,
+ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ arc_buf_t *buf = NULL;
+ blkptr_t bp_toread = *bp;
+
+ /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+
+ if (dsl_scan_check_pause(scn, zb))
+ return;
+
+ if (dsl_scan_check_resume(scn, dnp, zb))
+ return;
+
+ if (bp->blk_birth == 0)
+ return;
+
+ /* Counted even if the block is filtered out by birth txg below. */
+ scn->scn_visited_this_txg++;
+
+ dprintf_bp(bp,
+ "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+ ds, ds ? ds->ds_object : 0,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ pbuf, bp);
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return;
+
+ if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
+ /*
+ * For non-user-accounting blocks, we need to read the
+ * new bp (from a deleted snapshot, found in
+ * check_existing_xlation). If we used the old bp,
+ * pointers inside this block from before we resumed
+ * would be untranslated.
+ *
+ * For user-accounting blocks, we need to read the old
+ * bp, because we will apply the entire space delta to
+ * it (original untranslated -> translations from
+ * deleted snap -> now).
+ */
+ /*
+ * NOTE(review): bp_toread was already initialized to *bp
+ * above, so this assignment currently changes nothing.
+ */
+ bp_toread = *bp;
+ }
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+ &buf) != 0)
+ return;
+
+ /*
+ * If dsl_scan_ddt() has already visited this block, it will have
+ * already done any translations or scrubbing, so don't call the
+ * callback again.
+ */
+ if (ddt_class_contains(dp->dp_spa,
+ scn->scn_phys.scn_ddt_class_max, bp)) {
+ /*
+ * NOTE(review): this return does not release buf; the ASSERT
+ * documents the expectation that dsl_scan_recurse() read
+ * nothing for such blocks -- confirm for non-DEBUG builds.
+ */
+ ASSERT(buf == NULL);
+ return;
+ }
+
+ /*
+ * If this block is from the future (after cur_max_txg), then we
+ * are doing this on behalf of a deleted snapshot, and we will
+ * revisit the future block on the next pass of this dataset.
+ * Don't scan it now unless we need to because something
+ * under it was modified.
+ */
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+ }
+ /* Drop the reference on the buffer dsl_scan_recurse() handed back. */
+ if (buf)
+ (void) arc_buf_remove_ref(buf, &buf);
+}
+
+/*
+ * Start a traversal at a root block pointer.  The root bookmark uses ds's
+ * object number as its objset, or DMU_META_OBJSET when ds is NULL, together
+ * with the ZB_ROOT_* object/level/blkid values.  No dnode or parent buffer
+ * is passed down, and the objset type is given as DMU_OST_NONE.
+ */
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+    dmu_tx_t *tx)
+{
+	zbookmark_t root_zb;
+	uint64_t root_objset;
+
+	if (ds != NULL)
+		root_objset = ds->ds_object;
+	else
+		root_objset = DMU_META_OBJSET;
+
+	SET_BOOKMARK(&root_zb, root_objset,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+	dsl_scan_visitbp(bp, &root_zb, NULL, NULL,
+	    ds, scn, DMU_OST_NONE, tx);
+
+	dprintf_ds(ds, "finished scan%s", "");
+}
+
+/*
+ * Fix up scan state that refers to dataset ds, which is being destroyed.
+ * Does nothing unless the scan state is DSS_SCANNING.
+ *
+ * - If the resume bookmark is inside ds: for a snapshot the bookmark is
+ * retargeted to ds_next_snap_obj and DSF_VISIT_DS_AGAIN is set; otherwise
+ * the bookmark is set to ZB_DESTROYED_OBJSET.
+ * - Else, if ds is in the scan queue ZAP: its entry is removed and, for a
+ * snapshot, ds_next_snap_obj is queued in its place with the same mintxg.
+ * - Otherwise the scan does not reference ds and nothing is changed.
+ *
+ * The scan state is then written out with dsl_scan_sync_state().
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ if (dsl_dataset_is_snapshot(ds)) {
+ /* Note, scn_cur_{min,max}_txg stays the same. */
+ scn->scn_phys.scn_bookmark.zb_objset =
+ ds->ds_phys->ds_next_snap_obj;
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+ } else {
+ SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset bookmark to -1,0,0,0",
+ (u_longlong_t)ds->ds_object);
+ }
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ if (dsl_dataset_is_snapshot(ds)) {
+ /*
+ * We keep the same mintxg; it could be >
+ * ds_creation_txg if the previous snapshot was
+ * deleted too.
+ */
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("destroying ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ } else {
+ zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ (u_longlong_t)ds->ds_object);
+ }
+ } else {
+ zfs_dbgmsg("destroying ds %llu; ignoring",
+ (u_longlong_t)ds->ds_object);
+ }
+
+ /*
+ * dsl_scan_sync() should be called after this, and should sync
+ * out our changed state, but just to be safe, do it here.
+ */
+ dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * Fix up scan state that refers to dataset ds, which has just been
+ * snapshotted (ds_prev_snap_obj is asserted to be nonzero).  Does nothing
+ * unless the scan state is DSS_SCANNING.
+ *
+ * A resume bookmark inside ds is retargeted to ds_prev_snap_obj; otherwise,
+ * if ds is in the scan queue ZAP, its entry is replaced by one for
+ * ds_prev_snap_obj carrying the same mintxg.  The scan state is then
+ * written out with dsl_scan_sync_state().
+ */
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset =
+ ds->ds_phys->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("snapshotting ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ }
+ dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * Fix up scan state after datasets ds1 and ds2 have been clone-swapped.
+ * Does nothing unless the scan state is DSS_SCANNING.
+ *
+ * A resume bookmark that names one of the two datasets is switched to the
+ * other.  Independently, a scan queue entry for one dataset is moved to the
+ * other, keeping its mintxg; when both were queued, both stay queued.  The
+ * scan state is then written out with dsl_scan_sync_state().
+ */
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+ scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds1->ds_object, &mintxg) == 0) {
+ int err;
+
+ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+ err = zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+ VERIFY(err == 0 || err == EEXIST);
+ if (err == EEXIST) {
+ /* Both were there to begin with */
+ /* so put back the ds1 entry that was just removed. */
+ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ ds1->ds_object, mintxg, tx));
+ }
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * Argument bundle handed to enqueue_clones_cb() through
+ * dmu_objset_find_spa().
+ */
+struct enqueue_clones_arg {
+ dmu_tx_t *tx; /* tx passed to zap_add_int_key() when queueing */
+ uint64_t originobj; /* object number matched against dd_origin_obj */
+};
+
+/*
+ * dmu_objset_find_spa() callback used by dsl_scan_visitds().  For a dataset
+ * whose directory's dd_origin_obj equals eca->originobj, walk back along
+ * ds_prev_snap_obj until reaching the dataset whose previous snapshot is
+ * the origin, and add that dataset to the scan queue keyed with its
+ * ds_prev_snap_txg.  Other datasets are ignored.
+ *
+ * Returns the error from dsl_dataset_hold_obj() if a hold fails, else 0.
+ */
+/* ARGSUSED */
+static int
+enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ struct enqueue_clones_arg *eca = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+ while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+
+ /* ds is released whether or not the hold on prev worked. */
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ ds = prev;
+ }
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object,
+ ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+/*
+ * Scan one dataset (object number dsobj) starting from its root block
+ * pointer, then decide what to queue next.
+ *
+ * If the traversal paused (scn_pausing), nothing is queued.  If
+ * DSF_VISIT_DS_AGAIN was set, the flag is cleared and this dataset is
+ * queued again keyed with scn_cur_max_txg.  Otherwise the next snapshot
+ * (ds_next_snap_obj) and, when ds_num_children > 1, the dataset's clones
+ * are added to the scan queue keyed with ds_creation_txg or, for clones
+ * found by enqueue_clones_cb(), their ds_prev_snap_txg.
+ */
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ dsl_dataset_t *ds;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+
+ /* The dataset name is only needed for the debug message below. */
+ char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ "pausing=%u",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ (int)scn->scn_pausing);
+ kmem_free(dsname, ZFS_MAXNAMELEN);
+
+ if (scn->scn_pausing)
+ goto out;
+
+ /*
+ * We've finished this pass over this dataset.
+ */
+
+ /*
+ * If we did not completely visit this dataset, do another pass.
+ */
+ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ zfs_dbgmsg("incomplete pass; visiting again");
+ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg, tx) == 0);
+ goto out;
+ }
+
+ /*
+ * Add descendent datasets to work queue.
+ */
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
+ ds->ds_phys->ds_creation_txg, tx) == 0);
+ }
+ if (ds->ds_phys->ds_num_children > 1) {
+ boolean_t usenext = B_FALSE;
+ if (ds->ds_phys->ds_next_clones_obj != 0) {
+ uint64_t count;
+ /*
+ * A bug in a previous version of the code could
+ * cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a
+ * missing entry. Therefore we can only use the
+ * next_clones_obj when its count is correct.
+ */
+ int err = zap_count(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj, &count);
+ if (err == 0 &&
+ count == ds->ds_phys->ds_num_children - 1)
+ usenext = B_TRUE;
+ }
+
+ if (usenext) {
+ VERIFY(zap_join_key(dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj,
+ scn->scn_phys.scn_queue_obj,
+ ds->ds_phys->ds_creation_txg, tx) == 0);
+ } else {
+ /*
+ * Fall back to searching every dataset for clones of
+ * this one; the search's return value is ignored.
+ */
+ struct enqueue_clones_arg eca;
+ eca.tx = tx;
+ eca.originobj = ds->ds_object;
+
+ (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+ NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+ }
+ }
+
+out:
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * dmu_objset_find_spa() callback used by dsl_scan_visit() for pools older
+ * than SPA_VERSION_DSL_SCRUB; arg is the dmu_tx_t to use.  Starting at
+ * dsobj, follow ds_prev_snap_obj back to the dataset that has no previous
+ * snapshot and add it to the scan queue keyed with its ds_prev_snap_txg.
+ * If a previous snapshot's ds_next_snap_obj does not point back at the
+ * current dataset (i.e. the current dataset is a clone), nothing is queued.
+ *
+ * Returns the error from dsl_dataset_hold_obj() if a hold fails, else 0.
+ */
+/* ARGSUSED */
+static int
+enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (ds->ds_phys->ds_prev_snap_obj != 0) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ /*
+ * If this is a clone, we don't need to worry about it for now.
+ */
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+ }
+ /* Step back one snapshot, carrying the hold over to prev. */
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ }
+
+ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e., DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ *
+ * dsl_scan_ddt() itself walks the DDT from the saved position
+ * scn_phys.scn_ddt_bookmark, passing each entry to dsl_scan_ddt_entry(),
+ * and stops when the walk moves past class scn_ddt_class_max, when
+ * ddt_walk() returns nonzero, or when dsl_scan_check_pause() asks to pause.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ ddt_entry_t dde = { 0 };
+ int error;
+ uint64_t n = 0;
+
+ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ ddt_t *ddt;
+
+ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ break;
+ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+
+ /* There should be no pending changes to the dedup table */
+ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ n++;
+
+ /* A NULL bookmark is passed: the DDT position lives in ddb. */
+ if (dsl_scan_check_pause(scn, NULL))
+ break;
+ }
+
+ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+ (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+ (int)scn->scn_pausing);
+
+ /* ENOENT is only expected once the walk has left the last class. */
+ ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error != ENOENT ||
+ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+/*
+ * Run the active scan function over one dedup-table entry.  Does nothing
+ * unless the scan state is DSS_SCANNING.  For each of the entry's
+ * DDT_PHYS_TYPES physical variants whose ddp_phys_birth is nonzero and not
+ * later than scn_cur_max_txg, a block pointer is built with
+ * ddt_bp_create() and passed to scan_funcs[scn_phys.scn_func] together
+ * with an all-zero bookmark; scn_visited_this_txg is bumped for each.
+ */
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ const ddt_key_t *ddk = &dde->dde_key;
+ ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t bp;
+ zbookmark_t zb = { 0 };
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &bp);
+
+ scn->scn_visited_this_txg++;
+ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ }
+}
+
+/*
+ * Do one txg's worth of scanning, in this order, returning as soon as any
+ * step sets scn_pausing:
+ * 1. the DDT walk, while the DDT bookmark's class is still within
+ * scn_ddt_class_max;
+ * 2. either the MOS and the origin snapshot (when the resume bookmark's
+ * objset is DMU_META_OBJSET), or the dataset named by the resume
+ * bookmark (unless it is ZB_DESTROYED_OBJSET);
+ * 3. every dataset in the scan queue ZAP, each removed from the queue
+ * before it is visited.
+ */
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_ddt(scn, tx);
+ if (scn->scn_pausing)
+ return;
+ }
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ /* First do the MOS & ORIGIN */
+
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_visit_rootbp(scn, NULL,
+ &dp->dp_meta_rootbp, tx);
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ if (scn->scn_pausing)
+ return;
+
+ /*
+ * Older pools queue datasets by searching all of them with
+ * enqueue_cb(); newer pools start from the origin snapshot.
+ */
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+ NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+ } else {
+ dsl_scan_visitds(scn,
+ dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!scn->scn_pausing);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ ZB_DESTROYED_OBJSET) {
+ /*
+ * If we were paused, continue from here. Note if the
+ * ds we were paused on was deleted, the zb_objset may
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+ if (scn->scn_pausing)
+ return;
+ }
+
+ /*
+ * In case we were paused right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+
+ /* keep pulling things out of the zap-object-as-queue */
+ /*
+ * The cursor is re-initialized on every iteration (comma operator in
+ * the loop condition) because each pass removes the entry it just
+ * retrieved; it is finalized before looping again or returning.
+ */
+ while (zap_cursor_init(&zc, dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj),
+ zap_cursor_retrieve(&zc, &za) == 0) {
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+
+ dsobj = strtonum(za.za_name, NULL);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, dsobj, tx));
+
+ /* Set up min/max txg */
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ /* A zero queue value means "use the dataset's ds_prev_snap_txg". */
+ if (za.za_first_integer != 0) {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ za.za_first_integer);
+ } else {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ ds->ds_phys->ds_prev_snap_txg);
+ }
+ scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+ dsl_dataset_rele(ds, FTAG);
+
+ dsl_scan_visitds(scn, dsobj, tx);
+ zap_cursor_fini(&zc);
+ if (scn->scn_pausing)
+ return;
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Per-txg entry point for the scan, called with the syncing transaction.
+ *
+ * First, if a restart was requested for this txg or an earlier one
+ * (scn_restart_txg), the current scan is ended with dsl_scan_done() and a
+ * new one is set up: a resilver when vdev_resilver_needed() says so,
+ * otherwise a scrub.
+ *
+ * Nothing further happens while the pool is loading or shutting down, on
+ * sync passes after the first, or when the scan state is not DSS_SCANNING.
+ * Otherwise one round of dsl_scan_visit() is run under a root zio that
+ * collects the prefetch reads, the scan is marked done if the round did
+ * not pause, outstanding scrub/resilver i/o is waited for, and the scan
+ * state is written out with dsl_scan_sync_state().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+	spa_t *spa = dp->dp_spa;
+
+	/*
+	 * Check for scn_restart_txg before checking spa_load_state, so
+	 * that we can restart an old-style scan while the pool is being
+	 * imported (see dsl_scan_init).
+	 */
+	if (scn->scn_restart_txg != 0 &&
+	    scn->scn_restart_txg <= tx->tx_txg) {
+		pool_scan_func_t func = POOL_SCAN_SCRUB;
+		dsl_scan_done(scn, B_FALSE, tx);
+		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+			func = POOL_SCAN_RESILVER;
+		/* Cast the txg like every other %llu argument in this file. */
+		zfs_dbgmsg("restarting scan func=%u txg=%llu",
+		    func, (u_longlong_t)tx->tx_txg);
+		dsl_scan_setup_sync(scn, &func, tx);
+	}
+
+	if (spa->spa_load_state != SPA_LOAD_NONE ||
+	    spa_shutting_down(spa) ||
+	    spa_sync_pass(spa) > 1 ||
+	    scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	scn->scn_visited_this_txg = 0;
+	scn->scn_pausing = B_FALSE;
+	scn->scn_sync_start_time = gethrtime();
+	spa->spa_scrub_active = B_TRUE;
+
+	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+	    scn->scn_phys.scn_ddt_class_max) {
+		zfs_dbgmsg("doing scan sync txg %llu; "
+		    "ddt bm=%llu/%llu/%llu/%llx",
+		    (longlong_t)tx->tx_txg,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+		/* While still in the DDT phase the block bookmark is unset. */
+		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+	} else {
+		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+		    (longlong_t)tx->tx_txg,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+	}
+
+	/*
+	 * Prefetch reads issued during the visit hang off this root zio,
+	 * so waiting on it afterwards waits for all of them.
+	 */
+	scn->scn_prefetch_zio_root = zio_root(spa, NULL,
+	    NULL, ZIO_FLAG_CANFAIL);
+	dsl_scan_visit(scn, tx);
+	(void) zio_wait(scn->scn_prefetch_zio_root);
+	scn->scn_prefetch_zio_root = NULL;
+
+	zfs_dbgmsg("visited %llu blocks in %llums",
+	    (longlong_t)scn->scn_visited_this_txg,
+	    (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+
+	if (!scn->scn_pausing) {
+		/* finished with scan. */
+		zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
+		dsl_scan_done(scn, B_TRUE, tx);
+	}
+
+	/* Let every scrub/resilver read issued this txg complete. */
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > 0) {
+			cv_wait(&spa->spa_scrub_io_cv,
+			    &spa->spa_scrub_lock);
+		}
+		mutex_exit(&spa->spa_scrub_lock);
+	}
+
+	dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * This will start a new scan, or restart an existing one.
+ *
+ * The request is recorded in scn_restart_txg and acted on by
+ * dsl_scan_sync() once that txg is reached.  When txg is 0 the restart is
+ * pinned to the txg of a freshly assigned transaction, and the field is
+ * set before that transaction is committed; otherwise the caller's txg is
+ * used as given.
+ */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+	if (txg == 0) {
+		dmu_tx_t *tx;
+		tx = dmu_tx_create_dd(dp->dp_mos_dir);
+		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+		txg = dmu_tx_get_txg(tx);
+		dp->dp_scan->scn_restart_txg = txg;
+		dmu_tx_commit(tx);
+	} else {
+		dp->dp_scan->scn_restart_txg = txg;
+	}
+	/* Cast the txg like every other %llu argument in this file. */
+	zfs_dbgmsg("restarting resilver txg=%llu", (u_longlong_t)txg);
+}
+
+/*
+ * Report whether the pool currently has a resilver in progress, i.e. its
+ * scan state is DSS_SCANNING and its scan function is POOL_SCAN_RESILVER.
+ */
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return (B_FALSE);
+	if (scn->scn_phys.scn_func != POOL_SCAN_RESILVER)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+/*
+ * scrub consumers
+ */
+
+/*
+ * Fold one block pointer into the block statistics table.  The block is
+ * counted four times: under its own level and under DN_MAX_LEVELS, each
+ * both for DMU_OT_TOTAL and for its own type.  For blocks with two or
+ * three DVAs the "same vdev" ditto counters are updated as well.
+ */
+static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+	int pass;
+
+	/*
+	 * If we resume after a reboot, zab will be NULL; don't record
+	 * incomplete stats in that case.
+	 */
+	if (zab == NULL)
+		return;
+
+	for (pass = 0; pass < 4; pass++) {
+		zfs_blkstat_t *stat;
+		int level, type, ndvas;
+
+		/* Passes 0-1 use the real level, passes 2-3 the summary row. */
+		if (pass < 2)
+			level = BP_GET_LEVEL(bp);
+		else
+			level = DN_MAX_LEVELS;
+		/* Odd passes use the real type, even passes the total column. */
+		if (pass & 1)
+			type = BP_GET_TYPE(bp);
+		else
+			type = DMU_OT_TOTAL;
+		stat = &zab->zab_type[level][type];
+
+		stat->zb_count++;
+		stat->zb_asize += BP_GET_ASIZE(bp);
+		stat->zb_lsize += BP_GET_LSIZE(bp);
+		stat->zb_psize += BP_GET_PSIZE(bp);
+		stat->zb_gangs += BP_COUNT_GANG(bp);
+
+		ndvas = BP_GET_NDVAS(bp);
+		if (ndvas == 2) {
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1]))
+				stat->zb_ditto_2_of_2_samevdev++;
+		} else if (ndvas == 3) {
+			int same_pairs = 0;
+
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1]))
+				same_pairs++;
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]))
+				same_pairs++;
+			if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]))
+				same_pairs++;
+
+			/* One equal pair: two copies share a vdev; three: all do. */
+			if (same_pairs == 1)
+				stat->zb_ditto_2_of_3_samevdev++;
+			else if (same_pairs == 3)
+				stat->zb_ditto_3_of_3_samevdev++;
+		}
+	}
+}
+
+/*
+ * Completion callback for the reads issued by dsl_scan_scrub_cb().  Frees
+ * the data buffer, then, under spa_scrub_lock, drops the in-flight count,
+ * wakes any waiters on spa_scrub_io_cv and counts the i/o as a scan error
+ * unless it succeeded or it was a checksum error on a speculative read.
+ */
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	boolean_t tolerated;
+
+	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	/* ECKSUM on a ZIO_FLAG_SPECULATIVE read is not counted. */
+	tolerated = (zio->io_error == ECKSUM &&
+	    (zio->io_flags & ZIO_FLAG_SPECULATIVE) != 0);
+	if (zio->io_error != 0 && !tolerated)
+		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Scan function for scrub and resilver: decide whether block bp needs to
+ * be read and, if so, issue an asynchronous raw read for it.
+ *
+ * Blocks whose physical birth txg is outside (scn_min_txg, scn_max_txg)
+ * are ignored.  A scrub reads every remaining block.  A resilver reads a
+ * block only if one of its DVAs is a gang block or lies on a top-level
+ * vdev whose DTL_PARTIAL contains the block's physical birth txg.  The
+ * number of reads in flight is capped at spa_scrub_maxinflight; this
+ * function blocks until a slot is free.  Always returns 0.
+ */
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+    const blkptr_t *bp, const zbookmark_t *zb)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+	size_t size = BP_GET_PSIZE(bp);
+	spa_t *spa = dp->dp_spa;
+	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+	boolean_t needs_io;
+	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+	int zio_priority;
+
+	if (phys_birth <= scn->scn_phys.scn_min_txg ||
+	    phys_birth >= scn->scn_phys.scn_max_txg)
+		return (0);
+
+	count_block(dp->dp_blkstats, bp);
+
+	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+		zio_flags |= ZIO_FLAG_SCRUB;
+		zio_priority = ZIO_PRIORITY_SCRUB;
+		needs_io = B_TRUE;
+	} else {
+		/*
+		 * A plain else (rather than a second else-if) guarantees
+		 * that needs_io and zio_priority are always initialized,
+		 * even on non-DEBUG builds where the ASSERTs compile away.
+		 */
+		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
+		zio_flags |= ZIO_FLAG_RESILVER;
+		zio_priority = ZIO_PRIORITY_RESILVER;
+		needs_io = B_FALSE;
+	}
+
+	/* If it's an intent log block, failure is expected. */
+	if (zb->zb_level == ZB_ZIL_LEVEL)
+		zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+		vdev_t *vd = vdev_lookup_top(spa,
+		    DVA_GET_VDEV(&bp->blk_dva[d]));
+
+		/*
+		 * Keep track of how much data we've examined so that
+		 * zpool(1M) status can make useful progress reports.
+		 */
+		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+
+		/* if it's a resilver, this may not be in the target range */
+		if (!needs_io) {
+			if (DVA_GET_GANG(&bp->blk_dva[d])) {
+				/*
+				 * Gang members may be spread across multiple
+				 * vdevs, so the best estimate we have is the
+				 * scrub range, which has already been checked.
+				 * XXX -- it would be better to change our
+				 * allocation policy to ensure that all
+				 * gang members reside on the same vdev.
+				 */
+				needs_io = B_TRUE;
+			} else {
+				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+				    phys_birth, 1);
+			}
+		}
+	}
+
+	if (needs_io && !zfs_no_scrub_io) {
+		void *data = zio_data_buf_alloc(size);
+
+		/* Throttle: wait for a free slot before issuing the read. */
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_scrub_inflight++;
+		mutex_exit(&spa->spa_scrub_lock);
+
+		/* dsl_scan_scrub_done() frees data and releases the slot. */
+		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		    dsl_scan_scrub_done, NULL, zio_priority,
+		    zio_flags, zb));
+	}
+
+	/* do not relocate this block */
+	return (0);
+}
+
+/*
+ * Request a scan of type func on pool dp.  All vdevs are reopened first,
+ * then the scan is set up through a sync task (dsl_scan_setup_check /
+ * dsl_scan_setup_sync); the sync task's return value is returned.
+ */
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+	spa_t *spa = dp->dp_spa;
+	int err;
+
+	/*
+	 * Purge all vdev caches and probe all devices.  We do this here
+	 * rather than in sync context because this requires a writer lock
+	 * on the spa_config lock, which we can't do from sync context.  The
+	 * spa_scrub_reopen flag indicates that vdev_open() should not
+	 * attempt to start another scrub.
+	 */
+	spa_vdev_state_enter(spa, SCL_NONE);
+	spa->spa_scrub_reopen = B_TRUE;
+	vdev_reopen(spa->spa_root_vdev);
+	spa->spa_scrub_reopen = B_FALSE;
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+
+	err = dsl_sync_task_do(dp, dsl_scan_setup_check,
+	    dsl_scan_setup_sync, dp->dp_scan, &func, 0);
+	return (err);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c
deleted file mode 100644
index b16ff66586..0000000000
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c
+++ /dev/null
@@ -1,1214 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- */
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zil_impl.h>
-#include <sys/zio_checksum.h>
-#include <sys/ddt.h>
-#include <sys/sa.h>
-#include <sys/sa_impl.h>
-
-typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
-
-static scrub_cb_t dsl_pool_scrub_clean_cb;
-static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
-static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
- uint64_t objset, uint64_t object);
-
-int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
-int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
-boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
-enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
-
-extern int zfs_txg_timeout;
-
-static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
- NULL,
- dsl_pool_scrub_clean_cb
-};
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = arg1;
- enum scrub_func *funcp = arg2;
- dmu_object_type_t ot = 0;
- boolean_t complete = B_FALSE;
-
- dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
-
- ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
- ASSERT(*funcp > SCRUB_FUNC_NONE);
- ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
-
- dp->dp_scrub_min_txg = 0;
- dp->dp_scrub_max_txg = tx->tx_txg;
- dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;
-
- if (*funcp == SCRUB_FUNC_CLEAN) {
- vdev_t *rvd = dp->dp_spa->spa_root_vdev;
-
- /* rewrite all disk labels */
- vdev_config_dirty(rvd);
-
- if (vdev_resilver_needed(rvd,
- &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
- spa_event_notify(dp->dp_spa, NULL,
- ESC_ZFS_RESILVER_START);
- dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
- tx->tx_txg);
- } else {
- spa_event_notify(dp->dp_spa, NULL,
- ESC_ZFS_SCRUB_START);
- }
-
- /* zero out the scrub stats in all vdev_stat_t's */
- vdev_scrub_stat_update(rvd,
- dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
- POOL_SCRUB_EVERYTHING, B_FALSE);
-
- /*
- * If this is an incremental scrub, limit the DDT scrub phase
- * to just the auto-ditto class (for correctness); the rest
- * of the scrub should go faster using top-down pruning.
- */
- if (dp->dp_scrub_min_txg > TXG_INITIAL)
- dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;
-
- dp->dp_spa->spa_scrub_started = B_TRUE;
- }
-
- /* back to the generic stuff */
-
- if (dp->dp_blkstats == NULL) {
- dp->dp_blkstats =
- kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
- }
- bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
-
- if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
- ot = DMU_OT_ZAP_OTHER;
-
- dp->dp_scrub_func = *funcp;
- dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
- ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
- bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
- dp->dp_scrub_restart = B_FALSE;
- dp->dp_spa->spa_scrub_errors = 0;
-
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
- &dp->dp_scrub_func, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
- &dp->dp_scrub_queue_obj, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_min_txg, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
- &dp->dp_scrub_max_txg, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
- sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
- &dp->dp_scrub_bookmark, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
- sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
- &dp->dp_scrub_ddt_bookmark, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
- &dp->dp_scrub_ddt_class_max, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &dp->dp_spa->spa_scrub_errors, tx));
-
- spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
- "func=%u mintxg=%llu maxtxg=%llu",
- *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
-}
-
-int
-dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
-{
- return (dsl_sync_task_do(dp, NULL,
- dsl_pool_scrub_setup_sync, dp, &func, 0));
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = arg1;
- boolean_t *completep = arg2;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- mutex_enter(&dp->dp_scrub_cancel_lock);
-
- if (dp->dp_scrub_restart) {
- dp->dp_scrub_restart = B_FALSE;
- *completep = B_FALSE;
- }
-
- /* XXX this is scrub-clean specific */
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- while (dp->dp_spa->spa_scrub_inflight > 0) {
- cv_wait(&dp->dp_spa->spa_scrub_io_cv,
- &dp->dp_spa->spa_scrub_lock);
- }
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_active = B_FALSE;
-
- dp->dp_scrub_func = SCRUB_FUNC_NONE;
- VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, tx));
- dp->dp_scrub_queue_obj = 0;
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
- bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
-
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_QUEUE, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MIN_TXG, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_MAX_TXG, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_FUNC, tx));
- VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, tx));
-
- (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
- (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);
-
- spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
- "complete=%u", *completep);
-
- /* below is scrub-clean specific */
- vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
- *completep);
- /*
- * If the scrub/resilver completed, update all DTLs to reflect this.
- * Whether it succeeded or not, vacate all temporary scrub DTLs.
- */
- vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
- *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
- dp->dp_spa->spa_scrub_started = B_FALSE;
- if (*completep)
- spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
- ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
- spa_errlog_rotate(dp->dp_spa);
-
- /*
- * We may have finished replacing a device.
- * Let the async thread assess this and handle the detach.
- */
- spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
-
- dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
- mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-int
-dsl_pool_scrub_cancel(dsl_pool_t *dp)
-{
- boolean_t complete = B_FALSE;
-
- return (dsl_sync_task_do(dp, NULL,
- dsl_pool_scrub_cancel_sync, dp, &complete, 3));
-}
-
-void
-dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
-{
- /*
- * This function will be used by bp-rewrite wad to intercept frees.
- */
- zio_free(dp->dp_spa, txg, bpp);
-}
-
-void
-dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
-{
- ASSERT(dsl_pool_sync_context(dp));
- zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
-}
-
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
- return (zb->zb_objset == 0 && zb->zb_object == 0 &&
- zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
- const zbookmark_t *zb2)
-{
- uint64_t zb1nextL0, zb2thisobj;
-
- ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
- ASSERT(zb2->zb_level == 0);
-
- /*
- * A bookmark in the deadlist is considered to be after
- * everything else.
- */
- if (zb2->zb_object == DMU_DEADLIST_OBJECT)
- return (B_TRUE);
-
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
-
- zb1nextL0 = (zb1->zb_blkid + 1) <<
- ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
- zb2thisobj = zb2->zb_object ? zb2->zb_object :
- zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
- if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t nextobj = zb1nextL0 *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
- return (nextobj <= zb2thisobj);
- }
-
- if (zb1->zb_object < zb2thisobj)
- return (B_TRUE);
- if (zb1->zb_object > zb2thisobj)
- return (B_FALSE);
- if (zb2->zb_object == DMU_META_DNODE_OBJECT)
- return (B_FALSE);
- return (zb1nextL0 <= zb2->zb_blkid);
-}
-
-static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
-{
- uint64_t elapsed_nanosecs;
- int mintime;
-
- if (dp->dp_scrub_pausing)
- return (B_TRUE); /* we're already pausing */
-
- if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
- return (B_FALSE); /* we're resuming */
-
- /* We only know how to resume from level-0 blocks. */
- if (zb != NULL && zb->zb_level != 0)
- return (B_FALSE);
-
- mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time_ms :
- zfs_scrub_min_time_ms;
- elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time;
- if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
- (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) {
- if (zb) {
- dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset,
- (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level,
- (longlong_t)zb->zb_blkid);
- dp->dp_scrub_bookmark = *zb;
- }
- if (ddb) {
- dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
- (longlong_t)ddb->ddb_class,
- (longlong_t)ddb->ddb_type,
- (longlong_t)ddb->ddb_checksum,
- (longlong_t)ddb->ddb_cursor);
- ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
- }
- dp->dp_scrub_pausing = B_TRUE;
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-typedef struct zil_traverse_arg {
- dsl_pool_t *zta_dp;
- zil_header_t *zta_zh;
-} zil_traverse_arg_t;
-
-/* ARGSUSED */
-static int
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- zil_traverse_arg_t *zta = arg;
- dsl_pool_t *dp = zta->zta_dp;
- zil_header_t *zh = zta->zta_zh;
- zbookmark_t zb;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return (0);
-
- /*
- * One block ("stubby") can be allocated a long time ago; we
- * want to visit that one because it has been allocated
- * (on-disk) even if it hasn't been claimed (even though for
- * plain scrub there's nothing to do to it).
- */
- if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
- return (0);
-
- SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
- ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
-
- VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
- return (0);
-}
-
-/* ARGSUSED */
-static int
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
- if (lrc->lrc_txtype == TX_WRITE) {
- zil_traverse_arg_t *zta = arg;
- dsl_pool_t *dp = zta->zta_dp;
- zil_header_t *zh = zta->zta_zh;
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- zbookmark_t zb;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return (0);
-
- /*
- * birth can be < claim_txg if this record's txg is
- * already txg sync'ed (but this log block contains
- * other records that are not synced)
- */
- if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return (0);
-
- SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
- lr->lr_foid, ZB_ZIL_LEVEL,
- lr->lr_offset / BP_GET_LSIZE(bp));
-
- VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
- }
- return (0);
-}
-
-static void
-traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
-{
- uint64_t claim_txg = zh->zh_claim_txg;
- zil_traverse_arg_t zta = { dp, zh };
- zilog_t *zilog;
-
- /*
- * We only want to visit blocks that have been claimed but not yet
- * replayed (or, in read-only mode, blocks that *would* be claimed).
- */
- if (claim_txg == 0 && spa_writeable(dp->dp_spa))
- return;
-
- zilog = zil_alloc(dp->dp_meta_objset, zh);
-
- (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
- claim_txg);
-
- zil_free(zilog);
-}
-
-static void
-scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset,
- uint64_t object, uint64_t blkid)
-{
- zbookmark_t czb;
- uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
-
- if (zfs_no_scrub_prefetch)
- return;
-
- if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg ||
- (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
- return;
-
- SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
-
- (void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp,
- buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &flags, &czb);
-}
-
-static void
-scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
- arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
-{
- int err;
- arc_buf_t *buf = NULL;
-
- if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
-
- if (scrub_pause(dp, zb, NULL))
- return;
-
- if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
- /*
- * If we already visited this bp & everything below (in
- * a prior txg), don't bother doing it again.
- */
- if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
- return;
-
- /*
- * If we found the block we're trying to resume from, or
- * we went past it to a different object, zero it out to
- * indicate that it's OK to start checking for pausing
- * again.
- */
- if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
- zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
- dprintf("resuming at %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset,
- (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level,
- (longlong_t)zb->zb_blkid);
- bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
- }
- }
-
- /*
- * If dsl_pool_scrub_ddt() has aready scrubbed this block,
- * don't scrub it again.
- */
- if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
- (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
-
- if (BP_GET_LEVEL(bp) > 0) {
- uint32_t flags = ARC_WAIT;
- int i;
- blkptr_t *cbp;
- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
-
- err = arc_read(NULL, dp->dp_spa, bp, pbuf,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
- for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
- scrub_prefetch(dp, buf, cbp, zb->zb_objset,
- zb->zb_object, zb->zb_blkid * epb + i);
- }
- for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
- zbookmark_t czb;
-
- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
- zb->zb_level - 1,
- zb->zb_blkid * epb + i);
- scrub_visitbp(dp, dnp, buf, cbp, &czb);
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- uint32_t flags = ARC_WAIT;
- dnode_phys_t *cdnp;
- int i, j;
- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-
- err = arc_read(NULL, dp->dp_spa, bp, pbuf,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
- for (j = 0; j < cdnp->dn_nblkptr; j++) {
- blkptr_t *cbp = &cdnp->dn_blkptr[j];
- scrub_prefetch(dp, buf, cbp, zb->zb_objset,
- zb->zb_blkid * epb + i, j);
- }
- }
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
- scrub_visitdnode(dp, cdnp, buf, zb->zb_objset,
- zb->zb_blkid * epb + i);
- }
- } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- uint32_t flags = ARC_WAIT;
- objset_phys_t *osp;
-
- err = arc_read_nolock(NULL, dp->dp_spa, bp,
- arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
- if (err) {
- mutex_enter(&dp->dp_spa->spa_scrub_lock);
- dp->dp_spa->spa_scrub_errors++;
- mutex_exit(&dp->dp_spa->spa_scrub_lock);
- return;
- }
-
- osp = buf->b_data;
-
- traverse_zil(dp, &osp->os_zil_header);
-
- scrub_visitdnode(dp, &osp->os_meta_dnode,
- buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
- if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- scrub_visitdnode(dp, &osp->os_userused_dnode,
- buf, zb->zb_objset, DMU_USERUSED_OBJECT);
- scrub_visitdnode(dp, &osp->os_groupused_dnode,
- buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
- }
- }
-
- if (buf)
- (void) arc_buf_remove_ref(buf, &buf);
-}
-
-static void
-scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
- uint64_t objset, uint64_t object)
-{
- int j;
-
- for (j = 0; j < dnp->dn_nblkptr; j++) {
- zbookmark_t czb;
-
- SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
- scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
-
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- zbookmark_t czb;
- SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- scrub_visitbp(dp, dnp, buf, &dnp->dn_spill, &czb);
- }
- }
-}
-
-static void
-scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
-{
- zbookmark_t zb;
-
- SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- scrub_visitbp(dp, NULL, NULL, bp, &zb);
-}
-
-void
-dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- SET_BOOKMARK(&dp->dp_scrub_bookmark,
- ZB_DESTROYED_OBJSET, 0, 0, 0);
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) != 0) {
- return;
- }
-
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_next_snap_obj, tx) == 0);
- }
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
-}
-
-void
-dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
-
- if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- dp->dp_scrub_bookmark.zb_objset =
- ds->ds_phys->ds_prev_snap_obj;
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) == 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_prev_snap_obj, tx) == 0);
- }
-}
-
-void
-dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = ds1->ds_dir->dd_pool;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
- dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
- } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
- dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
- }
-
- if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds1->ds_object, tx) == 0) {
- int err = zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds2->ds_object, tx);
- VERIFY(err == 0 || err == EEXIST);
- if (err == EEXIST) {
- /* Both were there to begin with */
- VERIFY(0 == zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds1->ds_object, tx));
- }
- } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds2->ds_object, tx) == 0) {
- VERIFY(0 == zap_add_int(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, ds1->ds_object, tx));
- }
-}
-
-struct enqueue_clones_arg {
- dmu_tx_t *tx;
- uint64_t originobj;
-};
-
-/* ARGSUSED */
-static int
-enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
- struct enqueue_clones_arg *eca = arg;
- dsl_dataset_t *ds;
- int err;
- dsl_pool_t *dp;
-
- err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
- if (err)
- return (err);
- dp = ds->ds_dir->dd_pool;
-
- if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
- while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
- dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
-
- dsl_dataset_rele(ds, FTAG);
- if (err)
- return (err);
- ds = prev;
- }
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, eca->tx) == 0);
- }
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-static void
-scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds;
- uint64_t min_txg_save;
-
- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-
- /*
- * Iterate over the bps in this ds.
- */
- min_txg_save = dp->dp_scrub_min_txg;
- dp->dp_scrub_min_txg =
- MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
- scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
- dp->dp_scrub_min_txg = min_txg_save;
-
- if (dp->dp_scrub_pausing)
- goto out;
-
- /*
- * Add descendent datasets to work queue.
- */
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_phys->ds_next_snap_obj, tx) == 0);
- }
- if (ds->ds_phys->ds_num_children > 1) {
- boolean_t usenext = B_FALSE;
- if (ds->ds_phys->ds_next_clones_obj != 0) {
- uint64_t count;
- /*
- * A bug in a previous version of the code could
- * cause upgrade_clones_cb() to not set
- * ds_next_snap_obj when it should, leading to a
- * missing entry. Therefore we can only use the
- * next_clones_obj when its count is correct.
- */
- int err = zap_count(dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj, &count);
- if (err == 0 &&
- count == ds->ds_phys->ds_num_children - 1)
- usenext = B_TRUE;
- }
-
- if (usenext) {
- VERIFY(zap_join(dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj,
- dp->dp_scrub_queue_obj, tx) == 0);
- } else {
- struct enqueue_clones_arg eca;
- eca.tx = tx;
- eca.originobj = ds->ds_object;
-
- (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
- NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
- }
- }
-
-out:
- dsl_dataset_rele(ds, FTAG);
-}
-
-/* ARGSUSED */
-static int
-enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
- dmu_tx_t *tx = arg;
- dsl_dataset_t *ds;
- int err;
- dsl_pool_t *dp;
-
- err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
- if (err)
- return (err);
-
- dp = ds->ds_dir->dd_pool;
-
- while (ds->ds_phys->ds_prev_snap_obj != 0) {
- dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
- FTAG, &prev);
- if (err) {
- dsl_dataset_rele(ds, FTAG);
- return (err);
- }
-
- /*
- * If this is a clone, we don't need to worry about it for now.
- */
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_rele(ds, FTAG);
- dsl_dataset_rele(prev, FTAG);
- return (0);
- }
- dsl_dataset_rele(ds, FTAG);
- ds = prev;
- }
-
- VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
- ds->ds_object, tx) == 0);
- dsl_dataset_rele(ds, FTAG);
- return (0);
-}
-
-/*
- * Scrub/dedup interaction.
- *
- * If there are N references to a deduped block, we don't want to scrub it
- * N times -- ideally, we should scrub it exactly once.
- *
- * We leverage the fact that the dde's replication class (enum ddt_class)
- * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
- * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
- *
- * To prevent excess scrubbing, the scrub begins by walking the DDT
- * to find all blocks with refcnt > 1, and scrubs each of these once.
- * Since there are two replication classes which contain blocks with
- * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
- * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
- *
- * There would be nothing more to say if a block's refcnt couldn't change
- * during a scrub, but of course it can so we must account for changes
- * in a block's replication class.
- *
- * Here's an example of what can occur:
- *
- * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
- * when visited during the top-down scrub phase, it will be scrubbed twice.
- * This negates our scrub optimization, but is otherwise harmless.
- *
- * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
- * on each visit during the top-down scrub phase, it will never be scrubbed.
- * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
- * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
- * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
- * while a scrub is in progress, it scrubs the block right then.
- */
-static void
-dsl_pool_scrub_ddt(dsl_pool_t *dp)
-{
- ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
- ddt_entry_t dde;
- int error;
-
- while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
- if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
- return;
- dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
- if (scrub_pause(dp, NULL, ddb))
- return;
- }
- ASSERT(error == ENOENT);
- ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
-}
-
-void
-dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
- const ddt_entry_t *dde)
-{
- const ddt_key_t *ddk = &dde->dde_key;
- const ddt_phys_t *ddp = dde->dde_phys;
- blkptr_t blk;
- zbookmark_t zb = { 0 };
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- if (ddp->ddp_phys_birth == 0)
- continue;
- ddt_bp_create(checksum, ddk, ddp, &blk);
- scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
- }
-}
-
-void
-dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
-{
- spa_t *spa = dp->dp_spa;
- zap_cursor_t zc;
- zap_attribute_t za;
- boolean_t complete = B_TRUE;
-
- if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
- return;
-
- /*
- * If the pool is not loaded, or is trying to unload, leave it alone.
- */
- if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa))
- return;
-
- if (dp->dp_scrub_restart) {
- enum scrub_func func = dp->dp_scrub_func;
- dp->dp_scrub_restart = B_FALSE;
- dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
- }
-
- if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
- /*
- * We must have resumed after rebooting; reset the vdev
- * stats to know that we're doing a scrub (although it
- * will think we're just starting now).
- */
- vdev_scrub_stat_update(spa->spa_root_vdev,
- dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
- POOL_SCRUB_EVERYTHING, B_FALSE);
- }
-
- dp->dp_scrub_pausing = B_FALSE;
- dp->dp_scrub_start_time = gethrtime();
- dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
- spa->spa_scrub_active = B_TRUE;
-
- if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
- dsl_pool_scrub_ddt(dp);
- if (dp->dp_scrub_pausing)
- goto out;
- }
-
- if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
- /* First do the MOS & ORIGIN */
- scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
- if (dp->dp_scrub_pausing)
- goto out;
-
- if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
- VERIFY(0 == dmu_objset_find_spa(spa,
- NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
- } else {
- scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
- }
- ASSERT(!dp->dp_scrub_pausing);
- } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
- /*
- * If we were paused, continue from here. Note if the ds
- * we were paused on was destroyed, the zb_objset will be
- * ZB_DESTROYED_OBJSET, so we will skip this and find a new
- * objset below.
- */
- scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
- if (dp->dp_scrub_pausing)
- goto out;
- }
-
- /*
- * In case we were paused right at the end of the ds, zero the
- * bookmark so we don't think that we're still trying to resume.
- */
- bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
- /* keep pulling things out of the zap-object-as-queue */
- while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
- zap_cursor_retrieve(&zc, &za) == 0) {
- VERIFY(0 == zap_remove(dp->dp_meta_objset,
- dp->dp_scrub_queue_obj, za.za_name, tx));
- scrub_visitds(dp, za.za_first_integer, tx);
- if (dp->dp_scrub_pausing)
- break;
- zap_cursor_fini(&zc);
- }
- zap_cursor_fini(&zc);
- if (dp->dp_scrub_pausing)
- goto out;
-
- /* done. */
-
- dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
- return;
-out:
- VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
- sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
- &dp->dp_scrub_bookmark, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
- sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
- &dp->dp_scrub_ddt_bookmark, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
- &dp->dp_scrub_ddt_class_max, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
- &spa->spa_scrub_errors, tx));
-}
-
-void
-dsl_pool_scrub_restart(dsl_pool_t *dp)
-{
- mutex_enter(&dp->dp_scrub_cancel_lock);
- dp->dp_scrub_restart = B_TRUE;
- mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-/*
- * scrub consumers
- */
-
-static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
-{
- int i;
-
- /*
- * If we resume after a reboot, zab will be NULL; don't record
- * incomplete stats in that case.
- */
- if (zab == NULL)
- return;
-
- for (i = 0; i < 4; i++) {
- int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
- int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
- zfs_blkstat_t *zb = &zab->zab_type[l][t];
- int equal;
-
- zb->zb_count++;
- zb->zb_asize += BP_GET_ASIZE(bp);
- zb->zb_lsize += BP_GET_LSIZE(bp);
- zb->zb_psize += BP_GET_PSIZE(bp);
- zb->zb_gangs += BP_COUNT_GANG(bp);
-
- switch (BP_GET_NDVAS(bp)) {
- case 2:
- if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1]))
- zb->zb_ditto_2_of_2_samevdev++;
- break;
- case 3:
- equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[1])) +
- (DVA_GET_VDEV(&bp->blk_dva[0]) ==
- DVA_GET_VDEV(&bp->blk_dva[2])) +
- (DVA_GET_VDEV(&bp->blk_dva[1]) ==
- DVA_GET_VDEV(&bp->blk_dva[2]));
- if (equal == 1)
- zb->zb_ditto_2_of_3_samevdev++;
- else if (equal == 3)
- zb->zb_ditto_3_of_3_samevdev++;
- break;
- }
- }
-}
-
-static void
-dsl_pool_scrub_clean_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
-
- zio_data_buf_free(zio->io_data, zio->io_size);
-
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_inflight--;
- cv_broadcast(&spa->spa_scrub_io_cv);
-
- if (zio->io_error && (zio->io_error != ECKSUM ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
- spa->spa_scrub_errors++;
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static int
-dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
- const blkptr_t *bp, const zbookmark_t *zb)
-{
- size_t size = BP_GET_PSIZE(bp);
- spa_t *spa = dp->dp_spa;
- uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
- boolean_t needs_io;
- int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
- int zio_priority;
-
- if (phys_birth <= dp->dp_scrub_min_txg ||
- phys_birth >= dp->dp_scrub_max_txg)
- return (0);
-
- count_block(dp->dp_blkstats, bp);
-
- if (dp->dp_scrub_isresilver == 0) {
- /* It's a scrub */
- zio_flags |= ZIO_FLAG_SCRUB;
- zio_priority = ZIO_PRIORITY_SCRUB;
- needs_io = B_TRUE;
- } else {
- /* It's a resilver */
- zio_flags |= ZIO_FLAG_RESILVER;
- zio_priority = ZIO_PRIORITY_RESILVER;
- needs_io = B_FALSE;
- }
-
- /* If it's an intent log block, failure is expected. */
- if (zb->zb_level == ZB_ZIL_LEVEL)
- zio_flags |= ZIO_FLAG_SPECULATIVE;
-
- for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
- vdev_t *vd = vdev_lookup_top(spa,
- DVA_GET_VDEV(&bp->blk_dva[d]));
-
- /*
- * Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
- */
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_examined +=
- DVA_GET_ASIZE(&bp->blk_dva[d]);
- mutex_exit(&vd->vdev_stat_lock);
-
- /* if it's a resilver, this may not be in the target range */
- if (!needs_io) {
- if (DVA_GET_GANG(&bp->blk_dva[d])) {
- /*
- * Gang members may be spread across multiple
- * vdevs, so the best estimate we have is the
- * scrub range, which has already been checked.
- * XXX -- it would be better to change our
- * allocation policy to ensure that all
- * gang members reside on the same vdev.
- */
- needs_io = B_TRUE;
- } else {
- needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
- phys_birth, 1);
- }
- }
- }
-
- if (needs_io && !zfs_no_scrub_io) {
- void *data = zio_data_buf_alloc(size);
-
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_inflight++;
- mutex_exit(&spa->spa_scrub_lock);
-
- zio_nowait(zio_read(NULL, spa, bp, data, size,
- dsl_pool_scrub_clean_done, NULL, zio_priority,
- zio_flags, zb));
- }
-
- /* do not relocate this block */
- return (0);
-}
-
-int
-dsl_pool_scrub_clean(dsl_pool_t *dp)
-{
- spa_t *spa = dp->dp_spa;
-
- /*
- * Purge all vdev caches and probe all devices. We do this here
- * rather than in sync context because this requires a writer lock
- * on the spa_config lock, which we can't do from sync context. The
- * spa_scrub_reopen flag indicates that vdev_open() should not
- * attempt to start another scrub.
- */
- spa_vdev_state_enter(spa, SCL_NONE);
- spa->spa_scrub_reopen = B_TRUE;
- vdev_reopen(spa->spa_root_vdev);
- spa->spa_scrub_reopen = B_FALSE;
- (void) spa_vdev_state_exit(spa, NULL, 0);
-
- return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
-}
diff --git a/usr/src/uts/common/fs/zfs/dsl_synctask.c b/usr/src/uts/common/fs/zfs/dsl_synctask.c
index 81c6334cc8..832685b0fc 100644
--- a/usr/src/uts/common/fs/zfs/dsl_synctask.c
+++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/dmu.h>
@@ -29,7 +28,6 @@
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/metaslab.h>
-#include <sys/cred.h>
#define DST_AVG_BLKSHIFT 14
@@ -49,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp)
list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
offsetof(dsl_sync_task_t, dst_node));
dstg->dstg_pool = dp;
- dstg->dstg_cr = CRED();
return (dstg);
}
@@ -136,7 +133,6 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
uint64_t txg;
dstg->dstg_nowaiter = B_TRUE;
- dstg->dstg_cr = NULL; /* it won't be valid by the time we sync */
txg = dmu_tx_get_txg(tx);
/*
* We don't generally have many sync tasks, so pay the price of
@@ -200,8 +196,7 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
*/
for (dst = list_head(&dstg->dstg_tasks); dst;
dst = list_next(&dstg->dstg_tasks, dst)) {
- dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
- dstg->dstg_cr, tx);
+ dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
}
}
rw_exit(&dp->dp_config_rwlock);
diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c
index f1b3b23fe2..8358b4ceeb 100644
--- a/usr/src/uts/common/fs/zfs/refcount.c
+++ b/usr/src/uts/common/fs/zfs/refcount.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/refcount.h>
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index dd0fbccc7e..510472a515 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -59,6 +59,7 @@
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
#ifdef _KERNEL
#include <sys/bootprops.h>
@@ -110,7 +111,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
};
-static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
+static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
@@ -1105,7 +1106,7 @@ spa_load_spares(spa_t *spa)
KM_SLEEP);
for (i = 0; i < spa->spa_spares.sav_count; i++)
spares[i] = vdev_config_generate(spa,
- spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
+ spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
for (i = 0; i < spa->spa_spares.sav_count; i++)
@@ -1231,7 +1232,7 @@ spa_load_l2cache(spa_t *spa)
l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
l2cache[i] = vdev_config_generate(spa,
- sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
+ sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
VERIFY(nvlist_add_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
@@ -1429,7 +1430,7 @@ spa_load_verify_done(zio_t *zio)
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
if (bp != NULL) {
zio_t *rio = arg;
@@ -1780,6 +1781,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
spa->spa_first_txg = spa->spa_last_ubsync_txg ?
spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
if (error)
@@ -1851,6 +1853,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+ &spa->spa_creation_version);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
/*
* Load the persistent error log. If we have an older pool, this will
* not be present.
@@ -2076,7 +2083,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
/*
* Check all DTLs to see if anything needs resilvering.
*/
- if (vdev_resilver_needed(rvd, NULL, NULL))
+ if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(rvd, NULL, NULL))
spa_async_request(spa, SPA_ASYNC_RESILVER);
/*
@@ -2375,7 +2383,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
if (spa_spare_exists(guid, &pool, NULL) &&
pool != 0ULL) {
VERIFY(nvlist_lookup_uint64_array(
- spares[i], ZPOOL_CONFIG_STATS,
+ spares[i], ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) == 0);
vs->vs_state = VDEV_STATE_CANT_OPEN;
vs->vs_aux = VDEV_AUX_SPARED;
@@ -2432,7 +2440,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
ASSERT(vd != NULL);
VERIFY(nvlist_lookup_uint64_array(l2cache[i],
- ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
vdev_get_stats(vd, vs);
}
}
@@ -2819,6 +2828,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
cmn_err(CE_PANIC, "failed to add pool config");
}
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+ sizeof (uint64_t), 1, &version, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool version");
+ }
+
/* Newly created pools with the right version are always deflated. */
if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
spa->spa_deflate = TRUE;
@@ -2861,7 +2876,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
- spa_sync_props(spa, props, CRED(), tx);
+ spa_sync_props(spa, props, tx);
}
dmu_tx_commit(tx);
@@ -3219,8 +3234,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
return (error);
}
- spa_async_resume(spa);
-
/*
* Override any spares and level 2 cache devices as specified by
* the user, as these may have correct device names/devids, etc.
@@ -3271,6 +3284,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
}
+ spa_async_resume(spa);
+
/*
* It's possible that the pool was expanded while it was exported.
* We kick off an async task to handle this for us.
@@ -3455,7 +3470,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa->spa_state = new_state;
- spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+ spa->spa_final_txg = spa_last_synced_txg(spa) +
+ TXG_DEFER_SIZE + 1;
vdev_config_dirty(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
}
@@ -3635,7 +3651,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
- uint64_t txg, open_txg;
+ uint64_t txg, dtl_max_txg;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
@@ -3771,13 +3787,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
vdev_config_dirty(tvd);
/*
- * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
- * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+ * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+ * for any dmu_sync-ed blocks. It will propagate upward when
+ * spa_vdev_exit() calls vdev_dtl_reassess().
*/
- open_txg = txg + TXG_CONCURRENT_STATES - 1;
+ dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING,
- TXG_INITIAL, open_txg - TXG_INITIAL + 1);
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
if (newvd->vdev_isspare) {
spa_spare_activate(newvd);
@@ -3793,10 +3810,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
vdev_dirty(tvd, VDD_DTL, newvd, txg);
- (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+ /*
+ * Restart the resilver
+ */
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+
+ /*
+ * Commit the config
+ */
+ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
- spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL,
- CRED(), "%s vdev=%s %s vdev=%s",
+ spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
+ "%s vdev=%s %s vdev=%s",
replacing && newvd_isspare ? "spare in" :
replacing ? "replace" : "attach", newvdpath,
replacing ? "for" : "to", oldvdpath);
@@ -3804,11 +3829,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
spa_strfree(oldvdpath);
spa_strfree(newvdpath);
- /*
- * Kick off a resilver to update newvd.
- */
- VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
-
return (0);
}
@@ -4004,7 +4024,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
error = spa_vdev_exit(spa, vd, txg, 0);
- spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(),
+ spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
"vdev=%s", vdpath);
spa_strfree(vdpath);
@@ -4024,7 +4044,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
continue;
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
- (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ (void) spa_vdev_remove(spa, unspare_guid,
+ B_TRUE);
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);
}
@@ -4266,8 +4287,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
if (vml[c] != NULL) {
vdev_split(vml[c]);
if (error == 0)
- spa_history_internal_log(LOG_POOL_VDEV_DETACH,
- spa, tx, CRED(), "vdev=%s",
+ spa_history_log_internal(LOG_POOL_VDEV_DETACH,
+ spa, tx, "vdev=%s",
vml[c]->vdev_path);
vdev_free(vml[c]);
}
@@ -4283,7 +4304,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
zio_handle_panic_injection(spa, FTAG, 3);
/* split is complete; log a history record */
- spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(),
+ spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
"split new pool %s from pool %s", newname, spa_name(spa));
kmem_free(vml, children * sizeof (vdev_t *));
@@ -4359,21 +4380,13 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
}
/*
- * Removing a device from the vdev namespace requires several steps
- * and can take a significant amount of time. As a result we use
- * the spa_vdev_config_[enter/exit] functions which allow us to
- * grab and release the spa_config_lock while still holding the namespace
- * lock. During each step the configuration is synced out.
- */
-
-/*
* Evacuate the device.
*/
-int
+static int
spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
{
- int error = 0;
uint64_t txg;
+ int error = 0;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
@@ -4386,14 +4399,12 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
* should no longer have any blocks allocated on it.
*/
if (vd->vdev_islog) {
- error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
- NULL, DS_FIND_CHILDREN);
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_offline_log(spa);
} else {
- error = ENOTSUP; /* until we have bp rewrite */
+ error = ENOTSUP;
}
- txg_wait_synced(spa_get_dsl(spa), 0);
-
if (error)
return (error);
@@ -4401,6 +4412,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
* The evacuation succeeded. Remove any remaining MOS metadata
* associated with this vdev, and wait for these changes to sync.
*/
+ ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
txg = spa_vdev_config_enter(spa);
vd->vdev_removing = B_TRUE;
vdev_dirty(vd, 0, NULL, txg);
@@ -4413,7 +4425,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
/*
* Complete the removal by cleaning up the namespace.
*/
-void
+static void
spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
{
vdev_t *rvd = spa->spa_root_vdev;
@@ -4424,6 +4436,12 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
ASSERT(vd == vd->vdev_top);
+ /*
+ * Only remove the device if it is empty.
+ */
+ if (vd->vdev_stat.vs_alloc != 0)
+ return;
+
(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
if (list_link_active(&vd->vdev_state_dirty_node))
@@ -4439,15 +4457,19 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
vdev_add_child(rvd, vd);
}
- vdev_config_dirty(rvd);
-
- /*
- * Reassess the health of our root vdev.
- */
- vdev_reopen(rvd);
}
/*
+ * Remove a device from the pool -
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+
+/*
* Remove a device from the pool. Currently, this supports removing only hot
* spares, slogs, and level 2 ARC devices.
*/
@@ -4690,40 +4712,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
/*
* ==========================================================================
- * SPA Scrubbing
+ * SPA Scanning
* ==========================================================================
*/
int
-spa_scrub(spa_t *spa, pool_scrub_type_t type)
+spa_scan_stop(spa_t *spa)
{
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (EBUSY);
+ return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
- if ((uint_t)type >= POOL_SCRUB_TYPES)
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
return (ENOTSUP);
/*
* If a resilver was requested, but there is no DTL on a
* writeable leaf device, we have nothing to do.
*/
- if (type == POOL_SCRUB_RESILVER &&
+ if (func == POOL_SCAN_RESILVER &&
!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
return (0);
}
- if (type == POOL_SCRUB_EVERYTHING &&
- spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
- spa->spa_dsl_pool->dp_scrub_isresilver)
- return (EBUSY);
-
- if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
- return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
- } else if (type == POOL_SCRUB_NONE) {
- return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
- } else {
- return (EINVAL);
- }
+ return (dsl_scan(spa->spa_dsl_pool, func));
}
/*
@@ -4829,8 +4849,8 @@ spa_async_thread(spa_t *spa)
* then log an internal history event.
*/
if (new_space != old_space) {
- spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
- spa, NULL, CRED(),
+ spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
+ spa, NULL,
"pool '%s' size: %llu(+%llu)",
spa_name(spa), new_space, new_space - old_space);
}
@@ -4874,7 +4894,7 @@ spa_async_thread(spa_t *spa)
* Kick off a resilver.
*/
if (tasks & SPA_ASYNC_RESILVER)
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
+ dsl_resilver_restart(spa->spa_dsl_pool, 0);
/*
* Let the world know that we're done.
@@ -4920,6 +4940,7 @@ spa_async_dispatch(spa_t *spa)
void
spa_async_request(spa_t *spa, int task)
{
+ zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
mutex_enter(&spa->spa_async_lock);
spa->spa_async_tasks |= task;
mutex_exit(&spa->spa_async_lock);
@@ -5024,7 +5045,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
- B_FALSE, B_FALSE, B_TRUE);
+ B_FALSE, VDEV_CONFIG_L2CACHE);
VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
sav->sav_count) == 0);
for (i = 0; i < sav->sav_count; i++)
@@ -5064,7 +5085,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
* Set zpool properties.
*/
static void
-spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
spa_t *spa = arg1;
objset_t *mos = spa->spa_meta_objset;
@@ -5176,8 +5197,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
/* log internal history if this is not a zpool create */
if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
tx->tx_txg != TXG_INITIAL) {
- spa_history_internal_log(LOG_POOL_PROPSET,
- spa, tx, cr, "%s %lld %s",
+ spa_history_log_internal(LOG_POOL_PROPSET,
+ spa, tx, "%s %lld %s",
nvpair_name(elem), intval, spa_name(spa));
}
}
@@ -5272,13 +5293,17 @@ spa_sync(spa_t *spa, uint64_t txg)
}
/*
- * If anything has changed in this txg, push the deferred frees
- * from the previous txg. If not, leave them alone so that we
- * don't generate work on an otherwise idle system.
+ * If anything has changed in this txg, or if someone is waiting
+ * for this txg to sync (e.g., spa_vdev_remove()), push the
+ * deferred frees from the previous txg. If not, leave them
+ * alone so that we don't generate work on an otherwise idle
+ * system.
*/
if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
!txg_list_empty(&dp->dp_dirty_dirs, txg) ||
- !txg_list_empty(&dp->dp_sync_tasks, txg))
+ !txg_list_empty(&dp->dp_sync_tasks, txg) ||
+ ((dp->dp_scan->scn_phys.scn_state == DSS_SCANNING ||
+ txg_sync_waiting(dp)) && !spa_shutting_down(spa)))
spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);
/*
@@ -5304,11 +5329,7 @@ spa_sync(spa_t *spa, uint64_t txg)
}
ddt_sync(spa, txg);
-
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight > 0)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- mutex_exit(&spa->spa_scrub_lock);
+ dsl_scan_sync(dp, tx);
while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
vdev_sync(vd, txg);
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index 68a40bec89..cdeda3f93c 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/spa.h>
@@ -419,7 +418,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
split_guid) == 0);
}
- nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
+ nvroot = vdev_config_generate(spa, vd, getstats, 0);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c
index 4c834e2d4e..282140b3bd 100644
--- a/usr/src/uts/common/fs/zfs/spa_errlog.c
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -54,36 +53,6 @@
#include <sys/zap.h>
#include <sys/zio.h>
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexidecimal numbers that don't overflow.
- */
-uint64_t
-strtonum(const char *str, char **nptr)
-{
- uint64_t val = 0;
- char c;
- int digit;
-
- while ((c = *str) != '\0') {
- if (c >= '0' && c <= '9')
- digit = c - '0';
- else if (c >= 'a' && c <= 'f')
- digit = 10 + c - 'a';
- else
- break;
-
- val *= 16;
- val += digit;
-
- str++;
- }
-
- if (nptr)
- *nptr = (char *)str;
-
- return (val);
-}
/*
* Convert a bookmark to a string.
diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c
index 18d4836bc7..212abae5b8 100644
--- a/usr/src/uts/common/fs/zfs/spa_history.c
+++ b/usr/src/uts/common/fs/zfs/spa_history.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/spa.h>
@@ -33,6 +32,7 @@
#include <sys/utsname.h>
#include <sys/cmn_err.h>
#include <sys/sunddi.h>
+#include "zfs_comutil.h"
#ifdef _KERNEL
#include <sys/zone.h>
#endif
@@ -189,7 +189,7 @@ spa_history_zone()
*/
/*ARGSUSED*/
static void
-spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
spa_t *spa = arg1;
history_arg_t *hap = arg2;
@@ -244,6 +244,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
hap->ha_log_type == LOG_CMD_NORMAL) {
VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
history_str) == 0);
+
+ zfs_dbgmsg("command: %s", history_str);
} else {
VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
hap->ha_event) == 0);
@@ -251,6 +253,11 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
tx->tx_txg) == 0);
VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
history_str) == 0);
+
+ zfs_dbgmsg("internal %s pool:%s txg:%llu %s",
+ zfs_history_event_names[hap->ha_event], spa_name(spa),
+ (longlong_t)tx->tx_txg, history_str);
+
}
VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
@@ -418,7 +425,7 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
static void
log_internal(history_internal_events_t event, spa_t *spa,
- dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx)
+ dmu_tx_t *tx, const char *fmt, va_list adx)
{
history_arg_t *ha;
@@ -441,7 +448,7 @@ log_internal(history_internal_events_t event, spa_t *spa,
ha->ha_uid = 0;
if (dmu_tx_is_syncing(tx)) {
- spa_history_log_sync(spa, ha, cr, tx);
+ spa_history_log_sync(spa, ha, tx);
} else {
dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
spa_history_log_sync, spa, ha, 0, tx);
@@ -450,8 +457,8 @@ log_internal(history_internal_events_t event, spa_t *spa,
}
void
-spa_history_internal_log(history_internal_events_t event, spa_t *spa,
- dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+spa_history_log_internal(history_internal_events_t event, spa_t *spa,
+ dmu_tx_t *tx, const char *fmt, ...)
{
dmu_tx_t *htx = tx;
va_list adx;
@@ -466,7 +473,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa,
}
va_start(adx, fmt);
- log_internal(event, spa, htx, cr, fmt, adx);
+ log_internal(event, spa, htx, fmt, adx);
va_end(adx);
/* if we didn't get a tx from the caller, commit the one we made */
@@ -481,7 +488,7 @@ spa_history_log_version(spa_t *spa, history_internal_events_t event)
uint64_t current_vers = spa_version(spa);
if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
- spa_history_internal_log(event, spa, NULL, CRED(),
+ spa_history_log_internal(event, spa, NULL,
"pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
(u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
utsname.nodename, utsname.release, utsname.version,
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index c815cd6113..8faa84a1b0 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -40,6 +40,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
@@ -888,10 +889,10 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
/*
- * If the config changed, notify the scrub thread that it must restart.
+ * If the config changed, notify the scrub that it must restart.
+ * This will initiate a resilver if needed.
*/
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
- dsl_pool_scrub_restart(spa->spa_dsl_pool);
config_changed = B_TRUE;
spa->spa_config_generation++;
}
@@ -1078,7 +1079,6 @@ spa_rename(const char *name, const char *newname)
return (0);
}
-
/*
* Determine whether a pool with given pool_guid exists. If device_guid is
* non-zero, determine whether the pool exists *and* contains a device with the
@@ -1209,6 +1209,37 @@ zfs_panic_recover(const char *fmt, ...)
}
/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+uint64_t
+strtonum(const char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ if (nptr)
+ *nptr = (char *)str;
+
+ return (val);
+}
+
+/*
* ==========================================================================
* Accessor functions
* ==========================================================================
@@ -1390,6 +1421,12 @@ spa_max_replication(spa_t *spa)
return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}
+int
+spa_prev_software_version(spa_t *spa)
+{
+ return (spa->spa_prev_software_version);
+}
+
uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
@@ -1584,3 +1621,45 @@ spa_dedup_checksum(spa_t *spa)
{
return (spa->spa_dedup_checksum);
}
+
+/*
+ * Reset pool scan stat per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+ /* data not stored on disk */
+ spa->spa_scan_pass_start = gethrestime_sec();
+ spa->spa_scan_pass_exam = 0;
+ vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+ if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+ return (ENOENT);
+ bzero(ps, sizeof (pool_scan_stat_t));
+
+ /* data stored on disk */
+ ps->pss_func = scn->scn_phys.scn_func;
+ ps->pss_start_time = scn->scn_phys.scn_start_time;
+ ps->pss_end_time = scn->scn_phys.scn_end_time;
+ ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+ ps->pss_examined = scn->scn_phys.scn_examined;
+ ps->pss_to_process = scn->scn_phys.scn_to_process;
+ ps->pss_processed = scn->scn_phys.scn_processed;
+ ps->pss_errors = scn->scn_phys.scn_errors;
+ ps->pss_state = scn->scn_phys.scn_state;
+
+ /* data not stored on disk */
+ ps->pss_pass_start = spa->spa_scan_pass_start;
+ ps->pss_pass_exam = spa->spa_scan_pass_exam;
+
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index c528fac1a6..f0ad04ff49 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ARC_H
@@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func;
struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
- krwlock_t b_lock;
+ kmutex_t b_evict_lock;
+ krwlock_t b_data_lock;
void *b_data;
arc_evict_func_t *b_efunc;
void *b_private;
@@ -92,6 +92,8 @@ void arc_buf_add_ref(arc_buf_t *buf, void *tag);
int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
+int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+ zbookmark_t *zb);
int arc_released(arc_buf_t *buf);
int arc_has_callback(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 54686ba32d..4c05806e3e 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DBUF_H
@@ -278,6 +277,7 @@ void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
struct dmu_tx *);
diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h
index 5eab6a2fb2..bd446acafa 100644
--- a/usr/src/uts/common/fs/zfs/sys/ddt.h
+++ b/usr/src/uts/common/fs/zfs/sys/ddt.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DDT_H
@@ -232,6 +231,8 @@ extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx);
extern const ddt_ops_t ddt_zap_ops;
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 9c9369f8a7..7df5d48321 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -118,7 +118,7 @@ typedef enum dmu_object_type {
DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
DMU_OT_NEXT_CLONES, /* ZAP */
- DMU_OT_SCRUB_QUEUE, /* ZAP */
+ DMU_OT_SCAN_QUEUE, /* ZAP */
DMU_OT_USERGROUP_USED, /* ZAP */
DMU_OT_USERGROUP_QUOTA, /* ZAP */
DMU_OT_USERREFS, /* ZAP */
@@ -128,6 +128,8 @@ typedef enum dmu_object_type {
DMU_OT_SA_MASTER_NODE, /* ZAP */
DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
+ DMU_OT_SCAN_XLATE, /* ZAP */
+ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -220,23 +222,8 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
#define DMU_POOL_DDT "DDT-%s-%s-%s"
#define DMU_POOL_DDT_STATS "DDT-statistics"
-
-/* 4x8 zbookmark_t */
-#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark"
-/* 4x8 ddt_bookmark_t */
-#define DMU_POOL_SCRUB_DDT_BOOKMARK "scrub_ddt_bookmark"
-/* 1x8 max_class */
-#define DMU_POOL_SCRUB_DDT_CLASS_MAX "scrub_ddt_class_max"
-/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
-#define DMU_POOL_SCRUB_QUEUE "scrub_queue"
-/* 1x8 txg */
-#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg"
-/* 1x8 txg */
-#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg"
-/* 1x4 enum scrub_func */
-#define DMU_POOL_SCRUB_FUNC "scrub_func"
-/* 1x8 count */
-#define DMU_POOL_SCRUB_ERRORS "scrub_errors"
+#define DMU_POOL_CREATION_VERSION "creation_version"
+#define DMU_POOL_SCAN "scan"
/*
* Allocate an object from this objset. The range of object numbers
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index ef1782b74d..5c5119a207 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -46,6 +46,9 @@ struct dmu_tx;
#define OBJSET_PHYS_SIZE 2048
#define OBJSET_OLD_PHYS_SIZE 1024
+#define OBJSET_BUF_HAS_USERUSED(buf) \
+ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)
typedef struct objset_phys {
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
index 5b0821253d..844e7f1aeb 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DMU_TRAVERSE_H
@@ -37,9 +36,11 @@ extern "C" {
struct dnode_phys;
struct dsl_dataset;
struct zilog;
+struct arc_buf;
typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+ struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp,
+ void *arg);
#define TRAVERSE_PRE (1<<0)
#define TRAVERSE_POST (1<<1)
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index 6eb7505ea5..a21971679c 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DATASET_H
@@ -113,7 +112,6 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
- uint64_t ds_origin_txg;
/* has internal locking: */
bplist_t ds_deadlist;
@@ -235,8 +233,7 @@ int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
uint64_t *ref_rsrv);
int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
uint64_t quota);
-void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr,
- dmu_tx_t *tx);
+dsl_syncfunc_t dsl_dataset_set_quota_sync;
int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
uint64_t reservation);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
index 14a64e019e..327cda6eec 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_DIR_H
@@ -90,6 +89,7 @@ struct dsl_dir {
kmutex_t dd_lock;
list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+ uint64_t dd_origin_txg;
/* gross estimate of space used by in-flight tx's */
uint64_t dd_tempreserved[TXG_SIZE];
@@ -142,6 +142,7 @@ timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
#define ORIGIN_DIR_NAME "$ORIGIN"
+#define XLATION_DIR_NAME "$XLATION"
#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index 97541ad2f1..187040e700 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -32,6 +32,7 @@
#include <sys/zio.h>
#include <sys/dnode.h>
#include <sys/ddt.h>
+#include <sys/arc.h>
#ifdef __cplusplus
extern "C" {
@@ -42,12 +43,7 @@ struct dsl_dir;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;
-
-enum scrub_func {
- SCRUB_FUNC_NONE,
- SCRUB_FUNC_CLEAN,
- SCRUB_FUNC_NUMFUNCS
-};
+struct dsl_scan;
/* These macros are for indexing into the zfs_all_blkstats_t. */
#define DMU_OT_DEFERRED DMU_OT_NONE
@@ -87,25 +83,13 @@ typedef struct dsl_pool {
uint64_t dp_write_limit;
uint64_t dp_tmp_userrefs_obj;
+ struct dsl_scan *dp_scan;
+
/* Uses dp_lock */
kmutex_t dp_lock;
uint64_t dp_space_towrite[TXG_SIZE];
uint64_t dp_tempreserved[TXG_SIZE];
- enum scrub_func dp_scrub_func;
- uint64_t dp_scrub_queue_obj;
- uint64_t dp_scrub_min_txg;
- uint64_t dp_scrub_max_txg;
- uint64_t dp_scrub_start_time;
- uint64_t dp_scrub_ddt_class_max;
- zbookmark_t dp_scrub_bookmark;
- ddt_bookmark_t dp_scrub_ddt_bookmark;
- boolean_t dp_scrub_pausing;
- boolean_t dp_scrub_isresilver;
- boolean_t dp_scrub_restart;
- kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
- zio_t *dp_scrub_prefetch_zio_root;
-
/* Has its own locking */
tx_state_t dp_tx;
txg_list_t dp_dirty_datasets;
@@ -138,20 +122,15 @@ void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
const blkptr_t *bpp);
-void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
- struct dmu_tx *tx);
+int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
+int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+ arc_done_func_t *done, void *private, int priority, int zio_flags,
+ uint32_t *arc_flags, const zbookmark_t *zb);
void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
-int dsl_pool_scrub_cancel(dsl_pool_t *dp);
-int dsl_pool_scrub_clean(dsl_pool_t *dp);
-void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_scrub_restart(dsl_pool_t *dp);
-void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
- const ddt_entry_t *dde);
-
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
@@ -159,6 +138,7 @@ extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
const char *tag, dmu_tx_t *tx);
extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
index d8a8ab2d64..a636ad3509 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_PROP_H
@@ -91,7 +90,7 @@ int dsl_prop_set(const char *ddname, const char *propname,
zprop_source_t source, int intsz, int numints, const void *buf);
int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
- cred_t *cr, dmu_tx_t *tx);
+ dmu_tx_t *tx);
void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
zprop_source_t source, uint64_t *value);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
new file mode 100644
index 0000000000..c0eaa49567
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_SCAN_H
+#define _SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t (or structures made
+ * up solely of uint64_t, like the bookmarks), for byteswap purposes.
+ */
+typedef struct dsl_scan_phys {
+ uint64_t scn_func; /* pool_scan_func_t */
+ uint64_t scn_state; /* dsl_scan_state_t */
+ uint64_t scn_queue_obj;
+ uint64_t scn_min_txg;
+ uint64_t scn_max_txg;
+ uint64_t scn_cur_min_txg;
+ uint64_t scn_cur_max_txg;
+ uint64_t scn_start_time;
+ uint64_t scn_end_time;
+ uint64_t scn_to_examine; /* total bytes to be scanned */
+ uint64_t scn_examined; /* bytes scanned so far */
+ uint64_t scn_to_process;
+ uint64_t scn_processed;
+ uint64_t scn_errors; /* scan I/O error count */
+ uint64_t scn_ddt_class_max;
+ ddt_bookmark_t scn_ddt_bookmark;
+ zbookmark_t scn_bookmark;
+ uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
+
+typedef enum dsl_scan_flags {
+ DSF_VISIT_DS_AGAIN = 1<<0,
+} dsl_scan_flags_t;
+
+typedef struct dsl_scan {
+ struct dsl_pool *scn_dp;
+
+ boolean_t scn_pausing;
+ uint64_t scn_restart_txg;
+ uint64_t scn_sync_start_time;
+ zio_t *scn_prefetch_zio_root;
+
+ /* for debugging / information */
+ uint64_t scn_visited_this_txg;
+
+ dsl_scan_phys_t scn_phys;
+} dsl_scan_t;
+
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+ struct dmu_tx *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h b/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h
index 4995bfe5ac..9126290cdb 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_DSL_SYNCTASK_H
#define _SYS_DSL_SYNCTASK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/txg.h>
#include <sys/zfs_context.h>
@@ -38,7 +35,7 @@ extern "C" {
struct dsl_pool;
typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
typedef struct dsl_sync_task {
list_node_t dst_node;
@@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group {
txg_node_t dstg_node;
list_t dstg_tasks;
struct dsl_pool *dstg_pool;
- cred_t *dstg_cr;
uint64_t dstg_txg;
int dstg_err;
int dstg_space;
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index 5ce6251ddb..583d6303bd 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
index d3fe7b1f89..bc3ade80f1 100644
--- a/usr/src/uts/common/fs/zfs/sys/refcount.h
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
#define _SYS_REFCOUNT_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/inttypes.h>
#include <sys/list.h>
#include <sys/zfs_context.h>
@@ -91,6 +88,11 @@ typedef struct refcount {
atomic_add_64_nv(&(rc)->rc_count, number)
#define refcount_remove_many(rc, number, holder) \
atomic_add_64_nv(&(rc)->rc_count, -number)
+#define refcount_transfer(dst, src) { \
+ uint64_t __tmp = (src)->rc_count; \
+ atomic_add_64(&(src)->rc_count, -__tmp); \
+ atomic_add_64(&(dst)->rc_count, __tmp); \
+}
#define refcount_init()
#define refcount_fini()
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index a26a55cb42..4cc9244cd0 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SPA_H
@@ -262,7 +261,7 @@ typedef struct blkptr {
#define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
- BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
@@ -432,6 +431,8 @@ extern void spa_async_suspend(spa_t *spa);
extern void spa_async_resume(spa_t *spa);
extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
@@ -439,6 +440,14 @@ extern void spa_inject_delref(spa_t *spa);
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
+#define SPA_ASYNC_REMOVE_DONE 0x40
+#define SPA_ASYNC_REMOVE_STOP 0x80
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define SPA_REMOVE_UNSPARE 0x01
+#define SPA_REMOVE_DONE 0x02
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@@ -447,6 +456,7 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
@@ -465,14 +475,19 @@ extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
extern void spa_l2cache_activate(vdev_t *vd);
extern void spa_l2cache_drop(spa_t *spa);
-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);
/* spa syncing */
extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
extern void spa_sync_allpools(void);
-#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */
+/*
+ * DEFERRED_FREE must be large enough that regular blocks are not
+ * deferred. XXX so can't we change it back to 1?
+ */
+#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */
#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
@@ -577,6 +592,7 @@ extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
extern int spa_busy(void);
extern uint8_t spa_get_failmode(spa_t *spa);
extern boolean_t spa_suspended(spa_t *spa);
@@ -632,8 +648,8 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
char *his_buf);
extern int spa_history_log(spa_t *spa, const char *his_buf,
history_log_type_t what);
-extern void spa_history_internal_log(history_internal_events_t event,
- spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_log_internal(history_internal_events_t event,
+ spa_t *spa, dmu_tx_t *tx, const char *fmt, ...);
extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
/* error handling */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 9daec092b4..f857f733d2 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -150,13 +149,14 @@ struct spa {
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */
- uint64_t spa_scrub_errors; /* scrub I/O error count */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
uint8_t spa_scrub_finished; /* indicator to rotate logs */
uint8_t spa_scrub_started; /* started since last boot */
uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
+ uint64_t spa_scan_pass_start; /* start time per pass/reboot */
+ uint64_t spa_scan_pass_exam; /* examined bytes per pass */
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
int spa_async_suspended; /* async tasks suspended */
@@ -212,7 +212,8 @@ struct spa {
uint64_t spa_did; /* if procp != p0, did of t1 */
boolean_t spa_autoreplace; /* autoreplace set in open */
int spa_vdev_locks; /* locks grabbed */
-
+ uint64_t spa_creation_version; /* version at pool creation */
+ uint64_t spa_prev_software_version;
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
index c135df9b10..6ab6aa3135 100644
--- a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_UBERBLOCK_IMPL_H
@@ -52,6 +51,9 @@ struct uberblock {
uint64_t ub_guid_sum; /* sum of all vdev guids */
uint64_t ub_timestamp; /* UTC time of last sync */
blkptr_t ub_rootbp; /* MOS objset_phys_t */
+
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
};
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index b37516a984..941f234dc6 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@@ -83,8 +82,7 @@ extern void vdev_split(vdev_t *vd);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
-extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
- boolean_t complete);
+extern void vdev_scan_stat_init(vdev_t *vd);
extern void vdev_propagate_state(vdev_t *vd);
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
@@ -126,9 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
+typedef enum vdev_config_flag {
+ VDEV_CONFIG_SPARE = 1 << 0,
+ VDEV_CONFIG_L2CACHE = 1 << 1,
+ VDEV_CONFIG_REMOVING = 1 << 2
+} vdev_config_flag_t;
+
extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
- boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
+ boolean_t getstats, vdev_config_flag_t flags);
/*
* Label routines
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index f78ec5084e..2b886bc588 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -151,7 +151,7 @@ struct vdev {
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
- boolean_t vdev_removing; /* device is being removed? */
+ uint64_t vdev_removing; /* device is being removed? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h
index 3b9de2a2f9..b44fb8fbba 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_H
@@ -259,7 +258,6 @@ int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
*/
int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
-
/*
* Returns (in name) the name of the entry whose (value & mask)
* (za_first_integer) is value, or ENOENT if not found. The string
@@ -276,6 +274,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj,
*/
int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx);
+
/*
* Manipulate entries where the name + value are the "same" (the name is
* a stringified version of the value).
@@ -286,6 +292,21 @@ int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
dmu_tx_t *tx);
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t *valuep);
+
+/*
+ * The name is a stringified version of key; increment its value by
+ * delta. Zero values will be zap_remove()-ed.
+ */
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx);
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx);
+
struct zap;
struct zap_leaf;
typedef struct zap_cursor {
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
index 5aa0efc98d..739a380b75 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_IMPL_H
@@ -67,9 +66,12 @@ typedef struct mzap_ent {
avl_node_t mze_node;
int mze_chunkid;
uint64_t mze_hash;
- mzap_ent_phys_t mze_phys;
+ uint32_t mze_cd; /* copy from mze_phys->mze_cd */
} mzap_ent_t;
+#define MZE_PHYS(zap, mze) \
+ (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
+
/*
* The (fat) zap is stored in one object. It is an array of
* 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
index 173b6b195e..3a33636741 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
@@ -19,13 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZAP_LEAF_H
#define _SYS_ZAP_LEAF_H
+#include <sys/zap.h>
+
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
index 450ac1c81b..50ecf9b362 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
#define _SYS_ZFS_DEBUG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func,
extern void zfs_panic_recover(const char *fmt, ...);
+typedef struct zfs_dbgmsg {
+ list_node_t zdm_node;
+ time_t zdm_timestamp;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+extern void zfs_dbgmsg_init(void);
+extern void zfs_dbgmsg_fini(void);
+extern void zfs_dbgmsg(const char *fmt, ...);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index f5884f2925..991b9f2f59 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -28,6 +27,7 @@
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
#include <sys/callb.h>
/*
@@ -136,7 +136,7 @@ txg_sync_start(dsl_pool_t *dp)
* 32-bit x86. This is due in part to nested pools and
* scrub_visitbp() recursion.
*/
- tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread,
+ tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
dp, 0, &p0, TS_RUN, minclsyspri);
mutex_exit(&tx->tx_sync_lock);
@@ -366,12 +366,12 @@ txg_sync_thread(dsl_pool_t *dp)
uint64_t txg;
/*
- * We sync when we're scrubbing, there's someone waiting
+ * We sync when we're scanning, there's someone waiting
* on us, or the quiesce thread has handed off a txg to
* us, or we have reached our timeout.
*/
timer = (delta >= timeout ? 0 : timeout - delta);
- while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
+ while ((dp->dp_scan->scn_phys.scn_state != DSS_SCANNING ||
spa_load_state(spa) != SPA_LOAD_NONE ||
spa_shutting_down(spa)) &&
!tx->tx_exiting && timer > 0 &&
diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c
index 34d7e0c3ac..692cda137f 100644
--- a/usr/src/uts/common/fs/zfs/uberblock.c
+++ b/usr/src/uts/common/fs/zfs/uberblock.c
@@ -19,12 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/uberblock_impl.h>
#include <sys/vdev_impl.h>
@@ -58,6 +55,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
ub->ub_txg = txg;
ub->ub_guid_sum = rvd->vdev_guid_sum;
ub->ub_timestamp = gethrestime_sec();
+ ub->ub_software_version = SPA_VERSION;
return (ub->ub_rootbp.blk_birth == txg);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 2b77c45574..a61f29b8e7 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -39,6 +39,7 @@
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
+#include <sys/dsl_scan.h>
/*
* Virtual device management.
@@ -486,6 +487,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_ms_shift);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
&vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ &vd->vdev_removing);
}
if (parent && !parent->vdev_parent) {
@@ -860,7 +863,12 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
if (txg == 0)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
- if (oldc == 0)
+ /*
+ * If the vdev is being removed we don't activate
+ * the metaslabs since we want to ensure that no new
+ * allocations are performed on this device.
+ */
+ if (oldc == 0 && !vd->vdev_removing)
metaslab_group_activate(vd->vdev_mg);
if (txg == 0)
@@ -1648,9 +1656,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
return;
if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
mutex_enter(&vd->vdev_dtl_lock);
if (scrub_txg != 0 &&
- (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
+ (spa->spa_scrub_started ||
+ (scn && scn->scn_phys.scn_errors == 0))) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
@@ -2029,7 +2040,10 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
- if (vd->vdev_removing)
+ /*
+ * Remove the metadata associated with this vdev once it's empty.
+ */
+ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
vdev_remove(vd, txg);
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
@@ -2403,7 +2417,7 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing);
+ !vd->vdev_cant_write && !vd->vdev_ishole);
}
boolean_t
@@ -2433,7 +2447,6 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
@@ -2455,7 +2468,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
- vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ cvs->vs_scan_removing = cvd->vdev_removing;
mutex_exit(&vd->vdev_stat_lock);
}
}
@@ -2472,6 +2485,19 @@ vdev_clear_stats(vdev_t *vd)
}
void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
spa_t *spa = zio->io_spa;
@@ -2515,8 +2541,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
- if (flags & ZIO_FLAG_SCRUB_THREAD)
- vs->vs_scrub_repaired += psize;
+ if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ dsl_scan_phys_t *scn_phys =
+ &spa->spa_dsl_pool->dp_scan->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ /* XXX cleanup? */
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
@@ -2602,35 +2637,6 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
}
}
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
- vdev_stat_t *vs = &vd->vdev_stat;
-
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
- mutex_enter(&vd->vdev_stat_lock);
-
- if (type == POOL_SCRUB_NONE) {
- /*
- * Update completion and end time. Leave everything else alone
- * so we can report what happened during the previous scrub.
- */
- vs->vs_scrub_complete = complete;
- vs->vs_scrub_end = gethrestime_sec();
- } else {
- vs->vs_scrub_type = type;
- vs->vs_scrub_complete = 0;
- vs->vs_scrub_examined = 0;
- vs->vs_scrub_repaired = 0;
- vs->vs_scrub_start = gethrestime_sec();
- vs->vs_scrub_end = 0;
- }
-
- mutex_exit(&vd->vdev_stat_lock);
-}
-
/*
* Update the in-core space usage stats for this vdev, its metaslab class,
* and the root vdev.
@@ -2730,7 +2736,7 @@ vdev_config_dirty(vdev_t *vd)
* sketchy, but it will work.
*/
nvlist_free(aux[c]);
- aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
return;
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index d11b3df7c6..75ec545345 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -141,6 +140,7 @@
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/zio.h>
+#include <sys/dsl_scan.h>
#include <sys/fs/zfs.h>
/*
@@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
*/
nvlist_t *
vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- boolean_t isspare, boolean_t isl2cache)
+ vdev_config_flag_t flags)
{
nvlist_t *nv = NULL;
@@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
vd->vdev_ops->vdev_op_type) == 0);
- if (!isspare && !isl2cache)
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
== 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
@@ -270,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_isspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
- if (!isspare && !isl2cache && vd == vd->vdev_top) {
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+ vd == vd->vdev_top) {
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
vd->vdev_ms_array) == 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
@@ -281,6 +282,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_asize) == 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
vd->vdev_islog) == 0);
+ if (vd->vdev_removing)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing) == 0);
}
if (vd->vdev_dtl_smo.smo_object != 0)
@@ -293,28 +297,52 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (getstats) {
vdev_stat_t vs;
+ pool_scan_stat_t ps;
+
vdev_get_stats(vd, &vs);
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+
+ /* provide either current or previous scan information */
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ VERIFY(nvlist_add_uint64_array(nv,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t))
+ == 0);
+ }
}
if (!vd->vdev_ops->vdev_op_leaf) {
nvlist_t **child;
- int c;
+ int c, idx;
ASSERT(!vd->vdev_ishole);
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
- for (c = 0; c < vd->vdev_children; c++)
- child[c] = vdev_config_generate(spa, vd->vdev_child[c],
- getstats, isspare, isl2cache);
+ for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ /*
+ * If we're generating an nvlist of removing
+ * vdevs then skip over any device which is
+ * not being removed.
+ */
+ if ((flags & VDEV_CONFIG_REMOVING) &&
+ !cvd->vdev_removing)
+ continue;
- VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, vd->vdev_children) == 0);
+ child[idx++] = vdev_config_generate(spa, cvd,
+ getstats, flags);
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ }
- for (c = 0; c < vd->vdev_children; c++)
+ for (c = 0; c < idx; c++)
nvlist_free(child[c]);
kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
@@ -375,12 +403,11 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config)
{
vdev_t *rvd = spa->spa_root_vdev;
uint64_t *array;
- uint_t idx;
+ uint_t c, idx;
array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
- idx = 0;
- for (int c = 0; c < rvd->vdev_children; c++) {
+ for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
if (tvd->vdev_ishole)
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 30415c8abb..4b0f5602c1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -1604,6 +1603,7 @@ vdev_raidz_io_start(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
+
/*
* Report a checksum error for a child of a RAID-Z device.
*/
diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c
index 6be63ef6b0..df5bd46739 100644
--- a/usr/src/uts/common/fs/zfs/zap.c
+++ b/usr/src/uts/common/fs/zfs/zap.c
@@ -978,6 +978,56 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
}
+/*
+ * For every attribute found in ZAP object 'fromobj', add an attribute of the
+ * same name to ZAP object 'intoobj' whose value is the single uint64_t
+ * 'value' supplied by the caller.
+ *
+ * Returns EINVAL if a source attribute is not exactly one 8-byte integer,
+ * or the error from zap_add() if an add fails; the walk stops at the first
+ * such failure and nothing added before it is undone here.  Otherwise
+ * returns 0.  The walk also ends when zap_cursor_retrieve() returns
+ * non-zero; that value is not propagated.
+ *
+ * All exits funnel through zap_cursor_fini() so the cursor is torn down
+ * even when the walk is abandoned early.
+ */
int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    uint64_t value, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int err = 0;
+
+	for (zap_cursor_init(&zc, os, fromobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    (void) zap_cursor_advance(&zc)) {
+		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+			err = EINVAL;
+			break;
+		}
+		err = zap_add(os, intoobj, za.za_name,
+		    8, 1, &value, tx);
+		if (err)
+			break;
+	}
+	/* Reached on success and on failure alike. */
+	zap_cursor_fini(&zc);
+	return (err);
+}
+
+/*
+ * For every attribute in ZAP object 'fromobj', add its value to the value
+ * stored under the same name in ZAP object 'intoobj'.  The running total
+ * starts from zero, so a name for which the lookup in 'intoobj' reports
+ * ENOENT is written with the source value.
+ *
+ * Returns EINVAL if a source attribute is not exactly one 8-byte integer,
+ * or the error from zap_lookup() (other than ENOENT) or zap_update(); the
+ * walk stops at the first such failure and earlier updates are not undone
+ * here.  Otherwise returns 0.  The walk also ends when
+ * zap_cursor_retrieve() returns non-zero; that value is not propagated.
+ *
+ * All exits funnel through zap_cursor_fini() so the cursor is torn down
+ * even when the walk is abandoned early.
+ */
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int err = 0;
+
+	for (zap_cursor_init(&zc, os, fromobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    (void) zap_cursor_advance(&zc)) {
+		uint64_t delta = 0;
+
+		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+			err = EINVAL;
+			break;
+		}
+
+		/* ENOENT just means there is nothing to add to yet. */
+		err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
+		if (err != 0 && err != ENOENT)
+			break;
+		delta += za.za_first_integer;
+		err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
+		if (err)
+			break;
+	}
+	/* Reached on success and on failure alike. */
+	zap_cursor_fini(&zc);
+	return (err);
+}
+
+int
zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
{
char name[20];
@@ -1005,17 +1055,34 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
}
int
-zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
- dmu_tx_t *tx)
+zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
{
char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx)
+{
uint64_t value = 0;
int err;
if (delta == 0)
return (0);
- (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
err = zap_lookup(os, obj, name, 8, 1, &value);
if (err != 0 && err != ENOENT)
return (err);
@@ -1027,6 +1094,15 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
return (err);
}
+/*
+ * Integer-keyed front end to zap_increment(): 'key' is formatted with
+ * "%llx" and the resulting string is used as the attribute name.  The
+ * return value is whatever zap_increment() returns.
+ */
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx)
+{
+	char keyname[20];
+
+	(void) snprintf(keyname, sizeof (keyname), "%llx", (longlong_t)key);
+	return (zap_increment(os, obj, keyname, delta, tx));
+}
/*
* Routines for iterating over the attributes.
@@ -1101,7 +1177,6 @@ again:
return (err);
}
-
static void
zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
{
diff --git a/usr/src/uts/common/fs/zfs/zap_leaf.c b/usr/src/uts/common/fs/zfs/zap_leaf.c
index 285d9c5674..19a795db82 100644
--- a/usr/src/uts/common/fs/zfs/zap_leaf.c
+++ b/usr/src/uts/common/fs/zfs/zap_leaf.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -37,6 +36,7 @@
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
+#include <sys/arc.h>
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
@@ -538,14 +538,6 @@ zap_entry_update(zap_entry_handle_t *zeh,
if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
return (EAGAIN);
- /*
- * We should search other chained leaves (via
- * zap_entry_remove,create?) otherwise returning EAGAIN will
- * just send us into an infinite loop if we have to chain
- * another leaf block, rather than being able to split this
- * block.
- */
-
zap_leaf_array_free(l, &le->le_value_chunk);
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
index 2de5812fa2..3fc92ff122 100644
--- a/usr/src/uts/common/fs/zfs/zap_micro.c
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -31,6 +31,7 @@
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/avl.h>
+#include <sys/arc.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
@@ -254,26 +255,26 @@ mze_compare(const void *arg1, const void *arg2)
return (+1);
if (mze1->mze_hash < mze2->mze_hash)
return (-1);
- if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
+ if (mze1->mze_cd > mze2->mze_cd)
return (+1);
- if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
+ if (mze1->mze_cd < mze2->mze_cd)
return (-1);
return (0);
}
static void
-mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
{
mzap_ent_t *mze;
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT(mzep->mze_cd < zap_maxcd(zap));
mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
mze->mze_chunkid = chunkid;
mze->mze_hash = hash;
- mze->mze_phys = *mzep;
+ mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+ ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
avl_add(&zap->zap_m.zap_avl, mze);
}
@@ -289,14 +290,15 @@ mze_find(zap_name_t *zn)
ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
mze_tofind.mze_hash = zn->zn_hash;
- mze_tofind.mze_phys.mze_cd = 0;
+ mze_tofind.mze_cd = 0;
again:
mze = avl_find(avl, &mze_tofind, &idx);
if (mze == NULL)
mze = avl_nearest(avl, idx, AVL_AFTER);
for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
- if (zap_match(zn, mze->mze_phys.mze_name))
+ ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+ if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
return (mze);
}
if (zn->zn_matchtype == MT_BEST) {
@@ -319,12 +321,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
mze_tofind.mze_hash = hash;
- mze_tofind.mze_phys.mze_cd = 0;
+ mze_tofind.mze_cd = 0;
cd = 0;
for (mze = avl_find(avl, &mze_tofind, &idx);
mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
- if (mze->mze_phys.mze_cd != cd)
+ if (mze->mze_cd != cd)
break;
cd++;
}
@@ -408,7 +410,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
zap->zap_m.zap_num_entries++;
zn = zap_name_alloc(zap, mze->mze_name,
MT_EXACT);
- mze_insert(zap, i, zn->zn_hash, mze);
+ mze_insert(zap, i, zn->zn_hash);
zap_name_free(zn);
}
}
@@ -727,11 +729,11 @@ again:
other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
if (zn == NULL) {
- zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
+ zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
MT_FIRST);
allocdzn = B_TRUE;
}
- if (zap_match(zn, other->mze_phys.mze_name)) {
+ if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
if (allocdzn)
zap_name_free(zn);
return (B_TRUE);
@@ -793,9 +795,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
} else if (integer_size != 8) {
err = EINVAL;
} else {
- *(uint64_t *)buf = mze->mze_phys.mze_value;
+ *(uint64_t *)buf =
+ MZE_PHYS(zap, mze)->mze_value;
(void) strlcpy(realname,
- mze->mze_phys.mze_name, rn_len);
+ MZE_PHYS(zap, mze)->mze_name, rn_len);
if (ncp) {
*ncp = mzap_normalization_conflict(zap,
zn, mze);
@@ -932,7 +935,7 @@ again:
if (zap->zap_m.zap_alloc_next ==
zap->zap_m.zap_num_chunks)
zap->zap_m.zap_alloc_next = 0;
- mze_insert(zap, i, zn->zn_hash, mze);
+ mze_insert(zap, i, zn->zn_hash);
return;
}
}
@@ -1017,10 +1020,20 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
{
zap_t *zap;
mzap_ent_t *mze;
+ uint64_t oldval;
const uint64_t *intval = val;
zap_name_t *zn;
int err;
+#ifdef ZFS_DEBUG
+ /*
+ * If there is an old value, it shouldn't change across the
+ * lockdir (eg, due to bprewrite's xlation).
+ */
+ if (integer_size == 8 && num_integers == 1)
+ (void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
+#endif
+
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
if (err)
return (err);
@@ -1044,9 +1057,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
} else {
mze = mze_find(zn);
if (mze != NULL) {
- mze->mze_phys.mze_value = *intval;
- zap->zap_m.zap_phys->mz_chunk
- [mze->mze_chunkid].mze_value = *intval;
+ ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
+ MZE_PHYS(zap, mze)->mze_value = *intval;
} else {
mzap_addent(zn, *intval);
}
@@ -1245,7 +1257,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
err = ENOENT;
mze_tofind.mze_hash = zc->zc_hash;
- mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+ mze_tofind.mze_cd = zc->zc_cd;
mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
if (mze == NULL) {
@@ -1253,18 +1265,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
idx, AVL_AFTER);
}
if (mze) {
- ASSERT(0 == bcmp(&mze->mze_phys,
- &zc->zc_zap->zap_m.zap_phys->mz_chunk
- [mze->mze_chunkid], sizeof (mze->mze_phys)));
-
+ mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+ ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
za->za_normalization_conflict =
mzap_normalization_conflict(zc->zc_zap, NULL, mze);
za->za_integer_length = 8;
za->za_num_integers = 1;
- za->za_first_integer = mze->mze_phys.mze_value;
- (void) strcpy(za->za_name, mze->mze_phys.mze_name);
+ za->za_first_integer = mzep->mze_value;
+ (void) strcpy(za->za_name, mzep->mze_name);
zc->zc_hash = mze->mze_hash;
- zc->zc_cd = mze->mze_phys.mze_cd;
+ zc->zc_cd = mze->mze_cd;
err = 0;
} else {
zc->zc_hash = -1ULL;
@@ -1313,7 +1323,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
goto out;
}
zc->zc_hash = mze->mze_hash;
- zc->zc_cd = mze->mze_phys.mze_cd;
+ zc->zc_cd = mze->mze_cd;
}
out:
diff --git a/usr/src/uts/common/fs/zfs/zfs_debug.c b/usr/src/uts/common/fs/zfs/zfs_debug.c
new file mode 100644
index 0000000000..d0f411a993
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_debug.c
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+/* List of zfs_dbgmsg_t entries; zfs_dbgmsg() appends at the tail. */
+list_t zfs_dbgmsgs;
+/* Running total of the sizes charged for the entries on zfs_dbgmsgs. */
+int zfs_dbgmsg_size;
+/* Held by zfs_dbgmsg() while it changes the list and zfs_dbgmsg_size. */
+kmutex_t zfs_dbgmsgs_lock;
+/* zfs_dbgmsg() drops entries from the head while the total exceeds this. */
+int zfs_dbgmsg_maxsize = 1 << 20;	/* 1MB */
+
+/*
+ * Initialise the debug message log: the mutex taken by zfs_dbgmsg() and the
+ * list of zfs_dbgmsg_t entries, which are linked through zdm_node.
+ */
+void
+zfs_dbgmsg_init(void)
+{
+	mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+	    offsetof(zfs_dbgmsg_t, zdm_node));
+}
+
+/*
+ * Tear down the debug message log: free every queued entry, destroy the
+ * mutex, and check that the size accounting has returned to zero.  The list
+ * is drained without taking zfs_dbgmsgs_lock.
+ */
+void
+zfs_dbgmsg_fini(void)
+{
+	for (;;) {
+		zfs_dbgmsg_t *entry = list_remove_head(&zfs_dbgmsgs);
+		int entry_size;
+
+		if (entry == NULL)
+			break;
+
+		/* Same size formula that zfs_dbgmsg() charged on insert. */
+		entry_size = sizeof (zfs_dbgmsg_t) + strlen(entry->zdm_msg);
+		zfs_dbgmsg_size -= entry_size;
+		kmem_free(entry, entry_size);
+	}
+	mutex_destroy(&zfs_dbgmsgs_lock);
+	ASSERT3U(zfs_dbgmsg_size, ==, 0);
+}
+
+/*
+ * Format a printf-style message, timestamp it, and append it to the
+ * in-memory debug message log.  When the log grows past zfs_dbgmsg_maxsize
+ * the oldest entries are discarded from the head until it fits again.
+ *
+ * Print these messages by running:
+ *	echo ::zfs_dbgmsg | mdb -k
+ *
+ * Monitor these messages by running:
+ *	dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ */
+void
+zfs_dbgmsg(const char *fmt, ...)
+{
+	va_list ap;
+	int msglen;
+	zfs_dbgmsg_t *entry;
+
+	/* First pass only measures the formatted length. */
+	va_start(ap, fmt);
+	msglen = vsnprintf(NULL, 0, fmt, ap);
+	va_end(ap);
+
+	/*
+	 * There is one byte of string in sizeof (zfs_dbgmsg_t), used
+	 * for the terminating null.
+	 */
+	entry = kmem_alloc(sizeof (zfs_dbgmsg_t) + msglen, KM_SLEEP);
+	entry->zdm_timestamp = gethrestime_sec();
+
+	/* Second pass writes the text into the new entry. */
+	va_start(ap, fmt);
+	(void) vsnprintf(entry->zdm_msg, msglen + 1, fmt, ap);
+	va_end(ap);
+
+	DTRACE_PROBE1(zfs__dbgmsg, char *, entry->zdm_msg);
+
+	mutex_enter(&zfs_dbgmsgs_lock);
+	list_insert_tail(&zfs_dbgmsgs, entry);
+	zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + msglen;
+	while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
+		zfs_dbgmsg_t *oldest = list_remove_head(&zfs_dbgmsgs);
+		int oldsize = sizeof (zfs_dbgmsg_t) + strlen(oldest->zdm_msg);
+
+		kmem_free(oldest, oldsize);
+		zfs_dbgmsg_size -= oldsize;
+	}
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index d280125152..de5fb1e4ce 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -62,6 +61,7 @@
#include <sys/zfs_ctldir.h>
#include <sys/zfs_dir.h>
#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
#include <sharefs/share.h>
#include <sys/dmu_objset.h>
@@ -117,7 +117,7 @@ void
__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
{
const char *newfile;
- char buf[256];
+ char buf[512];
va_list adx;
/*
@@ -1237,8 +1237,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
return (error);
}
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_cookie scan func (pool_scan_func_t)
+ */
static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
@@ -1246,7 +1251,10 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc)
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
- error = spa_scrub(spa, zc->zc_cookie);
+ if (zc->zc_cookie == POOL_SCAN_NONE)
+ error = spa_scan_stop(spa);
+ else
+ error = spa_scan(spa, zc->zc_cookie);
spa_close(spa, FTAG);
@@ -1402,6 +1410,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
return (error);
}
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_nvlist_conf nvlist of devices to remove
+ * zc_cookie to stop the remove?
+ */
static int
zfs_ioc_vdev_remove(zfs_cmd_t *zc)
{
@@ -4250,7 +4264,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
B_FALSE },
{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
B_FALSE },
- { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+ { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE,
B_TRUE },
{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
B_FALSE },
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index da9163c963..f68dde85f8 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -560,34 +560,6 @@ unregister:
}
-static void
-uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
- int64_t delta, dmu_tx_t *tx)
-{
- uint64_t used = 0;
- char buf[32];
- int err;
- uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
-
- if (delta == 0)
- return;
-
- (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
- err = zap_lookup(os, obj, buf, 8, 1, &used);
-
- ASSERT(err == 0 || err == ENOENT);
- /* no underflow/overflow */
- ASSERT(delta > 0 || used >= -delta);
- ASSERT(delta < 0 || used + delta > used);
- used += delta;
- if (used == 0)
- err = zap_remove(os, obj, buf, tx);
- else
- err = zap_update(os, obj, buf, 8, 1, &used, tx);
- ASSERT(err == 0);
-
-}
-
static int
zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
uint64_t *userp, uint64_t *groupp)
@@ -2239,9 +2211,8 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
sa_register_update_callback(os, zfs_sa_upgrade);
}
- spa_history_internal_log(LOG_DS_UPGRADE,
- dmu_objset_spa(os), tx, CRED(),
- "oldver=%llu newver=%llu dataset = %llu",
+ spa_history_log_internal(LOG_DS_UPGRADE,
+ dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
zfsvfs->z_version, newvers, dmu_objset_id(os));
dmu_tx_commit(tx);
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index e4c435341f..4aa4d10b07 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -36,6 +36,7 @@
#include <sys/dsl_dataset.h>
#include <sys/vdev.h>
#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
@@ -179,7 +180,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
- error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 4e481b16b7..181d07fbdb 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -661,6 +660,9 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
{
zio_t *zio;
+ dprintf_bp(bp, "freeing in txg %llu, pass %u",
+ (longlong_t)txg, spa->spa_sync_pass);
+
ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
@@ -2073,6 +2075,8 @@ zio_ddt_write(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
+ddt_entry_t *freedde; /* for debugging */
+
static int
zio_ddt_free(zio_t *zio)
{
@@ -2086,7 +2090,7 @@ zio_ddt_free(zio_t *zio)
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ddt_enter(ddt);
- dde = ddt_lookup(ddt, bp, B_TRUE);
+ freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
ddp = ddt_phys_select(dde, bp);
ddt_phys_decref(ddp);
ddt_exit(ddt);
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 3481fb0586..7b4577a9f3 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -249,7 +249,7 @@ struct maparg {
/*ARGSUSED*/
static int
-zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct maparg *ma = arg;
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index a5d9122246..54dd8abec7 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -333,14 +333,15 @@ typedef enum {
#define SPA_VERSION_22 22ULL
#define SPA_VERSION_23 23ULL
#define SPA_VERSION_24 24ULL
+#define SPA_VERSION_25 25ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_24
-#define SPA_VERSION_STRING "24"
+#define SPA_VERSION SPA_VERSION_25
+#define SPA_VERSION_STRING "25"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -386,6 +387,7 @@ typedef enum {
#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
#define SPA_VERSION_SA SPA_VERSION_24
+#define SPA_VERSION_SCAN SPA_VERSION_25
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -450,7 +452,8 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_ASHIFT "ashift"
#define ZPOOL_CONFIG_ASIZE "asize"
#define ZPOOL_CONFIG_DTL "DTL"
-#define ZPOOL_CONFIG_STATS "stats"
+#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */
+#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
@@ -473,6 +476,7 @@ typedef struct zpool_rewind_policy {
#define ZPOOL_CONFIG_ORIG_GUID "orig_guid"
#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
+#define ZPOOL_CONFIG_REMOVING "removing"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
@@ -580,14 +584,14 @@ typedef enum pool_state {
} pool_state_t;
/*
- * Scrub types.
+ * Scan Functions.
*/
-typedef enum pool_scrub_type {
- POOL_SCRUB_NONE,
- POOL_SCRUB_RESILVER,
- POOL_SCRUB_EVERYTHING,
- POOL_SCRUB_TYPES
-} pool_scrub_type_t;
+typedef enum pool_scan_func {
+ POOL_SCAN_NONE,
+ POOL_SCAN_SCRUB,
+ POOL_SCAN_RESILVER,
+ POOL_SCAN_FUNCS
+} pool_scan_func_t;
/*
* ZIO types. Needed to interpret vdev statistics below.
@@ -603,6 +607,36 @@ typedef enum zio_type {
} zio_type_t;
/*
+ * Pool statistics. Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct pool_scan_stat {
+ /* values stored on disk */
+ uint64_t pss_func; /* pool_scan_func_t */
+ uint64_t pss_state; /* dsl_scan_state_t */
+ uint64_t pss_start_time; /* scan start time */
+ uint64_t pss_end_time; /* scan end time */
+ uint64_t pss_to_examine; /* total bytes to scan */
+ uint64_t pss_examined; /* total examined bytes */
+ uint64_t pss_to_process; /* total bytes to process */
+ uint64_t pss_processed; /* total processed bytes */
+ uint64_t pss_errors; /* scan errors */
+
+ /* values not stored on disk */
+ uint64_t pss_pass_exam; /* examined bytes per scan pass */
+ uint64_t pss_pass_start; /* start time of a scan pass */
+} pool_scan_stat_t;
+
+typedef enum dsl_scan_state {
+ DSS_NONE,
+ DSS_SCANNING,
+ DSS_FINISHED,
+ DSS_CANCELED,
+ DSS_NUM_STATES
+} dsl_scan_state_t;
+
+
+/*
* Vdev statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
*/
@@ -620,13 +654,8 @@ typedef struct vdev_stat {
uint64_t vs_write_errors; /* write errors */
uint64_t vs_checksum_errors; /* checksum errors */
uint64_t vs_self_healed; /* self-healed bytes */
- uint64_t vs_scrub_type; /* pool_scrub_type_t */
- uint64_t vs_scrub_complete; /* completed? */
- uint64_t vs_scrub_examined; /* bytes examined; top */
- uint64_t vs_scrub_repaired; /* bytes repaired; leaf */
- uint64_t vs_scrub_errors; /* errors during scrub */
- uint64_t vs_scrub_start; /* UTC scrub start time */
- uint64_t vs_scrub_end; /* UTC scrub end time */
+ uint64_t vs_scan_removing; /* removing? */
+ uint64_t vs_scan_processed; /* scan processed bytes */
} vdev_stat_t;
/*
@@ -682,7 +711,7 @@ typedef enum zfs_ioc {
ZFS_IOC_POOL_CONFIGS,
ZFS_IOC_POOL_STATS,
ZFS_IOC_POOL_TRYIMPORT,
- ZFS_IOC_POOL_SCRUB,
+ ZFS_IOC_POOL_SCAN,
ZFS_IOC_POOL_FREEZE,
ZFS_IOC_POOL_UPGRADE,
ZFS_IOC_POOL_GET_HISTORY,
@@ -820,7 +849,7 @@ typedef enum history_internal_events {
LOG_POOL_VDEV_OFFLINE,
LOG_POOL_UPGRADE,
LOG_POOL_CLEAR,
- LOG_POOL_SCRUB,
+ LOG_POOL_SCAN,
LOG_POOL_PROPSET,
LOG_DS_CREATE,
LOG_DS_CLONE,
@@ -843,7 +872,7 @@ typedef enum history_internal_events {
LOG_DS_UPGRADE,
LOG_DS_REFQUOTA,
LOG_DS_REFRESERV,
- LOG_POOL_SCRUB_DONE,
+ LOG_POOL_SCAN_DONE,
LOG_DS_USER_HOLD,
LOG_DS_USER_RELEASE,
LOG_POOL_SPLIT,