Diffstat (limited to 'usr/src/uts')
 usr/src/uts/common/fs/zfs/metaslab.c          |  50
 usr/src/uts/common/fs/zfs/spa.c               | 217
 usr/src/uts/common/fs/zfs/spa_config.c        |   7
 usr/src/uts/common/fs/zfs/spa_misc.c          |  49
 usr/src/uts/common/fs/zfs/sys/metaslab.h      |   4
 usr/src/uts/common/fs/zfs/sys/metaslab_impl.h |   1
 usr/src/uts/common/fs/zfs/sys/spa.h           |   3
 usr/src/uts/common/fs/zfs/sys/vdev.h          |   1
 usr/src/uts/common/fs/zfs/sys/vdev_impl.h     |   8
 usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h     |   1
 usr/src/uts/common/fs/zfs/sys/zio.h           |   1
 usr/src/uts/common/fs/zfs/vdev.c              | 135
 usr/src/uts/common/fs/zfs/vdev_label.c        |  46
 usr/src/uts/common/fs/zfs/vdev_missing.c      |  17
 usr/src/uts/common/fs/zfs/zio_inject.c        |  39
 usr/src/uts/common/sys/fs/zfs.h               |  10
 16 files changed, 505 insertions(+), 84 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 77556ac5d7..3ebde10240 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -57,12 +57,13 @@ int metaslab_df_free_pct = 30;
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+ mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
@@ -126,6 +127,32 @@ metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
mg->mg_class = NULL;
}
+int
+metaslab_class_validate(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+ vdev_t *vd;
+
+ /*
+ * Must hold one of the spa_config locks.
+ */
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
+
+ if ((mg = mc->mc_rotor) == NULL)
+ return (0);
+
+ do {
+ vd = mg->mg_vd;
+ ASSERT(vd->vdev_mg != NULL);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(mg->mg_class, ==, mc);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+ } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+ return (0);
+}
+
/*
* ==========================================================================
* Metaslab groups
@@ -634,6 +661,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
dmu_tx_t *tx;
int t;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
/*
@@ -721,6 +750,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
vdev_t *vd = mg->mg_vd;
int t;
+ ASSERT(!vd->vdev_ishole);
+
mutex_enter(&msp->ms_lock);
/*
@@ -932,10 +963,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (flags & METASLAB_HINTBP_AVOID)
- mg = vd->vdev_mg->mg_next;
- else
+
+ /*
+ * It's possible the vdev we're using as the hint no
+ * longer exists (i.e. removed). Consult the rotor when
+ * all else fails.
+ */
+ if (vd != NULL && vd->vdev_mg != NULL) {
mg = vd->vdev_mg;
+
+ if (flags & METASLAB_HINTBP_AVOID &&
+ mg->mg_next != NULL)
+ mg = mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
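
The new metaslab_class_validate() must be called with a spa_config lock held, per the assertion above. A minimal sketch of a caller under that assumption — the helper itself is hypothetical, while spa_normal_class() and spa_log_class() are the existing accessors from spa.h:

    /*
     * Hypothetical helper: validate both allocation classes while
     * holding SCL_ALL as writer, mirroring spa_vdev_config_exit().
     */
    static void
    spa_check_classes(spa_t *spa)
    {
            spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
            ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
            ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
            spa_config_exit(spa, SCL_ALL, FTAG);
    }
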
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 53e1ac0f4a..f503592396 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -42,6 +42,7 @@
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
@@ -578,8 +579,8 @@ spa_activate(spa_t *spa, int mode)
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
- spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
+ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
for (int t = 0; t < ZIO_TYPES; t++) {
const zio_taskq_info_t *ztip = &zio_taskqs[t];
@@ -1101,26 +1102,23 @@ spa_check_removed(vdev_t *vd)
* that the label does not contain the most up-to-date information.
*/
void
-spa_load_log_state(spa_t *spa)
+spa_load_log_state(spa_t *spa, nvlist_t *nv)
{
- nvlist_t *nv, *nvroot, **child;
- uint64_t is_log;
- uint_t children;
- vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *ovd, *rvd = spa->spa_root_vdev;
- VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
- VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0);
-
- for (int c = 0; c < children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
+ /*
+ * Load the original root vdev tree from the passed config.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
- if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
- &is_log) == 0 && is_log)
- vdev_load_log_state(tvd, child[c]);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ if (cvd->vdev_islog)
+ vdev_load_log_state(cvd, ovd->vdev_child[c]);
}
- nvlist_free(nv);
+ vdev_free(ovd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
@@ -1151,7 +1149,7 @@ static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
int error = 0;
- nvlist_t *nvroot = NULL;
+ nvlist_t *nvconfig, *nvroot = NULL;
vdev_t *rvd;
uberblock_t *ub = &spa->spa_uberblock;
uint64_t config_cache_txg = spa->spa_config_txg;
@@ -1306,23 +1304,22 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
goto out;
}
+ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
if (!mosconfig) {
- nvlist_t *newconfig;
uint64_t hostid;
- if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
char *hostname;
unsigned long myhostid = 0;
- VERIFY(nvlist_lookup_string(newconfig,
+ VERIFY(nvlist_lookup_string(nvconfig,
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
#ifdef _KERNEL
@@ -1347,12 +1344,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
}
}
- spa_config_set(spa, newconfig);
+ spa_config_set(spa, nvconfig);
spa_unload(spa);
spa_deactivate(spa);
spa_activate(spa, orig_mode);
- return (spa_load(spa, newconfig, state, B_TRUE));
+ return (spa_load(spa, nvconfig, state, B_TRUE));
}
if (zap_lookup(spa->spa_meta_objset,
@@ -1471,7 +1468,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa_config_exit(spa, SCL_ALL, FTAG);
}
- spa_load_log_state(spa);
+ VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ spa_load_log_state(spa, nvroot);
+ nvlist_free(nvconfig);
if (spa_check_logs(spa)) {
vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -2910,7 +2910,7 @@ spa_reset(char *pool)
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
- uint64_t txg;
+ uint64_t txg, id;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
@@ -2951,9 +2951,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* Transfer each new top-level vdev from vd to rvd.
*/
for (int c = 0; c < vd->vdev_children; c++) {
+
+ /*
+ * Set the vdev id to the first hole, if one exists.
+ */
+ for (id = 0; id < rvd->vdev_children; id++) {
+ if (rvd->vdev_child[id]->vdev_ishole) {
+ vdev_free(rvd->vdev_child[id]);
+ break;
+ }
+ }
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
- tvd->vdev_id = rvd->vdev_children;
+ tvd->vdev_id = id;
vdev_add_child(rvd, tvd);
vdev_config_dirty(tvd);
}
@@ -3136,6 +3146,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
vdev_remove_child(newrootvd, newvd);
newvd->vdev_id = pvd->vdev_children;
+ newvd->vdev_crtxg = oldvd->vdev_crtxg;
vdev_add_child(pvd, newvd);
tvd = newvd->vdev_top;
@@ -3444,16 +3455,127 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
}
/*
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result, we use
+ * the spa_vdev_config_[enter/exit] functions, which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+
+/*
+ * Initial phase of device removal - stop future allocations from this device.
+ */
+void
+spa_vdev_remove_start(spa_t *spa, vdev_t *vd)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * Remove our vdev from the allocatable vdevs
+ */
+ if (mg)
+ metaslab_class_remove(mg->mg_class, mg);
+}
+
+/*
+ * Evacuate the device.
+ */
+int
+spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
+{
+ uint64_t txg;
+ int error;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as writer
+ * since we need to do I/O, but we do keep the
+ * spa_namespace_lock held. Once this completes, the device
+ * should no longer have any blocks allocated on it.
+ */
+ if (vd->vdev_islog) {
+ /*
+ * Evacuate the device.
+ */
+ if (error = dmu_objset_find(spa_name(spa),
+ zil_vdev_offline, NULL, DS_FIND_CHILDREN)) {
+ uint64_t txg;
+
+ txg = spa_vdev_config_enter(spa);
+ metaslab_class_add(spa->spa_log_class,
+ vd->vdev_mg);
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ /*
+ * Remove any remaining MOS metadata associated with the device.
+ */
+ txg = spa_vdev_config_enter(spa);
+ vd->vdev_removing = B_TRUE;
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_config_dirty(vd);
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ return (0);
+}
+
+/*
+ * Complete the removal by cleaning up the namespace.
+ */
+void
+spa_vdev_remove_done(spa_t *spa, vdev_t *vd)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ uint64_t id = vd->vdev_id;
+ boolean_t last_vdev = (id == (rvd->vdev_children - 1));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+ vdev_free(vd);
+
+ /*
+ * It's possible that another thread is trying to do a spa_vdev_add()
+ * at the same time we're trying to remove it. As a result the
+ * added vdev may not have initialized its metaslabs yet.
+ */
+ if (mg != NULL)
+ metaslab_group_destroy(mg);
+
+ if (last_vdev) {
+ vdev_compact_children(rvd);
+ } else {
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ }
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
* Remove a device from the pool. Currently, this supports removing only hot
- * spares and level 2 ARC devices.
+ * spares, slogs, and level 2 ARC devices.
*/
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
vdev_t *vd;
nvlist_t **spares, **l2cache, *nv;
- uint_t nspares, nl2cache;
uint64_t txg = 0;
+ uint_t nspares, nl2cache;
int error = 0;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
@@ -3489,6 +3611,29 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
spa_load_l2cache(spa);
spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+
+ /*
+ * XXX - Once we have bp-rewrite this should
+ * become the common case.
+ */
+
+ /*
+ * 1. Stop allocations
+ * 2. Evacuate the device (i.e. kill off stubby and
+ * metadata) and wait for it to complete (i.e. sync).
+ * 3. Cleanup the vdev namespace.
+ */
+ spa_vdev_remove_start(spa, vd);
+
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0)
+ return (error);
+ txg = spa_vdev_config_enter(spa);
+
+ spa_vdev_remove_done(spa, vd);
+
} else if (vd != NULL) {
/*
* Normal vdevs cannot be removed (yet).
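
Taken together, log-device removal is a three-phase protocol: phases 1 and 3 hold SCL_ALL as writer, while the evacuation phase drops the config lock (but never spa_namespace_lock) so it can issue I/O. A condensed sketch of the sequence, with the error paths simplified relative to the code above:

    uint64_t txg;
    int error;

    txg = spa_vdev_enter(spa);                      /* namespace lock + SCL_ALL */

    spa_vdev_remove_start(spa, vd);                 /* 1. stop new allocations */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);  /* sync config, keep namespace lock */

    error = spa_vdev_remove_evacuate(spa, vd);      /* 2. evacuate; does I/O */

    txg = spa_vdev_config_enter(spa);               /* re-grab SCL_ALL */
    if (error == 0)
            spa_vdev_remove_done(spa, vd);          /* 3. clean up the namespace */

    return (spa_vdev_exit(spa, NULL, txg, error));  /* final sync, drop both locks */
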
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index b2063bba13..d611e0aa9b 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -383,6 +383,13 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
vd = vd->vdev_top; /* label contains top config */
}
+ /*
+ * Add the top-level config. We even add this on pools which
+ * don't support holes in the namespace as older pools will
+ * just ignore it.
+ */
+ vdev_top_config_generate(spa, config);
+
nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 9384db4ae9..38474c194d 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -836,6 +836,18 @@ uint64_t
spa_vdev_enter(spa_t *spa)
{
mutex_enter(&spa_namespace_lock);
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (i.e. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
@@ -843,14 +855,14 @@ spa_vdev_enter(spa_t *spa)
}
/*
- * Unlock the spa_t after adding or removing a vdev. Besides undoing the
- * locking of spa_vdev_enter(), we also want make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
*/
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
int config_changed = B_FALSE;
ASSERT(txg > spa_last_synced_txg(spa));
@@ -870,9 +882,23 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
config_changed = B_TRUE;
}
+ /*
+ * Verify the metaslab classes.
+ */
+ ASSERT(metaslab_class_validate(spa->spa_normal_class) == 0);
+ ASSERT(metaslab_class_validate(spa->spa_log_class) == 0);
+
spa_config_exit(spa, SCL_ALL, spa);
/*
+ * Panic the system if the specified tag requires it. This
+ * is useful for ensuring that configurations are updated
+ * transactionally.
+ */
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, tag);
+
+ /*
* Note: this txg_wait_synced() is important because it ensures
* that there won't be more than one config change per txg.
* This allows us to use the txg as the generation number.
@@ -892,7 +918,18 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
*/
if (config_changed)
spa_config_sync(spa, B_FALSE, B_TRUE);
+}
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
return (error);
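
The enter/exit split preserves one invariant: spa_namespace_lock stays held across the whole operation, while SCL_ALL may be taken and dropped repeatedly, syncing a txg each time. An illustrative nesting for a hypothetical two-step reconfiguration:

    uint64_t txg;

    mutex_enter(&spa_namespace_lock);               /* the part spa_vdev_enter() adds */

    txg = spa_vdev_config_enter(spa);               /* SCL_ALL as writer */
    /* ... first batch of vdev changes ... */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);  /* waits for txg to sync */

    txg = spa_vdev_config_enter(spa);               /* second config transaction */
    /* ... second batch ... */
    spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

    mutex_exit(&spa_namespace_lock);
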
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index 5d3e11c971..78a5f94952 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -57,10 +57,12 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
+extern metaslab_class_t *metaslab_class_create(spa_t *spa,
+ space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc);
extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+extern int metaslab_class_validate(metaslab_class_t *mc);
extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index d67dea7e97..bdf9559631 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -37,6 +37,7 @@ extern "C" {
#endif
struct metaslab_class {
+ spa_t *mc_spa;
metaslab_group_t *mc_rotor;
uint64_t mc_allocated;
space_map_ops_t *mc_ops;
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index bccee25da9..b4165b24c8 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -430,6 +430,9 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
/* Pool vdev add/remove lock */
extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int error, char *tag);
extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
/* Pool vdev state change lock */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index a76cecb4b2..ecf6c2fe17 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -122,6 +122,7 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 23780430df..bb2f98c33e 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -129,6 +129,7 @@ struct vdev {
boolean_t vdev_expanding; /* expand the vdev? */
int vdev_open_error; /* error on last open */
kthread_t *vdev_open_thread; /* thread opening children */
+ uint64_t vdev_crtxg; /* txg when top-level was added */
/*
* Top-level vdev state.
@@ -143,10 +144,12 @@ struct vdev {
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
+ boolean_t vdev_removing; /* device is being removed? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
+ uint64_t vdev_ishole; /* is a hole in the namespace */
/*
* Leaf vdev state.
@@ -248,6 +251,8 @@ typedef struct vdev_label {
/*
* Allocate or free a vdev
*/
+extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
+ vdev_ops_t *ops);
extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
vdev_t *parent, uint_t id, int alloctype);
extern void vdev_free(vdev_t *vd);
@@ -264,7 +269,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
-extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv);
+extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
extern void vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@@ -280,6 +285,7 @@ extern vdev_ops_t vdev_raidz_ops;
extern vdev_ops_t vdev_disk_ops;
extern vdev_ops_t vdev_file_ops;
extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_hole_ops;
extern vdev_ops_t vdev_spare_ops;
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index b7a2f57cbc..37615ba35f 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -117,6 +117,7 @@ typedef struct zinject_record {
uint64_t zi_type;
uint32_t zi_freq;
uint32_t zi_failfast;
+ char zi_func[MAXNAMELEN];
} zinject_record_t;
#define ZINJECT_NULL 0x1
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index a85a1cdfcb..305c697697 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -442,6 +442,7 @@ extern int zio_inject_fault(char *name, int flags, int *id,
extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record);
extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag);
extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 415cd4a9e9..9c8aa43425 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -54,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = {
&vdev_disk_ops,
&vdev_file_ops,
&vdev_missing_ops,
+ &vdev_hole_ops,
NULL
};
@@ -281,7 +282,7 @@ vdev_compact_children(vdev_t *pvd)
/*
* Allocate and minimally initialize a vdev_t.
*/
-static vdev_t *
+vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
vdev_t *vd;
@@ -293,7 +294,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
spa->spa_root_vdev = vd;
}
- if (guid == 0) {
+ if (guid == 0 && ops != &vdev_hole_ops) {
if (spa->spa_root_vdev == vd) {
/*
* The root vdev's guid will also be the pool guid,
@@ -318,6 +319,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_guid_sum = guid;
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -397,6 +399,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
return (ENOTSUP);
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (ENOTSUP);
+
/*
* Set the nparity property for RAID-Z vdevs.
*/
@@ -472,6 +477,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
/*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
* If we're a top-level vdev, try to load the allocation parameters.
*/
if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
@@ -705,6 +716,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
vdev_remove_child(pvd, cvd);
vdev_add_child(pvd, mvd);
@@ -772,9 +784,14 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
metaslab_t **mspp;
int error;
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
@@ -1105,6 +1122,12 @@ vdev_open(vdev_t *vd)
vd->vdev_state = VDEV_STATE_HEALTHY;
}
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1393,6 +1416,7 @@ void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
+ ASSERT(!vd->vdev_ishole);
ASSERT(ISP2(flags));
if (flags & VDD_METASLAB)
@@ -1502,7 +1526,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev)
+ if (vd == spa->spa_root_vdev || vd->vdev_ishole)
return;
if (vd->vdev_ops->vdev_op_leaf) {
@@ -1592,6 +1616,8 @@ vdev_dtl_load(vdev_t *vd)
if (smo->smo_object == 0)
return (0);
+ ASSERT(!vd->vdev_ishole);
+
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
@@ -1619,6 +1645,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
dmu_buf_t *db;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (vd->vdev_detached) {
@@ -1755,7 +1783,7 @@ vdev_load(vdev_t *vd)
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
- if (vd == vd->vdev_top &&
+ if (vd == vd->vdev_top && !vd->vdev_ishole &&
(vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
vdev_metaslab_init(vd, 0) != 0))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -1812,10 +1840,48 @@ vdev_validate_aux(vdev_t *vd)
}
void
+vdev_remove(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ if (vd->vdev_dtl_smo.smo_object) {
+ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
+ vd->vdev_dtl_smo.smo_object = 0;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp == NULL || msp->ms_smo.smo_object == 0)
+ continue;
+
+ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
+ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
+ msp->ms_smo.smo_object = 0;
+ }
+ }
+
+ if (vd->vdev_ms_array) {
+ (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
+ vd->vdev_ms_array = 0;
+ vd->vdev_ms_shift = 0;
+ }
+ dmu_tx_commit(tx);
+}
+
+void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
+ ASSERT(!vd->vdev_ishole);
+
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
}
@@ -1828,6 +1894,8 @@ vdev_sync(vdev_t *vd, uint64_t txg)
metaslab_t *msp;
dmu_tx_t *tx;
+ ASSERT(!vd->vdev_ishole);
+
if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
ASSERT(vd == vd->vdev_top);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1838,6 +1906,9 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+ if (vd->vdev_removing)
+ vdev_remove(vd, txg);
+
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
metaslab_sync(msp, txg);
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
@@ -2110,7 +2181,15 @@ vdev_clear(spa_t *spa, vdev_t *vd)
boolean_t
vdev_is_dead(vdev_t *vd)
{
- return (vd->vdev_state < VDEV_STATE_DEGRADED);
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
+ vd->vdev_ops == &vdev_missing_ops);
}
boolean_t
@@ -2139,7 +2218,7 @@ vdev_allocatable(vdev_t *vd)
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write);
+ !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing);
}
boolean_t
@@ -2391,7 +2470,7 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
* Don't count non-normal (e.g. intent log) space as part of
* the pool's capacity.
*/
- if (vd->vdev_mg->mg_class != spa->spa_normal_class)
+ if (vd->vdev_islog)
return;
mutex_enter(&rvd->vdev_stat_lock);
@@ -2472,7 +2551,8 @@ vdev_config_dirty(vdev_t *vd)
} else {
ASSERT(vd == vd->vdev_top);
- if (!list_link_active(&vd->vdev_config_dirty_node))
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ !vd->vdev_ishole)
list_insert_head(&spa->spa_config_dirty_list, vd);
}
}
@@ -2546,6 +2626,12 @@ vdev_propagate_state(vdev_t *vd)
for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c];
+ /*
+ * Don't factor holes into the decision.
+ */
+ if (child->vdev_ishole)
+ continue;
+
if (!vdev_readable(child) ||
(!vdev_writeable(child) && spa_writeable(spa))) {
/*
@@ -2739,32 +2825,31 @@ vdev_is_bootable(vdev_t *vd)
return (B_TRUE);
}
+/*
+ * Load the state from the original vdev tree (ovd) which
+ * we've retrieved from the MOS config object. If the original
+ * vdev was offline then we transfer that state to the device
+ * in the current vdev tree (nvd).
+ */
void
-vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
+vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
{
- uint_t children;
- nvlist_t **child;
- uint64_t val;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = nvd->vdev_spa;
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) == 0) {
- for (int c = 0; c < children; c++)
- vdev_load_log_state(vd->vdev_child[c], child[c]);
- }
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
- if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
- ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
+ for (int c = 0; c < nvd->vdev_children; c++)
+ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
+ if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
/*
* It would be nice to call vdev_offline()
* directly but the pool isn't fully loaded and
* the txg threads have not been started yet.
*/
- spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
- vd->vdev_offline = val;
- vdev_reopen(vd->vdev_top);
- spa_config_exit(spa, SCL_STATE_ALL, FTAG);
+ nvd->vdev_offline = ovd->vdev_offline;
+ vdev_reopen(nvd->vdev_top);
}
}
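
Because vdev_is_dead() now also covers holes and missing devices, iteration code can stay hole-unaware, as the comment in that hunk intends. An illustrative loop (rvd is assumed to be the root vdev):

    for (int c = 0; c < rvd->vdev_children; c++) {
            vdev_t *tvd = rvd->vdev_child[c];

            if (vdev_is_dead(tvd))
                    continue;       /* skips holes and missing vdevs too */

            /* ... safe to issue I/O or gather stats against tvd ... */
    }
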
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 06cb720128..87adc01622 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -287,6 +287,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
vd->vdev_dtl_smo.smo_object) == 0);
+ if (vd->vdev_crtxg)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ vd->vdev_crtxg) == 0);
+
if (getstats) {
vdev_stat_t vs;
vdev_get_stats(vd, &vs);
@@ -298,6 +302,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
nvlist_t **child;
int c;
+ ASSERT(!vd->vdev_ishole);
+
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
@@ -329,11 +335,45 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_unspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
B_TRUE) == 0);
+ if (vd->vdev_ishole)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
+ B_TRUE) == 0);
}
return (nv);
}
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs. Additionally, add the number of top-level children that currently
+ * exist.
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ idx = 0;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole)
+ array[idx++] = c;
+ }
+
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
nvlist_t *
vdev_label_read_config(vdev_t *vd)
{
@@ -516,6 +556,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
crtxg, reason)) != 0)
return (error);
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
if (!vd->vdev_ops->vdev_op_leaf)
return (0);
@@ -976,6 +1019,9 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
KM_SLEEP);
+
+ ASSERT(!vd->vdev_ishole);
+
zio_t *vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
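
A consumer of the two new config pairs can rebuild hole placeholders before walking the child array. A sketch under that assumption — the enclosing function and the loop body are hypothetical, but the nvlist accessors and pair names match the code above:

    uint64_t nchildren;
    uint64_t *holes;
    uint_t nholes;

    VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
        &nchildren) == 0);
    if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
        &holes, &nholes) == 0) {
            for (uint_t i = 0; i < nholes; i++) {
                    /* slot holes[i] is a hole; later vdev ids stay stable */
            }
    }
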
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index 731f7d3dce..e1bf7d86a3 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
* VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
* will fail the GUID sum check before ever trying to open the pool.
*/
- *psize = SPA_MINDEVSIZE;
- *ashift = SPA_MINBLOCKSHIFT;
+ *psize = 0;
+ *ashift = 0;
return (0);
}
@@ -83,3 +83,14 @@ vdev_ops_t vdev_missing_ops = {
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
+
+vdev_ops_t vdev_hole_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ VDEV_TYPE_HOLE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
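
vdev_hole_ops reuses the missing vdev's no-op callbacks; only the type string differs, and that string is what config parsing keys on. A sketch of the existing vdev_getops() logic in vdev.c, which resolves VDEV_TYPE_HOLE against the ops table shown earlier:

    static vdev_ops_t *
    vdev_getops(const char *type)
    {
            vdev_ops_t *ops, **opspp;

            for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
                    if (strcmp(ops->vdev_op_type, type) == 0)
                            break;

            return (ops);
    }
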
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
index f8e6880c90..c5ff55243a 100644
--- a/usr/src/uts/common/fs/zfs/zio_inject.c
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -96,6 +96,30 @@ zio_match_handler(zbookmark_t *zb, uint64_t type,
}
/*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (strcmp(tag, handler->zi_record.zi_func) == 0)
+ panic("Panic requested in function %s\n", tag);
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
*/
@@ -126,8 +150,9 @@ zio_handle_fault_injection(zio_t *zio, int error)
if (zio->io_spa != handler->zi_spa)
continue;
- /* Ignore device errors */
- if (handler->zi_record.zi_guid != 0)
+ /* Ignore device errors and panic injection */
+ if (handler->zi_record.zi_guid != 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
/* If this handler matches, return EIO */
@@ -170,8 +195,9 @@ zio_handle_label_injection(zio_t *zio, int error)
uint64_t start = handler->zi_record.zi_start;
uint64_t end = handler->zi_record.zi_end;
- /* Ignore device only faults */
- if (handler->zi_record.zi_start == 0)
+ /* Ignore device only faults or panic injection */
+ if (handler->zi_record.zi_start == 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
/*
@@ -205,8 +231,9 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
- /* Ignore label specific faults */
- if (handler->zi_record.zi_start != 0)
+ /* Ignore label specific faults or panic injection */
+ if (handler->zi_record.zi_start != 0 ||
+ handler->zi_record.zi_func[0] != '\0')
continue;
if (vd->vdev_guid == handler->zi_record.zi_guid) {
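
With zi_func carried in the injection record, a test can request a panic the next time a tagged config change completes; spa_vdev_config_exit() passes its caller's FTAG (the enclosing function name) as the tag. A kernel-side sketch of arming such a handler — pool_name is an assumption, and the zinject(1M) ioctl plumbing that normally reaches zio_inject_fault() is elided:

    zinject_record_t record;
    int id, error;

    bzero(&record, sizeof (record));
    /* panic when spa_vdev_remove() finishes a config update */
    (void) strlcpy(record.zi_func, "spa_vdev_remove",
        sizeof (record.zi_func));

    error = zio_inject_fault(pool_name, 0, &id, &record);
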
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index b88fb5419a..de0d67176e 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -295,14 +295,15 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_16 16ULL
#define SPA_VERSION_17 17ULL
#define SPA_VERSION_18 18ULL
+#define SPA_VERSION_19 19ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_18
-#define SPA_VERSION_STRING "18"
+#define SPA_VERSION SPA_VERSION_19
+#define SPA_VERSION_STRING "19"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -342,6 +343,7 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_STMF_PROP SPA_VERSION_16
#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
#define SPA_VERSION_USERREFS SPA_VERSION_18
+#define SPA_VERSION_HOLES SPA_VERSION_19
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -401,6 +403,9 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_L2CACHE "l2cache"
+#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
+#define ZPOOL_CONFIG_IS_HOLE "is_hole"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */
@@ -422,6 +427,7 @@ typedef enum zfs_cache_type {
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
+#define VDEV_TYPE_HOLE "hole"
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_LOG "log"
#define VDEV_TYPE_L2CACHE "l2cache"
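
SPA_VERSION_HOLES follows the usual version-gating idiom: features check the pool version before acting, as the vdev_alloc() hunk above does for hole vdevs. From userland, the equivalent capability check might look like this sketch (zhp is an assumed zpool_handle_t; zpool_get_prop_int() is the libzfs accessor):

    uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
    boolean_t supports_holes = (version >= SPA_VERSION_HOLES);
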