diff options
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/fs.d/nfs/svc/nfs-server | 8 | ||||
-rw-r--r-- | usr/src/cmd/zpool/zpool_main.c | 8 | ||||
-rw-r--r-- | usr/src/common/zfs/zfs_prop.c | 4 | ||||
-rw-r--r-- | usr/src/lib/libdiskmgt/common/inuse_fs.c | 17 | ||||
-rw-r--r-- | usr/src/lib/libzfs/common/libzfs_dataset.c | 5 | ||||
-rw-r--r-- | usr/src/lib/libzfs/common/libzfs_import.c | 16 | ||||
-rw-r--r-- | usr/src/lib/libzfs/common/libzfs_pool.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa.c | 188 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c | 101 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h | 9 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa_impl.h | 5 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev.h | 13 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 34 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_label.c | 265 | ||||
-rw-r--r-- | usr/src/uts/common/sys/fs/zfs.h | 3 |
15 files changed, 461 insertions, 219 deletions
diff --git a/usr/src/cmd/fs.d/nfs/svc/nfs-server b/usr/src/cmd/fs.d/nfs/svc/nfs-server index fc5e243d42..f80ec1c0d2 100644 --- a/usr/src/cmd/fs.d/nfs/svc/nfs-server +++ b/usr/src/cmd/fs.d/nfs/svc/nfs-server @@ -20,7 +20,7 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #pragma ident "%Z%%M% %I% %E% SMI" @@ -56,6 +56,12 @@ case "$1" in startnfsd=1 fi + # If auto-enable behavior is disabled, always start nfsd + + if [ `svcprop -p application/auto_enable nfs/server` = "false" ]; then + startnfsd=1 + fi + # When the system comes up umask is not set; so set the mode now [ -f /etc/dfs/sharetab ] && /usr/bin/chmod 644 /etc/dfs/sharetab diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index f0acccfb39..7a91e9c94c 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1743,6 +1743,12 @@ zpool_do_iostat(int argc, char **argv) if (verbose) (void) printf("\n"); + /* + * Flush the output so that redirection to a file isn't buffered + * indefinitely. + */ + (void) fflush(stdout); + if (interval == 0) break; diff --git a/usr/src/common/zfs/zfs_prop.c b/usr/src/common/zfs/zfs_prop.c index 96c2c046f9..b06f11990a 100644 --- a/usr/src/common/zfs/zfs_prop.c +++ b/usr/src/common/zfs/zfs_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -167,6 +167,8 @@ static prop_desc_t zfs_prop_table[ZFS_NPROP_ALL] = { ZFS_TYPE_ANY, NULL, "NAME", B_FALSE }, { "iscsioptions", prop_type_string, 0, NULL, prop_inherit, ZFS_TYPE_VOLUME, NULL, "ISCSIOPTIONS", B_FALSE }, + { "numclones", prop_type_number, 0, NULL, prop_readonly, + ZFS_TYPE_SNAPSHOT, NULL, NULL, B_FALSE }, }; zfs_proptype_t diff --git a/usr/src/lib/libdiskmgt/common/inuse_fs.c b/usr/src/lib/libdiskmgt/common/inuse_fs.c index 1dafdc05e5..e3f25b2156 100644 --- a/usr/src/lib/libdiskmgt/common/inuse_fs.c +++ b/usr/src/lib/libdiskmgt/common/inuse_fs.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -207,6 +206,16 @@ load_heuristics() continue; } + /* + * Skip checking for ZFS filesystems. We know that + * inuse_zpool() will have already been called, which does a + * better job of checking anyway. More importantly, an unused + * hot spare will still claim to have a ZFS filesystem because + * it doesn't do the same level of checks. 
+ */ + if (strcmp(dp->d_name, "zfs") == 0) + continue; + (void) snprintf(path, sizeof (path), "/usr/lib/fs/%s", dp->d_name); diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c index c850f7621c..95fa504a07 100644 --- a/usr/src/lib/libzfs/common/libzfs_dataset.c +++ b/usr/src/lib/libzfs/common/libzfs_dataset.c @@ -1416,6 +1416,10 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zfs_source_t *src, *val = (zhp->zfs_mntopts != NULL); break; + case ZFS_PROP_NUMCLONES: + *val = zhp->zfs_dmustats.dds_num_clones; + break; + default: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "cannot get non-numeric property")); @@ -1503,6 +1507,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, case ZFS_PROP_USED: case ZFS_PROP_VOLSIZE: case ZFS_PROP_VOLBLOCKSIZE: + case ZFS_PROP_NUMCLONES: /* * Basic numeric values are built on top of * get_numeric_property(). diff --git a/usr/src/lib/libzfs/common/libzfs_import.c b/usr/src/lib/libzfs/common/libzfs_import.c index d8b8af1f92..0bc9245304 100644 --- a/usr/src/lib/libzfs/common/libzfs_import.c +++ b/usr/src/lib/libzfs/common/libzfs_import.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -898,7 +898,7 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, uint64_t guid, vdev_guid; zpool_handle_t *zhp; nvlist_t *pool_config; - uint64_t stateval; + uint64_t stateval, isspare; spare_cbdata_t cb = { 0 }; boolean_t isactive; @@ -961,6 +961,18 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, ret = B_FALSE; } + /* + * If this is an active spare within another pool, we + * treat it like an unused hot spare. This allows the + * user to create a pool with a hot spare that is currently + * in use within another pool. 
Since we return B_TRUE, + * libdiskmgt will continue to prevent generic consumers + * from using the device. + */ + if (ret && nvlist_lookup_uint64(config, + ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare) + stateval = POOL_STATE_SPARE; + if (zhp != NULL) zpool_close(zhp); } else { diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c index 28418c281a..87e8105e98 100644 --- a/usr/src/lib/libzfs/common/libzfs_pool.c +++ b/usr/src/lib/libzfs/common/libzfs_pool.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1182,7 +1182,7 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) if (!avail_spare) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "only hot spares can be removed")); + "only inactive hot spares can be removed")); return (zfs_error(hdl, EZFS_NODEVICE, msg)); } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index e058bfd02c..f009b5602d 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -301,14 +301,22 @@ spa_load_spares(spa_t *spa) nvlist_t **spares; uint_t nspares; int i; + vdev_t *vd, *tvd; /* * First, close and free any existing spare vdevs. 
*/ for (i = 0; i < spa->spa_nspares; i++) { - vdev_close(spa->spa_spares[i]); - vdev_free(spa->spa_spares[i]); + vd = spa->spa_spares[i]; + + /* Undo the call to spa_activate() below */ + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && + tvd->vdev_isspare) + spa_spare_remove(tvd); + vdev_close(vd); + vdev_free(vd); } + if (spa->spa_spares) kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); @@ -326,18 +334,42 @@ spa_load_spares(spa_t *spa) /* * Construct the array of vdevs, opening them to get status in the - * process. + * process. For each spare, there are potentially two different vdev_t + * structures associated with it: one in the list of spares (used only + * for basic validation purposes) and one in the active vdev + * configuration (if it's spared in). During this phase we open and + * validate each vdev on the spare list. If the vdev also exists in the + * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_nspares; i++) { - vdev_t *vd; - VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares[i] = vd; + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { + if (!tvd->vdev_isspare) + spa_spare_add(tvd); + + /* + * We only mark the spare active if we were successfully + * able to load the vdev. Otherwise, importing a pool + * with a bad active spare would result in strange + * behavior, because multiple pools would think the spare + * is actively in use. + * + * There is a vulnerability here to an equally bizarre + * circumstance, where a dead active spare is later + * brought back to life (onlined or otherwise). Given + * the rarity of this scenario, and the extra complexity + * it adds, we ignore the possibility. 
+ */ + if (!vdev_is_dead(tvd)) + spa_spare_activate(tvd); + } + if (vdev_open(vd) != 0) continue; @@ -867,6 +899,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config) uint64_t guid; vdev_stat_t *vs; uint_t vsc; + uint64_t pool; if (spa->spa_nspares == 0) return; @@ -889,7 +922,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config) for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); - if (spa_spare_inuse(guid)) { + if (spa_spare_exists(guid, &pool) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); @@ -943,7 +976,9 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) /* * Validate that the 'spares' array is well formed. We must have an array of - * nvlists, each which describes a valid leaf vdev. + * nvlists, each which describes a valid leaf vdev. If this is an import (mode + * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long + * as they are well-formed. */ static int spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) @@ -970,34 +1005,45 @@ spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) if (spa_version(spa) < ZFS_VERSION_SPARES) return (ENOTSUP); + /* + * Set the pending spare list so we correctly handle device in-use + * checking. 
+ */ + spa->spa_pending_spares = spares; + spa->spa_pending_nspares = nspares; + for (i = 0; i < nspares; i++) { if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, mode)) != 0) - return (error); + goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); - return (EINVAL); - } - - if ((error = vdev_open(vd)) != 0) { - vdev_free(vd); - return (error); + error = EINVAL; + goto out; } vd->vdev_top = vd; - if ((error = vdev_label_spare(vd, crtxg)) != 0) { - vdev_free(vd); - return (error); - } - VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); + if ((error = vdev_open(vd)) == 0 && + (error = vdev_label_init(vd, crtxg, + VDEV_LABEL_SPARE)) == 0) { + VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } vdev_free(vd); + + if (error && mode != VDEV_ALLOC_SPARE) + goto out; + else + error = 0; } - return (0); +out: + spa->spa_pending_spares = NULL; + spa->spa_pending_nspares = 0; + return (error); } /* @@ -1455,33 +1501,47 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); - if ((error = spa_validate_spares(spa, nvroot, txg, - VDEV_ALLOC_ADD)) != 0) - return (spa_vdev_exit(spa, vd, txg, error)); + spa->spa_pending_vdev = vd; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; - if (vd->vdev_children == 0 && nspares == 0) + if (vd->vdev_children == 0 && nspares == 0) { + spa->spa_pending_vdev = NULL; return (spa_vdev_exit(spa, vd, txg, EINVAL)); + } if (vd->vdev_children != 0) { - if ((error = vdev_create(vd, txg, B_FALSE)) != 0) + if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { + spa->spa_pending_vdev = NULL; return (spa_vdev_exit(spa, vd, txg, error)); - - /* - * Transfer each new top-level vdev from vd to rvd. 
- */ - for (c = 0; c < vd->vdev_children; c++) { - tvd = vd->vdev_child[c]; - vdev_remove_child(vd, tvd); - tvd->vdev_id = rvd->vdev_children; - vdev_add_child(rvd, tvd); - vdev_config_dirty(tvd); } } + /* + * We must validate the spares after checking the children. Otherwise, + * vdev_inuse() will blindly overwrite the spare. + */ + if ((error = spa_validate_spares(spa, nvroot, txg, + VDEV_ALLOC_ADD)) != 0) { + spa->spa_pending_vdev = NULL; + return (spa_vdev_exit(spa, vd, txg, error)); + } + + spa->spa_pending_vdev = NULL; + + /* + * Transfer each new top-level vdev from vd to rvd. + */ + for (c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + vdev_remove_child(vd, tvd); + tvd->vdev_id = rvd->vdev_children; + vdev_add_child(rvd, tvd); + vdev_config_dirty(tvd); + } + if (nspares != 0) { if (spa->spa_sparelist != NULL) { nvlist_t **oldspares; @@ -1613,10 +1673,16 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we - * want to create a replacing vdev. + * want to create a replacing vdev. The user is not allowed to + * attach to a spared vdev child unless the 'isspare' state is + * the same (spare replaces spare, non-spare replaces + * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); else if (pvd->vdev_ops != &vdev_spare_ops && newvd->vdev_isspare) pvops = &vdev_spare_ops; @@ -1695,7 +1761,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) open_txg - TXG_INITIAL + 1); mutex_exit(&newvd->vdev_dtl_lock); - dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); + if (newvd->vdev_isspare) + spa_spare_activate(newvd); /* * Mark newvd's DTL dirty in this txg. 
@@ -1818,9 +1885,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * it may be that the unwritability of the disk is the reason * it's being detached! */ - error = vdev_label_init(vd, 0, B_FALSE); - if (error) - dprintf("unable to erase labels on %s\n", vdev_description(vd)); + error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. @@ -1841,8 +1906,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) */ if (unspare) { ASSERT(cvd->vdev_isspare); - spa_spare_remove(cvd->vdev_guid); - cvd->vdev_isspare = B_FALSE; + spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; } @@ -1861,39 +1925,37 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) ASSERT(tvd->vdev_parent == rvd); /* - * Reopen this top-level vdev to reassess health after detach. + * Reevaluate the parent vdev state. */ - vdev_reopen(tvd); + vdev_propagate_state(cvd->vdev_parent); /* - * If the device we just detached was smaller than the others, - * it may be possible to add metaslabs (i.e. grow the pool). - * vdev_metaslab_init() can't fail because the existing metaslabs - * are already in core, so there's nothing to read from disk. + * If the device we just detached was smaller than the others, it may be + * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() + * can't fail because the existing metaslabs are already in core, so + * there's nothing to read from disk. */ VERIFY(vdev_metaslab_init(tvd, txg) == 0); vdev_config_dirty(tvd); /* - * Mark vd's DTL as dirty in this txg. - * vdev_dtl_sync() will see that vd->vdev_detached is set - * and free vd's DTL object in syncing context. - * But first make sure we're not on any *other* txg's DTL list, - * to prevent vd from being accessed after it's freed. + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. 
+ * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. */ for (t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); - dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); - error = spa_vdev_exit(spa, vd, txg, 0); /* - * If we are supposed to remove the given vdev from the list of spares, - * iterate over all pools in the system and replace it if it's present. + * If this was the removal of the original device in a hot spare vdev, + * then we want to go through and remove the device from the hot spare + * list of every other pool. */ if (unspare) { spa = NULL; @@ -3021,10 +3083,18 @@ boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; + uint64_t spareguid; for (i = 0; i < spa->spa_nspares; i++) if (spa->spa_spares[i]->vdev_guid == guid) return (B_TRUE); + for (i = 0; i < spa->spa_pending_nspares; i++) { + if (nvlist_lookup_uint64(spa->spa_pending_spares[i], + ZPOOL_CONFIG_GUID, &spareguid) == 0 && + spareguid == guid) + return (B_TRUE); + } + return (B_FALSE); } diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index de5be3092f..3e80ebf985 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -175,8 +175,8 @@ static kcondvar_t spa_namespace_cv; static int spa_active_count; int spa_max_replication_override = SPA_DVAS_PER_BP; -static avl_tree_t spa_spare_avl; static kmutex_t spa_spare_lock; +static avl_tree_t spa_spare_avl; kmem_cache_t *spa_buffer_pool; int spa_mode; @@ -355,13 +355,30 @@ spa_refcount_zero(spa_t *spa) */ /* - * We track spare information on a global basis. 
This allows us to do two - * things: determine when a spare is no longer referenced by any active pool, - * and (quickly) determine if a spare is currently in use in another pool on the - * system. + * Spares are tracked globally due to the following constraints: + * + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within + * another pool. + * - A spare in use in any pool can only be the source of a replacement if + * the target is a spare in the same pool. + * + * We keep track of all spares on the system through the use of a reference + * counted AVL tree. When a vdev is added as a spare, or used as a replacement + * spare, then we bump the reference count in the AVL tree. In addition, we set + * the 'vdev_isspare' member to indicate that the device is a spare (active or + * inactive). When a spare is made active (used to replace a device in the + * pool), we also keep track of which pool it's been made a part of. + * + * The 'spa_spare_lock' protects the AVL tree. These functions are normally + * called under the spa_namespace lock as part of vdev reconfiguration. The + * separate spare lock exists for the status query path, which does not need to + * be completely consistent with respect to other vdev configuration changes. 
*/ + typedef struct spa_spare { uint64_t spare_guid; + uint64_t spare_pool; avl_node_t spare_avl; int spare_count; } spa_spare_t; @@ -381,29 +398,31 @@ spa_spare_compare(const void *a, const void *b) } void -spa_spare_add(uint64_t guid) +spa_spare_add(vdev_t *vd) { avl_index_t where; spa_spare_t search; spa_spare_t *spare; mutex_enter(&spa_spare_lock); + ASSERT(!vd->vdev_isspare); - search.spare_guid = guid; + search.spare_guid = vd->vdev_guid; if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) { spare->spare_count++; } else { - spare = kmem_alloc(sizeof (spa_spare_t), KM_SLEEP); - spare->spare_guid = guid; + spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP); + spare->spare_guid = vd->vdev_guid; spare->spare_count = 1; avl_insert(&spa_spare_avl, spare, where); } + vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void -spa_spare_remove(uint64_t guid) +spa_spare_remove(vdev_t *vd) { spa_spare_t search; spa_spare_t *spare; @@ -411,34 +430,62 @@ spa_spare_remove(uint64_t guid) mutex_enter(&spa_spare_lock); - search.spare_guid = guid; + search.spare_guid = vd->vdev_guid; spare = avl_find(&spa_spare_avl, &search, &where); + ASSERT(vd->vdev_isspare); ASSERT(spare != NULL); if (--spare->spare_count == 0) { avl_remove(&spa_spare_avl, spare); kmem_free(spare, sizeof (spa_spare_t)); + } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) { + spare->spare_pool = 0ULL; } + vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t -spa_spare_inuse(uint64_t guid) +spa_spare_exists(uint64_t guid, uint64_t *pool) { - spa_spare_t search; + spa_spare_t search, *found; avl_index_t where; - boolean_t ret; mutex_enter(&spa_spare_lock); search.spare_guid = guid; - ret = (avl_find(&spa_spare_avl, &search, &where) != NULL); + found = avl_find(&spa_spare_avl, &search, &where); + + if (pool) { + if (found) + *pool = found->spare_pool; + else + *pool = 0ULL; + } mutex_exit(&spa_spare_lock); - return (ret); + return (found != NULL); +} + +void 
+spa_spare_activate(vdev_t *vd) +{ + spa_spare_t search, *found; + avl_index_t where; + + mutex_enter(&spa_spare_lock); + ASSERT(vd->vdev_isspare); + + search.spare_guid = vd->vdev_guid; + found = avl_find(&spa_spare_avl, &search, &where); + ASSERT(found != NULL); + ASSERT(found->spare_pool == 0ULL); + + found->spare_pool = spa_guid(vd->vdev_spa); + mutex_exit(&spa_spare_lock); } /* @@ -680,9 +727,23 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) continue; if (spa->spa_root_vdev == NULL) continue; - if (spa_guid(spa) == pool_guid && (device_guid == 0 || - vdev_lookup_by_guid(spa->spa_root_vdev, device_guid))) - break; + if (spa_guid(spa) == pool_guid) { + if (device_guid == 0) + break; + + if (vdev_lookup_by_guid(spa->spa_root_vdev, + device_guid) != NULL) + break; + + /* + * Check any devices we may be in the process of adding. + */ + if (spa->spa_pending_vdev) { + if (vdev_lookup_by_guid(spa->spa_pending_vdev, + device_guid) != NULL) + break; + } + } } return (spa != NULL); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 8715b23846..7a55c9e292 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -344,9 +344,10 @@ extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); /* spare state (which is global across all pools) */ -extern void spa_spare_add(uint64_t guid); -extern void spa_spare_remove(uint64_t guid); -extern boolean_t spa_spare_inuse(uint64_t guid); +extern void spa_spare_add(vdev_t *vd); +extern void spa_spare_remove(vdev_t *vd); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool); +extern void spa_spare_activate(vdev_t *vd); /* scrubbing */ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 285c8cc7c4..9c75c8872d 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -138,6 +138,9 @@ struct spa { uint64_t spa_deflate; /* should we deflate? */ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ + vdev_t *spa_pending_vdev; /* pending vdev additions */ + nvlist_t **spa_pending_spares; /* pending spare additions */ + uint_t spa_pending_nspares; /* # pending spares */ /* * spa_refcnt must be the last element because it changes size based on * compilation options. In order for the MDB module to function diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index ae8d157d1a..3120811625 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -115,8 +115,15 @@ struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); extern nvlist_t *vdev_label_read_config(vdev_t *vd); extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); -int vdev_label_init(vdev_t *vd, uint64_t create_txg, boolean_t isreplacing); -int vdev_label_spare(vdev_t *vd, uint64_t create_txg); + +typedef enum { + VDEV_LABEL_CREATE, /* create/add a new device */ + VDEV_LABEL_REPLACE, /* replace an existing device */ + VDEV_LABEL_SPARE, /* add a new hot spare */ + VDEV_LABEL_REMOVE /* remove an existing device */ +} vdev_labeltype_t; + +extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 007833e95e..0e96289ef4 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -328,7 +328,7 @@ vdev_free_common(vdev_t *vd) spa_strfree(vd->vdev_devid); if (vd->vdev_isspare) - spa_spare_remove(vd->vdev_guid); + spa_spare_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -456,15 +456,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* - * Look for the 'is_spare' flag. If this is the case, then we are a - * repurposed hot spare. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, - &vd->vdev_isspare); - if (vd->vdev_isspare) - spa_spare_add(vd->vdev_guid); - - /* * If we're a top-level vdev, try to load the allocation parameters. 
*/ if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { @@ -1019,6 +1010,22 @@ vdev_reopen(vdev_t *vd) (void) vdev_open(vd); /* + * Call vdev_validate() here to make sure we have the same device. + * Otherwise, a device with an invalid label could be successfully + * opened in response to vdev_reopen(). + * + * The downside to this is that if the user is simply experimenting by + * overwriting an entire disk, we'll fault the device rather than + * demonstrate self-healing capabilities. On the other hand, with + * proper FMA integration, the series of errors we'd see from the device + * would result in a faulted device anyway. Given that this doesn't + * model any real-world corruption, it's better to catch this here and + * correctly identify that the device has either changed beneath us, or + * is corrupted beyond recognition. + */ + (void) vdev_validate(vd); + + /* * Reassess root vdev's health. */ vdev_propagate_state(spa->spa_root_vdev); @@ -1044,7 +1051,8 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) /* * Recursively initialize all labels. */ - if ((error = vdev_label_init(vd, txg, isreplacing)) != 0) { + if ((error = vdev_label_init(vd, txg, isreplacing ? + VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } @@ -1325,6 +1333,8 @@ vdev_validate_spare(vdev_t *vd) return (-1); } + spa_spare_add(vd); + /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 335b3e5a36..8d8cb6f7af 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -334,9 +334,110 @@ vdev_label_read_config(vdev_t *vd) return (config); } -static int -vdev_label_common(vdev_t *vd, uint64_t crtxg, boolean_t isspare, - boolean_t isreplacing) +/* + * Determine if a device is in use. The 'spare_guid' parameter will be filled + * in with the device guid if this spare is active elsewhere on the system. + */ +static boolean_t +vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, + uint64_t *spare_guid) +{ + spa_t *spa = vd->vdev_spa; + uint64_t state, pool_guid, device_guid, txg, spare_pool; + uint64_t vdtxg = 0; + nvlist_t *label; + + if (spare_guid) + *spare_guid = 0ULL; + + /* + * Read the label, if any, and perform some basic sanity checks. + */ + if ((label = vdev_label_read_config(vd)) == NULL) + return (B_FALSE); + + (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + &vdtxg); + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, + &device_guid) != 0) { + nvlist_free(label); + return (B_FALSE); + } + + if (state != POOL_STATE_SPARE && + (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0)) { + nvlist_free(label); + return (B_FALSE); + } + + nvlist_free(label); + + /* + * Check to see if this device indeed belongs to the pool it claims to + * be a part of. The only way this is allowed is if the device is a hot + * spare (which we check for later on). + */ + if (state != POOL_STATE_SPARE && + !spa_guid_exists(pool_guid, device_guid) && + !spa_spare_exists(device_guid, NULL)) + return (B_FALSE); + + /* + * If the transaction group is zero, then this is an initialized (but + * unused) label. This is only an error if the create transaction + * on-disk is the same as the one we're using now, in which case the + * user has attempted to add the same vdev multiple times in the same + * transaction. 
+ */ + if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg) + return (B_TRUE); + + /* + * Check to see if this is a spare device. We do an explicit check for + * spa_has_spare() here because it may be on our pending list of spares + * to add. + */ + if (spa_spare_exists(device_guid, &spare_pool) || + spa_has_spare(spa, device_guid)) { + if (spare_guid) + *spare_guid = device_guid; + + switch (reason) { + case VDEV_LABEL_CREATE: + return (B_TRUE); + + case VDEV_LABEL_REPLACE: + return (!spa_has_spare(spa, device_guid) || + spare_pool != 0ULL); + + case VDEV_LABEL_SPARE: + return (spa_has_spare(spa, device_guid)); + } + } + + /* + * If the device is marked ACTIVE, then this device is in use by another + * pool on the system. + */ + return (state == POOL_STATE_ACTIVE); +} + +/* + * Initialize a vdev label. We check to make sure each leaf device is not in + * use, and writable. We put down an initial label which we will later + * overwrite with a complete label. Note that it's important to do this + * sequentially, not in parallel, so that we catch cases of multiple use of the + * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with + * itself. + */ +int +vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) { spa_t *spa = vd->vdev_spa; nvlist_t *label; @@ -348,108 +449,62 @@ vdev_label_common(vdev_t *vd, uint64_t crtxg, boolean_t isspare, char *buf; size_t buflen; int error; + uint64_t spare_guid; ASSERT(spa_config_held(spa, RW_WRITER)); for (c = 0; c < vd->vdev_children; c++) - if ((error = vdev_label_common(vd->vdev_child[c], - crtxg, isspare, isreplacing)) != 0) + if ((error = vdev_label_init(vd->vdev_child[c], + crtxg, reason)) != 0) return (error); if (!vd->vdev_ops->vdev_op_leaf) return (0); /* - * Make sure each leaf device is writable, and zero its initial content. - * Along the way, also make sure that no leaf is already in use. 
- * Note that it's important to do this sequentially, not in parallel, - * so that we catch cases of multiple use of the same leaf vdev in - * the vdev we're creating -- e.g. mirroring a disk with itself. + * Dead vdevs cannot be initialized. */ if (vdev_is_dead(vd)) return (EIO); /* - * Check whether this device is already in use. - * Ignore the check if crtxg == 0, which we use for device removal. + * Determine if the vdev is in use. */ - if (crtxg != 0 && - (label = vdev_label_read_config(vd)) != NULL) { - uint64_t state, pool_guid, device_guid, txg, spare; - uint64_t mycrtxg = 0; - - (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, - &mycrtxg); - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) == 0 && state == POOL_STATE_ACTIVE && - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, - &pool_guid) == 0 && - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &device_guid) == 0 && - spa_guid_exists(pool_guid, device_guid) && - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, - &txg) == 0 && (txg != 0 || mycrtxg == crtxg)) { - if (isspare && pool_guid != spa_guid(spa) && - nvlist_lookup_uint64(label, - ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && - !spa_has_spare(spa, device_guid)) { - /* - * If this is a request to add a spare that - * is actively in use in another pool, simply - * return success, after updating the guid. 
- */ - vdev_t *pvd = vd->vdev_parent; - - for (; pvd != NULL; pvd = pvd->vdev_parent) { - pvd->vdev_guid_sum -= vd->vdev_guid; - pvd->vdev_guid_sum += device_guid; - } - - vd->vdev_guid = vd->vdev_guid_sum = device_guid; - nvlist_free(label); - return (0); - } - nvlist_free(label); - return (EBUSY); + if (reason != VDEV_LABEL_REMOVE && + vdev_inuse(vd, crtxg, reason, &spare_guid)) + return (EBUSY); + + ASSERT(reason != VDEV_LABEL_REMOVE || + vdev_inuse(vd, crtxg, reason, NULL)); + + /* + * If this is a request to add or replace a spare that is in use + * elsewhere on the system, then we must update the guid (which was + * initialized to a random value) to reflect the actual GUID (which is + * shared between multiple pools). + */ + if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) { + vdev_t *pvd = vd->vdev_parent; + + for (; pvd != NULL; pvd = pvd->vdev_parent) { + pvd->vdev_guid_sum -= vd->vdev_guid; + pvd->vdev_guid_sum += spare_guid; } + vd->vdev_guid = vd->vdev_guid_sum = spare_guid; + /* - * If this device is reserved as a hot spare for this pool, - * adopt its GUID, and mark it as such. This way we preserve - * the fact that it is a hot spare even as it is added and - * removed from the pool. + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding a spare, then it's already + * labelled appropriately and we can just return. 
*/ - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) == 0 && state == POOL_STATE_SPARE && - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &device_guid) == 0) { - vdev_t *pvd = vd->vdev_parent; - - if ((isspare || !isreplacing) && - spa_has_spare(spa, device_guid)) { - nvlist_free(label); - return (EBUSY); - } - - for (; pvd != NULL; pvd = pvd->vdev_parent) { - pvd->vdev_guid_sum -= vd->vdev_guid; - pvd->vdev_guid_sum += device_guid; - } - - vd->vdev_guid = vd->vdev_guid_sum = device_guid; - - if (!isspare) { - vd->vdev_isspare = B_TRUE; - spa_spare_add(vd->vdev_guid); - } - } - - nvlist_free(label); + if (reason == VDEV_LABEL_SPARE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); } /* - * The device isn't in use, so initialize its label. + * Initialize its label. */ vp = zio_buf_alloc(sizeof (vdev_phys_t)); bzero(vp, sizeof (vdev_phys_t)); @@ -459,16 +514,16 @@ vdev_label_common(vdev_t *vd, uint64_t crtxg, boolean_t isspare, * We mark it as being from txg 0 to indicate that it's not * really part of an active pool just yet. The labels will * be written again with a meaningful txg by spa_sync(). - * - * For hot spares, we generate a special label that identifies as a - * mutually shared hot spare. If this is being added as a hot spare, - * always write out the spare label. If this was a hot spare, then - * always label it as such. If we are adding the vdev, it will remain - * labelled in this state until it's really added to the config. If we - * are removing the vdev or destroying the pool, then it goes back to - * its original hot spare state. */ - if (isspare || vd->vdev_isspare) { + if (reason == VDEV_LABEL_SPARE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + /* + * For inactive hot spares, we generate a special label that + * identifies as a mutually shared hot spare. We write the + * label if we are adding a hot spare, or if we are removing an + * active hot spare (in which case we want to revert the + * labels). 
+ */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, @@ -546,23 +601,17 @@ vdev_label_common(vdev_t *vd, uint64_t crtxg, boolean_t isspare, zio_buf_free(vb, sizeof (vdev_boot_header_t)); zio_buf_free(vp, sizeof (vdev_phys_t)); - return (error); -} - -int -vdev_label_init(vdev_t *vd, uint64_t crtxg, boolean_t isreplacing) -{ - return (vdev_label_common(vd, crtxg, B_FALSE, isreplacing)); -} + /* + * If this vdev hasn't been previously identified as a spare, then we + * mark it as such only if a) we are labelling it as a spare, or b) it + * exists as a spare elsewhere in the system. + */ + if (error == 0 && !vd->vdev_isspare && + (reason == VDEV_LABEL_SPARE || + spa_spare_exists(vd->vdev_guid, NULL))) + spa_spare_add(vd); -/* - * Label a disk as a hot spare. A hot spare label is a special label with only - * the following members: version, pool_state, and guid. - */ -int -vdev_label_spare(vdev_t *vd, uint64_t crtxg) -{ - return (vdev_label_common(vd, crtxg, B_TRUE, B_FALSE)); + return (error); } /* diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 88f55c25f1..fe5ae0fbf1 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -94,6 +94,7 @@ typedef enum { ZFS_PROP_CREATETXG, ZFS_PROP_NAME, ZFS_PROP_ISCSIOPTIONS, + ZFS_PROP_NUMCLONES, ZFS_NPROP_ALL } zfs_prop_t; |