summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/boot/Makefile.version2
-rw-r--r--usr/src/boot/lib/libstand/zfs/zfsimpl.c660
-rw-r--r--usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h8
-rw-r--r--usr/src/boot/sys/cddl/boot/zfs/zfssubr.c6
4 files changed, 420 insertions, 256 deletions
diff --git a/usr/src/boot/Makefile.version b/usr/src/boot/Makefile.version
index 707e31612b..7b7e86bbba 100644
--- a/usr/src/boot/Makefile.version
+++ b/usr/src/boot/Makefile.version
@@ -33,4 +33,4 @@ LOADER_VERSION = 1.1
# Use date like formatting here, YYYY.MM.DD.XX, without leading zeroes.
# The version is processed from left to right, the version number can only
# be increased.
-BOOT_VERSION = $(LOADER_VERSION)-2019.12.18.2
+BOOT_VERSION = $(LOADER_VERSION)-2020.01.06.1
diff --git a/usr/src/boot/lib/libstand/zfs/zfsimpl.c b/usr/src/boot/lib/libstand/zfs/zfsimpl.c
index a85f6f5721..212b5faa52 100644
--- a/usr/src/boot/lib/libstand/zfs/zfsimpl.c
+++ b/usr/src/boot/lib/libstand/zfs/zfsimpl.c
@@ -490,12 +490,12 @@ vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
}
rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
- if (rc)
- return (rc);
- if (bp != NULL)
- return (zio_checksum_verify(vdev->spa, bp, buf));
+ if (rc == 0) {
+ if (bp != NULL)
+ rc = zio_checksum_verify(vdev->v_spa, bp, buf);
+ }
- return (0);
+ return (rc);
}
typedef struct remap_segment {
@@ -774,8 +774,10 @@ static vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
vdev_t *rvd;
+ vdev_list_t *vlist;
- STAILQ_FOREACH(rvd, &spa->spa_vdevs, v_childlink)
+ vlist = &spa->spa_root_vdev->v_children;
+ STAILQ_FOREACH(rvd, vlist, v_childlink)
if (rvd->v_id == vdev)
break;
@@ -838,7 +840,7 @@ static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
{
list_t stack;
- spa_t *spa = vd->spa;
+ spa_t *spa = vd->v_spa;
zio_t *zio = arg;
remap_segment_t *rs;
@@ -931,19 +933,20 @@ static int
vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
off_t offset, size_t bytes)
{
- zio_t zio = { 0 };
- spa_t *spa = vdev->spa;
- indirect_vsd_t *iv = malloc(sizeof (*iv));
+ zio_t zio;
+ spa_t *spa = vdev->v_spa;
+ indirect_vsd_t *iv;
indirect_split_t *first;
int rc = EIO;
+ iv = calloc(1, sizeof (*iv));
if (iv == NULL)
return (ENOMEM);
- bzero(iv, sizeof (*iv));
list_create(&iv->iv_splits,
sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+ bzero(&zio, sizeof (zio));
zio.io_spa = spa;
zio.io_bp = (blkptr_t *)bp;
zio.io_data = buf;
@@ -1082,40 +1085,73 @@ vdev_create(uint64_t guid, vdev_read_t *vdev_read)
vdev_t *vdev;
vdev_indirect_config_t *vic;
- vdev = malloc(sizeof (vdev_t));
- memset(vdev, 0, sizeof (vdev_t));
- STAILQ_INIT(&vdev->v_children);
- vdev->v_guid = guid;
- vdev->v_state = VDEV_STATE_OFFLINE;
- vdev->v_read = vdev_read;
+ vdev = calloc(1, sizeof (vdev_t));
+ if (vdev != NULL) {
+ STAILQ_INIT(&vdev->v_children);
+ vdev->v_guid = guid;
+ vdev->v_read = vdev_read;
- vic = &vdev->vdev_indirect_config;
- vic->vic_prev_indirect_vdev = UINT64_MAX;
- STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
+ /*
+ * root vdev has no read function, we use this fact to
+ * skip setting up data we do not need for root vdev.
+ * We only point root vdev from spa.
+ */
+ if (vdev_read != NULL) {
+ vic = &vdev->vdev_indirect_config;
+ vic->vic_prev_indirect_vdev = UINT64_MAX;
+ STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
+ }
+ }
return (vdev);
}
-static int
-vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
- vdev_t **vdevp, int is_newer)
+static void
+vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist)
{
- int rc;
- uint64_t guid, id, ashift, asize, nparity;
- const char *type;
- const char *path;
- vdev_t *vdev, *kid;
- const unsigned char *kids;
- int nkids, i, is_new;
uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
uint64_t is_log;
- if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
- NULL, &guid) ||
- nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) ||
+ is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
+ is_log = 0;
+ (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
+ &is_offline);
+ (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
+ &is_removed);
+ (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
+ &is_faulted);
+ (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
+ NULL, &is_degraded);
+ (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
+ NULL, &isnt_present);
+ (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
+ &is_log);
+
+ if (is_offline != 0)
+ vdev->v_state = VDEV_STATE_OFFLINE;
+ else if (is_removed != 0)
+ vdev->v_state = VDEV_STATE_REMOVED;
+ else if (is_faulted != 0)
+ vdev->v_state = VDEV_STATE_FAULTED;
+ else if (is_degraded != 0)
+ vdev->v_state = VDEV_STATE_DEGRADED;
+ else if (isnt_present != 0)
+ vdev->v_state = VDEV_STATE_CANT_OPEN;
+
+ vdev->v_islog = is_log != 0;
+}
+
+static int
+vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp)
+{
+ uint64_t id, ashift, asize, nparity;
+ const char *path;
+ const char *type;
+ vdev_t *vdev;
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) ||
nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
NULL, &type)) {
- printf("ZFS: can't find vdev details\n");
return (ENOENT);
}
@@ -1132,153 +1168,238 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
return (EIO);
}
- is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
- is_log = 0;
-
- nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
- &is_offline);
- nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
- &is_removed);
- nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
- &is_faulted);
- nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL,
- &is_degraded);
- nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL,
- &isnt_present);
- nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
- &is_log);
+ if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
+ vdev = vdev_create(guid, vdev_mirror_read);
+ else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
+ vdev = vdev_create(guid, vdev_raidz_read);
+ else if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
+ vdev = vdev_create(guid, vdev_replacing_read);
+ else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
+ vdev_indirect_config_t *vic;
- vdev = vdev_find(guid);
- if (!vdev) {
- is_new = 1;
-
- if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
- vdev = vdev_create(guid, vdev_mirror_read);
- else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
- vdev = vdev_create(guid, vdev_raidz_read);
- else if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
- vdev = vdev_create(guid, vdev_replacing_read);
- else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
- vdev_indirect_config_t *vic;
-
- vdev = vdev_create(guid, vdev_indirect_read);
+ vdev = vdev_create(guid, vdev_indirect_read);
+ if (vdev != NULL) {
vdev->v_state = VDEV_STATE_HEALTHY;
vic = &vdev->vdev_indirect_config;
nvlist_find(nvlist,
- ZPOOL_CONFIG_INDIRECT_OBJECT, DATA_TYPE_UINT64,
+ ZPOOL_CONFIG_INDIRECT_OBJECT,
+ DATA_TYPE_UINT64,
NULL, &vic->vic_mapping_object);
nvlist_find(nvlist,
- ZPOOL_CONFIG_INDIRECT_BIRTHS, DATA_TYPE_UINT64,
+ ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ DATA_TYPE_UINT64,
NULL, &vic->vic_births_object);
nvlist_find(nvlist,
- ZPOOL_CONFIG_PREV_INDIRECT_VDEV, DATA_TYPE_UINT64,
+ ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ DATA_TYPE_UINT64,
NULL, &vic->vic_prev_indirect_vdev);
- } else
- vdev = vdev_create(guid, vdev_disk_read);
-
- vdev->v_id = id;
- vdev->v_top = pvdev != NULL ? pvdev : vdev;
- if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
- DATA_TYPE_UINT64, NULL, &ashift) == 0) {
- vdev->v_ashift = ashift;
- } else {
- vdev->v_ashift = 0;
}
- if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
- DATA_TYPE_UINT64, NULL, &asize) == 0) {
- vdev->v_psize = asize +
- VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
- }
- if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
- DATA_TYPE_UINT64, NULL, &nparity) == 0) {
- vdev->v_nparity = nparity;
+ } else {
+ vdev = vdev_create(guid, vdev_disk_read);
+ }
+
+ if (vdev == NULL)
+ return (ENOMEM);
+
+ vdev_set_initial_state(vdev, nvlist);
+ vdev->v_id = id;
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
+ DATA_TYPE_UINT64, NULL, &ashift) == 0)
+ vdev->v_ashift = ashift;
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
+ DATA_TYPE_UINT64, NULL, &asize) == 0) {
+ vdev->v_psize = asize +
+ VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ }
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
+ DATA_TYPE_UINT64, NULL, &nparity) == 0)
+ vdev->v_nparity = nparity;
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
+ DATA_TYPE_STRING, NULL, &path) == 0) {
+ if (strncmp(path, "/dev/dsk/", 9) == 0)
+ path += 9;
+ vdev->v_name = strdup(path);
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_PHYS_PATH,
+ DATA_TYPE_STRING, NULL, &path) == 0) {
+ vdev->v_phys_path = strdup(path);
} else {
- vdev->v_nparity = 0;
+ vdev->v_phys_path = NULL;
}
- if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_DEVID,
DATA_TYPE_STRING, NULL, &path) == 0) {
- if (strncmp(path, "/dev/dsk/", 9) == 0)
- path += 9;
- vdev->v_name = strdup(path);
- if (nvlist_find(nvlist, ZPOOL_CONFIG_PHYS_PATH,
- DATA_TYPE_STRING, NULL, &path) == 0) {
- vdev->v_phys_path = strdup(path);
- } else {
- vdev->v_phys_path = NULL;
- }
- if (nvlist_find(nvlist, ZPOOL_CONFIG_DEVID,
- DATA_TYPE_STRING, NULL, &path) == 0) {
- vdev->v_devid = strdup(path);
- } else {
- vdev->v_devid = NULL;
- }
+ vdev->v_devid = strdup(path);
} else {
- char *name;
-
- if (strcmp(type, "raidz") == 0) {
- if (vdev->v_nparity < 1 ||
- vdev->v_nparity > 3) {
- printf("ZFS: can only boot from disk, "
- "mirror, raidz1, raidz2 and raidz3 "
- "vdevs\n");
- return (EIO);
- }
- rc = asprintf(&name, "%s%d-%" PRIu64, type,
- vdev->v_nparity, id);
- } else {
- rc = asprintf(&name, "%s-%" PRIu64, type, id);
- }
- if (rc < 0)
- return (ENOMEM);
- vdev->v_name = name;
+ vdev->v_devid = NULL;
}
- vdev->v_islog = is_log == 1;
} else {
- is_new = 0;
+ char *name;
+
+ name = NULL;
+ if (strcmp(type, "raidz") == 0) {
+ if (vdev->v_nparity < 1 ||
+ vdev->v_nparity > 3) {
+ printf("ZFS: invalid raidz parity: %d\n",
+ vdev->v_nparity);
+ return (EIO);
+ }
+ (void) asprintf(&name, "%s%d-%" PRIu64, type,
+ vdev->v_nparity, id);
+ } else {
+ (void) asprintf(&name, "%s-%" PRIu64, type, id);
+ }
+ vdev->v_name = name;
}
+ *vdevp = vdev;
+ return (0);
+}
+
+/*
+ * Find slot for vdev. We return either NULL to signal to use
+ * STAILQ_INSERT_HEAD, or we return link element to be used with
+ * STAILQ_INSERT_AFTER.
+ */
+static vdev_t *
+vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
+{
+ vdev_t *v, *previous;
+
+ if (STAILQ_EMPTY(&top_vdev->v_children))
+ return (NULL);
+
+ previous = NULL;
+ STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
+ if (v->v_id > vdev->v_id)
+ return (previous);
+
+ if (v->v_id == vdev->v_id)
+ return (v);
+
+ if (v->v_id < vdev->v_id)
+ previous = v;
+ }
+ return (previous);
+}
+
+static size_t
+vdev_child_count(vdev_t *vdev)
+{
+ vdev_t *v;
+ size_t count;
+
+ count = 0;
+ STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
+ count++;
+ }
+ return (count);
+}
+
+/*
+ * Insert vdev into top_vdev children list. List is ordered by v_id.
+ */
+static void
+vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
+{
+ vdev_t *previous;
+ size_t count;
- if (is_new || is_newer) {
+ /*
+ * The top level vdev can appear in random order, depending how
+ * the firmware is presenting the disk devices.
+ * However, we will insert vdev to create list ordered by v_id,
+ * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
+ * as STAILQ does not have insert before.
+ */
+ previous = vdev_find_previous(top_vdev, vdev);
+
+ if (previous == NULL) {
+ STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
+ } else if (previous->v_id == vdev->v_id) {
/*
- * This is either new vdev or we've already seen this vdev,
- * but from an older vdev label, so let's refresh its state
- * from the newer label.
+ * This vdev was configured from label config,
+ * do not insert duplicate.
*/
- if (is_offline)
- vdev->v_state = VDEV_STATE_OFFLINE;
- else if (is_removed)
- vdev->v_state = VDEV_STATE_REMOVED;
- else if (is_faulted)
- vdev->v_state = VDEV_STATE_FAULTED;
- else if (is_degraded)
- vdev->v_state = VDEV_STATE_DEGRADED;
- else if (isnt_present)
- vdev->v_state = VDEV_STATE_CANT_OPEN;
+ return;
+ } else {
+ STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
+ v_childlink);
+ }
+
+ count = vdev_child_count(top_vdev);
+ if (top_vdev->v_nchildren < count)
+ top_vdev->v_nchildren = count;
+}
+
+static int
+vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist)
+{
+ vdev_t *top_vdev, *vdev;
+ const unsigned char *kids;
+ int rc, nkids;
+
+ /* Get top vdev. */
+ top_vdev = vdev_find(top_guid);
+ if (top_vdev == NULL) {
+ rc = vdev_init(top_guid, nvlist, &top_vdev);
+ if (rc != 0)
+ return (rc);
+ top_vdev->v_spa = spa;
+ top_vdev->v_top = top_vdev;
+ vdev_insert(spa->spa_root_vdev, top_vdev);
}
+ /* Add children if there are any. */
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
&nkids, &kids);
- /*
- * Its ok if we don't have any kids.
- */
if (rc == 0) {
- vdev->v_nchildren = nkids;
- for (i = 0; i < nkids; i++) {
- rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
- if (rc)
+ for (int i = 0; i < nkids; i++) {
+ uint64_t guid;
+
+ rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
+ DATA_TYPE_UINT64, NULL, &guid);
+ if (rc != 0)
+ return (rc);
+ rc = vdev_init(guid, kids, &vdev);
+ if (rc != 0)
return (rc);
- if (is_new)
- STAILQ_INSERT_TAIL(&vdev->v_children, kid,
- v_childlink);
+
+ vdev->v_spa = spa;
+ vdev->v_top = top_vdev;
+ vdev_insert(top_vdev, vdev);
+
kids = nvlist_next(kids);
}
} else {
- vdev->v_nchildren = 0;
+ /*
+ * When there are no children, nvlist_find() does return
+ * error, reset it because leaf devices have no children.
+ */
+ rc = 0;
}
- if (vdevp)
- *vdevp = vdev;
- return (0);
+ return (rc);
+}
+
+static int
+vdev_init_from_label(spa_t *spa, const unsigned char *nvlist)
+{
+ uint64_t pool_guid, top_guid;
+ const unsigned char *vdevs;
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
+ NULL, &pool_guid) ||
+ nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
+ NULL, &top_guid) ||
+ nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
+ NULL, &vdevs)) {
+ printf("ZFS: can't find vdev details\n");
+ return (ENOENT);
+ }
+
+ return (vdev_from_nvlist(spa, top_guid, vdevs));
}
static void
@@ -1288,6 +1409,10 @@ vdev_set_state(vdev_t *vdev)
int good_kids;
int bad_kids;
+ STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
+ vdev_set_state(kid);
+ }
+
/*
* A mirror or raidz is healthy if all its kids are healthy. A
* mirror is degraded if any of its kids is healthy; a raidz
@@ -1322,6 +1447,104 @@ vdev_set_state(vdev_t *vdev)
}
}
+static int
+vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist)
+{
+ vdev_t *vdev;
+ const unsigned char *kids;
+ int rc, nkids;
+
+ /* Update top vdev. */
+ vdev = vdev_find(top_guid);
+ if (vdev != NULL)
+ vdev_set_initial_state(vdev, nvlist);
+
+ /* Update children if there are any. */
+ rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
+ &nkids, &kids);
+ if (rc == 0) {
+ for (int i = 0; i < nkids; i++) {
+ uint64_t guid;
+
+ rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
+ DATA_TYPE_UINT64, NULL, &guid);
+ if (rc != 0)
+ break;
+
+ vdev = vdev_find(guid);
+ if (vdev != NULL)
+ vdev_set_initial_state(vdev, kids);
+
+ kids = nvlist_next(kids);
+ }
+ } else {
+ rc = 0;
+ }
+
+ return (rc);
+}
+
+static int
+vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist)
+{
+ uint64_t pool_guid, vdev_children;
+ const unsigned char *vdevs, *kids;
+ int rc, nkids;
+
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
+ NULL, &pool_guid) ||
+ nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
+ NULL, &vdev_children) ||
+ nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
+ NULL, &vdevs)) {
+ printf("ZFS: can't find vdev details\n");
+ return (ENOENT);
+ }
+
+ /* Wrong guid?! */
+ if (spa->spa_guid != pool_guid)
+ return (EINVAL);
+
+ spa->spa_root_vdev->v_nchildren = vdev_children;
+
+ rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
+ &nkids, &kids);
+
+ /*
+ * MOS config has at least one child for root vdev.
+ */
+ if (rc != 0)
+ return (rc);
+
+ for (int i = 0; i < nkids; i++) {
+ uint64_t guid;
+ vdev_t *vdev;
+
+ rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
+ NULL, &guid);
+ if (rc != 0)
+ break;
+ vdev = vdev_find(guid);
+ /*
+ * Top level vdev is missing, create it.
+ */
+ if (vdev == NULL)
+ rc = vdev_from_nvlist(spa, guid, kids);
+ else
+ rc = vdev_update_from_nvlist(guid, kids);
+ if (rc != 0)
+ break;
+ kids = nvlist_next(kids);
+ }
+
+ /*
+ * Re-evaluate top-level vdev state.
+ */
+ vdev_set_state(spa->spa_root_vdev);
+
+ return (rc);
+}
+
static spa_t *
spa_find_by_guid(uint64_t guid)
{
@@ -1362,7 +1585,7 @@ spa_get_primary_vdev(const spa_t *spa)
spa = spa_get_primary();
if (spa == NULL)
return (NULL);
- vdev = STAILQ_FIRST(&spa->spa_vdevs);
+ vdev = spa->spa_root_vdev;
if (vdev == NULL)
return (NULL);
for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
@@ -1382,8 +1605,14 @@ spa_create(uint64_t guid, const char *name)
free(spa);
return (NULL);
}
- STAILQ_INIT(&spa->spa_vdevs);
spa->spa_guid = guid;
+ spa->spa_root_vdev = vdev_create(guid, NULL);
+ if (spa->spa_root_vdev == NULL) {
+ free(spa->spa_name);
+ free(spa);
+ return (NULL);
+ }
+ spa->spa_root_vdev->v_name = strdup("root");
STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
return (spa);
@@ -1460,6 +1689,7 @@ spa_status(spa_t *spa)
{
static char bootfs[ZFS_MAXNAMELEN];
uint64_t rootid;
+ vdev_list_t *vlist;
vdev_t *vdev;
int good_kids, bad_kids, degraded_kids, ret;
vdev_state_t state;
@@ -1488,7 +1718,8 @@ spa_status(spa_t *spa)
good_kids = 0;
degraded_kids = 0;
bad_kids = 0;
- STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+ vlist = &spa->spa_root_vdev->v_children;
+ STAILQ_FOREACH(vdev, vlist, v_childlink) {
if (vdev->v_state == VDEV_STATE_HEALTHY)
good_kids++;
else if (vdev->v_state == VDEV_STATE_DEGRADED)
@@ -1506,7 +1737,8 @@ spa_status(spa_t *spa)
ret = print_state(0, spa->spa_name, state);
if (ret != 0)
return (ret);
- STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+
+ STAILQ_FOREACH(vdev, vlist, v_childlink) {
ret = vdev_status(vdev, 1);
if (ret != 0)
return (ret);
@@ -1698,15 +1930,14 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
{
vdev_t vtmp;
spa_t *spa;
- vdev_t *vdev, *top_vdev, *pool_vdev;
+ vdev_t *vdev;
unsigned char *nvlist;
uint64_t val;
- uint64_t guid;
+ uint64_t guid, vdev_children;
uint64_t pool_txg, pool_guid;
const char *pool_name;
- const unsigned char *vdevs;
const unsigned char *features;
- int rc, is_newer;
+ int rc;
/*
* Load the vdev label and figure out which
@@ -1778,18 +2009,17 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
*/
spa = spa_find_by_guid(pool_guid);
if (spa == NULL) {
+ nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN,
+ DATA_TYPE_UINT64, NULL, &vdev_children);
spa = spa_create(pool_guid, pool_name);
if (spa == NULL) {
free(nvlist);
return (ENOMEM);
}
+ spa->spa_root_vdev->v_nchildren = vdev_children;
}
- if (pool_txg > spa->spa_txg) {
+ if (pool_txg > spa->spa_txg)
spa->spa_txg = pool_txg;
- is_newer = 1;
- } else {
- is_newer = 0;
- }
/*
* Get the vdev tree and create our in-core copy of it.
@@ -1809,39 +2039,25 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
return (EIO);
}
- if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
- NULL, &vdevs)) {
- free(nvlist);
- return (EIO);
- }
-
- rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
+ rc = vdev_init_from_label(spa, nvlist);
free(nvlist);
if (rc != 0)
return (rc);
/*
- * Add the toplevel vdev to the pool if its not already there.
- */
- STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
- if (top_vdev == pool_vdev)
- break;
-
- if (!pool_vdev && top_vdev) {
- top_vdev->spa = spa;
- STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
- }
-
- /*
* We should already have created an incomplete vdev for this
* vdev. Find it and initialise it with our read proc.
*/
vdev = vdev_find(guid);
- if (vdev) {
+ if (vdev != NULL) {
vdev->v_phys_read = phys_read;
vdev->v_read_priv = read_priv;
- vdev->v_state = VDEV_STATE_HEALTHY;
vdev->v_psize = vtmp.v_psize;
+ /*
+ * If no other state is set, mark vdev healthy.
+ */
+ if (vdev->v_state == VDEV_STATE_UNKNOWN)
+ vdev->v_state = VDEV_STATE_HEALTHY;
} else {
printf("ZFS: inconsistent nvlist contents\n");
return (EIO);
@@ -1851,13 +2067,13 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
spa->spa_with_log = vdev->v_islog;
/* Record boot vdev for spa. */
- if (is_newer == 1)
+ if (spa->spa_boot_vdev == NULL)
spa->spa_boot_vdev = vdev;
/*
* Re-evaluate top-level vdev state.
*/
- vdev_set_state(top_vdev);
+ vdev_set_state(vdev->v_top);
/*
* Ok, we are happy with the pool so far. Lets find
@@ -1866,7 +2082,6 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
*/
vdev_uberblock_load(vdev, &spa->spa_uberblock);
- vdev->spa = spa;
if (spap != NULL)
*spap = spa;
return (0);
@@ -1961,7 +2176,8 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
for (i = 0; i < SPA_DVAS_PER_BP; i++) {
const dva_t *dva = &bp->blk_dva[i];
vdev_t *vdev;
- int vdevid;
+ vdev_list_t *vlist;
+ uint64_t vdevid;
off_t offset;
if (!dva->dva_word[0] && !dva->dva_word[1])
@@ -1969,7 +2185,8 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
vdevid = DVA_GET_VDEV(dva);
offset = DVA_GET_OFFSET(dva);
- STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+ vlist = &spa->spa_root_vdev->v_children;
+ STAILQ_FOREACH(vdev, vlist, v_childlink) {
if (vdev->v_id == vdevid)
break;
}
@@ -1978,7 +2195,7 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
size = BP_GET_PSIZE(bp);
if (vdev->v_read == vdev_raidz_read) {
- align = 1ULL << vdev->v_top->v_ashift;
+ align = 1ULL << vdev->v_ashift;
if (P2PHASE(size, align) != 0)
size = P2ROUNDUP(size, align);
}
@@ -3026,9 +3243,7 @@ zfs_spa_init(spa_t *spa)
dnode_phys_t dir;
uint64_t config_object;
unsigned char *nvlist;
- char *type;
- const unsigned char *nv;
- int nkids, rc;
+ int rc;
if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
@@ -3066,62 +3281,11 @@ zfs_spa_init(spa_t *spa)
if (rc != 0)
return (rc);
- /* Update vdevs from MOS config. */
- if (nvlist_find(nvlist + 4, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
- NULL, &nv)) {
- rc = EIO;
- goto done;
- }
-
- if (nvlist_find(nv, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL, &type)) {
- printf("ZFS: can't find vdev details\n");
- rc = ENOENT;
- goto done;
- }
- if (strcmp(type, VDEV_TYPE_ROOT) != 0) {
- rc = ENOENT;
- goto done;
- }
-
- rc = nvlist_find(nv, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
- &nkids, &nv);
- if (rc != 0)
- goto done;
-
- for (int i = 0; i < nkids; i++) {
- vdev_t *vd, *prev, *kid = NULL;
- rc = vdev_init_from_nvlist(nv, NULL, &kid, 0);
- if (rc != 0) {
- printf("vdev_init_from_nvlist: %d\n", rc);
- break;
- }
- kid->spa = spa;
- prev = NULL;
- STAILQ_FOREACH(vd, &spa->spa_vdevs, v_childlink) {
- /* Already present? */
- if (kid->v_id == vd->v_id) {
- kid = NULL;
- break;
- }
- if (vd->v_id > kid->v_id) {
- if (prev == NULL) {
- STAILQ_INSERT_HEAD(&spa->spa_vdevs,
- kid, v_childlink);
- } else {
- STAILQ_INSERT_AFTER(&spa->spa_vdevs,
- prev, kid, v_childlink);
- }
- kid = NULL;
- break;
- }
- prev = vd;
- }
- if (kid != NULL)
- STAILQ_INSERT_TAIL(&spa->spa_vdevs, kid, v_childlink);
- nv = nvlist_next(nv);
- }
- rc = 0;
-done:
+ /*
+ * Update vdevs from MOS config. Note, we do skip encoding bytes
+ * here. See also vdev_label_read_config().
+ */
+ rc = vdev_init_from_nvlist(spa, nvlist + 4);
free(nvlist);
return (rc);
}
diff --git a/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h b/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h
index 6b629f8fe5..c57181b670 100644
--- a/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h
+++ b/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h
@@ -763,6 +763,7 @@ typedef enum {
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read"
+#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
/*
* The persistent vdev state is stored as separate values rather than a single
@@ -1765,13 +1766,13 @@ typedef struct vdev {
int v_ashift; /* offset to block shift */
int v_nparity; /* # parity for raidz */
struct vdev *v_top; /* parent vdev */
- int v_nchildren; /* # children */
+ size_t v_nchildren; /* # children */
vdev_state_t v_state; /* current state */
vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */
vdev_read_t *v_read; /* read from vdev */
void *v_read_priv; /* private data for read function */
boolean_t v_islog;
- struct spa *spa; /* link to spa */
+ struct spa *v_spa; /* link to spa */
/*
* Values stored in the config for an indirect or removing vdev.
*/
@@ -1790,11 +1791,10 @@ typedef struct spa {
uint64_t spa_guid; /* pool guid */
uint64_t spa_txg; /* most recent transaction */
struct uberblock spa_uberblock; /* best uberblock so far */
- vdev_list_t spa_vdevs; /* list of all toplevel vdevs */
+ vdev_t *spa_root_vdev; /* toplevel vdev container */
objset_phys_t spa_mos; /* MOS for this pool */
zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
- int spa_inited; /* initialized */
vdev_t *spa_boot_vdev; /* boot device for kernel */
boolean_t spa_with_log; /* this pool has log */
} spa_t;
diff --git a/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c b/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c
index 4cfd337213..0d509ed4a1 100644
--- a/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c
+++ b/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c
@@ -1643,7 +1643,7 @@ reconstruct:
int rv;
if (data_errors == 0) {
- rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
+ rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
if (rv == 0) {
/*
* If we read parity information (unnecessarily
@@ -1689,7 +1689,7 @@ reconstruct:
code = vdev_raidz_reconstruct(rm, tgts, n);
- rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
+ rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
if (rv == 0) {
/*
* If we read more parity disks than were used
@@ -1764,7 +1764,7 @@ reconstruct:
if (total_errors > rm->rm_firstdatacol) {
error = EIO;
} else if (total_errors < rm->rm_firstdatacol &&
- (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
+ (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
total_errors, data_errors)) != 0) {
/*
* If we didn't use all the available parity for the