author    billm <none@none>  2006-04-10 05:03:38 -0700
committer billm <none@none>  2006-04-10 05:03:38 -0700
commit    44cd46cadd9aab751dae6a4023c1cb5bf316d274
tree      27db23d9e2bc81a70d528c18cf9d04874891ed9d
parent    dc5d169b4bfc1a6993578ef34dae678076fd19fb
download  illumos-gate-44cd46cadd9aab751dae6a4023c1cb5bf316d274.tar.gz

6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
6410700 zdb should support reading raw blocks out of storage pool
6410709 ztest: spa config can change before pool export
-rw-r--r--  usr/src/cmd/mdb/common/modules/zfs/zfs.c  |  28
-rw-r--r--  usr/src/cmd/zdb/zdb.c                     | 503
-rw-r--r--  usr/src/cmd/zpool/zpool_main.c            |  11
-rw-r--r--  usr/src/cmd/ztest/ztest.c                 |   5
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c           |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c          |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c           |  36
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c    |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c      |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c      | 203
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c           |  88
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c      | 162
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h       |   2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h       |   8
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab.h  |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h       |  20
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev.h      |   1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h       |  17
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_impl.h  |  13
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c          |  84
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_mirror.c   | 231
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c    |  10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_root.c     |  26
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c     |   2
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c           | 208
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_checksum.c  |   5
-rw-r--r--  usr/src/uts/common/sys/fs/zfs.h           |  18
27 files changed, 1176 insertions(+), 524 deletions(-)
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index 40b2e019bc..5b218aee5f 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -437,20 +437,28 @@ blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
zct[i].ci_name = local_strdup(buf);
}
- for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+ /*
+ * Super-ick warning: This code is also duplicated in
+ * cmd/zdb.c . Yeah, I hate code replication, too.
+ */
+ for (i = 0; i < BP_GET_NDVAS(&bp); i++) {
dva_t *dva = &bp.blk_dva[i];
- mdb_printf("DVA[%d]: GANG: %-5s GRID: %2x ASIZE: %5x "
- "vdev %llu offset %llx\n",
- i,
- DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
- DVA_GET_GRID(dva),
- DVA_GET_ASIZE(dva),
- DVA_GET_VDEV(dva),
- DVA_GET_OFFSET(dva));
+
+ mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i,
+ DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva));
+ mdb_printf("DVA[%d]: GANG: %-5s GRID: %04x\t"
+ "ASIZE: %llx\n", i, DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
+ DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
+ mdb_printf("DVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", i,
+ DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), BP_GET_PSIZE(&bp),
+ BP_SHOULD_BYTESWAP(&bp) ? "e" : "",
+ !DVA_GET_GANG(dva) && BP_GET_LEVEL(&bp) != 0 ? "i" : "",
+ DVA_GET_GANG(dva) ? "g" : "",
+ BP_GET_COMPRESS(&bp) != 0 ? "d" : "");
}
mdb_printf("LSIZE: %-16llx\t\tPSIZE: %llx\n",
BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp));
- mdb_printf("ENDIAN: %-6s TYPE: %s\n",
+ mdb_printf("ENDIAN: %6s\t\t\t\t\tTYPE: %s\n",
BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG",
doti[BP_GET_TYPE(&bp)].ot_name);
mdb_printf("BIRTH: %-16llx LEVEL: %-2d\tFILL: %llx\n",
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 611f8ffc0c..3615846a00 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -27,6 +27,7 @@
#include <stdio.h>
#include <stdlib.h>
+#include <ctype.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
@@ -84,8 +85,9 @@ usage(void)
"Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] "
"dataset [object...]\n"
" %s -C [pool]\n"
- " %s -l dev\n",
- cmdname, cmdname, cmdname);
+ " %s -l dev\n"
+ " %s -R vdev:offset:size:flags\n",
+ cmdname, cmdname, cmdname, cmdname);
(void) fprintf(stderr, " -u uberblock\n");
(void) fprintf(stderr, " -d datasets\n");
@@ -102,6 +104,8 @@ usage(void)
(void) fprintf(stderr, " -U use zpool.cache in /tmp\n");
(void) fprintf(stderr, " -B objset:object:level:blkid -- "
"simulate bad block\n");
+ (void) fprintf(stderr, " -R read and display block from a "
+ "device\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
"to make only that option verbose\n");
(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
@@ -523,20 +527,41 @@ blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid)
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
}
+static void
+sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+{
+ dva_t *dva = bp->blk_dva;
+ int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
+ int i;
+
+ blkbuf[0] = '\0';
+
+ for (i = 0; i < ndvas; i++)
+ (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
+ (u_longlong_t)DVA_GET_VDEV(&dva[i]),
+ (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
+ (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+
+ (void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu",
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)BP_GET_PSIZE(bp),
+ (u_longlong_t)bp->blk_fill,
+ (u_longlong_t)bp->blk_birth);
+}
+
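With alldvas set, the compact format emits one vdev:offset:asize triple per copy, followed by the shared logical/physical sizes, fill count, and birth txg. A hypothetical two-copy 128K block (all values made up) would render as:

	0:12000:400 1:8a000:400 20000L/400P F=1 B=1087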
/* ARGSUSED */
static int
zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
zbookmark_t *zb = &bc->bc_bookmark;
blkptr_t *bp = &bc->bc_blkptr;
- dva_t *dva = &bp->blk_dva[0];
void *data = bc->bc_data;
dnode_phys_t *dnp = bc->bc_dnode;
- char buffer[300];
+ char blkbuf[BP_SPRINTF_LEN + 80];
int l;
if (bc->bc_errno) {
- (void) sprintf(buffer,
+ (void) sprintf(blkbuf,
"Error %d reading <%llu, %llu, %lld, %llu>: ",
bc->bc_errno,
(u_longlong_t)zb->zb_objset,
@@ -581,37 +606,28 @@ zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
ASSERT3U(fill, ==, bp->blk_fill);
}
- (void) sprintf(buffer, "%16llx ",
+ (void) sprintf(blkbuf, "%16llx ",
(u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
ASSERT(zb->zb_level >= 0);
for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
if (l == zb->zb_level) {
- (void) sprintf(buffer + strlen(buffer), "L%llx",
+ (void) sprintf(blkbuf + strlen(blkbuf), "L%llx",
(u_longlong_t)zb->zb_level);
} else {
- (void) sprintf(buffer + strlen(buffer), " ");
+ (void) sprintf(blkbuf + strlen(blkbuf), " ");
}
}
out:
if (bp->blk_birth == 0) {
- (void) sprintf(buffer + strlen(buffer), "<hole>");
- (void) printf("%s\n", buffer);
+ (void) sprintf(blkbuf + strlen(blkbuf), "<hole>");
+ (void) printf("%s\n", blkbuf);
} else {
- // XXBP - Need to print number of active BPs here
- (void) sprintf(buffer + strlen(buffer),
- "vdev=%llu off=%llx %llxL/%llxP/%llxA F=%llu B=%llu",
- (u_longlong_t)DVA_GET_VDEV(dva),
- (u_longlong_t)DVA_GET_OFFSET(dva),
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp),
- (u_longlong_t)DVA_GET_ASIZE(dva),
- (u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_birth);
-
- (void) printf("%s\n", buffer);
+ sprintf_blkptr_compact(blkbuf + strlen(blkbuf), bp,
+ dump_opt['d'] > 5 ? 1 : 0);
+ (void) printf("%s\n", blkbuf);
}
return (bc->bc_errno ? ERESTART : 0);
@@ -762,18 +778,12 @@ dump_bplist(objset_t *mos, uint64_t object, char *name)
(void) printf("\n");
while (bplist_iterate(&bpl, &itor, bp) == 0) {
+ char blkbuf[BP_SPRINTF_LEN];
+
ASSERT(bp->blk_birth != 0);
- // XXBP - Do we want to see all DVAs, or just one?
- (void) printf("\tItem %3llu: vdev=%llu off=%llx "
- "%llxL/%llxP/%llxA F=%llu B=%llu\n",
- (u_longlong_t)itor - 1,
- (u_longlong_t)DVA_GET_VDEV(&bp->blk_dva[0]),
- (u_longlong_t)DVA_GET_OFFSET(&bp->blk_dva[0]),
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp),
- (u_longlong_t)DVA_GET_ASIZE(&bp->blk_dva[0]),
- (u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_birth);
+ sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+ (void) printf("\tItem %3llu: %s\n",
+ (u_longlong_t)itor - 1, blkbuf);
}
bplist_close(&bpl);
@@ -1228,45 +1238,73 @@ zdb_space_map_load(spa_t *spa)
static int
zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb)
{
- dva_t *dva = &bp->blk_dva[0];
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
+ dva_t *dva = bp->blk_dva;
vdev_t *vd;
metaslab_t *msp;
space_map_t *allocmap, *freemap;
int error;
+ int d;
+ blkptr_t blk = *bp;
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+ uint64_t offset = DVA_GET_OFFSET(&dva[d]);
+ uint64_t size = DVA_GET_ASIZE(&dva[d]);
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
+ return (ENXIO);
+
+ if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+ return (ENXIO);
- if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
- return (ENXIO);
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ allocmap = &msp->ms_allocmap[0];
+ freemap = &msp->ms_freemap[0];
- if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
- return (ENXIO);
+ /* Prepare our copy of the bp in case we need to read GBHs */
+ if (DVA_GET_GANG(&dva[d])) {
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ DVA_SET_ASIZE(&blk.blk_dva[d], size);
+ DVA_SET_GANG(&blk.blk_dva[d], 0);
+ }
+
+ mutex_enter(&msp->ms_lock);
+ if (space_map_contains(freemap, offset, size)) {
+ mutex_exit(&msp->ms_lock);
+ return (EAGAIN); /* allocated more than once */
+ }
- if (DVA_GET_GANG(dva)) {
+ if (!space_map_contains(allocmap, offset, size)) {
+ mutex_exit(&msp->ms_lock);
+ return (ESTALE); /* not allocated at all */
+ }
+
+ space_map_remove(allocmap, offset, size);
+ space_map_add(freemap, offset, size);
+
+ mutex_exit(&msp->ms_lock);
+ }
+
+ if (BP_IS_GANG(bp)) {
zio_gbh_phys_t gbh;
- blkptr_t blk = *bp;
int g;
/* LINTED - compile time assert */
ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
- DVA_SET_GANG(&blk.blk_dva[0], 0);
- DVA_SET_ASIZE(&blk.blk_dva[0], size);
+
BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER);
BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE);
BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE);
BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF);
- error = zio_wait(zio_read(NULL, spa, &blk,
- &gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
- ZIO_PRIORITY_SYNC_READ,
+ error = zio_wait(zio_read(NULL, spa, &blk, &gbh,
+ SPA_GANGBLOCKSIZE, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD, zb));
if (error)
return (error);
if (BP_SHOULD_BYTESWAP(&blk))
byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE);
for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- if (gbh.zg_blkptr[g].blk_birth == 0)
+ if (BP_IS_HOLE(&gbh.zg_blkptr[g]))
break;
error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g], zb);
if (error)
@@ -1274,26 +1312,6 @@ zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb)
}
}
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
- allocmap = &msp->ms_allocmap[0];
- freemap = &msp->ms_freemap[0];
-
- mutex_enter(&msp->ms_lock);
- if (space_map_contains(freemap, offset, size)) {
- mutex_exit(&msp->ms_lock);
- return (EAGAIN); /* allocated more than once */
- }
-
- if (!space_map_contains(allocmap, offset, size)) {
- mutex_exit(&msp->ms_lock);
- return (ESTALE); /* not allocated at all */
- }
-
- space_map_remove(allocmap, offset, size);
- space_map_add(freemap, offset, size);
-
- mutex_exit(&msp->ms_lock);
-
return (0);
}
@@ -1448,7 +1466,7 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
zcb->zcb_readfails = 0;
- ASSERT(bp->blk_birth != 0);
+ ASSERT(!BP_IS_HOLE(bp));
zdb_count_block(spa, zcb, bp, type);
@@ -1511,13 +1529,13 @@ dump_block_stats(spa_t *spa)
spa->spa_sync_bplist_obj));
while (bplist_iterate(bpl, &itor, &blk) == 0) {
- zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
if (dump_opt['b'] >= 4) {
char blkbuf[BP_SPRINTF_LEN];
sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
(void) printf("[%s] %s\n",
"deferred free", blkbuf);
}
+ zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
}
bplist_close(bpl);
@@ -1703,6 +1721,321 @@ dump_zpool(spa_t *spa)
exit(rc);
}
+#define ZDB_FLAG_CHECKSUM 0x0001
+#define ZDB_FLAG_DECOMPRESS 0x0002
+#define ZDB_FLAG_BSWAP 0x0004
+#define ZDB_FLAG_GBH 0x0008
+#define ZDB_FLAG_INDIRECT 0x0010
+#define ZDB_FLAG_PHYS 0x0020
+#define ZDB_FLAG_RAW 0x0040
+#define ZDB_FLAG_PRINT_BLKPTR 0x0080
+
+int flagbits[256];
+
+static void
+zdb_print_blkptr(blkptr_t *bp, int flags)
+{
+ dva_t *dva = bp->blk_dva;
+ int d;
+
+ if (flags & ZDB_FLAG_BSWAP)
+ byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
+ /*
+ * Super-ick warning: This code is also duplicated in
+ * cmd/mdb/common/modules/zfs/zfs.c . Yeah, I hate code
+ * replication, too.
+ */
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ (void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d,
+ DVA_GET_VDEV(&dva[d]), DVA_GET_OFFSET(&dva[d]));
+ (void) printf("\tDVA[%d]: GANG: %-5s GRID: %04llx\t"
+ "ASIZE: %llx\n", d,
+ DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE",
+ DVA_GET_GRID(&dva[d]), DVA_GET_ASIZE(&dva[d]));
+ (void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d,
+ DVA_GET_VDEV(&dva[d]), DVA_GET_OFFSET(&dva[d]),
+ BP_GET_PSIZE(bp),
+ BP_SHOULD_BYTESWAP(bp) ? "e" : "",
+ !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ?
+ "d" : "",
+ DVA_GET_GANG(&dva[d]) ? "g" : "",
+ BP_GET_COMPRESS(bp) != 0 ? "d" : "");
+ }
+ (void) printf("\tLSIZE: %-16llx\t\tPSIZE: %llx\n",
+ BP_GET_LSIZE(bp), BP_GET_PSIZE(bp));
+ (void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE: %s\n",
+ BP_GET_BYTEORDER(bp) ? "LITTLE" : "BIG",
+ dmu_ot[BP_GET_TYPE(bp)].ot_name);
+ (void) printf("\tBIRTH: %-16llx LEVEL: %-2llu\tFILL: %llx\n",
+ (u_longlong_t)bp->blk_birth, BP_GET_LEVEL(bp),
+ (u_longlong_t)bp->blk_fill);
+ (void) printf("\tCKFUNC: %-16s\t\tCOMP: %s\n",
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+ zio_compress_table[BP_GET_COMPRESS(bp)].ci_name);
+ (void) printf("\tCKSUM: %llx:%llx:%llx:%llx\n",
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
+
+static void
+zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
+{
+ int i;
+
+ for (i = 0; i < nbps; i++)
+ zdb_print_blkptr(&bp[i], flags);
+}
+
+static void
+zdb_dump_gbh(void *buf, int flags)
+{
+ zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
+}
+
+static void
+zdb_dump_block_raw(void *buf, uint64_t size, int flags)
+{
+ if (flags & ZDB_FLAG_BSWAP)
+ byteswap_uint64_array(buf, size);
+ (void) write(2, buf, size);
+}
+
+static void
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
+{
+ uint64_t *d = (uint64_t *)buf;
+ int nwords = size / sizeof (uint64_t);
+ int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
+ int i, j;
+ char *hdr, *c;
+
+
+ if (do_bswap)
+ hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
+ else
+ hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
+
+ (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
+
+ for (i = 0; i < nwords; i += 2) {
+ (void) printf("%06llx: %016llx %016llx ",
+ (u_longlong_t)(i * sizeof (uint64_t)),
+ (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
+ (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
+
+ c = (char *)&d[i];
+ for (j = 0; j < 2 * sizeof (uint64_t); j++)
+ (void) printf("%c", isprint(c[j]) ? c[j] : '.');
+ (void) printf("\n");
+ }
+}
+
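Each dump line pairs the byte offset with two 64-bit words (byteswapped when the e flag is set) and the corresponding 16 bytes as printable ASCII; a zero-filled line, for instance, renders as:

	000000: 0000000000000000 0000000000000000 ................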
+/*
+ * There are two acceptable formats:
+ * leaf_name - For example: c1t0d0 or /tmp/ztest.0a
+ * child[.child]* - For example: 0.1.1
+ *
+ * The second form can be used to specify arbitrary vdevs anywhere
+ * in the heirarchy. For example, in a pool with a mirror of
+ * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
+ */
+static vdev_t *
+zdb_vdev_lookup(vdev_t *vdev, char *path)
+{
+ char *s, *p, *q;
+ int i;
+
+ if (vdev == NULL)
+ return (NULL);
+
+ /* First, assume the x.x.x.x format */
+ i = (int)strtoul(path, &s, 10);
+ if (s == path || (s && *s != '.' && *s != '\0'))
+ goto name;
+ if (i < 0 || i >= vdev->vdev_children)
+ return (NULL);
+
+ vdev = vdev->vdev_child[i];
+ if (*s == '\0')
+ return (vdev);
+ return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+ for (i = 0; i < vdev->vdev_children; i++) {
+ vdev_t *vc = vdev->vdev_child[i];
+
+ if (vc->vdev_path == NULL) {
+ vc = zdb_vdev_lookup(vc, path);
+ if (vc == NULL)
+ continue;
+ else
+ return (vc);
+ }
+
+ p = strrchr(vc->vdev_path, '/');
+ p = p ? p + 1 : vc->vdev_path;
+ q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
+
+ if (strcmp(vc->vdev_path, path) == 0)
+ return (vc);
+ if (strcmp(p, path) == 0)
+ return (vc);
+ if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
+ return (vc);
+ }
+
+ return (NULL);
+}
+
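A sketch of how the two lookup forms resolve, assuming a hypothetical pool whose root vdev has two raidz children (device names are illustrative):

	/* hypothetical layout: root -> raidz(c1t0d0, c1t1d0), raidz(c2t0d0, c2t1d0) */
	vdev_t *rz = zdb_vdev_lookup(spa->spa_root_vdev, "1");      /* second raidz */
	vdev_t *vl = zdb_vdev_lookup(spa->spa_root_vdev, "1.0");    /* its first disk */
	vdev_t *vn = zdb_vdev_lookup(spa->spa_root_vdev, "c2t0d0"); /* same disk, by leaf name */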
+/*
+ * Read a block from a pool and print it out. The syntax of the
+ * block descriptor is:
+ *
+ * pool:vdev_specifier:offset:size[:flags]
+ *
+ * pool - The name of the pool you wish to read from
+ * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ * offset - offset, in hex, in bytes
+ * size - Amount of data to read, in hex, in bytes
+ * flags - A string of characters specifying options
+ * b: Decode a blkptr at given offset within block
+ * *c: Calculate and display checksums
+ * *d: Decompress data before dumping
+ * e: Byteswap data before dumping
+ * *g: Display data as a gang block header
+ * *i: Display as an indirect block
+ * p: Do I/O to physical offset
+ * r: Dump raw data to stdout
+ *
+ * * = not yet implemented
+ */
+static void
+zdb_read_block(char *thing, spa_t **spap)
+{
+ spa_t *spa = *spap;
+ int flags = 0;
+ uint64_t offset = 0, size = 0, blkptr_offset = 0;
+ zio_t *zio;
+ vdev_t *vd;
+ void *buf;
+ char *s, *p, *dup, *spa_name, *vdev, *flagstr;
+ int i, error, zio_flags;
+
+ dup = strdup(thing);
+ s = strtok(dup, ":");
+ spa_name = s ? s : "";
+ s = strtok(NULL, ":");
+ vdev = s ? s : "";
+ s = strtok(NULL, ":");
+ offset = strtoull(s ? s : "", NULL, 16);
+ s = strtok(NULL, ":");
+ size = strtoull(s ? s : "", NULL, 16);
+ s = strtok(NULL, ":");
+ flagstr = s ? s : "";
+
+ s = NULL;
+ if (size == 0)
+ s = "size must not be zero";
+ if (!IS_P2ALIGNED(size, DEV_BSIZE))
+ s = "size must be a multiple of sector size";
+ if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+ s = "offset must be a multiple of sector size";
+ if (s) {
+ (void) printf("Invalid block specifier: %s - %s\n", thing, s);
+ free(dup);
+ return;
+ }
+
+ for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
+ for (i = 0; flagstr[i]; i++) {
+ int bit = flagbits[flagstr[i]];
+
+ if (bit == 0) {
+ (void) printf("***Invalid flag: %c\n",
+ flagstr[i]);
+ continue;
+ }
+ flags |= bit;
+
+ /* If it's not something with an argument, keep going */
+ if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS |
+ ZDB_FLAG_PRINT_BLKPTR)) == 0)
+ continue;
+
+ p = &flagstr[i + 1];
+ if (bit == ZDB_FLAG_PRINT_BLKPTR)
+ blkptr_offset = strtoull(p, &p, 16);
+ if (*p != ':' && *p != '\0') {
+ (void) printf("***Invalid flag arg: '%s'\n", s);
+ free(dup);
+ return;
+ }
+ }
+ }
+
+ if (spa == NULL || spa->spa_name == NULL ||
+ strcmp(spa->spa_name, spa_name)) {
+ if (spa && spa->spa_name)
+ spa_close(spa, (void *)zdb_read_block);
+ error = spa_open(spa_name, spap, (void *)zdb_read_block);
+ if (error)
+ fatal("Failed to open pool '%s': errno = %d\n",
+ spa_name, error);
+ spa = *spap;
+ }
+
+ vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+ if (vd == NULL) {
+ (void) printf("***Invalid vdev: %s\n", vdev);
+ free(dup);
+ return;
+ } else {
+ if (vd->vdev_path)
+ (void) printf("Found vdev: %s\n", vd->vdev_path);
+ else
+ (void) printf("Found vdev type: %s\n",
+ vd->vdev_ops->vdev_op_type);
+ }
+
+ buf = umem_alloc(size, UMEM_NOFAIL);
+
+ zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK;
+
+ if (flags & ZDB_FLAG_PHYS)
+ zio_flags |= ZIO_FLAG_PHYSICAL;
+
+ zio = zio_root(spa, NULL, NULL, 0);
+ /* XXX todo - cons up a BP so RAID-Z will be happy */
+ zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL));
+ error = zio_wait(zio);
+
+ if (error) {
+ (void) printf("Read of %s failed, error: %d\n", thing, error);
+ goto out;
+ }
+
+ if (flags & ZDB_FLAG_PRINT_BLKPTR)
+ zdb_print_blkptr((blkptr_t *)(void *)
+ ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
+ else if (flags & ZDB_FLAG_RAW)
+ zdb_dump_block_raw(buf, size, flags);
+ else if (flags & ZDB_FLAG_INDIRECT)
+ zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
+ flags);
+ else if (flags & ZDB_FLAG_GBH)
+ zdb_dump_gbh(buf, flags);
+ else
+ zdb_dump_block(thing, buf, size, flags);
+
+out:
+ umem_free(buf, size);
+ free(dup);
+}
+
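Putting the pieces together, a hypothetical invocation that reads 512 bytes (0x200) raw from offset 0x2000 of the first top-level vdev of a pool named tank would be:

	# zdb -R tank:0:2000:200:r

Offset and size are hex and must be sector-aligned, as validated above; the r flag selects the raw dump path.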
int
main(int argc, char **argv)
{
@@ -1721,7 +2054,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "udibcsvCLO:B:Ul")) != -1) {
+ while ((c = getopt(argc, argv, "udibcsvCLO:B:UlR")) != -1) {
switch (c) {
case 'u':
case 'd':
@@ -1731,6 +2064,7 @@ main(int argc, char **argv)
case 's':
case 'C':
case 'l':
+ case 'R':
dump_opt[c]++;
dump_all = 0;
break;
@@ -1801,7 +2135,7 @@ main(int argc, char **argv)
}
for (c = 0; c < 256; c++) {
- if (dump_all && c != 'L' && c != 'l')
+ if (dump_all && c != 'L' && c != 'l' && c != 'R')
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
@@ -1823,6 +2157,27 @@ main(int argc, char **argv)
return (0);
}
+ if (dump_opt['R']) {
+ flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+ flagbits['c'] = ZDB_FLAG_CHECKSUM;
+ flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+ flagbits['e'] = ZDB_FLAG_BSWAP;
+ flagbits['g'] = ZDB_FLAG_GBH;
+ flagbits['i'] = ZDB_FLAG_INDIRECT;
+ flagbits['p'] = ZDB_FLAG_PHYS;
+ flagbits['r'] = ZDB_FLAG_RAW;
+
+ spa = NULL;
+ while (argv[0]) {
+ zdb_read_block(argv[0], &spa);
+ argv++;
+ argc--;
+ }
+ if (spa)
+ spa_close(spa, (void *)zdb_read_block);
+ return (0);
+ }
+
if (dump_opt['C'])
dump_config(argv[0]);
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index 2cbecad212..e2297b24aa 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -2783,8 +2783,9 @@ upgrade_one(zpool_handle_t *zhp, void *unused)
ret = zpool_upgrade(zhp);
if (ret == 0)
- (void) printf(gettext("Successfully upgraded '%s'\n"),
- zpool_get_name(zhp));
+ (void) printf(gettext("Successfully upgraded '%s' "
+ "from version %llu to version %llu\n"), zpool_get_name(zhp),
+ (u_longlong_t)version, (u_longlong_t)ZFS_VERSION);
return (ret != 0);
}
@@ -2848,8 +2849,10 @@ zpool_do_upgrade(int argc, char **argv)
(void) printf(gettext("VER DESCRIPTION\n"));
(void) printf("--- -----------------------------------------"
"---------------\n");
- (void) printf(gettext(" 1 Initial ZFS version.\n\n"));
- (void) printf(gettext("For more information on a particular "
+ (void) printf(gettext(" 1 Initial ZFS version.\n"));
+ (void) printf(gettext(" 2 Ditto blocks "
+ "(replicated metadata)\n"));
+ (void) printf(gettext("\nFor more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
"version/N\n\n");
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index fbcc56a30d..f214da36fa 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -2825,9 +2825,6 @@ ztest_spa_import_export(char *oldname, char *newname)
if (error)
fatal(0, "spa_open('%s') = %d", oldname, error);
- ASSERT(spa->spa_config != NULL);
-
- VERIFY(nvlist_dup(spa->spa_config, &config, 0) == 0);
pool_guid = spa_guid(spa);
spa_close(spa, FTAG);
@@ -2836,7 +2833,7 @@ ztest_spa_import_export(char *oldname, char *newname)
/*
* Export it.
*/
- error = spa_export(oldname);
+ error = spa_export(oldname, &config);
if (error)
fatal(0, "spa_export('%s') = %d", oldname, error);
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 5f0d90dd3c..0b500d4a83 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -2186,7 +2186,7 @@ arc_write_done(zio_t *zio)
}
int
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *done, void *private, int priority, int flags,
uint32_t arc_flags, zbookmark_t *zb)
@@ -2205,7 +2205,7 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
acb->acb_byteswap = (arc_byteswap_func_t *)-1;
hdr->b_acb = acb;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
- rzio = zio_write(pio, spa, checksum, compress, txg, bp,
+ rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
if (arc_flags & ARC_WAIT)
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 2982f743d2..ebcd9d7ad3 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -2029,7 +2029,9 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
zb.zb_object = db->db.db_object;
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- (void) arc_write(zio, os->os_spa, checksum, compress, txg,
+
+ (void) arc_write(zio, os->os_spa, checksum, compress,
+ dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
db->db_blkptr, *data, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
/*
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 2f6abbadae..52c8413c9a 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -82,8 +82,6 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
dmu_buf_impl_t *db;
int err;
- /* dataset_verify(dd); */
-
err = dnode_hold(os->os, object, FTAG, &dn);
if (err)
return (err);
@@ -1425,7 +1423,8 @@ int
dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
blkptr_t *bp, uint64_t txg)
{
- dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool;
+ objset_impl_t *osi = os->os;
+ dsl_pool_t *dp = osi->os_dsl_dataset->ds_dir->dd_pool;
tx_state_t *tx = &dp->dp_tx;
dmu_buf_impl_t *db;
blkptr_t *blk;
@@ -1508,7 +1507,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
}
arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
if (!BP_IS_HOLE(blk)) {
- (void) arc_free(NULL, os->os->os_spa, txg, blk,
+ (void) arc_free(NULL, osi->os_spa, txg, blk,
NULL, NULL, ARC_WAIT);
}
kmem_free(blk, sizeof (blkptr_t));
@@ -1520,13 +1519,14 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
blk->blk_birth = 0; /* mark as invalid */
- zb.zb_objset = os->os->os_dsl_dataset->ds_object;
+ zb.zb_objset = osi->os_dsl_dataset->ds_object;
zb.zb_object = db->db.db_object;
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- err = arc_write(NULL, os->os->os_spa,
- zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
- zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
+ err = arc_write(NULL, osi->os_spa,
+ zio_checksum_select(db->db_dnode->dn_checksum, osi->os_checksum),
+ zio_compress_select(db->db_dnode->dn_compress, osi->os_compress),
+ dmu_get_replication_level(osi->os_spa, &zb, db->db_dnode->dn_type),
txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
ASSERT(err == 0);
@@ -1556,7 +1556,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
* XXX should we be ignoring the return code?
*/
if (!BP_IS_HOLE(blk)) {
- (void) arc_free(NULL, os->os->os_spa, txg, blk,
+ (void) arc_free(NULL, osi->os_spa, txg, blk,
NULL, NULL, ARC_WAIT);
}
kmem_free(blk, sizeof (blkptr_t));
@@ -1625,6 +1625,24 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dnode_rele(dn, FTAG);
}
+/*
+ * XXX - eventually, this should take into account per-dataset (or
+ * even per-object?) user requests for higher levels of replication.
+ */
+int
+dmu_get_replication_level(spa_t *spa, zbookmark_t *zb, dmu_object_type_t ot)
+{
+ int ncopies = 1;
+
+ if (dmu_ot[ot].ot_metadata)
+ ncopies++;
+ if (zb->zb_level != 0)
+ ncopies++;
+ if (zb->zb_objset == 0 && zb->zb_object == 0)
+ ncopies++;
+ return (MIN(ncopies, spa_max_replication(spa)));
+}
+
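Concretely, and capped by spa_max_replication() (at most SPA_DVAS_PER_BP, i.e. 3), this policy works out to the following (illustrative summary, not from the patch):

	level-0 user data                      -> 1 copy
	level-0 metadata (e.g. dnode blocks)   -> 2 copies
	indirect blocks of user data           -> 2 copies
	indirect metadata                      -> 3 copies
	MOS meta-dnode (objset 0, object 0)    -> up to 4, capped at 3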
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 3a7f3531ea..7784049a23 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -679,7 +679,9 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
zb.zb_level = -1;
zb.zb_blkid = 0;
err = arc_write(NULL, os->os_spa, os->os_md_checksum,
- os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
+ os->os_md_compress,
+ dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET),
+ tx->tx_txg, &os->os_rootbp, abuf, killer, os,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
ASSERT(err == 0);
VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1);
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index b8e54be6f6..77a1adb3b1 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -232,7 +232,7 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
uint64_t space, resv;
/*
- * Reserve about 1% (1/128), or at least 16MB, for allocation
+ * Reserve about 1.6% (1/64), or at least 32MB, for allocation
* efficiency.
* XXX The intent log is not accounted for, so it must fit
* within this slop.
@@ -242,7 +242,7 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
* (e.g. make it possible to rm(1) files from a full pool).
*/
space = spa_get_space(dp->dp_spa);
- resv = MAX(space >> 7, SPA_MINDEVSIZE >> 2);
+ resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
if (netfree)
resv >>= 1;
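Worked through: on a hypothetical 2 TB pool the reserve becomes 2 TB / 64 = 32 GB (halved to 16 GB when netfree), where the old 1/128 rule gave 16 GB; the floor likewise doubles from SPA_MINDEVSIZE/4 = 16 MB to SPA_MINDEVSIZE/2 = 32 MB.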
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 72eaa89bd0..8728f21d7e 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -352,14 +352,19 @@ metaslab_fini(metaslab_t *msp)
kmem_free(msp, sizeof (metaslab_t));
}
-#define METASLAB_ACTIVE_WEIGHT (1ULL << 63)
+#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
+#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
+#define METASLAB_ACTIVE_MASK \
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define METASLAB_SMO_BONUS_MULTIPLIER 2
static uint64_t
metaslab_weight(metaslab_t *msp)
{
+ metaslab_group_t *mg = msp->ms_group;
space_map_t *sm = &msp->ms_map;
space_map_obj_t *smo = &msp->ms_smo;
- vdev_t *vd = msp->ms_group->mg_vd;
+ vdev_t *vd = mg->mg_vd;
uint64_t weight, space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -387,26 +392,27 @@ metaslab_weight(metaslab_t *msp)
* For locality, assign higher weight to metaslabs we've used before.
*/
if (smo->smo_object != 0)
- weight *= 2;
- ASSERT(weight >= space && weight <= 4 * space);
+ weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+ ASSERT(weight >= space &&
+ weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
/*
* If this metaslab is one we're actively using, adjust its weight to
* make it preferable to any inactive metaslab so we'll polish it off.
*/
- weight |= (msp->ms_weight & METASLAB_ACTIVE_WEIGHT);
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
return (weight);
}
static int
-metaslab_activate(metaslab_t *msp)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
space_map_t *sm = &msp->ms_map;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- if (msp->ms_weight < METASLAB_ACTIVE_WEIGHT) {
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
int error = space_map_load(sm, &metaslab_ff_ops,
SM_FREE, &msp->ms_smo,
msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
@@ -415,10 +421,10 @@ metaslab_activate(metaslab_t *msp)
return (error);
}
metaslab_group_sort(msp->ms_group, msp,
- msp->ms_weight | METASLAB_ACTIVE_WEIGHT);
+ msp->ms_weight | activation_weight);
}
ASSERT(sm->sm_loaded);
- ASSERT(msp->ms_weight >= METASLAB_ACTIVE_WEIGHT);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
return (0);
}
@@ -426,8 +432,8 @@ metaslab_activate(metaslab_t *msp)
static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
- metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size - 1));
- ASSERT(msp->ms_weight < METASLAB_ACTIVE_WEIGHT);
+ metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
+ ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
/*
@@ -571,7 +577,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
* future allocations have synced. (If we unloaded it now and then
* loaded a moment later, the map wouldn't reflect those allocations.)
*/
- if (sm->sm_loaded && msp->ms_weight < METASLAB_ACTIVE_WEIGHT) {
+ if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
int evictable = 1;
for (t = 1; t < TXG_CONCURRENT_STATES; t++)
@@ -616,7 +622,7 @@ metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
- error = metaslab_activate(msp);
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
if (error) {
mutex_exit(&msp->ms_lock);
return (error);
@@ -633,25 +639,76 @@ metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
return (0);
}
-static metaslab_t *
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp,
- uint64_t txg)
+static uint64_t
+metaslab_distance(metaslab_t *msp, dva_t *dva)
+{
+ uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
+ uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
+ uint64_t start = msp->ms_map.sm_start >> ms_shift;
+
+ if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+ return (1ULL << 63);
+
+ if (offset < start)
+ return ((start - offset) << ms_shift);
+ if (offset > start)
+ return ((offset - start) << ms_shift);
+ return (0);
+}
+
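As a worked example, on a hypothetical vdev with vdev_ms_shift = 30 (1 GB metaslabs), a DVA at offset 10 GB (metaslab 10) measured against a candidate metaslab starting at 13 GB yields (13 - 10) << 30 = 3 GB, while a DVA on any other vdev returns the 1ULL << 63 sentinel, so it never constrains placement there.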
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
+ uint64_t min_distance, dva_t *dva, int d)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ uint64_t activation_weight;
+ uint64_t target_distance;
+ int i;
+
+ activation_weight = METASLAB_WEIGHT_PRIMARY;
+ for (i = 0; i < d; i++)
+ if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+ activation_weight = METASLAB_WEIGHT_SECONDARY;
for (;;) {
mutex_enter(&mg->mg_lock);
- msp = avl_first(&mg->mg_metaslab_tree);
- if (msp == NULL || msp->ms_weight < size) {
- mutex_exit(&mg->mg_lock);
- return (NULL);
+ for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
+ if (msp->ms_weight < size) {
+ mutex_exit(&mg->mg_lock);
+ return (-1ULL);
+ }
+
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY)
+ break;
+
+ target_distance = min_distance +
+ (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+
+ for (i = 0; i < d; i++)
+ if (metaslab_distance(msp, &dva[i]) <
+ target_distance)
+ break;
+ if (i == d)
+ break;
}
mutex_exit(&mg->mg_lock);
+ if (msp == NULL)
+ return (-1ULL);
mutex_enter(&msp->ms_lock);
- if (metaslab_activate(msp) != 0) {
+ if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY) {
+ metaslab_passivate(msp,
+ (msp->ms_weight & ~METASLAB_ACTIVE_MASK) /
+ METASLAB_SMO_BONUS_MULTIPLIER);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if (metaslab_activate(msp, activation_weight) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -659,7 +716,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp,
if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
break;
- metaslab_passivate(msp, size);
+ metaslab_passivate(msp, size - 1);
mutex_exit(&msp->ms_lock);
}
@@ -671,22 +728,24 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp,
mutex_exit(&msp->ms_lock);
- *offp = offset;
- return (msp);
+ return (offset);
}
/*
* Allocate a block for the specified i/o.
*/
-int
-metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
+static int
+metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d,
+ dva_t *hintdva, uint64_t txg)
{
- metaslab_t *msp;
metaslab_group_t *mg, *rotor;
metaslab_class_t *mc;
vdev_t *vd;
+ int dshift = 3;
+ int all_zero;
uint64_t offset = -1ULL;
uint64_t asize;
+ uint64_t distance;
mc = spa_metaslab_class_select(spa);
@@ -695,17 +754,50 @@ metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
* Note that there's no locking on mc_rotor or mc_allocated because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
+ *
+ * If we are doing ditto blocks, try to spread them across consecutive
+ * vdevs. If we're forced to reuse a vdev before we've allocated
+ * all of our ditto blocks, then try and spread them out on that
+ * vdev as much as possible. If it turns out to not be possible,
+ * gradually lower our standards until anything becomes acceptable.
+ * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+ * gives us hope of containing our fault domains to something we're
+ * able to reason about. Otherwise, any two top-level vdev failures
+ * will guarantee the loss of data. With consecutive allocation,
+ * only two adjacent top-level vdev failures will result in data loss.
+ *
+ * If we are doing gang blocks (hintdva is non-NULL), try to keep
+ * ourselves on the same vdev as our gang block header. That
+ * way, we can hope for locality in vdev_cache, plus it makes our
+ * fault domains something tractable.
*/
- mg = rotor = mc->mc_rotor;
+ if (hintdva) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+ mg = vd->vdev_mg;
+ } else if (d != 0) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+ mg = vd->vdev_mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
+ rotor = mg;
+
+top:
+ all_zero = B_TRUE;
do {
vd = mg->mg_vd;
+
+ distance = vd->vdev_asize >> dshift;
+ if (distance <= (1ULL << vd->vdev_ms_shift))
+ distance = 0;
+ else
+ all_zero = B_FALSE;
+
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
- msp = metaslab_group_alloc(mg, asize, &offset, txg);
- if (msp != NULL) {
- ASSERT(offset != -1ULL);
-
+ offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+ if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
* figure out whether the corresponding vdev is
@@ -740,10 +832,10 @@ metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
mc->mc_allocated = 0;
}
- DVA_SET_VDEV(dva, vd->vdev_id);
- DVA_SET_OFFSET(dva, offset);
- DVA_SET_GANG(dva, 0);
- DVA_SET_ASIZE(dva, asize);
+ DVA_SET_VDEV(&dva[d], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[d], offset);
+ DVA_SET_GANG(&dva[d], 0);
+ DVA_SET_ASIZE(&dva[d], asize);
return (0);
}
@@ -751,13 +843,46 @@ metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
mc->mc_allocated = 0;
} while ((mg = mg->mg_next) != rotor);
- DVA_SET_VDEV(dva, 0);
- DVA_SET_OFFSET(dva, 0);
- DVA_SET_GANG(dva, 0);
+ if (!all_zero) {
+ dshift++;
+ ASSERT(dshift < 64);
+ goto top;
+ }
+
+ bzero(&dva[d], sizeof (dva_t));
return (ENOSPC);
}
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies,
+ uint64_t txg, blkptr_t *hintbp)
+{
+ int d, error;
+ dva_t *dva = bp->blk_dva;
+ dva_t *hintdva = hintbp->blk_dva;
+
+ ASSERT(ncopies > 0 && ncopies <= spa_max_replication(spa));
+ ASSERT(BP_GET_NDVAS(bp) == 0);
+ ASSERT(hintbp == NULL || ncopies <= BP_GET_NDVAS(hintbp));
+
+ for (d = 0; d < ncopies; d++) {
+ error = metaslab_alloc_one(spa, psize, dva, d, hintdva, txg);
+ if (error) {
+ for (d--; d >= 0; d--) {
+ ASSERT(DVA_IS_VALID(&dva[d]));
+ metaslab_free(spa, &dva[d], txg, B_TRUE);
+ bzero(&dva[d], sizeof (dva_t));
+ }
+ return (ENOSPC);
+ }
+ }
+ ASSERT(error == 0);
+ ASSERT(BP_GET_NDVAS(bp) == ncopies);
+
+ return (0);
+}
+
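A minimal sketch of the new calling convention, for a caller that wants two copies and has no gang-header hint (spa, psize, txg, and the zio are assumed in scope; illustrative, not verbatim from this patch):

	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(BP_IS_HOLE(bp));	/* metaslab_alloc asserts BP_GET_NDVAS(bp) == 0 */
	error = metaslab_alloc(spa, psize, bp, 2, txg, NULL);
	if (error != 0)
		return (error);	/* partial DVAs already freed; bp is still a hole */
	ASSERT(BP_GET_NDVAS(bp) == 2);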
/*
* Free the block represented by DVA in the context of the specified
* transaction group.
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index f4ecf519cd..95f633eac1 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -940,10 +940,13 @@ spa_tryimport(nvlist_t *tryconfig)
* configuration from the cache afterwards.
*/
static int
-spa_export_common(char *pool, int new_state)
+spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
spa_t *spa;
+ if (oldconfig)
+ *oldconfig = NULL;
+
if (!(spa_mode & FWRITE))
return (EROFS);
@@ -1011,6 +1014,9 @@ spa_export_common(char *pool, int new_state)
spa_deactivate(spa);
}
+ if (oldconfig && spa->spa_config)
+ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
if (new_state != POOL_STATE_UNINITIALIZED) {
spa_remove(spa);
spa_config_sync();
@@ -1026,16 +1032,16 @@ spa_export_common(char *pool, int new_state)
int
spa_destroy(char *pool)
{
- return (spa_export_common(pool, POOL_STATE_DESTROYED));
+ return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}
/*
* Export a storage pool.
*/
int
-spa_export(char *pool)
+spa_export(char *pool, nvlist_t **oldconfig)
{
- return (spa_export_common(pool, POOL_STATE_EXPORTED));
+ return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}
/*
@@ -1045,7 +1051,7 @@ spa_export(char *pool)
int
spa_reset(char *pool)
{
- return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}
@@ -1497,7 +1503,7 @@ spa_scrub_io_done(zio_t *zio)
mutex_enter(&spa->spa_scrub_lock);
if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- vdev_t *vd = zio->io_vd;
+ vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
spa->spa_scrub_errors++;
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_scrub_errors++;
@@ -1535,9 +1541,12 @@ static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
blkptr_t *bp = &bc->bc_blkptr;
- vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+ vdev_t *vd = spa->spa_root_vdev;
+ dva_t *dva = bp->blk_dva;
+ int needs_resilver = B_FALSE;
+ int d;
- if (bc->bc_errno || vd == NULL) {
+ if (bc->bc_errno) {
/*
* We can't scrub this block, but we can continue to scrub
* the rest of the pool. Note the error and move along.
@@ -1546,43 +1555,52 @@ spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
spa->spa_scrub_errors++;
mutex_exit(&spa->spa_scrub_lock);
- if (vd != NULL) {
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
return (ERESTART);
}
ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
- /*
- * Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
- */
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
- mutex_exit(&vd->vdev_stat_lock);
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
- if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
- if (DVA_GET_GANG(&bp->blk_dva[0])) {
- /*
- * Gang members may be spread across multiple vdevs,
- * so the best we can do is look at the pool-wide DTL.
- * XXX -- it would be better to change our allocation
- * policy to ensure that this can't happen.
- */
- vd = spa->spa_root_vdev;
- }
- if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
- spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
- ZIO_FLAG_RESILVER, &bc->bc_bookmark);
+ ASSERT(vd != NULL);
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
+ if (DVA_GET_GANG(&dva[d])) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best we can do is look at the
+ * pool-wide DTL.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that this can't
+ * happen.
+ */
+ vd = spa->spa_root_vdev;
+ }
+ if (vdev_dtl_contains(&vd->vdev_dtl_map,
+ bp->blk_birth, 1))
+ needs_resilver = B_TRUE;
}
- } else {
+ }
+
+ if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
ZIO_FLAG_SCRUB, &bc->bc_bookmark);
- }
+ else if (needs_resilver)
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+ ZIO_FLAG_RESILVER, &bc->bc_bookmark);
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index d12fe822a7..843b77d9ff 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -52,60 +52,60 @@
*
* spa_namespace_lock (global mutex)
*
- * This lock must be acquired to do any of the following:
+ * This lock must be acquired to do any of the following:
*
- * - Lookup a spa_t by name
- * - Add or remove a spa_t from the namespace
- * - Increase spa_refcount from non-zero
- * - Check if spa_refcount is zero
- * - Rename a spa_t
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
* - add/remove/attach/detach devices
- * - Held for the duration of create/destroy/import/export
+ * - Held for the duration of create/destroy/import/export
*
- * It does not need to handle recursion. A create or destroy may
- * reference objects (files or zvols) in other pools, but by
- * definition they must have an existing reference, and will never need
- * to lookup a spa_t by name.
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
*
* spa_refcount (per-spa refcount_t protected by mutex)
*
- * This reference count keep track of any active users of the spa_t. The
- * spa_t cannot be destroyed or freed while this is non-zero. Internally,
- * the refcount is never really 'zero' - opening a pool implicitly keeps
- * some references in the DMU. Internally we check against SPA_MINREF, but
- * present the image of a zero/non-zero value to consumers.
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against SPA_MINREF, but
+ * present the image of a zero/non-zero value to consumers.
*
* spa_config_lock (per-spa crazy rwlock)
*
- * This SPA special is a recursive rwlock, capable of being acquired from
- * asynchronous threads. It has protects the spa_t from config changes,
- * and must be held in the following circumstances:
+ * This SPA special is a recursive rwlock, capable of being acquired from
+ * asynchronous threads. It protects the spa_t from config changes,
+ * and must be held in the following circumstances:
*
- * - RW_READER to perform I/O to the spa
- * - RW_WRITER to change the vdev config
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
*
* spa_config_cache_lock (per-spa mutex)
*
- * This mutex prevents the spa_config nvlist from being updated. No
+ * This mutex prevents the spa_config nvlist from being updated. No
* other locks are required to obtain this lock, although implicitly you
* must have the namespace lock or non-zero refcount to have any kind
* of spa_t pointer at all.
*
* The locking order is fairly straightforward:
*
- * spa_namespace_lock -> spa_refcount
+ * spa_namespace_lock -> spa_refcount
*
- * The namespace lock must be acquired to increase the refcount from 0
- * or to check if it is zero.
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
*
- * spa_refcount -> spa_config_lock
+ * spa_refcount -> spa_config_lock
*
- * There must be at least one valid reference on the spa_t to acquire
- * the config lock.
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
*
- * spa_namespace_lock -> spa_config_lock
+ * spa_namespace_lock -> spa_config_lock
*
- * The namespace lock must always be taken before the config lock.
+ * The namespace lock must always be taken before the config lock.
*
*
* The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
@@ -114,53 +114,53 @@
* The namespace is manipulated using the following functions, all of which require
* the spa_namespace_lock to be held.
*
- * spa_lookup() Lookup a spa_t by name.
+ * spa_lookup() Lookup a spa_t by name.
*
- * spa_add() Create a new spa_t in the namespace.
+ * spa_add() Create a new spa_t in the namespace.
*
- * spa_remove() Remove a spa_t from the namespace. This also
- * frees up any memory associated with the spa_t.
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
*
- * spa_next() Returns the next spa_t in the system, or the
- * first if NULL is passed.
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
*
- * spa_evict_all() Shutdown and remove all spa_t structures in
- * the system.
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
*
* spa_guid_exists() Determine whether a pool/device guid exists.
*
* The spa_refcount is manipulated using the following functions:
*
- * spa_open_ref() Adds a reference to the given spa_t. Must be
- * called with spa_namespace_lock held if the
- * refcount is currently zero.
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
*
- * spa_close() Remove a reference from the spa_t. This will
- * not free the spa_t or remove it from the
- * namespace. No locking is required.
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
*
- * spa_refcount_zero() Returns true if the refcount is currently
- * zero. Must be called with spa_namespace_lock
- * held.
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
*
* The spa_config_lock is manipulated using the following functions:
*
- * spa_config_enter() Acquire the config lock as RW_READER or
- * RW_WRITER. At least one reference on the spa_t
- * must exist.
+ * spa_config_enter() Acquire the config lock as RW_READER or
+ * RW_WRITER. At least one reference on the spa_t
+ * must exist.
*
- * spa_config_exit() Release the config lock.
+ * spa_config_exit() Release the config lock.
*
- * spa_config_held() Returns true if the config lock is currently
- * held in the given state.
+ * spa_config_held() Returns true if the config lock is currently
+ * held in the given state.
*
* The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
*
- * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
* for writing.
*
- * spa_vdev_exit() Release the config lock, wait for all I/O
- * to complete, sync the updated configs to the
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, sync the updated configs to the
* cache, and release the namespace lock.
*
* The spa_name() function also requires either the spa_namespace_lock
@@ -173,6 +173,7 @@ static avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
static int spa_active_count;
+static int spa_max_replication_override = SPA_DVAS_PER_BP;
kmem_cache_t *spa_buffer_pool;
int spa_mode;
@@ -617,8 +618,7 @@ spa_get_random(uint64_t range)
void
sprintf_blkptr(char *buf, int len, blkptr_t *bp)
{
- /* XXBP - Need to see if we want all DVAs or not */
- dva_t *dva = BP_IDENTITY(bp);
+ int d;
if (bp == NULL) {
(void) snprintf(buf, len, "<NULL>");
@@ -630,20 +630,27 @@ sprintf_blkptr(char *buf, int len, blkptr_t *bp)
return;
}
- (void) snprintf(buf, len, "[L%llu %s] vdev=%llu offset=%llx "
- "size=%llxL/%llxP/%llxA %s %s %s %s "
- "birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
+ (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
(u_longlong_t)BP_GET_LEVEL(bp),
dmu_ot[BP_GET_TYPE(bp)].ot_name,
- (u_longlong_t)DVA_GET_VDEV(dva),
- (u_longlong_t)DVA_GET_OFFSET(dva),
(u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp),
- (u_longlong_t)DVA_GET_ASIZE(dva),
+ (u_longlong_t)BP_GET_PSIZE(bp));
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ dva_t *dva = &bp->blk_dva[d];
+ (void) snprintf(buf + strlen(buf), len - strlen(buf),
+ "DVA[%d]=<%llu:%llx:%llx> ", d,
+ (u_longlong_t)DVA_GET_VDEV(dva),
+ (u_longlong_t)DVA_GET_OFFSET(dva),
+ (u_longlong_t)DVA_GET_ASIZE(dva));
+ }
+
+ (void) snprintf(buf + strlen(buf), len - strlen(buf),
+ "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
- DVA_GET_GANG(dva) == 0 ? "contiguous" : "gang",
+ BP_IS_GANG(bp) ? "gang" : "contiguous",
(u_longlong_t)bp->blk_birth,
(u_longlong_t)bp->blk_fill,
(u_longlong_t)bp->blk_cksum.zc_word[0],
@@ -796,8 +803,29 @@ spa_get_asize(spa_t *spa, uint64_t lsize)
/*
* For now, the worst case is 512-byte RAID-Z blocks, in which
* case the space requirement is exactly 2x; so just assume that.
+ * Add to this the fact that we can have up to 3 DVAs per bp, and
+ * we have to multiply by a total of 6x.
*/
- return (lsize << 1);
+ return (lsize * 6);
+}
+
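For a hypothetical 128 KB logical block this budgets 128 KB x 2 (worst-case 512-byte-block RAID-Z expansion) x 3 (maximum DVAs per bp) = 768 KB of allocatable space.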
+uint64_t
+spa_version(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_version);
+}
+
+int
+spa_max_replication(spa_t *spa)
+{
+ /*
+ * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
+ * handle BPs with more than one DVA allocated. Set our max
+ * replication level accordingly.
+ */
+ if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
+ return (1);
+ return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index 1a93d4e4ca..811ac94436 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -75,7 +75,7 @@ int arc_referenced(arc_buf_t *buf);
int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_done_func_t *done, void *private, int priority, int flags,
uint32_t arc_flags, zbookmark_t *zb);
-int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *done, void *private, int priority, int flags,
uint32_t arc_flags, zbookmark_t *zb);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 70a94147b5..78dd9632e6 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -56,6 +56,8 @@ struct dsl_pool;
struct dnode;
struct drr_begin;
struct drr_end;
+struct zbookmark;
+struct spa;
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
@@ -263,6 +265,12 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx);
/*
+ * Decide how many copies of a given block we should make. Can be from
+ * 1 to SPA_DVAS_PER_BP.
+ */
+int dmu_get_replication_level(struct spa *spa, struct zbookmark *zb,
+ dmu_object_type_t ot);
+/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
* dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index ef2a9a2b89..c72b5ddf16 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -47,7 +47,8 @@ extern void metaslab_fini(metaslab_t *msp);
extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
-extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg);
+extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
+ int ncopies, uint64_t txg, blkptr_t *hintbp);
extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now);
extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index cbe8257953..265d19f63a 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -234,6 +234,16 @@ typedef struct blkptr {
(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+#define BP_GET_NDVAS(bp) \
+ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_COUNT_GANG(bp) \
+ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[2]))
+
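The !! in BP_GET_NDVAS() normalizes each nonzero ASIZE to 1, so the macro counts occupied DVA slots; BP_COUNT_GANG() relies on DVA_GET_GANG() already being 0 or 1. A worked case, assuming a bp whose first two slots were filled by metaslab_alloc():

    /* Illustrative: two allocated DVAs, third slot empty. */
    ASSERT3U(DVA_GET_ASIZE(&bp->blk_dva[2]), ==, 0);
    ASSERT3U(BP_GET_NDVAS(bp), ==, 2);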
#define DVA_EQUAL(dva1, dva2) \
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
(dva1)->dva_word[0] == (dva2)->dva_word[0])
@@ -248,9 +258,9 @@ typedef struct blkptr {
(zcp)->zc_word[3] = w3; \
}
-#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
-
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
+#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
#define BP_ZERO(bp) \
{ \
@@ -281,7 +291,7 @@ typedef struct blkptr {
#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
-#define BP_SPRINTF_LEN 256
+#define BP_SPRINTF_LEN 320
#include <sys/dmu.h>
@@ -297,7 +307,7 @@ extern int spa_create(const char *pool, nvlist_t *config, const char *altroot);
extern int spa_import(const char *pool, nvlist_t *config, const char *altroot);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
-extern int spa_export(char *pool);
+extern int spa_export(char *pool, nvlist_t **oldconfig);
extern int spa_reset(char *pool);
extern void spa_async_request(spa_t *spa, int flag);
extern void spa_async_suspend(spa_t *spa);
@@ -387,6 +397,8 @@ extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
extern uint64_t spa_get_alloc(spa_t *spa);
extern uint64_t spa_get_space(spa_t *spa);
extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_version(spa_t *spa);
+extern int spa_max_replication(spa_t *spa);
extern int spa_busy(void);
/* Miscellaneous support routines */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index f9cf5d3354..c8d5db50f5 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -80,6 +80,7 @@ extern void vdev_stat_update(zio_t *zio);
extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
boolean_t complete);
extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_propagate_state(vdev_t *vd);
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 268581336a..66c9a910ca 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -34,6 +34,7 @@
#include <sys/avl.h>
#include <sys/dkio.h>
#include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
#ifdef __cplusplus
extern "C" {
@@ -58,9 +59,8 @@ typedef struct zio_block_tail {
(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) / \

sizeof (uint64_t))
-#define ZIO_GET_DVA(zio) (&(zio)->io_bp->blk_dva[(zio)->io_dva_index])
#define ZIO_GET_IOSIZE(zio) \
- (DVA_GET_GANG(ZIO_GET_DVA(zio)) ? \
+ (BP_IS_GANG((zio)->io_bp) ? \
SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
typedef struct zio_gbh {
@@ -152,7 +152,6 @@ enum zio_compress {
typedef struct zio zio_t;
typedef void zio_done_func_t(zio_t *zio);
-typedef struct zio_transform zio_transform_t;
extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
extern char *zio_type_name[ZIO_TYPES];
@@ -190,9 +189,9 @@ struct zio {
zio_t *io_root;
spa_t *io_spa;
zbookmark_t io_bookmark;
- int io_checksum;
- int io_compress;
- int io_dva_index;
+ enum zio_checksum io_checksum;
+ enum zio_compress io_compress;
+ int io_ndvas;
uint64_t io_txg;
blkptr_t *io_bp;
blkptr_t io_bp_copy;
@@ -225,8 +224,8 @@ struct zio {
/* Internal pipeline state */
int io_flags;
- uint8_t io_type;
- uint8_t io_stage;
+ enum zio_type io_type;
+ enum zio_stage io_stage;
uint8_t io_stalled;
uint8_t io_priority;
struct dk_callback io_dk_callback;
@@ -257,7 +256,7 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
int priority, int flags, zbookmark_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
zio_done_func_t *done, void *private, int priority, int flags,
zbookmark_t *zb);
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
index e1abf0e49d..d2ddbc34e9 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -61,9 +61,6 @@ typedef enum zio_stage {
ZIO_STAGE_READY, /* RWFCI */
- ZIO_STAGE_DVA_TRANSLATE, /* RW--- */
-
- ZIO_STAGE_VDEV_IO_SETUP, /* RW--I */
ZIO_STAGE_VDEV_IO_START, /* RW--I */
ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
@@ -88,8 +85,7 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_READ_DECOMPRESS))
#define ZIO_VDEV_IO_PIPELINE \
- ((1U << ZIO_STAGE_VDEV_IO_SETUP) | \
- (1U << ZIO_STAGE_VDEV_IO_START) | \
+ ((1U << ZIO_STAGE_VDEV_IO_START) | \
(1U << ZIO_STAGE_VDEV_IO_DONE) | \
(1U << ZIO_STAGE_VDEV_IO_ASSESS))
@@ -103,8 +99,7 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_DONE))
#define ZIO_READ_PIPELINE \
- ((1U << ZIO_STAGE_DVA_TRANSLATE) | \
- ZIO_READ_PHYS_PIPELINE)
+ ZIO_READ_PHYS_PIPELINE
#define ZIO_WRITE_PHYS_PIPELINE \
((1U << ZIO_STAGE_OPEN) | \
@@ -116,8 +111,7 @@ typedef enum zio_stage {
(1U << ZIO_STAGE_DONE))
#define ZIO_WRITE_COMMON_PIPELINE \
- ((1U << ZIO_STAGE_DVA_TRANSLATE) | \
- ZIO_WRITE_PHYS_PIPELINE)
+ ZIO_WRITE_PHYS_PIPELINE
#define ZIO_WRITE_PIPELINE \
((1U << ZIO_STAGE_WRITE_COMPRESS) | \
@@ -193,6 +187,7 @@ typedef enum zio_stage {
#define ZIO_ERROR_PIPELINE_MASK \
ZIO_WAIT_FOR_CHILDREN_PIPELINE
+typedef struct zio_transform zio_transform_t;
struct zio_transform {
void *zt_data;
uint64_t zt_size;
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 4c216b4ee5..7836041872 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -847,31 +847,16 @@ void
vdev_reopen(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- int c;
ASSERT(spa_config_held(spa, RW_WRITER));
- if (vd == rvd) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_reopen(rvd->vdev_child[c]);
- return;
- }
-
- /* only valid for top-level vdevs */
- ASSERT3P(vd, ==, vd->vdev_top);
-
vdev_close(vd);
(void) vdev_open(vd);
/*
* Reassess root vdev's health.
*/
- rvd->vdev_state = VDEV_STATE_HEALTHY;
- for (c = 0; c < rvd->vdev_children; c++) {
- uint64_t state = rvd->vdev_child[c]->vdev_state;
- rvd->vdev_state = MIN(rvd->vdev_state, state);
- }
+ vdev_propagate_state(spa->spa_root_vdev);
}
int
@@ -1741,6 +1726,39 @@ vdev_config_clean(vdev_t *vd)
list_remove(&spa->spa_dirty_list, vd);
}
+void
+vdev_propagate_state(vdev_t *vd)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int degraded = 0, faulted = 0;
+ int corrupted = 0;
+ int c;
+ vdev_t *child;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ child = vd->vdev_child[c];
+ if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
+ faulted++;
+ else if (child->vdev_state == VDEV_STATE_DEGRADED)
+ degraded++;
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
+ }
+
+ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+ /*
+ * Root special: if there is a toplevel vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+}
+
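The faulted/degraded tallies feed each vdev type's vdev_op_state_change callback. What that means for a 2-way mirror, as a sketch of the expected outcomes rather than new code:

    /*
     * Illustrative outcomes for a 2-way mirror child:
     *   faulted 0, degraded 0  -> VDEV_STATE_HEALTHY
     *   faulted 1              -> VDEV_STATE_DEGRADED
     *   faulted 2              -> VDEV_STATE_CANT_OPEN,
     *                             VDEV_AUX_NO_REPLICAS
     */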
/*
* Set a vdev's state. If this is during an open, we don't update the parent
* state, because we're in the process of opening children depth-first.
@@ -1810,36 +1828,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
if (isopen)
return;
- if (vd->vdev_parent != NULL) {
- int c;
- int degraded = 0, faulted = 0;
- int corrupted = 0;
- vdev_t *parent, *child;
-
- parent = vd->vdev_parent;
- for (c = 0; c < parent->vdev_children; c++) {
- child = parent->vdev_child[c];
- if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
- faulted++;
- else if (child->vdev_state == VDEV_STATE_DEGRADED)
- degraded++;
-
- if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
- corrupted++;
- }
-
- vd->vdev_parent->vdev_ops->vdev_op_state_change(
- vd->vdev_parent, faulted, degraded);
-
- /*
- * Root special: if this is a toplevel vdev that cannot be
- * opened due to corrupted metadata, then propagate the root
- * vdev's aux state as 'corrupt' rather than 'insufficient
- * replicas'.
- */
- if (corrupted && vd == vd->vdev_top)
- vdev_set_state(vd->vdev_spa->spa_root_vdev,
- B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- }
+ if (vd->vdev_parent != NULL)
+ vdev_propagate_state(vd->vdev_parent);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index ee5732a59c..d79c38a32e 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -35,25 +35,85 @@
* Virtual device vector for mirroring.
*/
+typedef struct mirror_child {
+ vdev_t *mc_vd;
+ uint64_t mc_offset;
+ int mc_error;
+ short mc_tried;
+ short mc_skipped;
+} mirror_child_t;
+
typedef struct mirror_map {
- int mm_error;
- short mm_tried;
- short mm_skipped;
+ int mm_children;
+ int mm_replacing;
+ int mm_preferred;
+ int mm_root;
+ mirror_child_t mm_child[1];
} mirror_map_t;
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
- zio->io_vsd = kmem_zalloc(zio->io_vd->vdev_children *
- sizeof (mirror_map_t), KM_SLEEP);
- return (zio->io_vsd);
+ mirror_map_t *mm = NULL;
+ mirror_child_t *mc;
+ vdev_t *vd = zio->io_vd;
+ int c, d;
+
+ if (vd == NULL) {
+ dva_t *dva = zio->io_bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+
+ c = BP_GET_NDVAS(zio->io_bp);
+
+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+ mm->mm_children = c;
+ mm->mm_replacing = B_FALSE;
+ mm->mm_preferred = spa_get_random(c);
+ mm->mm_root = B_TRUE;
+
+ /*
+ * Check the other, lower-index DVAs to see if they're on
+ * the same vdev as the child we picked. If they are, use
+ * them since they are likely to have been allocated from
+ * the primary metaslab in use at the time, and hence are
+ * more likely to have locality with single-copy data.
+ */
+ for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
+ if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
+ mm->mm_preferred = d;
+ }
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
+ mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
+ }
+ } else {
+ c = vd->vdev_children;
+
+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+ mm->mm_children = c;
+ mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops);
+ mm->mm_preferred = mm->mm_replacing ? 0 : spa_get_random(c);
+ mm->mm_root = B_FALSE;
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ mc->mc_vd = vd->vdev_child[c];
+ mc->mc_offset = zio->io_offset;
+ }
+ }
+
+ zio->io_vsd = mm;
+ return (mm);
}
static void
vdev_mirror_map_free(zio_t *zio)
{
- kmem_free(zio->io_vsd,
- zio->io_vd->vdev_children * sizeof (mirror_map_t));
+ mirror_map_t *mm = zio->io_vsd;
+
+ kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
zio->io_vsd = NULL;
}
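mirror_map_t is now a variable-length structure, so both the alloc and free paths must size it with the same offsetof() over the trailing mm_child[] array. The idiom in isolation, assuming n children:

    /* Illustrative: allocate and free an n-child mirror_map_t. */
    size_t len = offsetof(mirror_map_t, mm_child[n]);
    mirror_map_t *mm = kmem_zalloc(len, KM_SLEEP);
    mm->mm_children = n;
    /* ... fill in mm_child[0 .. n-1], do the I/O ... */
    kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));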
@@ -103,30 +163,31 @@ vdev_mirror_close(vdev_t *vd)
static void
vdev_mirror_child_done(zio_t *zio)
{
- mirror_map_t *mm = zio->io_private;
+ mirror_child_t *mc = zio->io_private;
- mm->mm_error = zio->io_error;
- mm->mm_tried = 1;
- mm->mm_skipped = 0;
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
}
static void
vdev_mirror_scrub_done(zio_t *zio)
{
- mirror_map_t *mm = zio->io_private;
+ mirror_child_t *mc = zio->io_private;
if (zio->io_error == 0) {
zio_t *pio = zio->io_parent;
mutex_enter(&pio->io_lock);
+ ASSERT3U(zio->io_size, >=, pio->io_size);
bcopy(zio->io_data, pio->io_data, pio->io_size);
mutex_exit(&pio->io_lock);
}
zio_buf_free(zio->io_data, zio->io_size);
- mm->mm_error = zio->io_error;
- mm->mm_tried = 1;
- mm->mm_skipped = 0;
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
}
static void
@@ -144,60 +205,42 @@ static int
vdev_mirror_child_select(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- vdev_t *vd = zio->io_vd;
- vdev_t *cvd;
+ mirror_child_t *mc;
uint64_t txg = zio->io_txg;
int i, c;
ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
/*
- * Select the child we'd like to read from absent any errors.
- * The current policy is to alternate sides at 8M granularity.
- * XXX -- investigate other policies for read distribution.
- */
- c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children;
-
- /*
- * If this is a replacing vdev, always try child 0 (the source) first.
- */
- if (vd->vdev_ops == &vdev_replacing_ops)
- c = 0;
-
- /*
* Try to find a child whose DTL doesn't contain the block to read.
* If a child is known to be completely inaccessible (indicated by
* vdev_is_dead() returning B_TRUE), don't even try.
*/
- for (i = 0; i < vd->vdev_children; i++, c++) {
- if (c >= vd->vdev_children)
+ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
+ if (c >= mm->mm_children)
c = 0;
- if (mm[c].mm_tried || mm[c].mm_skipped)
+ mc = &mm->mm_child[c];
+ if (mc->mc_tried || mc->mc_skipped)
continue;
- cvd = vd->vdev_child[c];
- if (vdev_is_dead(cvd)) {
- mm[c].mm_error = ENXIO;
- mm[c].mm_tried = 1; /* don't even try */
- mm[c].mm_skipped = 1;
+ if (vdev_is_dead(mc->mc_vd)) {
+ mc->mc_error = ENXIO;
+ mc->mc_tried = 1; /* don't even try */
+ mc->mc_skipped = 1;
continue;
}
- if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1))
+ if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
return (c);
- mm[c].mm_error = ESTALE;
- mm[c].mm_skipped = 1;
+ mc->mc_error = ESTALE;
+ mc->mc_skipped = 1;
}
/*
* Every device is either missing or has this txg in its DTL.
- * If we don't have any sibling replicas to consult, look for
- * any child we haven't already tried before giving up.
+ * Look for any child we haven't already tried before giving up.
*/
- if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) {
- for (c = 0; c < vd->vdev_children; c++) {
- if (!mm[c].mm_tried)
- return (c);
- }
- }
+ for (c = 0; c < mm->mm_children; c++)
+ if (!mm->mm_child[c].mc_tried)
+ return (c);
/*
* Every child failed. There's no place left to look.
@@ -208,28 +251,28 @@ vdev_mirror_child_select(zio_t *zio)
static void
vdev_mirror_io_start(zio_t *zio)
{
- vdev_t *vd = zio->io_vd;
mirror_map_t *mm;
+ mirror_child_t *mc;
int c, children;
mm = vdev_mirror_map_alloc(zio);
if (zio->io_type == ZIO_TYPE_READ) {
- if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
- vd->vdev_ops != &vdev_replacing_ops) {
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
/*
* For scrubbing reads we need to allocate a read
* buffer for each child and issue reads to all
* children. If any child succeeds, it will copy its
* data into zio->io_data in vdev_mirror_scrub_done.
*/
- for (c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- vd->vdev_child[c], zio->io_offset,
+ mc->mc_vd, mc->mc_offset,
zio_buf_alloc(zio->io_size), zio->io_size,
zio->io_type, zio->io_priority,
- ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done,
- &mm[c]));
+ ZIO_FLAG_CANFAIL,
+ vdev_mirror_scrub_done, mc));
}
zio_wait_children_done(zio);
return;
@@ -248,23 +291,23 @@ vdev_mirror_io_start(zio_t *zio)
* first child happens to have a DTL entry here as well.
* All other writes go to all children.
*/
- if ((zio->io_flags & ZIO_FLAG_RESILVER) &&
- vd->vdev_ops == &vdev_replacing_ops &&
- !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map,
+ if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
+ !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
zio->io_txg, 1)) {
- c = vd->vdev_children - 1;
+ c = mm->mm_children - 1;
children = 1;
} else {
c = 0;
- children = vd->vdev_children;
+ children = mm->mm_children;
}
}
while (children--) {
+ mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- vd->vdev_child[c], zio->io_offset, zio->io_data,
- zio->io_size, zio->io_type, zio->io_priority,
- ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c]));
+ mc->mc_vd, mc->mc_offset,
+ zio->io_data, zio->io_size, zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc));
c++;
}
@@ -274,20 +317,19 @@ vdev_mirror_io_start(zio_t *zio)
static void
vdev_mirror_io_done(zio_t *zio)
{
- vdev_t *vd = zio->io_vd;
- vdev_t *cvd;
mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
int c;
int good_copies = 0;
int unexpected_errors = 0;
- ASSERT(mm != NULL);
-
zio->io_error = 0;
zio->io_numerrors = 0;
- for (c = 0; c < vd->vdev_children; c++) {
- if (mm[c].mm_tried && mm[c].mm_error == 0) {
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_tried && mc->mc_error == 0) {
good_copies++;
continue;
}
@@ -296,10 +338,10 @@ vdev_mirror_io_done(zio_t *zio)
* We preserve any EIOs because those may be worth retrying;
* whereas ECKSUM and ENXIO are more likely to be persistent.
*/
- if (mm[c].mm_error) {
+ if (mc->mc_error) {
if (zio->io_error != EIO)
- zio->io_error = mm[c].mm_error;
- if (!mm[c].mm_skipped)
+ zio->io_error = mc->mc_error;
+ if (!mc->mc_skipped)
unexpected_errors++;
zio->io_numerrors++;
}
@@ -308,11 +350,12 @@ vdev_mirror_io_done(zio_t *zio)
if (zio->io_type == ZIO_TYPE_WRITE) {
/*
* XXX -- for now, treat partial writes as success.
+ * XXX -- For a replacing vdev, we need to make sure the
+ * new child succeeds.
*/
/* XXPOLICY */
if (good_copies != 0)
zio->io_error = 0;
- ASSERT(mm != NULL);
vdev_mirror_map_free(zio);
zio_next_stage(zio);
return;
@@ -325,17 +368,16 @@ vdev_mirror_io_done(zio_t *zio)
*/
/* XXPOLICY */
if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
- ASSERT(c >= 0 && c < vd->vdev_children);
- cvd = vd->vdev_child[c];
- dprintf("%s: retrying i/o (err=%d) on child %s\n",
- vdev_description(zio->io_vd), zio->io_error,
- vdev_description(cvd));
+ ASSERT(c >= 0 && c < mm->mm_children);
+ mc = &mm->mm_child[c];
+ dprintf("retrying i/o (err=%d) on child %s\n",
+ zio->io_error, vdev_description(mc->mc_vd));
zio->io_error = 0;
zio_vdev_io_redone(zio);
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
- zio->io_offset, zio->io_data, zio->io_size,
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_mirror_child_done, &mm[c]));
+ vdev_mirror_child_done, mc));
zio_wait_children_done(zio);
return;
}
@@ -360,7 +402,7 @@ vdev_mirror_io_done(zio_t *zio)
rio = zio_null(zio, zio->io_spa,
vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
- for (c = 0; c < vd->vdev_children; c++) {
+ for (c = 0; c < mm->mm_children; c++) {
/*
* Don't rewrite known good children.
* Not only is it unnecessary, it could
@@ -368,24 +410,23 @@ vdev_mirror_io_done(zio_t *zio)
* actually be harmful: if the system lost
* power while rewriting the only good copy,
* there would be no good copies left!
*/
- cvd = vd->vdev_child[c];
+ mc = &mm->mm_child[c];
- if (mm[c].mm_error == 0) {
- if (mm[c].mm_tried)
+ if (mc->mc_error == 0) {
+ if (mc->mc_tried)
continue;
- if (!vdev_dtl_contains(&cvd->vdev_dtl_map,
+ if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
zio->io_txg, 1))
continue;
- mm[c].mm_error = ESTALE;
+ mc->mc_error = ESTALE;
}
- dprintf("%s resilvered %s @ 0x%llx error %d\n",
- vdev_description(vd),
- vdev_description(cvd),
- zio->io_offset, mm[c].mm_error);
+ dprintf("resilvered %s @ 0x%llx error %d\n",
+ vdev_description(mc->mc_vd), mc->mc_offset,
+ mc->mc_error);
- zio_nowait(zio_vdev_child_io(rio, zio->io_bp, cvd,
- zio->io_offset, zio->io_data, zio->io_size,
+ zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
+ mc->mc_offset, zio->io_data, zio->io_size,
ZIO_TYPE_WRITE, zio->io_priority,
ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 6e69053b8a..33225de39b 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -272,12 +272,7 @@ vdev_raidz_io_start(zio_t *zio)
rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children);
- if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
- ASSERT3U(rm->rm_asize, ==,
- vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
- } else {
- ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
- }
+ ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -357,11 +352,10 @@ vdev_raidz_io_done(zio_t *zio)
vdev_t *cvd;
raidz_map_t *rm = zio->io_vsd;
raidz_col_t *rc;
- blkptr_t *bp = zio->io_bp;
int unexpected_errors = 0;
int c;
- ASSERT(bp != NULL); /* XXX need to add code to enforce this */
+ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
zio->io_error = 0;
zio->io_numerrors = 0;
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
index 9ffdc8fba1..0e8752c6ce 100644
--- a/usr/src/uts/common/fs/zfs/vdev_root.c
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -35,12 +35,29 @@
* Virtual device vector for the pool's root vdev.
*/
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata. Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
+ * place to live. When we get smarter, we can liberalize this policy.
+ * e.g. If we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine. Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
+ */
+/*ARGSUSED*/
+static int
+too_many_errors(vdev_t *vd, int numerrors)
+{
+ return (numerrors > 0);
+}
+
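Note that the shipped predicate still tolerates zero failures at open time; a liberalized variant per the comment above might look like the following (hypothetical, NOT part of this change):

    /* Hypothetical: tolerate one lost top-level vdev, per the comment. */
    static int
    too_many_errors_liberal(vdev_t *vd, int numerrors)
    {
            return (numerrors > 1);
    }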
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
vdev_t *cvd;
int c, error;
int lasterror = 0;
+ int numerrors = 0;
if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
@@ -52,17 +69,20 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
if ((error = vdev_open(cvd)) != 0) {
lasterror = error;
+ numerrors++;
continue;
}
}
- if (lasterror)
+ if (too_many_errors(vd, numerrors)) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
*asize = 0;
*ashift = 0;
- return (lasterror);
+ return (0);
}
static void
@@ -77,7 +97,7 @@ vdev_root_close(vdev_t *vd)
static void
vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
{
- if (faulted > 0)
+ if (too_many_errors(vd, faulted))
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
else if (degraded != 0)
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index cd7e79a8be..0cff445cf3 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -392,7 +392,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
static int
zfs_ioc_pool_export(zfs_cmd_t *zc)
{
- return (spa_export(zc->zc_name));
+ return (spa_export(zc->zc_name, NULL));
}
static int
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 050db0ff34..373d0c41d0 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -248,8 +248,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_bp = bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
- /* XXBP - Need to inherit this when it matters */
- zio->io_dva_index = 0;
}
zio->io_done = done;
zio->io_private = private;
@@ -279,6 +277,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
if (pio->io_child != NULL)
pio->io_child->io_sibling_prev = zio;
pio->io_child = zio;
+ zio->io_ndvas = pio->io_ndvas;
mutex_exit(&pio->io_lock);
}
@@ -310,7 +309,6 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
int priority, int flags, zbookmark_t *zb)
{
zio_t *zio;
- dva_t *dva;
ASSERT3U(size, ==, BP_GET_LSIZE(bp));
@@ -325,9 +323,6 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
*/
zio->io_bp = &zio->io_bp_copy;
- bp = zio->io_bp;
- dva = ZIO_GET_DVA(zio);
-
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
uint64_t csize = BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(csize);
@@ -336,7 +331,7 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
}
- if (DVA_GET_GANG(dva)) {
+ if (BP_IS_GANG(bp)) {
uint64_t gsize = SPA_GANGBLOCKSIZE;
void *gbuf = zio_buf_alloc(gsize);
@@ -348,7 +343,7 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
}
zio_t *
-zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
zio_done_func_t *done, void *private, int priority, int flags,
zbookmark_t *zb)
@@ -371,6 +366,7 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
zio->io_checksum = checksum;
zio->io_compress = compress;
+ zio->io_ndvas = ncopies;
if (compress != ZIO_COMPRESS_OFF)
zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
@@ -380,6 +376,10 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
BP_ZERO(bp);
BP_SET_LSIZE(bp, size);
BP_SET_PSIZE(bp, size);
+ } else {
+ /* Make sure someone doesn't change their mind on overwrites */
+ ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
return (zio);
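The overwrite assertion pins down the invariant that a bp being rewritten must already hold exactly the DVAs this write implies, with a gang bp carrying one extra DVA for its header. Worked cases, assuming spa_max_replication(spa) == 3:

    /*
     * Illustrative: a non-gang bp written with ncopies = 2:
     *   MIN(2 + 0, 3) == 2 == BP_GET_NDVAS(bp)
     * a gang bp whose data requested 2 copies (header gets one more):
     *   MIN(2 + 1, 3) == 3 == BP_GET_NDVAS(bp)
     */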
@@ -393,7 +393,6 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
{
zio_t *zio;
- /* XXBP - We need to re-evaluate when to insert pipeline stages */
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
@@ -402,6 +401,9 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
zio->io_checksum = checksum;
zio->io_compress = ZIO_COMPRESS_OFF;
+ if (pio != NULL)
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+
return (zio);
}
@@ -441,7 +443,6 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
return (zio_null(pio, spa, NULL, NULL, 0));
}
- /* XXBP - We need to re-evaluate when to insert pipeline stages */
zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
@@ -471,7 +472,6 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
ASSERT3U(spa_first_txg(spa), <=, txg);
- /* XXBP - We need to re-evaluate when to insert pipeline stages */
zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
@@ -623,7 +623,7 @@ zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
done, private, type, priority,
(zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
- ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);
+ ZIO_STAGE_VDEV_IO_START - 1, pipeline);
cio->io_vd = vd;
cio->io_offset = offset;
@@ -748,8 +748,13 @@ zio_done(zio_t *zio)
ASSERT(bp->blk_pad[2] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
- !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ if (zio->io_ndvas != 0)
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+ ASSERT(BP_COUNT_GANG(bp) == 0 ||
+ (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+ }
}
if (vd != NULL)
@@ -902,6 +907,7 @@ zio_write_compress(zio_t *zio)
BP_ZERO(bp);
zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
} else {
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
BP_SET_LSIZE(bp, lsize);
BP_SET_PSIZE(bp, csize);
BP_SET_COMPRESS(bp, compress);
@@ -946,7 +952,7 @@ zio_gang_pipeline(zio_t *zio)
* By default, the pipeline assumes that we're dealing with a gang
* block. If we're not, strip out any gang-specific stages.
*/
- if (!DVA_GET_GANG(ZIO_GET_DVA(zio)))
+ if (!BP_IS_GANG(zio->io_bp))
zio->io_pipeline &= ~ZIO_GANG_STAGES;
zio_next_stage(zio);
@@ -968,7 +974,7 @@ zio_get_gang_header(zio_t *zio)
uint64_t gsize = SPA_GANGBLOCKSIZE;
void *gbuf = zio_buf_alloc(gsize);
- ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+ ASSERT(BP_IS_GANG(bp));
zio_push_transform(zio, gbuf, gsize, gsize);
@@ -987,7 +993,7 @@ zio_read_gang_members(zio_t *zio)
uint64_t gsize, gbufsize, loff, lsize;
int i;
- ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+ ASSERT(BP_IS_GANG(zio->io_bp));
zio_gang_byteswap(zio);
zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1019,7 +1025,7 @@ zio_rewrite_gang_members(zio_t *zio)
uint64_t gsize, gbufsize, loff, lsize;
int i;
- ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+ ASSERT(BP_IS_GANG(zio->io_bp));
ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
zio_gang_byteswap(zio);
@@ -1054,7 +1060,7 @@ zio_free_gang_members(zio_t *zio)
uint64_t gsize, gbufsize;
int i;
- ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+ ASSERT(BP_IS_GANG(zio->io_bp));
zio_gang_byteswap(zio);
zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1079,7 +1085,7 @@ zio_claim_gang_members(zio_t *zio)
uint64_t gsize, gbufsize;
int i;
- ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+ ASSERT(BP_IS_GANG(zio->io_bp));
zio_gang_byteswap(zio);
zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1100,17 +1106,23 @@ static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
zio_t *pio = zio->io_parent;
- dva_t *cdva = ZIO_GET_DVA(zio);
- dva_t *pdva = ZIO_GET_DVA(pio);
+ dva_t *cdva = zio->io_bp->blk_dva;
+ dva_t *pdva = pio->io_bp->blk_dva;
uint64_t asize;
+ int d;
- ASSERT(DVA_GET_GANG(pdva));
+ ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
+ ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
- /* XXBP - Need to be careful here with multiple DVAs */
mutex_enter(&pio->io_lock);
- asize = DVA_GET_ASIZE(pdva);
- asize += DVA_GET_ASIZE(cdva);
- DVA_SET_ASIZE(pdva, asize);
+ for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
+ ASSERT(DVA_GET_GANG(&pdva[d]));
+ asize = DVA_GET_ASIZE(&pdva[d]);
+ asize += DVA_GET_ASIZE(&cdva[d]);
+ DVA_SET_ASIZE(&pdva[d], asize);
+ }
mutex_exit(&pio->io_lock);
}
@@ -1118,41 +1130,50 @@ static void
zio_write_allocate_gang_members(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
- dva_t *dva = ZIO_GET_DVA(zio);
+ dva_t *dva = bp->blk_dva;
+ spa_t *spa = zio->io_spa;
zio_gbh_phys_t *gbh;
+ uint64_t txg = zio->io_txg;
uint64_t resid = zio->io_size;
uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
uint64_t gsize, loff, lsize;
uint32_t gbps_left;
+ int ndvas = zio->io_ndvas;
+ int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
int error;
- int i;
+ int i, d;
gsize = SPA_GANGBLOCKSIZE;
gbps_left = SPA_GBH_NBLKPTRS;
- error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
+ error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL);
if (error == ENOSPC)
panic("can't allocate gang block header");
ASSERT(error == 0);
- DVA_SET_GANG(dva, 1);
+ for (d = 0; d < gbh_ndvas; d++)
+ DVA_SET_GANG(&dva[d], 1);
- bp->blk_birth = zio->io_txg;
+ bp->blk_birth = txg;
gbh = zio_buf_alloc(gsize);
bzero(gbh, gsize);
+ /* We need to test multi-level gang blocks */
+ if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
+ maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
+
for (loff = 0, i = 0; loff != zio->io_size;
loff += lsize, resid -= lsize, gbps_left--, i++) {
blkptr_t *gbp = &gbh->zg_blkptr[i];
- dva = &gbp->blk_dva[0];
+ dva = gbp->blk_dva;
ASSERT(gbps_left != 0);
maxalloc = MIN(maxalloc, resid);
while (resid <= maxalloc * gbps_left) {
- error = metaslab_alloc(zio->io_spa, maxalloc, dva,
- zio->io_txg);
+ error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
+ txg, bp);
if (error == 0)
break;
ASSERT3U(error, ==, ENOSPC);
@@ -1166,9 +1187,9 @@ zio_write_allocate_gang_members(zio_t *zio)
BP_SET_LSIZE(gbp, lsize);
BP_SET_PSIZE(gbp, lsize);
BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
- gbp->blk_birth = zio->io_txg;
- zio_nowait(zio_rewrite(zio, zio->io_spa,
- zio->io_checksum, zio->io_txg, gbp,
+ gbp->blk_birth = txg;
+ zio_nowait(zio_rewrite(zio, spa,
+ zio->io_checksum, txg, gbp,
(char *)zio->io_data + loff, lsize,
zio_write_allocate_gang_member_done, NULL,
zio->io_priority, zio->io_flags,
@@ -1176,8 +1197,8 @@ zio_write_allocate_gang_members(zio_t *zio)
} else {
lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
ASSERT(lsize != SPA_MINBLOCKSIZE);
- zio_nowait(zio_write_allocate(zio, zio->io_spa,
- zio->io_checksum, zio->io_txg, gbp,
+ zio_nowait(zio_write_allocate(zio, spa,
+ zio->io_checksum, txg, gbp,
(char *)zio->io_data + loff, lsize,
zio_write_allocate_gang_member_done, NULL,
zio->io_priority, zio->io_flags));
@@ -1189,6 +1210,12 @@ zio_write_allocate_gang_members(zio_t *zio)
zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
zio_push_transform(zio, gbh, gsize, gsize);
+ /*
+ * As much as we'd like this to be zio_wait_children_ready(),
+ * updating our ASIZE doesn't happen until the io_done callback,
+ * so we have to wait for that to finish in order for our BP
+ * to be stable.
+ */
zio_wait_children_done(zio);
}
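gbh_ndvas above gives the gang header one more copy than the member data it describes (clamped to the pool maximum), since losing the header would lose every member bp it contains; the members themselves are allocated with the caller's ndvas, hinted by the header's bp for locality. In numbers, assuming a version-2 pool:

    /*
     * Illustrative: ndvas = 2, spa_max_replication() == 3:
     *   gbh_ndvas = MIN(2 + 1, 3) = 3 copies of the header,
     *   while each gang member is allocated with ndvas = 2,
     *   passing 'bp' as the allocation hint (hintbp).
     */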
@@ -1201,10 +1228,12 @@ static void
zio_dva_allocate(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
- dva_t *dva = ZIO_GET_DVA(zio);
int error;
ASSERT(BP_IS_HOLE(bp));
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+ ASSERT3U(zio->io_ndvas, >, 0);
+ ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa));
/* For testing, make some blocks above a certain size be gang blocks */
if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
@@ -1214,7 +1243,8 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
- error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);
+ error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
+ zio->io_txg, NULL);
if (error == 0) {
bp->blk_birth = zio->io_txg;
@@ -1233,11 +1263,13 @@ static void
zio_dva_free(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
- dva_t *dva = ZIO_GET_DVA(zio);
+ dva_t *dva = bp->blk_dva;
+ int d;
ASSERT(!BP_IS_HOLE(bp));
- metaslab_free(zio->io_spa, dva, zio->io_txg, B_FALSE);
+ for (d = 0; d < BP_GET_NDVAS(bp); d++)
+ metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE);
BP_ZERO(bp);
@@ -1248,31 +1280,17 @@ static void
zio_dva_claim(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
- dva_t *dva = ZIO_GET_DVA(zio);
+ dva_t *dva = bp->blk_dva;
+ int error = 0;
+ int d;
ASSERT(!BP_IS_HOLE(bp));
- zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg);
-
- zio_next_stage(zio);
-}
-
-static void
-zio_dva_translate(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
- dva_t *dva = ZIO_GET_DVA(zio);
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
-
- ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));
-
- zio->io_offset = offset;
-
- if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL)
- zio->io_error = ENXIO;
- else if (offset + zio->io_size > zio->io_vd->vdev_asize)
- zio->io_error = EOVERFLOW;
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg);
+ if (error)
+ zio->io_error = error;
+ }
zio_next_stage(zio);
}
@@ -1284,17 +1302,26 @@ zio_dva_translate(zio_t *zio)
*/
static void
-zio_vdev_io_setup(zio_t *zio)
+zio_vdev_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd->vdev_top;
- uint64_t align = 1ULL << tvd->vdev_ashift;
+ vdev_t *tvd = vd ? vd->vdev_top : NULL;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t align;
+
+ if (vd == NULL) {
+ /* The mirror_ops handle multiple DVAs in a single BP */
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return;
+ }
+
+ align = 1ULL << tvd->vdev_ashift;
- /* XXPOLICY */
if (zio->io_retries == 0 && vd == tvd)
zio->io_flags |= ZIO_FLAG_FAILFAST;
- if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+ vd->vdev_children == 0) {
zio->io_flags |= ZIO_FLAG_PHYSICAL;
zio->io_offset += VDEV_LABEL_START_SIZE;
}
@@ -1312,15 +1339,6 @@ zio_vdev_io_setup(zio_t *zio)
zio->io_flags |= ZIO_FLAG_SUBBLOCK;
}
- zio_next_stage(zio);
-}
-
-static void
-zio_vdev_io_start(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- uint64_t align = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
-
ASSERT(P2PHASE(zio->io_offset, align) == 0);
ASSERT(P2PHASE(zio->io_size, align) == 0);
ASSERT(bp == NULL ||
@@ -1335,7 +1353,11 @@ zio_vdev_io_start(zio_t *zio)
static void
zio_vdev_io_done(zio_t *zio)
{
- vdev_io_done(zio);
+ if (zio->io_vd == NULL)
+ /* The mirror_ops handle multiple DVAs in a single BP */
+ vdev_mirror_ops.vdev_op_io_done(zio);
+ else
+ vdev_io_done(zio);
}
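The io_vd == NULL checks in zio_vdev_io_start() and zio_vdev_io_done() are the heart of the ditto design: a bp-level I/O with no explicit vdev is treated as an implicit mirror over its DVAs, so the mirror code provides retries, repair, and read distribution for free. A sketch of the equivalence, not new code:

    /*
     * Illustrative: for a bp with N DVAs and io_vd == NULL,
     * vdev_mirror_map_alloc() builds mm_children = BP_GET_NDVAS(bp)
     * children from the DVAs, and reads/writes fan out exactly as
     * they would across the children of a real N-way mirror vdev.
     */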
/* XXPOLICY */
@@ -1348,7 +1370,7 @@ zio_should_retry(zio_t *zio)
return (B_FALSE);
if (zio->io_delegate_list != NULL)
return (B_FALSE);
- if (vd != vd->vdev_top)
+ if (vd && vd != vd->vdev_top)
return (B_FALSE);
if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
return (B_FALSE);
@@ -1362,7 +1384,7 @@ static void
zio_vdev_io_assess(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd->vdev_top;
+ vdev_t *tvd = vd ? vd->vdev_top : NULL;
ASSERT(zio->io_vsd == NULL);
@@ -1394,7 +1416,7 @@ zio_vdev_io_assess(zio_t *zio)
/* XXPOLICY */
zio->io_flags &= ~ZIO_FLAG_FAILFAST;
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
- zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
dprintf("retry #%d for %s to %s offset %llx\n",
zio->io_retries, zio_type_name[zio->io_type],
@@ -1404,8 +1426,8 @@ zio_vdev_io_assess(zio_t *zio)
return;
}
- if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
- zio->io_error != ECKSUM) {
+ if (zio->io_error != 0 && zio->io_error != ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) {
/*
* Poor man's hotplug support. Even if we're done retrying this
* I/O, try to reopen the vdev to see if it's still attached.
@@ -1480,8 +1502,8 @@ zio_gang_checksum_generate(zio_t *zio)
zio_cksum_t zc;
zio_gbh_phys_t *gbh = zio->io_data;
+ ASSERT(BP_IS_GANG(zio->io_bp));
ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
- ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
@@ -1518,9 +1540,11 @@ zio_checksum_verified(zio_t *zio)
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
- zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio));
- zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio));
- zcp->zc_word[2] = zio->io_bp->blk_birth;
+ blkptr_t *bp = zio->io_bp;
+
+ zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
+ zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
+ zcp->zc_word[2] = bp->blk_birth;
zcp->zc_word[3] = 0;
}
@@ -1552,8 +1576,6 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
zio_dva_claim,
zio_gang_checksum_generate,
zio_ready,
- zio_dva_translate,
- zio_vdev_io_setup,
zio_vdev_io_start,
zio_vdev_io_done,
zio_vdev_io_assess,
@@ -1656,7 +1678,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
BP_ZERO(bp);
- error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg);
+ error = metaslab_alloc(spa, size, bp, 1, txg, NULL);
if (error == 0) {
BP_SET_CHECKSUM(bp, checksum);
@@ -1681,7 +1703,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
- ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0);
+ ASSERT(!BP_IS_GANG(bp));
dprintf_bp(bp, "txg %llu: ", txg);
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
index ca65f831a3..30369227b5 100644
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -122,9 +122,8 @@ int
zio_checksum_error(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
- dva_t *dva = ZIO_GET_DVA(zio);
zio_cksum_t zc = bp->blk_cksum;
- uint_t checksum = DVA_GET_GANG(dva) ? ZIO_CHECKSUM_GANG_HEADER :
+ uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
BP_GET_CHECKSUM(bp);
int byteswap = BP_SHOULD_BYTESWAP(bp);
void *data = zio->io_data;
@@ -159,7 +158,7 @@ zio_checksum_error(zio_t *zio)
}
zc = expected_cksum;
} else {
- ASSERT(!DVA_GET_GANG(dva));
+ ASSERT(!BP_IS_GANG(bp));
ci->ci_func[byteswap](data, size, &actual_cksum);
}
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index f1a331051d..5aaca0662b 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -109,7 +109,23 @@ uint64_t zfs_prop_default_numeric(zfs_prop_t);
/*
* On-disk format version.
*/
-#define ZFS_VERSION 1ULL
+#define ZFS_VERSION_1 1ULL
+#define ZFS_VERSION_2 2ULL
+#define ZFS_VERSION ZFS_VERSION_2
+
+/*
+ * Symbolic names for the changes that caused a ZFS_VERSION switch.
+ * Used in the code when checking for presence or absence of a feature.
+ * Feel free to define multiple symbolic names for each version if there
+ * were multiple changes to on-disk structures during that version.
+ *
+ * NOTE: When checking the current ZFS_VERSION in your code, be sure
+ * to use spa_version() since it reports the version of the
+ * last synced uberblock. Checking the in-flight version can
+ * be dangerous in some cases.
+ */
+#define ZFS_VERSION_INITIAL ZFS_VERSION_1
+#define ZFS_VERSION_DITTO_BLOCKS ZFS_VERSION_2
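A feature check then compares the symbolic name against spa_version(); a minimal sketch mirroring the spa_max_replication() logic above:

    /* Illustrative: gate a multi-DVA code path on the pool version. */
    if (spa_version(spa) >= ZFS_VERSION_DITTO_BLOCKS) {
            /* bps with more than one DVA are safe to write */
    }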
/*
* The following are configuration names used in the nvlist describing a pool's