author    billm <none@none>    2006-04-10 05:03:38 -0700
committer billm <none@none>    2006-04-10 05:03:38 -0700
commit    44cd46cadd9aab751dae6a4023c1cb5bf316d274
tree      27db23d9e2bc81a70d528c18cf9d04874891ed9d
parent    dc5d169b4bfc1a6993578ef34dae678076fd19fb
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
6410700 zdb should support reading raw blocks out of storage pool
6410709 ztest: spa config can change before pool export
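
This change introduces ditto blocks: a block pointer may now carry up to SPA_DVAS_PER_BP (three) DVAs, so ZFS metadata is stored as multiple, independently placed copies. dmu_get_replication_level() decides how many copies a given block gets, metaslab_alloc() takes the copy count plus a hint block pointer and tries to place each copy on a different (preferably consecutive) top-level vdev, and the scrub, space-map verification, and block-printing paths all learn to walk every DVA in a bp rather than just the first. zdb gains a -R option that reads a raw block straight off a device given a pool:vdev_specifier:offset:size[:flags] descriptor, and spa_export() now hands back the exported pool's configuration so ztest can capture it before the pool goes away, closing the race in 6410709.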
27 files changed, 1176 insertions, 524 deletions
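
The policy at the heart of the patch is small enough to lift out on its own. The sketch below is a minimal, standalone C rendering of the two pieces that drive everything else: BP_GET_NDVAS(), which counts the DVAs actually in use in a block pointer, and the copy-count decision from dmu_get_replication_level(). For illustration only, the blkptr/DVA layout is reduced to an array of per-DVA allocated sizes, and the dmu_ot[ot].ot_metadata table lookup is replaced by an is_metadata flag; the logic otherwise follows the functions added in this diff.

/*
 * Standalone sketch of the ditto-block replication policy in this
 * patch.  Simplifications (not in the real code): the blkptr is
 * reduced to per-DVA asizes, and dmu_ot[ot].ot_metadata becomes an
 * is_metadata parameter.
 */
#include <stdio.h>

#define	SPA_DVAS_PER_BP	3
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

/* BP_GET_NDVAS(): a DVA slot is in use iff its allocated size is nonzero. */
static int
bp_get_ndvas(const unsigned long long asize[SPA_DVAS_PER_BP])
{
	return (!!asize[0] + !!asize[1] + !!asize[2]);
}

/*
 * dmu_get_replication_level(): one copy for plain data, one extra for
 * metadata, another for indirect blocks (level != 0), and one more for
 * pool-wide metadata (objset 0, object 0), capped at the pool's maximum
 * (three DVAs once the pool is at the ditto-block on-disk version,
 * otherwise one -- see spa_max_replication()).
 */
static int
replication_level(int is_metadata, int level, int objset, int object,
    int spa_max_replication)
{
	int ncopies = 1;

	if (is_metadata)
		ncopies++;
	if (level != 0)
		ncopies++;
	if (objset == 0 && object == 0)
		ncopies++;
	return (MIN(ncopies, spa_max_replication));
}

int
main(void)
{
	unsigned long long asize[SPA_DVAS_PER_BP] = { 0x2000, 0x2000, 0 };

	/* An indirect block of the MOS gets the full three copies. */
	(void) printf("MOS indirect: %d copies\n",
	    replication_level(1, 1, 0, 0, SPA_DVAS_PER_BP));
	/* A level-0 file data block still gets exactly one. */
	(void) printf("file data:    %d copies\n",
	    replication_level(0, 0, 21, 8, SPA_DVAS_PER_BP));
	(void) printf("ndvas:        %d\n", bp_get_ndvas(asize));
	return (0);
}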
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index 40b2e019bc..5b218aee5f 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -437,20 +437,28 @@ blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) zct[i].ci_name = local_strdup(buf); } - for (i = 0; i < SPA_DVAS_PER_BP; i++) { + /* + * Super-ick warning: This code is also duplicated in + * cmd/zdb.c . Yeah, I hate code replication, too. + */ + for (i = 0; i < BP_GET_NDVAS(&bp); i++) { dva_t *dva = &bp.blk_dva[i]; - mdb_printf("DVA[%d]: GANG: %-5s GRID: %2x ASIZE: %5x " - "vdev %llu offset %llx\n", - i, - DVA_GET_GANG(dva) ? "TRUE" : "FALSE", - DVA_GET_GRID(dva), - DVA_GET_ASIZE(dva), - DVA_GET_VDEV(dva), - DVA_GET_OFFSET(dva)); + + mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i, + DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva)); + mdb_printf("DVA[%d]: GANG: %-5s GRID: %04x\t" + "ASIZE: %llx\n", i, DVA_GET_GANG(dva) ? "TRUE" : "FALSE", + DVA_GET_GRID(dva), DVA_GET_ASIZE(dva)); + mdb_printf("DVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", i, + DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), BP_GET_PSIZE(&bp), + BP_SHOULD_BYTESWAP(&bp) ? "e" : "", + !DVA_GET_GANG(dva) && BP_GET_LEVEL(&bp) != 0 ? "i" : "", + DVA_GET_GANG(dva) ? "g" : "", + BP_GET_COMPRESS(&bp) != 0 ? "d" : ""); } mdb_printf("LSIZE: %-16llx\t\tPSIZE: %llx\n", BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp)); - mdb_printf("ENDIAN: %-6s TYPE: %s\n", + mdb_printf("ENDIAN: %6s\t\t\t\t\tTYPE: %s\n", BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG", doti[BP_GET_TYPE(&bp)].ot_name); mdb_printf("BIRTH: %-16llx LEVEL: %-2d\tFILL: %llx\n", diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 611f8ffc0c..3615846a00 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -27,6 +27,7 @@ #include <stdio.h> #include <stdlib.h> +#include <ctype.h> #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/spa_impl.h> @@ -84,8 +85,9 @@ usage(void) "Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] " "dataset [object...]\n" " %s -C [pool]\n" - " %s -l dev\n", - cmdname, cmdname, cmdname); + " %s -l dev\n" + " %s -R vdev:offset:size:flags\n", + cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " -u uberblock\n"); (void) fprintf(stderr, " -d datasets\n"); @@ -102,6 +104,8 @@ usage(void) (void) fprintf(stderr, " -U use zpool.cache in /tmp\n"); (void) fprintf(stderr, " -B objset:object:level:blkid -- " "simulate bad block\n"); + (void) fprintf(stderr, " -R read and display block from a" + "device\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); @@ -523,20 +527,41 @@ blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid) dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } +static void +sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas) +{ + dva_t *dva = bp->blk_dva; + int ndvas = alldvas ? 
BP_GET_NDVAS(bp) : 1; + int i; + + blkbuf[0] = '\0'; + + for (i = 0; i < ndvas; i++) + (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ", + (u_longlong_t)DVA_GET_VDEV(&dva[i]), + (u_longlong_t)DVA_GET_OFFSET(&dva[i]), + (u_longlong_t)DVA_GET_ASIZE(&dva[i])); + + (void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp), + (u_longlong_t)bp->blk_fill, + (u_longlong_t)bp->blk_birth); +} + /* ARGSUSED */ static int zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) { zbookmark_t *zb = &bc->bc_bookmark; blkptr_t *bp = &bc->bc_blkptr; - dva_t *dva = &bp->blk_dva[0]; void *data = bc->bc_data; dnode_phys_t *dnp = bc->bc_dnode; - char buffer[300]; + char blkbuf[BP_SPRINTF_LEN + 80]; int l; if (bc->bc_errno) { - (void) sprintf(buffer, + (void) sprintf(blkbuf, "Error %d reading <%llu, %llu, %lld, %llu>: ", bc->bc_errno, (u_longlong_t)zb->zb_objset, @@ -581,37 +606,28 @@ zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) ASSERT3U(fill, ==, bp->blk_fill); } - (void) sprintf(buffer, "%16llx ", + (void) sprintf(blkbuf, "%16llx ", (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { - (void) sprintf(buffer + strlen(buffer), "L%llx", + (void) sprintf(blkbuf + strlen(blkbuf), "L%llx", (u_longlong_t)zb->zb_level); } else { - (void) sprintf(buffer + strlen(buffer), " "); + (void) sprintf(blkbuf + strlen(blkbuf), " "); } } out: if (bp->blk_birth == 0) { - (void) sprintf(buffer + strlen(buffer), "<hole>"); - (void) printf("%s\n", buffer); + (void) sprintf(blkbuf + strlen(blkbuf), "<hole>"); + (void) printf("%s\n", blkbuf); } else { - // XXBP - Need to print number of active BPs here - (void) sprintf(buffer + strlen(buffer), - "vdev=%llu off=%llx %llxL/%llxP/%llxA F=%llu B=%llu", - (u_longlong_t)DVA_GET_VDEV(dva), - (u_longlong_t)DVA_GET_OFFSET(dva), - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp), - (u_longlong_t)DVA_GET_ASIZE(dva), - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_birth); - - (void) printf("%s\n", buffer); + sprintf_blkptr_compact(blkbuf + strlen(blkbuf), bp, + dump_opt['d'] > 5 ? 1 : 0); + (void) printf("%s\n", blkbuf); } return (bc->bc_errno ? ERESTART : 0); @@ -762,18 +778,12 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) (void) printf("\n"); while (bplist_iterate(&bpl, &itor, bp) == 0) { + char blkbuf[BP_SPRINTF_LEN]; + ASSERT(bp->blk_birth != 0); - // XXBP - Do we want to see all DVAs, or just one? - (void) printf("\tItem %3llu: vdev=%llu off=%llx " - "%llxL/%llxP/%llxA F=%llu B=%llu\n", - (u_longlong_t)itor - 1, - (u_longlong_t)DVA_GET_VDEV(&bp->blk_dva[0]), - (u_longlong_t)DVA_GET_OFFSET(&bp->blk_dva[0]), - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp), - (u_longlong_t)DVA_GET_ASIZE(&bp->blk_dva[0]), - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_birth); + sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 
1 : 0); + (void) printf("\tItem %3llu: %s\n", + (u_longlong_t)itor - 1, blkbuf); } bplist_close(&bpl); @@ -1228,45 +1238,73 @@ zdb_space_map_load(spa_t *spa) static int zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb) { - dva_t *dva = &bp->blk_dva[0]; - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); + dva_t *dva = bp->blk_dva; vdev_t *vd; metaslab_t *msp; space_map_t *allocmap, *freemap; int error; + int d; + blkptr_t blk = *bp; + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + uint64_t vdev = DVA_GET_VDEV(&dva[d]); + uint64_t offset = DVA_GET_OFFSET(&dva[d]); + uint64_t size = DVA_GET_ASIZE(&dva[d]); + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL) + return (ENXIO); + + if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) + return (ENXIO); - if ((vd = vdev_lookup_top(spa, vdev)) == NULL) - return (ENXIO); + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + allocmap = &msp->ms_allocmap[0]; + freemap = &msp->ms_freemap[0]; - if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) - return (ENXIO); + /* Prepare our copy of the bp in case we need to read GBHs */ + if (DVA_GET_GANG(&dva[d])) { + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + DVA_SET_ASIZE(&blk.blk_dva[d], size); + DVA_SET_GANG(&blk.blk_dva[d], 0); + } + + mutex_enter(&msp->ms_lock); + if (space_map_contains(freemap, offset, size)) { + mutex_exit(&msp->ms_lock); + return (EAGAIN); /* allocated more than once */ + } - if (DVA_GET_GANG(dva)) { + if (!space_map_contains(allocmap, offset, size)) { + mutex_exit(&msp->ms_lock); + return (ESTALE); /* not allocated at all */ + } + + space_map_remove(allocmap, offset, size); + space_map_add(freemap, offset, size); + + mutex_exit(&msp->ms_lock); + } + + if (BP_IS_GANG(bp)) { zio_gbh_phys_t gbh; - blkptr_t blk = *bp; int g; /* LINTED - compile time assert */ ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE); - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - DVA_SET_GANG(&blk.blk_dva[0], 0); - DVA_SET_ASIZE(&blk.blk_dva[0], size); + BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER); BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE); BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE); BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF); - error = zio_wait(zio_read(NULL, spa, &blk, - &gbh, SPA_GANGBLOCKSIZE, NULL, NULL, - ZIO_PRIORITY_SYNC_READ, + error = zio_wait(zio_read(NULL, spa, &blk, &gbh, + SPA_GANGBLOCKSIZE, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD, zb)); if (error) return (error); if (BP_SHOULD_BYTESWAP(&blk)) byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE); for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { - if (gbh.zg_blkptr[g].blk_birth == 0) + if (BP_IS_HOLE(&gbh.zg_blkptr[g])) break; error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g], zb); if (error) @@ -1274,26 +1312,6 @@ zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb) } } - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - allocmap = &msp->ms_allocmap[0]; - freemap = &msp->ms_freemap[0]; - - mutex_enter(&msp->ms_lock); - if (space_map_contains(freemap, offset, size)) { - mutex_exit(&msp->ms_lock); - return (EAGAIN); /* allocated more than once */ - } - - if (!space_map_contains(allocmap, offset, size)) { - mutex_exit(&msp->ms_lock); - return (ESTALE); /* not allocated at all */ - } - - space_map_remove(allocmap, offset, size); - space_map_add(freemap, offset, size); - - mutex_exit(&msp->ms_lock); - return (0); } @@ -1448,7 +1466,7 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) zcb->zcb_readfails = 0; - 
ASSERT(bp->blk_birth != 0); + ASSERT(!BP_IS_HOLE(bp)); zdb_count_block(spa, zcb, bp, type); @@ -1511,13 +1529,13 @@ dump_block_stats(spa_t *spa) spa->spa_sync_bplist_obj)); while (bplist_iterate(bpl, &itor, &blk) == 0) { - zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED); if (dump_opt['b'] >= 4) { char blkbuf[BP_SPRINTF_LEN]; sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk); (void) printf("[%s] %s\n", "deferred free", blkbuf); } + zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED); } bplist_close(bpl); @@ -1703,6 +1721,321 @@ dump_zpool(spa_t *spa) exit(rc); } +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_PHYS 0x0020 +#define ZDB_FLAG_RAW 0x0040 +#define ZDB_FLAG_PRINT_BLKPTR 0x0080 + +int flagbits[256]; + +static void +zdb_print_blkptr(blkptr_t *bp, int flags) +{ + dva_t *dva = bp->blk_dva; + int d; + + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); + /* + * Super-ick warning: This code is also duplicated in + * cmd/mdb/common/modules/zfs/zfs.c . Yeah, I hate code + * replication, too. + */ + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + (void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d, + DVA_GET_VDEV(&dva[d]), DVA_GET_OFFSET(&dva[d])); + (void) printf("\tDVA[%d]: GANG: %-5s GRID: %04llx\t" + "ASIZE: %llx\n", d, + DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE", + DVA_GET_GRID(&dva[d]), DVA_GET_ASIZE(&dva[d])); + (void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d, + DVA_GET_VDEV(&dva[d]), DVA_GET_OFFSET(&dva[d]), + BP_GET_PSIZE(bp), + BP_SHOULD_BYTESWAP(bp) ? "e" : "", + !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ? + "d" : "", + DVA_GET_GANG(&dva[d]) ? "g" : "", + BP_GET_COMPRESS(bp) != 0 ? "d" : ""); + } + (void) printf("\tLSIZE: %-16llx\t\tPSIZE: %llx\n", + BP_GET_LSIZE(bp), BP_GET_PSIZE(bp)); + (void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE: %s\n", + BP_GET_BYTEORDER(bp) ? 
"LITTLE" : "BIG", + dmu_ot[BP_GET_TYPE(bp)].ot_name); + (void) printf("\tBIRTH: %-16llx LEVEL: %-2llu\tFILL: %llx\n", + (u_longlong_t)bp->blk_birth, BP_GET_LEVEL(bp), + (u_longlong_t)bp->blk_fill); + (void) printf("\tCKFUNC: %-16s\t\tCOMP: %s\n", + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, + zio_compress_table[BP_GET_COMPRESS(bp)].ci_name); + (void) printf("\tCKSUM: %llx:%llx:%llx:%llx\n", + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); +} + +static void +zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) +{ + int i; + + for (i = 0; i < nbps; i++) + zdb_print_blkptr(&bp[i], flags); +} + +static void +zdb_dump_gbh(void *buf, int flags) +{ + zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); +} + +static void +zdb_dump_block_raw(void *buf, uint64_t size, int flags) +{ + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array(buf, size); + (void) write(2, buf, size); +} + +static void +zdb_dump_block(char *label, void *buf, uint64_t size, int flags) +{ + uint64_t *d = (uint64_t *)buf; + int nwords = size / sizeof (uint64_t); + int do_bswap = !!(flags & ZDB_FLAG_BSWAP); + int i, j; + char *hdr, *c; + + + if (do_bswap) + hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; + else + hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; + + (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); + + for (i = 0; i < nwords; i += 2) { + (void) printf("%06llx: %016llx %016llx ", + (u_longlong_t)(i * sizeof (uint64_t)), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); + + c = (char *)&d[i]; + for (j = 0; j < 2 * sizeof (uint64_t); j++) + (void) printf("%c", isprint(c[j]) ? c[j] : '.'); + (void) printf("\n"); + } +} + +/* + * There are two acceptable formats: + * leaf_name - For example: c1t0d0 or /tmp/ztest.0a + * child[.child]* - For example: 0.1.1 + * + * The second form can be used to specify arbitrary vdevs anywhere + * in the heirarchy. For example, in a pool with a mirror of + * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . + */ +static vdev_t * +zdb_vdev_lookup(vdev_t *vdev, char *path) +{ + char *s, *p, *q; + int i; + + if (vdev == NULL) + return (NULL); + + /* First, assume the x.x.x.x format */ + i = (int)strtoul(path, &s, 10); + if (s == path || (s && *s != '.' && *s != '\0')) + goto name; + if (i < 0 || i >= vdev->vdev_children) + return (NULL); + + vdev = vdev->vdev_child[i]; + if (*s == '\0') + return (vdev); + return (zdb_vdev_lookup(vdev, s+1)); + +name: + for (i = 0; i < vdev->vdev_children; i++) { + vdev_t *vc = vdev->vdev_child[i]; + + if (vc->vdev_path == NULL) { + vc = zdb_vdev_lookup(vc, path); + if (vc == NULL) + continue; + else + return (vc); + } + + p = strrchr(vc->vdev_path, '/'); + p = p ? p + 1 : vc->vdev_path; + q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; + + if (strcmp(vc->vdev_path, path) == 0) + return (vc); + if (strcmp(p, path) == 0) + return (vc); + if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) + return (vc); + } + + return (NULL); +} + +/* + * Read a block from a pool and print it out. 
The syntax of the + * block descriptor is: + * + * pool:vdev_specifier:offset:size[:flags] + * + * pool - The name of the pool you wish to read from + * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) + * offset - offset, in hex, in bytes + * size - Amount of data to read, in hex, in bytes + * flags - A string of characters specifying options + * b: Decode a blkptr at given offset within block + * *c: Calculate and display checksums + * *d: Decompress data before dumping + * e: Byteswap data before dumping + * *g: Display data as a gang block header + * *i: Display as an indirect block + * p: Do I/O to physical offset + * r: Dump raw data to stdout + * + * * = not yet implemented + */ +static void +zdb_read_block(char *thing, spa_t **spap) +{ + spa_t *spa = *spap; + int flags = 0; + uint64_t offset = 0, size = 0, blkptr_offset = 0; + zio_t *zio; + vdev_t *vd; + void *buf; + char *s, *p, *dup, *spa_name, *vdev, *flagstr; + int i, error, zio_flags; + + dup = strdup(thing); + s = strtok(dup, ":"); + spa_name = s ? s : ""; + s = strtok(NULL, ":"); + vdev = s ? s : ""; + s = strtok(NULL, ":"); + offset = strtoull(s ? s : "", NULL, 16); + s = strtok(NULL, ":"); + size = strtoull(s ? s : "", NULL, 16); + s = strtok(NULL, ":"); + flagstr = s ? s : ""; + + s = NULL; + if (size == 0) + s = "size must not be zero"; + if (!IS_P2ALIGNED(size, DEV_BSIZE)) + s = "size must be a multiple of sector size"; + if (!IS_P2ALIGNED(offset, DEV_BSIZE)) + s = "offset must be a multiple of sector size"; + if (s) { + (void) printf("Invalid block specifier: %s - %s\n", thing, s); + free(dup); + return; + } + + for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { + for (i = 0; flagstr[i]; i++) { + int bit = flagbits[flagstr[i]]; + + if (bit == 0) { + (void) printf("***Invalid flag: %c\n", + flagstr[i]); + continue; + } + flags |= bit; + + /* If it's not something with an argument, keep going */ + if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS | + ZDB_FLAG_PRINT_BLKPTR)) == 0) + continue; + + p = &flagstr[i + 1]; + if (bit == ZDB_FLAG_PRINT_BLKPTR) + blkptr_offset = strtoull(p, &p, 16); + if (*p != ':' && *p != '\0') { + (void) printf("***Invalid flag arg: '%s'\n", s); + free(dup); + return; + } + } + } + + if (spa == NULL || spa->spa_name == NULL || + strcmp(spa->spa_name, spa_name)) { + if (spa && spa->spa_name) + spa_close(spa, (void *)zdb_read_block); + error = spa_open(spa_name, spap, (void *)zdb_read_block); + if (error) + fatal("Failed to open pool '%s': errno = %d\n", + spa_name, error); + spa = *spap; + } + + vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); + if (vd == NULL) { + (void) printf("***Invalid vdev: %s\n", vdev); + free(dup); + return; + } else { + if (vd->vdev_path) + (void) printf("Found vdev: %s\n", vd->vdev_path); + else + (void) printf("Found vdev type: %s\n", + vd->vdev_ops->vdev_op_type); + } + + buf = umem_alloc(size, UMEM_NOFAIL); + + zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK; + + if (flags & ZDB_FLAG_PHYS) + zio_flags |= ZIO_FLAG_PHYSICAL; + + zio = zio_root(spa, NULL, NULL, 0); + /* XXX todo - cons up a BP so RAID-Z will be happy */ + zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size, + ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL)); + error = zio_wait(zio); + + if (error) { + (void) printf("Read of %s failed, error: %d\n", thing, error); + goto out; + } + + if (flags & ZDB_FLAG_PRINT_BLKPTR) + zdb_print_blkptr((blkptr_t *)(void *) + ((uintptr_t)buf + 
(uintptr_t)blkptr_offset), flags); + else if (flags & ZDB_FLAG_RAW) + zdb_dump_block_raw(buf, size, flags); + else if (flags & ZDB_FLAG_INDIRECT) + zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), + flags); + else if (flags & ZDB_FLAG_GBH) + zdb_dump_gbh(buf, flags); + else + zdb_dump_block(thing, buf, size, flags); + +out: + umem_free(buf, size); + free(dup); +} + int main(int argc, char **argv) { @@ -1721,7 +2054,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "udibcsvCLO:B:Ul")) != -1) { + while ((c = getopt(argc, argv, "udibcsvCLO:B:UlR")) != -1) { switch (c) { case 'u': case 'd': @@ -1731,6 +2064,7 @@ main(int argc, char **argv) case 's': case 'C': case 'l': + case 'R': dump_opt[c]++; dump_all = 0; break; @@ -1801,7 +2135,7 @@ main(int argc, char **argv) } for (c = 0; c < 256; c++) { - if (dump_all && c != 'L' && c != 'l') + if (dump_all && c != 'L' && c != 'l' && c != 'R') dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; @@ -1823,6 +2157,27 @@ main(int argc, char **argv) return (0); } + if (dump_opt['R']) { + flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; + flagbits['c'] = ZDB_FLAG_CHECKSUM; + flagbits['d'] = ZDB_FLAG_DECOMPRESS; + flagbits['e'] = ZDB_FLAG_BSWAP; + flagbits['g'] = ZDB_FLAG_GBH; + flagbits['i'] = ZDB_FLAG_INDIRECT; + flagbits['p'] = ZDB_FLAG_PHYS; + flagbits['r'] = ZDB_FLAG_RAW; + + spa = NULL; + while (argv[0]) { + zdb_read_block(argv[0], &spa); + argv++; + argc--; + } + if (spa) + spa_close(spa, (void *)zdb_read_block); + return (0); + } + if (dump_opt['C']) dump_config(argv[0]); diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 2cbecad212..e2297b24aa 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -2783,8 +2783,9 @@ upgrade_one(zpool_handle_t *zhp, void *unused) ret = zpool_upgrade(zhp); if (ret == 0) - (void) printf(gettext("Successfully upgraded '%s'\n"), - zpool_get_name(zhp)); + (void) printf(gettext("Successfully upgraded '%s' " + "from version %llu to version %llu\n"), zpool_get_name(zhp), + (u_longlong_t)version, (u_longlong_t)ZFS_VERSION); return (ret != 0); } @@ -2848,8 +2849,10 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); - (void) printf(gettext(" 1 Initial ZFS version.\n\n")); - (void) printf(gettext("For more information on a particular " + (void) printf(gettext(" 1 Initial ZFS version.\n")); + (void) printf(gettext(" 2 Ditto blocks " + "(replicated metadata)\n")); + (void) printf(gettext("\nFor more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" "version/N\n\n"); diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index fbcc56a30d..f214da36fa 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -2825,9 +2825,6 @@ ztest_spa_import_export(char *oldname, char *newname) if (error) fatal(0, "spa_open('%s') = %d", oldname, error); - ASSERT(spa->spa_config != NULL); - - VERIFY(nvlist_dup(spa->spa_config, &config, 0) == 0); pool_guid = spa_guid(spa); spa_close(spa, FTAG); @@ -2836,7 +2833,7 @@ ztest_spa_import_export(char *oldname, char *newname) /* * Export it. 
*/ - error = spa_export(oldname); + error = spa_export(oldname, &config); if (error) fatal(0, "spa_export('%s') = %d", oldname, error); diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 5f0d90dd3c..0b500d4a83 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -2186,7 +2186,7 @@ arc_write_done(zio_t *zio) } int -arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, +arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *done, void *private, int priority, int flags, uint32_t arc_flags, zbookmark_t *zb) @@ -2205,7 +2205,7 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, acb->acb_byteswap = (arc_byteswap_func_t *)-1; hdr->b_acb = acb; hdr->b_flags |= ARC_IO_IN_PROGRESS; - rzio = zio_write(pio, spa, checksum, compress, txg, bp, + rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb); if (arc_flags & ARC_WAIT) diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 2982f743d2..ebcd9d7ad3 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -2029,7 +2029,9 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx) zb.zb_object = db->db.db_object; zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - (void) arc_write(zio, os->os_spa, checksum, compress, txg, + + (void) arc_write(zio, os->os_spa, checksum, compress, + dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg, db->db_blkptr, *data, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb); /* diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 2f6abbadae..52c8413c9a 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -82,8 +82,6 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_impl_t *db; int err; - /* dataset_verify(dd); */ - err = dnode_hold(os->os, object, FTAG, &dn); if (err) return (err); @@ -1425,7 +1423,8 @@ int dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, blkptr_t *bp, uint64_t txg) { - dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool; + objset_impl_t *osi = os->os; + dsl_pool_t *dp = osi->os_dsl_dataset->ds_dir->dd_pool; tx_state_t *tx = &dp->dp_tx; dmu_buf_impl_t *db; blkptr_t *blk; @@ -1508,7 +1507,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, } arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); if (!BP_IS_HOLE(blk)) { - (void) arc_free(NULL, os->os->os_spa, txg, blk, + (void) arc_free(NULL, osi->os_spa, txg, blk, NULL, NULL, ARC_WAIT); } kmem_free(blk, sizeof (blkptr_t)); @@ -1520,13 +1519,14 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); blk->blk_birth = 0; /* mark as invalid */ - zb.zb_objset = os->os->os_dsl_dataset->ds_object; + zb.zb_objset = osi->os_dsl_dataset->ds_object; zb.zb_object = db->db.db_object; zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - err = arc_write(NULL, os->os->os_spa, - zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum), - zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress), + err = arc_write(NULL, osi->os_spa, + zio_checksum_select(db->db_dnode->dn_checksum, osi->os_checksum), + zio_compress_select(db->db_dnode->dn_compress, osi->os_compress), + 
dmu_get_replication_level(osi->os_spa, &zb, db->db_dnode->dn_type), txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb); ASSERT(err == 0); @@ -1556,7 +1556,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, * XXX should we be ignoring the return code? */ if (!BP_IS_HOLE(blk)) { - (void) arc_free(NULL, os->os->os_spa, txg, blk, + (void) arc_free(NULL, osi->os_spa, txg, blk, NULL, NULL, ARC_WAIT); } kmem_free(blk, sizeof (blkptr_t)); @@ -1625,6 +1625,24 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dnode_rele(dn, FTAG); } +/* + * XXX - eventually, this should take into account per-dataset (or + * even per-object?) user requests for higher levels of replication. + */ +int +dmu_get_replication_level(spa_t *spa, zbookmark_t *zb, dmu_object_type_t ot) +{ + int ncopies = 1; + + if (dmu_ot[ot].ot_metadata) + ncopies++; + if (zb->zb_level != 0) + ncopies++; + if (zb->zb_objset == 0 && zb->zb_object == 0) + ncopies++; + return (MIN(ncopies, spa_max_replication(spa))); +} + int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 3a7f3531ea..7784049a23 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -679,7 +679,9 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx) zb.zb_level = -1; zb.zb_blkid = 0; err = arc_write(NULL, os->os_spa, os->os_md_checksum, - os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os, + os->os_md_compress, + dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET), + tx->tx_txg, &os->os_rootbp, abuf, killer, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb); ASSERT(err == 0); VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index b8e54be6f6..77a1adb3b1 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -232,7 +232,7 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) uint64_t space, resv; /* - * Reserve about 1% (1/128), or at least 16MB, for allocation + * Reserve about 1.6% (1/64), or at least 32MB, for allocation * efficiency. * XXX The intent log is not accounted for, so it must fit * within this slop. @@ -242,7 +242,7 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) * (e.g. make it possible to rm(1) files from a full pool). 
*/ space = spa_get_space(dp->dp_spa); - resv = MAX(space >> 7, SPA_MINDEVSIZE >> 2); + resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); if (netfree) resv >>= 1; diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 72eaa89bd0..8728f21d7e 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -352,14 +352,19 @@ metaslab_fini(metaslab_t *msp) kmem_free(msp, sizeof (metaslab_t)); } -#define METASLAB_ACTIVE_WEIGHT (1ULL << 63) +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) +#define METASLAB_ACTIVE_MASK \ + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) +#define METASLAB_SMO_BONUS_MULTIPLIER 2 static uint64_t metaslab_weight(metaslab_t *msp) { + metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; space_map_obj_t *smo = &msp->ms_smo; - vdev_t *vd = msp->ms_group->mg_vd; + vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -387,26 +392,27 @@ metaslab_weight(metaslab_t *msp) * For locality, assign higher weight to metaslabs we've used before. */ if (smo->smo_object != 0) - weight *= 2; - ASSERT(weight >= space && weight <= 4 * space); + weight *= METASLAB_SMO_BONUS_MULTIPLIER; + ASSERT(weight >= space && + weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); /* * If this metaslab is one we're actively using, adjust its weight to * make it preferable to any inactive metaslab so we'll polish it off. */ - weight |= (msp->ms_weight & METASLAB_ACTIVE_WEIGHT); + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); return (weight); } static int -metaslab_activate(metaslab_t *msp) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight) { space_map_t *sm = &msp->ms_map; ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (msp->ms_weight < METASLAB_ACTIVE_WEIGHT) { + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { int error = space_map_load(sm, &metaslab_ff_ops, SM_FREE, &msp->ms_smo, msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); @@ -415,10 +421,10 @@ metaslab_activate(metaslab_t *msp) return (error); } metaslab_group_sort(msp->ms_group, msp, - msp->ms_weight | METASLAB_ACTIVE_WEIGHT); + msp->ms_weight | activation_weight); } ASSERT(sm->sm_loaded); - ASSERT(msp->ms_weight >= METASLAB_ACTIVE_WEIGHT); + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } @@ -426,8 +432,8 @@ metaslab_activate(metaslab_t *msp) static void metaslab_passivate(metaslab_t *msp, uint64_t size) { - metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size - 1)); - ASSERT(msp->ms_weight < METASLAB_ACTIVE_WEIGHT); + metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); + ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } /* @@ -571,7 +577,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * future allocations have synced. (If we unloaded it now and then * loaded a moment later, the map wouldn't reflect those allocations.) 
*/ - if (sm->sm_loaded && msp->ms_weight < METASLAB_ACTIVE_WEIGHT) { + if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { int evictable = 1; for (t = 1; t < TXG_CONCURRENT_STATES; t++) @@ -616,7 +622,7 @@ metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - error = metaslab_activate(msp); + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); if (error) { mutex_exit(&msp->ms_lock); return (error); @@ -633,25 +639,76 @@ metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg) return (0); } -static metaslab_t * -metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp, - uint64_t txg) +static uint64_t +metaslab_distance(metaslab_t *msp, dva_t *dva) +{ + uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; + uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; + uint64_t start = msp->ms_map.sm_start >> ms_shift; + + if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) + return (1ULL << 63); + + if (offset < start) + return ((start - offset) << ms_shift); + if (offset > start) + return ((offset - start) << ms_shift); + return (0); +} + +static uint64_t +metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, + uint64_t min_distance, dva_t *dva, int d) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; + avl_tree_t *t = &mg->mg_metaslab_tree; + uint64_t activation_weight; + uint64_t target_distance; + int i; + + activation_weight = METASLAB_WEIGHT_PRIMARY; + for (i = 0; i < d; i++) + if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) + activation_weight = METASLAB_WEIGHT_SECONDARY; for (;;) { mutex_enter(&mg->mg_lock); - msp = avl_first(&mg->mg_metaslab_tree); - if (msp == NULL || msp->ms_weight < size) { - mutex_exit(&mg->mg_lock); - return (NULL); + for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { + if (msp->ms_weight < size) { + mutex_exit(&mg->mg_lock); + return (-1ULL); + } + + if (activation_weight == METASLAB_WEIGHT_PRIMARY) + break; + + target_distance = min_distance + + (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); + + for (i = 0; i < d; i++) + if (metaslab_distance(msp, &dva[i]) < + target_distance) + break; + if (i == d) + break; } mutex_exit(&mg->mg_lock); + if (msp == NULL) + return (-1ULL); mutex_enter(&msp->ms_lock); - if (metaslab_activate(msp) != 0) { + if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && + activation_weight == METASLAB_WEIGHT_PRIMARY) { + metaslab_passivate(msp, + (msp->ms_weight & ~METASLAB_ACTIVE_MASK) / + METASLAB_SMO_BONUS_MULTIPLIER); + mutex_exit(&msp->ms_lock); + continue; + } + + if (metaslab_activate(msp, activation_weight) != 0) { mutex_exit(&msp->ms_lock); continue; } @@ -659,7 +716,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp, if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) break; - metaslab_passivate(msp, size); + metaslab_passivate(msp, size - 1); mutex_exit(&msp->ms_lock); } @@ -671,22 +728,24 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp, mutex_exit(&msp->ms_lock); - *offp = offset; - return (msp); + return (offset); } /* * Allocate a block for the specified i/o. 
*/ -int -metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg) +static int +metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d, + dva_t *hintdva, uint64_t txg) { - metaslab_t *msp; metaslab_group_t *mg, *rotor; metaslab_class_t *mc; vdev_t *vd; + int dshift = 3; + int all_zero; uint64_t offset = -1ULL; uint64_t asize; + uint64_t distance; mc = spa_metaslab_class_select(spa); @@ -695,17 +754,50 @@ metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg) * Note that there's no locking on mc_rotor or mc_allocated because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. + * + * If we are doing ditto blocks, try to spread them across consecutive + * vdevs. If we're forced to reuse a vdev before we've allocated + * all of our ditto blocks, then try and spread them out on that + * vdev as much as possible. If it turns out to not be possible, + * gradually lower our standards until anything becomes acceptable. + * Also, allocating on consecutive vdevs (as opposed to random vdevs) + * gives us hope of containing our fault domains to something we're + * able to reason about. Otherwise, any two top-level vdev failures + * will guarantee the loss of data. With consecutive allocation, + * only two adjacent top-level vdev failures will result in data loss. + * + * If we are doing gang blocks (hintdva is non-NULL), try to keep + * ourselves on the same vdev as our gang block header. That + * way, we can hope for locality in vdev_cache, plus it makes our + * fault domains something tractable. */ - mg = rotor = mc->mc_rotor; + if (hintdva) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); + mg = vd->vdev_mg; + } else if (d != 0) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); + mg = vd->vdev_mg->mg_next; + } else { + mg = mc->mc_rotor; + } + rotor = mg; + +top: + all_zero = B_TRUE; do { vd = mg->mg_vd; + + distance = vd->vdev_asize >> dshift; + if (distance <= (1ULL << vd->vdev_ms_shift)) + distance = 0; + else + all_zero = B_FALSE; + asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - msp = metaslab_group_alloc(mg, asize, &offset, txg); - if (msp != NULL) { - ASSERT(offset != -1ULL); - + offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + if (offset != -1ULL) { /* * If we've just selected this metaslab group, * figure out whether the corresponding vdev is @@ -740,10 +832,10 @@ metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg) mc->mc_allocated = 0; } - DVA_SET_VDEV(dva, vd->vdev_id); - DVA_SET_OFFSET(dva, offset); - DVA_SET_GANG(dva, 0); - DVA_SET_ASIZE(dva, asize); + DVA_SET_VDEV(&dva[d], vd->vdev_id); + DVA_SET_OFFSET(&dva[d], offset); + DVA_SET_GANG(&dva[d], 0); + DVA_SET_ASIZE(&dva[d], asize); return (0); } @@ -751,13 +843,46 @@ metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg) mc->mc_allocated = 0; } while ((mg = mg->mg_next) != rotor); - DVA_SET_VDEV(dva, 0); - DVA_SET_OFFSET(dva, 0); - DVA_SET_GANG(dva, 0); + if (!all_zero) { + dshift++; + ASSERT(dshift < 64); + goto top; + } + + bzero(&dva[d], sizeof (dva_t)); return (ENOSPC); } +int +metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies, + uint64_t txg, blkptr_t *hintbp) +{ + int d, error; + dva_t *dva = bp->blk_dva; + dva_t *hintdva = hintbp->blk_dva; + + ASSERT(ncopies > 0 && ncopies <= spa_max_replication(spa)); + ASSERT(BP_GET_NDVAS(bp) == 0); + ASSERT(hintbp == NULL || ncopies <= 
BP_GET_NDVAS(hintbp)); + + for (d = 0; d < ncopies; d++) { + error = metaslab_alloc_one(spa, psize, dva, d, hintdva, txg); + if (error) { + for (d--; d >= 0; d--) { + ASSERT(DVA_IS_VALID(&dva[d])); + metaslab_free(spa, &dva[d], txg, B_TRUE); + bzero(&dva[d], sizeof (dva_t)); + } + return (ENOSPC); + } + } + ASSERT(error == 0); + ASSERT(BP_GET_NDVAS(bp) == ncopies); + + return (0); +} + /* * Free the block represented by DVA in the context of the specified * transaction group. diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index f4ecf519cd..95f633eac1 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -940,10 +940,13 @@ spa_tryimport(nvlist_t *tryconfig) * configuration from the cache afterwards. */ static int -spa_export_common(char *pool, int new_state) +spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) { spa_t *spa; + if (oldconfig) + *oldconfig = NULL; + if (!(spa_mode & FWRITE)) return (EROFS); @@ -1011,6 +1014,9 @@ spa_export_common(char *pool, int new_state) spa_deactivate(spa); } + if (oldconfig && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); + if (new_state != POOL_STATE_UNINITIALIZED) { spa_remove(spa); spa_config_sync(); @@ -1026,16 +1032,16 @@ spa_export_common(char *pool, int new_state) int spa_destroy(char *pool) { - return (spa_export_common(pool, POOL_STATE_DESTROYED)); + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); } /* * Export a storage pool. */ int -spa_export(char *pool) +spa_export(char *pool, nvlist_t **oldconfig) { - return (spa_export_common(pool, POOL_STATE_EXPORTED)); + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); } /* @@ -1045,7 +1051,7 @@ spa_export(char *pool) int spa_reset(char *pool) { - return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); + return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); } @@ -1497,7 +1503,7 @@ spa_scrub_io_done(zio_t *zio) mutex_enter(&spa->spa_scrub_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - vdev_t *vd = zio->io_vd; + vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; spa->spa_scrub_errors++; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_scrub_errors++; @@ -1535,9 +1541,12 @@ static int spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) { blkptr_t *bp = &bc->bc_blkptr; - vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); + vdev_t *vd = spa->spa_root_vdev; + dva_t *dva = bp->blk_dva; + int needs_resilver = B_FALSE; + int d; - if (bc->bc_errno || vd == NULL) { + if (bc->bc_errno) { /* * We can't scrub this block, but we can continue to scrub * the rest of the pool. Note the error and move along. @@ -1546,43 +1555,52 @@ spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) spa->spa_scrub_errors++; mutex_exit(&spa->spa_scrub_lock); - if (vd != NULL) { - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_errors++; - mutex_exit(&vd->vdev_stat_lock); - } + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_errors++; + mutex_exit(&vd->vdev_stat_lock); return (ERESTART); } ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. 
- */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); - mutex_exit(&vd->vdev_stat_lock); + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { - if (DVA_GET_GANG(&bp->blk_dva[0])) { - /* - * Gang members may be spread across multiple vdevs, - * so the best we can do is look at the pool-wide DTL. - * XXX -- it would be better to change our allocation - * policy to ensure that this can't happen. - */ - vd = spa->spa_root_vdev; - } - if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { - spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, - ZIO_FLAG_RESILVER, &bc->bc_bookmark); + ASSERT(vd != NULL); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); + mutex_exit(&vd->vdev_stat_lock); + + if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { + if (DVA_GET_GANG(&dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best we can do is look at the + * pool-wide DTL. + * XXX -- it would be better to change our + * allocation policy to ensure that this can't + * happen. + */ + vd = spa->spa_root_vdev; + } + if (vdev_dtl_contains(&vd->vdev_dtl_map, + bp->blk_birth, 1)) + needs_resilver = B_TRUE; } - } else { + } + + if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SCRUB, &bc->bc_bookmark); - } + else if (needs_resilver) + spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, + ZIO_FLAG_RESILVER, &bc->bc_bookmark); return (0); } diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index d12fe822a7..843b77d9ff 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -52,60 +52,60 @@ * * spa_namespace_lock (global mutex) * - * This lock must be acquired to do any of the following: + * This lock must be acquired to do any of the following: * - * - Lookup a spa_t by name - * - Add or remove a spa_t from the namespace - * - Increase spa_refcount from non-zero - * - Check if spa_refcount is zero - * - Rename a spa_t + * - Lookup a spa_t by name + * - Add or remove a spa_t from the namespace + * - Increase spa_refcount from non-zero + * - Check if spa_refcount is zero + * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export + * - Held for the duration of create/destroy/import/export * - * It does not need to handle recursion. A create or destroy may - * reference objects (files or zvols) in other pools, but by - * definition they must have an existing reference, and will never need - * to lookup a spa_t by name. + * It does not need to handle recursion. A create or destroy may + * reference objects (files or zvols) in other pools, but by + * definition they must have an existing reference, and will never need + * to lookup a spa_t by name. * * spa_refcount (per-spa refcount_t protected by mutex) * - * This reference count keep track of any active users of the spa_t. The - * spa_t cannot be destroyed or freed while this is non-zero. Internally, - * the refcount is never really 'zero' - opening a pool implicitly keeps - * some references in the DMU. Internally we check against SPA_MINREF, but - * present the image of a zero/non-zero value to consumers. 
+ * This reference count keep track of any active users of the spa_t. The + * spa_t cannot be destroyed or freed while this is non-zero. Internally, + * the refcount is never really 'zero' - opening a pool implicitly keeps + * some references in the DMU. Internally we check against SPA_MINREF, but + * present the image of a zero/non-zero value to consumers. * * spa_config_lock (per-spa crazy rwlock) * - * This SPA special is a recursive rwlock, capable of being acquired from - * asynchronous threads. It has protects the spa_t from config changes, - * and must be held in the following circumstances: + * This SPA special is a recursive rwlock, capable of being acquired from + * asynchronous threads. It has protects the spa_t from config changes, + * and must be held in the following circumstances: * - * - RW_READER to perform I/O to the spa - * - RW_WRITER to change the vdev config + * - RW_READER to perform I/O to the spa + * - RW_WRITER to change the vdev config * * spa_config_cache_lock (per-spa mutex) * - * This mutex prevents the spa_config nvlist from being updated. No + * This mutex prevents the spa_config nvlist from being updated. No * other locks are required to obtain this lock, although implicitly you * must have the namespace lock or non-zero refcount to have any kind * of spa_t pointer at all. * * The locking order is fairly straightforward: * - * spa_namespace_lock -> spa_refcount + * spa_namespace_lock -> spa_refcount * - * The namespace lock must be acquired to increase the refcount from 0 - * or to check if it is zero. + * The namespace lock must be acquired to increase the refcount from 0 + * or to check if it is zero. * - * spa_refcount -> spa_config_lock + * spa_refcount -> spa_config_lock * - * There must be at least one valid reference on the spa_t to acquire - * the config lock. + * There must be at least one valid reference on the spa_t to acquire + * the config lock. * - * spa_namespace_lock -> spa_config_lock + * spa_namespace_lock -> spa_config_lock * - * The namespace lock must always be taken before the config lock. + * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and @@ -114,53 +114,53 @@ * The namespace is manipulated using the following functions, all which require * the spa_namespace_lock to be held. * - * spa_lookup() Lookup a spa_t by name. + * spa_lookup() Lookup a spa_t by name. * - * spa_add() Create a new spa_t in the namespace. + * spa_add() Create a new spa_t in the namespace. * - * spa_remove() Remove a spa_t from the namespace. This also - * frees up any memory associated with the spa_t. + * spa_remove() Remove a spa_t from the namespace. This also + * frees up any memory associated with the spa_t. * - * spa_next() Returns the next spa_t in the system, or the - * first if NULL is passed. + * spa_next() Returns the next spa_t in the system, or the + * first if NULL is passed. * - * spa_evict_all() Shutdown and remove all spa_t structures in - * the system. + * spa_evict_all() Shutdown and remove all spa_t structures in + * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * - * spa_open_ref() Adds a reference to the given spa_t. Must be - * called with spa_namespace_lock held if the - * refcount is currently zero. + * spa_open_ref() Adds a reference to the given spa_t. Must be + * called with spa_namespace_lock held if the + * refcount is currently zero. 
* - * spa_close() Remove a reference from the spa_t. This will - * not free the spa_t or remove it from the - * namespace. No locking is required. + * spa_close() Remove a reference from the spa_t. This will + * not free the spa_t or remove it from the + * namespace. No locking is required. * - * spa_refcount_zero() Returns true if the refcount is currently - * zero. Must be called with spa_namespace_lock - * held. + * spa_refcount_zero() Returns true if the refcount is currently + * zero. Must be called with spa_namespace_lock + * held. * * The spa_config_lock is manipulated using the following functions: * - * spa_config_enter() Acquire the config lock as RW_READER or - * RW_WRITER. At least one reference on the spa_t - * must exist. + * spa_config_enter() Acquire the config lock as RW_READER or + * RW_WRITER. At least one reference on the spa_t + * must exist. * - * spa_config_exit() Release the config lock. + * spa_config_exit() Release the config lock. * - * spa_config_held() Returns true if the config lock is currently - * held in the given state. + * spa_config_held() Returns true if the config lock is currently + * held in the given state. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * - * spa_vdev_enter() Acquire the namespace lock and the config lock + * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * - * spa_vdev_exit() Release the config lock, wait for all I/O - * to complete, sync the updated configs to the + * spa_vdev_exit() Release the config lock, wait for all I/O + * to complete, sync the updated configs to the * cache, and release the namespace lock. * * The spa_name() function also requires either the spa_namespace_lock @@ -173,6 +173,7 @@ static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; static int spa_active_count; +static int spa_max_replication_override = SPA_DVAS_PER_BP; kmem_cache_t *spa_buffer_pool; int spa_mode; @@ -617,8 +618,7 @@ spa_get_random(uint64_t range) void sprintf_blkptr(char *buf, int len, blkptr_t *bp) { - /* XXBP - Need to see if we want all DVAs or not */ - dva_t *dva = BP_IDENTITY(bp); + int d; if (bp == NULL) { (void) snprintf(buf, len, "<NULL>"); @@ -630,20 +630,27 @@ sprintf_blkptr(char *buf, int len, blkptr_t *bp) return; } - (void) snprintf(buf, len, "[L%llu %s] vdev=%llu offset=%llx " - "size=%llxL/%llxP/%llxA %s %s %s %s " - "birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx", + (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ", (u_longlong_t)BP_GET_LEVEL(bp), dmu_ot[BP_GET_TYPE(bp)].ot_name, - (u_longlong_t)DVA_GET_VDEV(dva), - (u_longlong_t)DVA_GET_OFFSET(dva), (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp), - (u_longlong_t)DVA_GET_ASIZE(dva), + (u_longlong_t)BP_GET_PSIZE(bp)); + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + dva_t *dva = &bp->blk_dva[d]; + (void) snprintf(buf + strlen(buf), len - strlen(buf), + "DVA[%d]=<%llu:%llx:%llx> ", d, + (u_longlong_t)DVA_GET_VDEV(dva), + (u_longlong_t)DVA_GET_OFFSET(dva), + (u_longlong_t)DVA_GET_ASIZE(dva)); + } + + (void) snprintf(buf + strlen(buf), len - strlen(buf), + "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx", zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", - DVA_GET_GANG(dva) == 0 ? "contiguous" : "gang", + BP_IS_GANG(bp) ? 
"gang" : "contiguous", (u_longlong_t)bp->blk_birth, (u_longlong_t)bp->blk_fill, (u_longlong_t)bp->blk_cksum.zc_word[0], @@ -796,8 +803,29 @@ spa_get_asize(spa_t *spa, uint64_t lsize) /* * For now, the worst case is 512-byte RAID-Z blocks, in which * case the space requirement is exactly 2x; so just assume that. + * Add to this the fact that we can have up to 3 DVAs per bp, and + * we have to multiply by a total of 6x. */ - return (lsize << 1); + return (lsize * 6); +} + +uint64_t +spa_version(spa_t *spa) +{ + return (spa->spa_ubsync.ub_version); +} + +int +spa_max_replication(spa_t *spa) +{ + /* + * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to + * handle BPs with more than one DVA allocated. Set our max + * replication level accordingly. + */ + if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS) + return (1); + return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } /* diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 1a93d4e4ca..811ac94436 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -75,7 +75,7 @@ int arc_referenced(arc_buf_t *buf); int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, arc_done_func_t *done, void *private, int priority, int flags, uint32_t arc_flags, zbookmark_t *zb); -int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, +int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *done, void *private, int priority, int flags, uint32_t arc_flags, zbookmark_t *zb); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 70a94147b5..78dd9632e6 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -56,6 +56,8 @@ struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; +struct zbookmark; +struct spa; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -263,6 +265,12 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); /* + * Decide how many copies of a given block we should make. Can be from + * 1 to SPA_DVAS_PER_BP. + */ +int dmu_get_replication_level(struct spa *spa, struct zbookmark *zb, + dmu_object_type_t ot); +/* * The bonus data is accessed more or less like a regular buffer. 
* You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index ef2a9a2b89..c72b5ddf16 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -47,7 +47,8 @@ extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); -extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg); +extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, + int ncopies, uint64_t txg, blkptr_t *hintbp); extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index cbe8257953..265d19f63a 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -234,6 +234,16 @@ typedef struct blkptr { (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[2])) +#define BP_GET_NDVAS(bp) \ + (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) + +#define BP_COUNT_GANG(bp) \ + (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ + DVA_GET_GANG(&(bp)->blk_dva[1]) + \ + DVA_GET_GANG(&(bp)->blk_dva[2])) + #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) @@ -248,9 +258,9 @@ typedef struct blkptr { (zcp)->zc_word[3] = w3; \ } -#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) - #define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) +#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) #define BP_ZERO(bp) \ { \ @@ -281,7 +291,7 @@ typedef struct blkptr { #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) -#define BP_SPRINTF_LEN 256 +#define BP_SPRINTF_LEN 320 #include <sys/dmu.h> @@ -297,7 +307,7 @@ extern int spa_create(const char *pool, nvlist_t *config, const char *altroot); extern int spa_import(const char *pool, nvlist_t *config, const char *altroot); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); -extern int spa_export(char *pool); +extern int spa_export(char *pool, nvlist_t **oldconfig); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); @@ -387,6 +397,8 @@ extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa); extern uint64_t spa_get_alloc(spa_t *spa); extern uint64_t spa_get_space(spa_t *spa); extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_version(spa_t *spa); +extern int spa_max_replication(spa_t *spa); extern int spa_busy(void); /* Miscellaneous support routines */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index f9cf5d3354..c8d5db50f5 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -80,6 +80,7 @@ extern void vdev_stat_update(zio_t *zio); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); +extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t 
*vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 268581336a..66c9a910ca 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -34,6 +34,7 @@ #include <sys/avl.h> #include <sys/dkio.h> #include <sys/fs/zfs.h> +#include <sys/zio_impl.h> #ifdef __cplusplus extern "C" { @@ -58,9 +59,8 @@ typedef struct zio_block_tail { (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) -#define ZIO_GET_DVA(zio) (&(zio)->io_bp->blk_dva[(zio)->io_dva_index]) #define ZIO_GET_IOSIZE(zio) \ - (DVA_GET_GANG(ZIO_GET_DVA(zio)) ? \ + (BP_IS_GANG((zio)->io_bp) ? \ SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp)) typedef struct zio_gbh { @@ -152,7 +152,6 @@ enum zio_compress { typedef struct zio zio_t; typedef void zio_done_func_t(zio_t *zio); -typedef struct zio_transform zio_transform_t; extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; extern char *zio_type_name[ZIO_TYPES]; @@ -190,9 +189,9 @@ struct zio { zio_t *io_root; spa_t *io_spa; zbookmark_t io_bookmark; - int io_checksum; - int io_compress; - int io_dva_index; + enum zio_checksum io_checksum; + enum zio_compress io_compress; + int io_ndvas; uint64_t io_txg; blkptr_t *io_bp; blkptr_t io_bp_copy; @@ -225,8 +224,8 @@ struct zio { /* Internal pipeline state */ int io_flags; - uint8_t io_type; - uint8_t io_stage; + enum zio_type io_type; + enum zio_stage io_stage; uint8_t io_stalled; uint8_t io_priority; struct dk_callback io_dk_callback; @@ -257,7 +256,7 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, int priority, int flags, zbookmark_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, + int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, int priority, int flags, zbookmark_t *zb); diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h index e1abf0e49d..d2ddbc34e9 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h @@ -61,9 +61,6 @@ typedef enum zio_stage { ZIO_STAGE_READY, /* RWFCI */ - ZIO_STAGE_DVA_TRANSLATE, /* RW--- */ - - ZIO_STAGE_VDEV_IO_SETUP, /* RW--I */ ZIO_STAGE_VDEV_IO_START, /* RW--I */ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ @@ -88,8 +85,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_READ_DECOMPRESS)) #define ZIO_VDEV_IO_PIPELINE \ - ((1U << ZIO_STAGE_VDEV_IO_SETUP) | \ - (1U << ZIO_STAGE_VDEV_IO_START) | \ + ((1U << ZIO_STAGE_VDEV_IO_START) | \ (1U << ZIO_STAGE_VDEV_IO_DONE) | \ (1U << ZIO_STAGE_VDEV_IO_ASSESS)) @@ -103,8 +99,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_DONE)) #define ZIO_READ_PIPELINE \ - ((1U << ZIO_STAGE_DVA_TRANSLATE) | \ - ZIO_READ_PHYS_PIPELINE) + ZIO_READ_PHYS_PIPELINE #define ZIO_WRITE_PHYS_PIPELINE \ ((1U << ZIO_STAGE_OPEN) | \ @@ -116,8 +111,7 @@ typedef enum zio_stage { (1U << ZIO_STAGE_DONE)) #define ZIO_WRITE_COMMON_PIPELINE \ - ((1U << ZIO_STAGE_DVA_TRANSLATE) | \ - ZIO_WRITE_PHYS_PIPELINE) + ZIO_WRITE_PHYS_PIPELINE #define ZIO_WRITE_PIPELINE \ ((1U << ZIO_STAGE_WRITE_COMPRESS) | \ @@ -193,6 +187,7 @@ typedef enum zio_stage { #define ZIO_ERROR_PIPELINE_MASK \ ZIO_WAIT_FOR_CHILDREN_PIPELINE +typedef struct zio_transform zio_transform_t; struct zio_transform { void *zt_data; uint64_t zt_size; diff --git a/usr/src/uts/common/fs/zfs/vdev.c 
b/usr/src/uts/common/fs/zfs/vdev.c index 4c216b4ee5..7836041872 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -847,31 +847,16 @@ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int c; ASSERT(spa_config_held(spa, RW_WRITER)); - if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_reopen(rvd->vdev_child[c]); - return; - } - - /* only valid for top-level vdevs */ - ASSERT3P(vd, ==, vd->vdev_top); - vdev_close(vd); (void) vdev_open(vd); /* * Reassess root vdev's health. */ - rvd->vdev_state = VDEV_STATE_HEALTHY; - for (c = 0; c < rvd->vdev_children; c++) { - uint64_t state = rvd->vdev_child[c]->vdev_state; - rvd->vdev_state = MIN(rvd->vdev_state, state); - } + vdev_propagate_state(spa->spa_root_vdev); } int @@ -1741,6 +1726,39 @@ vdev_config_clean(vdev_t *vd) list_remove(&spa->spa_dirty_list, vd); } +void +vdev_propagate_state(vdev_t *vd) +{ + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + int degraded = 0, faulted = 0; + int corrupted = 0; + int c; + vdev_t *child; + + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + if (child->vdev_state <= VDEV_STATE_CANT_OPEN) + faulted++; + else if (child->vdev_state == VDEV_STATE_DEGRADED) + degraded++; + + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } + + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a toplevel vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); +} + /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. @@ -1810,36 +1828,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) if (isopen) return; - if (vd->vdev_parent != NULL) { - int c; - int degraded = 0, faulted = 0; - int corrupted = 0; - vdev_t *parent, *child; - - parent = vd->vdev_parent; - for (c = 0; c < parent->vdev_children; c++) { - child = parent->vdev_child[c]; - if (child->vdev_state <= VDEV_STATE_CANT_OPEN) - faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) - degraded++; - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } - - vd->vdev_parent->vdev_ops->vdev_op_state_change( - vd->vdev_parent, faulted, degraded); - - /* - * Root special: if this is a toplevel vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. - */ - if (corrupted && vd == vd->vdev_top) - vdev_set_state(vd->vdev_spa->spa_root_vdev, - B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - } + if (vd->vdev_parent != NULL) + vdev_propagate_state(vd->vdev_parent); } diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index ee5732a59c..d79c38a32e 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -35,25 +35,85 @@ * Virtual device vector for mirroring. 
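The vdev.c refactor above hoists the child-state tally out of vdev_set_state() into the reusable vdev_propagate_state(). The shape of that tally, reduced to a hedged stand-alone sketch (the state values and the final fold here are illustrative; the real code hands the faulted/degraded counts to each vdev's vdev_op_state_change and only special-cases the root):

#include <stdio.h>

enum vstate { V_CANT_OPEN = 0, V_DEGRADED = 1, V_HEALTHY = 2 };

/*
 * Tally children the way vdev_propagate_state() does: anything at or
 * below CANT_OPEN is faulted, DEGRADED is degraded.  The fold at the
 * end uses the root vdev's zero-tolerance policy for the sake of the
 * example; real vdevs supply their own fold.
 */
static enum vstate
fold_children(const enum vstate *child, int n)
{
	int c, faulted = 0, degraded = 0;

	for (c = 0; c < n; c++) {
		if (child[c] <= V_CANT_OPEN)
			faulted++;
		else if (child[c] == V_DEGRADED)
			degraded++;
	}
	if (faulted > 0)
		return (V_CANT_OPEN);
	return (degraded > 0 ? V_DEGRADED : V_HEALTHY);
}

int
main(void)
{
	enum vstate kids[3] = { V_HEALTHY, V_DEGRADED, V_HEALTHY };

	(void) printf("state = %d\n", fold_children(kids, 3));
	return (0);
}

Factoring the tally out is also what lets the simplified vdev_reopen() above call one function instead of open-coding the MIN-of-child-states loop it previously carried.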
*/ +typedef struct mirror_child { + vdev_t *mc_vd; + uint64_t mc_offset; + int mc_error; + short mc_tried; + short mc_skipped; +} mirror_child_t; + typedef struct mirror_map { - int mm_error; - short mm_tried; - short mm_skipped; + int mm_children; + int mm_replacing; + int mm_preferred; + int mm_root; + mirror_child_t mm_child[1]; } mirror_map_t; static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { - zio->io_vsd = kmem_zalloc(zio->io_vd->vdev_children * - sizeof (mirror_map_t), KM_SLEEP); - return (zio->io_vsd); + mirror_map_t *mm = NULL; + mirror_child_t *mc; + vdev_t *vd = zio->io_vd; + int c, d; + + if (vd == NULL) { + dva_t *dva = zio->io_bp->blk_dva; + spa_t *spa = zio->io_spa; + + c = BP_GET_NDVAS(zio->io_bp); + + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm->mm_children = c; + mm->mm_replacing = B_FALSE; + mm->mm_preferred = spa_get_random(c); + mm->mm_root = B_TRUE; + + /* + * Check the other, lower-index DVAs to see if they're on + * the same vdev as the child we picked. If they are, use + * them since they are likely to have been allocated from + * the primary metaslab in use at the time, and hence are + * more likely to have locality with single-copy data. + */ + for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { + if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) + mm->mm_preferred = d; + } + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); + mc->mc_offset = DVA_GET_OFFSET(&dva[c]); + } + } else { + c = vd->vdev_children; + + mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); + mm->mm_children = c; + mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops); + mm->mm_preferred = mm->mm_replacing ? 0 : spa_get_random(c); + mm->mm_root = B_FALSE; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + mc->mc_vd = vd->vdev_child[c]; + mc->mc_offset = zio->io_offset; + } + } + + zio->io_vsd = mm; + return (mm); } static void vdev_mirror_map_free(zio_t *zio) { - kmem_free(zio->io_vsd, - zio->io_vd->vdev_children * sizeof (mirror_map_t)); + mirror_map_t *mm = zio->io_vsd; + + kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); zio->io_vsd = NULL; } @@ -103,30 +163,31 @@ vdev_mirror_close(vdev_t *vd) static void vdev_mirror_child_done(zio_t *zio) { - mirror_map_t *mm = zio->io_private; + mirror_child_t *mc = zio->io_private; - mm->mm_error = zio->io_error; - mm->mm_tried = 1; - mm->mm_skipped = 0; + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; } static void vdev_mirror_scrub_done(zio_t *zio) { - mirror_map_t *mm = zio->io_private; + mirror_child_t *mc = zio->io_private; if (zio->io_error == 0) { zio_t *pio = zio->io_parent; mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); bcopy(zio->io_data, pio->io_data, pio->io_size); mutex_exit(&pio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); - mm->mm_error = zio->io_error; - mm->mm_tried = 1; - mm->mm_skipped = 0; + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; } static void @@ -144,60 +205,42 @@ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - vdev_t *vd = zio->io_vd; - vdev_t *cvd; + mirror_child_t *mc; uint64_t txg = zio->io_txg; int i, c; ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); /* - * Select the child we'd like to read from absent any errors. - * The current policy is to alternate sides at 8M granularity. 
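The new mirror_map_t above uses the classic trailing-array idiom: declare mm_child[1], then size the allocation with offsetof() so one kmem_zalloc() covers the header plus however many children the map needs. A user-space sketch of the same pattern (type names invented for the example; the variable-index offsetof is the common compiler extension the kernel code itself relies on):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct child {
	int	c_error;
} child_t;

typedef struct map {
	int	m_children;
	child_t	m_child[1];	/* really m_children entries long */
} map_t;

/*
 * Size the allocation with offsetof(map_t, m_child[n]), mirroring
 * kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP), so the
 * trailing array holds n entries even though it is declared with one.
 */
static map_t *
map_alloc(int n)
{
	map_t *mm = calloc(1, offsetof(map_t, m_child[n]));

	if (mm != NULL)
		mm->m_children = n;
	return (mm);
}

int
main(void)
{
	map_t *mm = map_alloc(3);

	if (mm == NULL)
		return (1);
	mm->m_child[2].c_error = 5;	/* in bounds: allocation covers 3 */
	(void) printf("%d\n", mm->m_child[2].c_error);
	free(mm);
	return (0);
}

The matching vdev_mirror_map_free() uses the same offsetof() expression, so the allocation and free sizes can never drift apart.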
- * XXX -- investigate other policies for read distribution. - */ - c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children; - - /* - * If this is a replacing vdev, always try child 0 (the source) first. - */ - if (vd->vdev_ops == &vdev_replacing_ops) - c = 0; - - /* * Try to find a child whose DTL doesn't contain the block to read. * If a child is known to be completely inaccessible (indicated by * vdev_is_dead() returning B_TRUE), don't even try. */ - for (i = 0; i < vd->vdev_children; i++, c++) { - if (c >= vd->vdev_children) + for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { + if (c >= mm->mm_children) c = 0; - if (mm[c].mm_tried || mm[c].mm_skipped) + mc = &mm->mm_child[c]; + if (mc->mc_tried || mc->mc_skipped) continue; - cvd = vd->vdev_child[c]; - if (vdev_is_dead(cvd)) { - mm[c].mm_error = ENXIO; - mm[c].mm_tried = 1; /* don't even try */ - mm[c].mm_skipped = 1; + if (vdev_is_dead(mc->mc_vd)) { + mc->mc_error = ENXIO; + mc->mc_tried = 1; /* don't even try */ + mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1)) + if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) return (c); - mm[c].mm_error = ESTALE; - mm[c].mm_skipped = 1; + mc->mc_error = ESTALE; + mc->mc_skipped = 1; } /* * Every device is either missing or has this txg in its DTL. - * If we don't have any sibling replicas to consult, look for - * any child we haven't already tried before giving up. + * Look for any child we haven't already tried before giving up. */ - if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) { - for (c = 0; c < vd->vdev_children; c++) { - if (!mm[c].mm_tried) - return (c); - } - } + for (c = 0; c < mm->mm_children; c++) + if (!mm->mm_child[c].mc_tried) + return (c); /* * Every child failed. There's no place left to look. @@ -208,28 +251,28 @@ vdev_mirror_child_select(zio_t *zio) static void vdev_mirror_io_start(zio_t *zio) { - vdev_t *vd = zio->io_vd; mirror_map_t *mm; + mirror_child_t *mc; int c, children; mm = vdev_mirror_map_alloc(zio); if (zio->io_type == ZIO_TYPE_READ) { - if ((zio->io_flags & ZIO_FLAG_SCRUB) && - vd->vdev_ops != &vdev_replacing_ops) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { /* * For scrubbing reads we need to allocate a read * buffer for each child and issue reads to all * children. If any child succeeds, it will copy its * data into zio->io_data in vdev_mirror_scrub_done. */ - for (c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - vd->vdev_child[c], zio->io_offset, + mc->mc_vd, mc->mc_offset, zio_buf_alloc(zio->io_size), zio->io_size, zio->io_type, zio->io_priority, - ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done, - &mm[c])); + ZIO_FLAG_CANFAIL, + vdev_mirror_scrub_done, mc)); } zio_wait_children_done(zio); return; @@ -248,23 +291,23 @@ vdev_mirror_io_start(zio_t *zio) * first child happens to have a DTL entry here as well. * All other writes go to all children. 
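The rewritten vdev_mirror_child_select() above replaces the old offset-based policy with a ring walk: start at mm_preferred, visit every child once with wraparound, and skip anything already tried or skipped. Its scan shape, isolated as a sketch (usable[] stands in for the vdev_is_dead() and DTL checks):

#include <stdio.h>

/*
 * Walk all children once, starting at the preferred index and
 * wrapping, returning the first usable child or -1 if every child
 * has been exhausted.
 */
static int
select_child(int preferred, int children, const int *usable)
{
	int i, c;

	for (i = 0, c = preferred; i < children; i++, c++) {
		if (c >= children)
			c = 0;
		if (usable[c])
			return (c);
	}
	return (-1);
}

int
main(void)
{
	int usable[4] = { 0, 1, 0, 1 };

	(void) printf("%d\n", select_child(2, 4, usable));	/* 3 */
	return (0);
}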
*/ - if ((zio->io_flags & ZIO_FLAG_RESILVER) && - vd->vdev_ops == &vdev_replacing_ops && - !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map, + if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && + !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, zio->io_txg, 1)) { - c = vd->vdev_children - 1; + c = mm->mm_children - 1; children = 1; } else { c = 0; - children = vd->vdev_children; + children = mm->mm_children; } } while (children--) { + mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - vd->vdev_child[c], zio->io_offset, zio->io_data, - zio->io_size, zio->io_type, zio->io_priority, - ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c])); + mc->mc_vd, mc->mc_offset, + zio->io_data, zio->io_size, zio->io_type, zio->io_priority, + ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc)); c++; } @@ -274,20 +317,19 @@ vdev_mirror_io_start(zio_t *zio) static void vdev_mirror_io_done(zio_t *zio) { - vdev_t *vd = zio->io_vd; - vdev_t *cvd; mirror_map_t *mm = zio->io_vsd; + mirror_child_t *mc; int c; int good_copies = 0; int unexpected_errors = 0; - ASSERT(mm != NULL); - zio->io_error = 0; zio->io_numerrors = 0; - for (c = 0; c < vd->vdev_children; c++) { - if (mm[c].mm_tried && mm[c].mm_error == 0) { + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + if (mc->mc_tried && mc->mc_error == 0) { good_copies++; continue; } @@ -296,10 +338,10 @@ vdev_mirror_io_done(zio_t *zio) * We preserve any EIOs because those may be worth retrying; * whereas ECKSUM and ENXIO are more likely to be persistent. */ - if (mm[c].mm_error) { + if (mc->mc_error) { if (zio->io_error != EIO) - zio->io_error = mm[c].mm_error; - if (!mm[c].mm_skipped) + zio->io_error = mc->mc_error; + if (!mc->mc_skipped) unexpected_errors++; zio->io_numerrors++; } @@ -308,11 +350,12 @@ vdev_mirror_io_done(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as success. + * XXX -- For a replacing vdev, we need to make sure the + * new child succeeds. */ /* XXPOLICY */ if (good_copies != 0) zio->io_error = 0; - ASSERT(mm != NULL); vdev_mirror_map_free(zio); zio_next_stage(zio); return; @@ -325,17 +368,16 @@ vdev_mirror_io_done(zio_t *zio) */ /* XXPOLICY */ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { - ASSERT(c >= 0 && c < vd->vdev_children); - cvd = vd->vdev_child[c]; - dprintf("%s: retrying i/o (err=%d) on child %s\n", - vdev_description(zio->io_vd), zio->io_error, - vdev_description(cvd)); + ASSERT(c >= 0 && c < mm->mm_children); + mc = &mm->mm_child[c]; + dprintf("retrying i/o (err=%d) on child %s\n", + zio->io_error, vdev_description(mc->mc_vd)); zio->io_error = 0; zio_vdev_io_redone(zio); - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd, - zio->io_offset, zio->io_data, zio->io_size, + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, - vdev_mirror_child_done, &mm[c])); + vdev_mirror_child_done, mc)); zio_wait_children_done(zio); return; } @@ -360,7 +402,7 @@ vdev_mirror_io_done(zio_t *zio) rio = zio_null(zio, zio->io_spa, vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL); - for (c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < mm->mm_children; c++) { /* * Don't rewrite known good children. * Not only is it unnecessary, it could @@ -368,24 +410,23 @@ vdev_mirror_io_done(zio_t *zio) * power while rewriting the only good copy, * there would be no good copies left! 
*/ - cvd = vd->vdev_child[c]; + mc = &mm->mm_child[c]; - if (mm[c].mm_error == 0) { - if (mm[c].mm_tried) + if (mc->mc_error == 0) { + if (mc->mc_tried) continue; - if (!vdev_dtl_contains(&cvd->vdev_dtl_map, + if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, zio->io_txg, 1)) continue; - mm[c].mm_error = ESTALE; + mc->mc_error = ESTALE; } - dprintf("%s resilvered %s @ 0x%llx error %d\n", - vdev_description(vd), - vdev_description(cvd), - zio->io_offset, mm[c].mm_error); + dprintf("resilvered %s @ 0x%llx error %d\n", + vdev_description(mc->mc_vd), mc->mc_offset, + mc->mc_error); - zio_nowait(zio_vdev_child_io(rio, zio->io_bp, cvd, - zio->io_offset, zio->io_data, zio->io_size, + zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd, + mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 6e69053b8a..33225de39b 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -272,12 +272,7 @@ vdev_raidz_io_start(zio_t *zio) rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children); - if (DVA_GET_GANG(ZIO_GET_DVA(zio))) { - ASSERT3U(rm->rm_asize, ==, - vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); - } else { - ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio))); - } + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { @@ -357,11 +352,10 @@ vdev_raidz_io_done(zio_t *zio) vdev_t *cvd; raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; - blkptr_t *bp = zio->io_bp; int unexpected_errors = 0; int c; - ASSERT(bp != NULL); /* XXX need to add code to enforce this */ + ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ zio->io_error = 0; zio->io_numerrors = 0; diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c index 9ffdc8fba1..0e8752c6ce 100644 --- a/usr/src/uts/common/fs/zfs/vdev_root.c +++ b/usr/src/uts/common/fs/zfs/vdev_root.c @@ -35,12 +35,29 @@ * Virtual device vector for the pool's root vdev. */ +/* + * We should be able to tolerate one failure with absolutely no damage + * to our metadata. Two failures will take out space maps, a bunch of + * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy + * place to live. When we get smarter, we can liberalize this policy. + * e.g. If we haven't lost two consecutive top-level vdevs, then we are + * probably fine. Adding bean counters during alloc/free can make this + * future guesswork more accurate. 
+ */ +/*ARGSUSED*/ +static int +too_many_errors(vdev_t *vd, int numerrors) +{ + return (numerrors > 0); +} + static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { vdev_t *cvd; int c, error; int lasterror = 0; + int numerrors = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; @@ -52,17 +69,20 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) if ((error = vdev_open(cvd)) != 0) { lasterror = error; + numerrors++; continue; } } - if (lasterror) + if (too_many_errors(vd, numerrors)) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } *asize = 0; *ashift = 0; - return (lasterror); + return (0); } static void @@ -77,7 +97,7 @@ vdev_root_close(vdev_t *vd) static void vdev_root_state_change(vdev_t *vd, int faulted, int degraded) { - if (faulted > 0) + if (too_many_errors(vd, faulted)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded != 0) diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index cd7e79a8be..0cff445cf3 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -392,7 +392,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) static int zfs_ioc_pool_export(zfs_cmd_t *zc) { - return (spa_export(zc->zc_name)); + return (spa_export(zc->zc_name, NULL)); } static int diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 050db0ff34..373d0c41d0 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -248,8 +248,6 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_bp = bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; - /* XXBP - Need to inherit this when it matters */ - zio->io_dva_index = 0; } zio->io_done = done; zio->io_private = private; @@ -279,6 +277,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, if (pio->io_child != NULL) pio->io_child->io_sibling_prev = zio; pio->io_child = zio; + zio->io_ndvas = pio->io_ndvas; mutex_exit(&pio->io_lock); } @@ -310,7 +309,6 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, int priority, int flags, zbookmark_t *zb) { zio_t *zio; - dva_t *dva; ASSERT3U(size, ==, BP_GET_LSIZE(bp)); @@ -325,9 +323,6 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, */ zio->io_bp = &zio->io_bp_copy; - bp = zio->io_bp; - dva = ZIO_GET_DVA(zio); - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { uint64_t csize = BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(csize); @@ -336,7 +331,7 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; } - if (DVA_GET_GANG(dva)) { + if (BP_IS_GANG(bp)) { uint64_t gsize = SPA_GANGBLOCKSIZE; void *gbuf = zio_buf_alloc(gsize); @@ -348,7 +343,7 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, } zio_t * -zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, +zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, int priority, int flags, zbookmark_t *zb) @@ -371,6 +366,7 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, zio->io_checksum = checksum; zio->io_compress = compress; + zio->io_ndvas = ncopies; if (compress != ZIO_COMPRESS_OFF) zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; @@ -380,6 +376,10 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, BP_ZERO(bp); BP_SET_LSIZE(bp, size); BP_SET_PSIZE(bp, 
size); + } else { + /* Make sure someone doesn't change their mind on overwrites */ + ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), + spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } return (zio); @@ -393,7 +393,6 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum, { zio_t *zio; - /* XXBP - We need to re-evaluate when to insert pipeline stages */ zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); @@ -402,6 +401,9 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum, zio->io_checksum = checksum; zio->io_compress = ZIO_COMPRESS_OFF; + if (pio != NULL) + ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); + return (zio); } @@ -441,7 +443,6 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, return (zio_null(pio, spa, NULL, NULL, 0)); } - /* XXBP - We need to re-evaluate when to insert pipeline stages */ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); @@ -471,7 +472,6 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT3U(spa_first_txg(spa), <=, txg); - /* XXBP - We need to re-evaluate when to insert pipeline stages */ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); @@ -623,7 +623,7 @@ zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, done, private, type, priority, (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, - ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline); + ZIO_STAGE_VDEV_IO_START - 1, pipeline); cio->io_vd = vd; cio->io_offset = offset; @@ -748,8 +748,13 @@ zio_done(zio_t *zio) ASSERT(bp->blk_pad[2] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) + !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(bp)); + if (zio->io_ndvas != 0) + ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); + ASSERT(BP_COUNT_GANG(bp) == 0 || + (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); + } } if (vd != NULL) @@ -902,6 +907,7 @@ zio_write_compress(zio_t *zio) BP_ZERO(bp); zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; } else { + ASSERT3U(BP_GET_NDVAS(bp), ==, 0); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, csize); BP_SET_COMPRESS(bp, compress); @@ -946,7 +952,7 @@ zio_gang_pipeline(zio_t *zio) * By default, the pipeline assumes that we're dealing with a gang * block. If we're not, strip out any gang-specific stages. 
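Two invariants above are worth spelling out. The overwrite assert in zio_write() says a non-hole bp must already hold MIN(ncopies + gang, spa_max_replication()) DVAs, because a gang block spends one extra copy on its header; and zio_done() checks that gang-ness is all-or-nothing across a bp's DVAs. Both as plain arithmetic (function names here are illustrative, not from the patch):

#include <stdio.h>

#define	SPA_DVAS_PER_BP	3

static int
min_int(int a, int b)
{
	return (a < b ? a : b);
}

/* DVAs a write should end with: gang headers cost one extra copy. */
static int
expected_ndvas(int ncopies, int is_gang, int max_replication)
{
	return (min_int(ncopies + is_gang, max_replication));
}

/* zio_done()'s check: no DVA is a gang header, or every one is. */
static int
gang_all_or_nothing(const int *is_gang, int ndvas)
{
	int d, count = 0;

	for (d = 0; d < ndvas; d++)
		count += is_gang[d];
	return (count == 0 || count == ndvas);
}

int
main(void)
{
	int mixed[3] = { 1, 0, 1 };

	(void) printf("%d\n", expected_ndvas(2, 1, SPA_DVAS_PER_BP)); /* 3 */
	(void) printf("%d\n", gang_all_or_nothing(mixed, 3));	/* 0 */
	return (0);
}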
*/ - if (!DVA_GET_GANG(ZIO_GET_DVA(zio))) + if (!BP_IS_GANG(zio->io_bp)) zio->io_pipeline &= ~ZIO_GANG_STAGES; zio_next_stage(zio); @@ -968,7 +974,7 @@ zio_get_gang_header(zio_t *zio) uint64_t gsize = SPA_GANGBLOCKSIZE; void *gbuf = zio_buf_alloc(gsize); - ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + ASSERT(BP_IS_GANG(bp)); zio_push_transform(zio, gbuf, gsize, gsize); @@ -987,7 +993,7 @@ zio_read_gang_members(zio_t *zio) uint64_t gsize, gbufsize, loff, lsize; int i; - ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + ASSERT(BP_IS_GANG(zio->io_bp)); zio_gang_byteswap(zio); zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); @@ -1019,7 +1025,7 @@ zio_rewrite_gang_members(zio_t *zio) uint64_t gsize, gbufsize, loff, lsize; int i; - ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + ASSERT(BP_IS_GANG(zio->io_bp)); ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); zio_gang_byteswap(zio); @@ -1054,7 +1060,7 @@ zio_free_gang_members(zio_t *zio) uint64_t gsize, gbufsize; int i; - ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + ASSERT(BP_IS_GANG(zio->io_bp)); zio_gang_byteswap(zio); zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); @@ -1079,7 +1085,7 @@ zio_claim_gang_members(zio_t *zio) uint64_t gsize, gbufsize; int i; - ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); + ASSERT(BP_IS_GANG(zio->io_bp)); zio_gang_byteswap(zio); zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); @@ -1100,17 +1106,23 @@ static void zio_write_allocate_gang_member_done(zio_t *zio) { zio_t *pio = zio->io_parent; - dva_t *cdva = ZIO_GET_DVA(zio); - dva_t *pdva = ZIO_GET_DVA(pio); + dva_t *cdva = zio->io_bp->blk_dva; + dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; + int d; - ASSERT(DVA_GET_GANG(pdva)); + ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); + ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); + ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); - /* XXBP - Need to be careful here with multiple DVAs */ mutex_enter(&pio->io_lock); - asize = DVA_GET_ASIZE(pdva); - asize += DVA_GET_ASIZE(cdva); - DVA_SET_ASIZE(pdva, asize); + for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { + ASSERT(DVA_GET_GANG(&pdva[d])); + asize = DVA_GET_ASIZE(&pdva[d]); + asize += DVA_GET_ASIZE(&cdva[d]); + DVA_SET_ASIZE(&pdva[d], asize); + } mutex_exit(&pio->io_lock); } @@ -1118,41 +1130,50 @@ static void zio_write_allocate_gang_members(zio_t *zio) { blkptr_t *bp = zio->io_bp; - dva_t *dva = ZIO_GET_DVA(zio); + dva_t *dva = bp->blk_dva; + spa_t *spa = zio->io_spa; zio_gbh_phys_t *gbh; + uint64_t txg = zio->io_txg; uint64_t resid = zio->io_size; uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); uint64_t gsize, loff, lsize; uint32_t gbps_left; + int ndvas = zio->io_ndvas; + int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); int error; - int i; + int i, d; gsize = SPA_GANGBLOCKSIZE; gbps_left = SPA_GBH_NBLKPTRS; - error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg); + error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL); if (error == ENOSPC) panic("can't allocate gang block header"); ASSERT(error == 0); - DVA_SET_GANG(dva, 1); + for (d = 0; d < gbh_ndvas; d++) + DVA_SET_GANG(&dva[d], 1); - bp->blk_birth = zio->io_txg; + bp->blk_birth = txg; gbh = zio_buf_alloc(gsize); bzero(gbh, gsize); + /* We need to test multi-level gang blocks */ + if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) + maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); + for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, resid -= lsize, gbps_left--, i++) { blkptr_t *gbp 
= &gbh->zg_blkptr[i]; - dva = &gbp->blk_dva[0]; + dva = gbp->blk_dva; ASSERT(gbps_left != 0); maxalloc = MIN(maxalloc, resid); while (resid <= maxalloc * gbps_left) { - error = metaslab_alloc(zio->io_spa, maxalloc, dva, - zio->io_txg); + error = metaslab_alloc(spa, maxalloc, gbp, ndvas, + txg, bp); if (error == 0) break; ASSERT3U(error, ==, ENOSPC); @@ -1166,9 +1187,9 @@ zio_write_allocate_gang_members(zio_t *zio) BP_SET_LSIZE(gbp, lsize); BP_SET_PSIZE(gbp, lsize); BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); - gbp->blk_birth = zio->io_txg; - zio_nowait(zio_rewrite(zio, zio->io_spa, - zio->io_checksum, zio->io_txg, gbp, + gbp->blk_birth = txg; + zio_nowait(zio_rewrite(zio, spa, + zio->io_checksum, txg, gbp, (char *)zio->io_data + loff, lsize, zio_write_allocate_gang_member_done, NULL, zio->io_priority, zio->io_flags, @@ -1176,8 +1197,8 @@ zio_write_allocate_gang_members(zio_t *zio) } else { lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); ASSERT(lsize != SPA_MINBLOCKSIZE); - zio_nowait(zio_write_allocate(zio, zio->io_spa, - zio->io_checksum, zio->io_txg, gbp, + zio_nowait(zio_write_allocate(zio, spa, + zio->io_checksum, txg, gbp, (char *)zio->io_data + loff, lsize, zio_write_allocate_gang_member_done, NULL, zio->io_priority, zio->io_flags)); @@ -1189,6 +1210,12 @@ zio_write_allocate_gang_members(zio_t *zio) zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; zio_push_transform(zio, gbh, gsize, gsize); + /* + * As much as we'd like this to be zio_wait_children_ready(), + * updating our ASIZE doesn't happen until the io_done callback, + * so we have to wait for that to finish in order for our BP + * to be stable. + */ zio_wait_children_done(zio); } @@ -1201,10 +1228,12 @@ static void zio_dva_allocate(zio_t *zio) { blkptr_t *bp = zio->io_bp; - dva_t *dva = ZIO_GET_DVA(zio); int error; ASSERT(BP_IS_HOLE(bp)); + ASSERT3U(BP_GET_NDVAS(bp), ==, 0); + ASSERT3U(zio->io_ndvas, >, 0); + ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); /* For testing, make some blocks above a certain size be gang blocks */ if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { @@ -1214,7 +1243,8 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg); + error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, + zio->io_txg, NULL); if (error == 0) { bp->blk_birth = zio->io_txg; @@ -1233,11 +1263,13 @@ static void zio_dva_free(zio_t *zio) { blkptr_t *bp = zio->io_bp; - dva_t *dva = ZIO_GET_DVA(zio); + dva_t *dva = bp->blk_dva; + int d; ASSERT(!BP_IS_HOLE(bp)); - metaslab_free(zio->io_spa, dva, zio->io_txg, B_FALSE); + for (d = 0; d < BP_GET_NDVAS(bp); d++) + metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE); BP_ZERO(bp); @@ -1248,31 +1280,17 @@ static void zio_dva_claim(zio_t *zio) { blkptr_t *bp = zio->io_bp; - dva_t *dva = ZIO_GET_DVA(zio); + dva_t *dva = bp->blk_dva; + int error = 0; + int d; ASSERT(!BP_IS_HOLE(bp)); - zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg); - - zio_next_stage(zio); -} - -static void -zio_dva_translate(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - dva_t *dva = ZIO_GET_DVA(zio); - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - - ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio)); - - zio->io_offset = offset; - - if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL) - zio->io_error = ENXIO; - else if (offset + zio->io_size > zio->io_vd->vdev_asize) - zio->io_error = EOVERFLOW; + for (d = 0; d < BP_GET_NDVAS(bp); 
d++) { + error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg); + if (error) + zio->io_error = error; + } zio_next_stage(zio); } @@ -1284,17 +1302,26 @@ zio_dva_translate(zio_t *zio) */ static void -zio_vdev_io_setup(zio_t *zio) +zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - uint64_t align = 1ULL << tvd->vdev_ashift; + vdev_t *tvd = vd ? vd->vdev_top : NULL; + blkptr_t *bp = zio->io_bp; + uint64_t align; + + if (vd == NULL) { + /* The mirror_ops handle multiple DVAs in a single BP */ + vdev_mirror_ops.vdev_op_io_start(zio); + return; + } + + align = 1ULL << tvd->vdev_ashift; - /* XXPOLICY */ if (zio->io_retries == 0 && vd == tvd) zio->io_flags |= ZIO_FLAG_FAILFAST; - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && + vd->vdev_children == 0) { zio->io_flags |= ZIO_FLAG_PHYSICAL; zio->io_offset += VDEV_LABEL_START_SIZE; } @@ -1312,15 +1339,6 @@ zio_vdev_io_setup(zio_t *zio) zio->io_flags |= ZIO_FLAG_SUBBLOCK; } - zio_next_stage(zio); -} - -static void -zio_vdev_io_start(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - uint64_t align = 1ULL << zio->io_vd->vdev_top->vdev_ashift; - ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); ASSERT(bp == NULL || @@ -1335,7 +1353,11 @@ zio_vdev_io_start(zio_t *zio) static void zio_vdev_io_done(zio_t *zio) { - vdev_io_done(zio); + if (zio->io_vd == NULL) + /* The mirror_ops handle multiple DVAs in a single BP */ + vdev_mirror_ops.vdev_op_io_done(zio); + else + vdev_io_done(zio); } /* XXPOLICY */ @@ -1348,7 +1370,7 @@ zio_should_retry(zio_t *zio) return (B_FALSE); if (zio->io_delegate_list != NULL) return (B_FALSE); - if (vd != vd->vdev_top) + if (vd && vd != vd->vdev_top) return (B_FALSE); if (zio->io_flags & ZIO_FLAG_DONT_RETRY) return (B_FALSE); @@ -1362,7 +1384,7 @@ static void zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; + vdev_t *tvd = vd ? vd->vdev_top : NULL; ASSERT(zio->io_vsd == NULL); @@ -1394,7 +1416,7 @@ zio_vdev_io_assess(zio_t *zio) /* XXPOLICY */ zio->io_flags &= ~ZIO_FLAG_FAILFAST; zio->io_flags |= ZIO_FLAG_DONT_CACHE; - zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1; + zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; dprintf("retry #%d for %s to %s offset %llx\n", zio->io_retries, zio_type_name[zio->io_type], @@ -1404,8 +1426,8 @@ zio_vdev_io_assess(zio_t *zio) return; } - if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && - zio->io_error != ECKSUM) { + if (zio->io_error != 0 && zio->io_error != ECKSUM && + !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { /* * Poor man's hotplug support. Even if we're done retrying this * I/O, try to reopen the vdev to see if it's still attached. 
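With ZIO_STAGE_DVA_TRANSLATE gone, the change above makes zio_vdev_io_start() and zio_vdev_io_done() dispatch on whether the zio carries a vdev at all: a vdev-less zio is a whole-bp I/O, and its DVAs are handed straight to vdev_mirror_ops, which treats the ditto copies as mirror children. The dispatch shape as a sketch, with the types pared down to just the branch:

#include <stdio.h>

typedef struct io {
	const void	*io_vd;		/* NULL for a whole-bp I/O */
} io_t;

static void
mirror_io_start(io_t *io)
{
	(void) io;
	(void) puts("mirror ops: one child per DVA");
}

static void
leaf_io_start(io_t *io)
{
	(void) io;
	(void) puts("leaf vdev I/O");
}

/* The branch at the top of the new zio_vdev_io_start(). */
static void
io_start(io_t *io)
{
	if (io->io_vd == NULL)
		mirror_io_start(io);	/* ditto copies look like a mirror */
	else
		leaf_io_start(io);
}

int
main(void)
{
	io_t a = { NULL }, b = { &a };

	io_start(&a);
	io_start(&b);
	return (0);
}

This reuse is what lets the mirror code earlier in the patch serve double duty: real mirror vdevs and multi-DVA block pointers travel one path.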
@@ -1480,8 +1502,8 @@ zio_gang_checksum_generate(zio_t *zio) zio_cksum_t zc; zio_gbh_phys_t *gbh = zio->io_data; + ASSERT(BP_IS_GANG(zio->io_bp)); ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); - ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); @@ -1518,9 +1540,11 @@ zio_checksum_verified(zio_t *zio) void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) { - zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio)); - zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio)); - zcp->zc_word[2] = zio->io_bp->blk_birth; + blkptr_t *bp = zio->io_bp; + + zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); + zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); + zcp->zc_word[2] = bp->blk_birth; zcp->zc_word[3] = 0; } @@ -1552,8 +1576,6 @@ zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { zio_dva_claim, zio_gang_checksum_generate, zio_ready, - zio_dva_translate, - zio_vdev_io_setup, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, @@ -1656,7 +1678,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, BP_ZERO(bp); - error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg); + error = metaslab_alloc(spa, size, bp, 1, txg, NULL); if (error == 0) { BP_SET_CHECKSUM(bp, checksum); @@ -1681,7 +1703,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) { - ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0); + ASSERT(!BP_IS_GANG(bp)); dprintf_bp(bp, "txg %llu: ", txg); diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index ca65f831a3..30369227b5 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -122,9 +122,8 @@ int zio_checksum_error(zio_t *zio) { blkptr_t *bp = zio->io_bp; - dva_t *dva = ZIO_GET_DVA(zio); zio_cksum_t zc = bp->blk_cksum; - uint_t checksum = DVA_GET_GANG(dva) ? ZIO_CHECKSUM_GANG_HEADER : + uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp); int byteswap = BP_SHOULD_BYTESWAP(bp); void *data = zio->io_data; @@ -159,7 +158,7 @@ zio_checksum_error(zio_t *zio) } zc = expected_cksum; } else { - ASSERT(!DVA_GET_GANG(dva)); + ASSERT(!BP_IS_GANG(bp)); ci->ci_func[byteswap](data, size, &actual_cksum); } diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index f1a331051d..5aaca0662b 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -109,7 +109,23 @@ uint64_t zfs_prop_default_numeric(zfs_prop_t); /* * On-disk format version. */ -#define ZFS_VERSION 1ULL +#define ZFS_VERSION_1 1ULL +#define ZFS_VERSION_2 2ULL +#define ZFS_VERSION ZFS_VERSION_2 + +/* + * Symbolic names for the changes that caused a ZFS_VERSION switch. + * Used in the code when checking for presence or absence of a feature. + * Feel free to define multiple symbolic names for each version if there + * were multiple changes to on-disk structures during that version. + * + * NOTE: When checking the current ZFS_VERSION in your code, be sure + * to use spa_version() since it reports the version of the + * last synced uberblock. Checking the in-flight version can + * be dangerous in some cases. + */ +#define ZFS_VERSION_INITIAL ZFS_VERSION_1 +#define ZFS_VERSION_DITTO_BLOCKS ZFS_VERSION_2 /* * The following are configuration names used in the nvlist describing a pool's |
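The NOTE in the new zfs.h block above is the load-bearing part of the version plumbing: feature checks must consult spa_version(), which reports the last synced uberblock, never the in-flight version. A hedged sketch of the gate spa_max_replication() builds on these constants (the real function additionally honors the spa_max_replication_override tunable, omitted here):

#include <stdint.h>
#include <stdio.h>

#define	ZFS_VERSION_1			1ULL
#define	ZFS_VERSION_2			2ULL
#define	ZFS_VERSION_DITTO_BLOCKS	ZFS_VERSION_2
#define	SPA_DVAS_PER_BP			3

/*
 * Pools synced before the ditto-blocks version cannot read multi-DVA
 * block pointers, so clamp replication to a single copy there.
 */
static int
max_replication(uint64_t synced_version)
{
	if (synced_version < ZFS_VERSION_DITTO_BLOCKS)
		return (1);
	return (SPA_DVAS_PER_BP);
}

int
main(void)
{
	(void) printf("v1: %d copy\n", max_replication(ZFS_VERSION_1));
	(void) printf("v2: %d copies\n", max_replication(ZFS_VERSION_2));
	return (0);
}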