summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Keith M Wesolowski <wesolows@foobazco.org>, 2014-06-06 18:08:43 +0000
committer: Keith M Wesolowski <wesolows@foobazco.org>, 2014-06-06 18:08:43 +0000
commit: 63162cf70473b17134d09687effc3da3d4f75588 (patch)
tree: d22ddd687f7e324b77ec7d197c146a58519d75fd
parent: 947d419922a14b9fce40294b5c0d6470a768c105 (diff)
parent: 06315b795c0d54f0228e0b8af497a28752dd92da (diff)
download: illumos-joyent-63162cf70473b17134d09687effc3da3d4f75588.tar.gz
[illumos-gate merge]
commit 06315b795c0d54f0228e0b8af497a28752dd92da
    4881 zfs send performance degradation when embedded block pointers are encountered
commit 7fd05ac4dec0c343d2f68f310d3718b715ecfbaf
    4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
commit 5d7b4d438c4a51eccc95e77a83a437b4d48380eb
    4757 ZFS embedded-data block pointers ("zero block compression")
    4913 zfs release should not be subject to space checks

Conflicts:
    usr/src/man/man1m/zfs.1m
-rw-r--r-- usr/src/cmd/mdb/common/mdb/mdb_ctf.c | 4
-rw-r--r-- usr/src/cmd/mdb/common/modules/zfs/zfs.c | 1
-rw-r--r-- usr/src/cmd/truss/expound.c | 8
-rw-r--r-- usr/src/cmd/zdb/zdb.c | 69
-rw-r--r-- usr/src/cmd/zfs/zfs_main.c | 16
-rw-r--r-- usr/src/cmd/zstreamdump/zstreamdump.c | 55
-rw-r--r-- usr/src/cmd/ztest/ztest.c | 48
-rw-r--r-- usr/src/common/zfs/zfeature_common.c | 5
-rw-r--r-- usr/src/common/zfs/zfeature_common.h | 7
-rw-r--r-- usr/src/common/zfs/zpool_prop.c | 2
-rw-r--r-- usr/src/grub/capability | 2
-rw-r--r-- usr/src/grub/grub-0.97/stage2/fsys_zfs.c | 90
-rw-r--r-- usr/src/grub/grub-0.97/stage2/zfs-include/spa.h | 27
-rw-r--r-- usr/src/lib/libzfs/common/libzfs.h | 6
-rw-r--r-- usr/src/lib/libzfs/common/libzfs_pool.c | 1
-rw-r--r-- usr/src/lib/libzfs/common/libzfs_sendrecv.c | 46
-rw-r--r-- usr/src/lib/libzfs_core/common/libzfs_core.c | 12
-rw-r--r-- usr/src/lib/libzfs_core/common/libzfs_core.h | 6
-rw-r--r-- usr/src/lib/libzpool/common/llib-lzpool | 1
-rw-r--r-- usr/src/man/man1m/zfs.1m | 41
-rw-r--r-- usr/src/man/man5/zpool-features.5 | 27
-rw-r--r-- usr/src/uts/common/Makefile.files | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/arc.c | 93
-rw-r--r-- usr/src/uts/common/fs/zfs/blkptr.c | 119
-rw-r--r-- usr/src/uts/common/fs/zfs/bpobj.c | 41
-rw-r--r-- usr/src/uts/common/fs/zfs/bptree.c | 101
-rw-r--r-- usr/src/uts/common/fs/zfs/dbuf.c | 61
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu.c | 85
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_objset.c | 32
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_send.c | 218
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_traverse.c | 155
-rw-r--r-- usr/src/uts/common/fs/zfs/dnode.c | 4
-rw-r--r-- usr/src/uts/common/fs/zfs/dnode_sync.c | 8
-rw-r--r-- usr/src/uts/common/fs/zfs/dsl_dataset.c | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/dsl_destroy.c | 5
-rw-r--r-- usr/src/uts/common/fs/zfs/dsl_pool.c | 9
-rw-r--r-- usr/src/uts/common/fs/zfs/dsl_scan.c | 153
-rw-r--r-- usr/src/uts/common/fs/zfs/dsl_userhold.c | 3
-rw-r--r-- usr/src/uts/common/fs/zfs/spa.c | 26
-rw-r--r-- usr/src/uts/common/fs/zfs/spa_misc.c | 42
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/blkptr.h | 38
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/bptree.h | 3
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dbuf.h | 3
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu.h | 14
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu_impl.h | 3
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu_send.h | 6
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dsl_dir.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dsl_pool.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dsl_scan.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa.h | 179
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa_impl.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zfs_debug.h | 3
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 33
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zio.h | 12
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_debug.c | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_ioctl.c | 11
-rw-r--r-- usr/src/uts/common/fs/zfs/zil.c | 9
-rw-r--r-- usr/src/uts/common/fs/zfs/zio.c | 92
-rw-r--r-- usr/src/uts/common/fs/zfs/zio_compress.c | 19
-rw-r--r-- usr/src/uts/common/fs/zfs/zvol.c | 2
-rw-r--r-- usr/src/uts/common/sys/fs/zfs.h | 1
-rw-r--r-- usr/src/uts/common/sys/sdt.h | 2
62 files changed, 1618 insertions, 450 deletions
diff --git a/usr/src/cmd/mdb/common/mdb/mdb_ctf.c b/usr/src/cmd/mdb/common/mdb/mdb_ctf.c
index 1f505ef8f1..1ae0952619 100644
--- a/usr/src/cmd/mdb/common/mdb/mdb_ctf.c
+++ b/usr/src/cmd/mdb/common/mdb/mdb_ctf.c
@@ -1543,10 +1543,10 @@ vread_helper(mdb_ctf_id_t modid, char *modbuf,
*
* typedef struct mdb_zio {
* enum zio_type io_type;
- * void *io_waiter;
+ * uintptr_t io_waiter;
* struct {
* struct {
- * void *list_next;
+ * uintptr_t list_next;
* } list_head;
* } io_parent_list;
* int io_error;
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index 16c0a175f1..79884b41f0 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -1501,7 +1501,6 @@ space_cb(uintptr_t addr, const void *unknown, void *arg)
return (WALK_ERR);
for (i = 0; i < TXG_SIZE; i++) {
-
if (mdb_ctf_vread(&rt, "range_tree_t",
"mdb_range_tree_t", ms.ms_alloctree[i], 0) == -1)
sd->ms_alloctree[i] += rt.rt_space;
diff --git a/usr/src/cmd/truss/expound.c b/usr/src/cmd/truss/expound.c
index 422ead7df9..915ec4626b 100644
--- a/usr/src/cmd/truss/expound.c
+++ b/usr/src/cmd/truss/expound.c
@@ -22,7 +22,7 @@
/*
* Copyright 2012 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -5071,9 +5071,9 @@ show_zfs_ioc(private_t *pri, long addr)
(void) printf(" zc_defer_destroy=%d\n",
(int)zc.zc_defer_destroy);
}
- if (zc.zc_temphold) {
- (void) printf(" zc_temphold=%d\n",
- (int)zc.zc_temphold);
+ if (zc.zc_flags) {
+ (void) printf(" zc_flags=0x%x\n",
+ zc.zc_flags);
}
if (zc.zc_action_handle) {
(void) printf(" zc_action_handle=%llu\n",
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 94b0132b16..281426bb5d 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -75,9 +75,9 @@
DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
#ifndef lint
-extern int zfs_recover;
+extern boolean_t zfs_recover;
#else
-int zfs_recover;
+boolean_t zfs_recover;
#endif
const char cmdname[] = "zdb";
@@ -1032,8 +1032,17 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
return;
}
- blkbuf[0] = '\0';
+ if (BP_IS_EMBEDDED(bp)) {
+ (void) sprintf(blkbuf,
+ "EMBEDDED et=%u %llxL/%llxP B=%llu",
+ (int)BPE_GET_ETYPE(bp),
+ (u_longlong_t)BPE_GET_LSIZE(bp),
+ (u_longlong_t)BPE_GET_PSIZE(bp),
+ (u_longlong_t)bp->blk_birth);
+ return;
+ }
+ blkbuf[0] = '\0';
for (int i = 0; i < ndvas; i++)
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
@@ -1051,7 +1060,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
"%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
- (u_longlong_t)bp->blk_fill,
+ (u_longlong_t)BP_GET_FILL(bp),
(u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
}
@@ -1064,8 +1073,10 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
char blkbuf[BP_SPRINTF_LEN];
int l;
- ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
- ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+ if (!BP_IS_EMBEDDED(bp)) {
+ ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+ ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+ }
(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
@@ -1119,10 +1130,10 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
err = visit_indirect(spa, dnp, cbp, &czb);
if (err)
break;
- fill += cbp->blk_fill;
+ fill += BP_GET_FILL(cbp);
}
if (!err)
- ASSERT3U(fill, ==, bp->blk_fill);
+ ASSERT3U(fill, ==, BP_GET_FILL(bp));
(void) arc_buf_remove_ref(buf, &buf);
}
@@ -1789,14 +1800,14 @@ dump_dir(objset_t *os)
if (dds.dds_type == DMU_OST_META) {
dds.dds_creation_txg = TXG_INITIAL;
- usedobjs = os->os_rootbp->blk_fill;
+ usedobjs = BP_GET_FILL(os->os_rootbp);
refdbytes = os->os_spa->spa_dsl_pool->
dp_mos_dir->dd_phys->dd_used_bytes;
} else {
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
}
- ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill);
+ ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
zdb_nicenum(refdbytes, numbuf);
@@ -2107,6 +2118,9 @@ typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
+ uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
+ uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
+ [BPE_PAYLOAD_SIZE];
uint64_t zcb_start;
uint64_t zcb_lastprint;
uint64_t zcb_totalasize;
@@ -2161,6 +2175,13 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
}
+ if (BP_IS_EMBEDDED(bp)) {
+ zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
+ zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
+ [BPE_GET_PSIZE(bp)]++;
+ return;
+ }
+
if (dump_opt['L'])
return;
@@ -2258,7 +2279,8 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
- if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
+ if (!BP_IS_EMBEDDED(bp) &&
+ (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
@@ -2450,7 +2472,7 @@ dump_block_stats(spa_t *spa)
zdb_blkstats_t *zb, *tzb;
uint64_t norm_alloc, norm_space, total_alloc, total_found;
int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
- int leaks = 0;
+ boolean_t leaks = B_FALSE;
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
@@ -2538,7 +2560,7 @@ dump_block_stats(spa_t *spa)
(u_longlong_t)total_alloc,
(dump_opt['L']) ? "unreachable" : "leaked",
(longlong_t)(total_alloc - total_found));
- leaks = 1;
+ leaks = B_TRUE;
}
if (tzb->zb_count == 0)
@@ -2570,6 +2592,23 @@ dump_block_stats(spa_t *spa)
(void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
+ for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
+ if (zcb.zcb_embedded_blocks[i] == 0)
+ continue;
+ (void) printf("\n");
+ (void) printf("\tadditional, non-pointer bps of type %u: "
+ "%10llu\n",
+ i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
+
+ if (dump_opt['b'] >= 3) {
+ (void) printf("\t number of (compressed) bytes: "
+ "number of bps\n");
+ dump_histogram(zcb.zcb_embedded_histogram[i],
+ sizeof (zcb.zcb_embedded_histogram[i]) /
+ sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
+ }
+ }
+
if (tzb->zb_ditto_samevdev != 0) {
(void) printf("\tDittoed blocks on same vdev: %llu\n",
(longlong_t)tzb->zb_ditto_samevdev);
@@ -2682,14 +2721,14 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
avl_index_t where;
zdb_ddt_entry_t *zdde, zdde_search;
- if (BP_IS_HOLE(bp))
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
(void) printf("traversing objset %llu, %llu objects, "
"%lu blocks so far\n",
(u_longlong_t)zb->zb_objset,
- (u_longlong_t)bp->blk_fill,
+ (u_longlong_t)BP_GET_FILL(bp),
avl_numnodes(t));
}
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c
index a2bb30f1b0..be631fd1f2 100644
--- a/usr/src/cmd/zfs/zfs_main.c
+++ b/usr/src/cmd/zfs/zfs_main.c
@@ -256,9 +256,9 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
- return (gettext("\tsend [-DnPpRv] [-[iI] snapshot] "
+ return (gettext("\tsend [-DnPpRve] [-[iI] snapshot] "
"<snapshot>\n"
- "\tsend [-i snapshot|bookmark] "
+ "\tsend [-e] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"));
case HELP_SET:
return (gettext("\tset <property=value> "
@@ -3380,6 +3380,7 @@ rollback_check_dependent(zfs_handle_t *zhp, void *data)
zfs_close(zhp);
return (0);
}
+
/*
* Report any snapshots more recent than the one specified. Used when '-r' is
* not specified. We reuse this same callback for the snapshot dependents - if
@@ -3719,7 +3720,7 @@ zfs_do_send(int argc, char **argv)
boolean_t extraverbose = B_FALSE;
/* check options */
- while ((c = getopt(argc, argv, ":i:I:RDpvnP")) != -1) {
+ while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
switch (c) {
case 'i':
if (fromname)
@@ -3754,6 +3755,9 @@ zfs_do_send(int argc, char **argv)
case 'n':
flags.dryrun = B_TRUE;
break;
+ case 'e':
+ flags.embed_data = B_TRUE;
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@@ -3792,6 +3796,7 @@ zfs_do_send(int argc, char **argv)
if (strchr(argv[0], '@') == NULL ||
(fromname && strchr(fromname, '#') != NULL)) {
char frombuf[ZFS_MAXNAMELEN];
+ enum lzc_send_flags lzc_flags = 0;
if (flags.replicate || flags.doall || flags.props ||
flags.dedup || flags.dryrun || flags.verbose ||
@@ -3806,6 +3811,9 @@ zfs_do_send(int argc, char **argv)
if (zhp == NULL)
return (1);
+ if (flags.embed_data)
+ lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+
if (fromname != NULL &&
(fromname[0] == '#' || fromname[0] == '@')) {
/*
@@ -3819,7 +3827,7 @@ zfs_do_send(int argc, char **argv)
(void) strlcat(frombuf, fromname, sizeof (frombuf));
fromname = frombuf;
}
- err = zfs_send_one(zhp, fromname, STDOUT_FILENO);
+ err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
zfs_close(zhp);
return (err != 0);
}
diff --git a/usr/src/cmd/zstreamdump/zstreamdump.c b/usr/src/cmd/zstreamdump/zstreamdump.c
index f7a4091625..dce1cb3d76 100644
--- a/usr/src/cmd/zstreamdump/zstreamdump.c
+++ b/usr/src/cmd/zstreamdump/zstreamdump.c
@@ -49,7 +49,6 @@
*/
#define DUMP_GROUPING 4
-uint64_t drr_record_count[DRR_NUMTYPES];
uint64_t total_write_size = 0;
uint64_t total_stream_len = 0;
FILE *send_stream = 0;
@@ -123,7 +122,7 @@ print_block(char *buf, int length)
* Start printing ASCII characters at a constant offset, after
* the hex prints. Leave 3 characters per byte on a line (2 digit
* hex number plus 1 space) plus spaces between characters and
- * groupings
+ * groupings.
*/
int ascii_start = BYTES_PER_LINE * 3 +
BYTES_PER_LINE / DUMP_GROUPING + 2;
@@ -160,6 +159,8 @@ int
main(int argc, char *argv[])
{
char *buf = malloc(INITIAL_BUFLEN);
+ uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
+ uint64_t total_records = 0;
dmu_replay_record_t thedrr;
dmu_replay_record_t *drr = &thedrr;
struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@@ -170,6 +171,7 @@ main(int argc, char *argv[])
struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
struct drr_free *drrf = &thedrr.drr_u.drr_free;
struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+ struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
char c;
boolean_t verbose = B_FALSE;
boolean_t first = B_TRUE;
@@ -264,6 +266,7 @@ main(int argc, char *argv[])
}
drr_record_count[drr->drr_type]++;
+ total_records++;
switch (drr->drr_type) {
case DRR_BEGIN:
@@ -376,8 +379,8 @@ main(int argc, char *argv[])
drro->drr_bonuslen);
}
if (drro->drr_bonuslen > 0) {
- (void) ssread(buf, P2ROUNDUP(drro->drr_bonuslen,
- 8), &zc);
+ (void) ssread(buf,
+ P2ROUNDUP(drro->drr_bonuslen, 8), &zc);
if (dump) {
print_block(buf,
P2ROUNDUP(drro->drr_bonuslen, 8));
@@ -506,6 +509,38 @@ main(int argc, char *argv[])
print_block(buf, drrs->drr_length);
}
break;
+ case DRR_WRITE_EMBEDDED:
+ if (do_byteswap) {
+ drrwe->drr_object =
+ BSWAP_64(drrwe->drr_object);
+ drrwe->drr_offset =
+ BSWAP_64(drrwe->drr_offset);
+ drrwe->drr_length =
+ BSWAP_64(drrwe->drr_length);
+ drrwe->drr_toguid =
+ BSWAP_64(drrwe->drr_toguid);
+ drrwe->drr_lsize =
+ BSWAP_32(drrwe->drr_lsize);
+ drrwe->drr_psize =
+ BSWAP_32(drrwe->drr_psize);
+ }
+ if (verbose) {
+ (void) printf("WRITE_EMBEDDED object = %llu "
+ "offset = %llu length = %llu\n"
+ "toguid = %llx comp = %u etype = %u "
+ "lsize = %u psize = %u\n",
+ (u_longlong_t)drrwe->drr_object,
+ (u_longlong_t)drrwe->drr_offset,
+ (u_longlong_t)drrwe->drr_length,
+ (u_longlong_t)drrwe->drr_toguid,
+ drrwe->drr_compression,
+ drrwe->drr_etype,
+ drrwe->drr_lsize,
+ drrwe->drr_psize);
+ }
+ (void) ssread(buf,
+ P2ROUNDUP(drrwe->drr_psize, 8), &zc);
+ break;
}
pcksum = zc;
}
@@ -524,18 +559,16 @@ main(int argc, char *argv[])
(u_longlong_t)drr_record_count[DRR_FREEOBJECTS]);
(void) printf("\tTotal DRR_WRITE records = %lld\n",
(u_longlong_t)drr_record_count[DRR_WRITE]);
+ (void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n",
+ (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]);
+ (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n",
+ (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]);
(void) printf("\tTotal DRR_FREE records = %lld\n",
(u_longlong_t)drr_record_count[DRR_FREE]);
(void) printf("\tTotal DRR_SPILL records = %lld\n",
(u_longlong_t)drr_record_count[DRR_SPILL]);
(void) printf("\tTotal records = %lld\n",
- (u_longlong_t)(drr_record_count[DRR_BEGIN] +
- drr_record_count[DRR_OBJECT] +
- drr_record_count[DRR_FREEOBJECTS] +
- drr_record_count[DRR_WRITE] +
- drr_record_count[DRR_FREE] +
- drr_record_count[DRR_SPILL] +
- drr_record_count[DRR_END]));
+ (u_longlong_t)total_records);
(void) printf("\tTotal write size = %lld (0x%llx)\n",
(u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
(void) printf("\tTotal stream length = %lld (0x%llx)\n",
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 14ce93f552..d4dac71db8 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -52,7 +52,7 @@
* At random times, the child self-immolates with a SIGKILL.
* This is the software equivalent of pulling the power cord.
* The parent then runs the test again, using the existing
- * storage pool, as many times as desired. If backwards compatability
+ * storage pool, as many times as desired. If backwards compatibility
* testing is enabled ztest will sometimes run the "older" version
* of ztest after a SIGKILL.
*
@@ -1265,13 +1265,13 @@ static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
- ASSERT(bt->bt_magic == BT_MAGIC);
- ASSERT(bt->bt_objset == dmu_objset_id(os));
- ASSERT(bt->bt_object == object);
- ASSERT(bt->bt_offset == offset);
- ASSERT(bt->bt_gen <= gen);
- ASSERT(bt->bt_txg <= txg);
- ASSERT(bt->bt_crtxg == crtxg);
+ ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
+ ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
+ ASSERT3U(bt->bt_object, ==, object);
+ ASSERT3U(bt->bt_offset, ==, offset);
+ ASSERT3U(bt->bt_gen, <=, gen);
+ ASSERT3U(bt->bt_txg, <=, txg);
+ ASSERT3U(bt->bt_crtxg, ==, crtxg);
}
static ztest_block_tag_t *
@@ -3470,6 +3470,11 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
if (error)
fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
error = dsl_dataset_promote(clone2name, NULL);
+ if (error == ENOSPC) {
+ dmu_objset_disown(os, FTAG);
+ ztest_record_enospc(FTAG);
+ goto out;
+ }
if (error != EBUSY)
fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
error);
@@ -3625,11 +3630,19 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
return;
}
- dmu_object_set_checksum(os, bigobj,
- (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
+ enum zio_checksum cksum;
+ do {
+ cksum = (enum zio_checksum)
+ ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
+ } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+ dmu_object_set_checksum(os, bigobj, cksum, tx);
- dmu_object_set_compress(os, bigobj,
- (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
+ enum zio_compress comp;
+ do {
+ comp = (enum zio_compress)
+ ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
+ } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
+ dmu_object_set_compress(os, bigobj, comp, tx);
/*
* For each index from n to n + s, verify that the existing bufwad
@@ -4709,8 +4722,13 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
error = dsl_dataset_user_hold(holds, 0, NULL);
fnvlist_free(holds);
- if (error)
- fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
+ if (error == ENOSPC) {
+ ztest_record_enospc("dsl_dataset_user_hold");
+ goto out;
+ } else if (error) {
+ fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
+ fullname, tag, error);
+ }
error = dsl_destroy_snapshot(fullname, B_FALSE);
if (error != EBUSY) {
@@ -5163,7 +5181,7 @@ ztest_run_zdb(char *pool)
isa = strdup(isa);
/* LINTED */
(void) sprintf(bin,
- "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s",
+ "/usr/sbin%.*s/zdb -bcc%s%s -d -U %s %s",
isalen,
isa,
ztest_opts.zo_verbose >= 3 ? "s" : "",
diff --git a/usr/src/common/zfs/zfeature_common.c b/usr/src/common/zfs/zfeature_common.c
index 8aab7fcdf6..447c64cc20 100644
--- a/usr/src/common/zfs/zfeature_common.c
+++ b/usr/src/common/zfs/zfeature_common.c
@@ -215,4 +215,9 @@ zpool_feature_init(void)
"com.joyent:filesystem_limits", "filesystem_limits",
"Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE,
filesystem_limits_deps);
+
+ zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
+ "com.delphix:embedded_data", "embedded_data",
+ "Blocks which compress very well use even less space.",
+ B_FALSE, B_TRUE, B_TRUE, NULL);
}
diff --git a/usr/src/common/zfs/zfeature_common.h b/usr/src/common/zfs/zfeature_common.h
index 9a369b7fee..be2111be91 100644
--- a/usr/src/common/zfs/zfeature_common.h
+++ b/usr/src/common/zfs/zfeature_common.h
@@ -48,6 +48,7 @@ typedef enum spa_feature {
SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_HOLE_BIRTH,
SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_EMBEDDED_DATA,
SPA_FEATURE_BOOKMARKS,
SPA_FEATURE_FS_SS_LIMIT,
SPA_FEATURES
@@ -68,7 +69,7 @@ typedef struct zfeature_info {
const spa_feature_t *fi_depends;
} zfeature_info_t;
-typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);
+typedef int (zfeature_func_t)(zfeature_info_t *, void *);
#define ZFS_FEATURE_DEBUG
@@ -77,8 +78,8 @@ extern zfeature_info_t spa_feature_table[SPA_FEATURES];
extern boolean_t zfeature_is_valid_guid(const char *);
extern boolean_t zfeature_is_supported(const char *);
-extern int zfeature_lookup_name(const char *name, spa_feature_t *res);
-extern boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check);
+extern int zfeature_lookup_name(const char *, spa_feature_t *);
+extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
extern void zpool_feature_init(void);
diff --git a/usr/src/common/zfs/zpool_prop.c b/usr/src/common/zfs/zpool_prop.c
index 72db879371..51041a8766 100644
--- a/usr/src/common/zfs/zpool_prop.c
+++ b/usr/src/common/zfs/zpool_prop.c
@@ -81,6 +81,8 @@ zpool_prop_init(void)
ZFS_TYPE_POOL, "<size>", "FREE");
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "FREEING");
+ zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "LEAKED");
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
diff --git a/usr/src/grub/capability b/usr/src/grub/capability
index 964cbe9306..e0f25f1275 100644
--- a/usr/src/grub/capability
+++ b/usr/src/grub/capability
@@ -29,7 +29,7 @@
# GRUB necessitating that the boot blocks be reinstalled for that fix or
# enhancement to take effect.
#
-VERSION=23
+VERSION=24
dboot
xVM
zfs
diff --git a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c
index 950f3ce880..341b6cd971 100644
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c
@@ -166,12 +166,15 @@ zio_checksum_verify(blkptr_t *bp, char *data, int size)
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum;
- /* byteswap is not supported */
- if (byteswap)
+ if (byteswap) {
+ grub_printf("byteswap not supported\n");
return (-1);
+ }
- if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+ if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) {
+ grub_printf("checksum algorithm %u not supported\n", checksum);
return (-1);
+ }
if (ci->ci_eck) {
expected_cksum = zec->zec_cksum;
@@ -179,7 +182,6 @@ zio_checksum_verify(blkptr_t *bp, char *data, int size)
ci->ci_func[0](data, size, &actual_cksum);
zec->zec_cksum = expected_cksum;
zc = expected_cksum;
-
} else {
ci->ci_func[byteswap](data, size, &actual_cksum);
}
@@ -379,6 +381,72 @@ zio_read_data(blkptr_t *bp, void *buf, char *stack)
}
/*
+ * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
+ * more than BPE_PAYLOAD_SIZE bytes).
+ */
+static void
+decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
+{
+ int psize, i;
+ uint8_t *buf8 = buf;
+ uint64_t w = 0;
+ const uint64_t *bp64 = (const uint64_t *)bp;
+
+ psize = BPE_GET_PSIZE(bp);
+
+ /*
+ * Decode the words of the block pointer into the byte array.
+ * Low bits of first word are the first byte (little endian).
+ */
+ for (i = 0; i < psize; i++) {
+ if (i % sizeof (w) == 0) {
+ /* beginning of a word */
+ w = *bp64;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ }
+ buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
+ }
+}
+
+/*
+ * Fill in the buffer with the (decompressed) payload of the embedded
+ * blkptr_t. Takes into account compression and byteorder (the payload is
+ * treated as a stream of bytes).
+ * Return 0 on success, or ERR_FSYS_CORRUPT if the compression is unsupported.
+ */
+static int
+decode_embedded_bp(const blkptr_t *bp, void *buf)
+{
+ int comp;
+ int lsize, psize;
+ uint8_t *dst = buf;
+ uint64_t w = 0;
+
+ lsize = BPE_GET_LSIZE(bp);
+ psize = BPE_GET_PSIZE(bp);
+ comp = BP_GET_COMPRESS(bp);
+
+ if (comp != ZIO_COMPRESS_OFF) {
+ uint8_t dstbuf[BPE_PAYLOAD_SIZE];
+
+ if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
+ decomp_table[comp].decomp_func == NULL) {
+ grub_printf("compression algorithm not supported\n");
+ return (ERR_FSYS_CORRUPT);
+ }
+
+ decode_embedded_bp_compressed(bp, dstbuf);
+ decomp_table[comp].decomp_func(dstbuf, buf, psize, lsize);
+ } else {
+ decode_embedded_bp_compressed(bp, buf);
+ }
+
+ return (0);
+}
+
+/*
* Read in a block of data, verify its checksum, decompress if needed,
* and put the uncompressed data in buf.
*
@@ -392,6 +460,15 @@ zio_read(blkptr_t *bp, void *buf, char *stack)
int lsize, psize, comp;
char *retbuf;
+ if (BP_IS_EMBEDDED(bp)) {
+ if (BPE_GET_ETYPE(bp) != BP_EMBEDDED_TYPE_DATA) {
+ grub_printf("unsupported embedded BP (type=%u)\n",
+ (int)BPE_GET_ETYPE(bp));
+ return (ERR_FSYS_CORRUPT);
+ }
+ return (decode_embedded_bp(bp, buf));
+ }
+
comp = BP_GET_COMPRESS(bp);
lsize = BP_GET_LSIZE(bp);
psize = BP_GET_PSIZE(bp);
@@ -404,7 +481,8 @@ zio_read(blkptr_t *bp, void *buf, char *stack)
}
if ((char *)buf < stack && ((char *)buf) + lsize > stack) {
- grub_printf("not enough memory allocated\n");
+ grub_printf("not enough memory to fit %u bytes on stack\n",
+ lsize);
return (ERR_WONT_FIT);
}
@@ -764,6 +842,7 @@ zap_iterate(dnode_phys_t *zap_dnode, zap_cb_t *cb, void *arg, char *stack)
* Input
* mdn - metadnode to get the object dnode
* objnum - object number for the object dnode
+ * type - if nonzero, object must be of this type
* buf - data buffer that holds the returning dnode
* stack - scratch area
*
@@ -968,6 +1047,7 @@ static const char *spa_feature_names[] = {
"org.illumos:lz4_compress",
"com.delphix:hole_birth",
"com.delphix:extensible_dataset",
+ "com.delphix:embedded_data",
NULL
};
diff --git a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
index 8d53ad6866..19fe52f13f 100644
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
@@ -116,7 +116,7 @@ typedef struct zio_cksum {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -150,7 +150,8 @@ typedef struct zio_cksum {
* G gang block indicator
* B byteorder (endianness)
* D dedup
- * X unused
+ * X encryption (on version 30, which is not supported)
+ * E blkptr_t contains embedded data
* lvl level of indirection
* type DMU object type
* phys birth txg of block allocation; zero if same as logical birth txg
@@ -204,8 +205,8 @@ typedef struct blkptr {
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
@@ -216,6 +217,8 @@ typedef struct blkptr {
#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
+
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
@@ -297,6 +300,22 @@ typedef struct blkptr {
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
+#define BPE_GET_ETYPE(bp) BP_GET_CHECKSUM(bp)
+#define BPE_GET_LSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)
+#define BPE_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)
+
+typedef enum bp_embedded_type {
+ BP_EMBEDDED_TYPE_DATA,
+ NUM_BP_EMBEDDED_TYPES
+} bp_embedded_type_t;
+
+#define BPE_NUM_WORDS 14
+#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define BPE_IS_PAYLOADWORD(bp, wp) \
+ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
#ifdef _BIG_ENDIAN
#define ZFS_HOST_BYTEORDER (0ULL)
#else
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index 62dce8f2d4..ef5224c763 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -39,6 +39,7 @@
#include <sys/fs/zfs.h>
#include <sys/avl.h>
#include <ucred.h>
+#include <libzfs_core.h>
#ifdef __cplusplus
extern "C" {
@@ -591,13 +592,16 @@ typedef struct sendflags {
/* show progress (ie. -v) */
boolean_t progress;
+
+ /* WRITE_EMBEDDED records of type DATA are permitted */
+ boolean_t embed_data;
} sendflags_t;
typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
extern int zfs_send(zfs_handle_t *, const char *, const char *,
sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
-extern int zfs_send_one(zfs_handle_t *, const char *, int);
+extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
extern int zfs_promote(zfs_handle_t *);
extern int zfs_hold(zfs_handle_t *, const char *, const char *,
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index 20eeccd286..32d9e45d50 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -281,6 +281,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE:
case ZPOOL_PROP_FREEING:
+ case ZPOOL_PROP_LEAKED:
case ZPOOL_PROP_EXPANDSZ:
if (literal) {
(void) snprintf(buf, len, "%llu",
diff --git a/usr/src/lib/libzfs/common/libzfs_sendrecv.c b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
index 8e0e4e1e6e..6697b52831 100644
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/
@@ -42,6 +42,7 @@
#include <time.h>
#include <libzfs.h>
+#include <libzfs_core.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -213,6 +214,7 @@ cksummer(void *arg)
struct drr_object *drro = &thedrr.drr_u.drr_object;
struct drr_write *drrw = &thedrr.drr_u.drr_write;
struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+ struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
FILE *ofp;
int outfd;
dmu_replay_record_t wbr_drr = {0};
@@ -409,6 +411,20 @@ cksummer(void *arg)
break;
}
+ case DRR_WRITE_EMBEDDED:
+ {
+ if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ (void) ssread(buf,
+ P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp);
+ if (cksum_and_write(buf,
+ P2ROUNDUP((uint64_t)drrwe->drr_psize, 8),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ break;
+ }
+
case DRR_FREE:
{
if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
@@ -790,7 +806,7 @@ typedef struct send_dump_data {
char prevsnap[ZFS_MAXNAMELEN];
uint64_t prevsnap_obj;
boolean_t seenfrom, seento, replicate, doall, fromorigin;
- boolean_t verbose, dryrun, parsable, progress;
+ boolean_t verbose, dryrun, parsable, progress, embed_data;
int outfd;
boolean_t err;
nvlist_t *fss;
@@ -870,7 +886,8 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
*/
static int
dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
- boolean_t fromorigin, int outfd, nvlist_t *debugnv)
+ boolean_t fromorigin, int outfd, enum lzc_send_flags flags,
+ nvlist_t *debugnv)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -884,6 +901,7 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
zc.zc_obj = fromorigin;
zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
zc.zc_fromobj = fromsnap_obj;
+ zc.zc_flags = flags;
VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
if (fromsnap && fromsnap[0] != '\0') {
@@ -1134,8 +1152,12 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
}
}
+ enum lzc_send_flags flags = 0;
+ if (sdd->embed_data)
+ flags |= LZC_SEND_FLAG_EMBED_DATA;
+
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
- fromorigin, sdd->outfd, sdd->debugnv);
+ fromorigin, sdd->outfd, flags, sdd->debugnv);
if (sdd->progress) {
(void) pthread_cancel(tid);
@@ -1479,6 +1501,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
sdd.parsable = flags->parsable;
sdd.progress = flags->progress;
sdd.dryrun = flags->dryrun;
+ sdd.embed_data = flags->embed_data;
sdd.filter_cb = filter_func;
sdd.filter_cb_arg = cb_arg;
if (debugnvp)
@@ -1610,7 +1633,8 @@ err_out:
}
int
-zfs_send_one(zfs_handle_t *zhp, const char *from, int fd)
+zfs_send_one(zfs_handle_t *zhp, const char *from, int fd,
+ enum lzc_send_flags flags)
{
int err;
libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -1619,7 +1643,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd)
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"warning: cannot send '%s'"), zhp->zfs_name);
- err = lzc_send(zhp->zfs_name, from, fd);
+ err = lzc_send(zhp->zfs_name, from, fd, flags);
if (err != 0) {
switch (errno) {
case EXDEV:
@@ -2537,6 +2561,16 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
(void) recv_read(hdl, fd, buf,
drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
break;
+ case DRR_WRITE_EMBEDDED:
+ if (byteswap) {
+ drr->drr_u.drr_write_embedded.drr_psize =
+ BSWAP_32(drr->drr_u.drr_write_embedded.
+ drr_psize);
+ }
+ (void) recv_read(hdl, fd, buf,
+ P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize,
+ 8), B_FALSE, NULL);
+ break;
case DRR_WRITE_BYREF:
case DRR_FREEOBJECTS:
case DRR_FREE:
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.c b/usr/src/lib/libzfs_core/common/libzfs_core.c
index 7653d028a9..6f36568667 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.c
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.c
@@ -439,6 +439,8 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
}
/*
+ * Generate a zfs send stream for the specified snapshot and write it to
+ * the specified file descriptor.
*
* "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
*
@@ -452,9 +454,15 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
* snapshot in the origin, etc.
*
* "fd" is the file descriptor to write the send stream to.
+ *
+ * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
+ * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
+ * which the receiving system must support (as indicated by support
+ * for the "embedded_data" feature).
*/
int
-lzc_send(const char *snapname, const char *from, int fd)
+lzc_send(const char *snapname, const char *from, int fd,
+ enum lzc_send_flags flags)
{
nvlist_t *args;
int err;
@@ -463,6 +471,8 @@ lzc_send(const char *snapname, const char *from, int fd)
fnvlist_add_int32(args, "fd", fd);
if (from != NULL)
fnvlist_add_string(args, "fromsnap", from);
+ if (flags & LZC_SEND_FLAG_EMBED_DATA)
+ fnvlist_add_boolean(args, "embedok");
err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
nvlist_free(args);
return (err);
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.h b/usr/src/lib/libzfs_core/common/libzfs_core.h
index 484a48afe2..d7d767055d 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.h
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.h
@@ -52,7 +52,11 @@ int lzc_hold(nvlist_t *, int, nvlist_t **);
int lzc_release(nvlist_t *, nvlist_t **);
int lzc_get_holds(const char *, nvlist_t **);
-int lzc_send(const char *, const char *, int);
+enum lzc_send_flags {
+ LZC_SEND_FLAG_EMBED_DATA = 1 << 0
+};
+
+int lzc_send(const char *, const char *, int, enum lzc_send_flags);
int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
int lzc_send_space(const char *, const char *, uint64_t *);
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index 6c64977c32..2872049eb5 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -60,6 +60,7 @@
#include <sys/dmu_tx.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
+#include <sys/blkptr.h>
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
diff --git a/usr/src/man/man1m/zfs.1m b/usr/src/man/man1m/zfs.1m
index d0f1c82ed8..2fcf129852 100644
--- a/usr/src/man/man1m/zfs.1m
+++ b/usr/src/man/man1m/zfs.1m
@@ -176,12 +176,12 @@ zfs \- configures ZFS file systems
.LP
.nf
-\fBzfs\fR \fBsend\fR [\fB-DnPpRv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-DnPpRve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
.fi
.LP
.nf
-\fBzfs\fR \fBsend\fR [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-e\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.fi
.LP
@@ -2947,7 +2947,7 @@ See \fBzpool-features\fR(5) for details on ZFS feature flags and the
.sp
.ne 2
.na
-\fBzfs send\fR [\fB-DnPpRrv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+\fBzfs send\fR [\fB-DnPpRrve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
.ad
.sp .6
.RS 4n
@@ -3035,6 +3035,23 @@ property information is only included if the \fB-p\fR flag is specified.
.sp
.ne 2
.na
+\fB\fB-e\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a more compact stream by using WRITE_EMBEDDED records for blocks
+which are stored more compactly on disk by the \fBembedded_data\fR pool
+feature. This flag has no effect if the \fBembedded_data\fR feature is
+disabled. The receiving system must have the \fBembedded_data\fR feature
+enabled. If the \fBlz4_compress\fR feature is active on the sending system,
+then the receiving system must have that feature enabled as well. See
+\fBzpool-features\fR(5) for details on ZFS feature flags and the
+\fBembedded_data\fR feature.
+.RE
+
+.sp
+.ne 2
+.na
\fB\fB-p\fR\fR
.ad
.sp .6
@@ -3083,7 +3100,7 @@ on future versions of \fBZFS\fR.
.sp
.ne 2
.na
-\fBzfs send\fR [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs send\fR [\fB-e\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.ad
.sp .6
.RS 4n
@@ -3111,6 +3128,22 @@ be the origin snapshot, or an earlier snapshot in the origin's filesystem,
or the origin's origin, etc.
.RE
+.sp
+.ne 2
+.na
+\fB\fB-e\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a more compact stream by using WRITE_EMBEDDED records for blocks
+which are stored more compactly on disk by the \fBembedded_data\fR pool
+feature. This flag has no effect if the \fBembedded_data\fR feature is
+disabled. The receiving system must have the \fBembedded_data\fR feature
+enabled. If the \fBlz4_compress\fR feature is active on the sending system,
+then the receiving system must have that feature enabled as well. See
+\fBzpool-features\fR(5) for details on ZFS feature flags and the
+\fBembedded_data\fR feature.
+.RE
.RE
.sp
diff --git a/usr/src/man/man5/zpool-features.5 b/usr/src/man/man5/zpool-features.5
index 076e08a2d2..d21ffc9f5e 100644
--- a/usr/src/man/man5/zpool-features.5
+++ b/usr/src/man/man5/zpool-features.5
@@ -401,6 +401,33 @@ never return to being \fBenabled\fB.
.RE
+.sp
+.ne 2
+.na
+\fB\fBembedded_data\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.delphix:embedded_data
+READ\-ONLY COMPATIBLE no
+DEPENDENCIES none
+.TE
+
+This feature improves the performance and compression ratio of
+highly-compressible blocks. Blocks whose contents can compress to 112 bytes
+or smaller can take advantage of this feature.
+
+When this feature is enabled, the contents of highly-compressible blocks are
+stored in the block "pointer" itself (a misnomer in this case, as it contains
+the compressed data, rather than a pointer to its location on disk). Thus
+the space of the block (one sector, typically 512 bytes or 4KB) is saved,
+and no additional i/o is needed to read and write the data block.
+
+This feature becomes \fBactive\fR as soon as it is enabled and will
+never return to being \fBenabled\fR.
+
+.RE
.SH "SEE ALSO"
\fBzpool\fR(1M)
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index e00224f248..6ab842488a 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1344,6 +1344,7 @@ TRANS_OBJS += mdtrans.o trans_ioctl.o trans_log.o
ZFS_COMMON_OBJS += \
arc.o \
+ blkptr.o \
bplist.o \
bpobj.o \
bptree.o \
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 581b69791d..73bc0780ce 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -747,8 +747,10 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
}
static arc_buf_hdr_t *
-buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t birth = BP_PHYSICAL_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *buf;
@@ -780,6 +782,8 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
arc_buf_hdr_t *fbuf;
uint32_t i;
+ ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
+ ASSERT(buf->b_birth != 0);
ASSERT(!HDR_IN_HASH_TABLE(buf));
*lockp = hash_lock;
mutex_enter(hash_lock);
@@ -2753,10 +2757,10 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
static void
arc_read_done(zio_t *zio)
{
- arc_buf_hdr_t *hdr, *found;
+ arc_buf_hdr_t *hdr;
arc_buf_t *buf;
arc_buf_t *abuf; /* buffer we're assigning to callback */
- kmutex_t *hash_lock;
+ kmutex_t *hash_lock = NULL;
arc_callback_t *callback_list, *acb;
int freeable = FALSE;
@@ -2771,12 +2775,22 @@ arc_read_done(zio_t *zio)
* reason for it not to be found is if we were freed during the
* read.
*/
- found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
- &hash_lock);
-
- ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
- (found == hdr && HDR_L2_READING(hdr)));
+ if (HDR_IN_HASH_TABLE(hdr)) {
+ ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+ ASSERT3U(hdr->b_dva.dva_word[0], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[0]);
+ ASSERT3U(hdr->b_dva.dva_word[1], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[1]);
+
+ arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
+ &hash_lock);
+
+ ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
+ hash_lock == NULL) ||
+ (found == hdr &&
+ DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+ }
hdr->b_flags &= ~ARC_L2_EVICTED;
if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
@@ -2900,16 +2914,25 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
- arc_buf_hdr_t *hdr;
+ arc_buf_hdr_t *hdr = NULL;
arc_buf_t *buf = NULL;
- kmutex_t *hash_lock;
+ kmutex_t *hash_lock = NULL;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
+ ASSERT(!BP_IS_EMBEDDED(bp) ||
+ BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+
top:
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
- &hash_lock);
- if (hdr && hdr->b_datacnt > 0) {
+ if (!BP_IS_EMBEDDED(bp)) {
+ /*
+ * Only non-embedded BP's can be in the hash table: embedded
+ * BP's have no DVA, need no I/O, and get an anonymous arc buf.
+ */
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ }
+
+ if (hdr != NULL && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED;
@@ -2983,7 +3006,7 @@ top:
done(NULL, buf, private);
} else {
uint64_t size = BP_GET_LSIZE(bp);
- arc_callback_t *acb;
+ arc_callback_t *acb;
vdev_t *vd = NULL;
uint64_t addr = 0;
boolean_t devw = B_FALSE;
@@ -2992,15 +3015,17 @@ top:
if (hdr == NULL) {
/* this block is not in the cache */
- arc_buf_hdr_t *exists;
+ arc_buf_hdr_t *exists = NULL;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr;
- hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
- hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
+ if (!BP_IS_EMBEDDED(bp)) {
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+ exists = buf_hash_insert(hdr, &hash_lock);
+ }
+ if (exists != NULL) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
buf_discard_identity(hdr);
@@ -3072,7 +3097,8 @@ top:
vd = NULL;
}
- mutex_exit(hash_lock);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
/*
* At this point, we have a level 1 cache miss. Try again in
@@ -3215,8 +3241,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
kmutex_t *hash_lock;
uint64_t guid = spa_load_guid(spa);
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
- &hash_lock);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ hdr = buf_hash_find(guid, bp, &hash_lock);
if (hdr == NULL)
return;
if (HDR_BUF_AVAILABLE(hdr)) {
@@ -3532,7 +3559,7 @@ arc_write_done(zio_t *zio)
ASSERT(hdr->b_acb == NULL);
if (zio->io_error == 0) {
- if (BP_IS_HOLE(zio->io_bp)) {
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
buf_discard_identity(hdr);
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
@@ -3544,10 +3571,10 @@ arc_write_done(zio_t *zio)
}
/*
- * If the block to be written was all-zero, we may have
- * compressed it away. In this case no write was performed
- * so there will be no dva/birth/checksum. The buffer must
- * therefore remain anonymous (and uncached).
+ * If the block to be written was all-zero or compressed enough to be
+ * embedded in the BP, no write was performed so there will be no
+ * dva/birth/checksum. The buffer must therefore remain anonymous
+ * (and uncached).
*/
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
@@ -4838,7 +4865,7 @@ static boolean_t
l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
{
void *cdata;
- size_t csize, len;
+ size_t csize, len, rounded;
ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
ASSERT(l2hdr->b_tmp_cdata != NULL);
@@ -4848,6 +4875,12 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
cdata, l2hdr->b_asize);
+ rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
+ if (rounded > csize) {
+ bzero((char *)cdata + csize, rounded - csize);
+ csize = rounded;
+ }
+
if (csize == 0) {
/* zero block, indicate that there's nothing to write */
zio_data_buf_free(cdata, len);
diff --git a/usr/src/uts/common/fs/zfs/blkptr.c b/usr/src/uts/common/fs/zfs/blkptr.c
new file mode 100644
index 0000000000..7e61dc96ff
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/blkptr.c
@@ -0,0 +1,119 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Embedded-data Block Pointers
+ *
+ * Normally, block pointers point (via their DVAs) to a block which holds data.
+ * If the data that we need to store is very small, this is an inefficient
+ * use of space, because a block must be at minimum 1 sector (typically 512
+ * bytes or 4KB). Additionally, reading these small blocks tends to generate
+ * more random reads.
+ *
+ * Embedded-data Block Pointers allow small pieces of data (the "payload",
+ * up to 112 bytes) to be stored in the block pointer itself, instead of
+ * being pointed to. The "Pointer" part of this name is a bit of a
+ * misnomer, as nothing is pointed to.
+ *
+ * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
+ * be embedded in the block pointer. The logic for this is handled in
+ * the SPA, by the zio pipeline. Therefore most code outside the zio
+ * pipeline doesn't need special-cases to handle these block pointers.
+ *
+ * See spa.h for details on the exact layout of embedded block pointers.
+ */
+
+void
+encode_embedded_bp_compressed(blkptr_t *bp, void *data,
+ enum zio_compress comp, int uncompressed_size, int compressed_size)
+{
+ uint64_t *bp64 = (uint64_t *)bp;
+ uint64_t w = 0;
+ uint8_t *data8 = data;
+
+ ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
+ ASSERT(uncompressed_size == compressed_size ||
+ comp != ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+
+ bzero(bp, sizeof (*bp));
+ BP_SET_EMBEDDED(bp, B_TRUE);
+ BP_SET_COMPRESS(bp, comp);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ BPE_SET_LSIZE(bp, uncompressed_size);
+ BPE_SET_PSIZE(bp, compressed_size);
+
+ /*
+ * Encode the byte array into the words of the block pointer.
+ * First byte goes into low bits of first word (little endian).
+ */
+ for (int i = 0; i < compressed_size; i++) {
+ BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
+ if (i % sizeof (w) == sizeof (w) - 1) {
+ /* we've reached the end of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ *bp64 = w;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ w = 0;
+ }
+ }
+ /* write last partial word */
+ if (bp64 < (uint64_t *)(bp + 1))
+ *bp64 = w;
+}
+
+/*
+ * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
+ * more than BPE_PAYLOAD_SIZE bytes).
+ */
+void
+decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
+{
+ int psize;
+ uint8_t *buf8 = buf;
+ uint64_t w = 0;
+ const uint64_t *bp64 = (const uint64_t *)bp;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ psize = BPE_GET_PSIZE(bp);
+
+ /*
+ * Decode the words of the block pointer into the byte array.
+ * Low bits of first word are the first byte (little endian).
+ */
+ for (int i = 0; i < psize; i++) {
+ if (i % sizeof (w) == 0) {
+ /* beginning of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ w = *bp64;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ }
+ buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
+ }
+}
diff --git a/usr/src/uts/common/fs/zfs/bpobj.c b/usr/src/uts/common/fs/zfs/bpobj.c
index 0fb597ba95..e75ae72f9e 100644
--- a/usr/src/uts/common/fs/zfs/bpobj.c
+++ b/usr/src/uts/common/fs/zfs/bpobj.c
@@ -192,6 +192,13 @@ bpobj_close(bpobj_t *bpo)
mutex_destroy(&bpo->bpo_lock);
}
+static boolean_t
+bpobj_hasentries(bpobj_t *bpo)
+{
+ return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
+ (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
+}
+
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
boolean_t free)
@@ -332,9 +339,11 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
out:
/* If there are no entries, there should be no bytes. */
- ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
- (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
- bpo->bpo_phys->bpo_bytes == 0);
+ if (!bpobj_hasentries(bpo)) {
+ ASSERT0(bpo->bpo_phys->bpo_bytes);
+ ASSERT0(bpo->bpo_phys->bpo_comp);
+ ASSERT0(bpo->bpo_phys->bpo_uncomp);
+ }
mutex_exit(&bpo->bpo_lock);
return (err);
@@ -377,7 +386,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
- if (used == 0) {
+ if (!bpobj_hasentries(&subbpo)) {
/* No point in having an empty subobj. */
bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
@@ -453,13 +462,29 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+ if (BP_IS_EMBEDDED(bp)) {
+ /*
+ * The bpobj will compress better without the payload.
+ *
+ * Note that we store EMBEDDED bp's because they have an
+ * uncompressed size, which must be accounted for. An
+ * alternative would be to add their size to bpo_uncomp
+ * without storing the bp, but that would create additional
+ * complications: bpo_uncomp would be inconsistent with the
+ * set of BP's stored, and bpobj_iterate() wouldn't visit
+ * all the space accounted for in the bpobj.
+ */
+ bzero(&stored_bp, sizeof (stored_bp));
+ stored_bp.blk_prop = bp->blk_prop;
+ stored_bp.blk_birth = bp->blk_birth;
+ } else if (!BP_GET_DEDUP(bp)) {
+ /* The bpobj will compress better without the checksum */
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+ }
+
/* We never need the fill count. */
stored_bp.blk_fill = 0;
- /* The bpobj will compress better if we can leave off the checksum */
- if (!BP_GET_DEDUP(bp))
- bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
-
mutex_enter(&bpo->bpo_lock);
offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
diff --git a/usr/src/uts/common/fs/zfs/bptree.c b/usr/src/uts/common/fs/zfs/bptree.c
index 83f365864d..77067d24e9 100644
--- a/usr/src/uts/common/fs/zfs/bptree.c
+++ b/usr/src/uts/common/fs/zfs/bptree.c
@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
return (dmu_object_free(os, obj, tx));
}
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ boolean_t rv;
+
+ VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ rv = (bt->bt_begin == bt->bt_end);
+ dmu_buf_rele(db, FTAG);
+ return (rv);
+}
+
void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{
dmu_buf_t *db;
bptree_phys_t *bt;
- bptree_entry_phys_t bte;
+ bptree_entry_phys_t bte = { 0 };
/*
* bptree objects are in the pool mos, therefore they can only be
@@ -122,7 +136,6 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
bte.be_birth_txg = birth_txg;
bte.be_bp = *bp;
- bzero(&bte.be_zb, sizeof (bte.be_zb));
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
dmu_buf_will_dirty(db, tx);
@@ -153,10 +166,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
return (err);
}
+/*
+ * If "free" is set:
+ * - It is assumed that "func" will be freeing the block pointers.
+ * - If "func" returns nonzero, the bookmark will be remembered and
+ * iteration will be restarted from this point on next invocation.
+ * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ * bptree_iterate will remember the bookmark, continue traversing
+ * any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
void *arg, dmu_tx_t *tx)
{
+ boolean_t ioerr = B_FALSE;
int err;
uint64_t i;
dmu_buf_t *db;
@@ -182,49 +212,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
bptree_entry_phys_t bte;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
- ASSERT(!free || i == ba.ba_phys->bt_begin);
-
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
if (err != 0)
break;
- if (zfs_recover)
+ if (zfs_free_leak_on_eio)
flags |= TRAVERSE_HARD;
+ zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
+ "bookmark %lld/%lld/%lld/%lld",
+ i, (longlong_t)bte.be_birth_txg,
+ (longlong_t)bte.be_zb.zb_objset,
+ (longlong_t)bte.be_zb.zb_object,
+ (longlong_t)bte.be_zb.zb_level,
+ (longlong_t)bte.be_zb.zb_blkid);
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
bte.be_birth_txg, &bte.be_zb, flags,
bptree_visit_cb, &ba);
if (free) {
- if (err == ERESTART) {
+ /*
+ * The callback has freed the visited block pointers.
+ * Record our traversal progress on disk, either by
+ * updating this record's bookmark, or by logically
+ * removing this record by advancing bt_begin.
+ */
+ if (err != 0) {
/* save bookmark for future resume */
ASSERT3U(bte.be_zb.zb_objset, ==,
ZB_DESTROYED_OBJSET);
ASSERT0(bte.be_zb.zb_level);
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
- break;
- }
- if (err != 0) {
+ if (err == EIO || err == ECKSUM ||
+ err == ENXIO) {
+ /*
+ * Skip the rest of this tree and
+ * continue on to the next entry.
+ */
+ err = 0;
+ ioerr = B_TRUE;
+ } else {
+ break;
+ }
+ } else if (ioerr) {
/*
- * We can not properly handle an i/o
- * error, because the traversal code
- * does not know how to resume from an
- * arbitrary bookmark.
+ * This entry is finished, but there were
+ * i/o errors on previous entries, so we
+ * can't adjust bt_begin. Set this entry's
+ * be_birth_txg such that it will be
+ * treated as a no-op in future traversals.
*/
- zfs_panic_recover("error %u from "
- "traverse_dataset_destroyed()", err);
+ bte.be_birth_txg = UINT64_MAX;
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
}
- ba.ba_phys->bt_begin++;
- (void) dmu_free_range(os, obj,
- i * sizeof (bte), sizeof (bte), tx);
+ if (!ioerr) {
+ ba.ba_phys->bt_begin++;
+ (void) dmu_free_range(os, obj,
+ i * sizeof (bte), sizeof (bte), tx);
+ }
+ } else if (err != 0) {
+ break;
}
}
- ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+ ASSERT(!free || err != 0 || ioerr ||
+ ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
/* if all blocks are free there should be no used space */
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+ if (zfs_free_leak_on_eio) {
+ ba.ba_phys->bt_bytes = 0;
+ ba.ba_phys->bt_comp = 0;
+ ba.ba_phys->bt_uncomp = 0;
+ }
+
ASSERT0(ba.ba_phys->bt_bytes);
ASSERT0(ba.ba_phys->bt_comp);
ASSERT0(ba.ba_phys->bt_uncomp);
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 76a3d0762f..eec178f19b 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -40,6 +40,8 @@
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
#include <sys/range_tree.h>
/*
@@ -1446,6 +1448,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
}
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ struct dirty_leaf *dl;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ dmu_buf_will_not_fill(dbuf, tx);
+
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ dl = &db->db_last_dirty->dt.dl;
+ encode_embedded_bp_compressed(&dl->dr_overridden_by,
+ data, comp, uncompressed_size, compressed_size);
+ BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+ BP_SET_TYPE(&dl->dr_overridden_by, type);
+ BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+ BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+ dl->dr_override_state = DR_OVERRIDDEN;
+ dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
/*
* Directly assign a provided arc buf to a given dbuf if it's not referenced
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@@ -1830,7 +1864,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
}
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
- if (bp && !BP_IS_HOLE(bp)) {
+ if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
@@ -2462,7 +2496,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
uint64_t fill = 0;
int i;
- ASSERT(db->db_blkptr == bp);
+ ASSERT3P(db->db_blkptr, ==, bp);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -2474,7 +2508,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
- BP_GET_TYPE(bp) == dn->dn_bonustype));
+ BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+ BP_IS_EMBEDDED(bp));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
}
@@ -2515,12 +2550,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
- fill += ibp->blk_fill;
+ fill += BP_GET_FILL(ibp);
}
}
DB_DNODE_EXIT(db);
- bp->blk_fill = fill;
+ if (!BP_IS_EMBEDDED(bp))
+ bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
}
@@ -2632,7 +2668,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ if (!arc_released(db->db_buf))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
@@ -2758,10 +2795,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
DB_DNODE_EXIT(db);
- if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- ASSERT(db->db_state != DB_NOFILL);
+ if (db->db_level == 0 &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * The BP for this block has been provided by open context
+ * (by dmu_sync() or dmu_buf_write_embedded()).
+ */
+ void *contents = (data != NULL) ? data->b_data : NULL;
+
dr->dr_zio = zio_write(zio, os->os_spa, txg,
- db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+ db->db_blkptr, contents, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 3c6e9037d0..1b0a72caf5 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -125,17 +125,13 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
};
int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp, int flags)
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
int err;
- int db_flags = DB_RF_CANFAIL;
-
- if (flags & DMU_READ_NO_PREFETCH)
- db_flags |= DB_RF_NOPREFETCH;
err = dnode_hold(os, object, FTAG, &dn);
if (err)
@@ -144,18 +140,37 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+
if (db == NULL) {
- err = SET_ERROR(EIO);
- } else {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (err);
+}
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+
+ err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
- if (err) {
+ if (err != 0) {
dbuf_rele(db, tag);
- db = NULL;
+ *dbp = NULL;
}
}
- dnode_rele(dn, FTAG);
- *dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err);
}
@@ -848,6 +863,25 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
+/*
+ * Store "data" as an embedded block pointer in the buffer that
+ * dmu_buf_hold_noread() returns for (object, offset).  The buffer is held
+ * without issuing the read that dmu_buf_hold() would perform, handed to
+ * dmu_buf_write_embedded() together with the payload description, and then
+ * released.
+ *
+ * etype and comp arrive as raw bytes and are range-checked (on DEBUG
+ * kernels) before being passed on as enum values.  Failure to obtain the
+ * hold is fatal (VERIFY0).
+ */
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+	dmu_buf_t *dbuf;
+	bp_embedded_type_t embed_type;
+	enum zio_compress compress;
+
+	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+	embed_type = (bp_embedded_type_t)etype;
+	compress = (enum zio_compress)comp;
+
+	VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &dbuf));
+
+	dmu_buf_write_embedded(dbuf, data, embed_type, compress,
+	    uncompressed_size, compressed_size, byteorder, tx);
+
+	dmu_buf_rele(dbuf, FTAG);
+}
+
/*
* DMU support for xuio
*/
@@ -1264,7 +1298,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
* block size still needs to be known for replay.
*/
BP_SET_LSIZE(bp, db->db_size);
- } else {
+ } else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
}
@@ -1535,9 +1569,15 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
{
dnode_t *dn;
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os, object, FTAG, &dn);
- ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ /*
+ * Send streams include each object's checksum function. This
+ * check ensures that the receiving system can understand the
+ * checksum function transmitted.
+ */
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
@@ -1549,9 +1589,14 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
{
dnode_t *dn;
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os, object, FTAG, &dn);
- ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+ /*
+ * Send streams include each object's compression function. This
+ * check ensures that the receiving system can understand the
+ * compression function transmitted.
+ */
+ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
@@ -1717,7 +1762,7 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
doi->doi_fill_count = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
- doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
+ doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index ef82f17683..efed341d6f 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -338,7 +338,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* default (fletcher2/off). Snapshots don't need to know about
* checksum/compression/copies.
*/
- if (ds) {
+ if (ds != NULL) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@@ -391,7 +391,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
kmem_free(os, sizeof (objset_t));
return (err);
}
- } else if (ds == NULL) {
+ } else {
/* It's the meta-objset. */
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
os->os_compress = ZIO_COMPRESS_LZJB;
@@ -435,17 +435,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
&os->os_groupused_dnode);
}
- /*
- * We should be the only thread trying to do this because we
- * have ds_opening_lock
- */
- if (ds) {
- mutex_enter(&ds->ds_lock);
- ASSERT(ds->ds_objset == NULL);
- ds->ds_objset = os;
- mutex_exit(&ds->ds_lock);
- }
-
*osp = os;
return (0);
}
@@ -456,11 +445,19 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
int err = 0;
mutex_enter(&ds->ds_opening_lock);
- *osp = ds->ds_objset;
- if (*osp == NULL) {
+ if (ds->ds_objset == NULL) {
+ objset_t *os;
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, dsl_dataset_get_blkptr(ds), osp);
+ ds, dsl_dataset_get_blkptr(ds), &os);
+
+ if (err == 0) {
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_objset == NULL);
+ ds->ds_objset = os;
+ mutex_exit(&ds->ds_lock);
+ }
}
+ *osp = ds->ds_objset;
mutex_exit(&ds->ds_opening_lock);
return (err);
}
@@ -986,6 +983,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3P(bp, ==, os->os_rootbp);
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
ASSERT0(BP_GET_LEVEL(bp));
@@ -998,7 +996,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
*/
bp->blk_fill = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
- bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+ bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index ed0defc592..5da3f8b0e7 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -48,7 +48,9 @@
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
@@ -168,7 +170,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
}
static int
-dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
@@ -203,13 +205,22 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
- drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
- if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
- drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
- DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
- DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
- DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
- drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ if (BP_IS_EMBEDDED(bp)) {
+ /*
+ * There's no pre-computed checksum of embedded BP's, so
+ * (like fletcher4-checksummed blocks) userland will have
+ * to compute a dedup-capable checksum itself.
+ */
+ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+ } else {
+ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+ drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ }
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
@@ -219,6 +230,43 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
}
+/*
+ * Emit a DRR_WRITE_EMBEDDED record for the embedded block pointer "bp".
+ *
+ * Any record still pending in dsp->dsa_drr is written out first, because
+ * the same record buffer is reused here.  The record header carries the
+ * object, offset, logical block length and the BP's compression, embed
+ * type, lsize and psize; it is followed by the (still compressed) payload
+ * padded to a multiple of 8 bytes.
+ *
+ * Returns 0 on success, or EINTR if writing to the stream failed.
+ */
static int
+dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+    int blksz, const blkptr_t *bp)
+{
+	char buf[BPE_PAYLOAD_SIZE];
+	struct drr_write_embedded *drrw =
+	    &(dsp->dsa_drr->drr_u.drr_write_embedded);
+
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_bytes(dsp, dsp->dsa_drr,
+		    sizeof (dmu_replay_record_t)) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+
+	ASSERT(BP_IS_EMBEDDED(bp));
+
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
+	drrw->drr_object = object;
+	drrw->drr_offset = offset;
+	drrw->drr_length = blksz;
+	drrw->drr_toguid = dsp->dsa_toguid;
+	drrw->drr_compression = BP_GET_COMPRESS(bp);
+	drrw->drr_etype = BPE_GET_ETYPE(bp);
+	drrw->drr_lsize = BPE_GET_LSIZE(bp);
+	drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+	/*
+	 * The payload is sent rounded up to 8 bytes, so clear the buffer
+	 * first: the padding bytes must not be leftover stack contents.
+	 * NOTE(review): assumes decode_embedded_bp_compressed() fills only
+	 * the first drr_psize bytes -- confirm against its definition.
+	 */
+	bzero(buf, sizeof (buf));
+	decode_embedded_bp_compressed(bp, buf);
+
+	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+		return (SET_ERROR(EINTR));
+	if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+		return (SET_ERROR(EINTR));
+	return (0);
+}
+
+static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
@@ -338,6 +386,33 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
return (0);
}
+/*
+ * Decide whether "bp" should be sent as a WRITE_EMBEDDED record.
+ *
+ * Returns B_TRUE only when all of the following hold:
+ *  - bp is an embedded block pointer;
+ *  - its compression function is a legacy one, or the stream has
+ *    DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 set;
+ *  - its embed type is BP_EMBEDDED_TYPE_DATA and the stream has
+ *    DMU_BACKUP_FEATURE_EMBED_DATA set.
+ * Any other embed type is never sent embedded.
+ */
+static boolean_t
+backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
+{
+	if (!BP_IS_EMBEDDED(bp))
+		return (B_FALSE);
+
+	/* Non-legacy compression functions must be explicitly enabled. */
+	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS) {
+		if (!(dsp->dsa_featureflags &
+		    DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))
+			return (B_FALSE);
+	}
+
+	/* Only the DATA embed type is known, and only when enabled. */
+	if (BPE_GET_ETYPE(bp) != BP_EMBEDDED_TYPE_DATA)
+		return (B_FALSE);
+	if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
#define BP_SPAN(dnp, level) \
(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
@@ -406,11 +481,17 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
+ } else if (backup_do_embed(dsp, bp)) {
+ /* it's an embedded level-0 block of a regular object */
+ int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ err = dump_write_embedded(dsp, zb->zb_object,
+ zb->zb_blkid * blksz, blksz, bp);
} else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
+ ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level);
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
@@ -429,7 +510,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
- err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
+ err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
blksz, bp, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -443,14 +524,15 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
- zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd,
- vnode_t *vp, offset_t *off)
+ zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
+ int outfd, vnode_t *vp, offset_t *off)
{
objset_t *os;
dmu_replay_record_t *drr;
dmu_sendarg_t *dsp;
int err;
uint64_t fromtxg = 0;
+ uint64_t featureflags = 0;
err = dmu_objset_from_ds(ds, &os);
if (err != 0) {
@@ -473,13 +555,23 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
return (SET_ERROR(EINVAL));
}
if (version >= ZPL_VERSION_SA) {
- DMU_SET_FEATUREFLAGS(
- drr->drr_u.drr_begin.drr_versioninfo,
- DMU_BACKUP_FEATURE_SA_SPILL);
+ featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
}
}
#endif
+ if (embedok &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+ } else {
+ embedok = B_FALSE;
+ }
+
+ DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+ featureflags);
+
drr->drr_u.drr_begin.drr_creation_time =
ds->ds_phys->ds_creation_time;
drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
@@ -511,6 +603,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (fromzb != NULL);
+ dsp->dsa_featureflags = featureflags;
mutex_enter(&ds->ds_sendstream_lock);
list_insert_head(&ds->ds_sendstreams, dsp);
@@ -562,7 +655,7 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- int outfd, vnode_t *vp, offset_t *off)
+ boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@@ -596,10 +689,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = fromds->ds_phys->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off);
}
dsl_dataset_rele(ds, FTAG);
@@ -607,7 +700,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
}
int
-dmu_send(const char *tosnap, const char *fromsnap,
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -674,10 +767,10 @@ dmu_send(const char *tosnap, const char *fromsnap,
dsl_pool_rele(dp, FTAG);
return (err);
}
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off);
}
if (owned)
@@ -847,6 +940,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
uint64_t fromguid = drrb->drr_fromguid;
int flags = drrb->drr_flags;
int error;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
@@ -860,11 +954,22 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EINVAL));
/* Verify pool version supports SA if SA_SPILL feature set */
- if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_SA_SPILL) &&
- spa_version(dp->dp_spa) < SPA_VERSION_SA) {
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(dp->dp_spa) < SPA_VERSION_SA)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+ * record to a plain WRITE record, so the pool must have the
+ * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+ * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
- }
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
@@ -1159,7 +1264,6 @@ backup_byteswap(dmu_replay_record_t *drr)
break;
case DRR_OBJECT:
DO64(drr_object.drr_object);
- /* DO64(drr_object.drr_allocation_txg); */
DO32(drr_object.drr_type);
DO32(drr_object.drr_bonustype);
DO32(drr_object.drr_blksz);
@@ -1197,6 +1301,14 @@ backup_byteswap(dmu_replay_record_t *drr)
DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
DO64(drr_write_byref.drr_key.ddk_prop);
break;
+ case DRR_WRITE_EMBEDDED:
+ DO64(drr_write_embedded.drr_object);
+ DO64(drr_write_embedded.drr_offset);
+ DO64(drr_write_embedded.drr_length);
+ DO64(drr_write_embedded.drr_toguid);
+ DO32(drr_write_embedded.drr_lsize);
+ DO32(drr_write_embedded.drr_psize);
+ break;
case DRR_FREE:
DO64(drr_free.drr_object);
DO64(drr_free.drr_offset);
@@ -1384,7 +1496,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
int err;
guid_map_entry_t gmesrch;
guid_map_entry_t *gmep;
- avl_index_t where;
+ avl_index_t where;
objset_t *ref_os = NULL;
dmu_buf_t *dbp;
@@ -1407,8 +1519,9 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
ref_os = os;
}
- if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
- drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
+ err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+ drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+ if (err != 0)
return (err);
tx = dmu_tx_create(os);
@@ -1428,6 +1541,48 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
}
+/*
+ * Apply one DRR_WRITE_EMBEDDED record from a receive stream.
+ *
+ * The record header is validated first (offset + length must not wrap,
+ * psize must fit in an embedded BP payload, embed type and compression
+ * must be known values); EINVAL is returned for a malformed record.  The
+ * payload, padded in the stream to a multiple of 8 bytes, is then read
+ * and written within its own transaction via dmu_write_embedded().
+ *
+ * Returns 0 on success, EINVAL for a bad record, ra->err if the payload
+ * could not be read, or the error from dmu_tx_assign().
+ */
static int
+restore_write_embedded(struct restorearg *ra, objset_t *os,
+    struct drr_write_embedded *drrwnp)
+{
+	dmu_tx_t *tx;
+	int err;
+	void *data;
+
+	if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
+		return (SET_ERROR(EINVAL));
+
+	if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
+		return (SET_ERROR(EINVAL));
+
+	if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+		return (SET_ERROR(EINVAL));
+	if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+		return (SET_ERROR(EINVAL));
+
+	/* The sender pads the payload to a multiple of 8 bytes. */
+	data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8));
+	if (data == NULL)
+		return (ra->err);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, drrwnp->drr_object,
+	    drrwnp->drr_offset, drrwnp->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	/*
+	 * The byte order passed down is the host's, flipped when the
+	 * stream is being byteswapped on receive.
+	 */
+	dmu_write_embedded(os, drrwnp->drr_object,
+	    drrwnp->drr_offset, data, drrwnp->drr_etype,
+	    drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
+	    ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
dmu_tx_t *tx;
@@ -1621,6 +1776,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra.err = restore_write_byref(&ra, os, &drrwbr);
break;
}
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded drrwe =
+ drr->drr_u.drr_write_embedded;
+ ra.err = restore_write_embedded(&ra, os, &drrwe);
+ break;
+ }
case DRR_FREE:
{
struct drr_free drrf = drr->drr_u.drr_free;
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
index 1f64d73acd..ad2f09ed4a 100644
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -58,6 +58,7 @@ typedef struct traverse_data {
zbookmark_t *td_resume;
int td_flags;
prefetch_data_t *td_pfd;
+ boolean_t td_paused;
blkptr_cb_t *td_func;
void *td_arg;
} traverse_data_t;
@@ -163,7 +164,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* If we found the block we're trying to resume from, zero
* the bookmark out to indicate that we have resumed.
*/
- ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
bzero(td->td_resume, sizeof (*zb));
if (td->td_flags & TRAVERSE_POST)
@@ -174,14 +174,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
}
static void
-traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
-{
- ASSERT(td->td_resume != NULL);
- ASSERT0(zb->zb_level);
- bcopy(zb, td->td_resume, sizeof (*td->td_resume));
-}
-
-static void
traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_t *zb)
{
@@ -205,16 +197,25 @@ traverse_prefetch_metadata(traverse_data_t *td,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
}
+/*
+ * Report whether the traversal prefetcher handles "bp".  Returns B_FALSE
+ * for holes, embedded block pointers and intent-log blocks, and B_TRUE
+ * for everything else.  Callers must only ask when data prefetching
+ * (TRAVERSE_PREFETCH_DATA) is in effect.
+ */
+static boolean_t
+prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
+{
+	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
+
+	if (BP_IS_HOLE(bp))
+		return (B_FALSE);
+	if (BP_IS_EMBEDDED(bp))
+		return (B_FALSE);
+	if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
const blkptr_t *bp, const zbookmark_t *zb)
{
zbookmark_t czb;
- int err = 0, lasterr = 0;
+ int err = 0;
arc_buf_t *buf = NULL;
prefetch_data_t *pd = td->td_pfd;
boolean_t hard = td->td_flags & TRAVERSE_HARD;
- boolean_t pause = B_FALSE;
switch (resume_skip_check(td, dnp, zb)) {
case RESUME_SKIP_ALL:
@@ -251,14 +252,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
return (0);
}
- if (BP_IS_HOLE(bp)) {
- err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
- return (err);
- }
-
- if (pd && !pd->pd_exited &&
- ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
- BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
+ if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
mutex_enter(&pd->pd_mtx);
ASSERT(pd->pd_blks_fetched >= 0);
while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
@@ -268,13 +262,18 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
mutex_exit(&pd->pd_mtx);
}
+ if (BP_IS_HOLE(bp)) {
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+ if (err != 0)
+ goto post;
+ return (0);
+ }
+
if (td->td_flags & TRAVERSE_PRE) {
err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0);
- if (err == ERESTART)
- pause = B_TRUE; /* handle pausing at a common point */
if (err != 0)
goto post;
}
@@ -288,7 +287,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
- return (err);
+ goto post;
cbp = buf->b_data;
for (i = 0; i < epb; i++) {
@@ -304,11 +303,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp, &cbp[i], &czb);
- if (err != 0) {
- if (!hard)
- break;
- lasterr = err;
- }
+ if (err != 0)
+ break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT;
@@ -318,7 +314,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
- return (err);
+ goto post;
dnp = buf->b_data;
for (i = 0; i < epb; i++) {
@@ -330,11 +326,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
for (i = 0; i < epb; i++) {
err = traverse_dnode(td, &dnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
- if (err != 0) {
- if (!hard)
- break;
- lasterr = err;
- }
+ if (err != 0)
+ break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
@@ -344,7 +337,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
- return (err);
+ goto post;
osp = buf->b_data;
dnp = &osp->os_meta_dnode;
@@ -359,19 +352,11 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_META_DNODE_OBJECT);
- if (err && hard) {
- lasterr = err;
- err = 0;
- }
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_groupused_dnode;
err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_GROUPUSED_OBJECT);
}
- if (err && hard) {
- lasterr = err;
- err = 0;
- }
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_userused_dnode;
err = traverse_dnode(td, dnp, zb->zb_objset,
@@ -383,19 +368,37 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
(void) arc_buf_remove_ref(buf, &buf);
post:
- if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+ if (err == 0 && (td->td_flags & TRAVERSE_POST))
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
- if (err == ERESTART)
- pause = B_TRUE;
+
+ if (hard && (err == EIO || err == ECKSUM)) {
+ /*
+ * Ignore this disk error as requested by the HARD flag,
+ * and continue traversal.
+ */
+ err = 0;
}
- if (pause && td->td_resume != NULL) {
- ASSERT3U(err, ==, ERESTART);
- ASSERT(!hard);
- traverse_pause(td, zb);
+ /*
+ * If we are stopping here, set td_resume.
+ */
+ if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+ td->td_resume->zb_objset = zb->zb_objset;
+ td->td_resume->zb_object = zb->zb_object;
+ td->td_resume->zb_level = 0;
+ /*
+ * If we have stopped on an indirect block (e.g. due to
+ * i/o error), we have not visited anything below it.
+ * Set the bookmark to the first level-0 block that we need
+ * to visit. This way, the resuming code does not need to
+ * deal with resuming from indirect blocks.
+ */
+ td->td_resume->zb_blkid = zb->zb_blkid <<
+ (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+ td->td_paused = B_TRUE;
}
- return (err != 0 ? err : lasterr);
+ return (err);
}
static void
@@ -420,30 +423,21 @@ static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object)
{
- int j, err = 0, lasterr = 0;
+ int j, err = 0;
zbookmark_t czb;
- boolean_t hard = (td->td_flags & TRAVERSE_HARD);
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
- if (err != 0) {
- if (!hard)
- break;
- lasterr = err;
- }
+ if (err != 0)
+ break;
}
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
- if (err != 0) {
- if (!hard)
- return (err);
- lasterr = err;
- }
}
- return (err != 0 ? err : lasterr);
+ return (err);
}
/* ARGSUSED */
@@ -458,10 +452,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (pfd->pd_cancel)
return (SET_ERROR(EINTR));
- if (BP_IS_HOLE(bp) ||
- !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
- BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
- BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+ if (!prefetch_needed(pfd, bp))
return (0);
mutex_enter(&pfd->pd_mtx);
@@ -530,6 +521,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td.td_arg = arg;
td.td_pfd = &pd;
td.td_flags = flags;
+ td.td_paused = B_FALSE;
pd.pd_blks_max = zfs_pd_blks_max;
pd.pd_flags = flags;
@@ -603,7 +595,7 @@ int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
{
- int err, lasterr = 0;
+ int err;
uint64_t obj;
dsl_pool_t *dp = spa_get_dsl(spa);
objset_t *mos = dp->dp_meta_objset;
@@ -616,16 +608,15 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
return (err);
/* visit each dataset */
- for (obj = 1; err == 0 || (err != ESRCH && hard);
+ for (obj = 1; err == 0;
err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi);
if (err != 0) {
- if (!hard)
- return (err);
- lasterr = err;
- continue;
+ if (hard)
+ continue;
+ break;
}
if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
@@ -636,23 +627,19 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
dsl_pool_config_exit(dp, FTAG);
if (err != 0) {
- if (!hard)
- return (err);
- lasterr = err;
- continue;
+ if (hard)
+ continue;
+ break;
}
if (ds->ds_phys->ds_prev_snap_txg > txg)
txg = ds->ds_phys->ds_prev_snap_txg;
err = traverse_dataset(ds, txg, flags, func, arg);
dsl_dataset_rele(ds, FTAG);
- if (err != 0) {
- if (!hard)
- return (err);
- lasterr = err;
- }
+ if (err != 0)
+ break;
}
}
if (err == ESRCH)
err = 0;
- return (err != 0 ? err : lasterr);
+ return (err);
}
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 18121e9eb6..7261314ad5 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -1811,8 +1811,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
*offset = *offset >> span;
for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) {
- if (bp[i].blk_fill >= minfill &&
- bp[i].blk_fill <= maxfill &&
+ if (BP_GET_FILL(&bp[i]) >= minfill &&
+ BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || bp[i].blk_birth > txg))
break;
if (inc > 0 || *offset > 0)
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index ee6edd3d4e..04fbd37b6e 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -233,8 +233,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif
-#define ALL -1
-
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
@@ -362,7 +360,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
free_children(db, blkid, nblks, tx);
dbuf_rele(db, FTAG);
-
}
}
@@ -591,11 +588,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen;
}
-
ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(dnp->dn_nlevels < 2 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
if (dn->dn_next_type[txgoff] != 0) {
dnp->dn_type = dn->dn_type;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index dcb0c2d615..c587237969 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -1641,7 +1641,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
else
*availbytesp = 0;
}
- *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+ *usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp);
*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_destroy.c b/usr/src/uts/common/fs/zfs/dsl_destroy.c
index 639412cdee..441036c25a 100644
--- a/usr/src/uts/common/fs/zfs/dsl_destroy.c
+++ b/usr/src/uts/common/fs/zfs/dsl_destroy.c
@@ -539,7 +539,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
- if (BP_IS_HOLE(bp))
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) {
@@ -589,6 +589,7 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
uint64_t count;
objset_t *mos;
+ ASSERT(!dsl_dataset_is_snapshot(ds));
if (dsl_dataset_is_snapshot(ds))
return (SET_ERROR(EINVAL));
@@ -711,7 +712,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ds->ds_prev->ds_phys->ds_num_children == 2 &&
ds->ds_prev->ds_userrefs == 0);
- /* Remove our reservation */
+ /* Remove our reservation. */
if (ds->ds_reserved != 0) {
dsl_dataset_set_refreservation_sync_impl(ds,
(ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index ba6df9a7e3..73a6cd271f 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -250,6 +250,13 @@ dsl_pool_open(dsl_pool_t *dp)
dp->dp_meta_objset, obj));
}
+ /*
+ * Note: errors ignored, because the leak dir will not exist if we
+ * have not encountered a leak yet.
+ */
+ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+ &dp->dp_leak_dir);
+
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
@@ -297,6 +304,8 @@ dsl_pool_close(dsl_pool_t *dp)
dsl_dir_rele(dp->dp_mos_dir, dp);
if (dp->dp_free_dir)
dsl_dir_rele(dp->dp_free_dir, dp);
+ if (dp->dp_leak_dir)
+ dsl_dir_rele(dp->dp_leak_dir, dp);
if (dp->dp_root_dir)
dsl_dir_rele(dp->dp_root_dir, dp);
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index f5e2ea825d..860f9dfb4e 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -52,9 +52,7 @@
typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
-static scan_cb_t dsl_scan_defrag_cb;
static scan_cb_t dsl_scan_scrub_cb;
-static scan_cb_t dsl_scan_remove_cb;
static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
@@ -67,7 +65,7 @@ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
@@ -1358,7 +1356,7 @@ dsl_scan_active(dsl_scan_t *scn)
if (spa_shutting_down(spa))
return (B_FALSE);
if (scn->scn_phys.scn_state == DSS_SCANNING ||
- scn->scn_async_destroying)
+ (scn->scn_async_destroying && !scn->scn_async_stalled))
return (B_TRUE);
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
@@ -1373,7 +1371,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
- int err;
+ int err = 0;
/*
* Check for scn_restart_txg before checking spa_load_state, so
@@ -1391,7 +1389,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_scan_setup_sync(&func, tx);
}
- if (!dsl_scan_active(scn) ||
+ /*
+ * If the scan is inactive due to a stalled async destroy, try again.
+ */
+ if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
spa_sync_pass(dp->dp_spa) > 1)
return;
@@ -1401,10 +1402,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
spa->spa_scrub_active = B_TRUE;
/*
- * First process the free list. If we pause the free, don't do
- * any scanning. This ensures that there is no free list when
- * we are scanning, so the scan code doesn't have to worry about
- * traversing it.
+ * First process the async destroys. If we pause, don't do
+ * any scrubbing or resilvering. This ensures that there are no
+ * async destroys while we are scanning, so the scan code doesn't
+ * have to worry about traversing them. It is also faster to free the
+ * blocks than to scrub them.
*/
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
@@ -1414,48 +1416,96 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_scan_free_block_cb, scn, tx);
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
- if (err == 0 && spa_feature_is_active(spa,
- SPA_FEATURE_ASYNC_DESTROY)) {
- ASSERT(scn->scn_async_destroying);
- scn->scn_is_bptree = B_TRUE;
- scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
- NULL, ZIO_FLAG_MUSTSUCCEED);
- err = bptree_iterate(dp->dp_meta_objset,
- dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
- scn, tx);
- VERIFY0(zio_wait(scn->scn_zio_root));
-
- if (err == 0) {
- /* finished; deactivate async destroy feature */
- spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY,
- tx);
- ASSERT(!spa_feature_is_active(spa,
- SPA_FEATURE_ASYNC_DESTROY));
- VERIFY0(zap_remove(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_BPTREE_OBJ, tx));
- VERIFY0(bptree_free(dp->dp_meta_objset,
- dp->dp_bptree_obj, tx));
- dp->dp_bptree_obj = 0;
- scn->scn_async_destroying = B_FALSE;
- }
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+ }
+
+ if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ ASSERT(scn->scn_async_destroying);
+ scn->scn_is_bptree = B_TRUE;
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bptree_iterate(dp->dp_meta_objset,
+ dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+ VERIFY0(zio_wait(scn->scn_zio_root));
+
+ if (err == EIO || err == ECKSUM) {
+ err = 0;
+ } else if (err != 0 && err != ERESTART) {
+ zfs_panic_recover("error %u from "
+ "traverse_dataset_destroyed()", err);
}
- if (scn->scn_visited_this_txg) {
- zfs_dbgmsg("freed %llu blocks in %llums from "
- "free_bpobj/bptree txg %llu",
- (longlong_t)scn->scn_visited_this_txg,
- (longlong_t)
- NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
- (longlong_t)tx->tx_txg);
- scn->scn_visited_this_txg = 0;
- /*
- * Re-sync the ddt so that we can further modify
- * it when doing bprewrite.
- */
- ddt_sync(spa, tx->tx_txg);
+
+ /*
+ * If we didn't make progress, mark the async destroy as
+ * stalled, so that we will not initiate a spa_sync() on
+ * its behalf.
+ */
+ scn->scn_async_stalled = (scn->scn_visited_this_txg == 0);
+
+ if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+ /* finished; deactivate async destroy feature */
+ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+ ASSERT(!spa_feature_is_active(spa,
+ SPA_FEATURE_ASYNC_DESTROY));
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, tx));
+ VERIFY0(bptree_free(dp->dp_meta_objset,
+ dp->dp_bptree_obj, tx));
+ dp->dp_bptree_obj = 0;
+ scn->scn_async_destroying = B_FALSE;
}
- if (err == ERESTART)
- return;
+ }
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj/bptree txg %llu; err=%u",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+ (longlong_t)tx->tx_txg, err);
+ scn->scn_visited_this_txg = 0;
+
+ /*
+ * Write out changes to the DDT that may be required as a
+ * result of the blocks freed. This ensures that the DDT
+ * is clean when a scrub/resilver runs.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err != 0)
+ return;
+ if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
+ (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
+ dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
+ dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
+ /*
+ * We have finished background destroying, but there is still
+ * some space left in the dp_free_dir. Transfer this leaked
+ * space to the dp_leak_dir.
+ */
+ if (dp->dp_leak_dir == NULL) {
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ LEAK_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ LEAK_DIR_NAME, &dp->dp_leak_dir));
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ }
+ dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+ dp->dp_free_dir->dd_phys->dd_used_bytes,
+ dp->dp_free_dir->dd_phys->dd_compressed_bytes,
+ dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ -dp->dp_free_dir->dd_phys->dd_used_bytes,
+ -dp->dp_free_dir->dd_phys->dd_compressed_bytes,
+ -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+ }
+ if (!scn->scn_async_destroying) {
+ /* finished; verify that space accounting went to zero */
+ ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
+ ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
+ ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes);
}
if (scn->scn_phys.scn_state != DSS_SCANNING)
@@ -1638,6 +1688,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
count_block(dp->dp_blkstats, bp);
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB;
diff --git a/usr/src/uts/common/fs/zfs/dsl_userhold.c b/usr/src/uts/common/fs/zfs/dsl_userhold.c
index 7f2c26f766..be5b7102ca 100644
--- a/usr/src/uts/common/fs/zfs/dsl_userhold.c
+++ b/usr/src/uts/common/fs/zfs/dsl_userhold.c
@@ -600,8 +600,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
ddura.ddura_chkholds = fnvlist_alloc();
error = dsl_sync_task(pool, dsl_dataset_user_release_check,
- dsl_dataset_user_release_sync, &ddura,
- fnvlist_num_pairs(holds));
+ dsl_dataset_user_release_sync, &ddura, 0);
fnvlist_free(ddura.ddura_todelete);
fnvlist_free(ddura.ddura_chkholds);
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index baa54392c6..f708ce3ef2 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -241,19 +241,25 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
}
if (pool != NULL) {
- dsl_dir_t *freedir = pool->dp_free_dir;
-
/*
* The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
* when opening pools before this version freedir will be NULL.
*/
- if (freedir != NULL) {
+ if (pool->dp_free_dir != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
- freedir->dd_phys->dd_used_bytes, src);
+ pool->dp_free_dir->dd_phys->dd_used_bytes, src);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
NULL, 0, src);
}
+
+ if (pool->dp_leak_dir != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+ pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+ NULL, 0, src);
+ }
}
spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
@@ -1827,7 +1833,7 @@ static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
- if (!BP_IS_HOLE(bp)) {
+ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
@@ -2375,9 +2381,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
- &spa->spa_feat_enabled_txg_obj) != 0) {
+ &spa->spa_feat_enabled_txg_obj) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
- }
}
spa->spa_is_initializing = B_TRUE;
@@ -5260,11 +5265,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ASSERT(!locked);
ASSERT(vd == vd->vdev_top);
- /*
- * XXX - Once we have bp-rewrite this should
- * become the common case.
- */
-
mg = vd->vdev_mg;
/*
@@ -6438,7 +6438,7 @@ spa_upgrade(spa_t *spa, uint64_t version)
* possible.
*/
ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
- ASSERT(version >= spa->spa_uberblock.ub_version);
+ ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
spa->spa_uberblock.ub_version = version;
vdev_config_dirty(spa->spa_root_vdev);
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index fa0658493e..ea386591b2 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -250,7 +250,38 @@ int zfs_flags = 0;
* This should only be used as a last resort, as it typically results
* in leaked space, or worse.
*/
-int zfs_recover = 0;
+boolean_t zfs_recover = B_FALSE;
+
+/*
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress. While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked". Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers. In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks. However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug). In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless. In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do. Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
+ */
+boolean_t zfs_free_leak_on_eio = B_FALSE;
/*
* Expiration time in milliseconds. This value has two meanings. First it is
@@ -1341,7 +1372,10 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
sizeof (type));
}
- checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ if (!BP_IS_EMBEDDED(bp)) {
+ checksum =
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ }
compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
@@ -1643,7 +1677,7 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
uint64_t dsize = 0;
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
return (dsize);
@@ -1656,7 +1690,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
spa_config_exit(spa, SCL_VDEV, FTAG);
diff --git a/usr/src/uts/common/fs/zfs/sys/blkptr.h b/usr/src/uts/common/fs/zfs/sys/blkptr.h
new file mode 100644
index 0000000000..b720482a73
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/blkptr.h
@@ -0,0 +1,38 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_BLKPTR_H
+#define _SYS_BLKPTR_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void encode_embedded_bp_compressed(blkptr_t *, void *,
+ enum zio_compress, int, int);
+void decode_embedded_bp_compressed(const blkptr_t *, void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BLKPTR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/bptree.h b/usr/src/uts/common/fs/zfs/sys/bptree.h
index 9715072118..a533cb9490 100644
--- a/usr/src/uts/common/fs/zfs/sys/bptree.h
+++ b/usr/src/uts/common/fs/zfs/sys/bptree.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPTREE_H
@@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 207834dfd4..c80b7ffba3 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -274,6 +274,9 @@ void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
+void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 2237d43996..9a6006ce66 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -119,6 +119,14 @@ typedef enum dmu_object_byteswap {
((ot) & DMU_OT_METADATA) : \
dmu_ot[(ot)].ot_metadata)
+/*
+ * These object types use blk_fill != 1 for their L0 bp's. Therefore they can't
+ * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because blk_fill
+ * is repurposed for embedded BPs.
+ */
+#define DMU_OT_HAS_FILL(ot) \
+ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
+
#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
((ot) & DMU_OT_BYTESWAP_MASK) : \
dmu_ot[(ot)].ot_byteswap)
@@ -245,7 +253,6 @@ void zfs_znode_byteswap(void *buf, size_t size);
#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
-#define DMU_DEADLIST_OBJECT (-3ULL)
/*
* artificial blkids for bonus buffer and spill blocks
@@ -394,6 +401,11 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx);
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx);
+
/*
* Decide how to write a block: checksum, compression, number of copies, etc.
*/
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
index 76af7bd54e..cf45e91dba 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
@@ -294,12 +294,15 @@ typedef struct dmu_sendarg {
int dsa_err;
dmu_pendop_t dsa_pending_op;
boolean_t dsa_incremental;
+ uint64_t dsa_featureflags;
uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset;
} dmu_sendarg_t;
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
+int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
+ void *, dmu_buf_t **);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
index f45949b8d7..dc183c02c3 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
@@ -37,12 +37,12 @@ struct dsl_dataset;
struct drr_begin;
struct avl_tree;
-int dmu_send(const char *tosnap, const char *fromsnap, int outfd,
- struct vnode *vp, offset_t *off);
+int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ int outfd, struct vnode *vp, offset_t *off);
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- int outfd, struct vnode *vp, offset_t *off);
+ boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_ds;
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
index 752ccadb72..a9c4f67515 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -160,6 +160,7 @@ boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
#define ORIGIN_DIR_NAME "$ORIGIN"
#define XLATION_DIR_NAME "$XLATION"
#define FREE_DIR_NAME "$FREE"
+#define LEAK_DIR_NAME "$LEAK"
#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index ea180c9a5b..a6fb201047 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -84,6 +84,7 @@ typedef struct dsl_pool {
struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir;
struct dsl_dir *dp_free_dir;
+ struct dsl_dir *dp_leak_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_vnrele_taskq;
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
index bf8c5ac824..d9f3966807 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
@@ -114,6 +114,7 @@ typedef struct dsl_scan {
/* for freeing blocks */
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
+ boolean_t scn_async_stalled;
/* for debugging / information */
uint64_t scn_visited_this_txg;
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index c76dc0fbad..4bb3d0a9ad 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -154,7 +154,7 @@ typedef struct zio_cksum {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -188,7 +188,8 @@ typedef struct zio_cksum {
* G gang block indicator
* B byteorder (endianness)
* D dedup
- * X unused
+ * X encryption (on version 30, which is not supported)
+ * E blkptr_t contains embedded data (see below)
* lvl level of indirection
* type DMU object type
* phys birth txg of block allocation; zero if same as logical birth txg
@@ -196,6 +197,100 @@ typedef struct zio_cksum {
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
+
+/*
+ * "Embedded" blkptr_t's don't actually point to a block; instead they
+ * have a data payload embedded in the blkptr_t itself. See the comment
+ * in blkptr.c for more details.
+ *
+ * The blkptr_t is laid out as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | payload |
+ * 1 | payload |
+ * 2 | payload |
+ * 3 | payload |
+ * 4 | payload |
+ * 5 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | payload |
+ * 8 | payload |
+ * 9 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | payload |
+ * c | payload |
+ * d | payload |
+ * e | payload |
+ * f | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * payload contains the embedded data
+ * B (byteorder) byteorder (endianness)
+ * D (dedup) padding (set to zero)
+ * X encryption (set to zero; see above)
+ * E (embedded) set to one
+ * lvl indirection level
+ * type DMU object type
+ * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
+ * comp compression function of payload
+ * PSIZE size of payload after compression, in bytes
+ * LSIZE logical size of payload, in bytes
+ * note that 25 bits is enough to store the largest
+ * "normal" BP's LSIZE (2^16 * 2^9) in bytes
+ * log. birth transaction group in which the block was logically born
+ *
+ * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
+ * bp's they are stored in units of SPA_MINBLOCKSHIFT.
+ * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal
+ * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
+ * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
+ * other macros, as they assert that they are only used on BP's of the correct
+ * "embedded-ness".
+ */
+
+#define BPE_GET_ETYPE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET((bp)->blk_prop, 40, 8))
+#define BPE_SET_ETYPE(bp, t) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET((bp)->blk_prop, 40, 8, t); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_LSIZE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
+#define BPE_SET_LSIZE(bp, x) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_PSIZE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
+#define BPE_SET_PSIZE(bp, x) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+typedef enum bp_embedded_type {
+ BP_EMBEDDED_TYPE_DATA,
+ BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
+ NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
+} bp_embedded_type_t;
+
+#define BPE_NUM_WORDS 14
+#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define BPE_IS_PAYLOADWORD(bp, wp) \
+ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
@@ -242,20 +337,37 @@ typedef struct blkptr {
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \
- BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
-#define BP_SET_LSIZE(bp, x) \
- BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+ (BP_IS_EMBEDDED(bp) ? \
+ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
+ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, \
+ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
#define BP_GET_PSIZE(bp) \
- BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
-#define BP_SET_PSIZE(bp, x) \
- BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_PSIZE(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, \
+ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
+#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)
-#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
-#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+#define BP_GET_CHECKSUM(bp) \
+ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
+ BF64_GET((bp)->blk_prop, 40, 8))
+#define BP_SET_CHECKSUM(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET((bp)->blk_prop, 40, 8, x); \
+_NOTE(CONSTCOND) } while (0)
#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
@@ -263,9 +375,6 @@ typedef struct blkptr {
#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
-#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
-#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
-
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
@@ -273,31 +382,39 @@ typedef struct blkptr {
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \
- ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
#define BP_SET_BIRTH(bp, logical, physical) \
{ \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
(bp)->blk_birth = (logical); \
(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
}
+#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
+
#define BP_GET_ASIZE(bp) \
- (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
- DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
- (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_COUNT_GANG(bp) \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
DVA_GET_GANG(&(bp)->blk_dva[1]) + \
- DVA_GET_GANG(&(bp)->blk_dva[2]))
+ DVA_GET_GANG(&(bp)->blk_dva[2])))
#define DVA_EQUAL(dva1, dva2) \
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
@@ -305,6 +422,7 @@ typedef struct blkptr {
#define BP_EQUAL(bp1, bp2) \
(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
+ (bp1)->blk_birth == (bp2)->blk_birth && \
DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@@ -325,11 +443,13 @@ typedef struct blkptr {
(zcp)->zc_word[3] = w3; \
}
-#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
-#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
+#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) \
+ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
(dva)->dva_word[1] == 0ULL)
-#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp))
+#define BP_IS_HOLE(bp) \
+ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
/* BP_IS_RAIDZ(bp) assumes no block compression */
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
@@ -383,6 +503,17 @@ typedef struct blkptr {
" birth=%lluL", \
(u_longlong_t)bp->blk_birth); \
} \
+ } else if (BP_IS_EMBEDDED(bp)) { \
+ len = func(buf + len, size - len, \
+ "EMBEDDED [L%llu %s] et=%u %s " \
+ "size=%llxL/%llxP birth=%lluL", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ (int)BPE_GET_ETYPE(bp), \
+ compress, \
+ (u_longlong_t)BPE_GET_LSIZE(bp), \
+ (u_longlong_t)BPE_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth); \
} else { \
for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
const dva_t *dva = &bp->blk_dva[d]; \
@@ -416,7 +547,7 @@ typedef struct blkptr {
(u_longlong_t)BP_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth, \
(u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
- (u_longlong_t)bp->blk_fill, \
+ (u_longlong_t)BP_GET_FILL(bp), \
ws, \
(u_longlong_t)bp->blk_cksum.zc_word[0], \
(u_longlong_t)bp->blk_cksum.zc_word[1], \
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 40f447e9ef..9d74efa184 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -38,6 +38,7 @@
#include <sys/refcount.h>
#include <sys/bplist.h>
#include <sys/bpobj.h>
+#include <sys/zfeature.h>
#include <zfeature_common.h>
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
index e6a1fa2828..3c8e2177bc 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
@@ -47,7 +47,8 @@ extern "C" {
#endif
extern int zfs_flags;
-extern int zfs_recover;
+extern boolean_t zfs_recover;
+extern boolean_t zfs_free_leak_on_eio;
#define ZFS_DEBUG_DPRINTF (1<<0)
#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index 9422177023..bf9f83c376 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_IOCTL_H
@@ -79,15 +79,19 @@ typedef enum drr_headertype {
* Feature flags for zfs send streams (flags in drr_versioninfo)
*/
-#define DMU_BACKUP_FEATURE_DEDUP (0x1)
-#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2)
-#define DMU_BACKUP_FEATURE_SA_SPILL (0x4)
+#define DMU_BACKUP_FEATURE_DEDUP (1<<0)
+#define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1)
+#define DMU_BACKUP_FEATURE_SA_SPILL (1<<2)
+/* flags #3 - #15 are reserved for incompatible closed-source implementations */
+#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
+#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
- DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
+ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
+ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@@ -129,7 +133,7 @@ typedef struct dmu_replay_record {
enum {
DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
- DRR_SPILL, DRR_NUMTYPES
+ DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
} drr_type;
uint32_t drr_payloadlen;
union {
@@ -206,6 +210,19 @@ typedef struct dmu_replay_record {
uint64_t drr_pad[4]; /* needed for crypto */
/* spill data follows */
} drr_spill;
+ struct drr_write_embedded {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ /* logical length, should equal blocksize */
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint8_t drr_compression;
+ uint8_t drr_etype;
+ uint8_t drr_pad[6];
+ uint32_t drr_lsize; /* uncompressed size of payload */
+ uint32_t drr_psize; /* compr. (real) size of payload */
+ /* (possibly compressed) content follows */
+ } drr_write_embedded;
} drr_u;
} dmu_replay_record_t;
@@ -307,8 +324,8 @@ typedef struct zfs_cmd {
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record;
- boolean_t zc_defer_destroy;
- boolean_t zc_temphold;
+ uint32_t zc_defer_destroy;
+ uint32_t zc_flags;
uint64_t zc_action_handle;
int zc_cleanup_fd;
uint8_t zc_pad[4]; /* alignment */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 8e901b804b..933c258c26 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -84,6 +84,12 @@ enum zio_checksum {
ZIO_CHECKSUM_FUNCTIONS
};
+/*
+ * The number of "legacy" checksum functions which can be set on individual
+ * objects.
+ */
+#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
+
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
@@ -113,6 +119,12 @@ enum zio_compress {
ZIO_COMPRESS_FUNCTIONS
};
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
+
/* N.B. when altering this value, also change BOOTFS_COMPRESS_VALID below */
#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
diff --git a/usr/src/uts/common/fs/zfs/zfs_debug.c b/usr/src/uts/common/fs/zfs/zfs_debug.c
index 26ea561eb1..85fa7600d9 100644
--- a/usr/src/uts/common/fs/zfs/zfs_debug.c
+++ b/usr/src/uts/common/fs/zfs/zfs_debug.c
@@ -28,7 +28,7 @@
list_t zfs_dbgmsgs;
int zfs_dbgmsg_size;
kmutex_t zfs_dbgmsgs_lock;
-int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
void
zfs_dbgmsg_init(void)
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 559a2ec648..14fe1f5bd4 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -4310,6 +4310,7 @@ out:
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
+ * zc_flags if bit 0 is set, DRR_WRITE_EMBEDDED records are permitted
*
* outputs:
* zc_objset_type estimated size, if zc_guid is set
@@ -4320,6 +4321,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
int error;
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
+ boolean_t embedok = (zc->zc_flags & 0x1);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4380,7 +4382,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
- zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off);
+ zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5282,6 +5284,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
* }
*
* outnvl is unused
@@ -5294,6 +5298,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
offset_t off;
char *fromname = NULL;
int fd;
+ boolean_t embedok;
error = nvlist_lookup_int32(innvl, "fd", &fd);
if (error != 0)
@@ -5301,12 +5306,14 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+ embedok = nvlist_exists(innvl, "embedok");
+
file_t *fp = getf(fd);
if (fp == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
- error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off);
+ error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 17b22a0c8f..b873b12bd7 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -137,10 +137,15 @@ int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
avl_tree_t *t = &zilog->zl_bp_tree;
- const dva_t *dva = BP_IDENTITY(bp);
+ const dva_t *dva;
zil_bp_node_t *zn;
avl_index_t where;
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
+ dva = BP_IDENTITY(bp);
+
if (avl_find(t, dva, &where) != NULL)
return (SET_ERROR(EEXIST));
@@ -831,7 +836,7 @@ zil_lwb_write_done(zio_t *zio)
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
ASSERT(!BP_IS_GANG(zio->io_bp));
ASSERT(!BP_IS_HOLE(zio->io_bp));
- ASSERT(zio->io_bp->blk_fill == 0);
+ ASSERT(BP_GET_FILL(zio->io_bp) == 0);
/*
* Ensure the lwb buffer pointer is cleared before releasing
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index d283576fc8..23d5cf5666 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -37,6 +37,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/zfs_zone.h>
@@ -216,7 +217,7 @@ zio_buf_alloc(size_t size)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+ ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}
@@ -671,6 +672,16 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_physdone = physdone;
zio->io_prop = *zp;
+ /*
+ * Data can be NULL if we are going to call zio_write_override() to
+ * provide the already-allocated BP. But we may need the data to
+ * verify a dedup hit (if requested). In this case, don't try to
+ * dedup (just take the already-allocated BP verbatim).
+ */
+ if (data == NULL && zio->io_prop.zp_dedup_verify) {
+ zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+ }
+
return (zio);
}
@@ -710,6 +721,14 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
+
+ /*
+ * The check for EMBEDDED is a performance optimization. We
+ * process the free here (by ignoring it) rather than
+ * putting it on the list and then processing it in zio_free_sync().
+ */
+ if (BP_IS_EMBEDDED(bp))
+ return;
metaslab_check_free(spa, bp);
/*
@@ -734,13 +753,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio_t *zio;
enum zio_stage stage = ZIO_FREE_PIPELINE;
- dprintf_bp(bp, "freeing in txg %llu, pass %u",
- (longlong_t)txg, spa->spa_sync_pass);
-
ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
metaslab_check_free(spa, bp);
arc_freed(spa, bp);
@@ -756,7 +775,6 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
-
return (zio);
}
@@ -766,6 +784,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
{
zio_t *zio;
+ dprintf_bp(bp, "claiming in txg %llu", txg);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
@@ -972,12 +995,20 @@ zio_read_bp_init(zio_t *zio)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
- uint64_t psize = BP_GET_PSIZE(bp);
+ uint64_t psize =
+ BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(psize);
zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
}
+ if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ decode_embedded_bp_compressed(bp, zio->io_data);
+ } else {
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ }
+
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
@@ -1021,6 +1052,9 @@ zio_write_bp_init(zio_t *zio)
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (BP_IS_EMBEDDED(bp))
+ return (ZIO_PIPELINE_CONTINUE);
+
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
@@ -1069,7 +1103,7 @@ zio_write_bp_init(zio_t *zio)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
- ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
@@ -1079,9 +1113,38 @@ zio_write_bp_init(zio_t *zio)
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
+ } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+ zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ encode_embedded_bp_compressed(bp,
+ cbuf, compress, lsize, psize);
+ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+ BP_SET_TYPE(bp, zio->io_prop.zp_type);
+ BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+ zio_buf_free(cbuf, lsize);
+ bp->blk_birth = zio->io_txg;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_EMBEDDED_DATA));
+ return (ZIO_PIPELINE_CONTINUE);
} else {
- ASSERT(psize < lsize);
- zio_push_transform(zio, cbuf, psize, lsize, NULL);
+ /*
+ * Round up compressed size to MINBLOCKSIZE and
+ * zero the tail.
+ */
+ size_t rounded =
+ P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
+ if (rounded > psize) {
+ bzero((char *)cbuf + psize, rounded - psize);
+ psize = rounded;
+ }
+ if (psize == lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ } else {
+ zio_push_transform(zio, cbuf,
+ psize, lsize, NULL);
+ }
}
}
@@ -2779,7 +2842,7 @@ zio_checksum_verified(zio_t *zio)
/*
* ==========================================================================
* Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
- * An error of 0 indictes success. ENXIO indicates whole-device failure,
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
* which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it.
@@ -2889,7 +2952,7 @@ zio_done(zio_t *zio)
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
- if (bp != NULL) {
+ if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
ASSERT(bp->blk_pad[0] == 0);
ASSERT(bp->blk_pad[1] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
@@ -3179,13 +3242,6 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
ASSERT(zb1->zb_objset == zb2->zb_objset);
ASSERT(zb2->zb_level == 0);
- /*
- * A bookmark in the deadlist is considered to be after
- * everything else.
- */
- if (zb2->zb_object == DMU_DEADLIST_OBJECT)
- return (B_TRUE);
-
/* The objset_phys_t isn't before anything. */
if (dnp == NULL)
return (B_FALSE);
diff --git a/usr/src/uts/common/fs/zfs/zio_compress.c b/usr/src/uts/common/fs/zfs/zio_compress.c
index d3c6ae17ce..0d77f246c4 100644
--- a/usr/src/uts/common/fs/zfs/zio_compress.c
+++ b/usr/src/uts/common/fs/zfs/zio_compress.c
@@ -80,7 +80,7 @@ size_t
zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
{
uint64_t *word, *word_end;
- size_t c_len, d_len, r_len;
+ size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c];
ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
@@ -102,28 +102,13 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
return (s_len);
/* Compress at least 12.5% */
- d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
- if (d_len == 0)
- return (s_len);
-
+ d_len = s_len - (s_len >> 3);
c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
if (c_len > d_len)
return (s_len);
- /*
- * Cool. We compressed at least as much as we were hoping to.
- * For both security and repeatability, pad out the last sector.
- */
- r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
- if (r_len > c_len) {
- bzero((char *)dst + c_len, r_len - c_len);
- c_len = r_len;
- }
-
ASSERT3U(c_len, <=, d_len);
- ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
-
return (c_len);
}
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index fa1a756c61..7287cca267 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -260,6 +260,8 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
return (0);
+ VERIFY(!BP_IS_EMBEDDED(bp));
+
VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
ma->ma_blks++;
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index caad33c567..01deecdf02 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -189,6 +189,7 @@ typedef enum {
ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING,
+ ZPOOL_PROP_LEAKED,
ZPOOL_NUM_PROPS
} zpool_prop_t;
diff --git a/usr/src/uts/common/sys/sdt.h b/usr/src/uts/common/sys/sdt.h
index ed8603eabc..6cbbe856bb 100644
--- a/usr/src/uts/common/sys/sdt.h
+++ b/usr/src/uts/common/sys/sdt.h
@@ -407,7 +407,7 @@ extern "C" {
type7, arg7, type8, arg8);
/*
- * the set-error SDT probe is extra static, in that we declare its fake
+ * The set-error SDT probe is extra static, in that we declare its fake
* function literally, rather than with the DTRACE_PROBE1() macro. This is
* necessary so that SET_ERROR() can evaluate to a value, which wouldn't
* be possible if it required multiple statements (to declare the function