summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2016-11-30 13:11:37 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2016-11-30 13:11:37 +0000
commit80c431c3af17a3f5c86dac722986210ac5675994 (patch)
tree36ba06aacad5bf84cf7ad8209b616ee19dfa9ea3
parent6bd01ddca2d0dd95b05bbc3db21df2e9bc2855b4 (diff)
parent5602294fda888d923d57a78bafdaf48ae6223dea (diff)
downloadillumos-joyent-80c431c3af17a3f5c86dac722986210ac5675994.tar.gz
[illumos-gate merge]
commit 5602294fda888d923d57a78bafdaf48ae6223dea 7252 compressed zfs send / receive 7628 create long versions of ZFS send / receive options commit 4a6959565df1e2af817732421764a9da2f446da9 6911 nfs4: unexpected permission denied
-rw-r--r--usr/src/cmd/zfs/zfs_main.c27
-rw-r--r--usr/src/cmd/zstreamdump/zstreamdump.c30
-rw-r--r--usr/src/lib/libzfs/common/libzfs.h3
-rw-r--r--usr/src/lib/libzfs/common/libzfs_sendrecv.c51
-rw-r--r--usr/src/lib/libzfs_core/common/libzfs_core.c11
-rw-r--r--usr/src/lib/libzfs_core/common/libzfs_core.h5
-rw-r--r--usr/src/man/man1m/zfs.1m48
-rw-r--r--usr/src/pkg/manifests/system-test-zfstest.mf23
-rw-r--r--usr/src/test/zfs-tests/include/commands.cfg3
-rw-r--r--usr/src/test/zfs-tests/include/libtest.shlib45
-rw-r--r--usr/src/test/zfs-tests/include/properties.shlib94
-rw-r--r--usr/src/test/zfs-tests/runfiles/delphix.run8
-rw-r--r--usr/src/test/zfs-tests/runfiles/omnios.run8
-rw-r--r--usr/src/test/zfs-tests/runfiles/openindiana.run8
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/Makefile17
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg4
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib122
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh3
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh3
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh3
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh3
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh3
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-cD.ksh77
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh103
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_incremental.ksh100
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh73
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh54
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_props.ksh67
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh55
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh68
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_resume.ksh49
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh91
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh55
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh66
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_volume.ksh80
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh59
-rw-r--r--usr/src/test/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh198
-rw-r--r--usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio4
-rw-r--r--usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio4
-rw-r--r--usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio4
-rw-r--r--usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio4
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_client.c11
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_srv_attr.c123
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_srv_ns.c213
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c26
-rw-r--r--usr/src/uts/common/fs/nfs/nfs4_vnops.c4
-rw-r--r--usr/src/uts/common/fs/nfs/nfs_export.c27
-rw-r--r--usr/src/uts/common/fs/zfs/arc.c1030
-rw-r--r--usr/src/uts/common/fs/zfs/dbuf.c171
-rw-r--r--usr/src/uts/common/fs/zfs/dmu.c42
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_objset.c11
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_send.c231
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dataset.c8
-rw-r--r--usr/src/uts/common/fs/zfs/lz4.c2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/arc.h20
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu.h3
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_send.h9
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_dataset.h2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/refcount.h2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h25
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio.h26
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_compress.h23
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_ioctl.c35
-rw-r--r--usr/src/uts/common/fs/zfs/zio.c107
-rw-r--r--usr/src/uts/common/nfs/export.h9
65 files changed, 3090 insertions, 803 deletions
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c
index 7d9153f9fe..0132ab81bb 100644
--- a/usr/src/cmd/zfs/zfs_main.c
+++ b/usr/src/cmd/zfs/zfs_main.c
@@ -34,6 +34,7 @@
#include <assert.h>
#include <ctype.h>
#include <errno.h>
+#include <getopt.h>
#include <libgen.h>
#include <libintl.h>
#include <libuutil.h>
@@ -262,7 +263,7 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
- return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
+ return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] "
"<snapshot>\n"
"\tsend [-Le] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"
@@ -3784,8 +3785,23 @@ zfs_do_send(int argc, char **argv)
nvlist_t *dbgnv = NULL;
boolean_t extraverbose = B_FALSE;
+ struct option long_options[] = {
+ {"replicate", no_argument, NULL, 'R'},
+ {"props", no_argument, NULL, 'p'},
+ {"parsable", no_argument, NULL, 'P'},
+ {"dedup", no_argument, NULL, 'D'},
+ {"verbose", no_argument, NULL, 'v'},
+ {"dryrun", no_argument, NULL, 'n'},
+ {"large-block", no_argument, NULL, 'L'},
+ {"embed", no_argument, NULL, 'e'},
+ {"resume", required_argument, NULL, 't'},
+ {"compressed", no_argument, NULL, 'c'},
+ {0, 0, 0, 0}
+ };
+
/* check options */
- while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) {
+ while ((c = getopt_long(argc, argv, ":i:I:RbDpvnPLet:c", long_options,
+ NULL)) != -1) {
switch (c) {
case 'i':
if (fromname)
@@ -3829,12 +3845,17 @@ zfs_do_send(int argc, char **argv)
case 't':
resume_token = optarg;
break;
+ case 'c':
+ flags.compress = B_TRUE;
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
+ /*FALLTHROUGH*/
+ default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
@@ -3905,6 +3926,8 @@ zfs_do_send(int argc, char **argv)
lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags.embed_data)
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (flags.compress)
+ lzc_flags |= LZC_SEND_FLAG_COMPRESS;
if (fromname != NULL &&
(fromname[0] == '#' || fromname[0] == '@')) {
diff --git a/usr/src/cmd/zstreamdump/zstreamdump.c b/usr/src/cmd/zstreamdump/zstreamdump.c
index 3b390a4663..17adbecd79 100644
--- a/usr/src/cmd/zstreamdump/zstreamdump.c
+++ b/usr/src/cmd/zstreamdump/zstreamdump.c
@@ -25,8 +25,8 @@
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <ctype.h>
@@ -39,6 +39,7 @@
#include <sys/dmu.h>
#include <sys/zfs_ioctl.h>
+#include <sys/zio.h>
#include <zfs_fletcher.h>
/*
@@ -251,6 +252,7 @@ main(int argc, char *argv[])
(void) fprintf(stderr, "invalid option '%c'\n",
optopt);
usage();
+ break;
}
}
@@ -452,38 +454,50 @@ main(int argc, char *argv[])
drrw->drr_object = BSWAP_64(drrw->drr_object);
drrw->drr_type = BSWAP_32(drrw->drr_type);
drrw->drr_offset = BSWAP_64(drrw->drr_offset);
- drrw->drr_length = BSWAP_64(drrw->drr_length);
+ drrw->drr_logical_size =
+ BSWAP_64(drrw->drr_logical_size);
drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
drrw->drr_key.ddk_prop =
BSWAP_64(drrw->drr_key.ddk_prop);
+ drrw->drr_compressed_size =
+ BSWAP_64(drrw->drr_compressed_size);
}
+
+ uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+
/*
* If this is verbose and/or dump output,
* print info on the modified block
*/
if (verbose) {
(void) printf("WRITE object = %llu type = %u "
- "checksum type = %u\n"
- " offset = %llu length = %llu "
+ "checksum type = %u compression type = %u\n"
+ " offset = %llu logical_size = %llu "
+ "compressed_size = %llu "
+ "payload_size = %llu "
"props = %llx\n",
(u_longlong_t)drrw->drr_object,
drrw->drr_type,
drrw->drr_checksumtype,
+ drrw->drr_compressiontype,
(u_longlong_t)drrw->drr_offset,
- (u_longlong_t)drrw->drr_length,
+ (u_longlong_t)drrw->drr_logical_size,
+ (u_longlong_t)drrw->drr_compressed_size,
+ (u_longlong_t)payload_size,
(u_longlong_t)drrw->drr_key.ddk_prop);
}
+
/*
* Read the contents of the block in from STDIN to buf
*/
- (void) ssread(buf, drrw->drr_length, &zc);
+ (void) ssread(buf, payload_size, &zc);
/*
* If in dump mode
*/
if (dump) {
- print_block(buf, drrw->drr_length);
+ print_block(buf, payload_size);
}
- total_write_size += drrw->drr_length;
+ total_write_size += payload_size;
break;
case DRR_WRITE_BYREF:
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index 524809a9e7..5b823c9525 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -602,6 +602,9 @@ typedef struct sendflags {
/* WRITE_EMBEDDED records of type DATA are permitted */
boolean_t embed_data;
+
+ /* compressed WRITE records are permitted */
+ boolean_t compress;
} sendflags_t;
typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
diff --git a/usr/src/lib/libzfs/common/libzfs_sendrecv.c b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
index eab6d4bacb..b8db4c9c8e 100644
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
@@ -347,8 +347,10 @@ cksummer(void *arg)
{
struct drr_write *drrw = &drr->drr_u.drr_write;
dataref_t dataref;
+ uint64_t payload_size;
- (void) ssread(buf, drrw->drr_length, ofp);
+ payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ (void) ssread(buf, payload_size, ofp);
/*
* Use the existing checksum if it's dedup-capable,
@@ -362,7 +364,7 @@ cksummer(void *arg)
zio_cksum_t tmpsha256;
SHA256Init(&ctx);
- SHA256Update(&ctx, buf, drrw->drr_length);
+ SHA256Update(&ctx, buf, payload_size);
SHA256Final(&tmpsha256, &ctx);
drrw->drr_key.ddk_cksum.zc_word[0] =
BE_64(tmpsha256.zc_word[0]);
@@ -392,7 +394,7 @@ cksummer(void *arg)
wbr_drrr->drr_object = drrw->drr_object;
wbr_drrr->drr_offset = drrw->drr_offset;
- wbr_drrr->drr_length = drrw->drr_length;
+ wbr_drrr->drr_length = drrw->drr_logical_size;
wbr_drrr->drr_toguid = drrw->drr_toguid;
wbr_drrr->drr_refguid = dataref.ref_guid;
wbr_drrr->drr_refobject =
@@ -414,7 +416,7 @@ cksummer(void *arg)
goto out;
} else {
/* block not previously seen */
- if (dump_record(drr, buf, drrw->drr_length,
+ if (dump_record(drr, buf, payload_size,
&stream_cksum, outfd) != 0)
goto out;
}
@@ -917,7 +919,7 @@ typedef struct send_dump_data {
uint64_t prevsnap_obj;
boolean_t seenfrom, seento, replicate, doall, fromorigin;
boolean_t verbose, dryrun, parsable, progress, embed_data, std_out;
- boolean_t large_block;
+ boolean_t large_block, compress;
int outfd;
boolean_t err;
nvlist_t *fss;
@@ -933,7 +935,7 @@ typedef struct send_dump_data {
static int
estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
- boolean_t fromorigin, uint64_t *sizep)
+ boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -946,6 +948,7 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
zc.zc_fromobj = fromsnap_obj;
zc.zc_guid = 1; /* estimate flag */
+ zc.zc_flags = flags;
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) {
char errbuf[1024];
@@ -1184,6 +1187,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
progress_arg_t pa = { 0 };
pthread_t tid;
char *thissnap;
+ enum lzc_send_flags flags = 0;
int err;
boolean_t isfromsnap, istosnap, fromorigin;
boolean_t exclude = B_FALSE;
@@ -1212,6 +1216,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
if (istosnap)
sdd->seento = B_TRUE;
+ if (sdd->large_block)
+ flags |= LZC_SEND_FLAG_LARGE_BLOCK;
+ if (sdd->embed_data)
+ flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (sdd->compress)
+ flags |= LZC_SEND_FLAG_COMPRESS;
+
if (!sdd->doall && !isfromsnap && !istosnap) {
if (sdd->replicate) {
char *snapname;
@@ -1258,7 +1269,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
if (sdd->verbose) {
uint64_t size = 0;
(void) estimate_ioctl(zhp, sdd->prevsnap_obj,
- fromorigin, &size);
+ fromorigin, flags, &size);
send_print_verbose(fout, zhp->zfs_name,
sdd->prevsnap[0] ? sdd->prevsnap : NULL,
@@ -1283,12 +1294,6 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
}
}
- enum lzc_send_flags flags = 0;
- if (sdd->large_block)
- flags |= LZC_SEND_FLAG_LARGE_BLOCK;
- if (sdd->embed_data)
- flags |= LZC_SEND_FLAG_EMBED_DATA;
-
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
fromorigin, sdd->outfd, flags, sdd->debugnv);
@@ -1594,8 +1599,12 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
fromguid = 0;
(void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);
+ if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok"))
+ lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags->embed_data || nvlist_exists(resume_nvl, "embedok"))
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (flags->compress || nvlist_exists(resume_nvl, "compressok"))
+ lzc_flags |= LZC_SEND_FLAG_COMPRESS;
if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) {
if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) {
@@ -1628,7 +1637,8 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
if (flags->verbose) {
uint64_t size = 0;
- error = lzc_send_space(zhp->zfs_name, fromname, &size);
+ error = lzc_send_space(zhp->zfs_name, fromname,
+ lzc_flags, &size);
if (error == 0)
size = MAX(0, (int64_t)(size - bytes));
send_print_verbose(stderr, zhp->zfs_name, fromname,
@@ -1856,6 +1866,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
sdd.dryrun = flags->dryrun;
sdd.large_block = flags->largeblock;
sdd.embed_data = flags->embed_data;
+ sdd.compress = flags->compress;
sdd.filter_cb = filter_func;
sdd.filter_cb_arg = cb_arg;
if (debugnvp)
@@ -2921,11 +2932,17 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
case DRR_WRITE:
if (byteswap) {
- drr->drr_u.drr_write.drr_length =
- BSWAP_64(drr->drr_u.drr_write.drr_length);
+ drr->drr_u.drr_write.drr_logical_size =
+ BSWAP_64(
+ drr->drr_u.drr_write.drr_logical_size);
+ drr->drr_u.drr_write.drr_compressed_size =
+ BSWAP_64(
+ drr->drr_u.drr_write.drr_compressed_size);
}
+ uint64_t payload_size =
+ DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write);
(void) recv_read(hdl, fd, buf,
- drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
+ payload_size, B_FALSE, NULL);
break;
case DRR_SPILL:
if (byteswap) {
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.c b/usr/src/lib/libzfs_core/common/libzfs_core.c
index cc5e2a781b..7e7891798d 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.c
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.c
@@ -487,6 +487,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd,
fnvlist_add_boolean(args, "largeblockok");
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
+ if (flags & LZC_SEND_FLAG_COMPRESS)
+ fnvlist_add_boolean(args, "compressok");
if (resumeobj != 0 || resumeoff != 0) {
fnvlist_add_uint64(args, "resume_object", resumeobj);
fnvlist_add_uint64(args, "resume_offset", resumeoff);
@@ -512,7 +514,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd,
* an equivalent snapshot.
*/
int
-lzc_send_space(const char *snapname, const char *from, uint64_t *spacep)
+lzc_send_space(const char *snapname, const char *from,
+ enum lzc_send_flags flags, uint64_t *spacep)
{
nvlist_t *args;
nvlist_t *result;
@@ -521,6 +524,12 @@ lzc_send_space(const char *snapname, const char *from, uint64_t *spacep)
args = fnvlist_alloc();
if (from != NULL)
fnvlist_add_string(args, "from", from);
+ if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+ fnvlist_add_boolean(args, "largeblockok");
+ if (flags & LZC_SEND_FLAG_EMBED_DATA)
+ fnvlist_add_boolean(args, "embedok");
+ if (flags & LZC_SEND_FLAG_COMPRESS)
+ fnvlist_add_boolean(args, "compressok");
err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
nvlist_free(args);
if (err == 0)
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.h b/usr/src/lib/libzfs_core/common/libzfs_core.h
index 6b4575ddeb..094fa257e4 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.h
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.h
@@ -62,13 +62,14 @@ int lzc_get_holds(const char *, nvlist_t **);
enum lzc_send_flags {
LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
- LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
+ LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1,
+ LZC_SEND_FLAG_COMPRESS = 1 << 2
};
int lzc_send(const char *, const char *, int, enum lzc_send_flags);
int lzc_send_resume(const char *, const char *, int,
enum lzc_send_flags, uint64_t, uint64_t);
-int lzc_send_space(const char *, const char *, uint64_t *);
+int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *);
struct dmu_replay_record;
diff --git a/usr/src/man/man1m/zfs.1m b/usr/src/man/man1m/zfs.1m
index 8c78343cc4..ee49174cb4 100644
--- a/usr/src/man/man1m/zfs.1m
+++ b/usr/src/man/man1m/zfs.1m
@@ -165,12 +165,12 @@
.Ar snapshot bookmark
.Nm
.Cm send
-.Op Fl DLPRenpv
+.Op Fl DLPRcenpv
.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot
.Ar snapshot
.Nm
.Cm send
-.Op Fl Le
+.Op Fl Lce
.Op Fl i Ar snapshot Ns | Ns Ar bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Nm
@@ -2451,7 +2451,7 @@ feature.
.It Xo
.Nm
.Cm send
-.Op Fl DLPRenpv
+.Op Fl DLPRcenpv
.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot
.Ar snapshot
.Xc
@@ -2464,7 +2464,7 @@ to a different system
.Pc .
By default, a full stream is generated.
.Bl -tag -width "-D"
-.It Fl D
+.It Fl D, -dedup
Generate a deduplicated stream. Blocks which would have been sent multiple times
in the send stream will only be sent once. The receiving system must also
support this feature to receive a deduplicated stream. This flag can be used
@@ -2484,7 +2484,7 @@ is similar to
The incremental source may be specified as with the
.Fl i
option.
-.It Fl L
+.It Fl L, -large-block
Generate a stream which may contain blocks larger than 128KB. This flag has no
effect if the
.Sy large_blocks
@@ -2498,9 +2498,9 @@ pool feature enabled as well. See
for details on ZFS feature flags and the
.Sy large_blocks
feature.
-.It Fl P
+.It Fl P, -parsable
Print machine-parsable verbose information about the stream package generated.
-.It Fl R
+.It Fl R, -replicate
Generate a replication stream package, which will replicate the specified
file system, and all descendent file systems, up to the named snapshot. When
received, all properties, snapshots, descendent file systems, and clones are
@@ -2518,7 +2518,7 @@ is received. If the
.Fl F
flag is specified when this stream is received, snapshots and file systems that
do not exist on the sending side are destroyed.
-.It Fl e
+.It Fl e, -embed
Generate a more compact stream by using
.Sy WRITE_EMBEDDED
records for blocks which are stored more compactly on disk by the
@@ -2535,6 +2535,16 @@ that feature enabled as well. See
for details on ZFS feature flags and the
.Sy embedded_data
feature.
+.It Fl c, -compressed
+Generate a more compact stream by using compressed WRITE records for blocks
+which are compressed on disk and in memory (see the
+.Sy compression No property for details). If the Sy lz4_compress No feature
+is active on the sending system, then the receiving system must have that
+feature enabled as well. If the
+.Sy large_blocks No feature is enabled on the sending system but the Fl L
+option is not supplied in conjunction with
+.Fl c, No then the data will be decompressed before sending so it can be split
+into smaller block sizes.
.It Fl i Ar snapshot
Generate an incremental stream from the first
.Ar snapshot
@@ -2557,7 +2567,7 @@ be fully specified
not just
.Em @origin
.Pc .
-.It Fl n
+.It Fl n, -dryrun
Do a dry-run
.Pq Qq No-op
send. Do not generate any actual send data. This is useful in conjunction with
@@ -2570,11 +2580,11 @@ be written to standard output
.Po contrast with a non-dry-run, where the stream is written to standard output
and the verbose output goes to standard error
.Pc .
-.It Fl p
+.It Fl p, -props
Include the dataset's properties in the stream. This flag is implicit when
.Fl R
is specified. The receiving system must also support this feature.
-.It Fl v
+.It Fl v, -verbose
Print verbose information about the stream package generated. This information
includes a per-second report of how much data has been sent.
.Pp
@@ -2584,7 +2594,7 @@ on future versions of ZFS .
.It Xo
.Nm
.Cm send
-.Op Fl Le
+.Op Fl Lce
.Op Fl i Ar snapshot Ns | Ns Ar bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Xc
@@ -2594,7 +2604,7 @@ read-only, or the filesystem must not be mounted. When the stream generated from
a filesystem or volume is received, the default snapshot name will be
.Qq --head-- .
.Bl -tag -width "-L"
-.It Fl L
+.It Fl L, -large-block
Generate a stream which may contain blocks larger than 128KB. This flag has no
effect if the
.Sy large_blocks
@@ -2608,7 +2618,17 @@ pool feature enabled as well. See
for details on ZFS feature flags and the
.Sy large_blocks
feature.
-.It Fl e
+.It Fl c, -compressed
+Generate a more compact stream by using compressed WRITE records for blocks
+which are compressed on disk and in memory (see the
+.Sy compression No property for details). If the Sy lz4_compress No feature is
+active on the sending system, then the receiving system must have that feature
+enabled as well. If the
+.Sy large_blocks No feature is enabled on the sending system but the Fl L
+option is not supplied in conjunction with
+.Fl c, No then the data will be decompressed before sending so it can be split
+into smaller block sizes.
+.It Fl e, -embed
Generate a more compact stream by using
.Sy WRITE_EMBEDDED
records for blocks which are stored more compactly on disk by the
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf
index 1ce41fa420..6a0ad9b813 100644
--- a/usr/src/pkg/manifests/system-test-zfstest.mf
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf
@@ -2058,6 +2058,27 @@ file path=opt/zfs-tests/tests/functional/rsend/rsend_020_pos mode=0555
file path=opt/zfs-tests/tests/functional/rsend/rsend_021_pos mode=0555
file path=opt/zfs-tests/tests/functional/rsend/rsend_022_pos mode=0555
file path=opt/zfs-tests/tests/functional/rsend/rsend_024_pos mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-cD mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_embedded_blocks \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_incremental mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_lz4_disabled mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_mixed_compression \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_props mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_recv_dedup mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_resume mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_verify_contents \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_verify_ratio mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_volume mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_zstreamdump mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize \
+ mode=0555
file path=opt/zfs-tests/tests/functional/rsend/setup mode=0555
file path=opt/zfs-tests/tests/functional/scrub_mirror/cleanup mode=0555
file path=opt/zfs-tests/tests/functional/scrub_mirror/default.cfg mode=0555
@@ -2280,7 +2301,7 @@ file path=opt/zfs-tests/tests/perf/scripts/io.d mode=0555
file path=opt/zfs-tests/tests/perf/scripts/prefetch_io.d mode=0555
license cr_Sun license=cr_Sun
license lic_CDDL license=lic_CDDL
-#depend fmri=benchmark/fio type=require
depend fmri=system/file-system/zfs/tests type=require
+depend fmri=system/test/fio type=require
depend fmri=system/test/testrunner type=require
depend fmri=system/xopen/xcu4 type=require
diff --git a/usr/src/test/zfs-tests/include/commands.cfg b/usr/src/test/zfs-tests/include/commands.cfg
index bf60cd9565..a83c22c8c4 100644
--- a/usr/src/test/zfs-tests/include/commands.cfg
+++ b/usr/src/test/zfs-tests/include/commands.cfg
@@ -156,7 +156,8 @@ export USR_SBIN_FILES='arp
zhack
zinject
zoneadm
- zonecfg'
+ zonecfg
+ zstreamdump'
export SBIN_FILES='fdisk
mount
diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib
index 2957cf0808..d40c1aa39e 100644
--- a/usr/src/test/zfs-tests/include/libtest.shlib
+++ b/usr/src/test/zfs-tests/include/libtest.shlib
@@ -2527,3 +2527,48 @@ function get_min
echo $min
}
+
+#
+# Generate a random number between 1 and the argument.
+#
+function random
+{
+ typeset max=$1
+ echo $(( ($RANDOM % $max) + 1 ))
+}
+
+# Write data that can be compressed into a directory
+function write_compressible
+{
+ typeset dir=$1
+ typeset megs=$2
+ typeset nfiles=${3:-1}
+ typeset bs=${4:-1024k}
+ typeset fname=${5:-file}
+
+ [[ -d $dir ]] || log_fail "No directory: $dir"
+
+ log_must eval "fio \
+ --name=job \
+ --fallocate=0 \
+ --minimal \
+ --randrepeat=0 \
+ --buffer_compress_percentage=66 \
+ --buffer_compress_chunk=4096 \
+ --directory=$dir \
+ --numjobs=$nfiles \
+ --rw=write \
+ --bs=$bs \
+ --filesize=$megs \
+ --filename_format='$fname.\$jobnum' >/dev/null"
+}
+
+function get_objnum
+{
+ typeset pathname=$1
+ typeset objnum
+
+ [[ -e $pathname ]] || log_fail "No such file or directory: $pathname"
+ objnum=$(stat -c %i $pathname)
+ echo $objnum
+}
diff --git a/usr/src/test/zfs-tests/include/properties.shlib b/usr/src/test/zfs-tests/include/properties.shlib
index 2897e90c25..b1c1b0be44 100644
--- a/usr/src/test/zfs-tests/include/properties.shlib
+++ b/usr/src/test/zfs-tests/include/properties.shlib
@@ -13,10 +13,29 @@
# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
#
-typeset -a compress_props=('on' 'off' 'lzjb' 'gzip' 'gzip-1' 'gzip-2' 'gzip-3'
- 'gzip-4' 'gzip-5' 'gzip-6' 'gzip-7' 'gzip-8' 'gzip-9' 'zle')
+typeset -a compress_prop_vals=('on' 'off' 'lzjb' 'gzip' 'gzip-1' 'gzip-2'
+ 'gzip-3' 'gzip-4' 'gzip-5' 'gzip-6' 'gzip-7' 'gzip-8' 'gzip-9' 'zle' 'lz4')
+typeset -a checksum_prop_vals=('on' 'off' 'fletcher2' 'fletcher4' 'sha256'
+ 'noparity' 'sha512' 'skein' 'edonr')
+typeset -a recsize_prop_vals=('512' '1024' '2048' '4096' '8192' '16384'
+ '32768' '65536' '131072' '262144' '524288' '1048576')
+typeset -a aclinherit_prop_vals=('discard' 'noallow' 'restricted' 'passthrough'
+ 'passthrough-x')
+typeset -a aclmode_prop_vals=('discard' 'groupmask' 'passthrough' 'restricted')
+typeset -a canmount_prop_vals=('on' 'off' 'noauto')
+typeset -a copies_prop_vals=('1' '2' '3')
+typeset -a logbias_prop_vals=('latency' 'throughput')
+typeset -a primarycache_prop_vals=('all' 'none' 'metadata')
+typeset -a redundant_metadata_prop_vals=('all' 'most')
+typeset -a secondarycache_prop_vals=('all' 'none' 'metadata')
+typeset -a snapdir_prop_vals=('hidden' 'visible')
+typeset -a sync_prop_vals=('standard' 'always' 'disabled')
-typeset -a checksum_props=('on' 'off' 'fletcher2' 'fletcher4' 'sha256')
+typeset -a fs_props=('compress' 'checksum' 'recsize' 'aclinherit' 'aclmode'
+ 'canmount' 'copies' 'logbias' 'primarycache' 'redundant_metadata'
+ 'secondarycache' 'snapdir' 'sync')
+typeset -a vol_props=('compress' 'checksum' 'copies' 'logbias' 'primarycache'
+ 'secondarycache' 'redundant_metadata' 'sync')
#
# Given the property array passed in, return 'num_props' elements to the
@@ -44,20 +63,81 @@ function get_rand_prop
function get_rand_compress
{
- get_rand_prop compress_props $1 2
+ get_rand_prop compress_prop_vals $1 2
}
function get_rand_compress_any
{
- get_rand_prop compress_props $1 0
+ get_rand_prop compress_prop_vals $1 0
}
function get_rand_checksum
{
- get_rand_prop checksum_props $1 2
+ get_rand_prop checksum_prop_vals $1 2
}
function get_rand_checksum_any
{
- get_rand_prop checksum_props $1 0
+ get_rand_prop checksum_prop_vals $1 0
+}
+
+function get_rand_recsize
+{
+ get_rand_prop recsize_prop_vals $1 0
+}
+
+function get_rand_large_recsize
+{
+ get_rand_prop recsize_prop_vals $1 9
+}
+
+#
+# Functions to toggle on/off properties
+#
+typeset -a binary_props=('atime' 'devices' 'exec' 'nbmand' 'readonly' 'setuid'
+ 'xattr' 'zoned')
+
+function toggle_prop
+{
+ typeset ds=$1
+ typeset prop=$2
+
+ datasetexists $ds || log_fail "$ds does not exist"
+ typeset val=$(get_prop $prop $ds)
+ typeset newval='off'
+
+ [[ $val = $newval ]] && newval='on'
+ log_must zfs set $prop=$newval $ds
+}
+
+function toggle_binary_props
+{
+ typeset ds=$1
+ typeset prop
+
+ for prop in "${binary_props[@]}"; do
+ toggle_prop $ds $prop
+ done
+}
+
+function randomize_ds_props
+{
+ typeset ds=$1
+ typeset prop proplist val
+
+ datasetexists $ds || log_fail "$ds does not exist"
+ if ds_is_volume $ds; then
+ toggle_prop $ds readonly
+ proplist="${vol_props[@]}"
+ elif ds_is_filesystem $ds; then
+ toggle_binary_props $ds
+ proplist="${fs_props[@]}"
+ else
+ log_fail "$ds is neither a volume nor a file system"
+ fi
+
+ for prop in $proplist; do
+ typeset val=$(get_rand_prop "${prop}_prop_vals" 1 0)
+ log_must zfs set $prop=$val $ds
+ done
}
diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run
index 7be16fe46e..7f6afe6451 100644
--- a/usr/src/test/zfs-tests/runfiles/delphix.run
+++ b/usr/src/test/zfs-tests/runfiles/delphix.run
@@ -479,7 +479,13 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
'rsend_013_pos', 'rsend_014_pos',
'rsend_019_pos', 'rsend_020_pos',
- 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos']
+ 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos',
+ 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props',
+ 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump',
+ 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
+ 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
+ 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
+ 'send-c_recv_dedup']
[/opt/zfs-tests/tests/functional/scrub_mirror]
tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run
index a1cef540c5..f66317cd6d 100644
--- a/usr/src/test/zfs-tests/runfiles/omnios.run
+++ b/usr/src/test/zfs-tests/runfiles/omnios.run
@@ -475,7 +475,13 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
'rsend_013_pos', 'rsend_014_pos',
'rsend_019_pos', 'rsend_020_pos',
- 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos']
+ 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos',
+ 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props',
+ 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump',
+ 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
+ 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
+ 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
+ 'send-c_recv_dedup']
[/opt/zfs-tests/tests/functional/scrub_mirror]
tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run
index f8f6af23a7..7293eb949c 100644
--- a/usr/src/test/zfs-tests/runfiles/openindiana.run
+++ b/usr/src/test/zfs-tests/runfiles/openindiana.run
@@ -475,7 +475,13 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
'rsend_013_pos', 'rsend_014_pos',
'rsend_019_pos', 'rsend_020_pos',
- 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos']
+ 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos',
+ 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props',
+ 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump',
+ 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
+ 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
+ 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
+ 'send-c_recv_dedup']
[/opt/zfs-tests/tests/functional/scrub_mirror]
tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/Makefile b/usr/src/test/zfs-tests/tests/functional/rsend/Makefile
index c482d9d607..918cfcc56e 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/Makefile
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/Makefile
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
#
include $(SRC)/Makefile.master
@@ -38,6 +38,21 @@ PROGS = cleanup \
rsend_021_pos \
rsend_022_pos \
rsend_024_pos \
+ send-cD \
+ send-c_embedded_blocks \
+ send-c_incremental \
+ send-c_lz4_disabled \
+ send-c_mixed_compression \
+ send-c_props \
+ send-c_recv_dedup \
+ send-c_recv_lz4_disabled \
+ send-c_resume \
+ send-c_stream_size_estimate \
+ send-c_verify_contents \
+ send-c_verify_ratio \
+ send-c_volume \
+ send-c_zstreamdump \
+ send-cpL_varied_recsize \
setup
FILES = rsend.cfg \
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg b/usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg
index 2c1654e089..8400ecfe35 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg
@@ -34,6 +34,6 @@ export DISK2=$(echo $DISKS | awk '{print $2}')
export DISK3=$(echo $DISKS | awk '{print $3}')
export POOL=$TESTPOOL
-export POOL2=$TESTPOOL1
-export POOL3=$TESTPOOL2
+export POOL2=$TESTPOOL2
+export POOL3=$TESTPOOL3
export FS=$TESTFS
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib b/usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib
index da5b7cb3a4..a82d3b3d59 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib
@@ -29,6 +29,7 @@
#
. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/math.shlib
. $STF_SUITE/tests/functional/rsend/rsend.cfg
#
@@ -514,8 +515,8 @@ function test_fs_setup
typeset sendpool=${sendfs%%/*}
typeset recvpool=${recvfs%%/*}
- datasetexists $sendfs && log_must $ZFS destroy -r $sendpool
- datasetexists $recvfs && log_must $ZFS destroy -r $recvpool
+ datasetexists $sendfs && log_must zfs destroy -r $sendpool
+ datasetexists $recvfs && log_must zfs destroy -r $recvpool
if $(datasetexists $sendfs || zfs create -o compress=lz4 $sendfs); then
mk_files 1000 256 0 $sendfs &
@@ -549,3 +550,120 @@ function test_fs_setup
fi
log_must zfs create -o compress=lz4 $sendpool/stream
}
+
+#
+# Check to see if the specified features are set in a send stream.
+# The values for these features are found in uts/common/fs/zfs/sys/zfs_ioctl.h
+#
+# $1 The stream file
+# $2-$n The flags expected in the stream
+#
+function stream_has_features
+{
+ typeset file=$1
+ shift
+
+ [[ -f $file ]] || log_fail "Couldn't find file: $file"
+ typeset flags=$(cat $file | zstreamdump | awk '/features =/ {print $3}')
+ typeset -A feature
+ feature[dedup]="1"
+ feature[dedupprops]="2"
+ feature[sa_spill]="4"
+ feature[embed_data]="10000"
+ feature[lz4]="20000"
+ feature[mooch_byteswap]="40000"
+ feature[large_blocks]="80000"
+ feature[resuming]="100000"
+ feature[redacted]="200000"
+ feature[compressed]="400000"
+
+ typeset flag known derived=0
+ for flag in "$@"; do
+ known=${feature[$flag]}
+ [[ -z $known ]] && log_fail "Unknown feature: $flag"
+
+ derived=$(echo "$flags & ${feature[$flag]} = X" | mdb | sed 's/ //g')
+ [[ $derived = $known ]] || return 1
+ done
+
+ return 0
+}
+
+#
+# Parse zstreamdump -v output. The output varies for each kind of record:
+# BEGIN records are simply output as "BEGIN"
+# END records are output as "END"
+# OBJECT records become "OBJECT <object num>"
+# FREEOBJECTS records become "FREEOBJECTS <startobj> <numobjs>"
+# FREE records become "<record type> <start> <length>"
+# WRITE records become:
+# "<record type> <compression type> <start> <logical size> <compressed size>
+# <data size>"
+#
+function parse_dump
+{
+ sed '/^WRITE/{N;s/\n/ /;}' | grep "^[A-Z]" | awk '{
+ if ($1 == "BEGIN" || $1 == "END") print $1
+ if ($1 == "OBJECT") print $1" "$4
+ if ($1 == "FREEOBJECTS") print $1" "$4" "$7
+ if ($1 == "FREE") print $1" "$7" "$10
+ if ($1 == "WRITE") print $1" "$15" "$18" "$21" "$24" "$27}'
+}
+
+#
+# Given a send stream, verify that the size of the stream matches what's
+# expected based on the source or target dataset. If the stream is an
+# incremental stream, subtract the size of the source snapshot before
+# comparing. This function does not currently handle incremental streams
+# that remove data.
+#
+# $1 The zstreamdump output file
+# $2 The dataset to compare against
+# This can be a source of a send or recv target (fs, not snapshot)
+# $3 The percentage below which verification is deemed a failure
+# $4 The source snapshot of an incremental send
+#
+
+function verify_stream_size
+{
+ typeset stream=$1
+ typeset ds=$2
+ typeset percent=${3:-90}
+ typeset inc_src=$4
+
+ [[ -f $stream ]] || log_fail "No such file: $stream"
+ datasetexists $ds || log_fail "No such dataset: $ds"
+
+ typeset stream_size=$(cat $stream | zstreamdump | sed -n \
+ 's/ Total write size = \(.*\) (0x.*)/\1/p')
+
+ typeset inc_size=0
+ if [[ -n $inc_src ]]; then
+ inc_size=$(get_prop lrefer $inc_src)
+ if stream_has_features $stream compressed; then
+ inc_size=$(get_prop refer $inc_src)
+ fi
+ fi
+
+ if stream_has_features $stream compressed; then
+ ds_size=$(get_prop refer $ds)
+ else
+ ds_size=$(get_prop lrefer $ds)
+ fi
+ ds_size=$((ds_size - inc_size))
+
+ within_percent $stream_size $ds_size $percent || log_fail \
+ "$stream_size $ds_size differed by too much"
+}
+
+# Cleanup function for tests involving resumable send
+function resume_cleanup
+{
+ typeset sendfs=$1
+ typeset streamfs=$2
+
+ datasetexists $sendfs && log_must zfs destroy -r $sendfs
+ datasetexists $streamfs && log_must zfs destroy -r $streamfs
+ cleanup_pool $POOL2
+ rm -f /$POOL/initial.zsend /$POOL/incremental.zsend
+}
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh
index d6a5fa2b75..79c9bb6d9b 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh
@@ -37,8 +37,9 @@ verify_runnable "both"
log_assert "Verify resumability of a full and incremental ZFS send/receive " \
"in the presence of a corrupted stream"
-log_onexit cleanup_pools $POOL2 $POOL3
+sendfs=$POOL/sendfs
 recvfs=$POOL3/recvfs
 streamfs=$POOL2/stream
+log_onexit resume_cleanup $sendfs $streamfs
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh
index 1dcbdace8e..97c19f505a 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh
@@ -35,12 +35,13 @@ verify_runnable "both"
log_assert "Verify resumability of full ZFS send/receive with the -D " \
"(dedup) flag"
-log_onexit cleanup_pool $POOL2
sendfs=$POOL/sendfs
recvfs=$POOL2/recvfs
streamfs=$POOL/stream
+log_onexit resume_cleanup $sendfs $streamfs
+
test_fs_setup $sendfs $recvfs
resume_test "zfs send -D -v $sendfs@a" $streamfs $recvfs
file_check $sendfs $recvfs
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh
index 8fb0abb7a5..2d2a3304da 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh
@@ -37,12 +37,13 @@ verify_runnable "both"
log_assert "Verify resumability of a full and incremental ZFS send/receive " \
"with the -e (embedded) flag"
-log_onexit cleanup_pool $POOL2
sendfs=$POOL/sendfs
recvfs=$POOL2/recvfs
streamfs=$POOL/stream
+log_onexit resume_cleanup $sendfs $streamfs
+
test_fs_setup $sendfs $recvfs
resume_test "zfs send -v -e $sendfs@a" $streamfs $recvfs
resume_test "zfs send -v -e -i @a $sendfs@b" $streamfs $recvfs
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh
index 3fdb049422..9592cb9a79 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh
@@ -40,12 +40,13 @@ verify_runnable "both"
log_assert "Verify resumability of an incremental ZFS send/receive with ZFS " \
"bookmarks"
-log_onexit cleanup_pool $POOL2
sendfs=$POOL/sendfs
recvfs=$POOL2/recvfs
streamfs=$POOL/stream
+log_onexit resume_cleanup $sendfs $streamfs
+
test_fs_setup $sendfs $recvfs
log_must zfs bookmark $sendfs@a $sendfs#bm_a
log_must zfs destroy $sendfs@a
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh
index 62fba64589..d5d938e4b7 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh
@@ -37,12 +37,13 @@ verify_runnable "both"
log_assert "Verify resumability of a full ZFS send/receive with the source " \
"filesystem unmounted"
-log_onexit cleanup_pool $POOL2
sendfs=$POOL/sendfs
recvfs=$POOL2/recvfs
streamfs=$POOL/stream
+log_onexit resume_cleanup $sendfs $streamfs
+
test_fs_setup $sendfs $recvfs
log_must zfs unmount $sendfs
resume_test "zfs send $sendfs" $streamfs $recvfs
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-cD.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-cD.ksh
new file mode 100644
index 0000000000..25dc46b3c3
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-cD.ksh
@@ -0,0 +1,77 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that the -c and -D flags do not interfere with each other.
+#
+# Strategy:
+# 1. Write unique data to a filesystem and create a compressed, deduplicated
+# full stream.
+# 2. Verify that the stream and send dataset show the same size
+# 3. Make several copies of the original data, and create both full and
+# incremental compressed, deduplicated send streams
+# 4. Verify the full stream is no bigger than the stream from step 1
+# 5. Verify the streams can be received correctly.
+#
+
+verify_runnable "both"
+
+log_assert "Verify that the -c and -D flags do not interfere with each other"
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/sendfs
+typeset recvfs=$POOL2/recvfs
+typeset stream0=$BACKDIR/stream.0
+typeset stream1=$BACKDIR/stream.1
+typeset inc=$BACKDIR/stream.inc
+
+log_must zfs create -o compress=lz4 $sendfs
+log_must zfs create -o compress=lz4 $recvfs
+typeset dir=$(get_prop mountpoint $sendfs)
+# Don't use write_compressible: we want compressible but undedupable data here.
+log_must cp /kernel/genunix $dir/file
+log_must zfs snapshot $sendfs@snap0
+log_must eval "zfs send -D -c $sendfs@snap0 >$stream0"
+
+# The stream size should match at this point because the data is all unique
+verify_stream_size $stream0 $sendfs
+
+for i in {0..3}; do
+ log_must cp $dir/file $dir/file.$i
+done
+log_must zfs snapshot $sendfs@snap1
+
+# The stream sizes should match, since the second stream contains no new blocks
+log_must eval "zfs send -D -c $sendfs@snap1 >$stream1"
+typeset size0=$(stat -c %s $stream0)
+typeset size1=$(stat -c %s $stream1)
+within_percent $size0 $size1 90 || log_fail "$size0 and $size1"
+
+# Finally, make sure the receive works correctly.
+log_must eval "zfs send -D -c -i snap0 $sendfs@snap1 >$inc"
+log_must eval "zfs recv -d $recvfs <$stream0"
+log_must eval "zfs recv -d $recvfs <$inc"
+cmp_ds_cont $sendfs $recvfs
+
+# The size of the incremental should be the same as the initial send.
+typeset size2=$(stat -c %s $inc)
+within_percent $size0 $size2 90 || log_fail "$size0 and $size2"
+
+log_pass "The -c and -D flags do not interfere with each other"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh
new file mode 100644
index 0000000000..1913c71190
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh
@@ -0,0 +1,103 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/properties.shlib
+
+#
+# Description:
+# Verify that compressed streams can contain embedded blocks.
+#
+# Strategy:
+# 1. Create a filesystem with compressible data and embedded blocks.
+# 2. Verify the created streams can be received correctly.
+# 3. Verify the presence / absence of embedded blocks in the compressed stream,
+# as well as the receiving file system.
+#
+
+verify_runnable "both"
+
+log_assert "Verify that compressed streams can contain embedded blocks."
+log_onexit cleanup_pool $POOL2
+
+typeset objs obj recsize
+typeset sendfs=$POOL2/sendfs
+typeset recvfs=$POOL2/recvfs
+typeset stream=$BACKDIR/stream
+typeset dump=$BACKDIR/dump
+typeset recvfs2=$POOL2/recvfs2
+typeset stream2=$BACKDIR/stream2
+typeset dump2=$BACKDIR/dump2
+log_must zfs create -o compress=lz4 $sendfs
+log_must zfs create -o compress=lz4 $recvfs
+log_must zfs create -o compress=lz4 $recvfs2
+typeset dir=$(get_prop mountpoint $sendfs)
+
+# Populate the send dataset with compressible data and embedded block files.
+write_compressible $dir 16m
+for recsize in "${recsize_prop_vals[@]}"; do
+ # For lz4, this method works for blocks up to 16k, but not larger
+ [[ $recsize -eq $((32 * 1024)) ]] && break
+
+ log_must mkholes -h 0:$((recsize - 8)) -d $((recsize - 8)):8 \
+ $dir/$recsize
+done
+
+# Generate the streams and zstreamdump output.
+log_must zfs snapshot $sendfs@now
+log_must eval "zfs send -c $sendfs@now >$stream"
+log_must eval "zstreamdump -v <$stream >$dump"
+log_must eval "zfs recv -d $recvfs <$stream"
+cmp_ds_cont $sendfs $recvfs
+verify_stream_size $stream $sendfs
+log_mustnot stream_has_features $stream embed_data
+
+log_must eval "zfs send -c -e $sendfs@now >$stream2"
+log_must eval "zstreamdump -v <$stream2 >$dump2"
+log_must eval "zfs recv -d $recvfs2 <$stream2"
+cmp_ds_cont $sendfs $recvfs2
+verify_stream_size $stream2 $sendfs
+log_must stream_has_features $stream2 embed_data
+
+# Verify embedded blocks are present only when expected.
+for recsize in "${recsize_prop_vals[@]}"; do
+ [[ $recsize -eq $((32 * 1024)) ]] && break
+
+ typeset send_obj=$(get_objnum $(get_prop mountpoint $sendfs)/$recsize)
+ typeset recv_obj=$(get_objnum \
+ $(get_prop mountpoint $recvfs/sendfs)/$recsize)
+ typeset recv2_obj=$(get_objnum \
+ $(get_prop mountpoint $recvfs2/sendfs)/$recsize)
+
+ log_must eval "zdb -ddddd $sendfs $send_obj >$BACKDIR/sendfs.zdb"
+ log_must eval "zdb -ddddd $recvfs/sendfs $recv_obj >$BACKDIR/recvfs.zdb"
+ log_must eval "zdb -ddddd $recvfs2/sendfs $recv2_obj >$BACKDIR/recvfs2.zdb"
+
+ grep -q "EMBEDDED" $BACKDIR/sendfs.zdb || \
+ log_fail "Obj $send_obj not embedded in $sendfs"
+ grep -q "EMBEDDED" $BACKDIR/recvfs.zdb || \
+ log_fail "Obj $recv_obj not embedded in $recvfs"
+ grep -q "EMBEDDED" $BACKDIR/recvfs2.zdb || \
+ log_fail "Obj $recv2_obj not embedded in $recvfs2"
+
+ grep -q "WRITE_EMBEDDED object = $send_obj offset = 0" $dump && \
+ log_fail "Obj $send_obj embedded in zstreamdump output"
+ grep -q "WRITE_EMBEDDED object = $send_obj offset = 0" $dump2 || \
+ log_fail "Obj $send_obj not embedded in zstreamdump output"
+done
+
+log_pass "Compressed streams can contain embedded blocks."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_incremental.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_incremental.ksh
new file mode 100644
index 0000000000..719970d995
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_incremental.ksh
@@ -0,0 +1,100 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that compressed send works correctly with incremental sends.
+#
+# Strategy:
+# 1. Randomly choose either a -i or -I incremental.
+# 2. Generate compressed incremental replication streams for a pool, a
+# descendant dataset, and a volume.
+# 3. Receive these streams verifying both the contents, and intermediate
+# snapshots are present or absent as appropriate to the -i or -I option.
+#
+
+verify_runnable "both"
+
+log_assert "Verify compressed send works with incremental send streams."
+log_onexit cleanup_pool $POOL2
+
+typeset opt=$(random_get "-i" "-I")
+typeset final dstlist list vol
+
+log_must eval "zfs send -R $POOL@final > $BACKDIR/final"
+log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/final"
+
+function do_checks
+{
+ log_must cmp_ds_cont $POOL $POOL2
+ [[ $opt = "-I" ]] && log_must cmp_ds_subs $POOL $POOL2
+ [[ $opt = "-i" ]] && log_mustnot cmp_ds_subs $POOL $POOL2
+
+ [[ $1 != "clean" ]] && return
+
+ cleanup_pool $POOL2
+ log_must eval "zfs send -R $POOL@final > $BACKDIR/final"
+ log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/final"
+}
+
+if is_global_zone; then
+ # Send from the pool root
+ final=$(getds_with_suffix $POOL2 @final)
+ list="$final $(getds_with_suffix $POOL2 @snapA)"
+ list="$list $(getds_with_suffix $POOL2 @snapB)"
+ list="$list $(getds_with_suffix $POOL2 @snapC)"
+
+ log_must eval "zfs send -c -R $opt @init $POOL2@final >$BACKDIR/pool"
+ log_must destroy_tree $list
+ log_must eval "zfs recv -d -F $POOL2 <$BACKDIR/pool"
+
+ dstlist=$(getds_with_suffix $POOL2 @final)
+ [[ $final != $dstlist ]] && log_fail "$final != $dstlist"
+
+ do_checks clean
+
+ # Send of a volume
+ vol=$POOL2/$FS/vol
+ final=$(getds_with_suffix $vol @final)
+ log_must eval "zfs send -c -R $opt @init $vol@final >$BACKDIR/vol"
+ log_must destroy_tree $vol@snapB $vol@snapC $vol@final
+ log_must eval "zfs recv -d -F $POOL2 <$BACKDIR/vol"
+
+ dstlist=$(getds_with_suffix $POOL2/$FS/vol @final)
+ [[ $final != $dstlist ]] && log_fail "$final != $dstlist"
+
+ do_checks clean
+fi
+
+# Send of a descendant fs
+final=$(getds_with_suffix $POOL2/$FS @final)
+list="$final $(getds_with_suffix $POOL2/$FS @snapA)"
+list="$list $(getds_with_suffix $POOL2/$FS @snapB)"
+list="$list $(getds_with_suffix $POOL2/$FS @snapC)"
+
+log_must eval "zfs send -c -R $opt @init $POOL2/$FS@final >$BACKDIR/fs"
+log_must destroy_tree $list
+log_must eval "zfs recv -d -F $POOL2 <$BACKDIR/fs"
+
+dstlist=$(getds_with_suffix $POOL2/$FS @final)
+[[ $final != $dstlist ]] && log_fail "$final != $dstlist"
+
+do_checks
+
+log_pass "Compressed send works with incremental send streams."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh
new file mode 100644
index 0000000000..b2df3d01da
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh
@@ -0,0 +1,73 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify a pool without the lz4 feature enabled can create compressed send
+# streams, and that they can be received into pools with or without the
+# lz4 feature.
+#
+# Strategy:
+# 1. For each of an uncompressed, and gzip dataset created from a pool with
+# the lz4 feature disabled, receive the stream into a pool with and without
+# the feature enabled.
+#
+
+verify_runnable "both"
+
+log_assert "Verify compressed streams are rejected if incompatible."
+
+typeset send_ds=$POOL2/testds
+typeset recv_ds=$POOL3/testds
+
+function cleanup
+{
+ poolexists $POOL2 && destroy_pool $POOL2
+ poolexists $POOL3 && destroy_pool $POOL3
+ log_must zpool create $POOL2 $DISK2
+}
+log_onexit cleanup
+
+datasetexists $POOL2 && log_must zpool destroy $POOL2
+log_must zpool create -d $POOL2 $DISK2
+
+for compress in off gzip; do
+ for pool_opt in '' -d; do
+ poolexists $POOL3 && destroy_pool $POOL3
+ log_must zpool create $pool_opt $POOL3 $DISK3
+
+ datasetexists $send_ds && log_must zfs destroy -r $send_ds
+ datasetexists $recv_ds && log_must zfs destroy -r $recv_ds
+
+ log_must zfs create -o compress=$compress $send_ds
+ typeset dir=$(get_prop mountpoint $send_ds)
+ write_compressible $dir 16m
+ log_must zfs snapshot $send_ds@full
+
+ log_must eval "zfs send -c $send_ds@full >$BACKDIR/full-c"
+ log_must eval "zfs recv $recv_ds <$BACKDIR/full-c"
+
+ log_must zfs destroy -r $recv_ds
+
+ log_must eval "zfs send $send_ds@full >$BACKDIR/full"
+ log_must eval "zfs recv $recv_ds <$BACKDIR/full"
+ done
+done
+
+log_pass "Compressed streams are rejected if incompatible."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh
new file mode 100644
index 0000000000..5bc2bb000b
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh
@@ -0,0 +1,54 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/properties.shlib
+
+#
+# Description:
+# Verify datasets using mixed compression algorithms can be received.
+#
+# Strategy:
+# 1. Write data with each of the available compression algorithms
+# 2. Receive a full compressed send, and verify the data and compression ratios
+#
+
+verify_runnable "both"
+
+log_assert "Verify datasets using mixed compression algorithms can be received."
+log_onexit cleanup_pool $POOL2
+
+send_ds=$POOL2/sendfs
+recv_ds=$POOL2/recvfs
+
+log_must zfs create $send_ds
+
+for prop in "${compress_prop_vals[@]}"; do
+ log_must zfs set compress=$prop $send_ds
+ write_compressible $(get_prop mountpoint $send_ds) 16m
+done
+
+log_must zfs set compress=off $send_ds
+log_must zfs snapshot $send_ds@full
+log_must eval "zfs send -c $send_ds@full >$BACKDIR/full"
+log_must eval "zfs recv $recv_ds <$BACKDIR/full"
+
+verify_stream_size $BACKDIR/full $send_ds
+verify_stream_size $BACKDIR/full $recv_ds
+log_must cmp_ds_cont $send_ds $recv_ds
+
+log_pass "Datasets using mixed compression algorithms can be received."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_props.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_props.ksh
new file mode 100644
index 0000000000..49d86a3dce
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_props.ksh
@@ -0,0 +1,67 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/properties.shlib
+
+#
+# Description:
+# Verify compressed send streams can still preserve properties
+#
+# Strategy:
+# 1. Randomly modify the properties in the src pool
+# 2. Send a full compressed stream with -p to preserve properties
+# 3. Verify all the received properties match the source datasets
+# 4. Repeat the process with -R instead of -p
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+ destroy_pool $POOL
+ destroy_pool $POOL2
+ log_must zpool create $POOL $DISK1
+ log_must zpool create $POOL2 $DISK2
+ log_must setup_test_model $POOL
+}
+
+log_assert "Compressed send doesn't interfere with preservation of properties"
+log_onexit cleanup
+
+typeset -a datasets=("" "/pclone" "/$FS" "/$FS/fs1" "/$FS/fs1/fs2"
+ "/$FS/fs1/fclone" "/vol" "/$FS/vol")
+
+typeset ds
+for opt in "-p" "-R"; do
+ for ds in ${datasets[@]}; do
+ randomize_ds_props $POOL$ds
+ done
+
+ log_must eval "zfs send -c $opt $POOL@final > $BACKDIR/pool-final$opt"
+ log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/pool-final$opt"
+
+ for ds in ${datasets[@]}; do
+ log_must cmp_ds_prop $POOL$ds $POOL2$ds
+ log_must cmp_ds_prop $POOL$ds@final $POOL2$ds@final
+ done
+
+ # Don't cleanup the second time, since we do that on exit anyway.
+ [[ $opt = "-p" ]] && cleanup
+done
+
+log_pass "Compressed send doesn't interfere with preservation of properties"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh
new file mode 100644
index 0000000000..eb8c050bf8
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh
@@ -0,0 +1,55 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that we can receive a compressed stream into a deduped filesystem.
+#
+# Strategy:
+# 1. Write heavily duplicated data to a filesystem and create a compressed
+# full stream.
+# 2. Verify that the stream can be received correctly into a dedup=verify
+# filesystem.
+#
+
+verify_runnable "both"
+
+log_assert "Verify a compressed stream can be received into a deduped filesystem"
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/sendfs
+typeset recvfs=$POOL2/recvfs
+typeset stream0=$BACKDIR/stream.0
+typeset stream1=$BACKDIR/stream.1
+typeset inc=$BACKDIR/stream.inc
+
+log_must zfs create -o compress=lz4 $sendfs
+log_must zfs create -o compress=lz4 -o dedup=verify $recvfs
+typeset dir=$(get_prop mountpoint $sendfs)
+for i in {0..10}; do
+ log_must cp /kernel/genunix $dir/file.$i
+done
+log_must zfs snapshot $sendfs@snap0
+log_must eval "zfs send -c $sendfs@snap0 >$stream0"
+
+# Finally, make sure the receive works correctly.
+log_must eval "zfs recv -d $recvfs <$stream0"
+cmp_ds_cont $sendfs $recvfs
+
+log_pass "The compressed stream could be received into a deduped filesystem"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh
new file mode 100644
index 0000000000..822ea3655e
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh
@@ -0,0 +1,68 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify a pool without the lz4 feature gracefully rejects a compressed stream
+# because on any sending pool that supports it, metadata will be compressed
+# with lz4 even if user data is not compressed.
+#
+# Strategy:
+# 1. For each of an uncompressed, gzip and lz4 dataset, do the following
+# receives into a pool without the lz4 feature:
+# 2. Attempt to receive the compressed stream (should fail)
+# 3. Attempt to receive the uncompressed stream (should succeed)
+#
+
+verify_runnable "both"
+
+log_assert "Verify compressed streams are rejected if incompatible."
+
+typeset compress_types="off gzip lz4"
+typeset send_ds=$POOL2/testds
+typeset recv_ds=$POOL3/testds
+
+function cleanup
+{
+ poolexists $POOL2 && destroy_pool $POOL2
+ poolexists $POOL3 && destroy_pool $POOL3
+ log_must zpool create $POOL2 $DISK2
+}
+log_onexit cleanup
+
+datasetexists $POOL3 && log_must zpool destroy $POOL3
+log_must zpool create -d $POOL3 $DISK3
+
+for compress in $compress_types; do
+ datasetexists $send_ds && log_must zfs destroy -r $send_ds
+ datasetexists $recv_ds && log_must zfs destroy -r $recv_ds
+
+ log_must zfs create -o compress=$compress $send_ds
+ typeset dir=$(get_prop mountpoint $send_ds)
+ write_compressible $dir 16m
+ log_must zfs snapshot $send_ds@full
+
+ log_must eval "zfs send -c $send_ds@full >$BACKDIR/full-c"
+ log_mustnot eval "zfs recv $recv_ds <$BACKDIR/full-c"
+
+ log_must eval "zfs send $send_ds@full >$BACKDIR/full"
+ log_must eval "zfs recv $recv_ds <$BACKDIR/full"
+done
+
+log_pass "Compressed streams are rejected if incompatible."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_resume.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_resume.ksh
new file mode 100644
index 0000000000..8b36177647
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_resume.ksh
@@ -0,0 +1,49 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify resumability of full and incremental ZFS send/receive with the -c
+# (compress) flag in the presence of a corrupted stream.
+#
+# Strategy:
+# 1. Start a full ZFS send with the -c flag (compress), redirect output to
+# a file
+# 2. Mess up the contents of the stream state file on disk
+# 3. Try ZFS receive, which should fail with a checksum mismatch error
+# 4. ZFS send to the stream state file again using the receive_resume_token
+# 5. ZFS receive and verify the receive completes successfully
+# 6. Repeat steps on an incremental ZFS send
+#
+
+verify_runnable "both"
+
+sendfs=$POOL/sendfs
+recvfs=$POOL2/recvfs
+streamfs=$POOL/stream
+
+log_assert "Verify compressed send streams can be resumed if interrupted"
+log_onexit resume_cleanup $sendfs $streamfs
+
+test_fs_setup $sendfs $recvfs
+resume_test "zfs send -c -v $sendfs@a" $streamfs $recvfs
+resume_test "zfs send -c -v -i @a $sendfs@b" $streamfs $recvfs
+file_check $sendfs $recvfs
+
+log_pass "Compressed send streams can be resumed if interrupted"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh
new file mode 100644
index 0000000000..3e4da295d6
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh
@@ -0,0 +1,91 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify the stream size estimate given by -P accounts for compressed send.
+#
+#
+# Strategy:
+# 1. For datasets of varied compression types do the following:
+# 2. Write data, verify stream size estimates with and without -c
+#
+
+verify_runnable "both"
+typeset compress_types="off gzip lz4"
+typeset send_ds="$POOL2/testfs"
+typeset send_vol="$POOL2/vol"
+typeset send_voldev="/dev/zvol/rdsk/$POOL2/vol"
+typeset file="$BACKDIR/file.0"
+typeset megs="16"
+typeset compress
+
+function get_estimated_size
+{
+ typeset cmd=$1
+ typeset ds=${cmd##* }
+ typeset tmpfile=$(mktemp -p $BACKDIR)
+
+ eval "$cmd >$tmpfile"
+ [[ $? -eq 0 ]] || log_fail "get_estimated_size: $cmd"
+ typeset size=$(eval "awk '\$2 == \"$ds\" {print \$3}' $tmpfile")
+ rm -f $tmpfile
+
+ echo $size
+}
+
+log_assert "Verify the stream size given by -P accounts for compressed send."
+log_onexit cleanup_pool $POOL2
+
+write_compressible $BACKDIR ${megs}m
+
+for compress in $compress_types; do
+ datasetexists $send_ds && log_must zfs destroy -r $send_ds
+ datasetexists $send_vol && log_must zfs destroy -r $send_vol
+ log_must zfs create -o compress=$compress $send_ds
+ log_must zfs create -V 1g -o compress=$compress $send_vol
+
+ typeset dir=$(get_prop mountpoint $send_ds)
+ log_must cp $file $dir
+ log_must zfs snapshot $send_ds@snap
+ log_must dd if=$file of=$send_voldev
+ log_must zfs snapshot $send_vol@snap
+
+ typeset ds_size=$(get_estimated_size "zfs send -nP $send_ds@snap")
+ typeset ds_lrefer=$(get_prop lrefer $send_ds)
+ within_percent $ds_size $ds_lrefer 90 || log_fail \
+ "$ds_size and $ds_lrefer differed by too much"
+
+ typeset vol_size=$(get_estimated_size "zfs send -nP $send_vol@snap")
+ typeset vol_lrefer=$(get_prop lrefer $send_vol)
+ within_percent $vol_size $vol_lrefer 90 || log_fail \
+ "$vol_size and $vol_lrefer differed by too much"
+
+ typeset ds_csize=$(get_estimated_size "zfs send -nP -c $send_ds@snap")
+ typeset ds_refer=$(get_prop refer $send_ds)
+ within_percent $ds_csize $ds_refer 90 || log_fail \
+ "$ds_csize and $ds_refer differed by too much"
+
+ typeset vol_csize=$(get_estimated_size "zfs send -nP -c $send_vol@snap")
+ typeset vol_refer=$(get_prop refer $send_vol)
+ within_percent $vol_csize $vol_refer 90 || log_fail \
+ "$vol_csize and $vol_refer differed by too much"
+done
+
+log_pass "The stream size given by -P accounts for compressed send."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh
new file mode 100644
index 0000000000..de2d29c923
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh
@@ -0,0 +1,55 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify compressed send streams replicate data and datasets
+#
+# Strategy:
+# 1. Back up all the data from POOL/FS
+# 2. Verify all the datasets and data can be recovered in POOL2
+# 3. Back up all the data from root filesystem POOL2
+# 4. Verify all the data can be recovered, too
+#
+
+verify_runnable "both"
+
+log_assert "zfs send -c -R send replication stream up to the named snap."
+log_onexit cleanup_pool $POOL2
+
+# Verify the entire pool and descendants can be backed up and restored.
+log_must eval "zfs send -c -R $POOL@final > $BACKDIR/pool-final-R"
+log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/pool-final-R"
+
+dstds=$(get_dst_ds $POOL $POOL2)
+log_must cmp_ds_subs $POOL $dstds
+log_must cmp_ds_cont $POOL $dstds
+
+# Cleanup POOL2
+log_must cleanup_pool $POOL2
+
+# Verify all the filesystems and descendants can be backed up and restored.
+log_must eval "zfs send -c -R $POOL/$FS@final > $BACKDIR/fs-final-R"
+log_must eval "zfs receive -d $POOL2 < $BACKDIR/fs-final-R"
+
+dstds=$(get_dst_ds $POOL/$FS $POOL2)
+log_must cmp_ds_subs $POOL/$FS $dstds
+log_must cmp_ds_cont $POOL/$FS $dstds
+
+log_pass "zfs send -c -R send replication stream up to the named snap."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh
new file mode 100644
index 0000000000..a5138c527b
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh
@@ -0,0 +1,66 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/properties.shlib
+
+#
+# Description:
+# Verify that the amount of data in a send -c stream matches compressratio.
+#
+# Strategy:
+# 1. For random compression types, and compressible / incompressible data:
+# 2. Create a snap with data
+# 3. Compare the size of the stream with the data on the dataset, adjusted
+# by compressratio for normal send, and compared to used for send -c.
+#
+
+verify_runnable "both"
+
+log_assert "Verify send -c streams are compressed"
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/$FS
+typeset megs=128
+
+for prop in $(get_rand_compress_any 6); do
+ for compressible in 'yes' 'no'; do
+ log_must zfs create -o compress=$prop $sendfs
+
+ if [[ $compressible = 'yes' ]]; then
+ write_compressible $(get_prop mountpoint $sendfs) \
+ ${megs}m
+ else
+ typeset file="$(get_prop mountpoint $sendfs)/ddfile"
+ log_must dd if=/dev/urandom of=$file bs=1024k count=$megs
+ fi
+
+ log_must zfs snapshot $sendfs@snap
+
+ # Calculate the sizes and verify the compression ratio.
+ log_must eval "zfs send $sendfs@snap >$BACKDIR/uncompressed"
+ verify_stream_size $BACKDIR/uncompressed $sendfs
+
+ log_must eval "zfs send -c $sendfs@snap >$BACKDIR/compressed"
+ verify_stream_size $BACKDIR/compressed $sendfs
+
+ log_must rm $BACKDIR/uncompressed $BACKDIR/compressed
+ log_must zfs destroy -r $sendfs
+ done
+done
+
+log_pass "Verify send -c streams are compressed"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_volume.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_volume.ksh
new file mode 100644
index 0000000000..4ce3d5a09b
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_volume.ksh
@@ -0,0 +1,80 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that compressed send correctly handles volumes
+#
+# Strategy:
+# 1. Write compressible data into a volume, take a snap
+# 2. Verify the compressed stream is the correct size, and has the correct data
+# 3. Repeat step 2 for an incremental compressed stream
+#
+
+function cleanup
+{
+ log_must zfs destroy -r $vol
+ cleanup_pool $POOL2
+}
+
+verify_runnable "both"
+
+log_assert "Verify compressed send works with volumes"
+log_onexit cleanup
+
+typeset vol="$POOL/newvol"
+typeset vol2="$POOL2/newvol"
+typeset voldev="/dev/zvol/rdsk/$POOL/newvol"
+typeset voldev2="/dev/zvol/rdsk/$POOL2/newvol"
+typeset data1=$BACKDIR/file.0
+typeset data2=$BACKDIR/file.1
+typeset megs=8
+
+log_must zfs create -V 256m -o compress=lz4 $vol
+
+write_compressible $BACKDIR ${megs}m 2
+md5_1=$(md5sum $data1 | awk '{print $1}')
+md5_2=$(md5sum $data2 | awk '{print $1}')
+
+log_must dd if=$data1 of=$voldev bs=1024k
+log_must zfs snapshot $vol@snap
+
+log_must eval "zfs send -c $vol@snap >$BACKDIR/full"
+log_must eval "zfs recv -d $POOL2 <$BACKDIR/full"
+
+verify_stream_size $BACKDIR/full $vol
+verify_stream_size $BACKDIR/full $vol2
+md5=$(dd if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5sum | \
+ awk '{print $1}')
+[[ $md5 = $md5_1 ]] || log_fail "md5 mismatch: $md5 != $md5_1"
+
+# Repeat, for an incremental send
+log_must dd oseek=$megs if=$data2 of=$voldev bs=1024k
+log_must zfs snapshot $vol@snap2
+
+log_must eval "zfs send -c -i snap $vol@snap2 >$BACKDIR/inc"
+log_must eval "zfs recv -d $POOL2 <$BACKDIR/inc"
+
+verify_stream_size $BACKDIR/inc $vol 90 $vol@snap
+verify_stream_size $BACKDIR/inc $vol2 90 $vol2@snap
+md5=$(dd iseek=$megs if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5sum | \
+ awk '{print $1}')
+[[ $md5 = $md5_2 ]] || log_fail "md5 mismatch: $md5 != $md5_2"
+
+log_pass "Verify compressed send works with volumes"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh
new file mode 100644
index 0000000000..6f8359e56c
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh
@@ -0,0 +1,59 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/math.shlib
+
+#
+# Description:
+# Verify compression features show up in zstreamdump
+#
+# Strategy:
+# 1. Create a full compressed send stream
+# 2. Verify zstreamdump shows this stream has the relevant features
+# 3. Verify zstreamdump's accounting of logical and compressed size is correct
+#
+
+verify_runnable "both"
+
+log_assert "Verify zstreamdump correctly interprets compressed send streams."
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/fs
+
+log_must zfs create -o compress=lz4 $sendfs
+typeset dir=$(get_prop mountpoint $sendfs)
+write_compressible $dir 16m
+log_must zfs snapshot $sendfs@full
+
+log_must eval "zfs send -c $sendfs@full >$BACKDIR/full"
+log_must stream_has_features $BACKDIR/full lz4 compressed
+cat $BACKDIR/full | zstreamdump -v | parse_dump > $BACKDIR/dump.out
+
+lsize=$(awk '/^WRITE [^0]/ {lsize += $4} END {printf("%d", lsize)}' \
+ $BACKDIR/dump.out)
+lsize_prop=$(get_prop logicalused $sendfs)
+within_percent $lsize $lsize_prop 90 || log_fail \
+ "$lsize and $lsize_prop differed by too much"
+
+csize=$(awk '/^WRITE [^0]/ {csize += $5} END {printf("%d", csize)}' \
+ $BACKDIR/dump.out)
+csize_prop=$(get_prop used $sendfs)
+within_percent $csize $csize_prop 90 || log_fail \
+ "$csize and $csize_prop differed by too much"
+
+log_pass "zstreamdump correctly interprets compressed send streams."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh
new file mode 100644
index 0000000000..8c33e323b8
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh
@@ -0,0 +1,198 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify compressed send works correctly with datasets of varying recsize.
+#
+# Strategy:
+# 1. Check the recv behavior (into pools with features enabled and disabled)
+# of all combinations of -c -p and -L. Verify the stream is compressed,
+# and that the recsize property and that of a received file is correct
+# according to this matrix:
+#
+# +---------+--------+------------+------------+-----------+-----------+
+# | send | send | received | received | received | received |
+# | stream | stream | file bs | prop | file bs | props |
+# | recsize | flags | (disabled) | (disabled) | (enabled) | (enabled) |
+# +---------+--------+------------+------------+-----------+-----------+
+# | 128k | | 128k | 128k | 128k | 128k |
+# | 128k | -c | Fails | Fails | 128k | 128k |
+# | 128k | -p | 128k | 128k | 128k | 128k |
+# | 128k | -L | 128k | 128k | 128k | 128k |
+# | 128k | -cp | Fails | Fails | 128k | 128k |
+# | 128k | -cL | Fails | Fails | 128k | 128k |
+# | 128k | -pL | 128k | 128k | 128k | 128k |
+# | 128k | -cpL | Fails | Fails | 128k | 128k |
+# | 1m | | Fails | Fails | 128k | 128k |
+# | 1m | -c | Fails | Fails | 128k | 128k |
+# | 1m | -p | 128k | 128k | 128k | 1m |
+# | 1m | -L | Fails | Fails | 1m | 128k |
+# | 1m | -cp | Fails | Fails | 128k | 1m |
+# | 1m | -cL | Fails | Fails | 1m | 128k |
+# | 1m | -pL | Fails | Fails | 1m | 1m |
+# | 1m | -cpL | Fails | Fails | 1m | 1m |
+# +---------+--------+------------+------------+-----------+-----------+
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+ datasetexists $TESTPOOL/128k && log_must zfs destroy $TESTPOOL/128k
+ datasetexists $TESTPOOL/1m && log_must zfs destroy $TESTPOOL/1m
+ cleanup_pool $POOL2
+ destroy_pool $POOL3
+}
+
+# For a received stream, verify the recsize (prop and file) match expectations.
+function check_recsize
+{
+ typeset recv_ds=$1
+ typeset expected_file_bs=$2
+ typeset expected_recsize=$3
+ typeset file="$(get_prop mountpoint $recv_ds)/testfile"
+
+ [[ -f $file ]] || log_fail "file '$file' doesn't exist"
+
+ typeset read_recsize=$(get_prop recsize $recv_ds)
+ typeset read_file_bs=$(stat $file | sed -n \
+ 's/.*IO Block: \([0-9]*\).*/\1/p')
+
+ [[ $read_recsize = $expected_recsize ]] || log_fail \
+ "read_recsize: $read_recsize expected_recsize: $expected_recsize"
+ [[ $read_file_bs = $expected_file_bs ]] || log_fail \
+ "read_file_bs: $read_file_bs expected_file_bs: $expected_file_bs"
+}
+
+#
+# This function does a zfs send and receive according to the parameters
+# below, and verifies the data shown in the strategy section.
+#
+# -[cpL] flags to pass through to 'zfs send'
+# -d Receive into a pool with all features disabled
+#
+# $1 The recordsize of the send dataset
+# $2 Whether or not the recv should work.
+# $3 The blocksize expected in a received file (default 128k)
+# $4 The recordsize property expected in a received dataset (default 128k)
+#
+function check
+{
+ typeset recv_pool=$POOL2
+ typeset flags='-'
+
+ while getopts "cdpL" opt; do
+ case $opt in
+ c)
+ flags+='c'
+ ;;
+ d)
+ recv_pool=$POOL3
+ ;;
+ p)
+ flags+='p'
+ ;;
+ L)
+ flags+='L'
+ ;;
+ esac
+ done
+ shift $(($OPTIND - 1))
+ [[ ${#flags} -eq 1 ]] && flags=''
+
+ typeset recsize=$1
+ typeset verify=$2
+ typeset expected_file_bs=${3-131072}
+ typeset expected_recsize=${4-131072}
+ typeset send_ds=$TESTPOOL/$recsize
+ typeset send_snap=$send_ds@snap
+ typeset recv_ds=$recv_pool/$recsize
+ typeset stream=$BACKDIR/stream.out
+
+ datasetexists $send_ds || log_fail "send ds: $send_ds doesn't exist"
+ [[ -f $stream ]] && log_must rm $stream
+ log_must eval "zfs send $flags $send_snap >$stream"
+ $verify eval "zfs recv $recv_ds <$stream"
+ typeset stream_size=$(cat $stream | zstreamdump | sed -n \
+ 's/ Total write size = \(.*\) (0x.*)/\1/p')
+
+ #
+ # Special case: For a send dataset with large blocks, don't try to
+ # verify the stream size is correct if the compress flag is present
+ # but the large blocks flag isn't. In these cases, the user data
+ # isn't compressed in the stream (though metadata is) so the
+ # verification would fail.
+ #
+ typeset do_size_test=true
+ [[ $recsize = $large && $flags =~ 'c' && ! $flags =~ 'L' ]] && \
+ do_size_test=false
+
+ $do_size_test && verify_stream_size $stream $send_ds
+
+ if [[ $verify = "log_mustnot" ]]; then
+ datasetnonexists $recv_ds || log_fail "$recv_ds shouldn't exist"
+ return
+ fi
+
+ check_recsize $recv_ds $expected_file_bs $expected_recsize
+ $do_size_test && verify_stream_size $stream $recv_ds
+ log_must zfs destroy -r $recv_ds
+}
+
+log_assert "Verify compressed send works with datasets of varying recsize."
+log_onexit cleanup
+typeset recsize opts dir
+typeset small=$((128 * 1024))
+typeset large=$((1024 * 1024))
+
+# Create POOL3 with features disabled and datasets to create test send streams
+log_must zpool create -d $POOL3 $DISK3
+write_compressible $BACKDIR 32m
+for recsize in $small $large; do
+ log_must zfs create -o compress=gzip -o recsize=$recsize \
+ $TESTPOOL/$recsize
+ dir=$(get_prop mountpoint $TESTPOOL/$recsize)
+ log_must cp $BACKDIR/file.0 $dir/testfile
+ log_must zfs snapshot $TESTPOOL/$recsize@snap
+done
+
+# Run tests for send streams without large blocks
+for opts in '' -d -c -p -dp -L -dL -cp -cL -pL -dpL -cpL; do
+ check $opts $small log_must
+done
+for opts in -dc -dcp -dcL -dcpL; do
+ check $opts $small log_mustnot
+done
+
+# Run tests for send streams with large blocks
+for opts in '' -d -dp -c; do
+ check $opts $large log_must
+done
+for opts in -dc -dL -dcp -dcL -dpL -dcpL; do
+ check $opts $large log_mustnot
+done
+check -p $large log_must $small $large
+check -L $large log_must $large $small
+check -cp $large log_must $small $large
+check -cL $large log_must $large $small
+check -pL $large log_must $large $large
+check -cpL $large log_must $large $large
+
+log_pass "Compressed send works with datasets of varying recsize."
diff --git a/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio b/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio
index f876bd63d3..8289d546de 100644
--- a/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio
+++ b/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2016 by Delphix. All rights reserved.
#
[global]
@@ -24,7 +24,7 @@ thread=1
directory=/${TESTFS}
numjobs=${NUMJOBS}
filesize=${FILE_SIZE}
-buffer_compress_percentage=33
+buffer_compress_percentage=66
buffer_compress_chunk=4096
[job]
diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio
index 0b750260ff..07090d4dcd 100644
--- a/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio
+++ b/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
#
[global]
@@ -29,7 +29,7 @@ bssplit=4k/50:8k/30:128k/10:1m/10
ioengine=psync
sync=${SYNC_TYPE}
numjobs=${NUMJOBS}
-buffer_compress_percentage=33
+buffer_compress_percentage=66
buffer_compress_chunk=4096
[job]
diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio
index b1860a71dd..9233a84260 100644
--- a/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio
+++ b/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
#
[global]
@@ -27,7 +27,7 @@ ioengine=psync
sync=${SYNC_TYPE}
numjobs=${NUMJOBS}
filesize=${FILESIZE}
-buffer_compress_percentage=33
+buffer_compress_percentage=66
buffer_compress_chunk=4096
[job]
diff --git a/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio b/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio
index df1590cf11..0ee6d091db 100644
--- a/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio
+++ b/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
#
[global]
@@ -27,7 +27,7 @@ ioengine=psync
sync=${SYNC_TYPE}
numjobs=${NUMJOBS}
filesize=${FILESIZE}
-buffer_compress_percentage=33
+buffer_compress_percentage=66
buffer_compress_chunk=4096
[job]
diff --git a/usr/src/uts/common/fs/nfs/nfs4_client.c b/usr/src/uts/common/fs/nfs/nfs4_client.c
index 7bfa46e1fb..5438038105 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_client.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_client.c
@@ -184,7 +184,6 @@ nfs4_validate_caches(vnode_t *vp, cred_t *cr)
return (0);
}
- gar.n4g_va.va_mask = AT_ALL;
return (nfs4_getattr_otw(vp, &gar, cr, 0));
}
@@ -582,6 +581,16 @@ nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
rp->r_attr.va_ctime.tv_nsec !=
vap->va_ctime.tv_nsec)
ctime_changed = 1;
+
+ /*
+ * If the change attribute was not provided by server
+ * or it differs, then flush all caches.
+ */
+ if (!garp->n4g_change_valid ||
+ rp->r_change != garp->n4g_change) {
+ mtime_changed = 1;
+ ctime_changed = 1;
+ }
} else {
writemodify_set = B_TRUE;
}
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
index 855cd8cd92..7240faa356 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
@@ -209,7 +209,7 @@ rfs4_attr_init()
/* ARGSUSED */
static int
rfs4_fattr4_supported_attrs(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -251,7 +251,7 @@ static nfs_ftype4 vt_to_nf4[] = {
/* ARGSUSED */
static int
rfs4_fattr4_type(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -357,7 +357,7 @@ fattr4_get_fh_expire_type(struct exportinfo *exi, uint32_t *fh_expire_typep)
/* ARGSUSED */
static int
rfs4_fattr4_fh_expire_type(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
uint32_t fh_expire_type;
int error = 0;
@@ -396,6 +396,7 @@ fattr4_get_change(struct nfs4_svgetit_arg *sarg, fattr4_change *changep)
struct compound_state *cs = sarg->cs;
vnode_t *vp = cs->vp;
nfsstat4 status;
+ timespec_t vis_change;
if ((vap->va_mask & AT_CTIME) == 0) {
if (sarg->rdattr_error && (vp == NULL)) {
@@ -408,14 +409,22 @@ fattr4_get_change(struct nfs4_svgetit_arg *sarg, fattr4_change *changep)
if (status != NFS4_OK)
return (geterrno4(status));
}
- NFS4_SET_FATTR4_CHANGE(*changep, vap->va_ctime)
+ NFS4_SET_FATTR4_CHANGE(*changep, vap->va_ctime);
+
+ if (nfs_visible_change(cs->exi, vp, &vis_change)) {
+ fattr4_change visch;
+ NFS4_SET_FATTR4_CHANGE(visch, vis_change);
+ if (visch > *changep)
+ *changep = visch;
+ }
+
return (0);
}
/* ARGSUSED */
static int
rfs4_fattr4_change(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
fattr4_change change;
@@ -453,7 +462,7 @@ rfs4_fattr4_change(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_size(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -490,7 +499,7 @@ rfs4_fattr4_size(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_link_support(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -525,7 +534,7 @@ rfs4_fattr4_link_support(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_symlink_support(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -556,7 +565,7 @@ rfs4_fattr4_symlink_support(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_named_attr(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -626,7 +635,7 @@ rfs4_fattr4_named_attr(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_fsid(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
int *pmaj = (int *)&na->fsid.major;
@@ -681,7 +690,7 @@ rfs4_fattr4_fsid(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_unique_handles(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
/*
* XXX
@@ -718,7 +727,7 @@ rfs4_fattr4_unique_handles(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_lease_time(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -749,7 +758,7 @@ rfs4_fattr4_lease_time(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_rdattr_error(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -798,7 +807,7 @@ rfs4fhcmp(nfs_fh4 *wirefh, nfs_fh4 *srvfh)
/* ARGSUSED */
static int
rfs4_fattr4_filehandle(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
nfs_fh4 *fh;
@@ -861,7 +870,7 @@ rfs4_fattr4_filehandle(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
vsecattr_t vs_native, vs_ace4;
@@ -1047,7 +1056,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_aclsupport(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1079,7 +1088,7 @@ rfs4_fattr4_aclsupport(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_archive(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -1087,7 +1096,7 @@ rfs4_fattr4_archive(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_cansettime(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1125,7 +1134,7 @@ rfs4_fattr4_cansettime(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_case_insensitive(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1159,7 +1168,7 @@ rfs4_fattr4_case_insensitive(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_case_preserving(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1194,7 +1203,7 @@ rfs4_fattr4_case_preserving(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_chown_restricted(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -1244,7 +1253,7 @@ rfs4_fattr4_chown_restricted(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_fileid(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1352,7 +1361,7 @@ rfs4_get_mntdfileid(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg)
/* ARGSUSED */
static int
rfs4_fattr4_mounted_on_fileid(nfs4_attr_cmd_t cmd,
- struct nfs4_svgetit_arg *sarg, union nfs4_attr_u *na)
+ struct nfs4_svgetit_arg *sarg, union nfs4_attr_u *na)
{
int error = 0;
@@ -1391,7 +1400,7 @@ rfs4_fattr4_mounted_on_fileid(nfs4_attr_cmd_t cmd,
/* ARGSUSED */
static int
rfs4_fattr4_files_avail(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1431,7 +1440,7 @@ rfs4_fattr4_files_avail(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_files_free(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1471,7 +1480,7 @@ rfs4_fattr4_files_free(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_files_total(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1571,7 +1580,7 @@ rfs4_free_fs_locations4(fs_locations4 *fsls4)
/* ARGSUSED */
static int
rfs4_fattr4_fs_locations(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
fs_locations4 *fsl;
@@ -1617,7 +1626,7 @@ rfs4_fattr4_fs_locations(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_hidden(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -1625,7 +1634,7 @@ rfs4_fattr4_hidden(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_homogeneous(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1659,7 +1668,7 @@ rfs4_fattr4_homogeneous(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxfilesize(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -1737,7 +1746,7 @@ rfs4_fattr4_maxfilesize(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxlink(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -1784,7 +1793,7 @@ rfs4_fattr4_maxlink(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxname(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -1831,7 +1840,7 @@ rfs4_fattr4_maxname(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxread(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1865,7 +1874,7 @@ rfs4_fattr4_maxread(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxwrite(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1899,7 +1908,7 @@ rfs4_fattr4_maxwrite(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_mimetype(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -1907,7 +1916,7 @@ rfs4_fattr4_mimetype(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_mode(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1950,7 +1959,7 @@ rfs4_fattr4_mode(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_no_trunc(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1984,7 +1993,7 @@ rfs4_fattr4_no_trunc(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_numlinks(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2024,7 +2033,7 @@ rfs4_fattr4_numlinks(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_owner(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
uid_t uid;
@@ -2136,7 +2145,7 @@ rfs4_fattr4_owner(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_owner_group(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
gid_t gid;
@@ -2252,7 +2261,7 @@ rfs4_fattr4_owner_group(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_quota_avail_hard(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2260,7 +2269,7 @@ rfs4_fattr4_quota_avail_hard(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_quota_avail_soft(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2268,7 +2277,7 @@ rfs4_fattr4_quota_avail_soft(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_quota_used(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2276,7 +2285,7 @@ rfs4_fattr4_quota_used(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_rawdev(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2320,7 +2329,7 @@ rfs4_fattr4_rawdev(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_space_avail(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2367,7 +2376,7 @@ rfs4_fattr4_space_avail(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_space_free(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2414,7 +2423,7 @@ rfs4_fattr4_space_free(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_space_total(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2461,7 +2470,7 @@ rfs4_fattr4_space_total(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_space_used(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2502,7 +2511,7 @@ rfs4_fattr4_space_used(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_system(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2510,7 +2519,7 @@ rfs4_fattr4_system(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_access(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
timestruc_t atime;
@@ -2557,7 +2566,7 @@ rfs4_fattr4_time_access(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_access_set(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
settime4 *ta;
@@ -2601,7 +2610,7 @@ rfs4_fattr4_time_access_set(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_backup(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2609,7 +2618,7 @@ rfs4_fattr4_time_backup(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_create(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2617,7 +2626,7 @@ rfs4_fattr4_time_create(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_delta(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2653,7 +2662,7 @@ rfs4_fattr4_time_delta(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_metadata(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
timestruc_t ctime;
@@ -2698,7 +2707,7 @@ rfs4_fattr4_time_metadata(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_modify(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
timestruc_t mtime;
@@ -2745,7 +2754,7 @@ rfs4_fattr4_time_modify(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_modify_set(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
settime4 *tm;
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
index 3ee41939ac..4ad799be46 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
@@ -144,7 +144,7 @@ nfs4_vget_pseudo(struct exportinfo *exi, vnode_t **vpp, fid_t *fidp)
*/
struct exportinfo *
pseudo_exportfs(vnode_t *vp, fid_t *fid, struct exp_visible *vis_head,
- struct exportdata *exdata)
+ struct exportdata *exdata)
{
struct exportinfo *exi;
struct exportdata *kex;
@@ -446,8 +446,12 @@ more_visible(struct exportinfo *exi, treenode_t *tree_head)
* list just assign the entire supplied list.
*/
if (exi->exi_visible == NULL) {
- tree_add_child(exi->exi_tree, tree_head);
+ tree_add_child(connect_point, tree_head);
exi->exi_visible = vis_head;
+
+ /* Update the change timestamp */
+ tree_update_change(connect_point, &vis_head->vis_change);
+
return;
}
@@ -504,6 +508,11 @@ more_visible(struct exportinfo *exi, treenode_t *tree_head)
connect_point = child;
} else { /* Branching */
tree_add_child(connect_point, curr);
+
+ /* Update the change timestamp */
+ tree_update_change(connect_point,
+ &curr->tree_vis->vis_change);
+
connect_point = NULL;
}
}
@@ -612,15 +621,17 @@ treeclimb_export(struct exportinfo *exip)
fid_t fid;
int error;
int exportdir;
- struct exportinfo *exi = NULL;
struct exportinfo *new_exi = exip;
struct exp_visible *visp;
struct exp_visible *vis_head = NULL;
struct vattr va;
treenode_t *tree_head = NULL;
+ timespec_t now;
ASSERT(RW_WRITE_HELD(&exported_lock));
+ gethrestime(&now);
+
vp = exip->exi_vp;
VN_HOLD(vp);
exportdir = 1;
@@ -633,36 +644,33 @@ treeclimb_export(struct exportinfo *exip)
if (error)
break;
- if (! exportdir) {
- /*
- * Check if this exportroot is a VROOT dir. If so,
- * then attach the pseudonodes. If not, then
- * continue .. traversal until we hit a VROOT
- * export (pseudo or real).
- */
- exi = checkexport4(&vp->v_vfsp->vfs_fsid, &fid, vp);
- if (exi != NULL && vp->v_flag & VROOT) {
- /*
- * Found an export info
- *
- * Extend the list of visible
- * directories whether it's a pseudo
- * or a real export.
- */
- more_visible(exi, tree_head);
- break; /* and climb no further */
- }
- }
-
/*
- * If at the root of the filesystem, need
- * to traverse across the mountpoint
- * and continue the climb on the mounted-on
- * filesystem.
+ * The root of the file system needs special handling
*/
if (vp->v_flag & VROOT) {
-
if (! exportdir) {
+ struct exportinfo *exi;
+
+ /*
+ * Check if this VROOT dir is already exported.
+ * If so, then attach the pseudonodes. If not,
+ * then continue .. traversal until we hit a
+ * VROOT export (pseudo or real).
+ */
+ exi = checkexport4(&vp->v_vfsp->vfs_fsid, &fid,
+ vp);
+ if (exi != NULL) {
+ /*
+ * Found an export info
+ *
+ * Extend the list of visible
+ * directories whether it's a pseudo
+ * or a real export.
+ */
+ more_visible(exi, tree_head);
+ break; /* and climb no further */
+ }
+
/*
* Found the root directory of a filesystem
* that isn't exported. Need to export
@@ -679,13 +687,21 @@ treeclimb_export(struct exportinfo *exip)
/*
* If sharing "/", new_exi is shared exportinfo
* (exip). Otherwise, new_exi is exportinfo
- * created in pseudo_exportfs() above.
+ * created by pseudo_exportfs() above.
*/
- ns_root = tree_prepend_node(tree_head, 0,
+ ns_root = tree_prepend_node(tree_head, NULL,
new_exi);
+
+ /* Update the change timestamp */
+ tree_update_change(ns_root, &now);
+
break;
}
+ /*
+ * Traverse across the mountpoint and continue the
+ * climb on the mounted-on filesystem.
+ */
vp = untraverse(vp);
exportdir = 0;
continue;
@@ -712,10 +728,10 @@ treeclimb_export(struct exportinfo *exip)
visp->vis_exported = exportdir;
visp->vis_secinfo = NULL;
visp->vis_seccnt = 0;
+ visp->vis_change = now; /* structure copy */
visp->vis_next = vis_head;
vis_head = visp;
-
/*
* Will set treenode's pointer to exportinfo to
* 1. shared exportinfo (exip) - if first visit here
@@ -765,7 +781,7 @@ treeclimb_export(struct exportinfo *exip)
/* Connect unconnected exportinfo, if there is any. */
if (new_exi && new_exi != exip)
- tree_head = tree_prepend_node(tree_head, 0, new_exi);
+ tree_head = tree_prepend_node(tree_head, NULL, new_exi);
while (tree_head) {
treenode_t *t2 = tree_head;
@@ -799,6 +815,7 @@ void
treeclimb_unexport(struct exportinfo *exip)
{
treenode_t *tnode, *old_nd;
+ treenode_t *connect_point = NULL;
ASSERT(RW_WRITE_HELD(&exported_lock));
@@ -809,25 +826,25 @@ treeclimb_unexport(struct exportinfo *exip)
*/
tnode->tree_exi = NULL;
- if (tnode->tree_vis) /* system root has tree_vis == NULL */
+ if (tnode->tree_vis != NULL) /* system root has tree_vis == NULL */
tnode->tree_vis->vis_exported = 0;
- while (tnode) {
+ while (tnode != NULL) {
/* Stop at VROOT node which is exported or has child */
if (TREE_ROOT(tnode) &&
- (TREE_EXPORTED(tnode) || tnode->tree_child_first))
+ (TREE_EXPORTED(tnode) || tnode->tree_child_first != NULL))
break;
/* Release pseudo export if it has no child */
if (TREE_ROOT(tnode) && !TREE_EXPORTED(tnode) &&
- tnode->tree_child_first == 0) {
+ tnode->tree_child_first == NULL) {
export_unlink(tnode->tree_exi);
exi_rele(tnode->tree_exi);
}
/* Release visible in parent's exportinfo */
- if (tnode->tree_vis)
+ if (tnode->tree_vis != NULL)
less_visible(vis2exi(tnode), tnode->tree_vis);
/* Continue with parent */
@@ -835,9 +852,16 @@ treeclimb_unexport(struct exportinfo *exip)
tnode = tnode->tree_parent;
/* Remove itself, if this is a leaf and non-exported node */
- if (old_nd->tree_child_first == NULL && !TREE_EXPORTED(old_nd))
+ if (old_nd->tree_child_first == NULL &&
+ !TREE_EXPORTED(old_nd)) {
tree_remove_node(old_nd);
+ connect_point = tnode;
+ }
}
+
+ /* Update the change timestamp */
+ if (connect_point != NULL)
+ tree_update_change(connect_point, NULL);
}
/*
@@ -929,7 +953,7 @@ has_visible(struct exportinfo *exi, vnode_t *vp)
fid_t fid;
bool_t vp_is_exported;
- vp_is_exported = VN_CMP(vp, exi->exi_vp);
+ vp_is_exported = VN_CMP(vp, exi->exi_vp);
/*
* An exported root vnode has a sub-dir shared if it has a visible list.
@@ -1111,10 +1135,9 @@ nfs_exported(struct exportinfo *exi, vnode_t *vp)
* skips . and .. entries.
*/
int
-nfs_visible_inode(struct exportinfo *exi, ino64_t ino, int *expseudo)
+nfs_visible_inode(struct exportinfo *exi, ino64_t ino,
+ struct exp_visible **visp)
{
- struct exp_visible *visp;
-
/*
* Only a PSEUDO node has a visible list or an exported VROOT
* node may have a visible list.
@@ -1122,12 +1145,108 @@ nfs_visible_inode(struct exportinfo *exi, ino64_t ino, int *expseudo)
if (! PSEUDO(exi))
exi = get_root_export(exi);
- for (visp = exi->exi_visible; visp; visp = visp->vis_next)
- if ((u_longlong_t)ino == visp->vis_ino) {
- *expseudo = visp->vis_exported;
+ for (*visp = exi->exi_visible; *visp != NULL; *visp = (*visp)->vis_next)
+ if ((u_longlong_t)ino == (*visp)->vis_ino) {
return (1);
}
- *expseudo = 0;
return (0);
}
+
+/*
+ * The change attribute value of the root of nfs pseudo namespace.
+ *
+ * The ns_root_change is protected by exported_lock because all of the treenode
+ * operations are protected by exported_lock too.
+ */
+static timespec_t ns_root_change;
+
+/*
+ * Get the change attribute from visible and returns TRUE.
+ * If the change value is not available returns FALSE.
+ */
+bool_t
+nfs_visible_change(struct exportinfo *exi, vnode_t *vp, timespec_t *change)
+{
+ struct exp_visible *visp;
+ fid_t fid;
+ treenode_t *node;
+
+ /*
+ * First check to see if vp is export root.
+ */
+ if (VN_CMP(vp, exi->exi_vp))
+ goto exproot;
+
+ /*
+ * Only a PSEUDO node has a visible list or an exported VROOT
+ * node may have a visible list.
+ */
+ if (!PSEUDO(exi))
+ exi = get_root_export(exi);
+
+ /* Get the fid of the vnode */
+ bzero(&fid, sizeof (fid));
+ fid.fid_len = MAXFIDSZ;
+ if (vop_fid_pseudo(vp, &fid) != 0)
+ return (FALSE);
+
+ /*
+ * We can't trust VN_CMP() above because of LOFS.
+ * Even though VOP_CMP will do the right thing for LOFS
+ * objects, VN_CMP will short circuit out early when the
+ * vnode ops ptrs are different. Just in case we're dealing
+ * with LOFS, compare exi_fid/fsid here.
+ */
+ if (EQFID(&exi->exi_fid, &fid) &&
+ EQFSID(&exi->exi_fsid, &vp->v_vfsp->vfs_fsid))
+ goto exproot;
+
+ /* See if it matches any fid in the visible list */
+ for (visp = exi->exi_visible; visp; visp = visp->vis_next) {
+ if (EQFID(&fid, &visp->vis_fid)) {
+ *change = visp->vis_change;
+ return (TRUE);
+ }
+ }
+
+ return (FALSE);
+
+exproot:
+ /* The VROOT export have its visible available through treenode */
+ node = exi->exi_tree;
+ if (node != ns_root) {
+ ASSERT(node->tree_vis != NULL);
+ *change = node->tree_vis->vis_change;
+ } else {
+ ASSERT(node->tree_vis == NULL);
+ *change = ns_root_change;
+ }
+
+ return (TRUE);
+}
+
+/*
+ * Update the change attribute value for a particular treenode. The change
+ * attribute value is stored in the visible attached to the treenode, or in the
+ * ns_root_change.
+ *
+ * If the change value is not supplied, the current time is used.
+ */
+void
+tree_update_change(treenode_t *tnode, timespec_t *change)
+{
+ timespec_t *vis_change;
+
+ ASSERT(tnode != NULL);
+ ASSERT((tnode != ns_root && tnode->tree_vis != NULL) ||
+ (tnode == ns_root && tnode->tree_vis == NULL));
+
+ vis_change = tnode == ns_root ? &ns_root_change
+ : &tnode->tree_vis->vis_change;
+
+ if (change != NULL)
+ *vis_change = *change;
+ else
+ gethrestime(vis_change);
+}
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c b/usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c
index 276d3b4f19..01c76cb203 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c
@@ -104,8 +104,8 @@ static nfs_ftype4 vt_to_nf4[] = {
int
nfs4_readdir_getvp(vnode_t *dvp, char *d_name, vnode_t **vpp,
- struct exportinfo **exi, struct svc_req *req,
- struct compound_state *cs, int expseudo)
+ struct exportinfo **exi, struct svc_req *req, struct compound_state *cs,
+ int expseudo)
{
int error;
int ismntpt;
@@ -382,8 +382,8 @@ rfs4_get_sb_encode(vfs_t *vfsp, rfs4_sb_encode_t *psbe)
*/
/* ARGSUSED */
void
-rfs4_op_readdir(nfs_argop4 *argop, nfs_resop4 *resop,
- struct svc_req *req, struct compound_state *cs)
+rfs4_op_readdir(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,
+ struct compound_state *cs)
{
READDIR4args *args = &argop->nfs_argop4_u.opreaddir;
READDIR4res *resp = &resop->nfs_resop4_u.opreaddir;
@@ -409,7 +409,7 @@ rfs4_op_readdir(nfs_argop4 *argop, nfs_resop4 *resop,
struct uio uio;
int tsize;
int check_visible;
- int expseudo = 0;
+ struct exp_visible *visp;
uint32_t *ptr, *ptr_redzone;
uint32_t *beginning_ptr;
@@ -687,8 +687,8 @@ readagain:
for (dp = (struct dirent64 *)rddir_data;
!no_space && rddir_result_size > 0; dp = nextdp(dp)) {
- /* reset expseudo */
- expseudo = 0;
+ /* reset visp */
+ visp = NULL;
if (vp) {
VN_RELE(vp);
@@ -707,7 +707,7 @@ readagain:
}
if (check_visible &&
- !nfs_visible_inode(cs->exi, dp->d_ino, &expseudo)) {
+ !nfs_visible_inode(cs->exi, dp->d_ino, &visp)) {
rddir_next_offset = dp->d_off;
continue;
}
@@ -724,7 +724,8 @@ readagain:
goto reencode_attrs;
error = nfs4_readdir_getvp(dvp, dp->d_name,
- &vp, &newexi, req, cs, expseudo);
+ &vp, &newexi, req, cs,
+ visp != NULL ? visp->vis_exported : 0);
if (error == ENOENT) {
rddir_next_offset = dp->d_off;
continue;
@@ -917,6 +918,13 @@ reencode_attrs:
u_longlong_t change;
NFS4_SET_FATTR4_CHANGE(change,
va.va_ctime);
+ if (visp != NULL) {
+ u_longlong_t visch;
+ NFS4_SET_FATTR4_CHANGE(visch,
+ visp->vis_change);
+ if (visch > change)
+ change = visch;
+ }
IXDR_PUT_HYPER(ptr, change);
}
if (ae & FATTR4_SIZE_MASK) {
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index 01886e3627..4c6be91e0a 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -2233,7 +2233,6 @@ nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
(rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
return (0);
- gar.n4g_va.va_mask = AT_ALL;
return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
}
@@ -12384,9 +12383,8 @@ nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
/*
* The getattr otw call will always get both the acl, in
* the form of a list of nfsace4's, and the number of acl
- * entries; independent of the value of gar.n4g_vsa.vsa_mask.
+ * entries; independent of the value of gar.n4g_va.va_mask.
*/
- gar.n4g_va.va_mask = AT_ALL;
error = nfs4_getattr_otw(vp, &gar, cr, 1);
if (error) {
vs_ace4_destroy(&gar.n4g_vsa);
diff --git a/usr/src/uts/common/fs/nfs/nfs_export.c b/usr/src/uts/common/fs/nfs/nfs_export.c
index 4c316a3876..200ef6668d 100644
--- a/usr/src/uts/common/fs/nfs/nfs_export.c
+++ b/usr/src/uts/common/fs/nfs/nfs_export.c
@@ -83,7 +83,7 @@ extern void sec_svc_freerootnames(int, int, caddr_t *);
static int build_seclist_nodups(exportdata_t *, secinfo_t *, int);
static void srv_secinfo_add(secinfo_t **, int *, secinfo_t *, int, int);
static void srv_secinfo_remove(secinfo_t **, int *, secinfo_t *, int);
-static void srv_secinfo_treeclimb(exportinfo_t *, secinfo_t *, int, int);
+static void srv_secinfo_treeclimb(exportinfo_t *, secinfo_t *, int, bool_t);
#ifdef VOLATILE_FH_TEST
static struct ex_vol_rename *find_volrnm_fh(exportinfo_t *, nfs_fh4 *);
@@ -703,12 +703,13 @@ vis2exi(treenode_t *tnode)
* given exportinfo from its ancestors upto the system root.
*/
void
-srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt, int isadd)
+srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt,
+ bool_t isadd)
{
treenode_t *tnode = exip->exi_tree;
ASSERT(RW_WRITE_HELD(&exported_lock));
- ASSERT(tnode);
+ ASSERT(tnode != NULL);
if (seccnt == 0)
return;
@@ -716,7 +717,7 @@ srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt, int isadd)
/*
* If flavors are being added and the new export root isn't
* also VROOT, its implicitly allowed flavors are inherited from
- * from its pseudonode.
+ * its pseudonode.
* Note - for VROOT exports the implicitly allowed flavors were
* transferred from the PSEUDO export in exportfs()
*/
@@ -733,10 +734,10 @@ srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt, int isadd)
*/
tnode = tnode->tree_parent;
- while (tnode) {
+ while (tnode != NULL) {
/* If there is exportinfo, update it */
- if (tnode->tree_exi) {
+ if (tnode->tree_exi != NULL) {
secinfo_t **pxsec =
&tnode->tree_exi->exi_export.ex_secinfo;
int *pxcnt = &tnode->tree_exi->exi_export.ex_seccnt;
@@ -749,7 +750,7 @@ srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt, int isadd)
}
/* Update every visible - only root node has no visible */
- if (tnode->tree_vis) {
+ if (tnode->tree_vis != NULL) {
secinfo_t **pxsec = &tnode->tree_vis->vis_secinfo;
int *pxcnt = &tnode->tree_vis->vis_seccnt;
if (isadd)
@@ -1517,9 +1518,12 @@ exportfs(struct exportfs_args *args, model_t model, cred_t *cr)
if (error)
goto out7;
} else {
- /* If it's a re-export update namespace tree */
+ /* If it's a re-export update namespace tree */
exi->exi_tree = ex->exi_tree;
exi->exi_tree->tree_exi = exi;
+
+ /* Update the change timestamp */
+ tree_update_change(exi->exi_tree, NULL);
}
/*
@@ -1670,7 +1674,7 @@ unexport(struct exportinfo *exi)
* a pseudo export here to retain the visible list
* for paths to exports below.
*/
- if (exi->exi_visible) {
+ if (exi->exi_visible != NULL) {
struct exportinfo *newexi;
newexi = pseudo_exportfs(exi->exi_vp, &exi->exi_fid,
@@ -1680,6 +1684,9 @@ unexport(struct exportinfo *exi)
/* interconnect the existing treenode with the new exportinfo */
newexi->exi_tree = exi->exi_tree;
newexi->exi_tree->tree_exi = newexi;
+
+ /* Update the change timestamp */
+ tree_update_change(exi->exi_tree, NULL);
} else {
treeclimb_unexport(exi);
}
@@ -1893,7 +1900,7 @@ nfs_getfh(struct nfs_getfh_args *args, model_t model, cred_t *cr)
*/
struct exportinfo *
nfs_vptoexi(vnode_t *dvp, vnode_t *vp, cred_t *cr, int *walk,
- int *err, bool_t v4srv)
+ int *err, bool_t v4srv)
{
fid_t fid;
int error;
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index c4b8d2acc6..f3f6c818a0 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -77,10 +77,10 @@
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
- * uses method 1, while the internal arc algorithms for
+ * uses method 1, while the internal ARC algorithms for
* adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
- * arc list locks.
+ * ARC list locks.
*
* Buffers do not have their own mutexes, rather they rely on the
* hash table mutexes for the bulk of their protection (i.e. most
@@ -93,21 +93,12 @@
* buf_hash_remove() expects the appropriate hash mutex to be
* already held before it is invoked.
*
- * Each arc state also has a mutex which is used to protect the
+ * Each ARC state also has a mutex which is used to protect the
* buffer list associated with the state. When attempting to
- * obtain a hash table lock while holding an arc list lock you
+ * obtain a hash table lock while holding an ARC list lock you
* must use: mutex_tryenter() to avoid deadlock. Also note that
* the active state mutex must be held before the ghost state mutex.
*
- * Arc buffers may have an associated eviction callback function.
- * This function will be invoked prior to removing the buffer (e.g.
- * in arc_do_user_evicts()). Note however that the data associated
- * with the buffer may be evicted prior to the callback. The callback
- * must be made with *no locks held* (to prevent deadlock). Additionally,
- * the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_clear_callback()
- * and arc_do_user_evicts().
- *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
@@ -136,67 +127,81 @@
* are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
* the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
- * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
* also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
- * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
- * consumer, and always contains uncompressed data. The ARC will provide
- * references to this data and will keep it cached until it is no longer in
- * use. Typically, the arc will try to cache only the L1ARC's physical data
- * block and will aggressively evict any arc_buf_t that is no longer referenced.
- * The amount of memory consumed by the arc_buf_t's can be seen via the
+ *
+ * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
+ * ability to store the physical data (b_pdata) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block,
+ * it will match its on-disk compression characteristics. This behavior can be
+ * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pdata will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
+ * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
+ * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer. The ARC will provide references to this data and will keep it
+ * cached until it is no longer in use. The ARC caches only the L1ARC's physical
+ * data block and will evict any arc_buf_t that is no longer referenced. The
+ * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
* "overhead_size" kstat.
*
+ * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
+ * compressed form. The typical case is that consumers will want uncompressed
+ * data, and when that happens a new data buffer is allocated where the data is
+ * decompressed for them to use. Currently the only consumer who wants
+ * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
+ * exists on disk. When this happens, the arc_buf_t's data buffer is shared
+ * with the arc_buf_hdr_t.
*
- * arc_buf_hdr_t
- * +-----------+
- * | |
- * | |
- * | |
- * +-----------+
- * l2arc_buf_hdr_t| |
- * | |
- * +-----------+
- * l1arc_buf_hdr_t| |
- * | | arc_buf_t
- * | b_buf +------------>+---------+ arc_buf_t
- * | | |b_next +---->+---------+
- * | b_pdata +-+ |---------| |b_next +-->NULL
- * +-----------+ | | | +---------+
- * | |b_data +-+ | |
- * | +---------+ | |b_data +-+
- * +->+------+ | +---------+ |
- * (potentially) | | | |
- * compressed | | | |
- * data +------+ | v
- * +->+------+ +------+
- * uncompressed | | | |
- * data | | | |
- * +------+ +------+
+ * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
+ * first one is owned by a compressed send consumer (and therefore references
+ * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
+ * used by any other consumer (and has its own uncompressed copy of the data
+ * buffer).
*
- * The L1ARC's data pointer, however, may or may not be uncompressed. The
- * ARC has the ability to store the physical data (b_pdata) associated with
- * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
- * physical block, it will match its on-disk compression characteristics.
- * If the block on-disk is compressed, then the physical data block
- * in the cache will also be compressed and vice-versa. This behavior
- * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
- * compressed ARC functionality is disabled, the b_pdata will point to an
- * uncompressed version of the on-disk data.
+ * arc_buf_hdr_t
+ * +-----------+
+ * | fields |
+ * | common to |
+ * | L1- and |
+ * | L2ARC |
+ * +-----------+
+ * | l2arc_buf_hdr_t
+ * | |
+ * +-----------+
+ * | l1arc_buf_hdr_t
+ * | | arc_buf_t
+ * | b_buf +------------>+-----------+ arc_buf_t
+ * | b_pdata +-+ |b_next +---->+-----------+
+ * +-----------+ | |-----------| |b_next +-->NULL
+ * | |b_comp = T | +-----------+
+ * | |b_data +-+ |b_comp = F |
+ * | +-----------+ | |b_data +-+
+ * +->+------+ | +-----------+ |
+ * compressed | | | |
+ * data | |<--------------+ | uncompressed
+ * +------+ compressed, | data
+ * shared +-->+------+
+ * data | |
+ * | |
+ * +------+
*
* When a consumer reads a block, the ARC must first look to see if the
- * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
- * then an additional arc_buf_t is allocated and the uncompressed data is
- * bcopied from the existing arc_buf_t. If the hdr is cached but does not
- * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
- * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
- * b_pdata is not compressed, then the block is shared with the newly
- * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
- * in the arc buffer chain. Sharing the block reduces the memory overhead
- * required when the hdr is caching uncompressed blocks or the compressed
- * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
+ * arc_buf_t and either copies uncompressed data into a new data buffer from an
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a
+ * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the
+ * hdr is compressed and the desired compression characteristics of the
+ * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
+ * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
+ * the last buffer in the hdr's b_buf list, however a shared compressed buf can
+ * be anywhere in the hdr's list.
*
* The diagram below shows an example of an uncompressed ARC hdr that is
- * sharing its data with an arc_buf_t:
+ * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
+ * the last element in the buf list):
*
* arc_buf_hdr_t
* +-----------+
@@ -225,20 +230,24 @@
* | +------+ |
* +---------------------------------+
*
- * Writing to the arc requires that the ARC first discard the b_pdata
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pdata
* since the physical block is about to be rewritten. The new data contents
- * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
- * performs the write, it may compress the data before writing it to disk.
- * The ARC will be called with the transformed data and will bcopy the
- * transformed on-disk block into a newly allocated b_pdata.
+ * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
+ * it may compress the data before writing it to disk. The ARC will be called
+ * with the transformed data and will bcopy the transformed on-disk block into
+ * a newly allocated b_pdata. Writes are always done into buffers which have
+ * either been loaned (and hence are new and don't have other readers) or
+ * buffers which have been released (and hence have their own hdr, if there
+ * were originally other readers of the buf's original hdr). This ensures that
+ * the ARC only needs to update a single buf and its hdr after a write occurs.
*
* When the L2ARC is in use, it will also take advantage of the b_pdata. The
* L2ARC will always write the contents of b_pdata to the L2ARC. This means
- * that when compressed arc is enabled that the L2ARC blocks are identical
+ * that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the
* L2ARC to determine if the contents are valid. However, if the compressed
- * arc is disabled, then the L2ARC's block must be transformed to look
+ * ARC is disabled, then the L2ARC's block must be transformed to look
* like the physical block in the main data pool before comparing the
* checksum and determining its validity.
*/
@@ -805,6 +814,7 @@ struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
arc_buf_t *acb_buf;
+ boolean_t acb_compressed;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
};
@@ -856,7 +866,7 @@ typedef struct l1arc_buf_hdr {
zio_cksum_t *b_freeze_cksum;
#ifdef ZFS_DEBUG
/*
- * used for debugging wtih kmem_flags - by allocating and freeing
+ * Used for debugging with kmem_flags - by allocating and freeing
* b_thawed when the buffer is thawed, we get a record of the stack
* trace that thawed it.
*/
@@ -971,6 +981,8 @@ struct arc_buf_hdr {
HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
+#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
/*
* Other sizes
@@ -1065,7 +1077,7 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
- arc_buf_hdr_t *l2rcb_hdr; /* read buffer */
+ arc_buf_hdr_t *l2rcb_hdr; /* read header */
blkptr_t l2rcb_bp; /* original blkptr */
zbookmark_phys_t l2rcb_zb; /* original bookmark */
int l2rcb_flags; /* original flags */
@@ -1400,6 +1412,31 @@ retry:
}
}
+/*
+ * This is the size that the buf occupies in memory. If the buf is compressed,
+ * it will correspond to the compressed size. You should use this method of
+ * getting the buf size unless you explicitly need the logical size.
+ */
+int32_t
+arc_buf_size(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
+}
+
+int32_t
+arc_buf_lsize(arc_buf_t *buf)
+{
+ return (HDR_GET_LSIZE(buf->b_hdr));
+}
+
+enum zio_compress
+arc_get_compression(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
+}
+
#define ARC_MINTIME (hz>>4) /* 62 ms */
static inline boolean_t
@@ -1408,9 +1445,21 @@ arc_buf_is_shared(arc_buf_t *buf)
boolean_t shared = (buf->b_data != NULL &&
buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ IMPLY(shared, ARC_BUF_SHARED(buf));
+ IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
+
+ /*
+ * It would be nice to assert arc_can_share() too, but the "hdr isn't
+ * already being shared" requirement prevents us from doing that.
+ */
+
return (shared);
}
+/*
+ * Free the checksum associated with this header. If there is no checksum, this
+ * is a no-op.
+ */
static inline void
arc_cksum_free(arc_buf_hdr_t *hdr)
{
@@ -1423,6 +1472,25 @@ arc_cksum_free(arc_buf_hdr_t *hdr)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
+/*
+ * Return true iff at least one of the bufs on hdr is not compressed.
+ */
+static boolean_t
+arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
+{
+ for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
+ if (!ARC_BUF_COMPRESSED(b)) {
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
+ * matches the checksum that is stored in the hdr. If there is no checksum,
+ * or if the buf is compressed, this is a no-op.
+ */
static void
arc_cksum_verify(arc_buf_t *buf)
{
@@ -1432,6 +1500,12 @@ arc_cksum_verify(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
@@ -1439,7 +1513,8 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
- fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
panic("buffer modified while frozen!");
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
@@ -1513,6 +1588,12 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
return (valid_cksum);
}
+/*
+ * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
+ * checksum and attaches it to the buf's hdr so that we can ensure that the buf
+ * isn't modified later on. If buf is compressed or there is already a checksum
+ * on the hdr, this is a no-op (we only checksum uncompressed bufs).
+ */
static void
arc_cksum_compute(arc_buf_t *buf)
{
@@ -1522,14 +1603,21 @@ arc_cksum_compute(arc_buf_t *buf)
return;
ASSERT(HDR_HAS_L1HDR(hdr));
+
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ ASSERT(arc_hdr_has_uncompressed_buf(hdr));
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ } else if (ARC_BUF_COMPRESSED(buf)) {
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
+
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
KM_SLEEP);
- fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
hdr->b_l1hdr.b_freeze_cksum);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
arc_buf_watch(buf);
@@ -1570,7 +1658,7 @@ arc_buf_watch(arc_buf_t *buf)
procctl_t ctl;
ctl.cmd = PCWATCH;
ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
+ ctl.prwatch.pr_size = arc_buf_size(buf);
ctl.prwatch.pr_wflags = WA_WRITE;
result = write(arc_procfd, &ctl, sizeof (ctl));
ASSERT3U(result, ==, sizeof (ctl));
@@ -1591,6 +1679,12 @@ arc_buf_type(arc_buf_hdr_t *hdr)
return (type);
}
+boolean_t
+arc_is_metadata(arc_buf_t *buf)
+{
+ return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
+}
+
static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)
{
@@ -1612,12 +1706,19 @@ arc_buf_thaw(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (hdr->b_l1hdr.b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (HDR_IO_IN_PROGRESS(hdr))
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ arc_cksum_verify(buf);
+
+ /*
+ * Compressed buffers do not manipulate the b_freeze_cksum or
+ * allocate b_thawed.
+ */
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
}
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1646,6 +1747,12 @@ arc_buf_freeze(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
@@ -1654,7 +1761,6 @@ arc_buf_freeze(arc_buf_t *buf)
hdr->b_l1hdr.b_state == arc_anon);
arc_cksum_compute(buf);
mutex_exit(hash_lock);
-
}
/*
@@ -1711,47 +1817,157 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
}
}
+/*
+ * Looks for another buf on the same hdr which has the data decompressed, copies
+ * from it, and returns true. If no such buf exists, returns false.
+ */
+static boolean_t
+arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t copied = B_FALSE;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+
+ for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
+ from = from->b_next) {
+ /* can't use our own data buffer */
+ if (from == buf) {
+ continue;
+ }
+
+ if (!ARC_BUF_COMPRESSED(from)) {
+ bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
+ copied = B_TRUE;
+ break;
+ }
+ }
+
+ /*
+ * There were no decompressed bufs, so there should not be a
+ * checksum on the hdr either.
+ */
+ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+
+ return (copied);
+}
+
+/*
+ * Given a buf that has a data buffer attached to it, this function will
+ * efficiently fill the buf with data of the specified compression setting from
+ * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
+ * are already sharing a data buf, no copy is performed.
+ *
+ * If the buf is marked as compressed but uncompressed data was requested, this
+ * will allocate a new data buffer for the buf, remove that flag, and fill the
+ * buf with uncompressed data. You can't request a compressed buf on a hdr with
+ * uncompressed data, and (since we haven't added support for it yet) if you
+ * want compressed data your buf must already be marked as compressed and have
+ * the correct-sized data buffer.
+ */
static int
-arc_decompress(arc_buf_t *buf)
+arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
- int error;
- if (arc_buf_is_shared(buf)) {
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
- /*
- * The arc_buf_hdr_t is either not compressed or is
- * associated with an embedded block or a hole in which
- * case they remain anonymous.
- */
- IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 ||
- HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
- ASSERT(!HDR_SHARED_DATA(hdr));
- bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ IMPLY(compressed, hdr_compressed);
+ IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
+
+ if (hdr_compressed == compressed) {
+ if (!arc_buf_is_shared(buf)) {
+ bcopy(hdr->b_l1hdr.b_pdata, buf->b_data,
+ arc_buf_size(buf));
+ }
} else {
- ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(hdr_compressed);
+ ASSERT(!compressed);
ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
- error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
- hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
- HDR_GET_LSIZE(hdr));
- if (error != 0) {
- zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
- hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
- HDR_GET_LSIZE(hdr));
- return (SET_ERROR(EIO));
+
+ /*
+ * If the buf is sharing its data with the hdr, unlink it and
+ * allocate a new data buffer for the buf.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+
+			/* We need to give the buf its own b_data */
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+
+ /* Previously overhead was 0; just add new overhead */
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ /* We need to reallocate the buf's b_data */
+ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
+ buf);
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+
+ /* We increased the size of b_data; update overhead */
+ ARCSTAT_INCR(arcstat_overhead_size,
+ HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * Regardless of the buf's previous compression settings, it
+ * should not be compressed at the end of this function.
+ */
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * Try copying the data from another buf which already has a
+ * decompressed version. If that's not possible, it's time to
+ * bite the bullet and decompress the data from the hdr.
+ */
+ if (arc_buf_try_copy_decompressed_data(buf)) {
+ /* Skip byteswapping and checksumming (already done) */
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
+ return (0);
+ } else {
+ int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pdata, buf->b_data,
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+
+ /*
+ * Absent hardware errors or software bugs, this should
+ * be impossible, but log it anyway so we can debug it.
+ */
+ if (error != 0) {
+ zfs_dbgmsg(
+ "hdr %p, compress %d, psize %d, lsize %d",
+ hdr, HDR_GET_COMPRESS(hdr),
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+ return (SET_ERROR(EIO));
+ }
}
}
+
+ /* Byteswap the buf's data if necessary */
if (bswap != DMU_BSWAP_NUMFUNCS) {
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
}
+
+ /* Compute the hdr's checksum if necessary */
arc_cksum_compute(buf);
+
return (0);
}
+int
+arc_decompress(arc_buf_t *buf)
+{
+ return (arc_buf_fill(buf, B_FALSE));
+}
+
/*
* Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
*/
@@ -1779,7 +1995,6 @@ static void
arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1787,7 +2002,8 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
- (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr);
+ (void) refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
return;
}
@@ -1798,11 +2014,10 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
}
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
- (void) refcount_add_many(&state->arcs_esize[type], lsize, buf);
+ (void) refcount_add_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
}
}
@@ -1812,10 +2027,9 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
* so that we can add and remove them from the refcount individually.
*/
static void
-arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1824,7 +2038,7 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
(void) refcount_remove_many(&state->arcs_esize[type],
- lsize, hdr);
+ HDR_GET_LSIZE(hdr), hdr);
return;
}
@@ -1835,12 +2049,10 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
}
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_remove_many(&state->arcs_esize[type],
- lsize, buf);
+ arc_buf_size(buf), buf);
}
}
@@ -1868,7 +2080,7 @@ add_reference(arc_buf_hdr_t *hdr, void *tag)
if (state != arc_l2c_only) {
multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
hdr);
- arc_evitable_space_decrement(hdr, state);
+ arc_evictable_space_decrement(hdr, state);
}
/* remove the prefetch flag if we get a reference */
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
@@ -1956,7 +2168,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
update_old = B_TRUE;
}
- arc_evitable_space_decrement(hdr, old_state);
+ arc_evictable_space_decrement(hdr, old_state);
}
if (new_state != arc_anon && new_state != arc_l2c_only) {
@@ -2019,13 +2231,11 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* add to the refcount if the arc_buf_t is
* not shared.
*/
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_add_many(&new_state->arcs_size,
- HDR_GET_LSIZE(hdr), buf);
+ arc_buf_size(buf), buf);
}
ASSERT3U(bufcnt, ==, buffers);
@@ -2042,6 +2252,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
/*
* When moving a header off of a ghost state,
@@ -2053,7 +2264,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
(void) refcount_remove_many(&old_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
} else {
uint32_t buffers = 0;
@@ -2064,7 +2274,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- ASSERT3P(bufcnt, !=, 0);
+ ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
@@ -2074,13 +2284,11 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* add to the refcount if the arc_buf_t is
* not shared.
*/
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_remove_many(
- &old_state->arcs_size, HDR_GET_LSIZE(hdr),
+ &old_state->arcs_size, arc_buf_size(buf),
buf);
}
ASSERT3U(bufcnt, ==, buffers);
@@ -2165,11 +2373,50 @@ arc_space_return(uint64_t space, arc_space_type_t type)
}
/*
- * Allocate an initial buffer for this hdr, subsequent buffers will
- * use arc_buf_clone().
+ * Given a hdr and a buf, returns whether that buf can share its b_data buffer
+ * with the hdr's b_pdata.
*/
-static arc_buf_t *
-arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
+static boolean_t
+arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ /*
+ * The criteria for sharing a hdr's data are:
+ * 1. the hdr's compression matches the buf's compression
+ * 2. the hdr doesn't need to be byteswapped
+ * 3. the hdr isn't already being shared
+ * 4. the buf is either compressed or it is the last buf in the hdr list
+ *
+ * Criterion #4 maintains the invariant that shared uncompressed
+ * bufs must be the final buf in the hdr's b_buf list. Reading this, you
+ * might ask, "if a compressed buf is allocated first, won't that be the
+ * last thing in the list?", but in that case it's impossible to create
+ * a shared uncompressed buf anyway (because the hdr must be compressed
+ * to have the compressed buf). You might also think that #3 is
+ * sufficient to make this guarantee, however it's possible
+ * (specifically in the rare L2ARC write race mentioned in
+ * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
+ * is sharable, but wasn't at the time of its allocation. Rather than
+ * allow a new shared uncompressed buf to be created and then shuffle
+ * the list around to make it the last element, this simply disallows
+ * sharing if the new buf isn't the first to be added.
+ */
+ ASSERT3P(buf->b_hdr, ==, hdr);
+ boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
+ boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
+ return (buf_compressed == hdr_compressed &&
+ hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+ !HDR_SHARED_DATA(hdr) &&
+ (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
+}
+
+/*
+ * Allocate a buf for this hdr. If you care about the data that's in the hdr,
+ * or if you want a compressed buffer, pass those flags in. Returns 0 if the
+ * copy was made successfully, or an error code otherwise.
+ */
+static int
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
+ boolean_t fill, arc_buf_t **ret)
{
arc_buf_t *buf;
@@ -2177,15 +2424,14 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
VERIFY(hdr->b_type == ARC_BUFC_DATA ||
hdr->b_type == ARC_BUFC_METADATA);
+ ASSERT3P(ret, !=, NULL);
+ ASSERT3P(*ret, ==, NULL);
- ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
-
- buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
- buf->b_next = NULL;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ buf->b_flags = 0;
add_reference(hdr, tag);
@@ -2196,58 +2442,63 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
- * If the hdr's data can be shared (no byteswapping, hdr is
- * uncompressed, hdr's data is not currently being written to the
- * L2ARC write) then we share the data buffer and set the appropriate
- * bit in the hdr's b_flags to indicate the hdr is sharing it's
- * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to
- * store the buf's data.
+ * Only honor requests for compressed bufs if the hdr is actually
+ * compressed.
*/
- if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
- HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) {
+ if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * If the hdr's data can be shared then we share the data buffer and
+ * set the appropriate bit in the hdr's b_flags to indicate the hdr is
+	 * sharing its b_pdata with the arc_buf_t. Otherwise, we allocate a new
+ * buffer to store the buf's data.
+ *
+ * There is one additional restriction here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively
+ * involved in an L2ARC write, because if this buf is used by an
+ * arc_write() then the hdr's data buffer will be released when the
+ * write completes, even though the L2ARC write might still be using it.
+ */
+ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr);
+
+ /* Set up b_data and sharing */
+ if (can_share) {
buf->b_data = hdr->b_l1hdr.b_pdata;
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
- buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_data =
+ arc_get_data_buf(hdr, arc_buf_size(buf), buf);
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
VERIFY3P(buf->b_data, !=, NULL);
hdr->b_l1hdr.b_buf = buf;
hdr->b_l1hdr.b_bufcnt += 1;
- return (buf);
-}
+ /*
+ * If the user wants the data from the hdr, we need to either copy or
+ * decompress the data.
+ */
+ if (fill) {
+ return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
+ }
-/*
- * Used when allocating additional buffers.
- */
-static arc_buf_t *
-arc_buf_clone(arc_buf_t *from)
-{
- arc_buf_t *buf;
- arc_buf_hdr_t *hdr = from->b_hdr;
- uint64_t size = HDR_GET_LSIZE(hdr);
+ return (0);
+}
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(hdr->b_l1hdr.b_state != arc_anon);
+static char *arc_onloan_tag = "onloan";
- buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_next = hdr->b_l1hdr.b_buf;
- hdr->b_l1hdr.b_buf = buf;
- buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
- bcopy(from->b_data, buf->b_data, size);
- hdr->b_l1hdr.b_bufcnt += 1;
+static inline void
+arc_loaned_bytes_update(int64_t delta)
+{
+ atomic_add_64(&arc_loaned_bytes, delta);
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
- return (buf);
+ /* assert that it did not wrap around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
-static char *arc_onloan_tag = "onloan";
-
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
@@ -2255,16 +2506,29 @@ static char *arc_onloan_tag = "onloan";
* freed.
*/
arc_buf_t *
-arc_loan_buf(spa_t *spa, int size)
+arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
{
- arc_buf_t *buf;
+ arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
+ is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
+
+ arc_loaned_bytes_update(size);
+
+ return (buf);
+}
- buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+arc_buf_t *
+arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
+ psize, lsize, compression_type);
+
+ arc_loaned_bytes_update(psize);
- atomic_add_64(&arc_loaned_bytes, size);
return (buf);
}
+
/*
* Return a loaned arc buffer to the arc.
*/
@@ -2278,7 +2542,7 @@ arc_return_buf(arc_buf_t *buf, void *tag)
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
- atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr));
+ arc_loaned_bytes_update(-arc_buf_size(buf));
}
/* Detach an arc_buf from a dbuf (tag) */
@@ -2292,7 +2556,7 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
- atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr));
+ arc_loaned_bytes_update(arc_buf_size(buf));
}
static void
@@ -2338,8 +2602,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
- ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT(!arc_buf_is_shared(buf));
+ ASSERT(arc_can_share(hdr, buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
@@ -2351,6 +2614,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
refcount_transfer_ownership(&state->arcs_size, buf, hdr);
hdr->b_l1hdr.b_pdata = buf->b_data;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
/*
* Since we've transferred ownership to the hdr we need
@@ -2359,7 +2623,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/
ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
}
static void
@@ -2367,7 +2631,6 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
- ASSERT(HDR_SHARED_DATA(hdr));
ASSERT(arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
@@ -2379,6 +2642,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
refcount_transfer_ownership(&state->arcs_size, hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
hdr->b_l1hdr.b_pdata = NULL;
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
/*
* Since the buffer is no longer shared between
@@ -2386,26 +2650,63 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/
ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
/*
- * Free up buf->b_data and if 'remove' is set, then pull the
- * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
+ * Remove an arc_buf_t from the hdr's buf list and return the last
+ * arc_buf_t on the list. If no buffers remain on the list then return
+ * NULL.
+ */
+static arc_buf_t *
+arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
+ arc_buf_t *lastbuf = NULL;
+
+ /*
+ * Remove the buf from the hdr list and locate the last
+ * remaining buffer on the list.
+ */
+ while (*bufp != NULL) {
+ if (*bufp == buf)
+ *bufp = buf->b_next;
+
+ /*
+ * If we've removed a buffer in the middle of
+ * the list then update the lastbuf and update
+ * bufp.
+ */
+ if (*bufp != NULL) {
+ lastbuf = *bufp;
+ bufp = &(*bufp)->b_next;
+ }
+ }
+ buf->b_next = NULL;
+ ASSERT3P(lastbuf, !=, buf);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
+ IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
+
+ return (lastbuf);
+}
+
+/*
+ * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
+ * list and free it.
*/
static void
-arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
+arc_buf_destroy_impl(arc_buf_t *buf)
{
- arc_buf_t **bufp;
arc_buf_hdr_t *hdr = buf->b_hdr;
- uint64_t size = HDR_GET_LSIZE(hdr);
- boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf);
/*
- * Free up the data associated with the buf but only
- * if we're not sharing this with the hdr. If we are sharing
- * it with the hdr, then hdr will have performed the allocation
- * so allow it to do the free.
+ * Free up the data associated with the buf but only if we're not
+ * sharing this with the hdr. If we are sharing it with the hdr, the
+ * hdr is responsible for doing the free.
*/
if (buf->b_data != NULL) {
/*
@@ -2417,11 +2718,10 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
- if (destroyed_buf_is_shared) {
- ASSERT(ARC_BUF_LAST(buf));
- ASSERT(HDR_SHARED_DATA(hdr));
+ if (arc_buf_is_shared(buf)) {
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
+ uint64_t size = arc_buf_size(buf);
arc_free_data_buf(hdr, buf->b_data, size, buf);
ARCSTAT_INCR(arcstat_overhead_size, -size);
}
@@ -2431,58 +2731,58 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
hdr->b_l1hdr.b_bufcnt -= 1;
}
- /* only remove the buf if requested */
- if (!remove)
- return;
-
- /* remove the buf from the hdr list */
- arc_buf_t *lastbuf = NULL;
- bufp = &hdr->b_l1hdr.b_buf;
- while (*bufp != NULL) {
- if (*bufp == buf)
- *bufp = buf->b_next;
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+ if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
/*
- * If we've removed a buffer in the middle of
- * the list then update the lastbuf and update
- * bufp.
+ * If the current arc_buf_t is sharing its data buffer with the
+ * hdr, then reassign the hdr's b_pdata to share it with the new
+ * buffer at the end of the list. The shared buffer is always
+ * the last one on the hdr's buffer list.
+ *
+ * There is an equivalent case for compressed bufs, but since
+ * they aren't guaranteed to be the last buf in the list and
+ * that is an exceedingly rare case, we just allow that space be
+ * wasted temporarily.
*/
- if (*bufp != NULL) {
- lastbuf = *bufp;
- bufp = &(*bufp)->b_next;
- }
- }
- buf->b_next = NULL;
- ASSERT3P(lastbuf, !=, buf);
-
- /*
- * If the current arc_buf_t is sharing its data
- * buffer with the hdr, then reassign the hdr's
- * b_pdata to share it with the new buffer at the end
- * of the list. The shared buffer is always the last one
- * on the hdr's buffer list.
- */
- if (destroyed_buf_is_shared && lastbuf != NULL) {
- ASSERT(ARC_BUF_LAST(buf));
- ASSERT(ARC_BUF_LAST(lastbuf));
- VERIFY(!arc_buf_is_shared(lastbuf));
+ if (lastbuf != NULL) {
+ /* Only one buf can be shared at once */
+ VERIFY(!arc_buf_is_shared(lastbuf));
+ /* hdr is uncompressed so can't have compressed buf */
+ VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
- arc_hdr_free_pdata(hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ arc_hdr_free_pdata(hdr);
+ /*
+ * We must setup a new shared block between the
+ * last buffer and the hdr. The data would have
+ * been allocated by the arc buf so we need to transfer
+ * ownership to the hdr since it's now being shared.
+ */
+ arc_share_buf(hdr, lastbuf);
+ }
+ } else if (HDR_SHARED_DATA(hdr)) {
/*
- * We must setup a new shared block between the
- * last buffer and the hdr. The data would have
- * been allocated by the arc buf so we need to transfer
- * ownership to the hdr since it's now being shared.
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
*/
- arc_share_buf(hdr, lastbuf);
- } else if (HDR_SHARED_DATA(hdr)) {
- ASSERT(arc_buf_is_shared(lastbuf));
+ ASSERT3P(lastbuf, !=, NULL);
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
}
- if (hdr->b_l1hdr.b_bufcnt == 0)
+ /*
+ * Free the checksum if we're removing the last uncompressed buf from
+ * this hdr.
+ */
+ if (!arc_hdr_has_uncompressed_buf(hdr)) {
arc_cksum_free(hdr);
+ }
/* clean up the buf */
buf->b_hdr = NULL;
@@ -2533,11 +2833,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
- enum zio_compress compress, arc_buf_contents_t type)
+ enum zio_compress compression_type, arc_buf_contents_t type)
{
arc_buf_hdr_t *hdr;
- ASSERT3U(lsize, >, 0);
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
@@ -2550,7 +2849,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
hdr->b_type = type;
hdr->b_flags = 0;
arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
- arc_hdr_set_compress(hdr, compress);
+ arc_hdr_set_compress(hdr, compression_type);
hdr->b_l1hdr.b_state = arc_anon;
hdr->b_l1hdr.b_arc_access = 0;
@@ -2679,13 +2978,41 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
* The buf is returned thawed since we expect the consumer to modify it.
*/
arc_buf_t *
-arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
+arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
ZIO_COMPRESS_OFF, type);
ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
- arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag);
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
arc_buf_thaw(buf);
+
+ return (buf);
+}
+
+/*
+ * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
+ * for bufs containing metadata.
+ */
+arc_buf_t *
+arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ ASSERT3U(lsize, >, 0);
+ ASSERT3U(lsize, >=, psize);
+ ASSERT(compression_type > ZIO_COMPRESS_OFF);
+ ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
+
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ compression_type, ARC_BUFC_DATA);
+ ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
return (buf);
}
@@ -2752,7 +3079,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
arc_cksum_free(hdr);
while (hdr->b_l1hdr.b_buf != NULL)
- arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE);
+ arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
#ifdef ZFS_DEBUG
if (hdr->b_l1hdr.b_thawed != NULL) {
@@ -2798,16 +3125,10 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
ASSERT3P(buf->b_data, !=, NULL);
(void) remove_reference(hdr, hash_lock, tag);
- arc_buf_destroy_impl(buf, B_TRUE);
+ arc_buf_destroy_impl(buf);
mutex_exit(hash_lock);
}
-int32_t
-arc_buf_size(arc_buf_t *buf)
-{
- return (HDR_GET_LSIZE(buf->b_hdr));
-}
-
/*
* Evict the arc_buf_hdr that is provided as a parameter. The resultant
* state of the header is dependent on it's state prior to entering this
@@ -2853,7 +3174,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
if (HDR_HAS_L2HDR(hdr)) {
- ASSERT(hdr->b_l1hdr.b_pdata == NULL);
/*
* This buffer is cached on the 2nd Level ARC;
* don't destroy the header.
@@ -2866,7 +3186,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
hdr = arc_hdr_realloc(hdr, hdr_full_cache,
hdr_l2only_cache);
} else {
- ASSERT(hdr->b_l1hdr.b_pdata == NULL);
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
}
@@ -2895,7 +3214,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
if (buf->b_data != NULL)
bytes_evicted += HDR_GET_LSIZE(hdr);
mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy_impl(buf, B_TRUE);
+ arc_buf_destroy_impl(buf);
}
if (HDR_HAS_L2HDR(hdr)) {
@@ -3244,7 +3563,7 @@ arc_adjust_meta(void)
/*
* Similar to the above, we want to evict enough bytes to get us
* below the meta limit, but not so much as to drop us below the
- * space alloted to the MFU (which is defined as arc_c - arc_p).
+ * space allotted to the MFU (which is defined as arc_c - arc_p).
*/
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
(int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
@@ -4197,7 +4516,7 @@ void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
if (zio == NULL || zio->io_error == 0)
- bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr));
+ bcopy(buf->b_data, arg, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
@@ -4235,10 +4554,11 @@ static void
arc_read_done(zio_t *zio)
{
arc_buf_hdr_t *hdr = zio->io_private;
- arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */
kmutex_t *hash_lock = NULL;
- arc_callback_t *callback_list, *acb;
- int freeable = B_FALSE;
+ arc_callback_t *callback_list;
+ arc_callback_t *acb;
+ boolean_t freeable = B_FALSE;
+ boolean_t no_zio_error = (zio->io_error == 0);
/*
* The hdr was inserted into hash-table and removed from lists
@@ -4264,7 +4584,7 @@ arc_read_done(zio_t *zio)
ASSERT3P(hash_lock, !=, NULL);
}
- if (zio->io_error == 0) {
+ if (no_zio_error) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -4285,8 +4605,7 @@ arc_read_done(zio_t *zio)
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
- if (hash_lock && zio->io_error == 0 &&
- hdr->b_l1hdr.b_state == arc_anon) {
+ if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -4296,39 +4615,29 @@ arc_read_done(zio_t *zio)
arc_access(hdr, hash_lock);
}
- /* create copies of the data buffer for the callers */
- for (acb = callback_list; acb; acb = acb->acb_next) {
- if (acb->acb_done != NULL) {
- /*
- * If we're here, then this must be a demand read
- * since prefetch requests don't have callbacks.
- * If a read request has a callback (i.e. acb_done is
- * not NULL), then we decompress the data for the
- * first request and clone the rest. This avoids
- * having to waste cpu resources decompressing data
- * that nobody is explicitly waiting to read.
- */
- if (abuf == NULL) {
- acb->acb_buf = arc_buf_alloc_impl(hdr,
- acb->acb_private);
- if (zio->io_error == 0) {
- zio->io_error =
- arc_decompress(acb->acb_buf);
- }
- abuf = acb->acb_buf;
- } else {
- add_reference(hdr, acb->acb_private);
- acb->acb_buf = arc_buf_clone(abuf);
- }
+ /*
+ * If a read request has a callback (i.e. acb_done is not NULL), then we
+ * make a buf containing the data according to the parameters which were
+ * passed in. The implementation of arc_buf_alloc_impl() ensures that we
+ * aren't needlessly decompressing the data multiple times.
+ */
+ int callback_cnt = 0;
+ for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
+ if (!acb->acb_done)
+ continue;
+
+ /* This is a demand read since prefetches don't use callbacks */
+ callback_cnt++;
+
+ int error = arc_buf_alloc_impl(hdr, acb->acb_private,
+ acb->acb_compressed, no_zio_error, &acb->acb_buf);
+ if (no_zio_error) {
+ zio->io_error = error;
}
}
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- if (abuf == NULL) {
- /*
- * This buffer didn't have a callback so it must
- * be a prefetch.
- */
+ if (callback_cnt == 0) {
ASSERT(HDR_PREFETCH(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
@@ -4337,7 +4646,7 @@ arc_read_done(zio_t *zio)
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
callback_list != NULL);
- if (zio->io_error == 0) {
+ if (no_zio_error) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -4413,6 +4722,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
kmutex_t *hash_lock = NULL;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
+ boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
ASSERT(!BP_IS_EMBEDDED(bp) ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
@@ -4477,6 +4787,7 @@ top:
KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio,
spa, NULL, NULL, NULL, zio_flags);
@@ -4511,23 +4822,9 @@ top:
}
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
- /*
- * If this block is already in use, create a new
- * copy of the data so that we will be guaranteed
- * that arc_release() will always succeed.
- */
- buf = hdr->b_l1hdr.b_buf;
- if (buf == NULL) {
- ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
- buf = arc_buf_alloc_impl(hdr, private);
- VERIFY0(arc_decompress(buf));
- } else {
- add_reference(hdr, private);
- buf = arc_buf_clone(buf);
- }
- ASSERT3P(buf->b_data, !=, NULL);
-
+ /* Get a buf with the desired data in it. */
+ VERIFY0(arc_buf_alloc_impl(hdr, private,
+ compressed_read, B_TRUE, &buf));
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
@@ -4587,6 +4884,7 @@ top:
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
/*
* This is a delicate dance that we play here.
@@ -4627,6 +4925,7 @@ top:
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
hdr->b_l1hdr.b_acb = acb;
@@ -4873,7 +5172,7 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT3P(state, !=, arc_anon);
/* this buffer is not on any list */
- ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
+ ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
if (HDR_HAS_L2HDR(hdr)) {
mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
@@ -4897,7 +5196,6 @@ arc_release(arc_buf_t *buf, void *tag)
*/
if (hdr->b_l1hdr.b_bufcnt > 1) {
arc_buf_hdr_t *nhdr;
- arc_buf_t **bufp;
uint64_t spa = hdr->b_spa;
uint64_t psize = HDR_GET_PSIZE(hdr);
uint64_t lsize = HDR_GET_LSIZE(hdr);
@@ -4908,8 +5206,7 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
(void) remove_reference(hdr, hash_lock, tag);
- if (arc_buf_is_shared(buf)) {
- ASSERT(HDR_SHARED_DATA(hdr));
+ if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
ASSERT(ARC_BUF_LAST(buf));
}
@@ -4919,60 +5216,58 @@ arc_release(arc_buf_t *buf, void *tag)
* a new anonymous hdr. Also find the last buffer
* in the hdr's buffer list.
*/
- arc_buf_t *lastbuf = NULL;
- bufp = &hdr->b_l1hdr.b_buf;
- while (*bufp != NULL) {
- if (*bufp == buf) {
- *bufp = buf->b_next;
- }
-
- /*
- * If we've removed a buffer in the middle of
- * the list then update the lastbuf and update
- * bufp.
- */
- if (*bufp != NULL) {
- lastbuf = *bufp;
- bufp = &(*bufp)->b_next;
- }
- }
- buf->b_next = NULL;
- ASSERT3P(lastbuf, !=, buf);
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
ASSERT3P(lastbuf, !=, NULL);
/*
* If the current arc_buf_t and the hdr are sharing their data
- * buffer, then we must stop sharing that block, transfer
- * ownership and setup sharing with a new arc_buf_t at the end
- * of the hdr's b_buf list.
+ * buffer, then we must stop sharing that block.
*/
if (arc_buf_is_shared(buf)) {
- ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
- ASSERT(ARC_BUF_LAST(lastbuf));
VERIFY(!arc_buf_is_shared(lastbuf));
/*
* First, sever the block sharing relationship between
- * buf and the arc_buf_hdr_t. Then, setup a new
- * block sharing relationship with the last buffer
- * on the arc_buf_t list.
+ * buf and the arc_buf_hdr_t.
*/
arc_unshare_buf(hdr, buf);
- arc_share_buf(hdr, lastbuf);
+
+ /*
+ * Now we need to recreate the hdr's b_pdata. Since we
+ * have lastbuf handy, we try to share with it, but if
+ * we can't then we allocate a new b_pdata and copy the
+ * data from buf into it.
+ */
+ if (arc_can_share(hdr, lastbuf)) {
+ arc_share_buf(hdr, lastbuf);
+ } else {
+ arc_hdr_alloc_pdata(hdr);
+ bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize);
+ }
VERIFY3P(lastbuf->b_data, !=, NULL);
} else if (HDR_SHARED_DATA(hdr)) {
- ASSERT(arc_buf_is_shared(lastbuf));
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ ASSERT(!ARC_BUF_SHARED(buf));
}
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_size,
- HDR_GET_LSIZE(hdr), buf);
+ arc_buf_size(buf), buf);
if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_esize[type],
- HDR_GET_LSIZE(hdr), buf);
+ arc_buf_size(buf), buf);
}
hdr->b_l1hdr.b_bufcnt -= 1;
@@ -4999,7 +5294,7 @@ arc_release(arc_buf_t *buf, void *tag)
mutex_exit(&buf->b_evict_lock);
(void) refcount_add_many(&arc_anon->arcs_size,
- HDR_GET_LSIZE(nhdr), buf);
+ arc_buf_size(buf), buf);
} else {
mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
@@ -5055,15 +5350,13 @@ arc_write_ready(zio_t *zio)
/*
* If we're reexecuting this zio because the pool suspended, then
* cleanup any state that was previously set the first time the
- * callback as invoked.
+ * callback was invoked.
*/
if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
arc_cksum_free(hdr);
arc_buf_unwatch(buf);
if (hdr->b_l1hdr.b_pdata != NULL) {
if (arc_buf_is_shared(buf)) {
- ASSERT(HDR_SHARED_DATA(hdr));
-
arc_unshare_buf(hdr, buf);
} else {
arc_hdr_free_pdata(hdr);
@@ -5100,26 +5393,23 @@ arc_write_ready(zio_t *zio)
* arc thus the on-disk block may or may not match what we maintain
* in the hdr's b_pdata field.
*/
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
- ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF);
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF);
ASSERT3U(psize, >, 0);
arc_hdr_alloc_pdata(hdr);
bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
} else {
ASSERT3P(buf->b_data, ==, zio->io_orig_data);
- ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr));
- ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS);
- ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT(!arc_buf_is_shared(buf));
+ ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
/*
* This hdr is not compressed so we're able to share
* the arc_buf_t data buffer with the hdr.
*/
arc_share_buf(hdr, buf);
- VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
+ ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
HDR_GET_LSIZE(hdr)));
}
arc_hdr_verify(hdr, zio->io_bp);
@@ -5178,7 +5468,7 @@ arc_write_done(zio_t *zio)
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
- ASSERT(zio->io_error == 0);
+ ASSERT3U(zio->io_error, ==, 0);
arc_cksum_verify(buf);
@@ -5248,6 +5538,11 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
if (l2arc)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_OFF);
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
+ zio_flags |= ZIO_FLAG_RAW;
+ }
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_children_ready = children_ready;
@@ -5268,7 +5563,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
* buf will take sole ownership of the block.
*/
if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
arc_unshare_buf(hdr, buf);
} else {
arc_hdr_free_pdata(hdr);
@@ -5279,8 +5573,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
ASSERT(!arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
- zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp,
- arc_write_ready,
+ zio = zio_write(pio, spa, txg, bp, buf->b_data,
+ HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
arc_write_physdone, arc_write_done, callback,
priority, zio_flags, zb);
@@ -5352,6 +5646,10 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* network delays from blocking transactions that are ready to be
* assigned to a txg.
*/
+
+ /* assert that it has not wrapped around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+
anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
arc_loaned_bytes), 0);
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 71ae0c4434..08d1cca1d9 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -850,7 +850,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
spa_t *spa = db->db_objset->os_spa;
mutex_exit(&db->db_mtx);
- abuf = arc_loan_buf(spa, blksz);
+ abuf = arc_loan_buf(spa, B_FALSE, blksz);
bcopy(db->db.db_data, abuf->b_data, blksz);
} else {
abuf = db->db_buf;
@@ -984,8 +984,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
- db->db.db_size, db, type));
+ dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
+ db->db.db_size));
bzero(db->db.db_data, db->db.db_size);
if (db->db_blkptr != NULL && db->db_level > 0 &&
@@ -1034,6 +1034,68 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
&aflags, &zb);
}
+/*
+ * This is our just-in-time copy function. It makes a copy of buffers that
+ * have been modified in a previous transaction group before we access them in
+ * the current active group.
+ *
+ * This function is used in three places: when we are dirtying a buffer for the
+ * first time in a txg, when we are freeing a range in a dnode that includes
+ * this buffer, and when we are accessing a buffer which was received compressed
+ * and later referenced in a WRITE_BYREF record.
+ *
+ * Note that when we are called from dbuf_free_range() we do not put a hold on
+ * the buffer, we just traverse the active dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+ * and its referencing the dbuf data, either:
+ * reset the reference to point to a new copy,
+ * or (if there a no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT(dr->dr_txg >= txg - 2);
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ /* Note that the data bufs here are zio_bufs */
+ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = arc_buf_size(db->db_buf);
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa = db->db_objset->os_spa;
+ enum zio_compress compress_type =
+ arc_get_compression(db->db_buf);
+
+ if (compress_type == ZIO_COMPRESS_OFF) {
+ dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
+ } else {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
+ size, arc_buf_lsize(db->db_buf), compress_type);
+ }
+ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ } else {
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ }
+}
+
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
@@ -1062,6 +1124,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
+ /*
+ * If the arc buf is compressed, we need to decompress it to
+ * read the data. This could happen during the "zfs receive" of
+ * a stream which is compressed and deduplicated.
+ */
+ if (db->db_buf != NULL &&
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
+ dbuf_fix_old_data(db,
+ spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+ err = arc_decompress(db->db_buf);
+ dbuf_set_data(db, db->db_buf);
+ }
mutex_exit(&db->db_mtx);
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
@@ -1137,7 +1211,7 @@ dbuf_noread(dmu_buf_impl_t *db)
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
+ dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
dbuf_clear_data(db);
@@ -1147,60 +1221,6 @@ dbuf_noread(dmu_buf_impl_t *db)
mutex_exit(&db->db_mtx);
}
-/*
- * This is our just-in-time copy function. It makes a copy of
- * buffers, that have been modified in a previous transaction
- * group, before we modify them in the current active group.
- *
- * This function is used in two places: when we are dirtying a
- * buffer for the first time in a txg, and when we are freeing
- * a range in a dnode that includes this buffer.
- *
- * Note that when we are called from dbuf_free_range() we do
- * not put a hold on the buffer, we just traverse the active
- * dbuf list for the dnode.
- */
-static void
-dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
-{
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_level == 0);
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
-
- if (dr == NULL ||
- (dr->dt.dl.dr_data !=
- ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
- return;
-
- /*
- * If the last dirty record for this dbuf has not yet synced
- * and its referencing the dbuf data, either:
- * reset the reference to point to a new copy,
- * or (if there a no active holders)
- * just null out the current db_data pointer.
- */
- ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DMU_BONUS_BLKID) {
- /* Note that the data bufs here are zio_bufs */
- dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
- } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
- int size = db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- spa_t *spa = db->db_objset->os_spa;
-
- dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
- bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
- } else {
- db->db_buf = NULL;
- dbuf_clear_data(db);
- }
-}
-
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
@@ -1401,7 +1421,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
dmu_buf_will_dirty(&db->db, tx);
/* create the data buffer for the new block */
- buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);
+ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
/* copy old block data to the new block */
obuf = db->db_buf;
@@ -1995,9 +2015,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_level == 0);
- ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
+ ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
ASSERT(buf != NULL);
- ASSERT(arc_buf_size(buf) == db->db.db_size);
+ ASSERT(arc_buf_lsize(buf) == db->db.db_size);
ASSERT(tx->tx_txg != 0);
arc_return_buf(buf, db);
@@ -2594,8 +2614,8 @@ top:
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
- arc_alloc_buf(dn->dn_objset->os_spa,
- db->db.db_size, db, type));
+ arc_alloc_buf(dn->dn_objset->os_spa, db, type,
+ db->db.db_size));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
}
@@ -3140,10 +3160,19 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* objects only modified in the syncing context (e.g.
* DNONE_DNODE blocks).
*/
- int blksz = arc_buf_size(*datap);
+ int psize = arc_buf_size(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- *datap = arc_alloc_buf(os->os_spa, blksz, db, type);
- bcopy(db->db.db_data, (*datap)->b_data, blksz);
+ enum zio_compress compress_type = arc_get_compression(*datap);
+
+ if (compress_type == ZIO_COMPRESS_OFF) {
+ *datap = arc_alloc_buf(os->os_spa, db, type, psize);
+ } else {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ int lsize = arc_buf_lsize(*datap);
+ *datap = arc_alloc_compressed_buf(os->os_spa, db,
+ psize, lsize, compress_type);
+ }
+ bcopy(db->db.db_data, (*datap)->b_data, psize);
}
db->db_data_pending = dr;
@@ -3548,7 +3577,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
wp_flag = WP_SPILL;
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
- dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+ dmu_write_policy(os, dn, db->db_level, wp_flag,
+ (data != NULL && arc_get_compression(data) != ZIO_COMPRESS_OFF) ?
+ arc_get_compression(data) : ZIO_COMPRESS_INHERIT, &zp);
DB_DNODE_EXIT(db);
/*
@@ -3567,8 +3598,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
*/
void *contents = (data != NULL) ? data->b_data : NULL;
- dr->dr_zio = zio_write(zio, os->os_spa, txg,
- &dr->dr_bp_copy, contents, db->db.db_size, &zp,
+ dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
+ contents, db->db.db_size, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
@@ -3581,7 +3612,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
- &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
+ &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
dbuf_write_nofill_ready, NULL, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 124a61020b..e858c701a4 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -1024,7 +1024,7 @@ dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
int i = priv->next++;
ASSERT(i < priv->cnt);
- ASSERT(off + n <= arc_buf_size(abuf));
+ ASSERT(off + n <= arc_buf_lsize(abuf));
iov = uio->uio_iov + i;
iov->iov_base = (char *)abuf->b_data + off;
iov->iov_len = n;
@@ -1370,7 +1370,7 @@ dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
- return (arc_loan_buf(db->db_objset->os_spa, size));
+ return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
}
/*
@@ -1395,7 +1395,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
dnode_t *dn;
dmu_buf_impl_t *db;
- uint32_t blksz = (uint32_t)arc_buf_size(buf);
+ uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
uint64_t blkid;
DB_DNODE_ENTER(dbuf);
@@ -1408,18 +1408,19 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
/*
* We can only assign if the offset is aligned, the arc buf is the
- * same size as the dbuf, and the dbuf is not metadata. It
- * can't be metadata because the loaned arc buf comes from the
- * user-data kmem arena.
+ * same size as the dbuf, and the dbuf is not metadata.
*/
- if (offset == db->db.db_offset && blksz == db->db.db_size &&
- DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
+ if (offset == db->db.db_offset && blksz == db->db.db_size) {
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
objset_t *os;
uint64_t object;
+ /* compressed bufs must always be assignable to their dbuf */
+ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
+ ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
+
DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf);
os = dn->dn_objset;
@@ -1569,8 +1570,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
dsa->dsa_zgd = zgd;
dsa->dsa_tx = tx;
- zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx),
- zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size,
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size,
zp, dmu_sync_late_arrival_ready, NULL,
NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL, zb));
@@ -1624,7 +1625,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
- dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC,
+ ZIO_COMPRESS_INHERIT, &zp);
DB_DNODE_EXIT(db);
/*
@@ -1794,7 +1796,8 @@ int zfs_mdcomp_disable = 0;
int zfs_redundant_metadata_most_ditto_level = 2;
void
-dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
+ enum zio_compress override_compress, zio_prop_t *zp)
{
dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
@@ -1806,6 +1809,10 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
int copies = os->os_copies;
+ boolean_t lz4_ac = spa_feature_is_active(os->os_spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+
+ IMPLY(override_compress == ZIO_COMPRESS_LZ4, lz4_ac);
/*
* We maintain different write policies for each of the following
@@ -1892,7 +1899,16 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
}
zp->zp_checksum = checksum;
- zp->zp_compress = compress;
+
+ /*
+ * If we're writing a pre-compressed buffer, the compression type we use
+ * must match the data. If it hasn't been compressed yet, then we should
+ * use the value dictated by the policies above.
+ */
+ zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT
+ ? override_compress : compress;
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
+
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 0734c1b42b..3ed68f7133 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -339,9 +339,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
- arc_buf_t *buf = arc_alloc_buf(spa,
- sizeof (objset_phys_t), &os->os_phys_buf,
- ARC_BUFC_METADATA);
+ arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, sizeof (objset_phys_t));
bzero(buf->b_data, sizeof (objset_phys_t));
bcopy(os->os_phys_buf->b_data, buf->b_data,
arc_buf_size(os->os_phys_buf));
@@ -354,8 +353,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
- os->os_phys_buf = arc_alloc_buf(spa, size,
- &os->os_phys_buf, ARC_BUFC_METADATA);
+ os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, size);
os->os_phys = os->os_phys_buf->b_data;
bzero(os->os_phys, size);
}
@@ -1138,7 +1137,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
arc_release(os->os_phys_buf, &os->os_phys_buf);
- dmu_write_policy(os, NULL, 0, 0, &zp);
+ dmu_write_policy(os, NULL, 0, 0, ZIO_COMPRESS_INHERIT, &zp);
zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 18ab28dc2a..72247ce381 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -249,8 +249,10 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
- uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
+ uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp,
+ void *data)
{
+ uint64_t payload_size;
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
/*
@@ -261,7 +263,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
(object == dsp->dsa_last_data_object &&
offset > dsp->dsa_last_data_offset));
dsp->dsa_last_data_object = object;
- dsp->dsa_last_data_offset = offset + blksz - 1;
+ dsp->dsa_last_data_offset = offset + lsize - 1;
/*
* If there is any kind of pending aggregation (currently either
@@ -280,8 +282,26 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_object = object;
drrw->drr_type = type;
drrw->drr_offset = offset;
- drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
+ drrw->drr_logical_size = lsize;
+
+ /* only set the compression fields if the buf is compressed */
+ if (lsize != psize) {
+ ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
+ ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
+ ASSERT3S(psize, >, 0);
+ ASSERT3S(lsize, >=, psize);
+
+ drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
+ drrw->drr_compressed_size = psize;
+ payload_size = drrw->drr_compressed_size;
+ } else {
+ payload_size = drrw->drr_logical_size;
+ }
+
if (bp == NULL || BP_IS_EMBEDDED(bp)) {
/*
* There's no pre-computed checksum for partial-block
@@ -301,7 +321,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_key.ddk_cksum = bp->blk_cksum;
}
- if (dump_record(dsp, data, blksz) != 0)
+ if (dump_record(dsp, data, payload_size) != 0)
return (SET_ERROR(EINTR));
return (0);
}
@@ -476,7 +496,7 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
* Compression function must be legacy, or explicitly enabled.
*/
if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
- !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
+ !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
return (B_FALSE);
/*
@@ -639,18 +659,49 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
uint64_t offset;
+ /*
+ * If we have large blocks stored on disk but the send flags
+ * don't allow us to send large blocks, we split the data from
+ * the arc buf into chunks.
+ */
+ boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
+ !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
+ /*
+ * We should only request compressed data from the ARC if all
+ * the following are true:
+ * - stream compression was requested
+ * - we aren't splitting large blocks into smaller chunks
+ * - the data won't need to be byteswapped before sending
+ * - this isn't an embedded block
+ * - this isn't metadata (if receiving on a different endian
+ * system it can be byteswapped more easily)
+ */
+ boolean_t request_compressed =
+ (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
+ !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
+ !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
+
+ ASSERT0(zb->zb_level);
+ ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+ (zb->zb_object == dsa->dsa_resume_object &&
+ zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
ASSERT0(zb->zb_level);
ASSERT(zb->zb_object > dsa->dsa_resume_object ||
(zb->zb_object == dsa->dsa_resume_object &&
zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+ ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));
+
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+ if (request_compressed)
+ zioflags |= ZIO_FLAG_RAW;
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &aflags, zb) != 0) {
+ ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
if (zfs_send_corrupt_data) {
/* Send a block filled with 0x"zfs badd bloc" */
- abuf = arc_alloc_buf(spa, blksz, &abuf,
- ARC_BUFC_DATA);
+ abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
+ blksz);
uint64_t *ptr;
for (ptr = abuf->b_data;
(char *)ptr < (char *)abuf->b_data + blksz;
@@ -663,21 +714,21 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
offset = zb->zb_blkid * blksz;
- if (!(dsa->dsa_featureflags &
- DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
- blksz > SPA_OLD_MAXBLOCKSIZE) {
+ if (split_large_blocks) {
+ ASSERT3U(arc_get_compression(abuf), ==,
+ ZIO_COMPRESS_OFF);
char *buf = abuf->b_data;
while (blksz > 0 && err == 0) {
int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
err = dump_write(dsa, type, zb->zb_object,
- offset, n, NULL, buf);
+ offset, n, n, NULL, buf);
offset += n;
buf += n;
blksz -= n;
}
} else {
- err = dump_write(dsa, type, zb->zb_object,
- offset, blksz, bp, abuf->b_data);
+ err = dump_write(dsa, type, zb->zb_object, offset,
+ blksz, arc_buf_size(abuf), bp, abuf->b_data);
}
arc_buf_destroy(abuf, &abuf);
}
@@ -704,9 +755,9 @@ get_next_record(bqueue_t *bq, struct send_block_record *data)
*/
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
- zfs_bookmark_phys_t *ancestor_zb,
- boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
- uint64_t resumeobj, uint64_t resumeoff,
+ zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
+ int outfd, uint64_t resumeobj, uint64_t resumeoff,
vnode_t *vp, offset_t *off)
{
objset_t *os;
@@ -749,7 +800,15 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
- featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+ featureflags |= DMU_BACKUP_FEATURE_LZ4;
+ }
+ if (compressok) {
+ featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
+ }
+ if ((featureflags &
+ (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) !=
+ 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
+ featureflags |= DMU_BACKUP_FEATURE_LZ4;
}
if (resumeobj != 0 || resumeoff != 0) {
@@ -898,7 +957,7 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, boolean_t large_block_ok,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -935,10 +994,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok, outfd, 0, 0, vp, off);
+ embedok, large_block_ok, compressok, outfd, 0, 0, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok, outfd, 0, 0, vp, off);
+ embedok, large_block_ok, compressok, outfd, 0, 0, vp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
@@ -946,7 +1005,8 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
- boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
+ boolean_t large_block_ok, boolean_t compressok, int outfd,
+ uint64_t resumeobj, uint64_t resumeoff,
vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -1014,11 +1074,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
return (err);
}
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok,
+ embedok, large_block_ok, compressok,
outfd, resumeobj, resumeoff, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok,
+ embedok, large_block_ok, compressok,
outfd, resumeobj, resumeoff, vp, off);
}
if (owned)
@@ -1029,33 +1089,45 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
}
static int
-dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
- uint64_t *sizep)
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
+ uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
int err;
+ uint64_t size;
/*
* Assume that space (both on-disk and in-stream) is dominated by
* data. We will adjust for indirect blocks and the copies property,
* but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
*/
+ uint64_t recordsize;
+ uint64_t record_count;
+
+ /* Assume all (uncompressed) blocks are recordsize. */
+ err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ &recordsize);
+ if (err != 0)
+ return (err);
+ record_count = uncompressed / recordsize;
+
+ /*
+ * If we're estimating a send size for a compressed stream, use the
+ * compressed data size to estimate the stream size. Otherwise, use the
+ * uncompressed data size.
+ */
+ size = stream_compressed ? compressed : uncompressed;
/*
* Subtract out approximate space used by indirect blocks.
* Assume most space is used by data blocks (non-indirect, non-dnode).
- * Assume all blocks are recordsize. Assume ditto blocks and
- * internal fragmentation counter out compression.
+ * Assume no ditto blocks or internal fragmentation.
*
* Therefore, space used by indirect blocks is sizeof(blkptr_t) per
- * block, which we observe in practice.
+ * block.
*/
- uint64_t recordsize;
- err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
- if (err != 0)
- return (err);
- size -= size / recordsize * sizeof (blkptr_t);
+ size -= record_count * sizeof (blkptr_t);
/* Add in the space for the record associated with each block. */
- size += size / recordsize * sizeof (dmu_replay_record_t);
+ size += record_count * sizeof (dmu_replay_record_t);
*sizep = size;
@@ -1063,11 +1135,12 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
}
int
-dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
+ boolean_t stream_compressed, uint64_t *sizep)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
int err;
- uint64_t size;
+ uint64_t uncomp, comp;
ASSERT(dsl_pool_config_held(dp));
@@ -1086,33 +1159,41 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
return (SET_ERROR(EXDEV));
- /* Get uncompressed size estimate of changed data. */
+ /* Get compressed and uncompressed size estimates of changed data. */
if (fromds == NULL) {
- size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
} else {
- uint64_t used, comp;
+ uint64_t used;
err = dsl_dataset_space_written(fromds, ds,
- &used, &comp, &size);
+ &used, &comp, &uncomp);
if (err != 0)
return (err);
}
- err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+ err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
+ stream_compressed, sizep);
return (err);
}
+struct calculate_send_arg {
+ uint64_t uncompressed;
+ uint64_t compressed;
+};
+
/*
* Simple callback used to traverse the blocks of a snapshot and sum their
- * uncompressed size
+ * uncompressed and compressed sizes.
*/
/* ARGSUSED */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
- uint64_t *spaceptr = arg;
+ struct calculate_send_arg *space = arg;
if (bp != NULL && !BP_IS_HOLE(bp)) {
- *spaceptr += BP_GET_UCSIZE(bp);
+ space->uncompressed += BP_GET_UCSIZE(bp);
+ space->compressed += BP_GET_PSIZE(bp);
}
return (0);
}
@@ -1124,16 +1205,16 @@ dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
- uint64_t *sizep)
+ boolean_t stream_compressed, uint64_t *sizep)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
int err;
- uint64_t size = 0;
+ struct calculate_send_arg size = { 0 };
ASSERT(dsl_pool_config_held(dp));
/* tosnap must be a snapshot */
- if (!dsl_dataset_is_snapshot(ds))
+ if (!ds->ds_is_snapshot)
return (SET_ERROR(EINVAL));
/* verify that from_txg is before the provided snapshot was taken */
@@ -1150,7 +1231,8 @@ dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
if (err)
return (err);
- err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+ err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
+ size.compressed, stream_compressed, sizep);
return (err);
}
@@ -1281,14 +1363,14 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/*
* The receiving code doesn't know how to translate a WRITE_EMBEDDED
- * record to a plan WRITE record, so the pool must have the
+ * record to a plain WRITE record, so the pool must have the
* EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
* records. Same with WRITE_EMBEDDED records that use LZ4 compression.
*/
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
@@ -1458,10 +1540,20 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
8, 1, &zero, tx));
if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
+ 8, 1, &one, tx));
+ }
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_EMBED_DATA) {
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
8, 1, &one, tx));
}
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_COMPRESSED) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
+ 8, 1, &one, tx));
+ }
}
dmu_buf_will_dirty(newds->ds_dbuf, tx);
@@ -1517,7 +1609,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
@@ -1724,7 +1816,7 @@ struct receive_objnode {
uint64_t object;
};
-struct receive_arg {
+struct receive_arg {
objset_t *os;
vnode_t *vp; /* The vnode to read the stream from */
uint64_t voff; /* The current offset in the stream */
@@ -1852,10 +1944,11 @@ byteswap_record(dmu_replay_record_t *drr)
DO64(drr_write.drr_object);
DO32(drr_write.drr_type);
DO64(drr_write.drr_offset);
- DO64(drr_write.drr_length);
+ DO64(drr_write.drr_logical_size);
DO64(drr_write.drr_toguid);
ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
DO64(drr_write.drr_key.ddk_prop);
+ DO64(drr_write.drr_compressed_size);
break;
case DRR_WRITE_BYREF:
DO64(drr_write_byref.drr_object);
@@ -2085,7 +2178,7 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
dmu_tx_t *tx;
int err;
- if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+ if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
@@ -2107,7 +2200,7 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrw->drr_object,
- drrw->drr_offset, drrw->drr_length);
+ drrw->drr_offset, drrw->drr_logical_size);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
@@ -2117,9 +2210,10 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drrw->drr_type);
dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
- drrw->drr_length);
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
}
+ /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */
dmu_buf_t *bonus;
if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
return (SET_ERROR(EINVAL));
@@ -2536,18 +2630,31 @@ receive_read_record(struct receive_arg *ra)
case DRR_WRITE:
{
struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
- arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
- drrw->drr_length);
+ arc_buf_t *abuf;
+ boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);
+ if (DRR_WRITE_COMPRESSED(drrw)) {
+ ASSERT3U(drrw->drr_compressed_size, >, 0);
+ ASSERT3U(drrw->drr_logical_size, >=,
+ drrw->drr_compressed_size);
+ ASSERT(!is_meta);
+ abuf = arc_loan_compressed_buf(
+ dmu_objset_spa(ra->os),
+ drrw->drr_compressed_size, drrw->drr_logical_size,
+ drrw->drr_compressiontype);
+ } else {
+ abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+ is_meta, drrw->drr_logical_size);
+ }
err = receive_read_payload_and_next_header(ra,
- drrw->drr_length, abuf->b_data);
+ DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
if (err != 0) {
dmu_return_arcbuf(abuf);
return (err);
}
ra->rrd->write_buf = abuf;
receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
- drrw->drr_length);
+ drrw->drr_logical_size);
return (err);
}
case DRR_WRITE_BYREF:
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index bac325b3a1..8bc528e1d4 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -1799,9 +1799,17 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
fnvlist_add_string(token_nv, "toname", buf);
}
if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_LARGEBLOCK) == 0) {
+ fnvlist_add_boolean(token_nv, "largeblockok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_EMBEDOK) == 0) {
fnvlist_add_boolean(token_nv, "embedok");
}
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_COMPRESSOK) == 0) {
+ fnvlist_add_boolean(token_nv, "compressok");
+ }
packed = fnvlist_pack(token_nv, &packed_size);
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
diff --git a/usr/src/uts/common/fs/zfs/lz4.c b/usr/src/uts/common/fs/zfs/lz4.c
index 656360a6f2..3aa1b74ef3 100644
--- a/usr/src/uts/common/fs/zfs/lz4.c
+++ b/usr/src/uts/common/fs/zfs/lz4.c
@@ -85,7 +85,7 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
/*
* Returns 0 on success (decompression function returned non-negative)
- * and non-zero on failure (decompression function returned negative.
+ * and non-zero on failure (decompression function returned negative).
*/
return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
d_start, bufsiz, d_len) < 0);
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index b1e9456f5a..ad42cf7bcc 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -122,11 +122,17 @@ typedef enum arc_flags
} arc_flags_t;
+typedef enum arc_buf_flags {
+ ARC_BUF_FLAG_SHARED = 1 << 0,
+ ARC_BUF_FLAG_COMPRESSED = 1 << 1
+} arc_buf_flags_t;
+
struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
kmutex_t b_evict_lock;
void *b_data;
+ arc_buf_flags_t b_flags;
};
typedef enum arc_buf_contents {
@@ -150,13 +156,21 @@ typedef enum arc_space_type {
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
-arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
- arc_buf_contents_t type);
-arc_buf_t *arc_loan_buf(spa_t *spa, int size);
+boolean_t arc_is_metadata(arc_buf_t *buf);
+enum zio_compress arc_get_compression(arc_buf_t *buf);
+int arc_decompress(arc_buf_t *buf);
+arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type,
+ int32_t size);
+arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag,
+ uint64_t psize, uint64_t lsize, enum zio_compress compression_type);
+arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size);
+arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type);
void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_destroy(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
+int arc_buf_lsize(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 3510424250..3304027ccc 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -48,6 +48,7 @@
#include <sys/inttypes.h>
#include <sys/cred.h>
#include <sys/fs/zfs.h>
+#include <sys/zio_compress.h>
#include <sys/zio_priority.h>
#ifdef __cplusplus
@@ -419,7 +420,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
#define WP_SPILL 0x4
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
- struct zio_prop *zp);
+ enum zio_compress compress_override, struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
index 21d9cb4bb0..38b1b042e5 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
@@ -42,14 +42,15 @@ struct dmu_replay_record;
extern const char *recv_clone_name;
int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
- boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
+ boolean_t large_block_ok, boolean_t compressok, int outfd,
+ uint64_t resumeobj, uint64_t resumeoff,
struct vnode *vp, offset_t *off);
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
- uint64_t *sizep);
+ boolean_t stream_compressed, uint64_t *sizep);
int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
- uint64_t *sizep);
+ boolean_t stream_compressed, uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, boolean_t large_block_ok,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
int outfd, struct vnode *vp, offset_t *off);
typedef struct dmu_recv_cookie {
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index 22c67b48a9..cab7cbb10f 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -96,7 +96,9 @@ struct dsl_pool;
#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
+#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok"
#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
+#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok"
/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
index 345d42aa28..3f50cddb6f 100644
--- a/usr/src/uts/common/fs/zfs/sys/refcount.h
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -103,7 +103,7 @@ typedef struct refcount {
atomic_add_64(&(src)->rc_count, -__tmp); \
atomic_add_64(&(dst)->rc_count, __tmp); \
}
-#define refcount_transfer_ownership(rc, current_holder, new_holder)
+#define refcount_transfer_ownership(rc, current_holder, new_holder) (void)0
#define refcount_held(rc, holder) ((rc)->rc_count > 0)
#define refcount_not_held(rc, holder) (B_TRUE)
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index bc83f87483..d86e3b45f1 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -87,19 +87,22 @@ typedef enum drr_headertype {
#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2)
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16)
-#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17)
+#define DMU_BACKUP_FEATURE_LZ4 (1 << 17)
/* flag #18 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19)
#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
+/* flag #21 is reserved for a Delphix feature */
+#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
- DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \
DMU_BACKUP_FEATURE_RESUMING | \
- DMU_BACKUP_FEATURE_LARGE_BLOCKS)
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS | \
+ DMU_BACKUP_FEATURE_COMPRESSED)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@@ -152,6 +155,12 @@ typedef enum dmu_send_resume_token_version {
#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
+/* deal with compressed drr_write replay records */
+#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0)
+#define DRR_WRITE_PAYLOAD_SIZE(drrw) \
+ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \
+ (drrw)->drr_logical_size)
+
/*
* zfs ioctl command structure
*/
@@ -199,12 +208,16 @@ typedef struct dmu_replay_record {
dmu_object_type_t drr_type;
uint32_t drr_pad;
uint64_t drr_offset;
- uint64_t drr_length;
+ uint64_t drr_logical_size;
uint64_t drr_toguid;
uint8_t drr_checksumtype;
uint8_t drr_checksumflags;
- uint8_t drr_pad2[6];
- ddt_key_t drr_key; /* deduplication key */
+ uint8_t drr_compressiontype;
+ uint8_t drr_pad2[5];
+ /* deduplication key */
+ ddt_key_t drr_key;
+ /* only nonzero if drr_compressiontype is not 0 */
+ uint64_t drr_compressed_size;
/* content follows */
} drr_write;
struct drr_free {
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 405ac52cd4..7b5f0ccbf0 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -104,26 +104,6 @@ enum zio_checksum {
#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
#define ZIO_DEDUPDITTO_MIN 100
-enum zio_compress {
- ZIO_COMPRESS_INHERIT = 0,
- ZIO_COMPRESS_ON,
- ZIO_COMPRESS_OFF,
- ZIO_COMPRESS_LZJB,
- ZIO_COMPRESS_EMPTY,
- ZIO_COMPRESS_GZIP_1,
- ZIO_COMPRESS_GZIP_2,
- ZIO_COMPRESS_GZIP_3,
- ZIO_COMPRESS_GZIP_4,
- ZIO_COMPRESS_GZIP_5,
- ZIO_COMPRESS_GZIP_6,
- ZIO_COMPRESS_GZIP_7,
- ZIO_COMPRESS_GZIP_8,
- ZIO_COMPRESS_GZIP_9,
- ZIO_COMPRESS_ZLE,
- ZIO_COMPRESS_LZ4,
- ZIO_COMPRESS_FUNCTIONS
-};
-
/*
* The number of "legacy" compression functions which can be set on individual
* objects.
@@ -428,6 +408,8 @@ struct zio {
void *io_orig_data;
uint64_t io_size;
uint64_t io_orig_size;
+ /* io_lsize != io_orig_size iff this is a raw write */
+ uint64_t io_lsize;
/* Stuff for the vdev stack */
vdev_t *io_vd;
@@ -484,11 +466,11 @@ extern zio_t *zio_root(spa_t *spa,
zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
- uint64_t size, zio_done_func_t *done, void *private,
+ uint64_t lsize, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, const zio_prop_t *zp,
+ void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
index f4cb84511a..0c1783b140 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_compress.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
@@ -25,17 +25,36 @@
*/
/*
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H
-#include <sys/zio.h>
-
#ifdef __cplusplus
extern "C" {
#endif
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
+ ZIO_COMPRESS_LZ4,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
/* Common signature for all zio compress functions. */
typedef size_t zio_compress_func_t(void *src, void *dst,
size_t s_len, size_t d_len, int);
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 823822e44c..c3cf3bc6d5 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -4450,6 +4450,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
boolean_t large_block_ok = (zc->zc_flags & 0x2);
+ boolean_t compressok = (zc->zc_flags & 0x4);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4497,7 +4498,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
}
}
- error = dmu_send_estimate(tosnap, fromsnap,
+ error = dmu_send_estimate(tosnap, fromsnap, compressok,
&zc->zc_objset_type);
if (fromsnap != NULL)
@@ -4511,7 +4512,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
- zc->zc_fromobj, embedok, large_block_ok,
+ zc->zc_fromobj, embedok, large_block_ok, compressok,
zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
@@ -5444,6 +5445,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
* (optional) "resume_object" and "resume_offset" -> (uint64)
* if present, resume send stream from specified object and offset.
* }
@@ -5460,6 +5463,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
int fd;
boolean_t largeblockok;
boolean_t embedok;
+ boolean_t compressok;
uint64_t resumeobj = 0;
uint64_t resumeoff = 0;
@@ -5471,6 +5475,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
@@ -5480,8 +5485,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
return (SET_ERROR(EBADF));
off = fp->f_offset;
- error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
- resumeobj, resumeoff, fp->f_vnode, &off);
+ error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
+ fd, resumeobj, resumeoff, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5496,6 +5501,12 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* (optional) "from" -> full snap or bookmark name to send an incremental
* from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
* }
*
* outnvl: {
@@ -5509,6 +5520,11 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
dsl_dataset_t *tosnap;
int error;
char *fromname;
+ /* LINTED E_FUNC_SET_NOT_USED */
+ boolean_t largeblockok;
+ /* LINTED E_FUNC_SET_NOT_USED */
+ boolean_t embedok;
+ boolean_t compressok;
uint64_t space;
error = dsl_pool_hold(snapname, FTAG, &dp);
@@ -5521,6 +5537,10 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
return (error);
}
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
+
error = nvlist_lookup_string(innvl, "from", &fromname);
if (error == 0) {
if (strchr(fromname, '@') != NULL) {
@@ -5533,7 +5553,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
if (error != 0)
goto out;
- error = dmu_send_estimate(tosnap, fromsnap, &space);
+ error = dmu_send_estimate(tosnap, fromsnap, compressok,
+ &space);
dsl_dataset_rele(fromsnap, FTAG);
} else if (strchr(fromname, '#') != NULL) {
/*
@@ -5548,7 +5569,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
if (error != 0)
goto out;
error = dmu_send_estimate_from_txg(tosnap,
- frombm.zbm_creation_txg, &space);
+ frombm.zbm_creation_txg, compressok, &space);
} else {
/*
* from is not properly formatted as a snapshot or
@@ -5559,7 +5580,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
}
} else {
// If estimating the size of a full send, use dmu_send_estimate
- error = dmu_send_estimate(tosnap, NULL, &space);
+ error = dmu_send_estimate(tosnap, NULL, compressok, &space);
}
fnvlist_add_uint64(outnvl, "space", space);
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 1d67b1080d..9eaea7666f 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -520,21 +520,23 @@ zio_timestamp_compare(const void *x1, const void *x2)
*/
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- void *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_type_t type, zio_priority_t priority, enum zio_flag flags,
- vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
- enum zio_stage stage, enum zio_stage pipeline)
+ void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
+ void *private, zio_type_t type, zio_priority_t priority,
+ enum zio_flag flags, vdev_t *vd, uint64_t offset,
+ const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
{
zio_t *zio;
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
- ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+ ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
ASSERT(vd || stage == ZIO_STAGE_OPEN);
+ IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0);
+
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
@@ -578,7 +580,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_vd = vd;
zio->io_offset = offset;
zio->io_orig_data = zio->io_data = data;
- zio->io_orig_size = zio->io_size = size;
+ zio->io_orig_size = zio->io_size = psize;
+ zio->io_lsize = lsize;
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
@@ -621,7 +624,7 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
{
zio_t *zio;
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
@@ -726,7 +729,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zfs_blkptr_verify(spa, bp);
zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
- data, size, done, private,
+ data, size, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
@@ -736,7 +739,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, const zio_prop_t *zp,
+ void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
@@ -753,7 +756,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zp->zp_copies > 0 &&
zp->zp_copies <= spa_max_replication(spa));
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
@@ -783,7 +786,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
{
zio_t *zio;
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
@@ -863,8 +866,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
stage |= ZIO_STAGE_ISSUE_ASYNC;
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
- NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
- NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
+ BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
return (zio);
}
@@ -897,8 +900,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
- done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
- NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
ASSERT0(zio->io_queued_timestamp);
return (zio);
@@ -912,7 +915,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
int c;
if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
@@ -940,9 +943,9 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
ASSERT3U(offset + size, <=, vd->vdev_psize);
- zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
- ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
- NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
@@ -961,9 +964,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
ASSERT3U(offset + size, <=, vd->vdev_psize);
- zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
- NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
@@ -1039,7 +1042,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}
- zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
+ zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
@@ -1061,7 +1064,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
ASSERT(vd->vdev_ops->vdev_op_leaf);
zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
- data, size, done, private, type, priority,
+ data, size, size, done, private, type, priority,
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
vd, offset, NULL,
ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
@@ -1090,8 +1093,11 @@ zio_shrink(zio_t *zio, uint64_t size)
* Note, BP_IS_RAIDZ() assumes no compression.
*/
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- if (!BP_IS_RAIDZ(zio->io_bp))
- zio->io_orig_size = zio->io_size = size;
+ if (!BP_IS_RAIDZ(zio->io_bp)) {
+ /* we are not doing a raw write */
+ ASSERT3U(zio->io_size, ==, zio->io_lsize);
+ zio->io_orig_size = zio->io_size = zio->io_lsize = size;
+ }
}
/*
@@ -1200,10 +1206,12 @@ zio_write_compress(zio_t *zio)
zio_prop_t *zp = &zio->io_prop;
enum zio_compress compress = zp->zp_compress;
blkptr_t *bp = zio->io_bp;
- uint64_t lsize = zio->io_size;
- uint64_t psize = lsize;
+ uint64_t lsize = zio->io_lsize;
+ uint64_t psize = zio->io_size;
int pass = 1;
+ EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
+
/*
* If our children haven't all reached the ready stage,
* wait for them and then repeat this pipeline stage.
@@ -1252,7 +1260,8 @@ zio_write_compress(zio_t *zio)
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
- if (compress != ZIO_COMPRESS_OFF) {
+ /* If it's a compressed write that is not raw, compress the buffer. */
+ if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
void *cbuf = zio_buf_alloc(lsize);
psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
if (psize == 0 || psize == lsize) {
@@ -1303,6 +1312,8 @@ zio_write_compress(zio_t *zio)
zio->io_bp_override = NULL;
*bp = zio->io_bp_orig;
zio->io_pipeline = zio->io_orig_pipeline;
+ } else {
+ ASSERT3U(psize, !=, 0);
}
/*
@@ -2162,8 +2173,8 @@ zio_write_gang_block(zio_t *pio)
zp.zp_nopwrite = B_FALSE;
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
- (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
- zio_write_gang_member_ready, NULL, NULL, NULL,
+ (char *)pio->io_data + (pio->io_size - resid), lsize, lsize,
+ &zp, zio_write_gang_member_ready, NULL, NULL, NULL,
&gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
@@ -2368,6 +2379,10 @@ static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
spa_t *spa = zio->io_spa;
+ boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW);
+
+ /* We should never get a raw, override zio */
+ ASSERT(!(zio->io_bp_override && do_raw));
/*
* Note: we compare the original data, not the transformed data,
@@ -2391,6 +2406,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (ddp->ddp_phys_birth != 0) {
arc_buf_t *abuf = NULL;
arc_flags_t aflags = ARC_FLAG_WAIT;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
blkptr_t blk = *zio->io_bp;
int error;
@@ -2398,10 +2414,26 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
ddt_exit(ddt);
+ /*
+ * Intuitively, it would make more sense to compare
+ * io_data than io_orig_data in the raw case since you
+ * don't want to look at any transformations that have
+ * happened to the data. However, for raw I/Os the
+ * data will actually be the same in io_data and
+ * io_orig_data, so all we have to do is issue this as
+ * a raw ARC read.
+ */
+ if (do_raw) {
+ zio_flags |= ZIO_FLAG_RAW;
+ ASSERT3U(zio->io_size, ==, zio->io_orig_size);
+ ASSERT0(bcmp(zio->io_data, zio->io_orig_data,
+ zio->io_size));
+ ASSERT3P(zio->io_transform_stack, ==, NULL);
+ }
+
error = arc_read(NULL, spa, &blk,
arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &zio->io_bookmark);
+ zio_flags, &aflags, &zio->io_bookmark);
if (error == 0) {
if (arc_buf_size(abuf) != zio->io_orig_size ||
@@ -2516,6 +2548,7 @@ zio_ddt_write(zio_t *zio)
ASSERT(BP_GET_DEDUP(bp));
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+ ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE);
@@ -2536,7 +2569,9 @@ zio_ddt_write(zio_t *zio)
BP_ZERO(bp);
} else {
zp->zp_dedup = B_FALSE;
+ BP_SET_DEDUP(bp, B_FALSE);
}
+ ASSERT(!BP_GET_DEDUP(bp));
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (ZIO_PIPELINE_CONTINUE);
@@ -2569,7 +2604,7 @@ zio_ddt_write(zio_t *zio)
}
dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
- zio->io_orig_size, &czp, NULL, NULL,
+ zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
@@ -2591,7 +2626,7 @@ zio_ddt_write(zio_t *zio)
ddt_phys_addref(ddp);
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
- zio->io_orig_size, zp,
+ zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
diff --git a/usr/src/uts/common/nfs/export.h b/usr/src/uts/common/nfs/export.h
index b6d223627d..66b86cdf8f 100644
--- a/usr/src/uts/common/nfs/export.h
+++ b/usr/src/uts/common/nfs/export.h
@@ -539,7 +539,7 @@ typedef struct secinfo secinfo_t;
* a real export at the mount point (VROOT) which has a subtree shared
* has a visible list.
*
- * The exi_visible field is NULL for normal, non=pseudo filesystems
+ * The exi_visible field is NULL for normal, non-pseudo filesystems
* which do not have any subtree exported. If the field is non-null,
* it points to a list of visible entries, identified by vis_fid and/or
* vis_ino. The presence of a "visible" list means that if this export
@@ -568,6 +568,7 @@ struct exp_visible {
struct exp_visible *vis_next;
struct secinfo *vis_secinfo;
int vis_seccnt;
+ timespec_t vis_change;
};
typedef struct exp_visible exp_visible_t;
@@ -635,7 +636,8 @@ extern exportinfo_t *vis2exi(treenode_t *);
extern int treeclimb_export(struct exportinfo *);
extern void treeclimb_unexport(struct exportinfo *);
extern int nfs_visible(struct exportinfo *, vnode_t *, int *);
-extern int nfs_visible_inode(struct exportinfo *, ino64_t, int *);
+extern int nfs_visible_inode(struct exportinfo *, ino64_t,
+ struct exp_visible **);
extern int has_visible(struct exportinfo *, vnode_t *);
extern void free_visible(struct exp_visible *);
extern int nfs_exported(struct exportinfo *, vnode_t *);
@@ -643,6 +645,9 @@ extern struct exportinfo *pseudo_exportfs(vnode_t *, fid_t *,
struct exp_visible *, struct exportdata *);
extern int vop_fid_pseudo(vnode_t *, fid_t *);
extern int nfs4_vget_pseudo(struct exportinfo *, vnode_t **, fid_t *);
+extern bool_t nfs_visible_change(struct exportinfo *, vnode_t *,
+ timespec_t *);
+extern void tree_update_change(treenode_t *, timespec_t *);
/*
* Functions that handle the NFSv4 server namespace security flavors
* information.