author     Josh Wilsdon <jwilsdon@joyent.com>  2016-11-30 11:47:19 -0800
committer  Josh Wilsdon <jwilsdon@joyent.com>  2016-11-30 11:47:19 -0800
commit     f44f59dde7be918b195c0ddda03e2e248b478839 (patch)
tree       b99a68d736093caf8b40b308a18a420853cbad80
parent     4d8f2dc1b38a19fb1fd1cca5ecb1324f24445f22 (diff)
parent     80c431c3af17a3f5c86dac722986210ac5675994 (diff)
download   illumos-joyent-f44f59dde7be918b195c0ddda03e2e248b478839.tar.gz

Merge branch 'master' into OS-5783
-rw-r--r--  usr/src/boot/sys/boot/libstand/Makefile.com | 8
-rw-r--r--  usr/src/cmd/mdb/common/mdb/mdb_io.c | 11
-rw-r--r--  usr/src/cmd/mdb/common/modules/zfs/zfs.c | 208
-rw-r--r--  usr/src/cmd/sgs/Makefile | 4
-rw-r--r--  usr/src/cmd/sgs/Makefile.com | 14
-rw-r--r--  usr/src/cmd/sgs/ld/Makefile.com | 4
-rw-r--r--  usr/src/cmd/sgs/ld/Makefile.targ | 6
-rw-r--r--  usr/src/cmd/sgs/ldprof/Makefile.com | 2
-rw-r--r--  usr/src/cmd/sgs/libelf/Makefile.targ | 12
-rw-r--r--  usr/src/cmd/sgs/libld/Makefile.targ | 13
-rw-r--r--  usr/src/cmd/sgs/liblddbg/Makefile.targ | 13
-rw-r--r--  usr/src/cmd/sgs/libldmake/Makefile.com | 2
-rw-r--r--  usr/src/cmd/sgs/libldstab/Makefile.targ | 7
-rw-r--r--  usr/src/cmd/sgs/librtld/Makefile.targ | 13
-rw-r--r--  usr/src/cmd/sgs/link_audit/Makefile.com | 2
-rw-r--r--  usr/src/cmd/zdb/zdb.c | 25
-rw-r--r--  usr/src/cmd/zfs/zfs_main.c | 27
-rw-r--r--  usr/src/cmd/zstreamdump/zstreamdump.c | 30
-rw-r--r--  usr/src/cmd/ztest/ztest.c | 2
-rw-r--r--  usr/src/common/bzip2/bzlib.c | 66
-rw-r--r--  usr/src/lib/brand/lx/lx_brand/Makefile.com | 1
-rw-r--r--  usr/src/lib/brand/lx/lx_brand/common/clock.c | 152
-rw-r--r--  usr/src/lib/brand/lx/lx_brand/common/lx_brand.c | 12
-rw-r--r--  usr/src/lib/brand/lx/lx_brand/common/priority.c | 117
-rw-r--r--  usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h | 4
-rw-r--r--  usr/src/lib/libzfs/common/libzfs.h | 3
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_dataset.c | 5
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_sendrecv.c | 51
-rw-r--r--  usr/src/lib/libzfs_core/common/libzfs_core.c | 11
-rw-r--r--  usr/src/lib/libzfs_core/common/libzfs_core.h | 5
-rw-r--r--  usr/src/lib/libzpool/common/kernel.c | 5
-rw-r--r--  usr/src/lib/libzpool/common/sys/zfs_context.h | 1
-rw-r--r--  usr/src/man/man1m/zfs.1m | 48
-rw-r--r--  usr/src/pkg/manifests/system-test-zfstest.mf | 23
-rw-r--r--  usr/src/test/zfs-tests/include/commands.cfg | 3
-rw-r--r--  usr/src/test/zfs-tests/include/libtest.shlib | 45
-rw-r--r--  usr/src/test/zfs-tests/include/properties.shlib | 94
-rw-r--r--  usr/src/test/zfs-tests/runfiles/delphix.run | 8
-rw-r--r--  usr/src/test/zfs-tests/runfiles/omnios.run | 8
-rw-r--r--  usr/src/test/zfs-tests/runfiles/openindiana.run | 8
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/Makefile | 17
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg | 4
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib | 122
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh | 3
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh | 3
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh | 3
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh | 3
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh | 3
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-cD.ksh | 77
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh | 103
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_incremental.ksh | 100
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh | 73
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh | 54
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_props.ksh | 67
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh | 55
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh | 68
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_resume.ksh | 49
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh | 91
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh | 55
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh | 66
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_volume.ksh | 80
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh | 59
-rw-r--r--  usr/src/test/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh | 198
-rw-r--r--  usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio | 4
-rw-r--r--  usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio | 4
-rw-r--r--  usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio | 4
-rw-r--r--  usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio | 4
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_brand.c | 60
-rw-r--r--  usr/src/uts/common/brand/lx/os/lx_syscall.c | 12
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_brand.h | 11
-rw-r--r--  usr/src/uts/common/brand/lx/sys/lx_syscalls.h | 3
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_getpid.c | 8
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_priority.c | 192
-rw-r--r--  usr/src/uts/common/brand/lx/syscall/lx_timer.c | 253
-rw-r--r--  usr/src/uts/common/disp/priocntl.c | 4
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_client.c | 11
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_srv_attr.c | 123
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_srv_ns.c | 213
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c | 26
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_vnops.c | 4
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_export.c | 27
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c | 1030
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c | 171
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c | 46
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_send.c | 231
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/lz4.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c | 976
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/space_map.c | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h | 20
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h | 7
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_send.h | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dataset.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab.h | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab_impl.h | 106
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/refcount.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_debug.h | 17
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 25
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h | 32
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_compress.h | 23
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c | 35
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c | 125
-rw-r--r--  usr/src/uts/common/nfs/export.h | 9
-rw-r--r--  usr/src/uts/common/os/timer.c | 257
-rw-r--r--  usr/src/uts/common/sys/timer.h | 26
-rw-r--r--  usr/src/uts/intel/Makefile.files | 1

109 files changed, 4998 insertions, 1591 deletions
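
The largest piece of this merge is the compressed send/receive work visible below in the zfs_main.c, libzfs_sendrecv.c, libzfs_core, dmu_send.c, and zfs.1m hunks, which thread a new -c/--compressed flag (LZC_SEND_FLAG_COMPRESS) from the command line down to the ioctl layer. A rough usage sketch only -- the pool and dataset names are invented; just the flags come from the hunks below:

    # Full compressed send: blocks that are compressed on disk are emitted as
    # compressed WRITE records instead of being expanded into the stream.
    zfs snapshot tank/fs@snap1
    zfs send -c tank/fs@snap1 | zfs receive backup/fs

    # Long-option spellings added via getopt_long() in zfs_main.c.
    zfs send --compressed --verbose tank/fs@snap1 > /var/tmp/fs.zstream

    # Incremental compressed send between two snapshots.
    zfs send -c -i tank/fs@snap1 tank/fs@snap2 | zfs receive backup/fs

The feature-flag requirements for -c are spelled out in the zfs.1m hunk at the end of this diff.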
diff --git a/usr/src/boot/sys/boot/libstand/Makefile.com b/usr/src/boot/sys/boot/libstand/Makefile.com
index b69784ef42..8594b787fd 100644
--- a/usr/src/boot/sys/boot/libstand/Makefile.com
+++ b/usr/src/boot/sys/boot/libstand/Makefile.com
@@ -32,11 +32,9 @@ CPPFLAGS += -I../../../sys -I. -I$(SRC)/common/bzip2
CFLAGS = -O2 -ffreestanding -Wformat
CFLAGS += -mno-mmx -mno-3dnow -mno-sse -mno-sse2 -mno-sse3 -msoft-float
-CFLAGS += -Wno-pointer-sign -Wno-empty-body -Wno-string-plus-int \
- -Wno-unused-const-variable -Wno-tautological-compare \
- -Wno-unused-value -Wno-parentheses-equality \
- -Wno-unused-function -Wno-enum-conversion -Wno-switch \
- -Wno-switch-enum -Wno-knr-promoted-parameter -Wno-parentheses
+CFLAGS += -Wno-pointer-sign -Wno-empty-body -Wno-unused-value \
+ -Wno-unused-function -Wno-switch \
+ -Wno-switch-enum -Wno-parentheses
include ${LIBSTAND_SRC}/Makefile.inc
diff --git a/usr/src/cmd/mdb/common/mdb/mdb_io.c b/usr/src/cmd/mdb/common/mdb/mdb_io.c
index 8b4f1aaaef..84e7c92784 100644
--- a/usr/src/cmd/mdb/common/mdb/mdb_io.c
+++ b/usr/src/cmd/mdb/common/mdb/mdb_io.c
@@ -25,6 +25,7 @@
/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
/*
@@ -1513,7 +1514,17 @@ iob_doprnt(mdb_iob_t *iob, const char *format, varglist_t *ap)
/*
* If the string and the option altstr won't fit on this line
* and auto-wrap is set (default), skip to the next line.
+ * If the string contains \n, and the \n terminated substring
+ * + altstr is shorter than the above, use the shorter lf_len.
*/
+ if (u.str != NULL) {
+ char *np = strchr(u.str, '\n');
+ if (np != NULL) {
+ int lf_len = (np - u.str) + altlen;
+ if (lf_len < width)
+ width = lf_len;
+ }
+ }
if (IOB_WRAPNOW(iob, width))
mdb_iob_nl(iob);
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index fd419f6ea2..f4846e94de 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -312,20 +312,26 @@ objset_name(uintptr_t addr, char *buf)
return (0);
}
-static void
-enum_lookup(char *out, size_t size, mdb_ctf_id_t id, int val,
- const char *prefix)
+static int
+enum_lookup(char *type, int val, const char *prefix, size_t size, char *out)
{
const char *cp;
size_t len = strlen(prefix);
+ mdb_ctf_id_t enum_type;
+
+ if (mdb_ctf_lookup_by_name(type, &enum_type) != 0) {
+ mdb_warn("Could not find enum for %s", type);
+ return (-1);
+ }
- if ((cp = mdb_ctf_enum_name(id, val)) != NULL) {
+ if ((cp = mdb_ctf_enum_name(enum_type, val)) != NULL) {
if (strncmp(cp, prefix, len) == 0)
cp += len;
(void) strncpy(out, cp, size);
} else {
mdb_snprintf(out, size, "? (%d)", val);
}
+ return (0);
}
/* ARGSUSED */
@@ -418,7 +424,6 @@ zfs_params(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
static int
blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
- mdb_ctf_id_t type_enum, checksum_enum, compress_enum;
char type[80], checksum[80], compress[80];
blkptr_t blk, *bp = &blk;
char buf[BP_SPRINTF_LEN];
@@ -428,20 +433,16 @@ blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_ERR);
}
- if (mdb_ctf_lookup_by_name("enum dmu_object_type", &type_enum) == -1 ||
- mdb_ctf_lookup_by_name("enum zio_checksum", &checksum_enum) == -1 ||
- mdb_ctf_lookup_by_name("enum zio_compress", &compress_enum) == -1) {
+ if (enum_lookup("enum dmu_object_type", BP_GET_TYPE(bp), "DMU_OT_",
+ sizeof (type), type) == -1 ||
+ enum_lookup("enum zio_checksum", BP_GET_CHECKSUM(bp),
+ "ZIO_CHECKSUM_", sizeof (checksum), checksum) == -1 ||
+ enum_lookup("enum zio_compress", BP_GET_COMPRESS(bp),
+ "ZIO_COMPRESS_", sizeof (compress), compress) == -1) {
mdb_warn("Could not find blkptr enumerated types");
return (DCMD_ERR);
}
- enum_lookup(type, sizeof (type), type_enum,
- BP_GET_TYPE(bp), "DMU_OT_");
- enum_lookup(checksum, sizeof (checksum), checksum_enum,
- BP_GET_CHECKSUM(bp), "ZIO_CHECKSUM_");
- enum_lookup(compress, sizeof (compress), compress_enum,
- BP_GET_COMPRESS(bp), "ZIO_COMPRESS_");
-
SNPRINTF_BLKPTR(mdb_snprintf, '\n', buf, sizeof (buf), bp, type,
checksum, compress);
@@ -1287,13 +1288,16 @@ typedef struct mdb_range_tree {
typedef struct mdb_metaslab_group {
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+ uintptr_t mg_vd;
} mdb_metaslab_group_t;
typedef struct mdb_metaslab {
uint64_t ms_id;
uint64_t ms_start;
uint64_t ms_size;
+ int64_t ms_deferspace;
uint64_t ms_fragmentation;
+ uint64_t ms_weight;
uintptr_t ms_alloctree[TXG_SIZE];
uintptr_t ms_freetree[TXG_SIZE];
uintptr_t ms_tree;
@@ -1313,11 +1317,18 @@ typedef struct mdb_space_map {
} mdb_space_map_t;
typedef struct mdb_vdev {
+ uintptr_t vdev_path;
uintptr_t vdev_ms;
+ uintptr_t vdev_ops;
uint64_t vdev_ms_count;
+ uint64_t vdev_id;
vdev_stat_t vdev_stat;
} mdb_vdev_t;
+typedef struct mdb_vdev_ops {
+ char vdev_op_type[16];
+} mdb_vdev_ops_t;
+
static int
metaslab_stats(uintptr_t addr, int spa_flags)
{
@@ -1631,6 +1642,165 @@ vdev_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (do_print_vdev(addr, flags, (int)depth, recursive, spa_flags));
}
+typedef struct mdb_metaslab_alloc_trace {
+ uintptr_t mat_mg;
+ uintptr_t mat_msp;
+ uint64_t mat_size;
+ uint64_t mat_weight;
+ uint64_t mat_offset;
+ uint32_t mat_dva_id;
+} mdb_metaslab_alloc_trace_t;
+
+static void
+metaslab_print_weight(uint64_t weight)
+{
+ char buf[100];
+
+ if (WEIGHT_IS_SPACEBASED(weight)) {
+ mdb_nicenum(
+ weight & ~(METASLAB_ACTIVE_MASK | METASLAB_WEIGHT_TYPE),
+ buf);
+ } else {
+ char size[NICENUM_BUFLEN];
+ mdb_nicenum(1ULL << WEIGHT_GET_INDEX(weight), size);
+ (void) mdb_snprintf(buf, sizeof (buf), "%llu x %s",
+ WEIGHT_GET_COUNT(weight), size);
+ }
+ mdb_printf("%11s ", buf);
+}
+
+/* ARGSUSED */
+static int
+metaslab_weight(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ uint64_t weight = 0;
+ char active;
+
+ if (argc == 0 && (flags & DCMD_ADDRSPEC)) {
+ if (mdb_vread(&weight, sizeof (uint64_t), addr) == -1) {
+ mdb_warn("failed to read weight at %p\n", addr);
+ return (DCMD_ERR);
+ }
+ } else if (argc == 1 && !(flags & DCMD_ADDRSPEC)) {
+ weight = (argv[0].a_type == MDB_TYPE_IMMEDIATE) ?
+ argv[0].a_un.a_val : mdb_strtoull(argv[0].a_un.a_str);
+ } else {
+ return (DCMD_USAGE);
+ }
+
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%-6s %9s %9s%</u>\n",
+ "ACTIVE", "ALGORITHM", "WEIGHT");
+ }
+
+ if (weight & METASLAB_WEIGHT_PRIMARY)
+ active = 'P';
+ else if (weight & METASLAB_WEIGHT_SECONDARY)
+ active = 'S';
+ else
+ active = '-';
+ mdb_printf("%6c %8s ", active,
+ WEIGHT_IS_SPACEBASED(weight) ? "SPACE" : "SEGMENT");
+ metaslab_print_weight(weight);
+ mdb_printf("\n");
+
+ return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
+metaslab_trace(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ mdb_metaslab_alloc_trace_t mat;
+ mdb_metaslab_group_t mg = { 0 };
+ char result_type[100];
+
+ if (mdb_ctf_vread(&mat, "metaslab_alloc_trace_t",
+ "mdb_metaslab_alloc_trace_t", addr, 0) == -1) {
+ return (DCMD_ERR);
+ }
+
+ if (!(flags & DCMD_PIPE_OUT) && DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%6s %6s %8s %11s %18s %18s%</u>\n",
+ "MSID", "DVA", "ASIZE", "WEIGHT", "RESULT", "VDEV");
+ }
+
+ if (mat.mat_msp != NULL) {
+ mdb_metaslab_t ms;
+
+ if (mdb_ctf_vread(&ms, "metaslab_t", "mdb_metaslab_t",
+ mat.mat_msp, 0) == -1) {
+ return (DCMD_ERR);
+ }
+ mdb_printf("%6llu ", ms.ms_id);
+ } else {
+ mdb_printf("%6s ", "-");
+ }
+
+ mdb_printf("%6d %8llx ", mat.mat_dva_id, mat.mat_size);
+
+ metaslab_print_weight(mat.mat_weight);
+
+ if ((int64_t)mat.mat_offset < 0) {
+ if (enum_lookup("enum trace_alloc_type", mat.mat_offset,
+ "TRACE_", sizeof (result_type), result_type) == -1) {
+ mdb_warn("Could not find enum for trace_alloc_type");
+ return (DCMD_ERR);
+ }
+ mdb_printf("%18s ", result_type);
+ } else {
+ mdb_printf("%<b>%18llx%</b> ", mat.mat_offset);
+ }
+
+ if (mat.mat_mg != NULL &&
+ mdb_ctf_vread(&mg, "metaslab_group_t", "mdb_metaslab_group_t",
+ mat.mat_mg, 0) == -1) {
+ return (DCMD_ERR);
+ }
+
+ if (mg.mg_vd != NULL) {
+ mdb_vdev_t vdev;
+ char desc[MAXNAMELEN];
+
+ if (mdb_ctf_vread(&vdev, "vdev_t", "mdb_vdev_t",
+ mg.mg_vd, 0) == -1) {
+ return (DCMD_ERR);
+ }
+
+ if (vdev.vdev_path != NULL) {
+ char path[MAXNAMELEN];
+
+ if (mdb_readstr(path, sizeof (path),
+ vdev.vdev_path) == -1) {
+ mdb_warn("failed to read vdev_path at %p\n",
+ vdev.vdev_path);
+ return (DCMD_ERR);
+ }
+ char *slash;
+ if ((slash = strrchr(path, '/')) != NULL) {
+ strcpy(desc, slash + 1);
+ } else {
+ strcpy(desc, path);
+ }
+ } else if (vdev.vdev_ops != NULL) {
+ mdb_vdev_ops_t ops;
+ if (mdb_ctf_vread(&ops, "vdev_ops_t", "mdb_vdev_ops_t",
+ vdev.vdev_ops, 0) == -1) {
+ mdb_warn("failed to read vdev_ops at %p\n",
+ vdev.vdev_ops);
+ return (DCMD_ERR);
+ }
+ (void) mdb_snprintf(desc, sizeof (desc),
+ "%s-%llu", ops.vdev_op_type, vdev.vdev_id);
+ } else {
+ (void) strcpy(desc, "<unknown>");
+ }
+ mdb_printf("%18s\n", desc);
+ }
+
+ return (DCMD_OK);
+}
+
typedef struct metaslab_walk_data {
uint64_t mw_numvdevs;
uintptr_t *mw_vdevs;
@@ -1748,6 +1918,7 @@ typedef struct space_data {
uint64_t ms_alloctree[TXG_SIZE];
uint64_t ms_freetree[TXG_SIZE];
uint64_t ms_tree;
+ int64_t ms_deferspace;
uint64_t avail;
uint64_t nowavail;
} space_data_t;
@@ -1795,6 +1966,7 @@ space_cb(uintptr_t addr, const void *unknown, void *arg)
"mdb_space_map_phys_t", sm.sm_phys, 0);
}
+ sd->ms_deferspace += ms.ms_deferspace;
sd->ms_tree += rt.rt_space;
sd->avail += sm.sm_size - sm.sm_alloc;
sd->nowavail += sm.sm_size - smp.smp_alloc;
@@ -1877,6 +2049,8 @@ spa_space(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
sd.ms_freetree[2] >> shift, suffix,
sd.ms_freetree[3] >> shift, suffix);
mdb_printf("ms_tree = %llu%s\n", sd.ms_tree >> shift, suffix);
+ mdb_printf("ms_deferspace = %llu%s\n",
+ sd.ms_deferspace >> shift, suffix);
mdb_printf("last synced avail = %llu%s\n", sd.avail >> shift, suffix);
mdb_printf("current syncing avail = %llu%s\n",
sd.nowavail >> shift, suffix);
@@ -3790,6 +3964,10 @@ static const mdb_dcmd_t dcmds[] = {
"print zfs debug log", dbgmsg},
{ "rrwlock", ":",
"print rrwlock_t, including readers", rrwlock},
+ { "metaslab_weight", "weight",
+ "print metaslab weight", metaslab_weight},
+ { "metaslab_trace", ":",
+ "print metaslab allocation trace records", metaslab_trace},
{ "arc_compression_stats", ":[-vabrf]\n"
"\t-v verbose, display a linearly scaled histogram\n"
"\t-a display ARC_anon state statistics individually\n"
diff --git a/usr/src/cmd/sgs/Makefile b/usr/src/cmd/sgs/Makefile
index a583a83fb5..f2128aaaa2 100644
--- a/usr/src/cmd/sgs/Makefile
+++ b/usr/src/cmd/sgs/Makefile
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright 2016 RackTop Systems.
#
include $(SRC)/cmd/Makefile.cmd
@@ -173,6 +174,9 @@ native-clobber:
native-proto:
-@mkdir -p proto/$(MACH)
+ -@mkdir -p proto/$(MACH)/lib
+ -@mkdir -p proto/$(MACH)/usr
+ -@mkdir -p proto/$(MACH)/usr/bin
FRC:
diff --git a/usr/src/cmd/sgs/Makefile.com b/usr/src/cmd/sgs/Makefile.com
index 38f138d24b..650a93e967 100644
--- a/usr/src/cmd/sgs/Makefile.com
+++ b/usr/src/cmd/sgs/Makefile.com
@@ -21,6 +21,7 @@
#
# Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright 2016 RackTop Systems.
#
.KEEP_STATE:
@@ -83,6 +84,12 @@ SGSONLD = $(ROOT)/opt/SUNWonld
SGSRPATH = /usr/lib
SGSRPATH64 = $(SGSRPATH)/$(MACH64)
+# Mimic the structure of an installed system.
+
+SGSLIBDIR = $(SGSPROTO)/lib
+SGSPREFIX = $(SGSPROTO)/usr
+SGSBINDIR = $(SGSPREFIX)/bin
+
#
# Macros to be used to include link against libconv and include vernote.o
#
@@ -120,12 +127,13 @@ CHKMSGFLAGS = $(SGSMSGTARG:%=-m %) $(SGSMSGCHK:%=-m %)
# Native targets should use the minimum of ld(1) flags to allow building on
# previous releases. We use mapfiles to scope, but don't bother versioning.
-native := DYNFLAGS = -R$(SGSPROTO) -L$(SGSPROTO) $(ZNOVERSION)
+native := DYNFLAGS = -R$(SGSLIBDIR) -L$(SGSLIBDIR) $(ZNOVERSION) \
+ $(HSONAME)
# Comment out the following two lines to have the sgs built from the system
# link-editor, rather than the local proto link-editor.
-CC_USE_PROTO = -Yl,$(SGSPROTO)
-LD_USE_PROTO = $(SGSPROTO)/
+CC_USE_PROTO = -Yl,$(SGSBINDIR)
+LD_USE_PROTO = $(SGSBINDIR)/
#
# lint-related stuff
diff --git a/usr/src/cmd/sgs/ld/Makefile.com b/usr/src/cmd/sgs/ld/Makefile.com
index 01ba9c1ff1..277b62c95b 100644
--- a/usr/src/cmd/sgs/ld/Makefile.com
+++ b/usr/src/cmd/sgs/ld/Makefile.com
@@ -50,8 +50,8 @@ LINTFLAGS64 += -x $(VAR_LINTFLAGS64)
CLEANFILES += $(LINTOUTS)
-native := LDFLAGS = -R$(SGSPROTO) $(ZNOVERSION)
-native := LDLIBS = -L$(SGSPROTO) $(LD_LIB) -lelf $(CONVLIBDIR) \
+native := LDFLAGS = -R$(SGSLIBDIR) $(ZNOVERSION)
+native := LDLIBS = -L$(SGSLIBDIR) $(LD_LIB) -lelf $(CONVLIBDIR) \
$(CONV_LIB)
native := CPPFLAGS += -DNATIVE_BUILD
diff --git a/usr/src/cmd/sgs/ld/Makefile.targ b/usr/src/cmd/sgs/ld/Makefile.targ
index debb091dae..42ee9938d6 100644
--- a/usr/src/cmd/sgs/ld/Makefile.targ
+++ b/usr/src/cmd/sgs/ld/Makefile.targ
@@ -23,7 +23,7 @@
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
+# Copyright 2016 RackTop Systems.
#
all: $(PROG)
@@ -51,9 +51,9 @@ include $(SRC)/cmd/sgs/Makefile.targ
# Special target for native builds (ie. when we need to build a version of ld
# to build a version of ld :-).
-native: $(SGSPROTO)/$(PROG)
+native: $(SGSBINDIR)/$(PROG)
-$(SGSPROTO)/$(PROG) \
+$(SGSBINDIR)/$(PROG) \
$(PROG): $$(OBJS)
$(LINK.c) $(OBJS) -o $@ $(LDLIBS)
$(POST_PROCESS)
diff --git a/usr/src/cmd/sgs/ldprof/Makefile.com b/usr/src/cmd/sgs/ldprof/Makefile.com
index f9406e08fc..1e7a111101 100644
--- a/usr/src/cmd/sgs/ldprof/Makefile.com
+++ b/usr/src/cmd/sgs/ldprof/Makefile.com
@@ -34,8 +34,6 @@ include ../../Makefile.com
ROOTLIBDIR= $(ROOT)/usr/lib/link_audit
-SGSPROTO= ../../proto/$(MACH)
-
MAPFILES = ../common/mapfile-vers
DYNFLAGS += $(CC_USE_PROTO)
diff --git a/usr/src/cmd/sgs/libelf/Makefile.targ b/usr/src/cmd/sgs/libelf/Makefile.targ
index 0c5f3ce0a7..3c70014e0e 100644
--- a/usr/src/cmd/sgs/libelf/Makefile.targ
+++ b/usr/src/cmd/sgs/libelf/Makefile.targ
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright 2016 RackTop Systems.
#
objs/%.o \
@@ -131,14 +132,15 @@ pics/xlate.o: xlate.c
# Special target for native builds (ie. when we need to build a version of ld
# to build a version of ld :-).
-native: $(SGSPROTO)/$(DYNLIB)
+native: $(SGSLIBDIR)/$(LIBLINKS)
-$(SGSPROTO)/$(DYNLIB): \
- pics .WAIT $$(PICS)
+$(SGSLIBDIR)/$(DYNLIB): pics .WAIT $$(PICS)
$(BUILD.SO)
$(POST_PROCESS_SO)
- -@$(RM) $(SGSPROTO)/$(LIBLINKS)
- $(SYMLINK) $(DYNLIB) $(SGSPROTO)/$(LIBLINKS)
+
+$(SGSLIBDIR)/$(LIBLINKS): $(SGSLIBDIR)/$(DYNLIB)
+ -@$(RM) $(SGSLIBDIR)/$(LIBLINKS)
+ $(SYMLINK) $(DYNLIB) $(SGSLIBDIR)/$(LIBLINKS)
# Derived source and header files (messaging). Make sure that the sgsmsg
diff --git a/usr/src/cmd/sgs/libld/Makefile.targ b/usr/src/cmd/sgs/libld/Makefile.targ
index b23215ce80..85f71e63c7 100644
--- a/usr/src/cmd/sgs/libld/Makefile.targ
+++ b/usr/src/cmd/sgs/libld/Makefile.targ
@@ -22,7 +22,7 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
+# Copyright 2016 RackTop Systems.
#
pics/%.o: ../common/%.c
@@ -101,14 +101,15 @@ lint: $(LINTLIB32) $(LINTOUT32) $(LINTLIB64) $(LINTOUT64) \
# Special target for native builds (ie. when we need to build a version of ld
# to build a version of ld :-).
-native: $(SGSPROTO)/$(DYNLIB)
+native: $(SGSLIBDIR)/$(LIBLINKS)
-$(SGSPROTO)/$(DYNLIB): \
- pics .WAIT $$(PICS)
+$(SGSLIBDIR)/$(DYNLIB): pics .WAIT $$(PICS)
$(BUILD.SO)
$(POST_PROCESS_SO)
- @$(RM) $(SGSPROTO)/$(LIBLINKS)
- $(SYMLINK) $(DYNLIB) $(SGSPROTO)/$(LIBLINKS)
+
+$(SGSLIBDIR)/$(LIBLINKS): $(SGSLIBDIR)/$(DYNLIB)
+ @$(RM) $(SGSLIBDIR)/$(LIBLINKS)
+ $(SYMLINK) $(DYNLIB) $(SGSLIBDIR)/$(LIBLINKS)
include $(SRC)/lib/Makefile.targ
include $(SRC)/cmd/sgs/Makefile.targ
diff --git a/usr/src/cmd/sgs/liblddbg/Makefile.targ b/usr/src/cmd/sgs/liblddbg/Makefile.targ
index 71c17d0bae..d5a7e8f6dd 100644
--- a/usr/src/cmd/sgs/liblddbg/Makefile.targ
+++ b/usr/src/cmd/sgs/liblddbg/Makefile.targ
@@ -22,7 +22,7 @@
# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
+# Copyright 2016 RackTop Systems.
#
pics/%.o: ../common/%.c
@@ -55,14 +55,15 @@ lint: $(LINTLIB32) $(LINTOUT32) $(LINTLIB64) $(LINTOUT64) \
# Special target for native builds (ie. when we need to build a version of ld
# to build a version of ld :-).
-native: $(SGSPROTO)/$(DYNLIB)
+native: $(SGSLIBDIR)/$(LIBLINKS)
-$(SGSPROTO)/$(DYNLIB): \
- pics .WAIT $$(PICS)
+$(SGSLIBDIR)/$(DYNLIB): pics .WAIT $$(PICS)
$(BUILD.SO)
$(POST_PROCESS_SO)
- -@$(RM) $(SGSPROTO)/$(LIBLINKS)
- $(SYMLINK) $(DYNLIB) $(SGSPROTO)/$(LIBLINKS)
+
+$(SGSLIBDIR)/$(LIBLINKS): $(SGSLIBDIR)/$(DYNLIB)
+ -@$(RM) $(SGSLIBDIR)/$(LIBLINKS)
+ $(SYMLINK) $(DYNLIB) $(SGSLIBDIR)/$(LIBLINKS)
include $(SRC)/lib/Makefile.targ
include $(SRC)/cmd/sgs/Makefile.targ
diff --git a/usr/src/cmd/sgs/libldmake/Makefile.com b/usr/src/cmd/sgs/libldmake/Makefile.com
index a8ab393d8a..6333a7b795 100644
--- a/usr/src/cmd/sgs/libldmake/Makefile.com
+++ b/usr/src/cmd/sgs/libldmake/Makefile.com
@@ -34,8 +34,6 @@ include $(SRC)/cmd/sgs/Makefile.com
ROOTLIBDIR= $(ROOT)/opt/SUNWonld/lib
ROOTLIBDIR64= $(ROOT)/opt/SUNWonld/lib/$(MACH64)
-SGSPROTO= ../../proto/$(MACH)
-
SRCDIR = ../common
DYNFLAGS += $(CC_USE_PROTO)
diff --git a/usr/src/cmd/sgs/libldstab/Makefile.targ b/usr/src/cmd/sgs/libldstab/Makefile.targ
index b20710f983..6b47b4ae2b 100644
--- a/usr/src/cmd/sgs/libldstab/Makefile.targ
+++ b/usr/src/cmd/sgs/libldstab/Makefile.targ
@@ -22,6 +22,8 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
+# Copyright 2016 RackTop Systems.
+#
pics/%.o: ../common/%.c
$(COMPILE.c) -o $@ $<
@@ -39,10 +41,9 @@ delete:
# Special target for native builds (ie. when we need to build a version of ld
# to build a version of ld :-).
-native: $(SGSPROTO)/$(DYNLIB)
+native: $(SGSLIBDIR)/$(DYNLIB)
-$(SGSPROTO)/$(DYNLIB): \
- pics .WAIT $$(PICS)
+$(SGSLIBDIR)/$(DYNLIB): pics .WAIT $$(PICS)
$(BUILD.SO)
$(POST_PROCESS_SO)
diff --git a/usr/src/cmd/sgs/librtld/Makefile.targ b/usr/src/cmd/sgs/librtld/Makefile.targ
index 134daba96a..71a73947c8 100644
--- a/usr/src/cmd/sgs/librtld/Makefile.targ
+++ b/usr/src/cmd/sgs/librtld/Makefile.targ
@@ -22,7 +22,7 @@
# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
+# Copyright 2016 RackTop Systems.
#
pics/%.o: ../common/%.c
@@ -38,14 +38,15 @@ delete:
# Special target for native builds (ie. when we need to build a version of ld
# to build a version of ld :-).
-native: $(SGSPROTO)/$(DYNLIB)
+native: $(SGSLIBDIR)/$(LIBLINKS)
-$(SGSPROTO)/$(DYNLIB): \
- pics .WAIT $$(PICS)
+$(SGSLIBDIR)/$(DYNLIB): pics .WAIT $$(PICS)
$(BUILD.SO)
$(POST_PROCESS_SO)
- -@$(RM) $(SGSPROTO)/$(LIBLINKS)
- $(SYMLINK) $(DYNLIB) $(SGSPROTO)/$(LIBLINKS)
+
+$(SGSLIBDIR)/$(LIBLINKS): $(SGSLIBDIR)/$(DYNLIB)
+ -@$(RM) $(SGSLIBDIR)/$(LIBLINKS)
+ $(SYMLINK) $(DYNLIB) $(SGSLIBDIR)/$(LIBLINKS)
include $(SRC)/lib/Makefile.targ
include $(SRC)/cmd/sgs/Makefile.targ
diff --git a/usr/src/cmd/sgs/link_audit/Makefile.com b/usr/src/cmd/sgs/link_audit/Makefile.com
index c15aa1e204..b5de5f5be8 100644
--- a/usr/src/cmd/sgs/link_audit/Makefile.com
+++ b/usr/src/cmd/sgs/link_audit/Makefile.com
@@ -28,8 +28,6 @@ include ../../Makefile.com
NO_ASM_WARN= -erroff=E_ASM_DISABLES_OPTIMIZATION
-SGSPROTO= ../../proto/$(MACH)
-
TRUSSLIB= truss.so.1
TRUSSSRC= truss.c
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 95985a6c48..bd9c5c2408 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -2569,10 +2569,21 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
if (!dump_opt['L']) {
vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * We are going to be changing the meaning of the metaslab's
+ * ms_tree. Ensure that the allocator doesn't try to
+ * use the tree.
+ */
+ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+ spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
+ metaslab_group_t *mg = vd->vdev_mg;
for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT3P(msp->ms_group, ==, mg);
mutex_enter(&msp->ms_lock);
metaslab_unload(msp);
@@ -2593,8 +2604,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
(longlong_t)m,
(longlong_t)vd->vdev_ms_count);
- msp->ms_ops = &zdb_metaslab_ops;
-
/*
* We don't want to spend the CPU
* manipulating the size-ordered
@@ -2604,7 +2613,10 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
msp->ms_tree->rt_ops = NULL;
VERIFY0(space_map_load(msp->ms_sm,
msp->ms_tree, SM_ALLOC));
- msp->ms_loaded = B_TRUE;
+
+ if (!msp->ms_loaded) {
+ msp->ms_loaded = B_TRUE;
+ }
}
mutex_exit(&msp->ms_lock);
}
@@ -2626,8 +2638,10 @@ zdb_leak_fini(spa_t *spa)
vdev_t *rvd = spa->spa_root_vdev;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
+ metaslab_group_t *mg = vd->vdev_mg;
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT3P(mg, ==, msp->ms_group);
mutex_enter(&msp->ms_lock);
/*
@@ -2641,7 +2655,10 @@ zdb_leak_fini(spa_t *spa)
* from the ms_tree.
*/
range_tree_vacate(msp->ms_tree, zdb_leak, vd);
- msp->ms_loaded = B_FALSE;
+
+ if (msp->ms_loaded) {
+ msp->ms_loaded = B_FALSE;
+ }
mutex_exit(&msp->ms_lock);
}
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c
index 7d9153f9fe..0132ab81bb 100644
--- a/usr/src/cmd/zfs/zfs_main.c
+++ b/usr/src/cmd/zfs/zfs_main.c
@@ -34,6 +34,7 @@
#include <assert.h>
#include <ctype.h>
#include <errno.h>
+#include <getopt.h>
#include <libgen.h>
#include <libintl.h>
#include <libuutil.h>
@@ -262,7 +263,7 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
- return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
+ return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] "
"<snapshot>\n"
"\tsend [-Le] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"
@@ -3784,8 +3785,23 @@ zfs_do_send(int argc, char **argv)
nvlist_t *dbgnv = NULL;
boolean_t extraverbose = B_FALSE;
+ struct option long_options[] = {
+ {"replicate", no_argument, NULL, 'R'},
+ {"props", no_argument, NULL, 'p'},
+ {"parsable", no_argument, NULL, 'P'},
+ {"dedup", no_argument, NULL, 'D'},
+ {"verbose", no_argument, NULL, 'v'},
+ {"dryrun", no_argument, NULL, 'n'},
+ {"large-block", no_argument, NULL, 'L'},
+ {"embed", no_argument, NULL, 'e'},
+ {"resume", required_argument, NULL, 't'},
+ {"compressed", no_argument, NULL, 'c'},
+ {0, 0, 0, 0}
+ };
+
/* check options */
- while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) {
+ while ((c = getopt_long(argc, argv, ":i:I:RbDpvnPLet:c", long_options,
+ NULL)) != -1) {
switch (c) {
case 'i':
if (fromname)
@@ -3829,12 +3845,17 @@ zfs_do_send(int argc, char **argv)
case 't':
resume_token = optarg;
break;
+ case 'c':
+ flags.compress = B_TRUE;
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
+ /*FALLTHROUGH*/
+ default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
@@ -3905,6 +3926,8 @@ zfs_do_send(int argc, char **argv)
lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags.embed_data)
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (flags.compress)
+ lzc_flags |= LZC_SEND_FLAG_COMPRESS;
if (fromname != NULL &&
(fromname[0] == '#' || fromname[0] == '@')) {
diff --git a/usr/src/cmd/zstreamdump/zstreamdump.c b/usr/src/cmd/zstreamdump/zstreamdump.c
index 3b390a4663..17adbecd79 100644
--- a/usr/src/cmd/zstreamdump/zstreamdump.c
+++ b/usr/src/cmd/zstreamdump/zstreamdump.c
@@ -25,8 +25,8 @@
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <ctype.h>
@@ -39,6 +39,7 @@
#include <sys/dmu.h>
#include <sys/zfs_ioctl.h>
+#include <sys/zio.h>
#include <zfs_fletcher.h>
/*
@@ -251,6 +252,7 @@ main(int argc, char *argv[])
(void) fprintf(stderr, "invalid option '%c'\n",
optopt);
usage();
+ break;
}
}
@@ -452,38 +454,50 @@ main(int argc, char *argv[])
drrw->drr_object = BSWAP_64(drrw->drr_object);
drrw->drr_type = BSWAP_32(drrw->drr_type);
drrw->drr_offset = BSWAP_64(drrw->drr_offset);
- drrw->drr_length = BSWAP_64(drrw->drr_length);
+ drrw->drr_logical_size =
+ BSWAP_64(drrw->drr_logical_size);
drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
drrw->drr_key.ddk_prop =
BSWAP_64(drrw->drr_key.ddk_prop);
+ drrw->drr_compressed_size =
+ BSWAP_64(drrw->drr_compressed_size);
}
+
+ uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+
/*
* If this is verbose and/or dump output,
* print info on the modified block
*/
if (verbose) {
(void) printf("WRITE object = %llu type = %u "
- "checksum type = %u\n"
- " offset = %llu length = %llu "
+ "checksum type = %u compression type = %u\n"
+ " offset = %llu logical_size = %llu "
+ "compressed_size = %llu "
+ "payload_size = %llu "
"props = %llx\n",
(u_longlong_t)drrw->drr_object,
drrw->drr_type,
drrw->drr_checksumtype,
+ drrw->drr_compressiontype,
(u_longlong_t)drrw->drr_offset,
- (u_longlong_t)drrw->drr_length,
+ (u_longlong_t)drrw->drr_logical_size,
+ (u_longlong_t)drrw->drr_compressed_size,
+ (u_longlong_t)payload_size,
(u_longlong_t)drrw->drr_key.ddk_prop);
}
+
/*
* Read the contents of the block in from STDIN to buf
*/
- (void) ssread(buf, drrw->drr_length, &zc);
+ (void) ssread(buf, payload_size, &zc);
/*
* If in dump mode
*/
if (dump) {
- print_block(buf, drrw->drr_length);
+ print_block(buf, payload_size);
}
- total_write_size += drrw->drr_length;
+ total_write_size += payload_size;
break;
case DRR_WRITE_BYREF:
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index ae0fd4d958..75a3d5245f 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -171,7 +171,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_mirrors = 2,
.zo_raidz = 4,
.zo_raidz_parity = 1,
- .zo_vdev_size = SPA_MINDEVSIZE * 2,
+ .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
.zo_datasets = 7,
.zo_threads = 23,
.zo_passtime = 60, /* 60 seconds */
diff --git a/usr/src/common/bzip2/bzlib.c b/usr/src/common/bzip2/bzlib.c
index be878f92c4..22e2d8826e 100644
--- a/usr/src/common/bzip2/bzlib.c
+++ b/usr/src/common/bzip2/bzlib.c
@@ -30,6 +30,7 @@
#include "bzlib_private.h"
+#ifndef BZ_NO_COMPRESS
/*---------------------------------------------------*/
/*--- Compression stuff ---*/
@@ -85,6 +86,7 @@ void BZ2_bz__AssertH__fail ( int errcode )
}
#endif
+#endif /* BZ_NO_COMPRESS */
/*---------------------------------------------------*/
static
@@ -193,6 +195,7 @@ void default_bzfree ( void* opaque, void* addr )
#endif /* _KERNEL */
/*---------------------------------------------------*/
+#ifndef BZ_NO_COMPRESS
static
void prepare_new_block ( EState* s )
{
@@ -346,34 +349,6 @@ int BZ_API(BZ2_bzCompressReset) ( bz_stream *strm )
return BZ_OK;
}
-int BZ_API(BZ2_bzDecompressReset) ( bz_stream* strm )
-{
- DState* s = strm->state;
-
- if (!bz_config_ok()) return BZ_CONFIG_ERROR;
-
- if (strm == NULL) return BZ_PARAM_ERROR;
-
- s->strm = strm;
-
- s->state = BZ_X_MAGIC_1;
- s->bsLive = 0;
- s->bsBuff = 0;
- s->calculatedCombinedCRC = 0;
- strm->total_in_lo32 = 0;
- strm->total_in_hi32 = 0;
- strm->total_out_lo32 = 0;
- strm->total_out_hi32 = 0;
-
- s->ll4 = NULL;
- s->ll16 = NULL;
- s->tt = NULL;
- s->currBlockNo = 0;
-
-
- return BZ_OK;
-}
-
/*---------------------------------------------------*/
static
@@ -647,6 +622,7 @@ int BZ_API(BZ2_bzCompressEnd) ( bz_stream *strm )
return BZ_OK;
}
+#endif /* BZ_NO_COMPRESS */
/*---------------------------------------------------*/
/*--- Decompression stuff ---*/
@@ -691,6 +667,38 @@ int BZ_API(BZ2_bzDecompressInit)
return BZ_OK;
}
+/*---------------------------------------------------*/
+/*
+ * added to allow reuse of bz_stream without malloc/free
+ */
+int BZ_API(BZ2_bzDecompressReset) ( bz_stream* strm )
+{
+ DState* s = strm->state;
+
+ if (!bz_config_ok()) return BZ_CONFIG_ERROR;
+
+ if (strm == NULL) return BZ_PARAM_ERROR;
+
+ s->strm = strm;
+
+ s->state = BZ_X_MAGIC_1;
+ s->bsLive = 0;
+ s->bsBuff = 0;
+ s->calculatedCombinedCRC = 0;
+ strm->total_in_lo32 = 0;
+ strm->total_in_hi32 = 0;
+ strm->total_out_lo32 = 0;
+ strm->total_out_hi32 = 0;
+
+ s->ll4 = NULL;
+ s->ll16 = NULL;
+ s->tt = NULL;
+ s->currBlockNo = 0;
+
+
+ return BZ_OK;
+}
+
/*---------------------------------------------------*/
/* Return True iff data corruption is discovered.
@@ -1043,6 +1051,7 @@ int BZ_API(BZ2_bzDecompressEnd) ( bz_stream *strm )
return BZ_OK;
}
+#ifndef BZ_NO_COMPRESS
#ifndef BZ_NO_STDIO
/*---------------------------------------------------*/
@@ -1732,6 +1741,7 @@ const char * BZ_API(BZ2_bzerror) (BZFILE *b, int *errnum)
}
#endif
+#endif /* BZ_NO_COMPRESS */
/*-------------------------------------------------------------*/
/*--- end bzlib.c ---*/
diff --git a/usr/src/lib/brand/lx/lx_brand/Makefile.com b/usr/src/lib/brand/lx/lx_brand/Makefile.com
index 53f5246834..262356884f 100644
--- a/usr/src/lib/brand/lx/lx_brand/Makefile.com
+++ b/usr/src/lib/brand/lx/lx_brand/Makefile.com
@@ -43,7 +43,6 @@ COBJS = aio.o \
module.o \
mount.o \
mount_nfs.o \
- priority.o \
ptrace.o \
sendfile.o \
signal.o \
diff --git a/usr/src/lib/brand/lx/lx_brand/common/clock.c b/usr/src/lib/brand/lx/lx_brand/common/clock.c
index 4c7458e051..e627df68dc 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/clock.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/clock.c
@@ -26,12 +26,9 @@
*/
#include <errno.h>
-#include <stdlib.h>
#include <string.h>
#include <time.h>
-#include <unistd.h>
#include <sys/resource.h>
-#include <sys/syscall.h>
#include <sys/timerfd.h>
#include <sys/lx_misc.h>
#include <sys/lx_syscall.h>
@@ -84,34 +81,6 @@ static int ltos_clock[] = {
#define LX_CLOCK_MAX (sizeof (ltos_clock) / sizeof (ltos_clock[0]))
-#define LX_SIGEV_PAD_SIZE ((64 - \
- (sizeof (int) * 2 + sizeof (union sigval))) / sizeof (int))
-
-typedef struct {
- union sigval lx_sigev_value; /* same layout for both */
- int lx_sigev_signo;
- int lx_sigev_notify;
- union {
- int lx_pad[LX_SIGEV_PAD_SIZE];
- int lx_tid;
- struct {
- void (*lx_notify_function)(union sigval);
- void *lx_notify_attribute;
- } lx_sigev_thread;
- } lx_sigev_un;
-} lx_sigevent_t;
-
-/* sigevent sigev_notify conversion table */
-static int ltos_sigev[] = {
- SIGEV_SIGNAL,
- SIGEV_NONE,
- SIGEV_THREAD,
- 0, /* Linux skips event 3 */
- SIGEV_THREAD /* Linux SIGEV_THREAD_ID -- see lx_sigev_thread_id() */
-};
-
-#define LX_SIGEV_MAX (sizeof (ltos_sigev) / sizeof (ltos_sigev[0]))
-#define LX_SIGEV_THREAD_ID 4
long
lx_clock_nanosleep(int clock, int flags, struct timespec *rqtp,
@@ -157,127 +126,6 @@ lx_adjtimex(void *tp)
return (-EPERM);
}
-/*
- * Notification function for use with native SIGEV_THREAD in order to
- * emulate Linux SIGEV_THREAD_ID. Native SIGEV_THREAD is used as the
- * timer mechanism and B_SIGEV_THREAD_ID performs the actual event
- * delivery to the appropriate lx tid.
- */
-static void
-lx_sigev_thread_id(union sigval sival)
-{
- lx_sigevent_t *lev = (lx_sigevent_t *)sival.sival_ptr;
- (void) syscall(SYS_brand, B_SIGEV_THREAD_ID, lev->lx_sigev_un.lx_tid,
- lev->lx_sigev_signo, lev->lx_sigev_value.sival_ptr);
- free(lev);
-}
-
-
-/*
- * The illumos timer_create man page says it accepts the following clocks:
- * CLOCK_REALTIME (3) wall clock
- * CLOCK_VIRTUAL (1) user CPU usage clock - No Backend
- * CLOCK_PROF (2) user and system CPU usage clock - No Backend
- * CLOCK_HIGHRES (4) non-adjustable, high-resolution clock
- * However, in reality the illumos timer_create only accepts CLOCK_REALTIME
- * and CLOCK_HIGHRES.
- *
- * Linux has complicated support for clock IDs. For example, the
- * clock_getcpuclockid() function can return a negative clock_id. See the Linux
- * source and the comment in include/linux/posix-timers.h (above CLOCKFD) which
- * describes clock file descriptors and shows how they map to a virt. or sched.
- * clock ID. A process can pass one of these negative IDs to timer_create so we
- * need to convert it and we currently only allow CLOCK_PROCESS_CPUTIME_ID
- * against the current process as the input.
- */
-long
-lx_timer_create(int clock, struct sigevent *lx_sevp, timer_t *tid)
-{
- lx_sigevent_t lev;
- struct sigevent sev;
-
- if (clock < 0) {
- if (clock != 0xfffffffe)
- return (-EINVAL);
- clock = CLOCK_RT_SLOT; /* force our use of CLOCK_REALTIME */
- }
-
- if (clock >= LX_CLOCK_MAX)
- return (-EINVAL);
-
- /* We have to convert the Linux sigevent layout to the illumos layout */
- if (uucopy(lx_sevp, &lev, sizeof (lev)) < 0)
- return (-EFAULT);
-
- if (lev.lx_sigev_notify < 0 || lev.lx_sigev_notify > LX_SIGEV_MAX)
- return (-EINVAL);
-
- sev.sigev_notify = ltos_sigev[lev.lx_sigev_notify];
- sev.sigev_signo = lx_ltos_signo(lev.lx_sigev_signo, 0);
- sev.sigev_value = lev.lx_sigev_value;
-
- /*
- * The signal number is meaningless in SIGEV_NONE, Linux
- * accepts any value. We convert invalid signals to 0 so other
- * parts of lx signal handling don't break.
- */
- if ((sev.sigev_notify != SIGEV_NONE) && (sev.sigev_signo == 0))
- return (-EINVAL);
-
- /*
- * Assume all Linux libc implementations map SIGEV_THREAD to
- * SIGEV_THREAD_ID and ignore passed-in attributes.
- */
- sev.sigev_notify_attributes = NULL;
-
- if (lev.lx_sigev_notify == LX_SIGEV_THREAD_ID) {
- pid_t caller_pid = getpid();
- pid_t target_pid;
- lwpid_t ignore;
- lx_sigevent_t *lev_copy;
-
- if (lx_lpid_to_spair(lev.lx_sigev_un.lx_tid,
- &target_pid, &ignore) != 0)
- return (-EINVAL);
-
- /*
- * The caller of SIGEV_THREAD_ID must be in the same
- * process as the target thread.
- */
- if (caller_pid != target_pid)
- return (-EINVAL);
-
- /*
- * Pass the original lx sigevent_t to the native
- * notify function so that it may pass it to the lx
- * helper thread. It is the responsibility of
- * lx_sigev_thread_id() to free lev_copy after the
- * information is relayed to lx.
- *
- * If the calling process is forked without an exec
- * after this copy but before the timer fires then
- * lev_copy will leak in the child. This is acceptable
- * given the rarity of this event, the miniscule
- * amount leaked, and the fact that the memory is
- * reclaimed when the proc dies. It is firmly in the
- * land of "good enough".
- */
- lev_copy = malloc(sizeof (lx_sigevent_t));
- if (lev_copy == NULL)
- return (-ENOMEM);
-
- if (uucopy(&lev, lev_copy, sizeof (lx_sigevent_t)) < 0) {
- free(lev_copy);
- return (-EFAULT);
- }
-
- sev.sigev_notify_function = lx_sigev_thread_id;
- sev.sigev_value.sival_ptr = lev_copy;
- }
-
- return ((timer_create(ltos_clock[clock], &sev, tid) < 0) ? -errno : 0);
-}
-
long
lx_timer_settime(timer_t tid, int flags, struct itimerspec *new_val,
struct itimerspec *old_val)
diff --git a/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c b/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c
index 5724b6cbba..c027cfed5e 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/lx_brand.c
@@ -1151,8 +1151,8 @@ static lx_syscall_handler_t lx_handlers[] = {
lx_statfs, /* 137: statfs */
lx_fstatfs, /* 138: fstatfs */
lx_sysfs, /* 139: sysfs */
- lx_getpriority, /* 140: getpriority */
- lx_setpriority, /* 141: setpriority */
+ NULL, /* 140: getpriority */
+ NULL, /* 141: setpriority */
NULL, /* 142: sched_setparam */
NULL, /* 143: sched_getparam */
NULL, /* 144: sched_setscheduler */
@@ -1233,7 +1233,7 @@ static lx_syscall_handler_t lx_handlers[] = {
NULL, /* 219: restart_syscall */
lx_semtimedop, /* 220: semtimedop */
NULL, /* 221: fadvise64 */
- lx_timer_create, /* 222: timer_create */
+ NULL, /* 222: timer_create */
lx_timer_settime, /* 223: timer_settime */
lx_timer_gettime, /* 224: timer_gettime */
lx_timer_getoverrun, /* 225: timer_getoverrun */
@@ -1438,8 +1438,8 @@ static lx_syscall_handler_t lx_handlers[] = {
lx_ftruncate, /* 93: ftruncate */
NULL, /* 94: fchmod */
NULL, /* 95: fchown16 */
- lx_getpriority, /* 96: getpriority */
- lx_setpriority, /* 97: setpriority */
+ NULL, /* 96: getpriority */
+ NULL, /* 97: setpriority */
NULL, /* 98: profil */
lx_statfs, /* 99: statfs */
lx_fstatfs, /* 100: fstatfs */
@@ -1601,7 +1601,7 @@ static lx_syscall_handler_t lx_handlers[] = {
NULL, /* 256: epoll_wait */
NULL, /* 257: remap_file_pages */
NULL, /* 258: set_tid_address */
- lx_timer_create, /* 259: timer_create */
+ NULL, /* 259: timer_create */
lx_timer_settime, /* 260: timer_settime */
lx_timer_gettime, /* 261: timer_gettime */
lx_timer_getoverrun, /* 262: timer_getoverrun */
diff --git a/usr/src/lib/brand/lx/lx_brand/common/priority.c b/usr/src/lib/brand/lx/lx_brand/common/priority.c
deleted file mode 100644
index 5974abe40e..0000000000
--- a/usr/src/lib/brand/lx/lx_brand/common/priority.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- * Copyright 2015 Joyent, Inc. All rights reserved.
- */
-
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/lx_debug.h>
-#include <sys/lx_misc.h>
-#include <sys/lx_syscall.h>
-#include <sys/lx_types.h>
-#include <sys/resource.h>
-#include <sys/lx_misc.h>
-#include <sched.h>
-
-/*
- * The Linux syscall returns priorities in the range (lowest) 40-1 (highest)
- * and then glibc adjusts these to the range -20 - 19.
- */
-long
-lx_getpriority(uintptr_t p1, uintptr_t p2)
-{
- int which = (int)p1;
- id_t who = (id_t)p2;
- int ret;
-
- /*
- * The only valid values for 'which' are positive integers, and unlike
- * Solaris, linux doesn't support anything past PRIO_USER.
- */
- if (which < 0 || which > PRIO_USER)
- return (-EINVAL);
-
- lx_debug("\tgetpriority(%d, %d)", which, who);
-
- errno = 0;
-
- if ((which == PRIO_PROCESS) && (who == 1))
- who = zoneinit_pid;
-
- ret = getpriority(which, who);
- if (ret == -1 && errno != 0) {
- pid_t mypid = getpid();
-
- if (which == PRIO_PROCESS &&
- (who == mypid || who == 0 || who == P_MYID) &&
- sched_getscheduler(mypid) == SCHED_RR) {
- /*
- * The getpriority kernel handling will always return
- * an error if we're in the RT class. The zone itself
- * won't be able to put itself or any of its processes
- * into RT but if we put the whole zone into RT via
- * the scheduling-class property, then getpriority will
- * always fail. This breaks pam and prevents any login.
- * Just pretend to be the highest priority.
- */
- return (1);
- }
-
- /*
- * Linux does not return EINVAL for invalid 'who' values, it
- * returns ESRCH instead. We already validated 'which' above.
- */
- if (errno == EINVAL)
- errno = ESRCH;
- return (-errno);
- }
-
- /*
- * The return value of the getpriority syscall is biased by 20 to avoid
- * returning negative values when successful.
- */
- return (20 - ret);
-}
-
-long
-lx_setpriority(uintptr_t p1, uintptr_t p2, uintptr_t p3)
-{
- int which = (int)p1;
- id_t who = (id_t)p2;
- int prio = (int)p3;
- int rval;
-
- if (which > PRIO_USER)
- return (-EINVAL);
-
- lx_debug("\tsetpriority(%d, %d, %d)", which, who, prio);
-
- if ((which == PRIO_PROCESS) && (who == 1))
- who = zoneinit_pid;
-
- rval = setpriority(which, who, prio);
-
- return ((rval == -1) ? -errno : rval);
-}
diff --git a/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h b/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h
index 64e1ca6ab8..e26ff7333c 100644
--- a/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h
+++ b/usr/src/lib/brand/lx/lx_brand/sys/lx_syscall.h
@@ -71,7 +71,6 @@ extern long lx_capset(uintptr_t, uintptr_t);
extern long lx_clock_nanosleep(int, int flags, struct timespec *,
struct timespec *);
extern long lx_adjtimex(void *);
-extern long lx_timer_create(int, struct sigevent *, timer_t *);
extern long lx_timer_settime(timer_t, int, struct itimerspec *,
struct itimerspec *);
extern long lx_timer_gettime(timer_t, struct itimerspec *);
@@ -157,9 +156,6 @@ extern long lx_fork(void);
extern long lx_vfork(void);
extern long lx_exec(uintptr_t, uintptr_t, uintptr_t);
-extern long lx_getpriority(uintptr_t, uintptr_t);
-extern long lx_setpriority(uintptr_t, uintptr_t, uintptr_t);
-
extern long lx_ptrace(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
extern long lx_xattr2(uintptr_t, uintptr_t);
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index 524809a9e7..5b823c9525 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -602,6 +602,9 @@ typedef struct sendflags {
/* WRITE_EMBEDDED records of type DATA are permitted */
boolean_t embed_data;
+
+ /* compressed WRITE records are permitted */
+ boolean_t compress;
} sendflags_t;
typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c
index bbb0acaa3d..8f56c4042b 100644
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c
@@ -2134,9 +2134,12 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
/*
* If we tried to use a default value for a
* readonly property, it means that it was not
- * present.
+ * present. Note this only applies to "truly"
+ * readonly properties, not set-once properties
+ * like volblocksize.
*/
if (zfs_prop_readonly(prop) &&
+ !zfs_prop_setonce(prop) &&
*source != NULL && (*source)[0] == '\0') {
*source = NULL;
return (-1);
diff --git a/usr/src/lib/libzfs/common/libzfs_sendrecv.c b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
index eab6d4bacb..b8db4c9c8e 100644
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
@@ -347,8 +347,10 @@ cksummer(void *arg)
{
struct drr_write *drrw = &drr->drr_u.drr_write;
dataref_t dataref;
+ uint64_t payload_size;
- (void) ssread(buf, drrw->drr_length, ofp);
+ payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ (void) ssread(buf, payload_size, ofp);
/*
* Use the existing checksum if it's dedup-capable,
@@ -362,7 +364,7 @@ cksummer(void *arg)
zio_cksum_t tmpsha256;
SHA256Init(&ctx);
- SHA256Update(&ctx, buf, drrw->drr_length);
+ SHA256Update(&ctx, buf, payload_size);
SHA256Final(&tmpsha256, &ctx);
drrw->drr_key.ddk_cksum.zc_word[0] =
BE_64(tmpsha256.zc_word[0]);
@@ -392,7 +394,7 @@ cksummer(void *arg)
wbr_drrr->drr_object = drrw->drr_object;
wbr_drrr->drr_offset = drrw->drr_offset;
- wbr_drrr->drr_length = drrw->drr_length;
+ wbr_drrr->drr_length = drrw->drr_logical_size;
wbr_drrr->drr_toguid = drrw->drr_toguid;
wbr_drrr->drr_refguid = dataref.ref_guid;
wbr_drrr->drr_refobject =
@@ -414,7 +416,7 @@ cksummer(void *arg)
goto out;
} else {
/* block not previously seen */
- if (dump_record(drr, buf, drrw->drr_length,
+ if (dump_record(drr, buf, payload_size,
&stream_cksum, outfd) != 0)
goto out;
}
@@ -917,7 +919,7 @@ typedef struct send_dump_data {
uint64_t prevsnap_obj;
boolean_t seenfrom, seento, replicate, doall, fromorigin;
boolean_t verbose, dryrun, parsable, progress, embed_data, std_out;
- boolean_t large_block;
+ boolean_t large_block, compress;
int outfd;
boolean_t err;
nvlist_t *fss;
@@ -933,7 +935,7 @@ typedef struct send_dump_data {
static int
estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
- boolean_t fromorigin, uint64_t *sizep)
+ boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -946,6 +948,7 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
zc.zc_fromobj = fromsnap_obj;
zc.zc_guid = 1; /* estimate flag */
+ zc.zc_flags = flags;
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) {
char errbuf[1024];
@@ -1184,6 +1187,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
progress_arg_t pa = { 0 };
pthread_t tid;
char *thissnap;
+ enum lzc_send_flags flags = 0;
int err;
boolean_t isfromsnap, istosnap, fromorigin;
boolean_t exclude = B_FALSE;
@@ -1212,6 +1216,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
if (istosnap)
sdd->seento = B_TRUE;
+ if (sdd->large_block)
+ flags |= LZC_SEND_FLAG_LARGE_BLOCK;
+ if (sdd->embed_data)
+ flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (sdd->compress)
+ flags |= LZC_SEND_FLAG_COMPRESS;
+
if (!sdd->doall && !isfromsnap && !istosnap) {
if (sdd->replicate) {
char *snapname;
@@ -1258,7 +1269,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
if (sdd->verbose) {
uint64_t size = 0;
(void) estimate_ioctl(zhp, sdd->prevsnap_obj,
- fromorigin, &size);
+ fromorigin, flags, &size);
send_print_verbose(fout, zhp->zfs_name,
sdd->prevsnap[0] ? sdd->prevsnap : NULL,
@@ -1283,12 +1294,6 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
}
}
- enum lzc_send_flags flags = 0;
- if (sdd->large_block)
- flags |= LZC_SEND_FLAG_LARGE_BLOCK;
- if (sdd->embed_data)
- flags |= LZC_SEND_FLAG_EMBED_DATA;
-
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
fromorigin, sdd->outfd, flags, sdd->debugnv);
@@ -1594,8 +1599,12 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
fromguid = 0;
(void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);
+ if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok"))
+ lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags->embed_data || nvlist_exists(resume_nvl, "embedok"))
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (flags->compress || nvlist_exists(resume_nvl, "compressok"))
+ lzc_flags |= LZC_SEND_FLAG_COMPRESS;
if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) {
if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) {
@@ -1628,7 +1637,8 @@ zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
if (flags->verbose) {
uint64_t size = 0;
- error = lzc_send_space(zhp->zfs_name, fromname, &size);
+ error = lzc_send_space(zhp->zfs_name, fromname,
+ lzc_flags, &size);
if (error == 0)
size = MAX(0, (int64_t)(size - bytes));
send_print_verbose(stderr, zhp->zfs_name, fromname,
@@ -1856,6 +1866,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
sdd.dryrun = flags->dryrun;
sdd.large_block = flags->largeblock;
sdd.embed_data = flags->embed_data;
+ sdd.compress = flags->compress;
sdd.filter_cb = filter_func;
sdd.filter_cb_arg = cb_arg;
if (debugnvp)
@@ -2921,11 +2932,17 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
case DRR_WRITE:
if (byteswap) {
- drr->drr_u.drr_write.drr_length =
- BSWAP_64(drr->drr_u.drr_write.drr_length);
+ drr->drr_u.drr_write.drr_logical_size =
+ BSWAP_64(
+ drr->drr_u.drr_write.drr_logical_size);
+ drr->drr_u.drr_write.drr_compressed_size =
+ BSWAP_64(
+ drr->drr_u.drr_write.drr_compressed_size);
}
+ uint64_t payload_size =
+ DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write);
(void) recv_read(hdl, fd, buf,
- drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
+ payload_size, B_FALSE, NULL);
break;
case DRR_SPILL:
if (byteswap) {
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.c b/usr/src/lib/libzfs_core/common/libzfs_core.c
index cc5e2a781b..7e7891798d 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.c
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.c
@@ -487,6 +487,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd,
fnvlist_add_boolean(args, "largeblockok");
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
+ if (flags & LZC_SEND_FLAG_COMPRESS)
+ fnvlist_add_boolean(args, "compressok");
if (resumeobj != 0 || resumeoff != 0) {
fnvlist_add_uint64(args, "resume_object", resumeobj);
fnvlist_add_uint64(args, "resume_offset", resumeoff);
@@ -512,7 +514,8 @@ lzc_send_resume(const char *snapname, const char *from, int fd,
* an equivalent snapshot.
*/
int
-lzc_send_space(const char *snapname, const char *from, uint64_t *spacep)
+lzc_send_space(const char *snapname, const char *from,
+ enum lzc_send_flags flags, uint64_t *spacep)
{
nvlist_t *args;
nvlist_t *result;
@@ -521,6 +524,12 @@ lzc_send_space(const char *snapname, const char *from, uint64_t *spacep)
args = fnvlist_alloc();
if (from != NULL)
fnvlist_add_string(args, "from", from);
+ if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+ fnvlist_add_boolean(args, "largeblockok");
+ if (flags & LZC_SEND_FLAG_EMBED_DATA)
+ fnvlist_add_boolean(args, "embedok");
+ if (flags & LZC_SEND_FLAG_COMPRESS)
+ fnvlist_add_boolean(args, "compressok");
err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
nvlist_free(args);
if (err == 0)
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.h b/usr/src/lib/libzfs_core/common/libzfs_core.h
index 6b4575ddeb..094fa257e4 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.h
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.h
@@ -62,13 +62,14 @@ int lzc_get_holds(const char *, nvlist_t **);
enum lzc_send_flags {
LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
- LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
+ LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1,
+ LZC_SEND_FLAG_COMPRESS = 1 << 2
};
int lzc_send(const char *, const char *, int, enum lzc_send_flags);
int lzc_send_resume(const char *, const char *, int,
enum lzc_send_flags, uint64_t, uint64_t);
-int lzc_send_space(const char *, const char *, uint64_t *);
+int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *);
struct dmu_replay_record;
diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c
index 4160f1d7c9..2290164413 100644
--- a/usr/src/lib/libzpool/common/kernel.c
+++ b/usr/src/lib/libzpool/common/kernel.c
@@ -95,6 +95,11 @@ kstat_create(const char *module, int instance, const char *name,
/*ARGSUSED*/
void
+kstat_named_init(kstat_named_t *knp, const char *name, uchar_t type)
+{}
+
+/*ARGSUSED*/
+void
kstat_install(kstat_t *ksp)
{}
diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h
index c45923aad0..21853a2e70 100644
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h
@@ -303,6 +303,7 @@ extern void cv_broadcast(kcondvar_t *cv);
*/
extern kstat_t *kstat_create(const char *, int,
const char *, const char *, uchar_t, ulong_t, uchar_t);
+extern void kstat_named_init(kstat_named_t *, const char *, uchar_t);
extern void kstat_install(kstat_t *);
extern void kstat_delete(kstat_t *);
extern void kstat_waitq_enter(kstat_io_t *);
diff --git a/usr/src/man/man1m/zfs.1m b/usr/src/man/man1m/zfs.1m
index 8c78343cc4..ee49174cb4 100644
--- a/usr/src/man/man1m/zfs.1m
+++ b/usr/src/man/man1m/zfs.1m
@@ -165,12 +165,12 @@
.Ar snapshot bookmark
.Nm
.Cm send
-.Op Fl DLPRenpv
+.Op Fl DLPRcenpv
.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot
.Ar snapshot
.Nm
.Cm send
-.Op Fl Le
+.Op Fl Lce
.Op Fl i Ar snapshot Ns | Ns Ar bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Nm
@@ -2451,7 +2451,7 @@ feature.
.It Xo
.Nm
.Cm send
-.Op Fl DLPRenpv
+.Op Fl DLPRcenpv
.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot
.Ar snapshot
.Xc
@@ -2464,7 +2464,7 @@ to a different system
.Pc .
By default, a full stream is generated.
.Bl -tag -width "-D"
-.It Fl D
+.It Fl D, -dedup
Generate a deduplicated stream. Blocks which would have been sent multiple times
in the send stream will only be sent once. The receiving system must also
support this feature to receive a deduplicated stream. This flag can be used
@@ -2484,7 +2484,7 @@ is similar to
The incremental source may be specified as with the
.Fl i
option.
-.It Fl L
+.It Fl L, -large-block
Generate a stream which may contain blocks larger than 128KB. This flag has no
effect if the
.Sy large_blocks
@@ -2498,9 +2498,9 @@ pool feature enabled as well. See
for details on ZFS feature flags and the
.Sy large_blocks
feature.
-.It Fl P
+.It Fl P, -parsable
Print machine-parsable verbose information about the stream package generated.
-.It Fl R
+.It Fl R, -replicate
Generate a replication stream package, which will replicate the specified
file system, and all descendent file systems, up to the named snapshot. When
received, all properties, snapshots, descendent file systems, and clones are
@@ -2518,7 +2518,7 @@ is received. If the
.Fl F
flag is specified when this stream is received, snapshots and file systems that
do not exist on the sending side are destroyed.
-.It Fl e
+.It Fl e, -embed
Generate a more compact stream by using
.Sy WRITE_EMBEDDED
records for blocks which are stored more compactly on disk by the
@@ -2535,6 +2535,16 @@ that feature enabled as well. See
for details on ZFS feature flags and the
.Sy embedded_data
feature.
+.It Fl c, -compressed
+Generate a more compact stream by using compressed WRITE records for blocks
+which are compressed on disk and in memory (see the
+.Sy compression No property for details). If the Sy lz4_compress No feature
+is active on the sending system, then the receiving system must have that
+feature enabled as well. If the
+.Sy large_blocks No feature is enabled on the sending system but the Fl L
+option is not supplied in conjunction with
+.Fl c, No then the data will be decompressed before sending so it can be split
+into smaller block sizes.
.It Fl i Ar snapshot
Generate an incremental stream from the first
.Ar snapshot
@@ -2557,7 +2567,7 @@ be fully specified
not just
.Em @origin
.Pc .
-.It Fl n
+.It Fl n, -dryrun
Do a dry-run
.Pq Qq No-op
send. Do not generate any actual send data. This is useful in conjunction with
@@ -2570,11 +2580,11 @@ be written to standard output
.Po contrast with a non-dry-run, where the stream is written to standard output
and the verbose output goes to standard error
.Pc .
-.It Fl p
+.It Fl p, -props
Include the dataset's properties in the stream. This flag is implicit when
.Fl R
is specified. The receiving system must also support this feature.
-.It Fl v
+.It Fl v, -verbose
Print verbose information about the stream package generated. This information
includes a per-second report of how much data has been sent.
.Pp
@@ -2584,7 +2594,7 @@ on future versions of ZFS .
.It Xo
.Nm
.Cm send
-.Op Fl Le
+.Op Fl Lce
.Op Fl i Ar snapshot Ns | Ns Ar bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Xc
@@ -2594,7 +2604,7 @@ read-only, or the filesystem must not be mounted. When the stream generated from
a filesystem or volume is received, the default snapshot name will be
.Qq --head-- .
.Bl -tag -width "-L"
-.It Fl L
+.It Fl L, -large-block
Generate a stream which may contain blocks larger than 128KB. This flag has no
effect if the
.Sy large_blocks
@@ -2608,7 +2618,17 @@ pool feature enabled as well. See
for details on ZFS feature flags and the
.Sy large_blocks
feature.
-.It Fl e
+.It Fl c, -compressed
+Generate a more compact stream by using compressed WRITE records for blocks
+which are compressed on disk and in memory (see the
+.Sy compression No property for details). If the Sy lz4_compress No feature is
+active on the sending system, then the receiving system must have that feature
+enabled as well. If the
+.Sy large_blocks No feature is enabled on the sending system but the Fl L
+option is not supplied in conjunction with
+.Fl c, No then the data will be decompressed before sending so it can be split
+into smaller block sizes.
+.It Fl e, -embed
Generate a more compact stream by using
.Sy WRITE_EMBEDDED
records for blocks which are stored more compactly on disk by the
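For illustration only (not part of the patch): a minimal sketch of the new -c option described in the zfs.1m hunk above, assuming a dataset tank/fs with snapshots snap1 and snap2 and a receiving pool backup; all names are placeholders.

    # Estimate the compressed stream size without sending any data.
    zfs send -nvP -c tank/fs@snap2
    # Full compressed send; blocks are sent as they are compressed on disk.
    zfs send -c tank/fs@snap1 | zfs receive backup/fs
    # Compressed incremental send from snap1 to snap2.
    zfs send -c -i @snap1 tank/fs@snap2 | zfs receive backup/fs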
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf
index 1ce41fa420..6a0ad9b813 100644
--- a/usr/src/pkg/manifests/system-test-zfstest.mf
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf
@@ -2058,6 +2058,27 @@ file path=opt/zfs-tests/tests/functional/rsend/rsend_020_pos mode=0555
file path=opt/zfs-tests/tests/functional/rsend/rsend_021_pos mode=0555
file path=opt/zfs-tests/tests/functional/rsend/rsend_022_pos mode=0555
file path=opt/zfs-tests/tests/functional/rsend/rsend_024_pos mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-cD mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_embedded_blocks \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_incremental mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_lz4_disabled mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_mixed_compression \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_props mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_recv_dedup mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_resume mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_verify_contents \
+ mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_verify_ratio mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_volume mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-c_zstreamdump mode=0555
+file path=opt/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize \
+ mode=0555
file path=opt/zfs-tests/tests/functional/rsend/setup mode=0555
file path=opt/zfs-tests/tests/functional/scrub_mirror/cleanup mode=0555
file path=opt/zfs-tests/tests/functional/scrub_mirror/default.cfg mode=0555
@@ -2280,7 +2301,7 @@ file path=opt/zfs-tests/tests/perf/scripts/io.d mode=0555
file path=opt/zfs-tests/tests/perf/scripts/prefetch_io.d mode=0555
license cr_Sun license=cr_Sun
license lic_CDDL license=lic_CDDL
-#depend fmri=benchmark/fio type=require
depend fmri=system/file-system/zfs/tests type=require
+depend fmri=system/test/fio type=require
depend fmri=system/test/testrunner type=require
depend fmri=system/xopen/xcu4 type=require
diff --git a/usr/src/test/zfs-tests/include/commands.cfg b/usr/src/test/zfs-tests/include/commands.cfg
index bf60cd9565..a83c22c8c4 100644
--- a/usr/src/test/zfs-tests/include/commands.cfg
+++ b/usr/src/test/zfs-tests/include/commands.cfg
@@ -156,7 +156,8 @@ export USR_SBIN_FILES='arp
zhack
zinject
zoneadm
- zonecfg'
+ zonecfg
+ zstreamdump'
export SBIN_FILES='fdisk
mount
diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib
index 2957cf0808..d40c1aa39e 100644
--- a/usr/src/test/zfs-tests/include/libtest.shlib
+++ b/usr/src/test/zfs-tests/include/libtest.shlib
@@ -2527,3 +2527,48 @@ function get_min
echo $min
}
+
+#
+# Generate a random number between 1 and the argument.
+#
+function random
+{
+ typeset max=$1
+ echo $(( ($RANDOM % $max) + 1 ))
+}
+
+# Write data that can be compressed into a directory
+function write_compressible
+{
+ typeset dir=$1
+ typeset megs=$2
+ typeset nfiles=${3:-1}
+ typeset bs=${4:-1024k}
+ typeset fname=${5:-file}
+
+ [[ -d $dir ]] || log_fail "No directory: $dir"
+
+ log_must eval "fio \
+ --name=job \
+ --fallocate=0 \
+ --minimal \
+ --randrepeat=0 \
+ --buffer_compress_percentage=66 \
+ --buffer_compress_chunk=4096 \
+ --directory=$dir \
+ --numjobs=$nfiles \
+ --rw=write \
+ --bs=$bs \
+ --filesize=$megs \
+ --filename_format='$fname.\$jobnum' >/dev/null"
+}
+
+function get_objnum
+{
+ typeset pathname=$1
+ typeset objnum
+
+ [[ -e $pathname ]] || log_fail "No such file or directory: $pathname"
+ objnum=$(stat -c %i $pathname)
+ echo $objnum
+}
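A hedged usage sketch for the libtest.shlib helpers added above; $TESTDIR, the sizes, and the file names are placeholders, and fio must be available (see the new system/test/fio dependency).

    # Write two ~16MB files of partially compressible data into $TESTDIR.
    write_compressible $TESTDIR 16m 2
    # Record the object number of the first file for later zdb inspection.
    typeset obj=$(get_objnum $TESTDIR/file.0)
    # Pick a pseudo-random value between 1 and 4.
    typeset count=$(random 4)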
diff --git a/usr/src/test/zfs-tests/include/properties.shlib b/usr/src/test/zfs-tests/include/properties.shlib
index 2897e90c25..b1c1b0be44 100644
--- a/usr/src/test/zfs-tests/include/properties.shlib
+++ b/usr/src/test/zfs-tests/include/properties.shlib
@@ -13,10 +13,29 @@
# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
#
-typeset -a compress_props=('on' 'off' 'lzjb' 'gzip' 'gzip-1' 'gzip-2' 'gzip-3'
- 'gzip-4' 'gzip-5' 'gzip-6' 'gzip-7' 'gzip-8' 'gzip-9' 'zle')
+typeset -a compress_prop_vals=('on' 'off' 'lzjb' 'gzip' 'gzip-1' 'gzip-2'
+ 'gzip-3' 'gzip-4' 'gzip-5' 'gzip-6' 'gzip-7' 'gzip-8' 'gzip-9' 'zle' 'lz4')
+typeset -a checksum_prop_vals=('on' 'off' 'fletcher2' 'fletcher4' 'sha256'
+ 'noparity' 'sha512' 'skein' 'edonr')
+typeset -a recsize_prop_vals=('512' '1024' '2048' '4096' '8192' '16384'
+ '32768' '65536' '131072' '262144' '524288' '1048576')
+typeset -a aclinherit_prop_vals=('discard' 'noallow' 'restricted' 'passthrough'
+ 'passthrough-x')
+typeset -a aclmode_prop_vals=('discard' 'groupmask' 'passthrough' 'restricted')
+typeset -a canmount_prop_vals=('on' 'off' 'noauto')
+typeset -a copies_prop_vals=('1' '2' '3')
+typeset -a logbias_prop_vals=('latency' 'throughput')
+typeset -a primarycache_prop_vals=('all' 'none' 'metadata')
+typeset -a redundant_metadata_prop_vals=('all' 'most')
+typeset -a secondarycache_prop_vals=('all' 'none' 'metadata')
+typeset -a snapdir_prop_vals=('hidden' 'visible')
+typeset -a sync_prop_vals=('standard' 'always' 'disabled')
-typeset -a checksum_props=('on' 'off' 'fletcher2' 'fletcher4' 'sha256')
+typeset -a fs_props=('compress' 'checksum' 'recsize' 'aclinherit' 'aclmode'
+ 'canmount' 'copies' 'logbias' 'primarycache' 'redundant_metadata'
+ 'secondarycache' 'snapdir' 'sync')
+typeset -a vol_props=('compress' 'checksum' 'copies' 'logbias' 'primarycache'
+ 'secondarycache' 'redundant_metadata' 'sync')
#
# Given the property array passed in, return 'num_props' elements to the
@@ -44,20 +63,81 @@ function get_rand_prop
function get_rand_compress
{
- get_rand_prop compress_props $1 2
+ get_rand_prop compress_prop_vals $1 2
}
function get_rand_compress_any
{
- get_rand_prop compress_props $1 0
+ get_rand_prop compress_prop_vals $1 0
}
function get_rand_checksum
{
- get_rand_prop checksum_props $1 2
+ get_rand_prop checksum_prop_vals $1 2
}
function get_rand_checksum_any
{
- get_rand_prop checksum_props $1 0
+ get_rand_prop checksum_prop_vals $1 0
+}
+
+function get_rand_recsize
+{
+ get_rand_prop recsize_prop_vals $1 0
+}
+
+function get_rand_large_recsize
+{
+ get_rand_prop recsize_prop_vals $1 9
+}
+
+#
+# Functions to toggle on/off properties
+#
+typeset -a binary_props=('atime' 'devices' 'exec' 'nbmand' 'readonly' 'setuid'
+ 'xattr' 'zoned')
+
+function toggle_prop
+{
+ typeset ds=$1
+ typeset prop=$2
+
+ datasetexists $ds || log_fail "$ds does not exist"
+ typeset val=$(get_prop $prop $ds)
+ typeset newval='off'
+
+ [[ $val = $newval ]] && newval='on'
+ log_must zfs set $prop=$newval $ds
+}
+
+function toggle_binary_props
+{
+ typeset ds=$1
+ typeset prop
+
+ for prop in "${binary_props[@]}"; do
+ toggle_prop $ds $prop
+ done
+}
+
+function randomize_ds_props
+{
+ typeset ds=$1
+ typeset prop proplist val
+
+ datasetexists $ds || log_fail "$ds does not exist"
+ if ds_is_volume $ds; then
+ toggle_prop $ds readonly
+ proplist="${vol_props[@]}"
+ elif ds_is_filesystem $ds; then
+ toggle_binary_props $ds
+ proplist="${fs_props[@]}"
+ else
+ log_fail "$ds is neither a volume nor a file system"
+ fi
+
+ for prop in $proplist; do
+ typeset val=$(get_rand_prop "${prop}_prop_vals" 1 0)
+ log_must zfs set $prop=$val $ds
+ done
}
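A brief sketch of how the new property helpers compose, assuming an existing dataset $TESTPOOL/$TESTFS (used here only as a placeholder):

    # One random record size, and one of the sizes larger than 128k.
    typeset rs=$(get_rand_recsize 1)
    typeset big_rs=$(get_rand_large_recsize 1)
    # Flip every binary property and randomize the multi-valued ones.
    randomize_ds_props $TESTPOOL/$TESTFS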
diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run
index 7be16fe46e..7f6afe6451 100644
--- a/usr/src/test/zfs-tests/runfiles/delphix.run
+++ b/usr/src/test/zfs-tests/runfiles/delphix.run
@@ -479,7 +479,13 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
'rsend_013_pos', 'rsend_014_pos',
'rsend_019_pos', 'rsend_020_pos',
- 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos']
+ 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos',
+ 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props',
+ 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump',
+ 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
+ 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
+ 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
+ 'send-c_recv_dedup']
[/opt/zfs-tests/tests/functional/scrub_mirror]
tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
diff --git a/usr/src/test/zfs-tests/runfiles/omnios.run b/usr/src/test/zfs-tests/runfiles/omnios.run
index a1cef540c5..f66317cd6d 100644
--- a/usr/src/test/zfs-tests/runfiles/omnios.run
+++ b/usr/src/test/zfs-tests/runfiles/omnios.run
@@ -475,7 +475,13 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
'rsend_013_pos', 'rsend_014_pos',
'rsend_019_pos', 'rsend_020_pos',
- 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos']
+ 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos',
+ 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props',
+ 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump',
+ 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
+ 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
+ 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
+ 'send-c_recv_dedup']
[/opt/zfs-tests/tests/functional/scrub_mirror]
tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
diff --git a/usr/src/test/zfs-tests/runfiles/openindiana.run b/usr/src/test/zfs-tests/runfiles/openindiana.run
index f8f6af23a7..7293eb949c 100644
--- a/usr/src/test/zfs-tests/runfiles/openindiana.run
+++ b/usr/src/test/zfs-tests/runfiles/openindiana.run
@@ -475,7 +475,13 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
'rsend_013_pos', 'rsend_014_pos',
'rsend_019_pos', 'rsend_020_pos',
- 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos']
+ 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos',
+ 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props',
+ 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump',
+ 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
+ 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
+ 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
+ 'send-c_recv_dedup']
[/opt/zfs-tests/tests/functional/scrub_mirror]
tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/Makefile b/usr/src/test/zfs-tests/tests/functional/rsend/Makefile
index c482d9d607..918cfcc56e 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/Makefile
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/Makefile
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+# Copyright (c) 2013, 2015 by Delphix. All rights reserved.
#
include $(SRC)/Makefile.master
@@ -38,6 +38,21 @@ PROGS = cleanup \
rsend_021_pos \
rsend_022_pos \
rsend_024_pos \
+ send-cD \
+ send-c_embedded_blocks \
+ send-c_incremental \
+ send-c_lz4_disabled \
+ send-c_mixed_compression \
+ send-c_props \
+ send-c_recv_dedup \
+ send-c_recv_lz4_disabled \
+ send-c_resume \
+ send-c_stream_size_estimate \
+ send-c_verify_contents \
+ send-c_verify_ratio \
+ send-c_volume \
+ send-c_zstreamdump \
+ send-cpL_varied_recsize \
setup
FILES = rsend.cfg \
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg b/usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg
index 2c1654e089..8400ecfe35 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend.cfg
@@ -34,6 +34,6 @@ export DISK2=$(echo $DISKS | awk '{print $2}')
export DISK3=$(echo $DISKS | awk '{print $3}')
export POOL=$TESTPOOL
-export POOL2=$TESTPOOL1
-export POOL3=$TESTPOOL2
+export POOL2=$TESTPOOL2
+export POOL3=$TESTPOOL3
export FS=$TESTFS
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib b/usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib
index da5b7cb3a4..a82d3b3d59 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend.kshlib
@@ -29,6 +29,7 @@
#
. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/math.shlib
. $STF_SUITE/tests/functional/rsend/rsend.cfg
#
@@ -514,8 +515,8 @@ function test_fs_setup
typeset sendpool=${sendfs%%/*}
typeset recvpool=${recvfs%%/*}
- datasetexists $sendfs && log_must $ZFS destroy -r $sendpool
- datasetexists $recvfs && log_must $ZFS destroy -r $recvpool
+ datasetexists $sendfs && log_must zfs destroy -r $sendpool
+ datasetexists $recvfs && log_must zfs destroy -r $recvpool
if $(datasetexists $sendfs || zfs create -o compress=lz4 $sendfs); then
mk_files 1000 256 0 $sendfs &
@@ -549,3 +550,120 @@ function test_fs_setup
fi
log_must zfs create -o compress=lz4 $sendpool/stream
}
+
+#
+# Check to see if the specified features are set in a send stream.
+# The values for these features are found in uts/common/fs/zfs/sys/zfs_ioctl.h
+#
+# $1 The stream file
+# $2-$n The flags expected in the stream
+#
+function stream_has_features
+{
+ typeset file=$1
+ shift
+
+ [[ -f $file ]] || log_fail "Couldn't find file: $file"
+ typeset flags=$(cat $file | zstreamdump | awk '/features =/ {print $3}')
+ typeset -A feature
+ feature[dedup]="1"
+ feature[dedupprops]="2"
+ feature[sa_spill]="4"
+ feature[embed_data]="10000"
+ feature[lz4]="20000"
+ feature[mooch_byteswap]="40000"
+ feature[large_blocks]="80000"
+ feature[resuming]="100000"
+ feature[redacted]="200000"
+ feature[compressed]="400000"
+
+ typeset flag known derived=0
+ for flag in "$@"; do
+ known=${feature[$flag]}
+ [[ -z $known ]] && log_fail "Unknown feature: $flag"
+
+ derived=$(echo "$flags & ${feature[$flag]} = X" | mdb | sed 's/ //g')
+ [[ $derived = $known ]] || return 1
+ done
+
+ return 0
+}
+
+#
+# Parse zstreamdump -v output. The output varies for each kind of record:
+# BEGIN records are simply output as "BEGIN"
+# END records are output as "END"
+# OBJECT records become "OBJECT <object num>"
+# FREEOBJECTS records become "FREEOBJECTS <startobj> <numobjs>"
+# FREE records become "<record type> <start> <length>"
+# WRITE records become:
+# "<record type> <compression type> <start> <logical size> <compressed size>
+# <data size>"
+#
+function parse_dump
+{
+ sed '/^WRITE/{N;s/\n/ /;}' | grep "^[A-Z]" | awk '{
+ if ($1 == "BEGIN" || $1 == "END") print $1
+ if ($1 == "OBJECT") print $1" "$4
+ if ($1 == "FREEOBJECTS") print $1" "$4" "$7
+ if ($1 == "FREE") print $1" "$7" "$10
+ if ($1 == "WRITE") print $1" "$15" "$18" "$21" "$24" "$27}'
+}
+
+#
+# Given a send stream, verify that the size of the stream matches what's
+# expected based on the source or target dataset. If the stream is an
+# incremental stream, subtract the size of the source snapshot before
+# comparing. This function does not currently handle incremental streams
+# that remove data.
+#
+# $1 The zstreamdump output file
+# $2 The dataset to compare against
+# This can be a source of a send or recv target (fs, not snapshot)
+# $3 The percentage below which verification is deemed a failure
+# $4 The source snapshot of an incremental send
+#
+
+function verify_stream_size
+{
+ typeset stream=$1
+ typeset ds=$2
+ typeset percent=${3:-90}
+ typeset inc_src=$4
+
+ [[ -f $stream ]] || log_fail "No such file: $stream"
+ datasetexists $ds || log_fail "No such dataset: $ds"
+
+ typeset stream_size=$(cat $stream | zstreamdump | sed -n \
+ 's/ Total write size = \(.*\) (0x.*)/\1/p')
+
+ typeset inc_size=0
+ if [[ -n $inc_src ]]; then
+ inc_size=$(get_prop lrefer $inc_src)
+ if stream_has_features $stream compressed; then
+ inc_size=$(get_prop refer $inc_src)
+ fi
+ fi
+
+ if stream_has_features $stream compressed; then
+ ds_size=$(get_prop refer $ds)
+ else
+ ds_size=$(get_prop lrefer $ds)
+ fi
+ ds_size=$((ds_size - inc_size))
+
+ within_percent $stream_size $ds_size $percent || log_fail \
+ "$stream_size $ds_size differed by too much"
+}
+
+# Cleanup function for tests involving resumable send
+function resume_cleanup
+{
+ typeset sendfs=$1
+ typeset streamfs=$2
+
+ datasetexists $sendfs && log_must zfs destroy -r $sendfs
+ datasetexists $streamfs && log_must zfs destroy -r $streamfs
+ cleanup_pool $POOL2
+ rm -f /$POOL/initial.zsend /$POOL/incremental.zsend
+}
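A small sketch of how the new rsend.kshlib helpers above can be combined, assuming a compressed full stream has already been written to $BACKDIR/full from $POOL2/sendfs (both placeholders):

    # The stream should advertise the compressed feature flag.
    log_must stream_has_features $BACKDIR/full compressed
    # Its size should be within 90% of the dataset's referenced size.
    verify_stream_size $BACKDIR/full $POOL2/sendfs
    # Summarize each record of the stream on a single line.
    zstreamdump -v <$BACKDIR/full | parse_dump >$BACKDIR/full.dump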
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh
index d6a5fa2b75..79c9bb6d9b 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh
@@ -37,8 +37,9 @@ verify_runnable "both"
log_assert "Verify resumability of a full and incremental ZFS send/receive " \
"in the presence of a corrupted stream"
-log_onexit cleanup_pools $POOL2 $POOL3
+sendfs=$POOL/sendfs
 recvfs=$POOL3/recvfs
 streamfs=$POOL2/stream
+
+log_onexit resume_cleanup $sendfs $streamfs
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh
index 1dcbdace8e..97c19f505a 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh
@@ -35,12 +35,13 @@ verify_runnable "both"
log_assert "Verify resumability of full ZFS send/receive with the -D " \
"(dedup) flag"
-log_onexit cleanup_pool $POOL2
sendfs=$POOL/sendfs
recvfs=$POOL2/recvfs
streamfs=$POOL/stream
+log_onexit resume_cleanup $sendfs $streamfs
+
test_fs_setup $sendfs $recvfs
resume_test "zfs send -D -v $sendfs@a" $streamfs $recvfs
file_check $sendfs $recvfs
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh
index 8fb0abb7a5..2d2a3304da 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh
@@ -37,12 +37,13 @@ verify_runnable "both"
log_assert "Verify resumability of a full and incremental ZFS send/receive " \
"with the -e (embedded) flag"
-log_onexit cleanup_pool $POOL2
sendfs=$POOL/sendfs
recvfs=$POOL2/recvfs
streamfs=$POOL/stream
+log_onexit resume_cleanup $sendfs $streamfs
+
test_fs_setup $sendfs $recvfs
resume_test "zfs send -v -e $sendfs@a" $streamfs $recvfs
resume_test "zfs send -v -e -i @a $sendfs@b" $streamfs $recvfs
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh
index 3fdb049422..9592cb9a79 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh
@@ -40,12 +40,13 @@ verify_runnable "both"
log_assert "Verify resumability of an incremental ZFS send/receive with ZFS " \
"bookmarks"
-log_onexit cleanup_pool $POOL2
sendfs=$POOL/sendfs
recvfs=$POOL2/recvfs
streamfs=$POOL/stream
+log_onexit resume_cleanup $sendfs $streamfs
+
test_fs_setup $sendfs $recvfs
log_must zfs bookmark $sendfs@a $sendfs#bm_a
log_must zfs destroy $sendfs@a
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh
index 62fba64589..d5d938e4b7 100644
--- a/usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh
@@ -37,12 +37,13 @@ verify_runnable "both"
log_assert "Verify resumability of a full ZFS send/receive with the source " \
"filesystem unmounted"
-log_onexit cleanup_pool $POOL2
sendfs=$POOL/sendfs
recvfs=$POOL2/recvfs
streamfs=$POOL/stream
+log_onexit resume_cleanup $sendfs $streamfs
+
test_fs_setup $sendfs $recvfs
log_must zfs unmount $sendfs
resume_test "zfs send $sendfs" $streamfs $recvfs
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-cD.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-cD.ksh
new file mode 100644
index 0000000000..25dc46b3c3
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-cD.ksh
@@ -0,0 +1,77 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that the -c and -D flags do not interfere with each other.
+#
+# Strategy:
+# 1. Write unique data to a filesystem and create a compressed, deduplicated
+# full stream.
+# 2. Verify that the stream and send dataset show the same size
+# 3. Make several copies of the original data, and create both full and
+# incremental compressed, deduplicated send streams
+# 4. Verify the full stream is no bigger than the stream from step 1
+# 5. Verify the streams can be received correctly.
+#
+
+verify_runnable "both"
+
+log_assert "Verify that the -c and -D flags do not interfere with each other"
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/sendfs
+typeset recvfs=$POOL2/recvfs
+typeset stream0=$BACKDIR/stream.0
+typeset stream1=$BACKDIR/stream.1
+typeset inc=$BACKDIR/stream.inc
+
+log_must zfs create -o compress=lz4 $sendfs
+log_must zfs create -o compress=lz4 $recvfs
+typeset dir=$(get_prop mountpoint $sendfs)
+# Don't use write_compressible: we want compressible but undedupable data here.
+log_must cp /kernel/genunix $dir/file
+log_must zfs snapshot $sendfs@snap0
+log_must eval "zfs send -D -c $sendfs@snap0 >$stream0"
+
+# The stream size should match at this point because the data is all unique
+verify_stream_size $stream0 $sendfs
+
+for i in {0..3}; do
+ log_must cp $dir/file $dir/file.$i
+done
+log_must zfs snapshot $sendfs@snap1
+
+# The stream sizes should match, since the second stream contains no new blocks
+log_must eval "zfs send -D -c $sendfs@snap1 >$stream1"
+typeset size0=$(stat -c %s $stream0)
+typeset size1=$(stat -c %s $stream1)
+within_percent $size0 $size1 90 || log_fail "$size0 and $size1 differed by too much"
+
+# Finally, make sure the receive works correctly.
+log_must eval "zfs send -D -c -i snap0 $sendfs@snap1 >$inc"
+log_must eval "zfs recv -d $recvfs <$stream0"
+log_must eval "zfs recv -d $recvfs <$inc"
+cmp_ds_cont $sendfs $recvfs
+
+# The size of the incremental should be the same as the initial send.
+typeset size2=$(stat -c %s $inc)
+within_percent $size0 $size2 90 || log_fail "$size0 and $size2 differed by too much"
+
+log_pass "The -c and -D flags do not interfere with each other"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh
new file mode 100644
index 0000000000..1913c71190
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh
@@ -0,0 +1,103 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/properties.shlib
+
+#
+# Description:
+# Verify that compressed streams can contain embedded blocks.
+#
+# Strategy:
+# 1. Create a filesystem with compressible data and embedded blocks.
+# 2. Verify the created streams can be received correctly.
+# 3. Verify the presence / absence of embedded blocks in the compressed stream,
+# as well as the receiving file system.
+#
+
+verify_runnable "both"
+
+log_assert "Verify that compressed streams can contain embedded blocks."
+log_onexit cleanup_pool $POOL2
+
+typeset objs obj recsize
+typeset sendfs=$POOL2/sendfs
+typeset recvfs=$POOL2/recvfs
+typeset stream=$BACKDIR/stream
+typeset dump=$BACKDIR/dump
+typeset recvfs2=$POOL2/recvfs2
+typeset stream2=$BACKDIR/stream2
+typeset dump2=$BACKDIR/dump2
+log_must zfs create -o compress=lz4 $sendfs
+log_must zfs create -o compress=lz4 $recvfs
+log_must zfs create -o compress=lz4 $recvfs2
+typeset dir=$(get_prop mountpoint $sendfs)
+
+# Populate the send dataset with compressible data and embedded block files.
+write_compressible $dir 16m
+for recsize in "${recsize_prop_vals[@]}"; do
+ # For lz4, this method works for blocks up to 16k, but not larger
+ [[ $recsize -eq $((32 * 1024)) ]] && break
+
+ log_must mkholes -h 0:$((recsize - 8)) -d $((recsize - 8)):8 \
+ $dir/$recsize
+done
+
+# Generate the streams and zstreamdump output.
+log_must zfs snapshot $sendfs@now
+log_must eval "zfs send -c $sendfs@now >$stream"
+log_must eval "zstreamdump -v <$stream >$dump"
+log_must eval "zfs recv -d $recvfs <$stream"
+cmp_ds_cont $sendfs $recvfs
+verify_stream_size $stream $sendfs
+log_mustnot stream_has_features $stream embed_data
+
+log_must eval "zfs send -c -e $sendfs@now >$stream2"
+log_must eval "zstreamdump -v <$stream2 >$dump2"
+log_must eval "zfs recv -d $recvfs2 <$stream2"
+cmp_ds_cont $sendfs $recvfs2
+verify_stream_size $stream2 $sendfs
+log_must stream_has_features $stream2 embed_data
+
+# Verify embedded blocks are present only when expected.
+for recsize in "${recsize_prop_vals[@]}"; do
+ [[ $recsize -eq $((32 * 1024)) ]] && break
+
+ typeset send_obj=$(get_objnum $(get_prop mountpoint $sendfs)/$recsize)
+ typeset recv_obj=$(get_objnum \
+ $(get_prop mountpoint $recvfs/sendfs)/$recsize)
+ typeset recv2_obj=$(get_objnum \
+ $(get_prop mountpoint $recvfs2/sendfs)/$recsize)
+
+ log_must eval "zdb -ddddd $sendfs $send_obj >$BACKDIR/sendfs.zdb"
+ log_must eval "zdb -ddddd $recvfs/sendfs $recv_obj >$BACKDIR/recvfs.zdb"
+ log_must eval "zdb -ddddd $recvfs2/sendfs $recv2_obj >$BACKDIR/recvfs2.zdb"
+
+ grep -q "EMBEDDED" $BACKDIR/sendfs.zdb || \
+ log_fail "Obj $send_obj not embedded in $sendfs"
+ grep -q "EMBEDDED" $BACKDIR/recvfs.zdb || \
+ log_fail "Obj $recv_obj not embedded in $recvfs"
+ grep -q "EMBEDDED" $BACKDIR/recvfs2.zdb || \
+ log_fail "Obj $recv2_obj not embedded in $recvfs2"
+
+ grep -q "WRITE_EMBEDDED object = $send_obj offset = 0" $dump && \
+ log_fail "Obj $obj embedded in zstreamdump output"
+ grep -q "WRITE_EMBEDDED object = $send_obj offset = 0" $dump2 || \
+ log_fail "Obj $obj not embedded in zstreamdump output"
+done
+
+log_pass "Compressed streams can contain embedded blocks."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_incremental.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_incremental.ksh
new file mode 100644
index 0000000000..719970d995
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_incremental.ksh
@@ -0,0 +1,100 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that compressed send works correctly with incremental sends.
+#
+# Strategy:
+# 1. Randomly choose either a -i or -I incremental.
+# 2. Generate compressed incremental replication streams for a pool, a
+# descendant dataset, and a volume.
+# 3. Receive these streams verifying both the contents, and intermediate
+# snapshots are present or absent as appropriate to the -i or -I option.
+#
+
+verify_runnable "both"
+
+log_assert "Verify compressed send works with incremental send streams."
+log_onexit cleanup_pool $POOL2
+
+typeset opt=$(random_get "-i" "-I")
+typeset final dstlist list vol
+
+log_must eval "zfs send -R $POOL@final > $BACKDIR/final"
+log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/final"
+
+function do_checks
+{
+ log_must cmp_ds_cont $POOL $POOL2
+ [[ $opt = "-I" ]] && log_must cmp_ds_subs $POOL $POOL2
+ [[ $opt = "-i" ]] && log_mustnot cmp_ds_subs $POOL $POOL2
+
+ [[ $1 != "clean" ]] && return
+
+ cleanup_pool $POOL2
+ log_must eval "zfs send -R $POOL@final > $BACKDIR/final"
+ log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/final"
+}
+
+if is_global_zone; then
+ # Send from the pool root
+ final=$(getds_with_suffix $POOL2 @final)
+ list="$final $(getds_with_suffix $POOL2 @snapA)"
+ list="$list $(getds_with_suffix $POOL2 @snapB)"
+ list="$list $(getds_with_suffix $POOL2 @snapC)"
+
+ log_must eval "zfs send -c -R $opt @init $POOL2@final >$BACKDIR/pool"
+ log_must destroy_tree $list
+ log_must eval "zfs recv -d -F $POOL2 <$BACKDIR/pool"
+
+ dstlist=$(getds_with_suffix $POOL2 @final)
+ [[ $final != $dstlist ]] && log_fail "$final != $dstlist"
+
+ do_checks clean
+
+ # Send of a volume
+ vol=$POOL2/$FS/vol
+ final=$(getds_with_suffix $vol @final)
+ log_must eval "zfs send -c -R $opt @init $vol@final >$BACKDIR/vol"
+ log_must destroy_tree $vol@snapB $vol@snapC $vol@final
+ log_must eval "zfs recv -d -F $POOL2 <$BACKDIR/vol"
+
+ dstlist=$(getds_with_suffix $POOL2/$FS/vol @final)
+ [[ $final != $dstlist ]] && log_fail "$final != $dstlist"
+
+ do_checks clean
+fi
+
+# Send of a descendant fs
+final=$(getds_with_suffix $POOL2/$FS @final)
+list="$final $(getds_with_suffix $POOL2/$FS @snapA)"
+list="$list $(getds_with_suffix $POOL2/$FS @snapB)"
+list="$list $(getds_with_suffix $POOL2/$FS @snapC)"
+
+log_must eval "zfs send -c -R $opt @init $POOL2/$FS@final >$BACKDIR/fs"
+log_must destroy_tree $list
+log_must eval "zfs recv -d -F $POOL2 <$BACKDIR/fs"
+
+dstlist=$(getds_with_suffix $POOL2/$FS @final)
+[[ $final != $dstlist ]] && log_fail "$final != $dstlist"
+
+do_checks
+
+log_pass "Compressed send works with incremental send streams."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh
new file mode 100644
index 0000000000..b2df3d01da
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh
@@ -0,0 +1,73 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify a pool without the lz4 feature enabled can create compressed send
+# streams, and that they can be received into pools with or without the
+# lz4 feature.
+#
+# Strategy:
+# 1. For each of an uncompressed, and gzip dataset created from a pool with
+# the lz4 feature disabled, receive the stream into a pool with and without
+# the feature enabled.
+#
+
+verify_runnable "both"
+
+log_assert "Verify compressed streams are rejected if incompatible."
+
+typeset send_ds=$POOL2/testds
+typeset recv_ds=$POOL3/testds
+
+function cleanup
+{
+ poolexists $POOL2 && destroy_pool $POOL2
+ poolexists $POOL3 && destroy_pool $POOL3
+ log_must zpool create $POOL2 $DISK2
+}
+log_onexit cleanup
+
+datasetexists $POOL2 && log_must zpool destroy $POOL2
+log_must zpool create -d $POOL2 $DISK2
+
+for compress in off gzip; do
+ for pool_opt in '' -d; do
+ poolexists $POOL3 && destroy_pool $POOL3
+ log_must zpool create $pool_opt $POOL3 $DISK3
+
+ datasetexists $send_ds && log_must zfs destroy -r $send_ds
+ datasetexists $recv_ds && log_must zfs destroy -r $recv_ds
+
+ log_must zfs create -o compress=$compress $send_ds
+ typeset dir=$(get_prop mountpoint $send_ds)
+ write_compressible $dir 16m
+ log_must zfs snapshot $send_ds@full
+
+ log_must eval "zfs send -c $send_ds@full >$BACKDIR/full-c"
+ log_must eval "zfs recv $recv_ds <$BACKDIR/full-c"
+
+ log_must zfs destroy -r $recv_ds
+
+ log_must eval "zfs send $send_ds@full >$BACKDIR/full"
+ log_must eval "zfs recv $recv_ds <$BACKDIR/full"
+ done
+done
+
+log_pass "Compressed streams are rejected if incompatible."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh
new file mode 100644
index 0000000000..5bc2bb000b
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh
@@ -0,0 +1,54 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/properties.shlib
+
+#
+# Description:
+# Verify datasets using mixed compression algorithms can be received.
+#
+# Strategy:
+# 1. Write data with each of the available compression algorithms
+# 2. Receive a full compressed send, and verify the data and compression ratios
+#
+
+verify_runnable "both"
+
+log_assert "Verify datasets using mixed compression algorithms can be received."
+log_onexit cleanup_pool $POOL2
+
+send_ds=$POOL2/sendfs
+recv_ds=$POOL2/recvfs
+
+log_must zfs create $send_ds
+
+for prop in "${compress_prop_vals[@]}"; do
+ log_must zfs set compress=$prop $send_ds
+ write_compressible $(get_prop mountpoint $send_ds) 16m
+done
+
+log_must zfs set compress=off $send_ds
+log_must zfs snapshot $send_ds@full
+log_must eval "zfs send -c $send_ds@full >$BACKDIR/full"
+log_must eval "zfs recv $recv_ds <$BACKDIR/full"
+
+verify_stream_size $BACKDIR/full $send_ds
+verify_stream_size $BACKDIR/full $recv_ds
+log_must cmp_ds_cont $send_ds $recv_ds
+
+log_pass "Datasets using mixed compression algorithms can be received."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_props.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_props.ksh
new file mode 100644
index 0000000000..49d86a3dce
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_props.ksh
@@ -0,0 +1,67 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/properties.shlib
+
+#
+# Description:
+# Verify compressed send streams can still preserve properties
+#
+# Strategy:
+# 1. Randomly modify the properties in the src pool
+# 2. Send a full compressed stream with -p to preserve properties
+# 3. Verify all the received properties match the source datasets
+# 4. Repeat the process with -R instead of -p
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+ destroy_pool $POOL
+ destroy_pool $POOL2
+ log_must zpool create $POOL $DISK1
+ log_must zpool create $POOL2 $DISK2
+ log_must setup_test_model $POOL
+}
+
+log_assert "Compressed send doesn't interfere with preservation of properties"
+log_onexit cleanup
+
+typeset -a datasets=("" "/pclone" "/$FS" "/$FS/fs1" "/$FS/fs1/fs2"
+ "/$FS/fs1/fclone" "/vol" "/$FS/vol")
+
+typeset ds
+for opt in "-p" "-R"; do
+ for ds in ${datasets[@]}; do
+ randomize_ds_props $POOL$ds
+ done
+
+ log_must eval "zfs send -c $opt $POOL@final > $BACKDIR/pool-final$opt"
+ log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/pool-final$opt"
+
+ for ds in ${datasets[@]}; do
+ log_must cmp_ds_prop $POOL$ds $POOL2$ds
+ log_must cmp_ds_prop $POOL$ds@final $POOL2$ds@final
+ done
+
+ # Don't cleanup the second time, since we do that on exit anyway.
+ [[ $opt = "-p" ]] && cleanup
+done
+
+log_pass "Compressed send doesn't interfere with preservation of properties"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh
new file mode 100644
index 0000000000..eb8c050bf8
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh
@@ -0,0 +1,55 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that we can receive a compressed stream into a deduped filesystem.
+#
+# Strategy:
+# 1. Write heavily duplicated data to a filesystem and create a compressed
+# full stream.
+# 2. Verify that the stream can be received correctly into a dedup=verify
+# filesystem.
+#
+
+verify_runnable "both"
+
+log_pass "Verify a compressed stream can be received into a deduped filesystem"
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/sendfs
+typeset recvfs=$POOL2/recvfs
+typeset stream0=$BACKDIR/stream.0
+typeset stream1=$BACKDIR/stream.1
+typeset inc=$BACKDIR/stream.inc
+
+log_must zfs create -o compress=lz4 $sendfs
+log_must zfs create -o compress=lz4 -o dedup=verify $recvfs
+typeset dir=$(get_prop mountpoint $sendfs)
+for i in {0..10}; do
+ log_must cp /kernel/genunix $dir/file.$i
+done
+log_must zfs snapshot $sendfs@snap0
+log_must eval "zfs send -c $sendfs@snap0 >$stream0"
+
+# Finally, make sure the receive works correctly.
+log_must eval "zfs recv -d $recvfs <$stream0"
+cmp_ds_cont $sendfs $recvfs
+
+log_pass "The compressed stream could be received into a deduped filesystem"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh
new file mode 100644
index 0000000000..822ea3655e
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh
@@ -0,0 +1,68 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify a pool without the lz4 feature gracefully rejects a compressed stream
+# because on any sending pool that supports it, metadata will be compressed
+# with lz4 even if user data is not compressed.
+#
+# Strategy:
+# 1. For each of an uncompressed, gzip and lz4 dataset, do the following
+# receives into a pool without the lz4 feature:
+# 2. Attempt to receive the compressed stream (should fail)
+# 3. Attempt to receive the uncompressed stream (should succeed)
+#
+
+verify_runnable "both"
+
+log_assert "Verify compressed streams are rejected if incompatible."
+
+typeset compress_types="off gzip lz4"
+typeset send_ds=$POOL2/testds
+typeset recv_ds=$POOL3/testds
+
+function cleanup
+{
+ poolexists $POOL2 && destroy_pool $POOL2
+ poolexists $POOL3 && destroy_pool $POOL3
+ log_must zpool create $POOL2 $DISK2
+}
+log_onexit cleanup
+
+datasetexists $POOL3 && log_must zpool destroy $POOL3
+log_must zpool create -d $POOL3 $DISK3
+
+for compress in $compress_types; do
+ datasetexists $send_ds && log_must zfs destroy -r $send_ds
+ datasetexists $recv_ds && log_must zfs destroy -r $recv_ds
+
+ log_must zfs create -o compress=$compress $send_ds
+ typeset dir=$(get_prop mountpoint $send_ds)
+ write_compressible $dir 16m
+ log_must zfs snapshot $send_ds@full
+
+ log_must eval "zfs send -c $send_ds@full >$BACKDIR/full-c"
+ log_mustnot eval "zfs recv $recv_ds <$BACKDIR/full-c"
+
+ log_must eval "zfs send $send_ds@full >$BACKDIR/full"
+ log_must eval "zfs recv $recv_ds <$BACKDIR/full"
+done
+
+log_pass "Compressed streams are rejected if incompatible."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_resume.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_resume.ksh
new file mode 100644
index 0000000000..8b36177647
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_resume.ksh
@@ -0,0 +1,49 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify resumability of full and incremental ZFS send/receive with the -c
+# (compress) flag in the presence of a corrupted stream.
+#
+# Strategy:
+# 1. Start a full ZFS send with the -c flag (compress), redirect output to
+# a file
+# 2. Mess up the contents of the stream state file on disk
+# 3. Try ZFS receive, which should fail with a checksum mismatch error
+# 4. ZFS send to the stream state file again using the receive_resume_token
+# 5. ZFS receive and verify the receive completes successfully
+# 6. Repeat steps on an incremental ZFS send
+#
+
+verify_runnable "both"
+
+sendfs=$POOL/sendfs
+recvfs=$POOL2/recvfs
+streamfs=$POOL/stream
+
+log_assert "Verify compressed send streams can be resumed if interrupted"
+log_onexit resume_cleanup $sendfs $streamfs
+
+test_fs_setup $sendfs $recvfs
+resume_test "zfs send -c -v $sendfs@a" $streamfs $recvfs
+resume_test "zfs send -c -v -i @a $sendfs@b" $streamfs $recvfs
+file_check $sendfs $recvfs
+
+log_pass "Compressed send streams can be resumed if interrupted"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh
new file mode 100644
index 0000000000..3e4da295d6
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh
@@ -0,0 +1,91 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify the stream size estimate given by -P accounts for compressed send.
+#
+# Strategy:
+# 1. For datasets of varied compression types do the following:
+# 2. Write data, verify stream size estimates with and without -c
+#
+
+verify_runnable "both"
+typeset compress_types="off gzip lz4"
+typeset send_ds="$POOL2/testfs"
+typeset send_vol="$POOL2/vol"
+typeset send_voldev="/dev/zvol/rdsk/$POOL2/vol"
+typeset file="$BACKDIR/file.0"
+typeset megs="16"
+typeset compress
+
+function get_estimated_size
+{
+ typeset cmd=$1
+ typeset ds=${cmd##* }
+ typeset tmpfile=$(mktemp -p $BACKDIR)
+
+ eval "$cmd >$tmpfile"
+ [[ $? -eq 0 ]] || log_fail "get_estimated_size: $cmd"
+ typeset size=$(eval "awk '\$2 == \"$ds\" {print \$3}' $tmpfile")
+ rm -f $tmpfile
+
+ echo $size
+}
+
+log_assert "Verify the stream size given by -P accounts for compressed send."
+log_onexit cleanup_pool $POOL2
+
+write_compressible $BACKDIR ${megs}m
+
+for compress in $compress_types; do
+ datasetexists $send_ds && log_must zfs destroy -r $send_ds
+ datasetexists $send_vol && log_must zfs destroy -r $send_vol
+ log_must zfs create -o compress=$compress $send_ds
+ log_must zfs create -V 1g -o compress=$compress $send_vol
+
+ typeset dir=$(get_prop mountpoint $send_ds)
+ log_must cp $file $dir
+ log_must zfs snapshot $send_ds@snap
+ log_must dd if=$file of=$send_voldev
+ log_must zfs snapshot $send_vol@snap
+
+ typeset ds_size=$(get_estimated_size "zfs send -nP $send_ds@snap")
+ typeset ds_lrefer=$(get_prop lrefer $send_ds)
+ within_percent $ds_size $ds_lrefer 90 || log_fail \
+ "$ds_size and $ds_lrefer differed by too much"
+
+ typeset vol_size=$(get_estimated_size "zfs send -nP $send_vol@snap")
+ typeset vol_lrefer=$(get_prop lrefer $send_vol)
+ within_percent $vol_size $vol_lrefer 90 || log_fail \
+ "$vol_size and $vol_lrefer differed by too much"
+
+ typeset ds_csize=$(get_estimated_size "zfs send -nP -c $send_ds@snap")
+ typeset ds_refer=$(get_prop refer $send_ds)
+ within_percent $ds_csize $ds_refer 90 || log_fail \
+ "$ds_csize and $ds_refer differed by too much"
+
+ typeset vol_csize=$(get_estimated_size "zfs send -nP -c $send_vol@snap")
+ typeset vol_refer=$(get_prop refer $send_vol)
+ within_percent $vol_csize $vol_refer 90 || log_fail \
+ "$vol_csize and $vol_refer differed by too much"
+done
+
+log_pass "The the stream size given by -P accounts for compressed send."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh
new file mode 100644
index 0000000000..de2d29c923
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh
@@ -0,0 +1,55 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify compressed send streams replicate data and datasets
+#
+# Strategy:
+# 1. Back up all the data from POOL/FS
+# 2. Verify all the datasets and data can be recovered in POOL2
+# 3. Back up all the data from root filesystem POOL2
+# 4. Verify all the data can be recovered, too
+#
+
+verify_runnable "both"
+
+log_assert "zfs send -c -R send replication stream up to the named snap."
+log_onexit cleanup_pool $POOL2
+
+# Verify the entire pool and descendants can be backed up and restored.
+log_must eval "zfs send -c -R $POOL@final > $BACKDIR/pool-final-R"
+log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/pool-final-R"
+
+dstds=$(get_dst_ds $POOL $POOL2)
+log_must cmp_ds_subs $POOL $dstds
+log_must cmp_ds_cont $POOL $dstds
+
+# Cleanup POOL2
+log_must cleanup_pool $POOL2
+
+# Verify all the filesystems and descendants can be backed up and restored.
+log_must eval "zfs send -c -R $POOL/$FS@final > $BACKDIR/fs-final-R"
+log_must eval "zfs receive -d $POOL2 < $BACKDIR/fs-final-R"
+
+dstds=$(get_dst_ds $POOL/$FS $POOL2)
+log_must cmp_ds_subs $POOL/$FS $dstds
+log_must cmp_ds_cont $POOL/$FS $dstds
+
+log_pass "zfs send -c -R send replication stream up to the named snap."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh
new file mode 100644
index 0000000000..a5138c527b
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh
@@ -0,0 +1,66 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/properties.shlib
+
+#
+# Description:
+# Verify that the amount of data in a send -c stream matches compressratio.
+#
+# Strategy:
+# 1. For random compression types, and compressible / incompressible data:
+# 2. Create a snap with data
+# 3. Compare the size of the stream with the data on the dataset, adjusted
+# by compressratio for normal send, and compared to used for send -c.
+#
+
+verify_runnable "both"
+
+log_assert "Verify send -c streams are compressed"
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/$FS
+typeset megs=128
+
+for prop in $(get_rand_compress_any 6); do
+ for compressible in 'yes' 'no'; do
+ log_must zfs create -o compress=$prop $sendfs
+
+ if [[ $compressible = 'yes' ]]; then
+ write_compressible $(get_prop mountpoint $sendfs) \
+ ${megs}m
+ else
+ typeset file="$(get_prop mountpoint $sendfs)/ddfile"
+ log_must dd if=/dev/urandom of=$file bs=1024k count=$megs
+ fi
+
+ log_must zfs snapshot $sendfs@snap
+
+ # Calculate the sizes and verify the compression ratio.
+ log_must eval "zfs send $sendfs@snap >$BACKDIR/uncompressed"
+ verify_stream_size $BACKDIR/uncompressed $sendfs
+
+ log_must eval "zfs send -c $sendfs@snap >$BACKDIR/compressed"
+ verify_stream_size $BACKDIR/compressed $sendfs
+
+ log_must rm $BACKDIR/uncompressed $BACKDIR/compressed
+ log_must zfs destroy -r $sendfs
+ done
+done
+
+log_pass "Verify send -c streams are compressed"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_volume.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_volume.ksh
new file mode 100644
index 0000000000..4ce3d5a09b
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_volume.ksh
@@ -0,0 +1,80 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that compressed send correctly handles volumes
+#
+# Strategy:
+# 1. Write compressible data into a volume, take a snap
+# 2. Verify the compressed stream is the correct size, and has the correct data
+# 3. Repeat step 2 for an incremental compressed stream
+#
+
+function cleanup
+{
+ log_must zfs destroy -r $vol
+ cleanup_pool $POOL2
+}
+
+verify_runnable "both"
+
+log_assert "Verify compressed send works with volumes"
+log_onexit cleanup
+
+typeset vol="$POOL/newvol"
+typeset vol2="$POOL2/newvol"
+typeset voldev="/dev/zvol/rdsk/$POOL/newvol"
+typeset voldev2="/dev/zvol/rdsk/$POOL2/newvol"
+typeset data1=$BACKDIR/file.0
+typeset data2=$BACKDIR/file.1
+typeset megs=8
+
+log_must zfs create -V 256m -o compress=lz4 $vol
+
+write_compressible $BACKDIR ${megs}m 2
+md5_1=$(md5sum $data1 | awk '{print $1}')
+md5_2=$(md5sum $data2 | awk '{print $1}')
+
+log_must dd if=$data1 of=$voldev bs=1024k
+log_must zfs snapshot $vol@snap
+
+log_must eval "zfs send -c $vol@snap >$BACKDIR/full"
+log_must eval "zfs recv -d $POOL2 <$BACKDIR/full"
+
+verify_stream_size $BACKDIR/full $vol
+verify_stream_size $BACKDIR/full $vol2
+md5=$(dd if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5sum | \
+ awk '{print $1}')
+[[ $md5 = $md5_1 ]] || log_fail "md5 mismatch: $md5 != $md5_1"
+
+# Repeat, for an incremental send
+log_must dd oseek=$megs if=$data2 of=$voldev bs=1024k
+log_must zfs snapshot $vol@snap2
+
+log_must eval "zfs send -c -i snap $vol@snap2 >$BACKDIR/inc"
+log_must eval "zfs recv -d $POOL2 <$BACKDIR/inc"
+
+verify_stream_size $BACKDIR/inc $vol 90 $vol@snap
+verify_stream_size $BACKDIR/inc $vol2 90 $vol2@snap
+md5=$(dd iseek=$megs if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5sum | \
+ awk '{print $1}')
+[[ $md5 = $md5_2 ]] || log_fail "md5 mismatch: $md5 != $md5_2"
+
+log_pass "Verify compressed send works with volumes"
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh
new file mode 100644
index 0000000000..6f8359e56c
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh
@@ -0,0 +1,59 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/math.shlib
+
+#
+# Description:
+# Verify compression features show up in zstreamdump
+#
+# Strategy:
+# 1. Create a full compressed send stream
+# 2. Verify zstreamdump shows this stream has the relevant features
+# 3. Verify zstreamdump's accounting of logical and compressed size is correct
+#
+
+verify_runnable "both"
+
+log_assert "Verify zstreamdump correctly interprets compressed send streams."
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/fs
+
+log_must zfs create -o compress=lz4 $sendfs
+typeset dir=$(get_prop mountpoint $sendfs)
+write_compressible $dir 16m
+log_must zfs snapshot $sendfs@full
+
+log_must eval "zfs send -c $sendfs@full >$BACKDIR/full"
+log_must stream_has_features $BACKDIR/full lz4 compressed
+cat $BACKDIR/full | zstreamdump -v | parse_dump > $BACKDIR/dump.out
+
+lsize=$(awk '/^WRITE [^0]/ {lsize += $4} END {printf("%d", lsize)}' \
+ $BACKDIR/dump.out)
+lsize_prop=$(get_prop logicalused $sendfs)
+within_percent $lsize $lsize_prop 90 || log_fail \
+ "$lsize and $lsize_prop differed by too much"
+
+csize=$(awk '/^WRITE [^0]/ {csize += $5} END {printf("%d", csize)}' \
+ $BACKDIR/dump.out)
+csize_prop=$(get_prop used $sendfs)
+within_percent $csize $csize_prop 90 || log_fail \
+ "$csize and $csize_prop differed by too much"
+
+log_pass "zstreamdump correctly interprets compressed send streams."
diff --git a/usr/src/test/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh b/usr/src/test/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh
new file mode 100644
index 0000000000..8c33e323b8
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh
@@ -0,0 +1,198 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify compressed send works correctly with datasets of varying recsize.
+#
+# Strategy:
+# 1. Check the recv behavior (into pools with features enabled and disabled)
+#    of all combinations of -c, -p, and -L. Verify the stream is compressed,
+#    and that the received dataset's recsize property and the received
+#    file's block size are correct according to this matrix:
+#
+# +---------+--------+------------+------------+-----------+-----------+
+# | send | send | received | received | received | received |
+# | stream | stream | file bs | prop | file bs | props |
+# | recsize | flags | (disabled) | (disabled) | (enabled) | (enabled) |
+# +---------+--------+------------+------------+-----------+-----------+
+# | 128k | | 128k | 128k | 128k | 128k |
+# | 128k | -c | Fails | Fails | 128k | 128k |
+# | 128k | -p | 128k | 128k | 128k | 128k |
+# | 128k | -L | 128k | 128k | 128k | 128k |
+# | 128k | -cp | Fails | Fails | 128k | 128k |
+# | 128k | -cL | Fails | Fails | 128k | 128k |
+# | 128k | -pL | 128k | 128k | 128k | 128k |
+# | 128k | -cpL | Fails | Fails | 128k | 128k |
+# | 1m | | Fails | Fails | 128k | 128k |
+# | 1m | -c | Fails | Fails | 128k | 128k |
+# | 1m | -p | 128k | 128k | 128k | 1m |
+# | 1m | -L | Fails | Fails | 1m | 128k |
+# | 1m | -cp | Fails | Fails | 128k | 1m |
+# | 1m | -cL | Fails | Fails | 1m | 128k |
+# | 1m | -pL | Fails | Fails | 1m | 1m |
+# | 1m | -cpL | Fails | Fails | 1m | 1m |
+# +---------+--------+------------+------------+-----------+-----------+
+#
+
+verify_runnable "both"
+
+function cleanup
+{
+ datasetexists $TESTPOOL/128k && log_must zfs destroy $TESTPOOL/128k
+ datasetexists $TESTPOOL/1m && log_must zfs destroy $TESTPOOL/1m
+ cleanup_pool $POOL2
+ destroy_pool $POOL3
+}
+
+# For a received stream, verify the recsize (prop and file) match expectations.
+function check_recsize
+{
+ typeset recv_ds=$1
+ typeset expected_file_bs=$2
+ typeset expected_recsize=$3
+ typeset file="$(get_prop mountpoint $recv_ds)/testfile"
+
+ [[ -f $file ]] || log_fail "file '$file' doesn't exist"
+
+ typeset read_recsize=$(get_prop recsize $recv_ds)
+ typeset read_file_bs=$(stat $file | sed -n \
+ 's/.*IO Block: \([0-9]*\).*/\1/p')
+
+ [[ $read_recsize = $expected_recsize ]] || log_fail \
+ "read_recsize: $read_recsize expected_recsize: $expected_recsize"
+ [[ $read_file_bs = $expected_file_bs ]] || log_fail \
+ "read_file_bs: $read_file_bs expected_file_bs: $expected_file_bs"
+}
+
+#
+# This function does a zfs send and receive according to the parameters
+# below, and verifies the data shown in the strategy section.
+#
+# -[cpL] flags to pass through to 'zfs send'
+# -d Receive into a pool with all features disabled
+#
+# $1 The recordsize of the send dataset
+# $2 Whether or not the recv should work.
+# $3 The blocksize expected in a received file (default 128k)
+# $4 The recordsize property expected in a received dataset (default 128k)
+#
+function check
+{
+ typeset recv_pool=$POOL2
+ typeset flags='-'
+
+ while getopts "cdpL" opt; do
+ case $opt in
+ c)
+ flags+='c'
+ ;;
+ d)
+ recv_pool=$POOL3
+ ;;
+ p)
+ flags+='p'
+ ;;
+ L)
+ flags+='L'
+ ;;
+ esac
+ done
+ shift $(($OPTIND - 1))
+ [[ ${#flags} -eq 1 ]] && flags=''
+
+ typeset recsize=$1
+ typeset verify=$2
+ typeset expected_file_bs=${3-131072}
+ typeset expected_recsize=${4-131072}
+ typeset send_ds=$TESTPOOL/$recsize
+ typeset send_snap=$send_ds@snap
+ typeset recv_ds=$recv_pool/$recsize
+ typeset stream=$BACKDIR/stream.out
+
+ datasetexists $send_ds || log_fail "send ds: $send_ds doesn't exist"
+ [[ -f $stream ]] && log_must rm $stream
+ log_must eval "zfs send $flags $send_snap >$stream"
+ $verify eval "zfs recv $recv_ds <$stream"
+ typeset stream_size=$(cat $stream | zstreamdump | sed -n \
+ 's/ Total write size = \(.*\) (0x.*)/\1/p')
+
+ #
+ # Special case: For a send dataset with large blocks, don't try to
+ # verify the stream size is correct if the compress flag is present
+ # but the large blocks flag isn't. In these cases, the user data
+ # isn't compressed in the stream (though metadata is) so the
+ # verification would fail.
+ #
+ typeset do_size_test=true
+ [[ $recsize = $large && $flags =~ 'c' && ! $flags =~ 'L' ]] && \
+ do_size_test=false
+
+ $do_size_test && verify_stream_size $stream $send_ds
+
+ if [[ $verify = "log_mustnot" ]]; then
+ datasetnonexists $recv_ds || log_fail "$recv_ds shouldn't exist"
+ return
+ fi
+
+ check_recsize $recv_ds $expected_file_bs $expected_recsize
+ $do_size_test && verify_stream_size $stream $recv_ds
+ log_must zfs destroy -r $recv_ds
+}
+
+log_assert "Verify compressed send works with datasets of varying recsize."
+log_onexit cleanup
+typeset recsize opts dir
+typeset small=$((128 * 1024))
+typeset large=$((1024 * 1024))
+
+# Create POOL3 with features disabled and datasets to create test send streams
+log_must zpool create -d $POOL3 $DISK3
+write_compressible $BACKDIR 32m
+for recsize in $small $large; do
+ log_must zfs create -o compress=gzip -o recsize=$recsize \
+ $TESTPOOL/$recsize
+ dir=$(get_prop mountpoint $TESTPOOL/$recsize)
+ log_must cp $BACKDIR/file.0 $dir/testfile
+ log_must zfs snapshot $TESTPOOL/$recsize@snap
+done
+
+# Run tests for send streams without large blocks
+for opts in '' -d -c -p -dp -L -dL -cp -cL -pL -dpL -cpL; do
+ check $opts $small log_must
+done
+for opts in -dc -dcp -dcL -dcpL; do
+ check $opts $small log_mustnot
+done
+
+# Run tests for send streams with large blocks
+for opts in '' -d -dp -c; do
+ check $opts $large log_must
+done
+for opts in -dc -dL -dcp -dcL -dpL -dcpL; do
+ check $opts $large log_mustnot
+done
+check -p $large log_must $small $large
+check -L $large log_must $large $small
+check -cp $large log_must $small $large
+check -cL $large log_must $large $small
+check -pL $large log_must $large $large
+check -cpL $large log_must $large $large
+
+log_pass "Compressed send works with datasets of varying recsize."
diff --git a/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio b/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio
index f876bd63d3..8289d546de 100644
--- a/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio
+++ b/usr/src/test/zfs-tests/tests/perf/fio/mkfiles.fio
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2016 by Delphix. All rights reserved.
#
[global]
@@ -24,7 +24,7 @@ thread=1
directory=/${TESTFS}
numjobs=${NUMJOBS}
filesize=${FILE_SIZE}
-buffer_compress_percentage=33
+buffer_compress_percentage=66
buffer_compress_chunk=4096
[job]
diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio
index 0b750260ff..07090d4dcd 100644
--- a/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio
+++ b/usr/src/test/zfs-tests/tests/perf/fio/random_readwrite.fio
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
#
[global]
@@ -29,7 +29,7 @@ bssplit=4k/50:8k/30:128k/10:1m/10
ioengine=psync
sync=${SYNC_TYPE}
numjobs=${NUMJOBS}
-buffer_compress_percentage=33
+buffer_compress_percentage=66
buffer_compress_chunk=4096
[job]
diff --git a/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio b/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio
index b1860a71dd..9233a84260 100644
--- a/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio
+++ b/usr/src/test/zfs-tests/tests/perf/fio/random_writes.fio
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
#
[global]
@@ -27,7 +27,7 @@ ioengine=psync
sync=${SYNC_TYPE}
numjobs=${NUMJOBS}
filesize=${FILESIZE}
-buffer_compress_percentage=33
+buffer_compress_percentage=66
buffer_compress_chunk=4096
[job]
diff --git a/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio b/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio
index df1590cf11..0ee6d091db 100644
--- a/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio
+++ b/usr/src/test/zfs-tests/tests/perf/fio/sequential_writes.fio
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
#
[global]
@@ -27,7 +27,7 @@ ioengine=psync
sync=${SYNC_TYPE}
numjobs=${NUMJOBS}
filesize=${FILESIZE}
-buffer_compress_percentage=33
+buffer_compress_percentage=66
buffer_compress_chunk=4096
[job]
diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c
index fbca57a418..33bab64751 100644
--- a/usr/src/uts/common/brand/lx/os/lx_brand.c
+++ b/usr/src/uts/common/brand/lx/os/lx_brand.c
@@ -1468,62 +1468,6 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
return (0);
}
- case B_SIGEV_THREAD_ID: {
- /*
- * Emulate Linux's timer_create(2) SIGEV_THREAD_ID
- * notification method. This mechanism is only meant
- * for userland threading libraries such as glibc and
- * is documented as such. Therefore, assume this is
- * only ever invoked for the purpose of alerting a
- * Linux threading library. Assume that the tid is a
- * member of the caller's process and the signal
- * number is valid. See lx_sigev_thread_id() for the
- * userland side of this emulation.
- *
- * The return code from this function is not checked
- * by the caller since it executes in an asynchronous
- * context and there is nothing much to be done. If
- * this function does fail then it will manifest as
- * Linux threads waiting for a signal they will never
- * receive.
- *
- * arg1 -- Linux tid
- * arg2 -- Linux signal number
- * arg3 -- sigval pointer
- */
-
- int native_sig = lx_ltos_signo((int)arg2, 0);
- pid_t spid;
- int stid;
- sigqueue_t *sqp;
-
- if (native_sig == 0)
- return (EINVAL);
-
- if (lx_lpid_to_spair((pid_t)arg1, &spid, &stid) != 0) {
- return (ESRCH);
- }
- sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
- mutex_enter(&curproc->p_lock);
- if ((t = idtot(curproc, stid)) == NULL) {
- mutex_exit(&curproc->p_lock);
- kmem_free(sqp, sizeof (sigqueue_t));
- return (ESRCH);
- }
-
- sqp->sq_info.si_signo = native_sig;
- sqp->sq_info.si_code = SI_TIMER;
- sqp->sq_info.si_pid = curproc->p_pid;
- sqp->sq_info.si_zoneid = getzoneid();
- sqp->sq_info.si_uid = crgetruid(CRED());
- sqp->sq_info.si_value.sival_ptr = (void *)arg3;
- sigaddqa(curproc, t, sqp);
-
- mutex_exit(&curproc->p_lock);
-
- return (0);
- }
-
case B_PTRACE_STOP_FOR_OPT:
return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
@@ -1618,10 +1562,6 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
(siginfo_t *)arg4));
- case B_SET_THUNK_PID:
- lwpd->br_lx_thunk_pid = arg1;
- return (0);
-
case B_GETPID:
/*
* The usermode clone(2) code needs to be able to call
diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c
index 7cb29f1004..c8824e6783 100644
--- a/usr/src/uts/common/brand/lx/os/lx_syscall.c
+++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c
@@ -615,8 +615,8 @@ lx_sysent_t lx_sysent32[] = {
{"ftruncate", NULL, 0, 2}, /* 93 */
{"fchmod", lx_fchmod, 0, 2}, /* 94 */
{"fchown16", lx_fchown16, 0, 3}, /* 95 */
- {"getpriority", NULL, 0, 2}, /* 96 */
- {"setpriority", NULL, 0, 3}, /* 97 */
+ {"getpriority", lx_getpriority, 0, 2}, /* 96 */
+ {"setpriority", lx_setpriority, 0, 3}, /* 97 */
{"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */
{"statfs", NULL, 0, 2}, /* 99 */
{"fstatfs", NULL, 0, 2}, /* 100 */
@@ -778,7 +778,7 @@ lx_sysent_t lx_sysent32[] = {
{"epoll_wait", lx_epoll_wait, 0, 4}, /* 256 */
{"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */
{"set_tid_address", lx_set_tid_address, 0, 1}, /* 258 */
- {"timer_create", NULL, 0, 3}, /* 259 */
+ {"timer_create", lx_timer_create, 0, 3}, /* 259 */
{"timer_settime", NULL, 0, 4}, /* 260 */
{"timer_gettime", NULL, 0, 2}, /* 261 */
{"timer_getoverrun", NULL, 0, 1}, /* 262 */
@@ -1030,8 +1030,8 @@ lx_sysent_t lx_sysent64[] = {
{"statfs", NULL, 0, 2}, /* 137 */
{"fstatfs", NULL, 0, 2}, /* 138 */
{"sysfs", NULL, 0, 3}, /* 139 */
- {"getpriority", NULL, 0, 2}, /* 140 */
- {"setpriority", NULL, 0, 3}, /* 141 */
+ {"getpriority", lx_getpriority, 0, 2}, /* 140 */
+ {"setpriority", lx_setpriority, 0, 3}, /* 141 */
{"sched_setparam", lx_sched_setparam, 0, 2}, /* 142 */
{"sched_getparam", lx_sched_getparam, 0, 2}, /* 143 */
{"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 144 */
@@ -1112,7 +1112,7 @@ lx_sysent_t lx_sysent64[] = {
{"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */
{"semtimedop", NULL, 0, 4}, /* 220 */
{"fadvise64", lx_fadvise64, 0, 4}, /* 221 */
- {"timer_create", NULL, 0, 3}, /* 222 */
+ {"timer_create", lx_timer_create, 0, 3}, /* 222 */
{"timer_settime", NULL, 0, 4}, /* 223 */
{"timer_gettime", NULL, 0, 2}, /* 224 */
{"timer_getoverrun", NULL, 0, 1}, /* 225 */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h
index 959e84b0a0..30d576044f 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_brand.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h
@@ -101,7 +101,7 @@ extern "C" {
#define B_STORE_ARGS 137
#define B_GETPID 138
#define B_JUMP_TO_LINUX 139
-#define B_SET_THUNK_PID 140
+/* formerly B_SET_THUNK_PID 140 */
#define B_EXIT_AS_SIG 141
/* formerly B_HELPER_WAITID 142 */
#define B_HELPER_CLONE 143
@@ -109,7 +109,7 @@ extern "C" {
#define B_HELPER_SIGQUEUE 145
#define B_HELPER_TGSIGQUEUE 146
#define B_SET_NATIVE_STACK 147
-#define B_SIGEV_THREAD_ID 148
+/* formerly B_SIGEV_THREAD_ID 148 */
#define B_OVERRIDE_KERN_VER 149
#define B_PTRACE_SIG_RETURN 150
#define B_GET_PERSONALITY 151
@@ -525,13 +525,6 @@ struct lx_lwp_data {
uintptr_t br_ntv_stack_current;
/*
- * If this pid is set, we return it with getpid(). This allows the
- * thunking server to interpose on the pid returned to the Linux
- * syslog software.
- */
- pid_t br_lx_thunk_pid;
-
- /*
* If strict mode is enabled (via LX_STRICT in the environment), any
* call to lx_unsupported() will set this boolean to B_TRUE. This will
* cause us to drop SIGSYS on the LWP as it attempts to return to
diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
index f8fb1c145d..2784ed6919 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
@@ -102,6 +102,7 @@ extern long lx_getpgrp();
extern long lx_getsockname();
extern long lx_getpid();
extern long lx_getppid();
+extern long lx_getpriority();
extern long lx_getrandom();
extern long lx_getresgid();
extern long lx_getresgid16();
@@ -203,6 +204,7 @@ extern long lx_setgid();
extern long lx_setgid16();
extern long lx_sethostname();
extern long lx_setpgid();
+extern long lx_setpriority();
extern long lx_setregid();
extern long lx_setregid16();
extern long lx_setresgid();
@@ -235,6 +237,7 @@ extern long lx_syslog();
extern long lx_removexattr();
extern long lx_tgkill();
extern long lx_time();
+extern long lx_timer_create();
extern long lx_tkill();
extern long lx_umask();
extern long lx_umount();
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
index c2506f52c5..4cef3196c9 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/zone.h>
@@ -48,11 +48,7 @@ lx_getpid(void)
} else {
VERIFY(lwpd != NULL);
- if (lwpd->br_lx_thunk_pid != 0) {
- rv = lwpd->br_lx_thunk_pid;
- } else {
- rv = lwpd->br_tgid;
- }
+ rv = lwpd->br_tgid;
}
return (rv);
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_priority.c b/usr/src/uts/common/brand/lx/syscall/lx_priority.c
new file mode 100644
index 0000000000..44c60b66bf
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_priority.c
@@ -0,0 +1,192 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/procset.h>
+#include <sys/resource.h>
+#include <sys/priocntl.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+/* From uts/common/disp/priocntl.c */
+extern int donice(procset_t *, pcnice_t *);
+
+/*
+ * The Linux syscall returns priorities in the range (highest) 40-1 (lowest),
+ * and glibc then adjusts these to the range -20 to 19.
+ */
+long
+lx_getpriority(int which, id_t who)
+{
+ int rval;
+ idtype_t idtype;
+ id_t id, lid;
+ pcnice_t pcnice;
+ procset_t procset;
+
+ switch (which) {
+ case PRIO_PROCESS:
+ idtype = P_PID;
+ if (who > 0 && lx_lpid_to_spair(who, &who, &lid) < 0)
+ return (set_errno(ESRCH));
+ break;
+ case PRIO_PGRP:
+ idtype = P_PGID;
+ break;
+ case PRIO_USER:
+ idtype = P_UID;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ /* Linux fails with a different errno on a negative id */
+ if (who < 0)
+ return (set_errno(ESRCH));
+
+ id = (who == 0 ? P_MYID : who);
+
+ pcnice.pc_val = 0;
+ pcnice.pc_op = PC_GETNICE;
+
+ setprocset(&procset, POP_AND, idtype, id, P_ALL, 0);
+
+ rval = donice(&procset, &pcnice);
+ if (rval != 0) {
+ if (which == PRIO_PROCESS &&
+ (who == curproc->p_pid || who == 0) &&
+ strcmp(sclass[curthread->t_cid].cl_name, "RT") == 0) {
+ /*
+ * donice() will always return EINVAL if we're in the
+ * RT class. The zone won't be able to put itself or any
+ * of its processes into RT, but if we put the whole
+ * zone into RT via the scheduling-class property, then
+ * getpriority would always fail. This breaks pam and
+ * prevents any login. Just pretend to be the highest
+ * priority.
+ */
+ return (40);
+ }
+
+ /*
+ * Linux does not return EINVAL for invalid 'who' values, it
+ * returns ESRCH instead. We already validated 'which' above.
+ */
+ if (rval == EINVAL)
+ rval = ESRCH;
+ return (set_errno(rval));
+ }
+
+ /*
+ * The return value of the getpriority syscall is biased by 20 to avoid
+ * returning negative values when successful (-20 internally is our
+ * highest priority and 19 is our lowest).
+ */
+ return (20 - pcnice.pc_val);
+}
+
+/*
+ * Return EPERM if the current process is not allowed to operate on the target
+ * process (which is part of the procset for setpriority).
+ */
+/* ARGSUSED */
+static int
+lx_chk_pripriv(proc_t *pp, char *dummy)
+{
+ ASSERT(MUTEX_HELD(&pidlock));
+ mutex_enter(&pp->p_lock);
+ if (!prochasprocperm(pp, curproc, CRED())) {
+ mutex_exit(&pp->p_lock);
+ return (EPERM);
+ }
+ mutex_exit(&pp->p_lock);
+ return (0);
+}
+
+long
+lx_setpriority(int which, id_t who, int prio)
+{
+ int rval;
+ idtype_t idtype;
+ id_t id, lid;
+ pcnice_t pcnice;
+ procset_t procset;
+
+ switch (which) {
+ case PRIO_PROCESS:
+ idtype = P_PID;
+ if (who > 0 && lx_lpid_to_spair(who, &who, &lid) < 0)
+ return (set_errno(ESRCH));
+ break;
+ case PRIO_PGRP:
+ idtype = P_PGID;
+ break;
+ case PRIO_USER:
+ idtype = P_UID;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ /* Linux fails with a different errno on a negative id */
+ if (who < 0)
+ return (set_errno(ESRCH));
+
+ id = (who == 0 ? P_MYID : who);
+
+ if (prio > NZERO - 1) {
+ prio = NZERO - 1;
+ } else if (prio < -NZERO) {
+ prio = -NZERO;
+ }
+
+ pcnice.pc_val = prio;
+ pcnice.pc_op = PC_SETNICE;
+
+ setprocset(&procset, POP_AND, idtype, id, P_ALL, 0);
+
+ rval = donice(&procset, &pcnice);
+ if (rval != 0) {
+ /*
+ * Once we fully support Linux capabilities, we should update
+ * the following check to look at the CAP_SYS_NICE capability.
+ */
+ if (rval == EPERM && crgetuid(CRED()) != 0) {
+ /*
+ * donice() returns EPERM under two conditions:
+ * 1) if either the real or eff. uid don't match
+			 * 1) if either the real or eff. uid doesn't match
+ *
+ * However, setpriority() must return a different errno
+ * based on the following:
+ * EPERM - real or eff. uid did not match
+ * EACCES - trying to increase priority
+ *
+ * We use lx_chk_pripriv to determine which case we hit.
+ *
+ * Note that the native setpriority(3C) code has the
+ * same race on re-checking.
+ */
+ if (dotoprocs(&procset, lx_chk_pripriv, NULL) != EPERM)
+ rval = EACCES;
+ }
+
+ return (set_errno(rval));
+ }
+
+ return (0);
+}
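
To summarize the value plumbing in lx_getpriority()/lx_setpriority() above: user-visible nice values run from -20 (highest priority) to 19 (lowest), illumos PC_GETNICE/PC_SETNICE uses that same -NZERO..NZERO-1 range, and the raw Linux getpriority syscall returns the value biased by 20, i.e. 40 down to 1, which glibc converts back. A small sketch of just that arithmetic (the helper names are hypothetical):

    /* Nice-value mapping sketch: only the arithmetic, no error handling. */
    static int
    lx_prio_from_nice(int nice)
    {
        /* kernel nice -20..19 becomes the biased syscall return 40..1 */
        return (20 - nice);
    }

    static int
    glibc_nice_from_prio(int prio)
    {
        /* glibc undoes the bias: 40..1 becomes -20..19 */
        return (20 - prio);
    }

So a raw syscall return of 40 corresponds to nice -20, which is why the RT-class special case above simply returns 40, the highest priority.
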
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_timer.c b/usr/src/uts/common/brand/lx/syscall/lx_timer.c
index 17ca59b534..279bdbddc7 100644
--- a/usr/src/uts/common/brand/lx/syscall/lx_timer.c
+++ b/usr/src/uts/common/brand/lx/syscall/lx_timer.c
@@ -32,7 +32,10 @@
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
#include <sys/lx_impl.h>
+#include <lx_signum.h>
/*
* From "uts/common/os/timer.c":
@@ -90,8 +93,64 @@ static lx_clock_backend_t lx_clock_backends[] = {
#define LX_CLOCK_MAX \
(sizeof (lx_clock_backends) / sizeof (lx_clock_backends[0]))
-#define LX_CLOCK_BACKEND(clk) \
- ((clk) < LX_CLOCK_MAX && (clk) >= 0 ? &lx_clock_backends[(clk)] : NULL)
+#define LX_CLOCK_BACKEND(clk) (((clk) < LX_CLOCK_MAX && (clk) >= 0) ? \
+ &lx_clock_backends[(clk)] : NULL)
+
+/*
+ * Linux defines the size of the sigevent structure to be 64 bytes. In order
+ * to meet that definition, the trailing union includes a member which pads it
+ * out to the desired length for the given architecture.
+ */
+#define LX_SIGEV_PAD_SIZE ((64 - \
+ (sizeof (int) * 2 + sizeof (union sigval))) / sizeof (int))
+
+typedef struct {
+ union sigval lx_sigev_value;
+ int lx_sigev_signo;
+ int lx_sigev_notify;
+ union {
+ int lx_pad[LX_SIGEV_PAD_SIZE];
+ int lx_tid;
+ struct {
+ void (*lx_notify_function)(union sigval);
+ void *lx_notify_attribute;
+ } lx_sigev_thread;
+ } lx_sigev_un;
+} lx_sigevent_t;
+
+
+#ifdef _SYSCALL32_IMPL
+
+#define LX_SIGEV32_PAD_SIZE ((64 - \
+ (sizeof (int) * 2 + sizeof (union sigval32))) / sizeof (int))
+
+typedef struct {
+ union sigval32 lx_sigev_value;
+ int lx_sigev_signo;
+ int lx_sigev_notify;
+ union {
+ int lx_pad[LX_SIGEV32_PAD_SIZE];
+ int lx_tid;
+ struct {
+ caddr32_t lx_notify_function;
+ caddr32_t lx_notify_attribute;
+ } lx_sigev_thread;
+ } lx_sigev_un;
+} lx_sigevent32_t;
+
+#endif /* _SYSCALL32_IMPL */
+
+#define LX_SIGEV_SIGNAL 0
+#define LX_SIGEV_NONE 1
+#define LX_SIGEV_THREAD 2
+#define LX_SIGEV_THREAD_ID 4
+
+/*
+ * Access private SIGEV_THREAD_ID callback state in itimer_t
+ */
+#define LX_SIGEV_THREAD_ID_LPID(it) ((it)->it_cb_data[0])
+#define LX_SIGEV_THREAD_ID_TID(it) ((it)->it_cb_data[1])
+
/* ARGSUSED */
static int
@@ -276,6 +335,196 @@ lx_clock_getres(int clock, timespec_t *tp)
return (backend->lclk_clock_getres(backend->lclk_ntv_id, tp));
}
+static int
+lx_ltos_sigev(lx_sigevent_t *lev, struct sigevent *sev)
+{
+ bzero(sev, sizeof (*sev));
+
+ switch (lev->lx_sigev_notify) {
+ case LX_SIGEV_NONE:
+ sev->sigev_notify = SIGEV_NONE;
+ break;
+
+ case LX_SIGEV_SIGNAL:
+ case LX_SIGEV_THREAD_ID:
+ sev->sigev_notify = SIGEV_SIGNAL;
+ break;
+
+ case LX_SIGEV_THREAD:
+ /*
+ * Just as in illumos, SIGEV_THREAD handling is performed in
+ * userspace with the help of SIGEV_SIGNAL/SIGEV_THREAD_ID.
+ *
+ * It's not expected to make an appearance in the syscall.
+ */
+ default:
+ return (EINVAL);
+ }
+
+ sev->sigev_signo = lx_ltos_signo(lev->lx_sigev_signo, 0);
+ sev->sigev_value = lev->lx_sigev_value;
+
+ /* Ensure SIGEV_SIGNAL has a valid signo to work with. */
+ if (sev->sigev_notify == SIGEV_SIGNAL && sev->sigev_signo == 0) {
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+lx_sigev_copyin(lx_sigevent_t *userp, lx_sigevent_t *levp)
+{
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ lx_sigevent32_t lev32;
+
+ if (copyin(userp, &lev32, sizeof (lev32)) != 0) {
+ return (EFAULT);
+ }
+ levp->lx_sigev_value.sival_int = lev32.lx_sigev_value.sival_int;
+ levp->lx_sigev_signo = lev32.lx_sigev_signo;
+ levp->lx_sigev_notify = lev32.lx_sigev_notify;
+ levp->lx_sigev_un.lx_tid = lev32.lx_sigev_un.lx_tid;
+ } else
+#endif /* _SYSCALL32_IMPL */
+ {
+ if (copyin(userp, levp, sizeof (lx_sigevent_t)) != 0) {
+ return (EFAULT);
+ }
+ }
+ return (0);
+}
+
+static void
+lx_sigev_thread_fire(itimer_t *it)
+{
+ proc_t *p = it->it_proc;
+ pid_t lpid = (pid_t)LX_SIGEV_THREAD_ID_LPID(it);
+ id_t tid = (id_t)LX_SIGEV_THREAD_ID_TID(it);
+ lwpdir_t *ld;
+
+ ASSERT(MUTEX_HELD(&it->it_mutex));
+ ASSERT(it->it_pending == 0);
+ ASSERT(it->it_flags & IT_SIGNAL);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ ld = lwp_hash_lookup(p, tid);
+ if (ld != NULL) {
+ lx_lwp_data_t *lwpd;
+ kthread_t *t;
+
+ t = ld->ld_entry->le_thread;
+ lwpd = ttolxlwp(t);
+ if (lwpd != NULL && lwpd->br_pid == lpid) {
+ /*
+ * A thread matching the LX pid is still present in the
+ * process. Send a targeted signal as requested.
+ */
+ it->it_pending = 1;
+ mutex_exit(&it->it_mutex);
+ sigaddqa(p, t, it->it_sigq);
+ return;
+ }
+ }
+
+ mutex_exit(&it->it_mutex);
+}
+
+long
+lx_timer_create(int clock, lx_sigevent_t *sevp, timer_t *tidp)
+{
+ int error;
+ lx_sigevent_t lev;
+ struct sigevent sev;
+ clock_backend_t *backend = NULL;
+ proc_t *p = curproc;
+ itimer_t *itp;
+ timer_t tid;
+
+ if (clock == -2) {
+ /*
+ * A change was made to the old userspace timer emulation to
+		 * handle this specific clock ID for MapR. It was wrongly
+		 * mapped to CLOCK_REALTIME rather than CLOCK_THREAD_CPUTIME_ID,
+		 * which is what it actually corresponds to. Until the
+		 * CLOCK_*_CPUTIME_ID timers can be emulated, the admittedly
+		 * incorrect mapping remains.
+ */
+ backend = clock_get_backend(CLOCK_REALTIME);
+ } else {
+ lx_clock_backend_t *lback = LX_CLOCK_BACKEND(clock);
+
+ if (lback != NULL) {
+ backend = clock_get_backend(lback->lclk_ntv_id);
+ }
+ }
+ if (backend == NULL) {
+ return (set_errno(EINVAL));
+ }
+
+ /* We have to convert the Linux sigevent layout to the illumos layout */
+ if (sevp != NULL) {
+ if ((error = lx_sigev_copyin(sevp, &lev)) != 0) {
+ return (set_errno(error));
+ }
+ if ((error = lx_ltos_sigev(&lev, &sev)) != 0) {
+ return (set_errno(error));
+ }
+ } else {
+ bzero(&sev, sizeof (sev));
+ sev.sigev_notify = SIGEV_SIGNAL;
+ sev.sigev_signo = SIGALRM;
+ }
+
+ if ((error = timer_setup(backend, &sev, NULL, &itp, &tid)) != 0) {
+ return (set_errno(error));
+ }
+
+ /*
+ * The SIGEV_THREAD_ID notification method in Linux allows the caller
+ * to target a specific thread to receive the signal. The IT_CALLBACK
+ * timer functionality is used to fulfill this need. After translating
+ * the LX pid to a SunOS thread ID (ensuring it exists in the current
+ * process), those IDs are attached to the timer along with the custom
+ * lx_sigev_thread_fire callback. This targets the signal notification
+ * properly when the timer fires.
+ */
+ if (lev.lx_sigev_notify == LX_SIGEV_THREAD_ID) {
+ pid_t lpid, spid;
+ id_t stid;
+
+ lpid = (pid_t)lev.lx_sigev_un.lx_tid;
+ if (lx_lpid_to_spair(lpid, &spid, &stid) != 0 ||
+ spid != curproc->p_pid) {
+ error = EINVAL;
+ goto err;
+ }
+
+ itp->it_flags |= IT_CALLBACK;
+ itp->it_cb_func = lx_sigev_thread_fire;
+ LX_SIGEV_THREAD_ID_LPID(itp) = lpid;
+ LX_SIGEV_THREAD_ID_TID(itp) = stid;
+ }
+
+	 * When no sigevent is specified, the sigev_value delivered with the
+	 * default SIGALRM notification is expected to carry the timer ID.
+ * expected to be populated with the timer ID.
+ */
+ if (sevp == NULL) {
+ itp->it_sigq->sq_info.si_value.sival_int = tid;
+ }
+
+ if (copyout(&tid, tidp, sizeof (timer_t)) != 0) {
+ error = EFAULT;
+ goto err;
+ }
+
+ timer_release(p, itp);
+ return (0);
+
+err:
+ timer_delete_grabbed(p, tid, itp);
+ return (set_errno(error));
+}
long
lx_gettimeofday(struct timeval *tvp, struct lx_timezone *tzp)
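
The LX_SIGEV_PAD_SIZE macro above exists only to make lx_sigevent_t exactly 64 bytes, matching Linux's fixed-size sigevent definition. A quick check of the arithmetic, assuming sizeof (int) == 4 and sizeof (union sigval) == 8 as on LP64:

    #include <assert.h>

    int
    main(void)
    {
        /* pad = (64 - (2 ints + sigval)) / sizeof (int) = (64 - 16) / 4 */
        int pad = (64 - (4 * 2 + 8)) / 4;

        assert(pad == 12);
        /* value (8) + signo (4) + notify (4) + pad (12 * 4) == 64 bytes */
        assert(8 + 4 + 4 + pad * 4 == 64);
        return (0);
    }

The ILP32 variant uses sizeof (union sigval32) == 4, so the pad grows to 13 ints and the total stays 64 bytes.
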
diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c
index 0101d12432..fb48f15306 100644
--- a/usr/src/uts/common/disp/priocntl.c
+++ b/usr/src/uts/common/disp/priocntl.c
@@ -113,7 +113,7 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap, uio_seg_t seg)
#endif
-static int donice(procset_t *, pcnice_t *);
+int donice(procset_t *, pcnice_t *);
static int doprio(procset_t *, pcprio_t *);
static int proccmp(proc_t *, struct pcmpargs *);
static int setparms(proc_t *, struct stprmargs *);
@@ -990,7 +990,7 @@ setprocnice(proc_t *pp, pcnice_t *pcnice)
/*
* Update the nice value of the specified LWP or set of processes.
*/
-static int
+int
donice(procset_t *procset, pcnice_t *pcnice)
{
int err_proc = 0;
diff --git a/usr/src/uts/common/fs/nfs/nfs4_client.c b/usr/src/uts/common/fs/nfs/nfs4_client.c
index 7bfa46e1fb..5438038105 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_client.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_client.c
@@ -184,7 +184,6 @@ nfs4_validate_caches(vnode_t *vp, cred_t *cr)
return (0);
}
- gar.n4g_va.va_mask = AT_ALL;
return (nfs4_getattr_otw(vp, &gar, cr, 0));
}
@@ -582,6 +581,16 @@ nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
rp->r_attr.va_ctime.tv_nsec !=
vap->va_ctime.tv_nsec)
ctime_changed = 1;
+
+ /*
+ * If the change attribute was not provided by server
+ * or it differs, then flush all caches.
+ */
+ if (!garp->n4g_change_valid ||
+ rp->r_change != garp->n4g_change) {
+ mtime_changed = 1;
+ ctime_changed = 1;
+ }
} else {
writemodify_set = B_TRUE;
}
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
index 855cd8cd92..7240faa356 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
@@ -209,7 +209,7 @@ rfs4_attr_init()
/* ARGSUSED */
static int
rfs4_fattr4_supported_attrs(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -251,7 +251,7 @@ static nfs_ftype4 vt_to_nf4[] = {
/* ARGSUSED */
static int
rfs4_fattr4_type(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -357,7 +357,7 @@ fattr4_get_fh_expire_type(struct exportinfo *exi, uint32_t *fh_expire_typep)
/* ARGSUSED */
static int
rfs4_fattr4_fh_expire_type(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
uint32_t fh_expire_type;
int error = 0;
@@ -396,6 +396,7 @@ fattr4_get_change(struct nfs4_svgetit_arg *sarg, fattr4_change *changep)
struct compound_state *cs = sarg->cs;
vnode_t *vp = cs->vp;
nfsstat4 status;
+ timespec_t vis_change;
if ((vap->va_mask & AT_CTIME) == 0) {
if (sarg->rdattr_error && (vp == NULL)) {
@@ -408,14 +409,22 @@ fattr4_get_change(struct nfs4_svgetit_arg *sarg, fattr4_change *changep)
if (status != NFS4_OK)
return (geterrno4(status));
}
- NFS4_SET_FATTR4_CHANGE(*changep, vap->va_ctime)
+ NFS4_SET_FATTR4_CHANGE(*changep, vap->va_ctime);
+
+ if (nfs_visible_change(cs->exi, vp, &vis_change)) {
+ fattr4_change visch;
+ NFS4_SET_FATTR4_CHANGE(visch, vis_change);
+ if (visch > *changep)
+ *changep = visch;
+ }
+
return (0);
}
/* ARGSUSED */
static int
rfs4_fattr4_change(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
fattr4_change change;
@@ -453,7 +462,7 @@ rfs4_fattr4_change(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_size(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -490,7 +499,7 @@ rfs4_fattr4_size(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_link_support(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -525,7 +534,7 @@ rfs4_fattr4_link_support(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_symlink_support(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -556,7 +565,7 @@ rfs4_fattr4_symlink_support(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_named_attr(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -626,7 +635,7 @@ rfs4_fattr4_named_attr(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_fsid(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
int *pmaj = (int *)&na->fsid.major;
@@ -681,7 +690,7 @@ rfs4_fattr4_fsid(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_unique_handles(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
/*
* XXX
@@ -718,7 +727,7 @@ rfs4_fattr4_unique_handles(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_lease_time(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -749,7 +758,7 @@ rfs4_fattr4_lease_time(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_rdattr_error(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -798,7 +807,7 @@ rfs4fhcmp(nfs_fh4 *wirefh, nfs_fh4 *srvfh)
/* ARGSUSED */
static int
rfs4_fattr4_filehandle(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
nfs_fh4 *fh;
@@ -861,7 +870,7 @@ rfs4_fattr4_filehandle(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
vsecattr_t vs_native, vs_ace4;
@@ -1047,7 +1056,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_aclsupport(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1079,7 +1088,7 @@ rfs4_fattr4_aclsupport(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_archive(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -1087,7 +1096,7 @@ rfs4_fattr4_archive(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_cansettime(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1125,7 +1134,7 @@ rfs4_fattr4_cansettime(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_case_insensitive(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1159,7 +1168,7 @@ rfs4_fattr4_case_insensitive(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_case_preserving(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1194,7 +1203,7 @@ rfs4_fattr4_case_preserving(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_chown_restricted(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -1244,7 +1253,7 @@ rfs4_fattr4_chown_restricted(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_fileid(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1352,7 +1361,7 @@ rfs4_get_mntdfileid(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg)
/* ARGSUSED */
static int
rfs4_fattr4_mounted_on_fileid(nfs4_attr_cmd_t cmd,
- struct nfs4_svgetit_arg *sarg, union nfs4_attr_u *na)
+ struct nfs4_svgetit_arg *sarg, union nfs4_attr_u *na)
{
int error = 0;
@@ -1391,7 +1400,7 @@ rfs4_fattr4_mounted_on_fileid(nfs4_attr_cmd_t cmd,
/* ARGSUSED */
static int
rfs4_fattr4_files_avail(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1431,7 +1440,7 @@ rfs4_fattr4_files_avail(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_files_free(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1471,7 +1480,7 @@ rfs4_fattr4_files_free(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_files_total(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1571,7 +1580,7 @@ rfs4_free_fs_locations4(fs_locations4 *fsls4)
/* ARGSUSED */
static int
rfs4_fattr4_fs_locations(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
fs_locations4 *fsl;
@@ -1617,7 +1626,7 @@ rfs4_fattr4_fs_locations(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_hidden(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -1625,7 +1634,7 @@ rfs4_fattr4_hidden(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_homogeneous(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1659,7 +1668,7 @@ rfs4_fattr4_homogeneous(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxfilesize(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -1737,7 +1746,7 @@ rfs4_fattr4_maxfilesize(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxlink(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -1784,7 +1793,7 @@ rfs4_fattr4_maxlink(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxname(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
ulong_t val;
@@ -1831,7 +1840,7 @@ rfs4_fattr4_maxname(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxread(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1865,7 +1874,7 @@ rfs4_fattr4_maxread(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_maxwrite(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1899,7 +1908,7 @@ rfs4_fattr4_maxwrite(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_mimetype(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -1907,7 +1916,7 @@ rfs4_fattr4_mimetype(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_mode(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1950,7 +1959,7 @@ rfs4_fattr4_mode(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_no_trunc(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -1984,7 +1993,7 @@ rfs4_fattr4_no_trunc(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_numlinks(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2024,7 +2033,7 @@ rfs4_fattr4_numlinks(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_owner(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
uid_t uid;
@@ -2136,7 +2145,7 @@ rfs4_fattr4_owner(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_owner_group(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
gid_t gid;
@@ -2252,7 +2261,7 @@ rfs4_fattr4_owner_group(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_quota_avail_hard(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2260,7 +2269,7 @@ rfs4_fattr4_quota_avail_hard(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_quota_avail_soft(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2268,7 +2277,7 @@ rfs4_fattr4_quota_avail_soft(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_quota_used(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2276,7 +2285,7 @@ rfs4_fattr4_quota_used(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_rawdev(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2320,7 +2329,7 @@ rfs4_fattr4_rawdev(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_space_avail(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2367,7 +2376,7 @@ rfs4_fattr4_space_avail(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_space_free(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2414,7 +2423,7 @@ rfs4_fattr4_space_free(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_space_total(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2461,7 +2470,7 @@ rfs4_fattr4_space_total(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_space_used(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2502,7 +2511,7 @@ rfs4_fattr4_space_used(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_system(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2510,7 +2519,7 @@ rfs4_fattr4_system(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_access(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
timestruc_t atime;
@@ -2557,7 +2566,7 @@ rfs4_fattr4_time_access(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_access_set(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
settime4 *ta;
@@ -2601,7 +2610,7 @@ rfs4_fattr4_time_access_set(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_backup(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2609,7 +2618,7 @@ rfs4_fattr4_time_backup(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_create(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
return (ENOTSUP);
}
@@ -2617,7 +2626,7 @@ rfs4_fattr4_time_create(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_delta(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
@@ -2653,7 +2662,7 @@ rfs4_fattr4_time_delta(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_metadata(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
timestruc_t ctime;
@@ -2698,7 +2707,7 @@ rfs4_fattr4_time_metadata(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_modify(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
timestruc_t mtime;
@@ -2745,7 +2754,7 @@ rfs4_fattr4_time_modify(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
/* ARGSUSED */
static int
rfs4_fattr4_time_modify_set(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
- union nfs4_attr_u *na)
+ union nfs4_attr_u *na)
{
int error = 0;
settime4 *tm;
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
index 3ee41939ac..4ad799be46 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
@@ -144,7 +144,7 @@ nfs4_vget_pseudo(struct exportinfo *exi, vnode_t **vpp, fid_t *fidp)
*/
struct exportinfo *
pseudo_exportfs(vnode_t *vp, fid_t *fid, struct exp_visible *vis_head,
- struct exportdata *exdata)
+ struct exportdata *exdata)
{
struct exportinfo *exi;
struct exportdata *kex;
@@ -446,8 +446,12 @@ more_visible(struct exportinfo *exi, treenode_t *tree_head)
* list just assign the entire supplied list.
*/
if (exi->exi_visible == NULL) {
- tree_add_child(exi->exi_tree, tree_head);
+ tree_add_child(connect_point, tree_head);
exi->exi_visible = vis_head;
+
+ /* Update the change timestamp */
+ tree_update_change(connect_point, &vis_head->vis_change);
+
return;
}
@@ -504,6 +508,11 @@ more_visible(struct exportinfo *exi, treenode_t *tree_head)
connect_point = child;
} else { /* Branching */
tree_add_child(connect_point, curr);
+
+ /* Update the change timestamp */
+ tree_update_change(connect_point,
+ &curr->tree_vis->vis_change);
+
connect_point = NULL;
}
}
@@ -612,15 +621,17 @@ treeclimb_export(struct exportinfo *exip)
fid_t fid;
int error;
int exportdir;
- struct exportinfo *exi = NULL;
struct exportinfo *new_exi = exip;
struct exp_visible *visp;
struct exp_visible *vis_head = NULL;
struct vattr va;
treenode_t *tree_head = NULL;
+ timespec_t now;
ASSERT(RW_WRITE_HELD(&exported_lock));
+ gethrestime(&now);
+
vp = exip->exi_vp;
VN_HOLD(vp);
exportdir = 1;
@@ -633,36 +644,33 @@ treeclimb_export(struct exportinfo *exip)
if (error)
break;
- if (! exportdir) {
- /*
- * Check if this exportroot is a VROOT dir. If so,
- * then attach the pseudonodes. If not, then
- * continue .. traversal until we hit a VROOT
- * export (pseudo or real).
- */
- exi = checkexport4(&vp->v_vfsp->vfs_fsid, &fid, vp);
- if (exi != NULL && vp->v_flag & VROOT) {
- /*
- * Found an export info
- *
- * Extend the list of visible
- * directories whether it's a pseudo
- * or a real export.
- */
- more_visible(exi, tree_head);
- break; /* and climb no further */
- }
- }
-
/*
- * If at the root of the filesystem, need
- * to traverse across the mountpoint
- * and continue the climb on the mounted-on
- * filesystem.
+ * The root of the file system needs special handling
*/
if (vp->v_flag & VROOT) {
-
if (! exportdir) {
+ struct exportinfo *exi;
+
+ /*
+ * Check if this VROOT dir is already exported.
+ * If so, then attach the pseudonodes. If not,
+ * then continue .. traversal until we hit a
+ * VROOT export (pseudo or real).
+ */
+ exi = checkexport4(&vp->v_vfsp->vfs_fsid, &fid,
+ vp);
+ if (exi != NULL) {
+ /*
+ * Found an export info
+ *
+ * Extend the list of visible
+ * directories whether it's a pseudo
+ * or a real export.
+ */
+ more_visible(exi, tree_head);
+ break; /* and climb no further */
+ }
+
/*
* Found the root directory of a filesystem
* that isn't exported. Need to export
@@ -679,13 +687,21 @@ treeclimb_export(struct exportinfo *exip)
/*
* If sharing "/", new_exi is shared exportinfo
* (exip). Otherwise, new_exi is exportinfo
- * created in pseudo_exportfs() above.
+ * created by pseudo_exportfs() above.
*/
- ns_root = tree_prepend_node(tree_head, 0,
+ ns_root = tree_prepend_node(tree_head, NULL,
new_exi);
+
+ /* Update the change timestamp */
+ tree_update_change(ns_root, &now);
+
break;
}
+ /*
+ * Traverse across the mountpoint and continue the
+ * climb on the mounted-on filesystem.
+ */
vp = untraverse(vp);
exportdir = 0;
continue;
@@ -712,10 +728,10 @@ treeclimb_export(struct exportinfo *exip)
visp->vis_exported = exportdir;
visp->vis_secinfo = NULL;
visp->vis_seccnt = 0;
+ visp->vis_change = now; /* structure copy */
visp->vis_next = vis_head;
vis_head = visp;
-
/*
* Will set treenode's pointer to exportinfo to
* 1. shared exportinfo (exip) - if first visit here
@@ -765,7 +781,7 @@ treeclimb_export(struct exportinfo *exip)
/* Connect unconnected exportinfo, if there is any. */
if (new_exi && new_exi != exip)
- tree_head = tree_prepend_node(tree_head, 0, new_exi);
+ tree_head = tree_prepend_node(tree_head, NULL, new_exi);
while (tree_head) {
treenode_t *t2 = tree_head;
@@ -799,6 +815,7 @@ void
treeclimb_unexport(struct exportinfo *exip)
{
treenode_t *tnode, *old_nd;
+ treenode_t *connect_point = NULL;
ASSERT(RW_WRITE_HELD(&exported_lock));
@@ -809,25 +826,25 @@ treeclimb_unexport(struct exportinfo *exip)
*/
tnode->tree_exi = NULL;
- if (tnode->tree_vis) /* system root has tree_vis == NULL */
+ if (tnode->tree_vis != NULL) /* system root has tree_vis == NULL */
tnode->tree_vis->vis_exported = 0;
- while (tnode) {
+ while (tnode != NULL) {
/* Stop at VROOT node which is exported or has child */
if (TREE_ROOT(tnode) &&
- (TREE_EXPORTED(tnode) || tnode->tree_child_first))
+ (TREE_EXPORTED(tnode) || tnode->tree_child_first != NULL))
break;
/* Release pseudo export if it has no child */
if (TREE_ROOT(tnode) && !TREE_EXPORTED(tnode) &&
- tnode->tree_child_first == 0) {
+ tnode->tree_child_first == NULL) {
export_unlink(tnode->tree_exi);
exi_rele(tnode->tree_exi);
}
/* Release visible in parent's exportinfo */
- if (tnode->tree_vis)
+ if (tnode->tree_vis != NULL)
less_visible(vis2exi(tnode), tnode->tree_vis);
/* Continue with parent */
@@ -835,9 +852,16 @@ treeclimb_unexport(struct exportinfo *exip)
tnode = tnode->tree_parent;
/* Remove itself, if this is a leaf and non-exported node */
- if (old_nd->tree_child_first == NULL && !TREE_EXPORTED(old_nd))
+ if (old_nd->tree_child_first == NULL &&
+ !TREE_EXPORTED(old_nd)) {
tree_remove_node(old_nd);
+ connect_point = tnode;
+ }
}
+
+ /* Update the change timestamp */
+ if (connect_point != NULL)
+ tree_update_change(connect_point, NULL);
}
/*
@@ -929,7 +953,7 @@ has_visible(struct exportinfo *exi, vnode_t *vp)
fid_t fid;
bool_t vp_is_exported;
- vp_is_exported = VN_CMP(vp, exi->exi_vp);
+ vp_is_exported = VN_CMP(vp, exi->exi_vp);
/*
* An exported root vnode has a sub-dir shared if it has a visible list.
@@ -1111,10 +1135,9 @@ nfs_exported(struct exportinfo *exi, vnode_t *vp)
* skips . and .. entries.
*/
int
-nfs_visible_inode(struct exportinfo *exi, ino64_t ino, int *expseudo)
+nfs_visible_inode(struct exportinfo *exi, ino64_t ino,
+ struct exp_visible **visp)
{
- struct exp_visible *visp;
-
/*
* Only a PSEUDO node has a visible list or an exported VROOT
* node may have a visible list.
@@ -1122,12 +1145,108 @@ nfs_visible_inode(struct exportinfo *exi, ino64_t ino, int *expseudo)
if (! PSEUDO(exi))
exi = get_root_export(exi);
- for (visp = exi->exi_visible; visp; visp = visp->vis_next)
- if ((u_longlong_t)ino == visp->vis_ino) {
- *expseudo = visp->vis_exported;
+ for (*visp = exi->exi_visible; *visp != NULL; *visp = (*visp)->vis_next)
+ if ((u_longlong_t)ino == (*visp)->vis_ino) {
return (1);
}
- *expseudo = 0;
return (0);
}
+
+/*
+ * The change attribute value of the root of the NFS pseudo namespace.
+ *
+ * The ns_root_change is protected by exported_lock because all of the treenode
+ * operations are protected by exported_lock too.
+ */
+static timespec_t ns_root_change;
+
+/*
+ * Get the change attribute from the visible and return TRUE.
+ * If the change value is not available, return FALSE.
+ */
+bool_t
+nfs_visible_change(struct exportinfo *exi, vnode_t *vp, timespec_t *change)
+{
+ struct exp_visible *visp;
+ fid_t fid;
+ treenode_t *node;
+
+ /*
+ * First check to see if vp is export root.
+ */
+ if (VN_CMP(vp, exi->exi_vp))
+ goto exproot;
+
+ /*
+ * Only a PSEUDO node has a visible list or an exported VROOT
+ * node may have a visible list.
+ */
+ if (!PSEUDO(exi))
+ exi = get_root_export(exi);
+
+ /* Get the fid of the vnode */
+ bzero(&fid, sizeof (fid));
+ fid.fid_len = MAXFIDSZ;
+ if (vop_fid_pseudo(vp, &fid) != 0)
+ return (FALSE);
+
+ /*
+ * We can't trust VN_CMP() above because of LOFS.
+ * Even though VOP_CMP will do the right thing for LOFS
+ * objects, VN_CMP will short circuit out early when the
+ * vnode ops ptrs are different. Just in case we're dealing
+ * with LOFS, compare exi_fid/fsid here.
+ */
+ if (EQFID(&exi->exi_fid, &fid) &&
+ EQFSID(&exi->exi_fsid, &vp->v_vfsp->vfs_fsid))
+ goto exproot;
+
+ /* See if it matches any fid in the visible list */
+ for (visp = exi->exi_visible; visp; visp = visp->vis_next) {
+ if (EQFID(&fid, &visp->vis_fid)) {
+ *change = visp->vis_change;
+ return (TRUE);
+ }
+ }
+
+ return (FALSE);
+
+exproot:
+ /* The VROOT export has its visible available through the treenode */
+ node = exi->exi_tree;
+ if (node != ns_root) {
+ ASSERT(node->tree_vis != NULL);
+ *change = node->tree_vis->vis_change;
+ } else {
+ ASSERT(node->tree_vis == NULL);
+ *change = ns_root_change;
+ }
+
+ return (TRUE);
+}
+
+/*
+ * Update the change attribute value for a particular treenode. The change
+ * attribute value is stored in the visible attached to the treenode, or in the
+ * ns_root_change.
+ *
+ * If the change value is not supplied, the current time is used.
+ */
+void
+tree_update_change(treenode_t *tnode, timespec_t *change)
+{
+ timespec_t *vis_change;
+
+ ASSERT(tnode != NULL);
+ ASSERT((tnode != ns_root && tnode->tree_vis != NULL) ||
+ (tnode == ns_root && tnode->tree_vis == NULL));
+
+ vis_change = tnode == ns_root ? &ns_root_change
+ : &tnode->tree_vis->vis_change;
+
+ if (change != NULL)
+ *vis_change = *change;
+ else
+ gethrestime(vis_change);
+}
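
The readdir changes below fold each visible entry's vis_change into the change attribute directly; a GETATTR-style caller would go through nfs_visible_change() instead. A minimal sketch of that pattern, assuming the NFS4_SET_FATTR4_CHANGE macro used later in this diff; the helper name encode_pseudo_change() is hypothetical and not part of the patch:

/*
 * Sketch only: merge the namespace change value into the change
 * attribute reported for a directory. encode_pseudo_change() is a
 * hypothetical helper, not part of this patch.
 */
static u_longlong_t
encode_pseudo_change(struct exportinfo *exi, vnode_t *vp, vattr_t *vap)
{
	u_longlong_t change, nschange;
	timespec_t ts;

	/* Start from the change value derived from the real ctime. */
	NFS4_SET_FATTR4_CHANGE(change, vap->va_ctime);

	/* Bump it if the pseudo namespace changed more recently. */
	if (nfs_visible_change(exi, vp, &ts)) {
		NFS4_SET_FATTR4_CHANGE(nschange, ts);
		if (nschange > change)
			change = nschange;
	}
	return (change);
}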
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c b/usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c
index 276d3b4f19..01c76cb203 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_readdir.c
@@ -104,8 +104,8 @@ static nfs_ftype4 vt_to_nf4[] = {
int
nfs4_readdir_getvp(vnode_t *dvp, char *d_name, vnode_t **vpp,
- struct exportinfo **exi, struct svc_req *req,
- struct compound_state *cs, int expseudo)
+ struct exportinfo **exi, struct svc_req *req, struct compound_state *cs,
+ int expseudo)
{
int error;
int ismntpt;
@@ -382,8 +382,8 @@ rfs4_get_sb_encode(vfs_t *vfsp, rfs4_sb_encode_t *psbe)
*/
/* ARGSUSED */
void
-rfs4_op_readdir(nfs_argop4 *argop, nfs_resop4 *resop,
- struct svc_req *req, struct compound_state *cs)
+rfs4_op_readdir(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,
+ struct compound_state *cs)
{
READDIR4args *args = &argop->nfs_argop4_u.opreaddir;
READDIR4res *resp = &resop->nfs_resop4_u.opreaddir;
@@ -409,7 +409,7 @@ rfs4_op_readdir(nfs_argop4 *argop, nfs_resop4 *resop,
struct uio uio;
int tsize;
int check_visible;
- int expseudo = 0;
+ struct exp_visible *visp;
uint32_t *ptr, *ptr_redzone;
uint32_t *beginning_ptr;
@@ -687,8 +687,8 @@ readagain:
for (dp = (struct dirent64 *)rddir_data;
!no_space && rddir_result_size > 0; dp = nextdp(dp)) {
- /* reset expseudo */
- expseudo = 0;
+ /* reset visp */
+ visp = NULL;
if (vp) {
VN_RELE(vp);
@@ -707,7 +707,7 @@ readagain:
}
if (check_visible &&
- !nfs_visible_inode(cs->exi, dp->d_ino, &expseudo)) {
+ !nfs_visible_inode(cs->exi, dp->d_ino, &visp)) {
rddir_next_offset = dp->d_off;
continue;
}
@@ -724,7 +724,8 @@ readagain:
goto reencode_attrs;
error = nfs4_readdir_getvp(dvp, dp->d_name,
- &vp, &newexi, req, cs, expseudo);
+ &vp, &newexi, req, cs,
+ visp != NULL ? visp->vis_exported : 0);
if (error == ENOENT) {
rddir_next_offset = dp->d_off;
continue;
@@ -917,6 +918,13 @@ reencode_attrs:
u_longlong_t change;
NFS4_SET_FATTR4_CHANGE(change,
va.va_ctime);
+ if (visp != NULL) {
+ u_longlong_t visch;
+ NFS4_SET_FATTR4_CHANGE(visch,
+ visp->vis_change);
+ if (visch > change)
+ change = visch;
+ }
IXDR_PUT_HYPER(ptr, change);
}
if (ae & FATTR4_SIZE_MASK) {
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index 01886e3627..4c6be91e0a 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -2233,7 +2233,6 @@ nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
(rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
return (0);
- gar.n4g_va.va_mask = AT_ALL;
return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
}
@@ -12384,9 +12383,8 @@ nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
/*
* The getattr otw call will always get both the acl, in
* the form of a list of nfsace4's, and the number of acl
- * entries; independent of the value of gar.n4g_vsa.vsa_mask.
+ * entries; independent of the value of gar.n4g_va.va_mask.
*/
- gar.n4g_va.va_mask = AT_ALL;
error = nfs4_getattr_otw(vp, &gar, cr, 1);
if (error) {
vs_ace4_destroy(&gar.n4g_vsa);
diff --git a/usr/src/uts/common/fs/nfs/nfs_export.c b/usr/src/uts/common/fs/nfs/nfs_export.c
index 4c316a3876..200ef6668d 100644
--- a/usr/src/uts/common/fs/nfs/nfs_export.c
+++ b/usr/src/uts/common/fs/nfs/nfs_export.c
@@ -83,7 +83,7 @@ extern void sec_svc_freerootnames(int, int, caddr_t *);
static int build_seclist_nodups(exportdata_t *, secinfo_t *, int);
static void srv_secinfo_add(secinfo_t **, int *, secinfo_t *, int, int);
static void srv_secinfo_remove(secinfo_t **, int *, secinfo_t *, int);
-static void srv_secinfo_treeclimb(exportinfo_t *, secinfo_t *, int, int);
+static void srv_secinfo_treeclimb(exportinfo_t *, secinfo_t *, int, bool_t);
#ifdef VOLATILE_FH_TEST
static struct ex_vol_rename *find_volrnm_fh(exportinfo_t *, nfs_fh4 *);
@@ -703,12 +703,13 @@ vis2exi(treenode_t *tnode)
* given exportinfo from its ancestors upto the system root.
*/
void
-srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt, int isadd)
+srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt,
+ bool_t isadd)
{
treenode_t *tnode = exip->exi_tree;
ASSERT(RW_WRITE_HELD(&exported_lock));
- ASSERT(tnode);
+ ASSERT(tnode != NULL);
if (seccnt == 0)
return;
@@ -716,7 +717,7 @@ srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt, int isadd)
/*
* If flavors are being added and the new export root isn't
* also VROOT, its implicitly allowed flavors are inherited from
- * from its pseudonode.
+ * its pseudonode.
* Note - for VROOT exports the implicitly allowed flavors were
* transferred from the PSEUDO export in exportfs()
*/
@@ -733,10 +734,10 @@ srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt, int isadd)
*/
tnode = tnode->tree_parent;
- while (tnode) {
+ while (tnode != NULL) {
/* If there is exportinfo, update it */
- if (tnode->tree_exi) {
+ if (tnode->tree_exi != NULL) {
secinfo_t **pxsec =
&tnode->tree_exi->exi_export.ex_secinfo;
int *pxcnt = &tnode->tree_exi->exi_export.ex_seccnt;
@@ -749,7 +750,7 @@ srv_secinfo_treeclimb(exportinfo_t *exip, secinfo_t *sec, int seccnt, int isadd)
}
/* Update every visible - only root node has no visible */
- if (tnode->tree_vis) {
+ if (tnode->tree_vis != NULL) {
secinfo_t **pxsec = &tnode->tree_vis->vis_secinfo;
int *pxcnt = &tnode->tree_vis->vis_seccnt;
if (isadd)
@@ -1517,9 +1518,12 @@ exportfs(struct exportfs_args *args, model_t model, cred_t *cr)
if (error)
goto out7;
} else {
- /* If it's a re-export update namespace tree */
+ /* If it's a re-export, update the namespace tree */
exi->exi_tree = ex->exi_tree;
exi->exi_tree->tree_exi = exi;
+
+ /* Update the change timestamp */
+ tree_update_change(exi->exi_tree, NULL);
}
/*
@@ -1670,7 +1674,7 @@ unexport(struct exportinfo *exi)
* a pseudo export here to retain the visible list
* for paths to exports below.
*/
- if (exi->exi_visible) {
+ if (exi->exi_visible != NULL) {
struct exportinfo *newexi;
newexi = pseudo_exportfs(exi->exi_vp, &exi->exi_fid,
@@ -1680,6 +1684,9 @@ unexport(struct exportinfo *exi)
/* interconnect the existing treenode with the new exportinfo */
newexi->exi_tree = exi->exi_tree;
newexi->exi_tree->tree_exi = newexi;
+
+ /* Update the change timestamp */
+ tree_update_change(exi->exi_tree, NULL);
} else {
treeclimb_unexport(exi);
}
@@ -1893,7 +1900,7 @@ nfs_getfh(struct nfs_getfh_args *args, model_t model, cred_t *cr)
*/
struct exportinfo *
nfs_vptoexi(vnode_t *dvp, vnode_t *vp, cred_t *cr, int *walk,
- int *err, bool_t v4srv)
+ int *err, bool_t v4srv)
{
fid_t fid;
int error;
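
A pattern worth noting in the hunks above: every mutation of the pseudo namespace shape (export, re-export, unexport) finishes by bumping the change value on the treenode whose directory contents changed, so NFSv4 clients revalidate their cached view. A minimal sketch of that pattern under the stated locking rule; the function name is hypothetical:

/*
 * Sketch only: the shape of a namespace mutation. The caller holds
 * exported_lock for writing, reshapes the treenode tree, then updates
 * the change value at the connect point (NULL means "use current time").
 */
static void
example_namespace_mutation(treenode_t *connect_point)
{
	ASSERT(RW_WRITE_HELD(&exported_lock));

	/* ... attach or detach treenodes below connect_point ... */

	tree_update_change(connect_point, NULL);
}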
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index c4b8d2acc6..f3f6c818a0 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -77,10 +77,10 @@
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
- * uses method 1, while the internal arc algorithms for
+ * uses method 1, while the internal ARC algorithms for
* adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
- * arc list locks.
+ * ARC list locks.
*
* Buffers do not have their own mutexes, rather they rely on the
* hash table mutexes for the bulk of their protection (i.e. most
@@ -93,21 +93,12 @@
* buf_hash_remove() expects the appropriate hash mutex to be
* already held before it is invoked.
*
- * Each arc state also has a mutex which is used to protect the
+ * Each ARC state also has a mutex which is used to protect the
* buffer list associated with the state. When attempting to
- * obtain a hash table lock while holding an arc list lock you
+ * obtain a hash table lock while holding an ARC list lock you
* must use: mutex_tryenter() to avoid deadlock. Also note that
* the active state mutex must be held before the ghost state mutex.
*
- * Arc buffers may have an associated eviction callback function.
- * This function will be invoked prior to removing the buffer (e.g.
- * in arc_do_user_evicts()). Note however that the data associated
- * with the buffer may be evicted prior to the callback. The callback
- * must be made with *no locks held* (to prevent deadlock). Additionally,
- * the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_clear_callback()
- * and arc_do_user_evicts().
- *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
@@ -136,67 +127,81 @@
* are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
* the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
- * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
* also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
- * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
- * consumer, and always contains uncompressed data. The ARC will provide
- * references to this data and will keep it cached until it is no longer in
- * use. Typically, the arc will try to cache only the L1ARC's physical data
- * block and will aggressively evict any arc_buf_t that is no longer referenced.
- * The amount of memory consumed by the arc_buf_t's can be seen via the
+ *
+ * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
+ * ability to store the physical data (b_pdata) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block,
+ * it will match its on-disk compression characteristics. This behavior can be
+ * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pdata will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
+ * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
+ * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer. The ARC will provide references to this data and will keep it
+ * cached until it is no longer in use. The ARC caches only the L1ARC's physical
+ * data block and will evict any arc_buf_t that is no longer referenced. The
+ * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
* "overhead_size" kstat.
*
+ * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
+ * compressed form. The typical case is that consumers will want uncompressed
+ * data, and when that happens a new data buffer is allocated where the data is
+ * decompressed for them to use. Currently the only consumer who wants
+ * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
+ * exists on disk. When this happens, the arc_buf_t's data buffer is shared
+ * with the arc_buf_hdr_t.
*
- * arc_buf_hdr_t
- * +-----------+
- * | |
- * | |
- * | |
- * +-----------+
- * l2arc_buf_hdr_t| |
- * | |
- * +-----------+
- * l1arc_buf_hdr_t| |
- * | | arc_buf_t
- * | b_buf +------------>+---------+ arc_buf_t
- * | | |b_next +---->+---------+
- * | b_pdata +-+ |---------| |b_next +-->NULL
- * +-----------+ | | | +---------+
- * | |b_data +-+ | |
- * | +---------+ | |b_data +-+
- * +->+------+ | +---------+ |
- * (potentially) | | | |
- * compressed | | | |
- * data +------+ | v
- * +->+------+ +------+
- * uncompressed | | | |
- * data | | | |
- * +------+ +------+
+ * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
+ * first one is owned by a compressed send consumer (and therefore references
+ * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
+ * used by any other consumer (and has its own uncompressed copy of the data
+ * buffer).
*
- * The L1ARC's data pointer, however, may or may not be uncompressed. The
- * ARC has the ability to store the physical data (b_pdata) associated with
- * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
- * physical block, it will match its on-disk compression characteristics.
- * If the block on-disk is compressed, then the physical data block
- * in the cache will also be compressed and vice-versa. This behavior
- * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
- * compressed ARC functionality is disabled, the b_pdata will point to an
- * uncompressed version of the on-disk data.
+ * arc_buf_hdr_t
+ * +-----------+
+ * | fields |
+ * | common to |
+ * | L1- and |
+ * | L2ARC |
+ * +-----------+
+ * | l2arc_buf_hdr_t
+ * | |
+ * +-----------+
+ * | l1arc_buf_hdr_t
+ * | | arc_buf_t
+ * | b_buf +------------>+-----------+ arc_buf_t
+ * | b_pdata +-+ |b_next +---->+-----------+
+ * +-----------+ | |-----------| |b_next +-->NULL
+ * | |b_comp = T | +-----------+
+ * | |b_data +-+ |b_comp = F |
+ * | +-----------+ | |b_data +-+
+ * +->+------+ | +-----------+ |
+ * compressed | | | |
+ * data | |<--------------+ | uncompressed
+ * +------+ compressed, | data
+ * shared +-->+------+
+ * data | |
+ * | |
+ * +------+
*
* When a consumer reads a block, the ARC must first look to see if the
- * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
- * then an additional arc_buf_t is allocated and the uncompressed data is
- * bcopied from the existing arc_buf_t. If the hdr is cached but does not
- * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
- * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
- * b_pdata is not compressed, then the block is shared with the newly
- * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
- * in the arc buffer chain. Sharing the block reduces the memory overhead
- * required when the hdr is caching uncompressed blocks or the compressed
- * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
+ * arc_buf_t and either copies uncompressed data into a new data buffer from an
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a
+ * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the
+ * hdr is compressed and the desired compression characteristics of the
+ * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
+ * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
+ * the last buffer in the hdr's b_buf list, however a shared compressed buf can
+ * be anywhere in the hdr's list.
*
* The diagram below shows an example of an uncompressed ARC hdr that is
- * sharing its data with an arc_buf_t:
+ * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
+ * the last element in the buf list):
*
* arc_buf_hdr_t
* +-----------+
@@ -225,20 +230,24 @@
* | +------+ |
* +---------------------------------+
*
- * Writing to the arc requires that the ARC first discard the b_pdata
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pdata
* since the physical block is about to be rewritten. The new data contents
- * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
- * performs the write, it may compress the data before writing it to disk.
- * The ARC will be called with the transformed data and will bcopy the
- * transformed on-disk block into a newly allocated b_pdata.
+ * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
+ * it may compress the data before writing it to disk. The ARC will be called
+ * with the transformed data and will bcopy the transformed on-disk block into
+ * a newly allocated b_pdata. Writes are always done into buffers which have
+ * either been loaned (and hence are new and don't have other readers) or
+ * buffers which have been released (and hence have their own hdr, if there
+ * were originally other readers of the buf's original hdr). This ensures that
+ * the ARC only needs to update a single buf and its hdr after a write occurs.
*
* When the L2ARC is in use, it will also take advantage of the b_pdata. The
* L2ARC will always write the contents of b_pdata to the L2ARC. This means
- * that when compressed arc is enabled that the L2ARC blocks are identical
+ * that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the
* L2ARC to determine if the contents are valid. However, if the compressed
- * arc is disabled, then the L2ARC's block must be transformed to look
+ * ARC is disabled, then the L2ARC's block must be transformed to look
* like the physical block in the main data pool before comparing the
* checksum and determining its validity.
*/
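
To make the comment above concrete, here is a rough sketch of the two allocation paths a consumer can take under compressed ARC, using the arc_alloc_buf()/arc_alloc_compressed_buf() interfaces introduced later in this diff. It assumes bp points at a compressed block; the function name is hypothetical and not part of the patch:

/*
 * Sketch only: an ordinary consumer gets an uncompressed buf of the
 * logical size, while a "zfs send"-style consumer asks for a buf that
 * matches the on-disk physical size and compression (and can therefore
 * share b_data with the hdr's b_pdata).
 */
static void
arc_alloc_example(spa_t *spa, void *tag, const blkptr_t *bp)
{
	arc_buf_t *ubuf = arc_alloc_buf(spa, tag, ARC_BUFC_DATA,
	    BP_GET_LSIZE(bp));
	ASSERT3U(arc_buf_size(ubuf), ==, arc_buf_lsize(ubuf));

	/* Assumes BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF. */
	arc_buf_t *cbuf = arc_alloc_compressed_buf(spa, tag,
	    BP_GET_PSIZE(bp), BP_GET_LSIZE(bp), BP_GET_COMPRESS(bp));
	ASSERT3U(arc_buf_size(cbuf), ==, BP_GET_PSIZE(bp));

	arc_buf_destroy(ubuf, tag);
	arc_buf_destroy(cbuf, tag);
}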
@@ -805,6 +814,7 @@ struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
arc_buf_t *acb_buf;
+ boolean_t acb_compressed;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
};
@@ -856,7 +866,7 @@ typedef struct l1arc_buf_hdr {
zio_cksum_t *b_freeze_cksum;
#ifdef ZFS_DEBUG
/*
- * used for debugging wtih kmem_flags - by allocating and freeing
+ * Used for debugging with kmem_flags - by allocating and freeing
* b_thawed when the buffer is thawed, we get a record of the stack
* trace that thawed it.
*/
@@ -971,6 +981,8 @@ struct arc_buf_hdr {
HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
+#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
/*
* Other sizes
@@ -1065,7 +1077,7 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
- arc_buf_hdr_t *l2rcb_hdr; /* read buffer */
+ arc_buf_hdr_t *l2rcb_hdr; /* read header */
blkptr_t l2rcb_bp; /* original blkptr */
zbookmark_phys_t l2rcb_zb; /* original bookmark */
int l2rcb_flags; /* original flags */
@@ -1400,6 +1412,31 @@ retry:
}
}
+/*
+ * This is the size that the buf occupies in memory. If the buf is compressed,
+ * it will correspond to the compressed size. You should use this method of
+ * getting the buf size unless you explicitly need the logical size.
+ */
+int32_t
+arc_buf_size(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
+}
+
+int32_t
+arc_buf_lsize(arc_buf_t *buf)
+{
+ return (HDR_GET_LSIZE(buf->b_hdr));
+}
+
+enum zio_compress
+arc_get_compression(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
+}
+
#define ARC_MINTIME (hz>>4) /* 62 ms */
static inline boolean_t
@@ -1408,9 +1445,21 @@ arc_buf_is_shared(arc_buf_t *buf)
boolean_t shared = (buf->b_data != NULL &&
buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ IMPLY(shared, ARC_BUF_SHARED(buf));
+ IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
+
+ /*
+ * It would be nice to assert arc_can_share() too, but the "hdr isn't
+ * already being shared" requirement prevents us from doing that.
+ */
+
return (shared);
}
+/*
+ * Free the checksum associated with this header. If there is no checksum, this
+ * is a no-op.
+ */
static inline void
arc_cksum_free(arc_buf_hdr_t *hdr)
{
@@ -1423,6 +1472,25 @@ arc_cksum_free(arc_buf_hdr_t *hdr)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
+/*
+ * Return true iff at least one of the bufs on hdr is not compressed.
+ */
+static boolean_t
+arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
+{
+ for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
+ if (!ARC_BUF_COMPRESSED(b)) {
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
+ * matches the checksum that is stored in the hdr. If there is no checksum,
+ * or if the buf is compressed, this is a no-op.
+ */
static void
arc_cksum_verify(arc_buf_t *buf)
{
@@ -1432,6 +1500,12 @@ arc_cksum_verify(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
@@ -1439,7 +1513,8 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
- fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
panic("buffer modified while frozen!");
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
@@ -1513,6 +1588,12 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
return (valid_cksum);
}
+/*
+ * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
+ * checksum and attaches it to the buf's hdr so that we can ensure that the buf
+ * isn't modified later on. If buf is compressed or there is already a checksum
+ * on the hdr, this is a no-op (we only checksum uncompressed bufs).
+ */
static void
arc_cksum_compute(arc_buf_t *buf)
{
@@ -1522,14 +1603,21 @@ arc_cksum_compute(arc_buf_t *buf)
return;
ASSERT(HDR_HAS_L1HDR(hdr));
+
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ ASSERT(arc_hdr_has_uncompressed_buf(hdr));
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ } else if (ARC_BUF_COMPRESSED(buf)) {
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
+
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
KM_SLEEP);
- fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
hdr->b_l1hdr.b_freeze_cksum);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
arc_buf_watch(buf);
@@ -1570,7 +1658,7 @@ arc_buf_watch(arc_buf_t *buf)
procctl_t ctl;
ctl.cmd = PCWATCH;
ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
+ ctl.prwatch.pr_size = arc_buf_size(buf);
ctl.prwatch.pr_wflags = WA_WRITE;
result = write(arc_procfd, &ctl, sizeof (ctl));
ASSERT3U(result, ==, sizeof (ctl));
@@ -1591,6 +1679,12 @@ arc_buf_type(arc_buf_hdr_t *hdr)
return (type);
}
+boolean_t
+arc_is_metadata(arc_buf_t *buf)
+{
+ return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
+}
+
static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)
{
@@ -1612,12 +1706,19 @@ arc_buf_thaw(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (hdr->b_l1hdr.b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (HDR_IO_IN_PROGRESS(hdr))
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ arc_cksum_verify(buf);
+
+ /*
+ * Compressed buffers do not manipulate the b_freeze_cksum or
+ * allocate b_thawed.
+ */
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
}
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1646,6 +1747,12 @@ arc_buf_freeze(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
@@ -1654,7 +1761,6 @@ arc_buf_freeze(arc_buf_t *buf)
hdr->b_l1hdr.b_state == arc_anon);
arc_cksum_compute(buf);
mutex_exit(hash_lock);
-
}
/*
@@ -1711,47 +1817,157 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
}
}
+/*
+ * Looks for another buf on the same hdr which has the data decompressed, copies
+ * from it, and returns true. If no such buf exists, returns false.
+ */
+static boolean_t
+arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t copied = B_FALSE;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+
+ for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
+ from = from->b_next) {
+ /* can't use our own data buffer */
+ if (from == buf) {
+ continue;
+ }
+
+ if (!ARC_BUF_COMPRESSED(from)) {
+ bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
+ copied = B_TRUE;
+ break;
+ }
+ }
+
+ /*
+ * There were no decompressed bufs, so there should not be a
+ * checksum on the hdr either.
+ */
+ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+
+ return (copied);
+}
+
+/*
+ * Given a buf that has a data buffer attached to it, this function will
+ * efficiently fill the buf with data of the specified compression setting from
+ * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
+ * are already sharing a data buf, no copy is performed.
+ *
+ * If the buf is marked as compressed but uncompressed data was requested, this
+ * will allocate a new data buffer for the buf, remove that flag, and fill the
+ * buf with uncompressed data. You can't request a compressed buf on a hdr with
+ * uncompressed data, and (since we haven't added support for it yet) if you
+ * want compressed data your buf must already be marked as compressed and have
+ * the correct-sized data buffer.
+ */
static int
-arc_decompress(arc_buf_t *buf)
+arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
- int error;
- if (arc_buf_is_shared(buf)) {
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
- /*
- * The arc_buf_hdr_t is either not compressed or is
- * associated with an embedded block or a hole in which
- * case they remain anonymous.
- */
- IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 ||
- HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
- ASSERT(!HDR_SHARED_DATA(hdr));
- bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ IMPLY(compressed, hdr_compressed);
+ IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
+
+ if (hdr_compressed == compressed) {
+ if (!arc_buf_is_shared(buf)) {
+ bcopy(hdr->b_l1hdr.b_pdata, buf->b_data,
+ arc_buf_size(buf));
+ }
} else {
- ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(hdr_compressed);
+ ASSERT(!compressed);
ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
- error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
- hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
- HDR_GET_LSIZE(hdr));
- if (error != 0) {
- zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
- hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
- HDR_GET_LSIZE(hdr));
- return (SET_ERROR(EIO));
+
+ /*
+ * If the buf is sharing its data with the hdr, unlink it and
+ * allocate a new data buffer for the buf.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+
+ /* We need to give the buf its own b_data */
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+
+ /* Previously overhead was 0; just add new overhead */
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ /* We need to reallocate the buf's b_data */
+ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
+ buf);
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+
+ /* We increased the size of b_data; update overhead */
+ ARCSTAT_INCR(arcstat_overhead_size,
+ HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * Regardless of the buf's previous compression settings, it
+ * should not be compressed at the end of this function.
+ */
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * Try copying the data from another buf which already has a
+ * decompressed version. If that's not possible, it's time to
+ * bite the bullet and decompress the data from the hdr.
+ */
+ if (arc_buf_try_copy_decompressed_data(buf)) {
+ /* Skip byteswapping and checksumming (already done) */
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
+ return (0);
+ } else {
+ int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pdata, buf->b_data,
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+
+ /*
+ * Absent hardware errors or software bugs, this should
+ * be impossible, but log it anyway so we can debug it.
+ */
+ if (error != 0) {
+ zfs_dbgmsg(
+ "hdr %p, compress %d, psize %d, lsize %d",
+ hdr, HDR_GET_COMPRESS(hdr),
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+ return (SET_ERROR(EIO));
+ }
}
}
+
+ /* Byteswap the buf's data if necessary */
if (bswap != DMU_BSWAP_NUMFUNCS) {
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
}
+
+ /* Compute the hdr's checksum if necessary */
arc_cksum_compute(buf);
+
return (0);
}
+int
+arc_decompress(arc_buf_t *buf)
+{
+ return (arc_buf_fill(buf, B_FALSE));
+}
+
/*
* Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
*/
@@ -1779,7 +1995,6 @@ static void
arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1787,7 +2002,8 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
- (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr);
+ (void) refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
return;
}
@@ -1798,11 +2014,10 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
}
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
- (void) refcount_add_many(&state->arcs_esize[type], lsize, buf);
+ (void) refcount_add_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
}
}
@@ -1812,10 +2027,9 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
* so that we can add and remove them from the refcount individually.
*/
static void
-arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1824,7 +2038,7 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
(void) refcount_remove_many(&state->arcs_esize[type],
- lsize, hdr);
+ HDR_GET_LSIZE(hdr), hdr);
return;
}
@@ -1835,12 +2049,10 @@ arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
}
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_remove_many(&state->arcs_esize[type],
- lsize, buf);
+ arc_buf_size(buf), buf);
}
}
@@ -1868,7 +2080,7 @@ add_reference(arc_buf_hdr_t *hdr, void *tag)
if (state != arc_l2c_only) {
multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
hdr);
- arc_evitable_space_decrement(hdr, state);
+ arc_evictable_space_decrement(hdr, state);
}
/* remove the prefetch flag if we get a reference */
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
@@ -1956,7 +2168,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
update_old = B_TRUE;
}
- arc_evitable_space_decrement(hdr, old_state);
+ arc_evictable_space_decrement(hdr, old_state);
}
if (new_state != arc_anon && new_state != arc_l2c_only) {
@@ -2019,13 +2231,11 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* add to the refcount if the arc_buf_t is
* not shared.
*/
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_add_many(&new_state->arcs_size,
- HDR_GET_LSIZE(hdr), buf);
+ arc_buf_size(buf), buf);
}
ASSERT3U(bufcnt, ==, buffers);
@@ -2042,6 +2252,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
/*
* When moving a header off of a ghost state,
@@ -2053,7 +2264,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
(void) refcount_remove_many(&old_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
} else {
uint32_t buffers = 0;
@@ -2064,7 +2274,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- ASSERT3P(bufcnt, !=, 0);
+ ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
@@ -2074,13 +2284,11 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* add to the refcount if the arc_buf_t is
* not shared.
*/
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_remove_many(
- &old_state->arcs_size, HDR_GET_LSIZE(hdr),
+ &old_state->arcs_size, arc_buf_size(buf),
buf);
}
ASSERT3U(bufcnt, ==, buffers);
@@ -2165,11 +2373,50 @@ arc_space_return(uint64_t space, arc_space_type_t type)
}
/*
- * Allocate an initial buffer for this hdr, subsequent buffers will
- * use arc_buf_clone().
+ * Given a hdr and a buf, returns whether that buf can share its b_data buffer
+ * with the hdr's b_pdata.
*/
-static arc_buf_t *
-arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
+static boolean_t
+arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ /*
+ * The criteria for sharing a hdr's data are:
+ * 1. the hdr's compression matches the buf's compression
+ * 2. the hdr doesn't need to be byteswapped
+ * 3. the hdr isn't already being shared
+ * 4. the buf is either compressed or it is the last buf in the hdr list
+ *
+ * Criterion #4 maintains the invariant that shared uncompressed
+ * bufs must be the final buf in the hdr's b_buf list. Reading this, you
+ * might ask, "if a compressed buf is allocated first, won't that be the
+ * last thing in the list?", but in that case it's impossible to create
+ * a shared uncompressed buf anyway (because the hdr must be compressed
+ * to have the compressed buf). You might also think that #3 is
+ * sufficient to make this guarantee, however it's possible
+ * (specifically in the rare L2ARC write race mentioned in
+ * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
+ * is sharable, but wasn't at the time of its allocation. Rather than
+ * allow a new shared uncompressed buf to be created and then shuffle
+ * the list around to make it the last element, this simply disallows
+ * sharing if the new buf isn't the first to be added.
+ */
+ ASSERT3P(buf->b_hdr, ==, hdr);
+ boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
+ boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
+ return (buf_compressed == hdr_compressed &&
+ hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+ !HDR_SHARED_DATA(hdr) &&
+ (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
+}
+
+/*
+ * Allocate a buf for this hdr. If you care about the data that's in the hdr,
+ * or if you want a compressed buffer, pass those flags in. Returns 0 if the
+ * copy was made successfully, or an error code otherwise.
+ */
+static int
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
+ boolean_t fill, arc_buf_t **ret)
{
arc_buf_t *buf;
@@ -2177,15 +2424,14 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
VERIFY(hdr->b_type == ARC_BUFC_DATA ||
hdr->b_type == ARC_BUFC_METADATA);
+ ASSERT3P(ret, !=, NULL);
+ ASSERT3P(*ret, ==, NULL);
- ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
-
- buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
- buf->b_next = NULL;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ buf->b_flags = 0;
add_reference(hdr, tag);
@@ -2196,58 +2442,63 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
- * If the hdr's data can be shared (no byteswapping, hdr is
- * uncompressed, hdr's data is not currently being written to the
- * L2ARC write) then we share the data buffer and set the appropriate
- * bit in the hdr's b_flags to indicate the hdr is sharing it's
- * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to
- * store the buf's data.
+ * Only honor requests for compressed bufs if the hdr is actually
+ * compressed.
*/
- if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
- HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) {
+ if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * If the hdr's data can be shared then we share the data buffer and
+ * set the appropriate bit in the hdr's b_flags to indicate the hdr is
+ * sharing its b_pdata with the arc_buf_t. Otherwise, we allocate a new
+ * buffer to store the buf's data.
+ *
+ * There is one additional restriction here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively
+ * involved in an L2ARC write, because if this buf is used by an
+ * arc_write() then the hdr's data buffer will be released when the
+ * write completes, even though the L2ARC write might still be using it.
+ */
+ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr);
+
+ /* Set up b_data and sharing */
+ if (can_share) {
buf->b_data = hdr->b_l1hdr.b_pdata;
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
- buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_data =
+ arc_get_data_buf(hdr, arc_buf_size(buf), buf);
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
VERIFY3P(buf->b_data, !=, NULL);
hdr->b_l1hdr.b_buf = buf;
hdr->b_l1hdr.b_bufcnt += 1;
- return (buf);
-}
+ /*
+ * If the user wants the data from the hdr, we need to either copy or
+ * decompress the data.
+ */
+ if (fill) {
+ return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
+ }
-/*
- * Used when allocating additional buffers.
- */
-static arc_buf_t *
-arc_buf_clone(arc_buf_t *from)
-{
- arc_buf_t *buf;
- arc_buf_hdr_t *hdr = from->b_hdr;
- uint64_t size = HDR_GET_LSIZE(hdr);
+ return (0);
+}
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(hdr->b_l1hdr.b_state != arc_anon);
+static char *arc_onloan_tag = "onloan";
- buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_next = hdr->b_l1hdr.b_buf;
- hdr->b_l1hdr.b_buf = buf;
- buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
- bcopy(from->b_data, buf->b_data, size);
- hdr->b_l1hdr.b_bufcnt += 1;
+static inline void
+arc_loaned_bytes_update(int64_t delta)
+{
+ atomic_add_64(&arc_loaned_bytes, delta);
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
- return (buf);
+ /* assert that it did not wrap around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
-static char *arc_onloan_tag = "onloan";
-
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
@@ -2255,16 +2506,29 @@ static char *arc_onloan_tag = "onloan";
* freed.
*/
arc_buf_t *
-arc_loan_buf(spa_t *spa, int size)
+arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
{
- arc_buf_t *buf;
+ arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
+ is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
+
+ arc_loaned_bytes_update(size);
+
+ return (buf);
+}
- buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+arc_buf_t *
+arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
+ psize, lsize, compression_type);
+
+ arc_loaned_bytes_update(psize);
- atomic_add_64(&arc_loaned_bytes, size);
return (buf);
}
+
/*
* Return a loaned arc buffer to the arc.
*/
@@ -2278,7 +2542,7 @@ arc_return_buf(arc_buf_t *buf, void *tag)
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
- atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr));
+ arc_loaned_bytes_update(-arc_buf_size(buf));
}
/* Detach an arc_buf from a dbuf (tag) */
@@ -2292,7 +2556,7 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
- atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr));
+ arc_loaned_bytes_update(arc_buf_size(buf));
}
static void
@@ -2338,8 +2602,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
- ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT(!arc_buf_is_shared(buf));
+ ASSERT(arc_can_share(hdr, buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
@@ -2351,6 +2614,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
refcount_transfer_ownership(&state->arcs_size, buf, hdr);
hdr->b_l1hdr.b_pdata = buf->b_data;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
/*
* Since we've transferred ownership to the hdr we need
@@ -2359,7 +2623,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/
ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
}
static void
@@ -2367,7 +2631,6 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
- ASSERT(HDR_SHARED_DATA(hdr));
ASSERT(arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
@@ -2379,6 +2642,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
refcount_transfer_ownership(&state->arcs_size, hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
hdr->b_l1hdr.b_pdata = NULL;
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
/*
* Since the buffer is no longer shared between
@@ -2386,26 +2650,63 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/
ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
/*
- * Free up buf->b_data and if 'remove' is set, then pull the
- * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
+ * Remove an arc_buf_t from the hdr's buf list and return the last
+ * arc_buf_t on the list. If no buffers remain on the list then return
+ * NULL.
+ */
+static arc_buf_t *
+arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
+ arc_buf_t *lastbuf = NULL;
+
+ /*
+ * Remove the buf from the hdr list and locate the last
+ * remaining buffer on the list.
+ */
+ while (*bufp != NULL) {
+ if (*bufp == buf)
+ *bufp = buf->b_next;
+
+ /*
+ * If we've removed a buffer in the middle of
+ * the list then update the lastbuf and update
+ * bufp.
+ */
+ if (*bufp != NULL) {
+ lastbuf = *bufp;
+ bufp = &(*bufp)->b_next;
+ }
+ }
+ buf->b_next = NULL;
+ ASSERT3P(lastbuf, !=, buf);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
+ IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
+
+ return (lastbuf);
+}
+
+/*
+ * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
+ * list and free it.
*/
static void
-arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
+arc_buf_destroy_impl(arc_buf_t *buf)
{
- arc_buf_t **bufp;
arc_buf_hdr_t *hdr = buf->b_hdr;
- uint64_t size = HDR_GET_LSIZE(hdr);
- boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf);
/*
- * Free up the data associated with the buf but only
- * if we're not sharing this with the hdr. If we are sharing
- * it with the hdr, then hdr will have performed the allocation
- * so allow it to do the free.
+ * Free up the data associated with the buf but only if we're not
+ * sharing this with the hdr. If we are sharing it with the hdr, the
+ * hdr is responsible for doing the free.
*/
if (buf->b_data != NULL) {
/*
@@ -2417,11 +2718,10 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
- if (destroyed_buf_is_shared) {
- ASSERT(ARC_BUF_LAST(buf));
- ASSERT(HDR_SHARED_DATA(hdr));
+ if (arc_buf_is_shared(buf)) {
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
+ uint64_t size = arc_buf_size(buf);
arc_free_data_buf(hdr, buf->b_data, size, buf);
ARCSTAT_INCR(arcstat_overhead_size, -size);
}
@@ -2431,58 +2731,58 @@ arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
hdr->b_l1hdr.b_bufcnt -= 1;
}
- /* only remove the buf if requested */
- if (!remove)
- return;
-
- /* remove the buf from the hdr list */
- arc_buf_t *lastbuf = NULL;
- bufp = &hdr->b_l1hdr.b_buf;
- while (*bufp != NULL) {
- if (*bufp == buf)
- *bufp = buf->b_next;
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+ if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
/*
- * If we've removed a buffer in the middle of
- * the list then update the lastbuf and update
- * bufp.
+ * If the current arc_buf_t is sharing its data buffer with the
+ * hdr, then reassign the hdr's b_pdata to share it with the new
+ * buffer at the end of the list. The shared buffer is always
+ * the last one on the hdr's buffer list.
+ *
+ * There is an equivalent case for compressed bufs, but since
+ * they aren't guaranteed to be the last buf in the list and
+ * that is an exceedingly rare case, we just allow that space to be
+ * wasted temporarily.
*/
- if (*bufp != NULL) {
- lastbuf = *bufp;
- bufp = &(*bufp)->b_next;
- }
- }
- buf->b_next = NULL;
- ASSERT3P(lastbuf, !=, buf);
-
- /*
- * If the current arc_buf_t is sharing its data
- * buffer with the hdr, then reassign the hdr's
- * b_pdata to share it with the new buffer at the end
- * of the list. The shared buffer is always the last one
- * on the hdr's buffer list.
- */
- if (destroyed_buf_is_shared && lastbuf != NULL) {
- ASSERT(ARC_BUF_LAST(buf));
- ASSERT(ARC_BUF_LAST(lastbuf));
- VERIFY(!arc_buf_is_shared(lastbuf));
+ if (lastbuf != NULL) {
+ /* Only one buf can be shared at once */
+ VERIFY(!arc_buf_is_shared(lastbuf));
+ /* hdr is uncompressed so can't have compressed buf */
+ VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
- arc_hdr_free_pdata(hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ arc_hdr_free_pdata(hdr);
+ /*
+ * We must setup a new shared block between the
+ * last buffer and the hdr. The data would have
+ * been allocated by the arc buf so we need to transfer
+ * ownership to the hdr since it's now being shared.
+ */
+ arc_share_buf(hdr, lastbuf);
+ }
+ } else if (HDR_SHARED_DATA(hdr)) {
/*
- * We must setup a new shared block between the
- * last buffer and the hdr. The data would have
- * been allocated by the arc buf so we need to transfer
- * ownership to the hdr since it's now being shared.
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
*/
- arc_share_buf(hdr, lastbuf);
- } else if (HDR_SHARED_DATA(hdr)) {
- ASSERT(arc_buf_is_shared(lastbuf));
+ ASSERT3P(lastbuf, !=, NULL);
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
}
- if (hdr->b_l1hdr.b_bufcnt == 0)
+ /*
+ * Free the checksum if we're removing the last uncompressed buf from
+ * this hdr.
+ */
+ if (!arc_hdr_has_uncompressed_buf(hdr)) {
arc_cksum_free(hdr);
+ }
/* clean up the buf */
buf->b_hdr = NULL;
@@ -2533,11 +2833,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
- enum zio_compress compress, arc_buf_contents_t type)
+ enum zio_compress compression_type, arc_buf_contents_t type)
{
arc_buf_hdr_t *hdr;
- ASSERT3U(lsize, >, 0);
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
@@ -2550,7 +2849,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
hdr->b_type = type;
hdr->b_flags = 0;
arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
- arc_hdr_set_compress(hdr, compress);
+ arc_hdr_set_compress(hdr, compression_type);
hdr->b_l1hdr.b_state = arc_anon;
hdr->b_l1hdr.b_arc_access = 0;
@@ -2679,13 +2978,41 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
* The buf is returned thawed since we expect the consumer to modify it.
*/
arc_buf_t *
-arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
+arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
ZIO_COMPRESS_OFF, type);
ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
- arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag);
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
arc_buf_thaw(buf);
+
+ return (buf);
+}
+
+/*
+ * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
+ * for bufs containing metadata.
+ */
+arc_buf_t *
+arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ ASSERT3U(lsize, >, 0);
+ ASSERT3U(lsize, >=, psize);
+ ASSERT(compression_type > ZIO_COMPRESS_OFF);
+ ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
+
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ compression_type, ARC_BUFC_DATA);
+ ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
return (buf);
}
@@ -2752,7 +3079,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
arc_cksum_free(hdr);
while (hdr->b_l1hdr.b_buf != NULL)
- arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE);
+ arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
#ifdef ZFS_DEBUG
if (hdr->b_l1hdr.b_thawed != NULL) {
@@ -2798,16 +3125,10 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
ASSERT3P(buf->b_data, !=, NULL);
(void) remove_reference(hdr, hash_lock, tag);
- arc_buf_destroy_impl(buf, B_TRUE);
+ arc_buf_destroy_impl(buf);
mutex_exit(hash_lock);
}
-int32_t
-arc_buf_size(arc_buf_t *buf)
-{
- return (HDR_GET_LSIZE(buf->b_hdr));
-}
-
/*
* Evict the arc_buf_hdr that is provided as a parameter. The resultant
* state of the header is dependent on it's state prior to entering this
@@ -2853,7 +3174,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
if (HDR_HAS_L2HDR(hdr)) {
- ASSERT(hdr->b_l1hdr.b_pdata == NULL);
/*
* This buffer is cached on the 2nd Level ARC;
* don't destroy the header.
@@ -2866,7 +3186,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
hdr = arc_hdr_realloc(hdr, hdr_full_cache,
hdr_l2only_cache);
} else {
- ASSERT(hdr->b_l1hdr.b_pdata == NULL);
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
}
@@ -2895,7 +3214,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
if (buf->b_data != NULL)
bytes_evicted += HDR_GET_LSIZE(hdr);
mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy_impl(buf, B_TRUE);
+ arc_buf_destroy_impl(buf);
}
if (HDR_HAS_L2HDR(hdr)) {
@@ -3244,7 +3563,7 @@ arc_adjust_meta(void)
/*
* Similar to the above, we want to evict enough bytes to get us
* below the meta limit, but not so much as to drop us below the
- * space alloted to the MFU (which is defined as arc_c - arc_p).
+ * space allotted to the MFU (which is defined as arc_c - arc_p).
*/
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
(int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
@@ -4197,7 +4516,7 @@ void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
if (zio == NULL || zio->io_error == 0)
- bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr));
+ bcopy(buf->b_data, arg, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
@@ -4235,10 +4554,11 @@ static void
arc_read_done(zio_t *zio)
{
arc_buf_hdr_t *hdr = zio->io_private;
- arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */
kmutex_t *hash_lock = NULL;
- arc_callback_t *callback_list, *acb;
- int freeable = B_FALSE;
+ arc_callback_t *callback_list;
+ arc_callback_t *acb;
+ boolean_t freeable = B_FALSE;
+ boolean_t no_zio_error = (zio->io_error == 0);
/*
* The hdr was inserted into hash-table and removed from lists
@@ -4264,7 +4584,7 @@ arc_read_done(zio_t *zio)
ASSERT3P(hash_lock, !=, NULL);
}
- if (zio->io_error == 0) {
+ if (no_zio_error) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -4285,8 +4605,7 @@ arc_read_done(zio_t *zio)
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
- if (hash_lock && zio->io_error == 0 &&
- hdr->b_l1hdr.b_state == arc_anon) {
+ if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -4296,39 +4615,29 @@ arc_read_done(zio_t *zio)
arc_access(hdr, hash_lock);
}
- /* create copies of the data buffer for the callers */
- for (acb = callback_list; acb; acb = acb->acb_next) {
- if (acb->acb_done != NULL) {
- /*
- * If we're here, then this must be a demand read
- * since prefetch requests don't have callbacks.
- * If a read request has a callback (i.e. acb_done is
- * not NULL), then we decompress the data for the
- * first request and clone the rest. This avoids
- * having to waste cpu resources decompressing data
- * that nobody is explicitly waiting to read.
- */
- if (abuf == NULL) {
- acb->acb_buf = arc_buf_alloc_impl(hdr,
- acb->acb_private);
- if (zio->io_error == 0) {
- zio->io_error =
- arc_decompress(acb->acb_buf);
- }
- abuf = acb->acb_buf;
- } else {
- add_reference(hdr, acb->acb_private);
- acb->acb_buf = arc_buf_clone(abuf);
- }
+ /*
+ * If a read request has a callback (i.e. acb_done is not NULL), then we
+ * make a buf containing the data according to the parameters which were
+ * passed in. The implementation of arc_buf_alloc_impl() ensures that we
+ * aren't needlessly decompressing the data multiple times.
+ */
+ int callback_cnt = 0;
+ for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
+ if (!acb->acb_done)
+ continue;
+
+ /* This is a demand read since prefetches don't use callbacks */
+ callback_cnt++;
+
+ int error = arc_buf_alloc_impl(hdr, acb->acb_private,
+ acb->acb_compressed, no_zio_error, &acb->acb_buf);
+ if (no_zio_error) {
+ zio->io_error = error;
}
}
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- if (abuf == NULL) {
- /*
- * This buffer didn't have a callback so it must
- * be a prefetch.
- */
+ if (callback_cnt == 0) {
ASSERT(HDR_PREFETCH(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
@@ -4337,7 +4646,7 @@ arc_read_done(zio_t *zio)
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
callback_list != NULL);
- if (zio->io_error == 0) {
+ if (no_zio_error) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -4413,6 +4722,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
kmutex_t *hash_lock = NULL;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
+ boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
ASSERT(!BP_IS_EMBEDDED(bp) ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
@@ -4477,6 +4787,7 @@ top:
KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio,
spa, NULL, NULL, NULL, zio_flags);
@@ -4511,23 +4822,9 @@ top:
}
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
- /*
- * If this block is already in use, create a new
- * copy of the data so that we will be guaranteed
- * that arc_release() will always succeed.
- */
- buf = hdr->b_l1hdr.b_buf;
- if (buf == NULL) {
- ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
- buf = arc_buf_alloc_impl(hdr, private);
- VERIFY0(arc_decompress(buf));
- } else {
- add_reference(hdr, private);
- buf = arc_buf_clone(buf);
- }
- ASSERT3P(buf->b_data, !=, NULL);
-
+ /* Get a buf with the desired data in it. */
+ VERIFY0(arc_buf_alloc_impl(hdr, private,
+ compressed_read, B_TRUE, &buf));
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
@@ -4587,6 +4884,7 @@ top:
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
/*
* This is a delicate dance that we play here.
@@ -4627,6 +4925,7 @@ top:
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
hdr->b_l1hdr.b_acb = acb;
@@ -4873,7 +5172,7 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT3P(state, !=, arc_anon);
/* this buffer is not on any list */
- ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
+ ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
if (HDR_HAS_L2HDR(hdr)) {
mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
@@ -4897,7 +5196,6 @@ arc_release(arc_buf_t *buf, void *tag)
*/
if (hdr->b_l1hdr.b_bufcnt > 1) {
arc_buf_hdr_t *nhdr;
- arc_buf_t **bufp;
uint64_t spa = hdr->b_spa;
uint64_t psize = HDR_GET_PSIZE(hdr);
uint64_t lsize = HDR_GET_LSIZE(hdr);
@@ -4908,8 +5206,7 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
(void) remove_reference(hdr, hash_lock, tag);
- if (arc_buf_is_shared(buf)) {
- ASSERT(HDR_SHARED_DATA(hdr));
+ if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
ASSERT(ARC_BUF_LAST(buf));
}
@@ -4919,60 +5216,58 @@ arc_release(arc_buf_t *buf, void *tag)
* a new anonymous hdr. Also find the last buffer
* in the hdr's buffer list.
*/
- arc_buf_t *lastbuf = NULL;
- bufp = &hdr->b_l1hdr.b_buf;
- while (*bufp != NULL) {
- if (*bufp == buf) {
- *bufp = buf->b_next;
- }
-
- /*
- * If we've removed a buffer in the middle of
- * the list then update the lastbuf and update
- * bufp.
- */
- if (*bufp != NULL) {
- lastbuf = *bufp;
- bufp = &(*bufp)->b_next;
- }
- }
- buf->b_next = NULL;
- ASSERT3P(lastbuf, !=, buf);
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
ASSERT3P(lastbuf, !=, NULL);
/*
* If the current arc_buf_t and the hdr are sharing their data
- * buffer, then we must stop sharing that block, transfer
- * ownership and setup sharing with a new arc_buf_t at the end
- * of the hdr's b_buf list.
+ * buffer, then we must stop sharing that block.
*/
if (arc_buf_is_shared(buf)) {
- ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
- ASSERT(ARC_BUF_LAST(lastbuf));
VERIFY(!arc_buf_is_shared(lastbuf));
/*
* First, sever the block sharing relationship between
- * buf and the arc_buf_hdr_t. Then, setup a new
- * block sharing relationship with the last buffer
- * on the arc_buf_t list.
+ * buf and the arc_buf_hdr_t.
*/
arc_unshare_buf(hdr, buf);
- arc_share_buf(hdr, lastbuf);
+
+ /*
+ * Now we need to recreate the hdr's b_pdata. Since we
+ * have lastbuf handy, we try to share with it, but if
+ * we can't then we allocate a new b_pdata and copy the
+ * data from buf into it.
+ */
+ if (arc_can_share(hdr, lastbuf)) {
+ arc_share_buf(hdr, lastbuf);
+ } else {
+ arc_hdr_alloc_pdata(hdr);
+ bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize);
+ }
VERIFY3P(lastbuf->b_data, !=, NULL);
} else if (HDR_SHARED_DATA(hdr)) {
- ASSERT(arc_buf_is_shared(lastbuf));
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+			 * simply assert that the lastbuf is shared, so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
+ ASSERT(!ARC_BUF_SHARED(buf));
}
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_size,
- HDR_GET_LSIZE(hdr), buf);
+ arc_buf_size(buf), buf);
if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_esize[type],
- HDR_GET_LSIZE(hdr), buf);
+ arc_buf_size(buf), buf);
}
hdr->b_l1hdr.b_bufcnt -= 1;
@@ -4999,7 +5294,7 @@ arc_release(arc_buf_t *buf, void *tag)
mutex_exit(&buf->b_evict_lock);
(void) refcount_add_many(&arc_anon->arcs_size,
- HDR_GET_LSIZE(nhdr), buf);
+ arc_buf_size(buf), buf);
} else {
mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
@@ -5055,15 +5350,13 @@ arc_write_ready(zio_t *zio)
/*
* If we're reexecuting this zio because the pool suspended, then
* cleanup any state that was previously set the first time the
- * callback as invoked.
+ * callback was invoked.
*/
if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
arc_cksum_free(hdr);
arc_buf_unwatch(buf);
if (hdr->b_l1hdr.b_pdata != NULL) {
if (arc_buf_is_shared(buf)) {
- ASSERT(HDR_SHARED_DATA(hdr));
-
arc_unshare_buf(hdr, buf);
} else {
arc_hdr_free_pdata(hdr);
@@ -5100,26 +5393,23 @@ arc_write_ready(zio_t *zio)
* arc thus the on-disk block may or may not match what we maintain
* in the hdr's b_pdata field.
*/
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
- ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF);
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF);
ASSERT3U(psize, >, 0);
arc_hdr_alloc_pdata(hdr);
bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
} else {
ASSERT3P(buf->b_data, ==, zio->io_orig_data);
- ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr));
- ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS);
- ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT(!arc_buf_is_shared(buf));
+ ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
/*
* This hdr is not compressed so we're able to share
* the arc_buf_t data buffer with the hdr.
*/
arc_share_buf(hdr, buf);
- VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
+ ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
HDR_GET_LSIZE(hdr)));
}
arc_hdr_verify(hdr, zio->io_bp);
@@ -5178,7 +5468,7 @@ arc_write_done(zio_t *zio)
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
- ASSERT(zio->io_error == 0);
+ ASSERT3U(zio->io_error, ==, 0);
arc_cksum_verify(buf);
@@ -5248,6 +5538,11 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
if (l2arc)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_OFF);
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
+ zio_flags |= ZIO_FLAG_RAW;
+ }
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_children_ready = children_ready;
@@ -5268,7 +5563,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
* buf will take sole ownership of the block.
*/
if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
arc_unshare_buf(hdr, buf);
} else {
arc_hdr_free_pdata(hdr);
@@ -5279,8 +5573,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
ASSERT(!arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
- zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp,
- arc_write_ready,
+ zio = zio_write(pio, spa, txg, bp, buf->b_data,
+ HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
arc_write_physdone, arc_write_done, callback,
priority, zio_flags, zb);
@@ -5352,6 +5646,10 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* network delays from blocking transactions that are ready to be
* assigned to a txg.
*/
+
+ /* assert that it has not wrapped around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+
anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
arc_loaned_bytes), 0);
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 71ae0c4434..08d1cca1d9 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -850,7 +850,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
spa_t *spa = db->db_objset->os_spa;
mutex_exit(&db->db_mtx);
- abuf = arc_loan_buf(spa, blksz);
+ abuf = arc_loan_buf(spa, B_FALSE, blksz);
bcopy(db->db.db_data, abuf->b_data, blksz);
} else {
abuf = db->db_buf;
@@ -984,8 +984,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
- db->db.db_size, db, type));
+ dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
+ db->db.db_size));
bzero(db->db.db_data, db->db.db_size);
if (db->db_blkptr != NULL && db->db_level > 0 &&
@@ -1034,6 +1034,68 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
&aflags, &zb);
}
+/*
+ * This is our just-in-time copy function. It makes a copy of buffers that
+ * have been modified in a previous transaction group before we access them in
+ * the current active group.
+ *
+ * This function is used in three places: when we are dirtying a buffer for the
+ * first time in a txg, when we are freeing a range in a dnode that includes
+ * this buffer, and when we are accessing a buffer which was received compressed
+ * and later referenced in a WRITE_BYREF record.
+ *
+ * Note that when we are called from dbuf_free_range() we do not put a hold on
+ * the buffer, we just traverse the active dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+	 * and it's referencing the dbuf data, either:
+ * reset the reference to point to a new copy,
+	 * or (if there are no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT(dr->dr_txg >= txg - 2);
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ /* Note that the data bufs here are zio_bufs */
+ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = arc_buf_size(db->db_buf);
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa = db->db_objset->os_spa;
+ enum zio_compress compress_type =
+ arc_get_compression(db->db_buf);
+
+ if (compress_type == ZIO_COMPRESS_OFF) {
+ dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
+ } else {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
+ size, arc_buf_lsize(db->db_buf), compress_type);
+ }
+ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ } else {
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ }
+}
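When the copy is taken for a regular dbuf that still has extra holders, the replacement arc buf must be allocated with the same compression parameters as the original, as the branch above shows. A stand-alone sketch of that choice; the struct and helper are hypothetical stand-ins for illustration only.

#include <assert.h>
#include <stdint.h>

enum zio_compress { ZIO_COMPRESS_OFF, ZIO_COMPRESS_LZ4 };	/* stand-ins */

/* hypothetical stand-in for an arc buf: just its sizes and compression */
typedef struct fake_buf {
	uint64_t fb_psize;
	uint64_t fb_lsize;
	enum zio_compress fb_comp;
} fake_buf_t;

/*
 * Hypothetical copy helper: an uncompressed buf is replaced by a plain
 * buf of the same (logical) size, a compressed buf by a compressed buf
 * with the same psize, lsize and compression type.
 */
static fake_buf_t
alloc_copy_like(const fake_buf_t *src)
{
	fake_buf_t dst = *src;

	if (src->fb_comp == ZIO_COMPRESS_OFF)
		dst.fb_psize = src->fb_lsize;
	return (dst);
}

int
main(void)
{
	fake_buf_t c = { 32 << 10, 128 << 10, ZIO_COMPRESS_LZ4 };
	fake_buf_t d = alloc_copy_like(&c);

	assert(d.fb_psize == c.fb_psize && d.fb_comp == ZIO_COMPRESS_LZ4);
	return (0);
}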
+
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
@@ -1062,6 +1124,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
+ /*
+ * If the arc buf is compressed, we need to decompress it to
+ * read the data. This could happen during the "zfs receive" of
+ * a stream which is compressed and deduplicated.
+ */
+ if (db->db_buf != NULL &&
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
+ dbuf_fix_old_data(db,
+ spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+ err = arc_decompress(db->db_buf);
+ dbuf_set_data(db, db->db_buf);
+ }
mutex_exit(&db->db_mtx);
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
@@ -1137,7 +1211,7 @@ dbuf_noread(dmu_buf_impl_t *db)
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
+ dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
dbuf_clear_data(db);
@@ -1147,60 +1221,6 @@ dbuf_noread(dmu_buf_impl_t *db)
mutex_exit(&db->db_mtx);
}
-/*
- * This is our just-in-time copy function. It makes a copy of
- * buffers, that have been modified in a previous transaction
- * group, before we modify them in the current active group.
- *
- * This function is used in two places: when we are dirtying a
- * buffer for the first time in a txg, and when we are freeing
- * a range in a dnode that includes this buffer.
- *
- * Note that when we are called from dbuf_free_range() we do
- * not put a hold on the buffer, we just traverse the active
- * dbuf list for the dnode.
- */
-static void
-dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
-{
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_level == 0);
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
-
- if (dr == NULL ||
- (dr->dt.dl.dr_data !=
- ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
- return;
-
- /*
- * If the last dirty record for this dbuf has not yet synced
- * and its referencing the dbuf data, either:
- * reset the reference to point to a new copy,
- * or (if there a no active holders)
- * just null out the current db_data pointer.
- */
- ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DMU_BONUS_BLKID) {
- /* Note that the data bufs here are zio_bufs */
- dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
- } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
- int size = db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- spa_t *spa = db->db_objset->os_spa;
-
- dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
- bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
- } else {
- db->db_buf = NULL;
- dbuf_clear_data(db);
- }
-}
-
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
@@ -1401,7 +1421,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
dmu_buf_will_dirty(&db->db, tx);
/* create the data buffer for the new block */
- buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);
+ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
/* copy old block data to the new block */
obuf = db->db_buf;
@@ -1995,9 +2015,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
ASSERT(!refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_level == 0);
- ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
+ ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
ASSERT(buf != NULL);
- ASSERT(arc_buf_size(buf) == db->db.db_size);
+ ASSERT(arc_buf_lsize(buf) == db->db.db_size);
ASSERT(tx->tx_txg != 0);
arc_return_buf(buf, db);
@@ -2594,8 +2614,8 @@ top:
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
- arc_alloc_buf(dn->dn_objset->os_spa,
- db->db.db_size, db, type));
+ arc_alloc_buf(dn->dn_objset->os_spa, db, type,
+ db->db.db_size));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
}
@@ -3140,10 +3160,19 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
* objects only modified in the syncing context (e.g.
* DNONE_DNODE blocks).
*/
- int blksz = arc_buf_size(*datap);
+ int psize = arc_buf_size(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- *datap = arc_alloc_buf(os->os_spa, blksz, db, type);
- bcopy(db->db.db_data, (*datap)->b_data, blksz);
+ enum zio_compress compress_type = arc_get_compression(*datap);
+
+ if (compress_type == ZIO_COMPRESS_OFF) {
+ *datap = arc_alloc_buf(os->os_spa, db, type, psize);
+ } else {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ int lsize = arc_buf_lsize(*datap);
+ *datap = arc_alloc_compressed_buf(os->os_spa, db,
+ psize, lsize, compress_type);
+ }
+ bcopy(db->db.db_data, (*datap)->b_data, psize);
}
db->db_data_pending = dr;
@@ -3548,7 +3577,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
wp_flag = WP_SPILL;
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
- dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+ dmu_write_policy(os, dn, db->db_level, wp_flag,
+ (data != NULL && arc_get_compression(data) != ZIO_COMPRESS_OFF) ?
+ arc_get_compression(data) : ZIO_COMPRESS_INHERIT, &zp);
DB_DNODE_EXIT(db);
/*
@@ -3567,8 +3598,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
*/
void *contents = (data != NULL) ? data->b_data : NULL;
- dr->dr_zio = zio_write(zio, os->os_spa, txg,
- &dr->dr_bp_copy, contents, db->db.db_size, &zp,
+ dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
+ contents, db->db.db_size, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
@@ -3581,7 +3612,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
- &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
+ &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
dbuf_write_nofill_ready, NULL, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 20da2d5512..e858c701a4 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -1024,7 +1024,7 @@ dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
int i = priv->next++;
ASSERT(i < priv->cnt);
- ASSERT(off + n <= arc_buf_size(abuf));
+ ASSERT(off + n <= arc_buf_lsize(abuf));
iov = uio->uio_iov + i;
iov->iov_base = (char *)abuf->b_data + off;
iov->iov_len = n;
@@ -1079,13 +1079,13 @@ xuio_stat_fini(void)
}
void
-xuio_stat_wbuf_copied()
+xuio_stat_wbuf_copied(void)
{
XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
void
-xuio_stat_wbuf_nocopy()
+xuio_stat_wbuf_nocopy(void)
{
XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
}
@@ -1370,7 +1370,7 @@ dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
- return (arc_loan_buf(db->db_objset->os_spa, size));
+ return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
}
/*
@@ -1395,7 +1395,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
dnode_t *dn;
dmu_buf_impl_t *db;
- uint32_t blksz = (uint32_t)arc_buf_size(buf);
+ uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
uint64_t blkid;
DB_DNODE_ENTER(dbuf);
@@ -1408,18 +1408,19 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
/*
* We can only assign if the offset is aligned, the arc buf is the
- * same size as the dbuf, and the dbuf is not metadata. It
- * can't be metadata because the loaned arc buf comes from the
- * user-data kmem arena.
+ * same size as the dbuf, and the dbuf is not metadata.
*/
- if (offset == db->db.db_offset && blksz == db->db.db_size &&
- DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
+ if (offset == db->db.db_offset && blksz == db->db.db_size) {
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
objset_t *os;
uint64_t object;
+ /* compressed bufs must always be assignable to their dbuf */
+ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
+ ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
+
DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf);
os = dn->dn_objset;
@@ -1569,8 +1570,8 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
dsa->dsa_zgd = zgd;
dsa->dsa_tx = tx;
- zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx),
- zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size,
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size,
zp, dmu_sync_late_arrival_ready, NULL,
NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL, zb));
@@ -1624,7 +1625,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
- dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC,
+ ZIO_COMPRESS_INHERIT, &zp);
DB_DNODE_EXIT(db);
/*
@@ -1794,7 +1796,8 @@ int zfs_mdcomp_disable = 0;
int zfs_redundant_metadata_most_ditto_level = 2;
void
-dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
+ enum zio_compress override_compress, zio_prop_t *zp)
{
dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
@@ -1806,6 +1809,10 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
int copies = os->os_copies;
+ boolean_t lz4_ac = spa_feature_is_active(os->os_spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+
+ IMPLY(override_compress == ZIO_COMPRESS_LZ4, lz4_ac);
/*
* We maintain different write policies for each of the following
@@ -1892,7 +1899,16 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
}
zp->zp_checksum = checksum;
- zp->zp_compress = compress;
+
+ /*
+ * If we're writing a pre-compressed buffer, the compression type we use
+ * must match the data. If it hasn't been compressed yet, then we should
+ * use the value dictated by the policies above.
+ */
+ zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT
+ ? override_compress : compress;
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
+
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
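The only behavioral change in dmu_write_policy() is the final choice of zp_compress: an explicit override (used for pre-compressed arc bufs) wins, otherwise the policy computed above applies. A tiny stand-alone sketch of that selection; the enum values and helper are illustrative stand-ins.

#include <assert.h>

enum zio_compress { ZIO_COMPRESS_INHERIT, ZIO_COMPRESS_ON, ZIO_COMPRESS_OFF,
	ZIO_COMPRESS_LZ4 };	/* stand-in values */

/* hypothetical helper mirroring the zp_compress assignment above */
static enum zio_compress
effective_compress(enum zio_compress override, enum zio_compress policy)
{
	return (override != ZIO_COMPRESS_INHERIT ? override : policy);
}

int
main(void)
{
	/* a pre-compressed buf dictates its own compression type */
	assert(effective_compress(ZIO_COMPRESS_LZ4, ZIO_COMPRESS_OFF) ==
	    ZIO_COMPRESS_LZ4);
	/* otherwise the dataset's write policy decides */
	assert(effective_compress(ZIO_COMPRESS_INHERIT, ZIO_COMPRESS_ON) ==
	    ZIO_COMPRESS_ON);
	return (0);
}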
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 0734c1b42b..3ed68f7133 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -339,9 +339,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
- arc_buf_t *buf = arc_alloc_buf(spa,
- sizeof (objset_phys_t), &os->os_phys_buf,
- ARC_BUFC_METADATA);
+ arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, sizeof (objset_phys_t));
bzero(buf->b_data, sizeof (objset_phys_t));
bcopy(os->os_phys_buf->b_data, buf->b_data,
arc_buf_size(os->os_phys_buf));
@@ -354,8 +353,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
- os->os_phys_buf = arc_alloc_buf(spa, size,
- &os->os_phys_buf, ARC_BUFC_METADATA);
+ os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, size);
os->os_phys = os->os_phys_buf->b_data;
bzero(os->os_phys, size);
}
@@ -1138,7 +1137,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
arc_release(os->os_phys_buf, &os->os_phys_buf);
- dmu_write_policy(os, NULL, 0, 0, &zp);
+ dmu_write_policy(os, NULL, 0, 0, ZIO_COMPRESS_INHERIT, &zp);
zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 18ab28dc2a..72247ce381 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -249,8 +249,10 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
- uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
+ uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp,
+ void *data)
{
+ uint64_t payload_size;
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
/*
@@ -261,7 +263,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
(object == dsp->dsa_last_data_object &&
offset > dsp->dsa_last_data_offset));
dsp->dsa_last_data_object = object;
- dsp->dsa_last_data_offset = offset + blksz - 1;
+ dsp->dsa_last_data_offset = offset + lsize - 1;
/*
* If there is any kind of pending aggregation (currently either
@@ -280,8 +282,26 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_object = object;
drrw->drr_type = type;
drrw->drr_offset = offset;
- drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
+ drrw->drr_logical_size = lsize;
+
+ /* only set the compression fields if the buf is compressed */
+ if (lsize != psize) {
+ ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
+ ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
+ ASSERT3S(psize, >, 0);
+ ASSERT3S(lsize, >=, psize);
+
+ drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
+ drrw->drr_compressed_size = psize;
+ payload_size = drrw->drr_compressed_size;
+ } else {
+ payload_size = drrw->drr_logical_size;
+ }
+
if (bp == NULL || BP_IS_EMBEDDED(bp)) {
/*
* There's no pre-computed checksum for partial-block
@@ -301,7 +321,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_key.ddk_cksum = bp->blk_cksum;
}
- if (dump_record(dsp, data, blksz) != 0)
+ if (dump_record(dsp, data, payload_size) != 0)
return (SET_ERROR(EINTR));
return (0);
}
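For a DRR_WRITE record, the bytes that follow the header are the compressed size when the block travels compressed (lsize != psize) and the logical size otherwise. A stand-alone sketch of that rule; the helper name is illustrative, not the kernel's.

#include <assert.h>
#include <stdint.h>

/* hypothetical helper: payload bytes that follow a DRR_WRITE header */
static uint64_t
write_payload_size(uint64_t lsize, uint64_t psize)
{
	/* lsize != psize is exactly the "buf is compressed" case above */
	return (lsize != psize ? psize : lsize);
}

int
main(void)
{
	assert(write_payload_size(128 << 10, 128 << 10) == (128 << 10));
	assert(write_payload_size(128 << 10, 40 << 10) == (40 << 10));
	return (0);
}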
@@ -476,7 +496,7 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
* Compression function must be legacy, or explicitly enabled.
*/
if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
- !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
+ !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
return (B_FALSE);
/*
@@ -639,18 +659,49 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
uint64_t offset;
+ /*
+ * If we have large blocks stored on disk but the send flags
+ * don't allow us to send large blocks, we split the data from
+ * the arc buf into chunks.
+ */
+ boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
+ !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
+ /*
+ * We should only request compressed data from the ARC if all
+ * the following are true:
+ * - stream compression was requested
+ * - we aren't splitting large blocks into smaller chunks
+ * - the data won't need to be byteswapped before sending
+ * - this isn't an embedded block
+ * - this isn't metadata (if receiving on a different endian
+ * system it can be byteswapped more easily)
+ */
+ boolean_t request_compressed =
+ (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
+ !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
+ !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
+
+ ASSERT0(zb->zb_level);
+ ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+ (zb->zb_object == dsa->dsa_resume_object &&
+ zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
ASSERT0(zb->zb_level);
ASSERT(zb->zb_object > dsa->dsa_resume_object ||
(zb->zb_object == dsa->dsa_resume_object &&
zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+ ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));
+
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+ if (request_compressed)
+ zioflags |= ZIO_FLAG_RAW;
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
- &aflags, zb) != 0) {
+ ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
if (zfs_send_corrupt_data) {
/* Send a block filled with 0x"zfs badd bloc" */
- abuf = arc_alloc_buf(spa, blksz, &abuf,
- ARC_BUFC_DATA);
+ abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
+ blksz);
uint64_t *ptr;
for (ptr = abuf->b_data;
(char *)ptr < (char *)abuf->b_data + blksz;
@@ -663,21 +714,21 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
offset = zb->zb_blkid * blksz;
- if (!(dsa->dsa_featureflags &
- DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
- blksz > SPA_OLD_MAXBLOCKSIZE) {
+ if (split_large_blocks) {
+ ASSERT3U(arc_get_compression(abuf), ==,
+ ZIO_COMPRESS_OFF);
char *buf = abuf->b_data;
while (blksz > 0 && err == 0) {
int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
err = dump_write(dsa, type, zb->zb_object,
- offset, n, NULL, buf);
+ offset, n, n, NULL, buf);
offset += n;
buf += n;
blksz -= n;
}
} else {
- err = dump_write(dsa, type, zb->zb_object,
- offset, blksz, bp, abuf->b_data);
+ err = dump_write(dsa, type, zb->zb_object, offset,
+ blksz, arc_buf_size(abuf), bp, abuf->b_data);
}
arc_buf_destroy(abuf, &abuf);
}
@@ -704,9 +755,9 @@ get_next_record(bqueue_t *bq, struct send_block_record *data)
*/
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
- zfs_bookmark_phys_t *ancestor_zb,
- boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
- uint64_t resumeobj, uint64_t resumeoff,
+ zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
+ int outfd, uint64_t resumeobj, uint64_t resumeoff,
vnode_t *vp, offset_t *off)
{
objset_t *os;
@@ -749,7 +800,15 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
- featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+ featureflags |= DMU_BACKUP_FEATURE_LZ4;
+ }
+ if (compressok) {
+ featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
+ }
+ if ((featureflags &
+ (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) !=
+ 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
+ featureflags |= DMU_BACKUP_FEATURE_LZ4;
}
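Feature flags in the stream header tell the receiver what it must be able to decode: compressed sends set DMU_BACKUP_FEATURE_COMPRESSED, and (like embedded data) also set DMU_BACKUP_FEATURE_LZ4 when the pool's lz4 feature is active. A stand-alone sketch of that computation; the flag values and the helper are illustrative stand-ins.

#include <assert.h>
#include <stdint.h>

#define	F_EMBED_DATA	(1ULL << 0)	/* stand-in flag values */
#define	F_COMPRESSED	(1ULL << 1)
#define	F_LZ4		(1ULL << 2)

/* hypothetical helper mirroring the featureflags logic above */
static uint64_t
send_featureflags(int embedok, int embedded_active, int compressok,
    int lz4_active)
{
	uint64_t flags = 0;

	if (embedok && embedded_active) {
		flags |= F_EMBED_DATA;
		if (lz4_active)
			flags |= F_LZ4;
	}
	if (compressok)
		flags |= F_COMPRESSED;
	if ((flags & (F_EMBED_DATA | F_COMPRESSED)) != 0 && lz4_active)
		flags |= F_LZ4;
	return (flags);
}

int
main(void)
{
	/* a compressed send on a pool with lz4 active advertises both */
	assert(send_featureflags(0, 0, 1, 1) == (F_COMPRESSED | F_LZ4));
	/* a plain send advertises neither */
	assert(send_featureflags(0, 0, 0, 1) == 0);
	return (0);
}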
if (resumeobj != 0 || resumeoff != 0) {
@@ -898,7 +957,7 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, boolean_t large_block_ok,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -935,10 +994,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok, outfd, 0, 0, vp, off);
+ embedok, large_block_ok, compressok, outfd, 0, 0, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok, outfd, 0, 0, vp, off);
+ embedok, large_block_ok, compressok, outfd, 0, 0, vp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
@@ -946,7 +1005,8 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
- boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
+ boolean_t large_block_ok, boolean_t compressok, int outfd,
+ uint64_t resumeobj, uint64_t resumeoff,
vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -1014,11 +1074,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
return (err);
}
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok,
+ embedok, large_block_ok, compressok,
outfd, resumeobj, resumeoff, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok,
+ embedok, large_block_ok, compressok,
outfd, resumeobj, resumeoff, vp, off);
}
if (owned)
@@ -1029,33 +1089,45 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
}
static int
-dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
- uint64_t *sizep)
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
+ uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
int err;
+ uint64_t size;
/*
* Assume that space (both on-disk and in-stream) is dominated by
* data. We will adjust for indirect blocks and the copies property,
* but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
*/
+ uint64_t recordsize;
+ uint64_t record_count;
+
+ /* Assume all (uncompressed) blocks are recordsize. */
+ err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ &recordsize);
+ if (err != 0)
+ return (err);
+ record_count = uncompressed / recordsize;
+
+ /*
+ * If we're estimating a send size for a compressed stream, use the
+ * compressed data size to estimate the stream size. Otherwise, use the
+ * uncompressed data size.
+ */
+ size = stream_compressed ? compressed : uncompressed;
/*
* Subtract out approximate space used by indirect blocks.
* Assume most space is used by data blocks (non-indirect, non-dnode).
- * Assume all blocks are recordsize. Assume ditto blocks and
- * internal fragmentation counter out compression.
+ * Assume no ditto blocks or internal fragmentation.
*
* Therefore, space used by indirect blocks is sizeof(blkptr_t) per
- * block, which we observe in practice.
+ * block.
*/
- uint64_t recordsize;
- err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
- if (err != 0)
- return (err);
- size -= size / recordsize * sizeof (blkptr_t);
+ size -= record_count * sizeof (blkptr_t);
/* Add in the space for the record associated with each block. */
- size += size / recordsize * sizeof (dmu_replay_record_t);
+ size += record_count * sizeof (dmu_replay_record_t);
*sizep = size;
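Worked through with numbers: the estimate starts from the compressed byte count for a compressed stream (the uncompressed count otherwise), then swaps per-block indirect overhead for per-block record overhead using the uncompressed record count. The sketch below is stand-alone; the structure sizes are made-up stand-ins for sizeof (blkptr_t) and sizeof (dmu_replay_record_t).

#include <assert.h>
#include <stdint.h>

/* made-up stand-ins for sizeof (blkptr_t) and sizeof (dmu_replay_record_t) */
#define	BP_SIZE		128
#define	DRR_SIZE	312

/* hypothetical helper mirroring the adjustment above */
static uint64_t
estimate_stream_size(uint64_t uncompressed, uint64_t compressed,
    int stream_compressed, uint64_t recordsize)
{
	uint64_t record_count = uncompressed / recordsize;
	uint64_t size = stream_compressed ? compressed : uncompressed;

	size -= record_count * BP_SIZE;		/* indirect block overhead */
	size += record_count * DRR_SIZE;	/* one record per data block */
	return (size);
}

int
main(void)
{
	/* 1 GiB of 128K records that compress 2:1 in a compressed stream */
	uint64_t est = estimate_stream_size(1ULL << 30, 1ULL << 29, 1,
	    128 << 10);

	/* 8192 records: 2^29 - 8192*128 + 8192*312 = 538378240 bytes */
	assert(est == 538378240ULL);
	return (0);
}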
@@ -1063,11 +1135,12 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
}
int
-dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
+ boolean_t stream_compressed, uint64_t *sizep)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
int err;
- uint64_t size;
+ uint64_t uncomp, comp;
ASSERT(dsl_pool_config_held(dp));
@@ -1086,33 +1159,41 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
return (SET_ERROR(EXDEV));
- /* Get uncompressed size estimate of changed data. */
+ /* Get compressed and uncompressed size estimates of changed data. */
if (fromds == NULL) {
- size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
} else {
- uint64_t used, comp;
+ uint64_t used;
err = dsl_dataset_space_written(fromds, ds,
- &used, &comp, &size);
+ &used, &comp, &uncomp);
if (err != 0)
return (err);
}
- err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+ err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
+ stream_compressed, sizep);
return (err);
}
+struct calculate_send_arg {
+ uint64_t uncompressed;
+ uint64_t compressed;
+};
+
/*
* Simple callback used to traverse the blocks of a snapshot and sum their
- * uncompressed size
+ * uncompressed and compressed sizes.
*/
/* ARGSUSED */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
- uint64_t *spaceptr = arg;
+ struct calculate_send_arg *space = arg;
if (bp != NULL && !BP_IS_HOLE(bp)) {
- *spaceptr += BP_GET_UCSIZE(bp);
+ space->uncompressed += BP_GET_UCSIZE(bp);
+ space->compressed += BP_GET_PSIZE(bp);
}
return (0);
}
@@ -1124,16 +1205,16 @@ dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
- uint64_t *sizep)
+ boolean_t stream_compressed, uint64_t *sizep)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
int err;
- uint64_t size = 0;
+ struct calculate_send_arg size = { 0 };
ASSERT(dsl_pool_config_held(dp));
/* tosnap must be a snapshot */
- if (!dsl_dataset_is_snapshot(ds))
+ if (!ds->ds_is_snapshot)
return (SET_ERROR(EINVAL));
/* verify that from_txg is before the provided snapshot was taken */
@@ -1150,7 +1231,8 @@ dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
if (err)
return (err);
- err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+ err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
+ size.compressed, stream_compressed, sizep);
return (err);
}
@@ -1281,14 +1363,14 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/*
* The receiving code doesn't know how to translate a WRITE_EMBEDDED
- * record to a plan WRITE record, so the pool must have the
+ * record to a plain WRITE record, so the pool must have the
* EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
* records. Same with WRITE_EMBEDDED records that use LZ4 compression.
*/
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
@@ -1458,10 +1540,20 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
8, 1, &zero, tx));
if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
+ 8, 1, &one, tx));
+ }
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_EMBED_DATA) {
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
8, 1, &one, tx));
}
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_COMPRESSED) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
+ 8, 1, &one, tx));
+ }
}
dmu_buf_will_dirty(newds->ds_dbuf, tx);
@@ -1517,7 +1609,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
@@ -1724,7 +1816,7 @@ struct receive_objnode {
uint64_t object;
};
-struct receive_arg {
+struct receive_arg {
objset_t *os;
vnode_t *vp; /* The vnode to read the stream from */
uint64_t voff; /* The current offset in the stream */
@@ -1852,10 +1944,11 @@ byteswap_record(dmu_replay_record_t *drr)
DO64(drr_write.drr_object);
DO32(drr_write.drr_type);
DO64(drr_write.drr_offset);
- DO64(drr_write.drr_length);
+ DO64(drr_write.drr_logical_size);
DO64(drr_write.drr_toguid);
ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
DO64(drr_write.drr_key.ddk_prop);
+ DO64(drr_write.drr_compressed_size);
break;
case DRR_WRITE_BYREF:
DO64(drr_write_byref.drr_object);
@@ -2085,7 +2178,7 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
dmu_tx_t *tx;
int err;
- if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+ if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
@@ -2107,7 +2200,7 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrw->drr_object,
- drrw->drr_offset, drrw->drr_length);
+ drrw->drr_offset, drrw->drr_logical_size);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
@@ -2117,9 +2210,10 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drrw->drr_type);
dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
- drrw->drr_length);
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
}
+ /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */
dmu_buf_t *bonus;
if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
return (SET_ERROR(EINVAL));
@@ -2536,18 +2630,31 @@ receive_read_record(struct receive_arg *ra)
case DRR_WRITE:
{
struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
- arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
- drrw->drr_length);
+ arc_buf_t *abuf;
+ boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);
+ if (DRR_WRITE_COMPRESSED(drrw)) {
+ ASSERT3U(drrw->drr_compressed_size, >, 0);
+ ASSERT3U(drrw->drr_logical_size, >=,
+ drrw->drr_compressed_size);
+ ASSERT(!is_meta);
+ abuf = arc_loan_compressed_buf(
+ dmu_objset_spa(ra->os),
+ drrw->drr_compressed_size, drrw->drr_logical_size,
+ drrw->drr_compressiontype);
+ } else {
+ abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+ is_meta, drrw->drr_logical_size);
+ }
err = receive_read_payload_and_next_header(ra,
- drrw->drr_length, abuf->b_data);
+ DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
if (err != 0) {
dmu_return_arcbuf(abuf);
return (err);
}
ra->rrd->write_buf = abuf;
receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
- drrw->drr_length);
+ drrw->drr_logical_size);
return (err);
}
case DRR_WRITE_BYREF:
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index bac325b3a1..8bc528e1d4 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -1799,9 +1799,17 @@ get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
fnvlist_add_string(token_nv, "toname", buf);
}
if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_LARGEBLOCK) == 0) {
+ fnvlist_add_boolean(token_nv, "largeblockok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_EMBEDOK) == 0) {
fnvlist_add_boolean(token_nv, "embedok");
}
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_COMPRESSOK) == 0) {
+ fnvlist_add_boolean(token_nv, "compressok");
+ }
packed = fnvlist_pack(token_nv, &packed_size);
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
diff --git a/usr/src/uts/common/fs/zfs/lz4.c b/usr/src/uts/common/fs/zfs/lz4.c
index 656360a6f2..3aa1b74ef3 100644
--- a/usr/src/uts/common/fs/zfs/lz4.c
+++ b/usr/src/uts/common/fs/zfs/lz4.c
@@ -85,7 +85,7 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
/*
* Returns 0 on success (decompression function returned non-negative)
- * and non-zero on failure (decompression function returned negative.
+ * and non-zero on failure (decompression function returned negative).
*/
return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
d_start, bufsiz, d_len) < 0);
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 9efcc29646..0554a8262f 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -38,18 +38,13 @@
#define GANG_ALLOCATION(flags) \
((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
-#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
-#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
-#define METASLAB_ACTIVE_MASK \
- (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core
- * space_map representation must be before we compact it on-disk.
+ * space map representation must be before we compact it on-disk.
* Values should be greater than or equal to 100.
*/
int zfs_condense_pct = 200;
@@ -127,7 +122,7 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
/*
* The minimum free space, in percent, which must be available
* in a space map to continue allocations in a first-fit fashion.
- * Once the space_map's free space drops below this level we dynamically
+ * Once the space map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
int metaslab_df_free_pct = 4;
@@ -175,7 +170,38 @@ boolean_t metaslab_lba_weighting_enabled = B_TRUE;
*/
boolean_t metaslab_bias_enabled = B_TRUE;
-static uint64_t metaslab_fragmentation(metaslab_t *);
+/*
+ * Enable/disable segment-based metaslab selection.
+ */
+boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
+
+/*
+ * When using segment-based metaslab selection, we will continue
+ * allocating from the active metaslab until we have exhausted
+ * zfs_metaslab_switch_threshold of its buckets.
+ */
+int zfs_metaslab_switch_threshold = 2;
+
+/*
+ * Internal switch to enable/disable the metaslab allocation tracing
+ * facility.
+ */
+boolean_t metaslab_trace_enabled = B_TRUE;
+
+/*
+ * Maximum entries that the metaslab allocation tracing facility will keep
+ * in a given list when running in non-debug mode. We limit the number
+ * of entries in non-debug mode to prevent us from using up too much memory.
+ * The limit should be sufficiently large that we don't expect any allocation
+ * to ever exceed this value. In debug mode, the system will panic if this
+ * limit is ever reached, allowing for further investigation.
+ */
+uint64_t metaslab_trace_max_entries = 5000;
+
+static uint64_t metaslab_weight(metaslab_t *);
+static void metaslab_set_fragmentation(metaslab_t *);
+
+kmem_cache_t *metaslab_alloc_trace_cache;
/*
* ==========================================================================
@@ -393,11 +419,6 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
return (space);
}
-/*
- * ==========================================================================
- * Metaslab groups
- * ==========================================================================
- */
static int
metaslab_compare(const void *x1, const void *x2)
{
@@ -423,6 +444,57 @@ metaslab_compare(const void *x1, const void *x2)
}
/*
+ * Verify that the space accounting on disk matches the in-core range_trees.
+ */
+void
+metaslab_verify_space(metaslab_t *msp, uint64_t txg)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t allocated = 0;
+ uint64_t freed = 0;
+ uint64_t sm_free_space, msp_free_space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /*
+ * We can only verify the metaslab space when we're called
+ * from syncing context with a loaded metaslab that has an allocated
+ * space map. Calling this in non-syncing context does not
+ * provide a consistent view of the metaslab since we're performing
+ * allocations in the future.
+ */
+ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
+ !msp->ms_loaded)
+ return;
+
+ sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
+ space_map_alloc_delta(msp->ms_sm);
+
+ /*
+ * Account for future allocations since we would have already
+ * deducted that space from the ms_freetree.
+ */
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+ allocated +=
+ range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
+ }
+ freed = range_tree_space(msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]);
+
+ msp_free_space = range_tree_space(msp->ms_tree) + allocated +
+ msp->ms_deferspace + freed;
+
+ VERIFY3U(sm_free_space, ==, msp_free_space);
+}
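The verification reduces to one identity: the free space implied by the on-disk space map (ms_size minus allocated space minus the pending alloc delta) must equal the in-core view (range tree free space plus not-yet-synced allocations plus deferred frees plus this txg's frees). A stand-alone numeric sketch of that identity; all values are made up for illustration.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t ms_size = 1024ULL << 20;	/* 1 GiB metaslab */
	uint64_t sm_allocated = 400ULL << 20;	/* space map says 400 MiB used */
	uint64_t sm_alloc_delta = 16ULL << 20;	/* pending space map delta */
	uint64_t rt_space = 600ULL << 20;	/* free per in-core range tree */
	uint64_t future_allocs = 4ULL << 20;	/* alloctrees for open txgs */
	uint64_t deferspace = 2ULL << 20;	/* deferred frees */
	uint64_t freed = 2ULL << 20;		/* frees from the syncing txg */

	uint64_t sm_free = ms_size - sm_allocated - sm_alloc_delta;
	uint64_t msp_free = rt_space + future_allocs + deferspace + freed;

	assert(sm_free == msp_free);	/* 608 MiB either way */
	return (0);
}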
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+/*
* Update the allocatable flag and the metaslab group's capacity.
* The allocatable flag is set to true if the capacity is below
* the zfs_mg_noalloc_threshold or has a fragmentation value that is
@@ -994,7 +1066,7 @@ static range_tree_ops_t metaslab_rt_ops = {
/*
* ==========================================================================
- * Metaslab block operations
+ * Common allocator routines
* ==========================================================================
*/
@@ -1013,31 +1085,22 @@ metaslab_block_maxsize(metaslab_t *msp)
return (rs->rs_end - rs->rs_start);
}
-uint64_t
-metaslab_block_alloc(metaslab_t *msp, uint64_t size)
+static range_seg_t *
+metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
- uint64_t start;
- range_tree_t *rt = msp->ms_tree;
-
- VERIFY(!msp->ms_condensing);
+ range_seg_t *rs, rsearch;
+ avl_index_t where;
- start = msp->ms_ops->msop_alloc(msp, size);
- if (start != -1ULL) {
- vdev_t *vd = msp->ms_group->mg_vd;
+ rsearch.rs_start = start;
+ rsearch.rs_end = start + size;
- VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
- VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
- VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
- range_tree_remove(rt, start, size);
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL) {
+ rs = avl_nearest(t, where, AVL_AFTER);
}
- return (start);
-}
-/*
- * ==========================================================================
- * Common allocator routines
- * ==========================================================================
- */
+ return (rs);
+}
/*
* This is a helper function that can be used by the allocator to find
@@ -1048,15 +1111,7 @@ static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
uint64_t align)
{
- range_seg_t *rs, rsearch;
- avl_index_t where;
-
- rsearch.rs_start = *cursor;
- rsearch.rs_end = *cursor + size;
-
- rs = avl_find(t, &rsearch, &where);
- if (rs == NULL)
- rs = avl_nearest(t, where, AVL_AFTER);
+ range_seg_t *rs = metaslab_block_find(t, *cursor, size);
while (rs != NULL) {
uint64_t offset = P2ROUNDUP(rs->rs_start, align);
@@ -1281,6 +1336,7 @@ int
metaslab_load(metaslab_t *msp)
{
int error = 0;
+ boolean_t success = B_FALSE;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(!msp->ms_loaded);
@@ -1298,14 +1354,18 @@ metaslab_load(metaslab_t *msp)
else
range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
- msp->ms_loaded = (error == 0);
+ success = (error == 0);
msp->ms_loading = B_FALSE;
- if (msp->ms_loaded) {
+ if (success) {
+ ASSERT3P(msp->ms_group, !=, NULL);
+ msp->ms_loaded = B_TRUE;
+
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
range_tree_walk(msp->ms_defertree[t],
range_tree_remove, msp->ms_tree);
}
+ msp->ms_max_size = metaslab_block_maxsize(msp);
}
cv_broadcast(&msp->ms_load_cv);
return (error);
@@ -1318,6 +1378,7 @@ metaslab_unload(metaslab_t *msp)
range_tree_vacate(msp->ms_tree, NULL, NULL);
msp->ms_loaded = B_FALSE;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+ msp->ms_max_size = 0;
}
int
@@ -1362,21 +1423,23 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
metaslab_group_add(mg, ms);
- ms->ms_fragmentation = metaslab_fragmentation(ms);
- ms->ms_ops = mg->mg_class->mc_ops;
+ metaslab_set_fragmentation(ms);
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
* If we're adding space to an existing pool, the new space
* does not become available until after this txg has synced.
+ * The metaslab's weight will also be initialized when we sync
+ * out this txg. This ensures that we don't attempt to allocate
+ * from it before we have initialized it completely.
*/
if (txg <= TXG_INITIAL)
metaslab_sync_done(ms, 0);
/*
* If metaslab_debug_load is set and we're initializing a metaslab
- * that has an allocated space_map object then load the its space
+	 * that has an allocated space map object then load its space
	 * map so that we can verify frees.
*/
if (metaslab_debug_load && ms->ms_sm != NULL) {
@@ -1403,7 +1466,6 @@ metaslab_fini(metaslab_t *msp)
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
-
VERIFY(msp->ms_group == NULL);
vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
0, -msp->ms_size);
@@ -1476,8 +1538,8 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
* not support this metric. Otherwise, the return value should be in the
* range [0, 100].
*/
-static uint64_t
-metaslab_fragmentation(metaslab_t *msp)
+static void
+metaslab_set_fragmentation(metaslab_t *msp)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
uint64_t fragmentation = 0;
@@ -1485,18 +1547,22 @@ metaslab_fragmentation(metaslab_t *msp)
boolean_t feature_enabled = spa_feature_is_enabled(spa,
SPA_FEATURE_SPACEMAP_HISTOGRAM);
- if (!feature_enabled)
- return (ZFS_FRAG_INVALID);
+ if (!feature_enabled) {
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
/*
* A null space map means that the entire metaslab is free
* and thus is not fragmented.
*/
- if (msp->ms_sm == NULL)
- return (0);
+ if (msp->ms_sm == NULL) {
+ msp->ms_fragmentation = 0;
+ return;
+ }
/*
- * If this metaslab's space_map has not been upgraded, flag it
+ * If this metaslab's space map has not been upgraded, flag it
* so that we upgrade next time we encounter it.
*/
if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
@@ -1509,12 +1575,14 @@ metaslab_fragmentation(metaslab_t *msp)
spa_dbgmsg(spa, "txg %llu, requesting force condense: "
"msp %p, vd %p", txg, msp, vd);
}
- return (ZFS_FRAG_INVALID);
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
}
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
uint64_t space = 0;
uint8_t shift = msp->ms_sm->sm_shift;
+
int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
FRAGMENTATION_TABLE_SIZE - 1);
@@ -1531,7 +1599,8 @@ metaslab_fragmentation(metaslab_t *msp)
if (total > 0)
fragmentation /= total;
ASSERT3U(fragmentation, <=, 100);
- return (fragmentation);
+
+ msp->ms_fragmentation = fragmentation;
}
/*
@@ -1540,30 +1609,20 @@ metaslab_fragmentation(metaslab_t *msp)
* the LBA range, and whether the metaslab is loaded.
*/
static uint64_t
-metaslab_weight(metaslab_t *msp)
+metaslab_space_weight(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
uint64_t weight, space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- /*
- * This vdev is in the process of being removed so there is nothing
- * for us to do here.
- */
- if (vd->vdev_removing) {
- ASSERT0(space_map_allocated(msp->ms_sm));
- ASSERT0(vd->vdev_ms_shift);
- return (0);
- }
+ ASSERT(!vd->vdev_removing);
/*
* The baseline weight is the metaslab's free space.
*/
space = msp->ms_size - space_map_allocated(msp->ms_sm);
- msp->ms_fragmentation = metaslab_fragmentation(msp);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
/*
@@ -1612,6 +1671,210 @@ metaslab_weight(metaslab_t *msp)
weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
}
+ WEIGHT_SET_SPACEBASED(weight);
+ return (weight);
+}
+
+/*
+ * Return the weight of the specified metaslab, according to the segment-based
+ * weighting algorithm. The metaslab must be loaded. This function can
+ * be called within a sync pass since it relies only on the metaslab's
+ * range tree which is always accurate when the metaslab is loaded.
+ */
+static uint64_t
+metaslab_weight_from_range_tree(metaslab_t *msp)
+{
+ uint64_t weight = 0;
+ uint32_t segments = 0;
+
+ ASSERT(msp->ms_loaded);
+
+ for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+ i--) {
+ uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ segments <<= 1;
+ segments += msp->ms_tree->rt_histogram[i];
+
+ /*
+ * The range tree provides more precision than the space map
+ * and must be downgraded so that all values fit within the
+ * space map's histogram. This allows us to compare loaded
+ * vs. unloaded metaslabs to determine which metaslab is
+ * considered "best".
+ */
+ if (i > max_idx)
+ continue;
+
+ if (segments != 0) {
+ WEIGHT_SET_COUNT(weight, segments);
+ WEIGHT_SET_INDEX(weight, i);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Calculate the weight based on the on-disk histogram. This should only
+ * be called after a sync pass has completely finished since the on-disk
+ * information is updated in metaslab_sync().
+ */
+static uint64_t
+metaslab_weight_from_spacemap(metaslab_t *msp)
+{
+ uint64_t weight = 0;
+
+ for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+ if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
+ WEIGHT_SET_COUNT(weight,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+ WEIGHT_SET_INDEX(weight, i +
+ msp->ms_sm->sm_shift);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Compute a segment-based weight for the specified metaslab. The weight
+ * is determined by the highest bucket in the histogram. The information
+ * for the highest bucket is encoded into the weight value.
+ */
+static uint64_t
+metaslab_segment_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ uint64_t weight = 0;
+ uint8_t shift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The metaslab is completely free.
+ */
+ if (space_map_allocated(msp->ms_sm) == 0) {
+ int idx = highbit64(msp->ms_size) - 1;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ if (idx < max_idx) {
+ WEIGHT_SET_COUNT(weight, 1ULL);
+ WEIGHT_SET_INDEX(weight, idx);
+ } else {
+ WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
+ WEIGHT_SET_INDEX(weight, max_idx);
+ }
+ WEIGHT_SET_ACTIVE(weight, 0);
+ ASSERT(!WEIGHT_IS_SPACEBASED(weight));
+
+ return (weight);
+ }
+
+ ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * If the metaslab is fully allocated then just make the weight 0.
+ */
+ if (space_map_allocated(msp->ms_sm) == msp->ms_size)
+ return (0);
+ /*
+ * If the metaslab is already loaded, then use the range tree to
+ * determine the weight. Otherwise, we rely on the space map information
+ * to generate the weight.
+ */
+ if (msp->ms_loaded) {
+ weight = metaslab_weight_from_range_tree(msp);
+ } else {
+ weight = metaslab_weight_from_spacemap(msp);
+ }
+
+ /*
+ * If the metaslab was active the last time we calculated its weight
+ * then keep it active. We want to consume the entire region that
+ * is associated with this weight.
+ */
+ if (msp->ms_activation_weight != 0 && weight != 0)
+ WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
+ return (weight);
+}
+
+/*
+ * Determine if we should attempt to allocate from this metaslab. If the
+ * metaslab has a maximum size then we can quickly determine if the desired
+ * allocation size can be satisfied. Otherwise, if we're using segment-based
+ * weighting then we can determine the maximum allocation that this metaslab
+ * can accommodate based on the index encoded in the weight. If we're using
+ * space-based weights then rely on the entire weight (excluding the weight
+ * type bit).
+ */
+boolean_t
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
+{
+ boolean_t should_allocate;
+
+ if (msp->ms_max_size != 0)
+ return (msp->ms_max_size >= asize);
+
+ if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+ /*
+ * The metaslab segment weight indicates segments in the
+ * range [2^i, 2^(i+1)), where i is the index in the weight.
+ * Since the asize might be in the middle of the range, we
+ * should attempt the allocation if asize < 2^(i+1).
+ */
+ should_allocate = (asize <
+ 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
+ } else {
+ should_allocate = (asize <=
+ (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
+ }
+ return (should_allocate);
+}
+
+static uint64_t
+metaslab_weight(metaslab_t *msp)
+{
+ vdev_t *vd = msp->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ uint64_t weight;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * This vdev is in the process of being removed so there is nothing
+ * for us to do here.
+ */
+ if (vd->vdev_removing) {
+ ASSERT0(space_map_allocated(msp->ms_sm));
+ ASSERT0(vd->vdev_ms_shift);
+ return (0);
+ }
+
+ metaslab_set_fragmentation(msp);
+
+ /*
+ * Update the maximum size if the metaslab is loaded. This will
+ * ensure that we get an accurate maximum size if newly freed space
+ * has been added back into the free tree.
+ */
+ if (msp->ms_loaded)
+ msp->ms_max_size = metaslab_block_maxsize(msp);
+
+ /*
+ * Segment-based weighting requires space map histogram support.
+ */
+ if (zfs_metaslab_segment_weight_enabled &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+ (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
+ sizeof (space_map_phys_t))) {
+ weight = metaslab_segment_weight(msp);
+ } else {
+ weight = metaslab_space_weight(msp);
+ }
return (weight);
}
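As an aside, the metaslab_should_allocate() check introduced above boils down to a few bit operations on the 64-bit weight. The following stand-alone, user-space sketch is illustrative only (the BF64 helper and constants are simplified stand-ins for the kernel macros): a segment-based weight with index i admits any allocation smaller than 2^(i+1), while a space-based weight is compared directly once the type bit is masked off.

/*
 * Illustrative sketch of the metaslab_should_allocate() decision; not the
 * kernel code. BF64_GET is a simplified stand-in for the kernel macro.
 */
#include <stdint.h>
#include <stdio.h>

#define BF64_GET(x, low, len)   (((x) >> (low)) & ((1ULL << (len)) - 1))

#define METASLAB_WEIGHT_TYPE    (1ULL << 61)
#define WEIGHT_IS_SPACEBASED(w) ((w) == 0 || BF64_GET((w), 61, 1))
#define WEIGHT_GET_INDEX(w)     BF64_GET((w), 55, 6)

static int
should_allocate(uint64_t weight, uint64_t max_size, uint64_t asize)
{
        if (max_size != 0)      /* cached largest free segment, if known */
                return (max_size >= asize);
        if (!WEIGHT_IS_SPACEBASED(weight)) {
                /* segments lie in [2^i, 2^(i+1)), so attempt if asize < 2^(i+1) */
                return (asize < (1ULL << (WEIGHT_GET_INDEX(weight) + 1)));
        }
        /* space-based: compare against the weight with the type bit cleared */
        return (asize <= (weight & ~METASLAB_WEIGHT_TYPE));
}

int
main(void)
{
        uint64_t w = (uint64_t)17 << 55;        /* segment weight, index 17 (128K..256K) */

        (void) printf("%d\n", should_allocate(w, 0, 200 << 10));       /* prints 1 */
        (void) printf("%d\n", should_allocate(w, 0, 300 << 10));       /* prints 0 */
        return (0);
}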
@@ -1630,6 +1893,7 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
}
}
+ msp->ms_activation_weight = msp->ms_weight;
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
@@ -1640,18 +1904,56 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
}
static void
-metaslab_passivate(metaslab_t *msp, uint64_t size)
+metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
+ uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
+
/*
* If size < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
- ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
- metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
+ ASSERT(size >= SPA_MINBLOCKSIZE ||
+ range_tree_space(msp->ms_tree) == 0);
+ ASSERT0(weight & METASLAB_ACTIVE_MASK);
+
+ msp->ms_activation_weight = 0;
+ metaslab_group_sort(msp->ms_group, msp, weight);
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
+/*
+ * Segment-based metaslabs are activated once and remain active until
+ * we either fail an allocation attempt (similar to space-based metaslabs)
+ * or have exhausted the free space in zfs_metaslab_switch_threshold
+ * buckets since the metaslab was activated. This function checks to see
+ * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
+ * metaslab and passivates it proactively. This will allow us to select a
+ * metaslab with a larger contiguous region, if any remains within this
+ * metaslab group. If we're in sync pass > 1, then we continue using this
+ * metaslab so that we don't dirty more blocks and cause more sync passes.
+ */
+void
+metaslab_segment_may_passivate(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * Since we are in the middle of a sync pass, the most accurate
+ * information that is accessible to us is the in-core range tree
+ * histogram; calculate the new weight based on that information.
+ */
+ uint64_t weight = metaslab_weight_from_range_tree(msp);
+ int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
+ int current_idx = WEIGHT_GET_INDEX(weight);
+
+ if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
+ metaslab_passivate(msp, weight);
+}
+
static void
metaslab_preload(void *arg)
{
@@ -1664,11 +1966,7 @@ metaslab_preload(void *arg)
metaslab_load_wait(msp);
if (!msp->ms_loaded)
(void) metaslab_load(msp);
-
- /*
- * Set the ms_access_txg value so that we don't unload it right away.
- */
- msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
+ msp->ms_selected_txg = spa_syncing_txg(spa);
mutex_exit(&msp->ms_lock);
}
@@ -1689,10 +1987,7 @@ metaslab_group_preload(metaslab_group_t *mg)
/*
* Load the next potential metaslabs
*/
- msp = avl_first(t);
- while (msp != NULL) {
- metaslab_t *msp_next = AVL_NEXT(t, msp);
-
+ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
/*
* We preload only the maximum number of metaslabs specified
* by metaslab_preload_limit. If a metaslab is being forced
@@ -1700,27 +1995,11 @@ metaslab_group_preload(metaslab_group_t *mg)
* that force condensing happens in the next txg.
*/
if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
- msp = msp_next;
continue;
}
- /*
- * We must drop the metaslab group lock here to preserve
- * lock ordering with the ms_lock (when grabbing both
- * the mg_lock and the ms_lock, the ms_lock must be taken
- * first). As a result, it is possible that the ordering
- * of the metaslabs within the avl tree may change before
- * we reacquire the lock. The metaslab cannot be removed from
- * the tree while we're in syncing context so it is safe to
- * drop the mg_lock here. If the metaslabs are reordered
- * nothing will break -- we just may end up loading a
- * less than optimal one.
- */
- mutex_exit(&mg->mg_lock);
VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
msp, TQ_SLEEP) != NULL);
- mutex_enter(&mg->mg_lock);
- msp = msp_next;
}
mutex_exit(&mg->mg_lock);
}
@@ -1872,7 +2151,7 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
mutex_enter(&msp->ms_lock);
/*
- * While we would ideally like to create a space_map representation
+ * While we would ideally like to create a space map representation
* that consists only of allocation records, doing so can be
* prohibitively expensive because the in-core free tree can be
* large, and therefore computationally expensive to subtract
@@ -1935,7 +2214,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* metaslab_sync() is the metaslab's ms_tree. No other thread can
* be modifying this txg's alloctree, freetree, freed_tree, or
 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
- * space_map ASSERTs. We drop it whenever we call into the DMU,
+ * space map ASSERTs. We drop it whenever we call into the DMU,
* because the DMU can call down to us (e.g. via zio_free()) at
* any time.
*/
@@ -1957,7 +2236,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
mutex_enter(&msp->ms_lock);
/*
- * Note: metaslab_condense() clears the space_map's histogram.
+ * Note: metaslab_condense() clears the space map's histogram.
* Therefore we must verify and remove this histogram before
* condensing.
*/
@@ -1982,16 +2261,38 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
*/
space_map_histogram_clear(msp->ms_sm);
space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
- } else {
+
/*
- * Since the space map is not loaded we simply update the
- * exisiting histogram with what was freed in this txg. This
- * means that the on-disk histogram may not have an accurate
- * view of the free space but it's close enough to allow
- * us to make allocation decisions.
+ * Since we've cleared the histogram we need to add back
+ * any free space that has already been processed, plus
+ * any deferred space. This allows the on-disk histogram
+ * to accurately reflect all free space even if some space
+ * is not yet available for allocation (i.e. deferred).
*/
- space_map_histogram_add(msp->ms_sm, *freetree, tx);
+ space_map_histogram_add(msp->ms_sm, *freed_tree, tx);
+
+ /*
+ * Add back any deferred free space that has not been
+ * added back into the in-core free tree yet. This will
+ * ensure that we don't end up with a space map histogram
+ * that is completely empty unless the metaslab is fully
+ * allocated.
+ */
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ space_map_histogram_add(msp->ms_sm,
+ msp->ms_defertree[t], tx);
+ }
}
+
+ /*
+ * Always add the free space from this sync pass to the space
+ * map histogram. We want to make sure that the on-disk histogram
+ * accounts for all free space. If the space map is not loaded,
+ * then we will lose some accuracy but will correct it the next
+ * time we load the space map.
+ */
+ space_map_histogram_add(msp->ms_sm, *freetree, tx);
+
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
@@ -2010,6 +2311,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
mutex_exit(&msp->ms_lock);
@@ -2031,9 +2333,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
range_tree_t **freed_tree;
range_tree_t **defer_tree;
int64_t alloc_delta, defer_delta;
+ boolean_t defer_allowed = B_TRUE;
ASSERT(!vd->vdev_ishole);
@@ -2068,9 +2372,20 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
+ uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
+ metaslab_class_get_alloc(spa_normal_class(spa));
+ if (free_space <= spa_get_slop_space(spa)) {
+ defer_allowed = B_FALSE;
+ }
+
+ defer_delta = 0;
alloc_delta = space_map_alloc_delta(msp->ms_sm);
- defer_delta = range_tree_space(*freed_tree) -
- range_tree_space(*defer_tree);
+ if (defer_allowed) {
+ defer_delta = range_tree_space(*freed_tree) -
+ range_tree_space(*defer_tree);
+ } else {
+ defer_delta -= range_tree_space(*defer_tree);
+ }
vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
@@ -2091,7 +2406,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
*/
range_tree_vacate(*defer_tree,
msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
- range_tree_swap(freed_tree, defer_tree);
+ if (defer_allowed) {
+ range_tree_swap(freed_tree, defer_tree);
+ } else {
+ range_tree_vacate(*freed_tree,
+ msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+ }
space_map_update(msp->ms_sm);
@@ -2106,7 +2426,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
- if (msp->ms_loaded && msp->ms_access_txg < txg) {
+ /*
+ * Calculate the new weights before unloading any metaslabs.
+ * This will give us the most accurate weighting.
+ */
+ metaslab_group_sort(mg, msp, metaslab_weight(msp));
+
+ /*
+ * If the metaslab is loaded and we've not tried to load or allocate
+ * from it in 'metaslab_unload_delay' txgs, then unload it.
+ */
+ if (msp->ms_loaded &&
+ msp->ms_selected_txg + metaslab_unload_delay < txg) {
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
VERIFY0(range_tree_space(
msp->ms_alloctree[(txg + t) & TXG_MASK]));
@@ -2116,7 +2447,6 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
metaslab_unload(msp);
}
- metaslab_group_sort(mg, msp, metaslab_weight(msp));
mutex_exit(&msp->ms_lock);
}
@@ -2151,6 +2481,113 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
/*
* ==========================================================================
+ * Metaslab allocation tracing facility
+ * ==========================================================================
+ */
+kstat_t *metaslab_trace_ksp;
+kstat_named_t metaslab_trace_over_limit;
+
+void
+metaslab_alloc_trace_init(void)
+{
+ ASSERT(metaslab_alloc_trace_cache == NULL);
+ metaslab_alloc_trace_cache = kmem_cache_create(
+ "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
+ "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
+ if (metaslab_trace_ksp != NULL) {
+ metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
+ kstat_named_init(&metaslab_trace_over_limit,
+ "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
+ kstat_install(metaslab_trace_ksp);
+ }
+}
+
+void
+metaslab_alloc_trace_fini(void)
+{
+ if (metaslab_trace_ksp != NULL) {
+ kstat_delete(metaslab_trace_ksp);
+ metaslab_trace_ksp = NULL;
+ }
+ kmem_cache_destroy(metaslab_alloc_trace_cache);
+ metaslab_alloc_trace_cache = NULL;
+}
+
+/*
+ * Add an allocation trace element to the allocation tracing list.
+ */
+static void
+metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
+ metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
+{
+ if (!metaslab_trace_enabled)
+ return;
+
+ /*
+ * When the tracing list reaches its maximum we remove
+ * the second element in the list before adding a new one.
+ * By removing the second element we preserve the original
+ * entry as a clue to what allocations steps have already been
+ * performed.
+ */
+ if (zal->zal_size == metaslab_trace_max_entries) {
+ metaslab_alloc_trace_t *mat_next;
+#ifdef DEBUG
+ panic("too many entries in allocation list");
+#endif
+ atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+ zal->zal_size--;
+ mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
+ list_remove(&zal->zal_list, mat_next);
+ kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
+ }
+
+ metaslab_alloc_trace_t *mat =
+ kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
+ list_link_init(&mat->mat_list_node);
+ mat->mat_mg = mg;
+ mat->mat_msp = msp;
+ mat->mat_size = psize;
+ mat->mat_dva_id = dva_id;
+ mat->mat_offset = offset;
+ mat->mat_weight = 0;
+
+ if (msp != NULL)
+ mat->mat_weight = msp->ms_weight;
+
+ /*
+ * The list is part of the zio so locking is not required. Only
+ * a single thread will perform allocations for a given zio.
+ */
+ list_insert_tail(&zal->zal_list, mat);
+ zal->zal_size++;
+
+ ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
+}
+
+void
+metaslab_trace_init(zio_alloc_list_t *zal)
+{
+ list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
+ offsetof(metaslab_alloc_trace_t, mat_list_node));
+ zal->zal_size = 0;
+}
+
+void
+metaslab_trace_fini(zio_alloc_list_t *zal)
+{
+ metaslab_alloc_trace_t *mat;
+
+ while ((mat = list_remove_head(&zal->zal_list)) != NULL)
+ kmem_cache_free(metaslab_alloc_trace_cache, mat);
+ list_destroy(&zal->zal_list);
+ zal->zal_size = 0;
+}
+
+/*
+ * ==========================================================================
* Metaslab block operations
* ==========================================================================
*/
@@ -2199,13 +2636,48 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
}
static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
- uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+ uint64_t start;
+ range_tree_t *rt = msp->ms_tree;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+
+ VERIFY(!msp->ms_condensing);
+
+ start = mc->mc_ops->msop_alloc(msp, size);
+ if (start != -1ULL) {
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+
+ VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+ range_tree_remove(rt, start, size);
+
+ if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+ vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+ range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
+
+ /* Track the last successful allocation */
+ msp->ms_alloc_txg = txg;
+ metaslab_verify_space(msp, txg);
+ }
+
+ /*
+ * Now that we've attempted the allocation we need to update the
+ * metaslab's maximum block size since it may have changed.
+ */
+ msp->ms_max_size = metaslab_block_maxsize(msp);
+ return (start);
+}
+
+static uint64_t
+metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
+ uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
- spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
- avl_tree_t *t = &mg->mg_metaslab_tree;
uint64_t activation_weight;
uint64_t target_distance;
int i;
@@ -2218,20 +2690,39 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
}
}
+ metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
+ search->ms_weight = UINT64_MAX;
+ search->ms_start = 0;
for (;;) {
boolean_t was_active;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ avl_index_t idx;
mutex_enter(&mg->mg_lock);
- for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
- if (msp->ms_weight < asize) {
- spa_dbgmsg(spa, "%s: failed to meet weight "
- "requirement: vdev %llu, txg %llu, mg %p, "
- "msp %p, asize %llu, "
- "weight %llu", spa_name(spa),
- mg->mg_vd->vdev_id, txg,
- mg, msp, asize, msp->ms_weight);
- mutex_exit(&mg->mg_lock);
- return (-1ULL);
+
+ /*
+ * Find the metaslab with the highest weight that is less
+ * than what we've already tried. In the common case, this
+ * means that we will examine each metaslab at most once.
+ * Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock.
+ * If a metaslab is activated by another thread, and we fail
+ * to allocate from the metaslab we have selected, we may
+ * not try the newly-activated metaslab, and instead activate
+ * another metaslab. This is not optimal, but generally
+ * does not cause any problems (a possible exception being
+ * if every metaslab is completely full except for
+ * the newly-activated metaslab which we fail to examine).
+ */
+ msp = avl_find(t, search, &idx);
+ if (msp == NULL)
+ msp = avl_nearest(t, idx, AVL_AFTER);
+ for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+
+ if (!metaslab_should_allocate(msp, asize)) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL);
+ continue;
}
/*
@@ -2248,16 +2739,21 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
(space_map_allocated(msp->ms_sm) != 0 ? 0 :
min_distance >> 1);
- for (i = 0; i < d; i++)
+ for (i = 0; i < d; i++) {
if (metaslab_distance(msp, &dva[i]) <
target_distance)
break;
+ }
if (i == d)
break;
}
mutex_exit(&mg->mg_lock);
- if (msp == NULL)
+ if (msp == NULL) {
+ kmem_free(search, sizeof (*search));
return (-1ULL);
+ }
+ search->ms_weight = msp->ms_weight;
+ search->ms_start = msp->ms_start + 1;
mutex_enter(&msp->ms_lock);
@@ -2265,11 +2761,11 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
- * were blocked on the metaslab lock.
+ * were blocked on the metaslab lock. We check the
+ * active status first to see if we need to reselect
+ * a new metaslab.
*/
- if (msp->ms_weight < asize || (was_active &&
- !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
- activation_weight == METASLAB_WEIGHT_PRIMARY)) {
+ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -2286,6 +2782,21 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
mutex_exit(&msp->ms_lock);
continue;
}
+ msp->ms_selected_txg = txg;
+
+ /*
+ * Now that we have the lock, recheck to see if we should
+ * continue to use this metaslab for this allocation. The
+ * metaslab is now loaded so metaslab_should_allocate() can
+ * accurately determine if the allocation attempt should
+ * proceed.
+ */
+ if (!metaslab_should_allocate(msp, asize)) {
+ /* Passivate this metaslab and select a new one. */
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL);
+ goto next;
+ }
/*
* If this metaslab is currently condensing then pick again as
@@ -2293,50 +2804,131 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
* to disk.
*/
if (msp->ms_condensing) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_CONDENSING);
mutex_exit(&msp->ms_lock);
continue;
}
- if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
+ offset = metaslab_block_alloc(msp, asize, txg);
+ metaslab_trace_add(zal, mg, msp, asize, d, offset);
+
+ if (offset != -1ULL) {
+ /* Proactively passivate the metaslab, if needed */
+ metaslab_segment_may_passivate(msp);
break;
+ }
+next:
+ ASSERT(msp->ms_loaded);
+
+ /*
+ * We were unable to allocate from this metaslab so determine
+ * a new weight for this metaslab. Now that we have loaded
+ * the metaslab we can provide a better hint to the metaslab
+ * selector.
+ *
+ * For space-based metaslabs, we use the maximum block size.
+ * This information is only available when the metaslab
+ * is loaded and is more accurate than the generic free
+ * space weight that was calculated by metaslab_weight().
+ * This information allows us to quickly compare the maximum
+ * available allocation in the metaslab to the allocation
+ * size being requested.
+ *
+ * For segment-based metaslabs, determine the new weight
+ * based on the highest bucket in the range tree. We
+ * explicitly use the loaded segment weight (i.e. the range
+ * tree histogram) since it contains the space that is
+ * currently available for allocation and is accurate
+ * even within a sync pass.
+ */
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+ uint64_t weight = metaslab_block_maxsize(msp);
+ WEIGHT_SET_SPACEBASED(weight);
+ metaslab_passivate(msp, weight);
+ } else {
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ }
- metaslab_passivate(msp, metaslab_block_maxsize(msp));
+ /*
+ * We have just failed an allocation attempt, check
+ * that metaslab_should_allocate() agrees. Otherwise,
+ * we may end up in an infinite loop retrying the same
+ * metaslab.
+ */
+ ASSERT(!metaslab_should_allocate(msp, asize));
mutex_exit(&msp->ms_lock);
}
+ mutex_exit(&msp->ms_lock);
+ kmem_free(search, sizeof (*search));
+ return (offset);
+}
- if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
- vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
+ uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+{
+ uint64_t offset;
+ ASSERT(mg->mg_initialized);
- range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
- msp->ms_access_txg = txg + metaslab_unload_delay;
+ offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
+ min_distance, dva, d);
- mutex_exit(&msp->ms_lock);
+ mutex_enter(&mg->mg_lock);
+ if (offset == -1ULL) {
+ mg->mg_failed_allocations++;
+ metaslab_trace_add(zal, mg, NULL, asize, d,
+ TRACE_GROUP_FAILURE);
+ if (asize == SPA_GANGBLOCKSIZE) {
+ /*
+ * This metaslab group was unable to allocate
+ * the minimum gang block size so it must be out of
+ * space. We must notify the allocation throttle
+ * to start skipping allocation attempts to this
+ * metaslab group until more space becomes available.
+ * Note: this failure cannot be caused by the
+ * allocation throttle since the allocation throttle
+ * is only responsible for skipping devices and
+ * not failing block allocations.
+ */
+ mg->mg_no_free_space = B_TRUE;
+ }
+ }
+ mg->mg_allocations++;
+ mutex_exit(&mg->mg_lock);
return (offset);
}
/*
+ * If we have to write a ditto block (i.e. more than one DVA for a given BP)
+ * on the same vdev as an existing DVA of this BP, then try to allocate it
+ * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
+ * existing DVAs.
+ */
+int ditto_same_vdev_distance_shift = 3;
+
+/*
* Allocate a block for the specified i/o.
*/
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
- dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
+ dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
+ zio_alloc_list_t *zal)
{
metaslab_group_t *mg, *rotor;
vdev_t *vd;
- int dshift = 3;
- int all_zero;
- int zio_lock = B_FALSE;
- boolean_t allocatable;
- uint64_t asize;
- uint64_t distance;
+ boolean_t try_hard = B_FALSE;
ASSERT(!DVA_IS_VALID(&dva[d]));
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
- if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
+ if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
+ metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
return (SET_ERROR(ENOSPC));
+ }
/*
* Start at the rotor and loop through all mgs until we find something.
@@ -2393,15 +2985,16 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
rotor = mg;
top:
- all_zero = B_TRUE;
do {
+ boolean_t allocatable;
+
ASSERT(mg->mg_activation_count == 1);
vd = mg->mg_vd;
/*
* Don't allocate from faulted devices.
*/
- if (zio_lock) {
+ if (try_hard) {
spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
allocatable = vdev_allocatable(vd);
spa_config_exit(spa, SCL_ZIO, FTAG);
@@ -2416,63 +3009,54 @@ top:
* inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
- if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
+ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
allocatable = metaslab_group_allocatable(mg, rotor,
psize);
}
- if (!allocatable)
+ if (!allocatable) {
+ metaslab_trace_add(zal, mg, NULL, psize, d,
+ TRACE_NOT_ALLOCATABLE);
goto next;
+ }
ASSERT(mg->mg_initialized);
/*
- * Avoid writing single-copy data to a failing vdev.
+ * Avoid writing single-copy data to a failing,
+ * non-redundant vdev, unless we've already tried all
+ * other vdevs.
*/
if ((vd->vdev_stat.vs_write_errors > 0 ||
vd->vdev_state < VDEV_STATE_HEALTHY) &&
- d == 0 && dshift == 3 && vd->vdev_children == 0) {
- all_zero = B_FALSE;
+ d == 0 && !try_hard && vd->vdev_children == 0) {
+ metaslab_trace_add(zal, mg, NULL, psize, d,
+ TRACE_VDEV_ERROR);
goto next;
}
ASSERT(mg->mg_class == mc);
- distance = vd->vdev_asize >> dshift;
- if (distance <= (1ULL << vd->vdev_ms_shift))
- distance = 0;
- else
- all_zero = B_FALSE;
+ /*
+ * If we don't need to try hard, then require that the
+ * block be 1/8th of the device away from any other DVAs
+ * in this BP. If we are trying hard, allow any offset
+ * to be used (distance=0).
+ */
+ uint64_t distance = 0;
+ if (!try_hard) {
+ distance = vd->vdev_asize >>
+ ditto_same_vdev_distance_shift;
+ if (distance <= (1ULL << vd->vdev_ms_shift))
+ distance = 0;
+ }
- asize = vdev_psize_to_asize(vd, psize);
+ uint64_t asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
- uint64_t offset = metaslab_group_alloc(mg, asize, txg,
+ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
distance, dva, d);
- mutex_enter(&mg->mg_lock);
- if (offset == -1ULL) {
- mg->mg_failed_allocations++;
- if (asize == SPA_GANGBLOCKSIZE) {
- /*
- * This metaslab group was unable to allocate
- * the minimum gang block size so it must be
- * out of space. We must notify the allocation
- * throttle to start skipping allocation
- * attempts to this metaslab group until more
- * space becomes available.
- *
- * Note: this failure cannot be caused by the
- * allocation throttle since the allocation
- * throttle is only responsible for skipping
- * devices and not failing block allocations.
- */
- mg->mg_no_free_space = B_TRUE;
- }
- }
- mg->mg_allocations++;
- mutex_exit(&mg->mg_lock);
-
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
@@ -2524,20 +3108,17 @@ next:
mc->mc_aliquot = 0;
} while ((mg = mg->mg_next) != rotor);
- if (!all_zero) {
- dshift++;
- ASSERT(dshift < 64);
- goto top;
- }
-
- if (!allocatable && !zio_lock) {
- dshift = 3;
- zio_lock = B_TRUE;
+ /*
+ * If we haven't tried hard, do so now.
+ */
+ if (!try_hard) {
+ try_hard = B_TRUE;
goto top;
}
bzero(&dva[d], sizeof (dva_t));
+ metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
return (SET_ERROR(ENOSPC));
}
@@ -2586,6 +3167,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
range_tree_add(msp->ms_tree, offset, size);
+ msp->ms_max_size = metaslab_block_maxsize(msp);
} else {
if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
vdev_dirty(vd, VDD_METASLAB, msp, txg);
@@ -2703,7 +3285,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
- int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
+ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
+ zio_alloc_list_t *zal, zio_t *zio)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
@@ -2722,10 +3305,11 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
ASSERT(BP_GET_NDVAS(bp) == 0);
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+ ASSERT3P(zal, !=, NULL);
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
- txg, flags);
+ txg, flags, zal);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
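Before moving on to the spa.c change: the allocation tracing facility added above caps each zio's trace list at metaslab_trace_max_entries and, when full, drops the second-oldest record so that the very first record survives as a clue to how the allocation started. A minimal user-space sketch of that eviction policy follows; the types and names here are hypothetical stand-ins, not the kernel's list_t/kmem cache machinery.

/*
 * Sketch of the bounded trace-list policy used by metaslab_trace_add():
 * when full, remove the second-oldest entry and keep the first as a clue.
 */
#include <stdint.h>
#include <stdlib.h>

typedef struct trace_entry {
        struct trace_entry *te_next;
        uint64_t te_offset;             /* allocated offset or TRACE_* code */
} trace_entry_t;

typedef struct trace_list {
        trace_entry_t *tl_head;
        trace_entry_t *tl_tail;
        uint64_t tl_size;
        uint64_t tl_max;                /* cf. metaslab_trace_max_entries */
} trace_list_t;

static void
trace_add(trace_list_t *tl, uint64_t offset)
{
        if (tl->tl_size == tl->tl_max && tl->tl_head != NULL &&
            tl->tl_head->te_next != NULL) {
                /* Remove the second element; the first survives as a clue. */
                trace_entry_t *victim = tl->tl_head->te_next;
                tl->tl_head->te_next = victim->te_next;
                if (tl->tl_tail == victim)
                        tl->tl_tail = tl->tl_head;
                free(victim);
                tl->tl_size--;
        }

        trace_entry_t *te = calloc(1, sizeof (*te));
        te->te_offset = offset;
        if (tl->tl_tail != NULL)
                tl->tl_tail->te_next = te;
        else
                tl->tl_head = te;
        tl->tl_tail = te;
        tl->tl_size++;
}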
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 4b26b007a7..bb1ecaa4cc 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -1241,6 +1241,19 @@ spa_unload(spa_t *spa)
}
/*
+ * Even though vdev_free() also calls vdev_metaslab_fini, we need
+ * to call it earlier, before we wait for async i/o to complete.
+ * This ensures that there is no async metaslab prefetching, by
+ * calling taskq_wait(mg_taskq).
+ */
+ if (spa->spa_root_vdev != NULL) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
+ vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ }
+
+ /*
* Wait for any outstanding async I/O to complete.
*/
if (spa->spa_async_zio_root != NULL) {
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 7e00d9f42a..5882d18f41 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -1877,6 +1877,7 @@ spa_init(int mode)
refcount_init();
unique_init();
range_tree_init();
+ metaslab_alloc_trace_init();
zio_init();
dmu_init();
zil_init();
@@ -1899,6 +1900,7 @@ spa_fini(void)
zil_fini();
dmu_fini();
zio_fini();
+ metaslab_alloc_trace_fini();
range_tree_fini();
unique_fini();
refcount_fini();
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
index e0fe6ac260..0b3af50a11 100644
--- a/usr/src/uts/common/fs/zfs/space_map.c
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -170,7 +170,6 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
dmu_buf_will_dirty(sm->sm_dbuf, tx);
ASSERT(space_map_histogram_verify(sm, rt));
-
/*
* Transfer the content of the range tree histogram to the space
* map histogram. The space map histogram contains 32 buckets ranging
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index b1e9456f5a..ad42cf7bcc 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -122,11 +122,17 @@ typedef enum arc_flags
} arc_flags_t;
+typedef enum arc_buf_flags {
+ ARC_BUF_FLAG_SHARED = 1 << 0,
+ ARC_BUF_FLAG_COMPRESSED = 1 << 1
+} arc_buf_flags_t;
+
struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
kmutex_t b_evict_lock;
void *b_data;
+ arc_buf_flags_t b_flags;
};
typedef enum arc_buf_contents {
@@ -150,13 +156,21 @@ typedef enum arc_space_type {
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
-arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
- arc_buf_contents_t type);
-arc_buf_t *arc_loan_buf(spa_t *spa, int size);
+boolean_t arc_is_metadata(arc_buf_t *buf);
+enum zio_compress arc_get_compression(arc_buf_t *buf);
+int arc_decompress(arc_buf_t *buf);
+arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type,
+ int32_t size);
+arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag,
+ uint64_t psize, uint64_t lsize, enum zio_compress compression_type);
+arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size);
+arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type);
void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_destroy(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
+int arc_buf_lsize(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 3e3c654448..3304027ccc 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -48,6 +48,7 @@
#include <sys/inttypes.h>
#include <sys/cred.h>
#include <sys/fs/zfs.h>
+#include <sys/zio_compress.h>
#include <sys/zio_priority.h>
#ifdef __cplusplus
@@ -419,7 +420,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
#define WP_SPILL 0x4
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
- struct zio_prop *zp);
+ enum zio_compress compress_override, struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
@@ -753,8 +754,8 @@ int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
int dmu_xuio_cnt(struct xuio *uio);
struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
void dmu_xuio_clear(struct xuio *uio, int i);
-void xuio_stat_wbuf_copied();
-void xuio_stat_wbuf_nocopy();
+void xuio_stat_wbuf_copied(void);
+void xuio_stat_wbuf_nocopy(void);
extern boolean_t zfs_prefetch_disable;
extern int zfs_max_recordsize;
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
index 21d9cb4bb0..38b1b042e5 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
@@ -42,14 +42,15 @@ struct dmu_replay_record;
extern const char *recv_clone_name;
int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
- boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
+ boolean_t large_block_ok, boolean_t compressok, int outfd,
+ uint64_t resumeobj, uint64_t resumeoff,
struct vnode *vp, offset_t *off);
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
- uint64_t *sizep);
+ boolean_t stream_compressed, uint64_t *sizep);
int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
- uint64_t *sizep);
+ boolean_t stream_compressed, uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, boolean_t large_block_ok,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
int outfd, struct vnode *vp, offset_t *off);
typedef struct dmu_recv_cookie {
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index 22c67b48a9..cab7cbb10f 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -96,7 +96,9 @@ struct dsl_pool;
#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
+#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok"
#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
+#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok"
/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index f7271f08ad..82ed08c728 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -36,10 +36,12 @@
extern "C" {
#endif
+
typedef struct metaslab_ops {
- uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size);
+ uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
} metaslab_ops_t;
+
extern metaslab_ops_t *zfs_metaslab_ops;
int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
@@ -63,11 +65,16 @@ uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_DONT_THROTTLE 0x10
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
- blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
+ blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
+void metaslab_alloc_trace_init(void);
+void metaslab_alloc_trace_fini(void);
+void metaslab_trace_init(zio_alloc_list_t *);
+void metaslab_trace_fini(zio_alloc_list_t *);
+
metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
void metaslab_class_destroy(metaslab_class_t *);
int metaslab_class_validate(metaslab_class_t *);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index 1c8993aca5..c43f457b9f 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -42,6 +42,94 @@ extern "C" {
#endif
/*
+ * Metaslab allocation tracing record.
+ */
+typedef struct metaslab_alloc_trace {
+ list_node_t mat_list_node;
+ metaslab_group_t *mat_mg;
+ metaslab_t *mat_msp;
+ uint64_t mat_size;
+ uint64_t mat_weight;
+ uint32_t mat_dva_id;
+ uint64_t mat_offset;
+} metaslab_alloc_trace_t;
+
+/*
+ * Used by the metaslab allocation tracing facility to indicate
+ * error conditions. These errors are stored to the offset member
+ * of the metaslab_alloc_trace_t record and displayed by mdb.
+ */
+typedef enum trace_alloc_type {
+ TRACE_ALLOC_FAILURE = -1ULL,
+ TRACE_TOO_SMALL = -2ULL,
+ TRACE_FORCE_GANG = -3ULL,
+ TRACE_NOT_ALLOCATABLE = -4ULL,
+ TRACE_GROUP_FAILURE = -5ULL,
+ TRACE_ENOSPC = -6ULL,
+ TRACE_CONDENSING = -7ULL,
+ TRACE_VDEV_ERROR = -8ULL
+} trace_alloc_type_t;
+
+#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
+#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
+#define METASLAB_WEIGHT_TYPE (1ULL << 61)
+#define METASLAB_ACTIVE_MASK \
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+
+/*
+ * The metaslab weight is used to encode the amount of free space in a
+ * metaslab, such that the "best" metaslab appears first when sorting the
+ * metaslabs by weight. The weight (and therefore the "best" metaslab) can
+ * be determined in two different ways: by computing a weighted sum of all
+ * the free space in the metaslab (a space based weight) or by counting only
+ * the free segments of the largest size (a segment based weight). We prefer
+ * the segment based weight because it reflects how the free space is
+ * comprised, but we cannot always use it -- legacy pools do not have the
+ * space map histogram information necessary to determine the largest
+ * contiguous regions. Pools that have the space map histogram determine
+ * the segment weight by looking at each bucket in the histogram and
+ * determining the free space whose size in bytes is in the range:
+ * [2^i, 2^(i+1))
+ * We then encode the largest index, i, that contains regions into the
+ * segment-weighted value.
+ *
+ * Space-based weight:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * |PS1| weighted-free space |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * PS - indicates primary and secondary activation
+ * space - the fragmentation-weighted space
+ *
+ * Segment-based weight:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * |PS0| idx| count of segments in region |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * PS - indicates primary and secondary activation
+ * idx - index for the highest bucket in the histogram
+ * count - number of segments in the specified bucket
+ */
+#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2)
+#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x)
+
+#define WEIGHT_IS_SPACEBASED(weight) \
+ ((weight) == 0 || BF64_GET((weight), 61, 1))
+#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1)
+
+/*
+ * These macros are only applicable to segment-based weighting.
+ */
+#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6)
+#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x)
+#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55)
+#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x)
+
+/*
* A metaslab class encompasses a category of allocatable top-level vdevs.
* Each top-level vdev is associated with a metaslab group which defines
* the allocatable region for that vdev. Examples of these categories include
@@ -220,7 +308,6 @@ struct metaslab {
kmutex_t ms_lock;
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
- metaslab_ops_t *ms_ops;
uint64_t ms_id;
uint64_t ms_start;
uint64_t ms_size;
@@ -233,12 +320,27 @@ struct metaslab {
boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
+
+ /*
+ * We must hold both ms_lock and ms_group->mg_lock in order to
+ * modify ms_loaded.
+ */
boolean_t ms_loaded;
boolean_t ms_loading;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
- uint64_t ms_access_txg;
+ uint64_t ms_activation_weight; /* activation weight */
+
+ /*
+ * Updated whenever the metaslab is selected for loading or allocation.
+ * We use this value to determine how long the metaslab should
+ * stay cached.
+ */
+ uint64_t ms_selected_txg;
+
+ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */
+ uint64_t ms_max_size; /* maximum allocatable size */
/*
* The metaslab block allocators can optionally use a size-ordered
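To make the weight layout documented above concrete, the following illustrative sketch encodes a segment-based weight the same way metaslab_weight_from_spacemap() does: take the highest non-empty space map histogram bucket and fold its count and shifted index into the weight. BF64_SET is a simplified stand-in for the kernel macro and assumes the value fits in the field.

/*
 * Illustrative sketch only; not the kernel code.
 */
#include <stdint.h>

#define BF64_SET(x, low, len, v) \
        ((x) = (((x) & ~(((1ULL << (len)) - 1) << (low))) | \
        (((uint64_t)(v)) << (low))))

#define WEIGHT_SET_COUNT(w, v)  BF64_SET((w), 0, 55, (v))
#define WEIGHT_SET_INDEX(w, v)  BF64_SET((w), 55, 6, (v))
#define WEIGHT_SET_ACTIVE(w, v) BF64_SET((w), 62, 2, (v))

#define SPACE_MAP_HISTOGRAM_SIZE        32

static uint64_t
weight_from_histogram(const uint64_t hist[SPACE_MAP_HISTOGRAM_SIZE],
    int sm_shift)
{
        uint64_t weight = 0;

        for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
                if (hist[i] != 0) {
                        /* encode the highest non-empty bucket, offset by sm_shift */
                        WEIGHT_SET_COUNT(weight, hist[i]);
                        WEIGHT_SET_INDEX(weight, i + sm_shift);
                        WEIGHT_SET_ACTIVE(weight, 0);
                        break;
                }
        }
        return (weight);
}

For example, with sm_shift of 9 and the highest non-empty bucket at i = 8, the encoded index is 17, matching the 128K-256K segment range used in the earlier decode sketch.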
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
index 345d42aa28..3f50cddb6f 100644
--- a/usr/src/uts/common/fs/zfs/sys/refcount.h
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -103,7 +103,7 @@ typedef struct refcount {
atomic_add_64(&(src)->rc_count, -__tmp); \
atomic_add_64(&(dst)->rc_count, __tmp); \
}
-#define refcount_transfer_ownership(rc, current_holder, new_holder)
+#define refcount_transfer_ownership(rc, current_holder, new_holder) (void)0
#define refcount_held(rc, holder) ((rc)->rc_count > 0)
#define refcount_not_held(rc, holder) (B_TRUE)
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
index 3dd992bd25..06c73f3941 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
@@ -50,14 +50,15 @@ extern int zfs_flags;
extern boolean_t zfs_recover;
extern boolean_t zfs_free_leak_on_eio;
-#define ZFS_DEBUG_DPRINTF (1<<0)
-#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
-#define ZFS_DEBUG_DNODE_VERIFY (1<<2)
-#define ZFS_DEBUG_SNAPNAMES (1<<3)
-#define ZFS_DEBUG_MODIFY (1<<4)
-#define ZFS_DEBUG_SPA (1<<5)
-#define ZFS_DEBUG_ZIO_FREE (1<<6)
-#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7)
+#define ZFS_DEBUG_DPRINTF (1 << 0)
+#define ZFS_DEBUG_DBUF_VERIFY (1 << 1)
+#define ZFS_DEBUG_DNODE_VERIFY (1 << 2)
+#define ZFS_DEBUG_SNAPNAMES (1 << 3)
+#define ZFS_DEBUG_MODIFY (1 << 4)
+#define ZFS_DEBUG_SPA (1 << 5)
+#define ZFS_DEBUG_ZIO_FREE (1 << 6)
+#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7)
+#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
#ifdef ZFS_DEBUG
extern void __dprintf(const char *file, const char *func,
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index bc83f87483..d86e3b45f1 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -87,19 +87,22 @@ typedef enum drr_headertype {
#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2)
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16)
-#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17)
+#define DMU_BACKUP_FEATURE_LZ4 (1 << 17)
/* flag #18 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19)
#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
+/* flag #21 is reserved for a Delphix feature */
+#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
- DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \
DMU_BACKUP_FEATURE_RESUMING | \
- DMU_BACKUP_FEATURE_LARGE_BLOCKS)
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS | \
+ DMU_BACKUP_FEATURE_COMPRESSED)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@@ -152,6 +155,12 @@ typedef enum dmu_send_resume_token_version {
#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
+/* deal with compressed drr_write replay records */
+#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0)
+#define DRR_WRITE_PAYLOAD_SIZE(drrw) \
+ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \
+ (drrw)->drr_logical_size)
+
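For context, a receiver sizing the payload that follows a drr_write record applies exactly the rule above. A trimmed, stand-alone sketch (illustrative field subset only, not the full struct):

/* Illustrative only: trimmed drr_write fields and the payload-size rule. */
#include <stdint.h>

struct drr_write_min {
        uint64_t drr_logical_size;      /* uncompressed length of the block */
        uint8_t drr_compressiontype;    /* 0 means the payload is uncompressed */
        uint64_t drr_compressed_size;   /* nonzero only for compressed records */
};

static uint64_t
payload_size(const struct drr_write_min *drrw)
{
        if (drrw->drr_compressiontype != 0)
                return (drrw->drr_compressed_size);
        return (drrw->drr_logical_size);
}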
/*
* zfs ioctl command structure
*/
@@ -199,12 +208,16 @@ typedef struct dmu_replay_record {
dmu_object_type_t drr_type;
uint32_t drr_pad;
uint64_t drr_offset;
- uint64_t drr_length;
+ uint64_t drr_logical_size;
uint64_t drr_toguid;
uint8_t drr_checksumtype;
uint8_t drr_checksumflags;
- uint8_t drr_pad2[6];
- ddt_key_t drr_key; /* deduplication key */
+ uint8_t drr_compressiontype;
+ uint8_t drr_pad2[5];
+ /* deduplication key */
+ ddt_key_t drr_key;
+ /* only nonzero if drr_compressiontype is not 0 */
+ uint64_t drr_compressed_size;
/* content follows */
} drr_write;
struct drr_free {
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 873d38ec7f..7b5f0ccbf0 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -104,26 +104,6 @@ enum zio_checksum {
#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
#define ZIO_DEDUPDITTO_MIN 100
-enum zio_compress {
- ZIO_COMPRESS_INHERIT = 0,
- ZIO_COMPRESS_ON,
- ZIO_COMPRESS_OFF,
- ZIO_COMPRESS_LZJB,
- ZIO_COMPRESS_EMPTY,
- ZIO_COMPRESS_GZIP_1,
- ZIO_COMPRESS_GZIP_2,
- ZIO_COMPRESS_GZIP_3,
- ZIO_COMPRESS_GZIP_4,
- ZIO_COMPRESS_GZIP_5,
- ZIO_COMPRESS_GZIP_6,
- ZIO_COMPRESS_GZIP_7,
- ZIO_COMPRESS_GZIP_8,
- ZIO_COMPRESS_GZIP_9,
- ZIO_COMPRESS_ZLE,
- ZIO_COMPRESS_LZ4,
- ZIO_COMPRESS_FUNCTIONS
-};
-
/*
* The number of "legacy" compression functions which can be set on individual
* objects.
@@ -382,6 +362,11 @@ typedef int zio_pipe_stage_t(zio_t *zio);
#define ZIO_REEXECUTE_NOW 0x01
#define ZIO_REEXECUTE_SUSPEND 0x02
+typedef struct zio_alloc_list {
+ list_t zal_list;
+ uint64_t zal_size;
+} zio_alloc_list_t;
+
typedef struct zio_link {
zio_t *zl_parent;
zio_t *zl_child;
@@ -423,6 +408,8 @@ struct zio {
void *io_orig_data;
uint64_t io_size;
uint64_t io_orig_size;
+ /* io_lsize != io_orig_size iff this is a raw write */
+ uint64_t io_lsize;
/* Stuff for the vdev stack */
vdev_t *io_vd;
@@ -437,6 +424,7 @@ struct zio {
avl_node_t io_queue_node;
avl_node_t io_offset_node;
avl_node_t io_alloc_node;
+ zio_alloc_list_t io_alloc_list;
/* Internal pipeline state */
enum zio_flag io_flags;
@@ -478,11 +466,11 @@ extern zio_t *zio_root(spa_t *spa,
zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
- uint64_t size, zio_done_func_t *done, void *private,
+ uint64_t lsize, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, const zio_prop_t *zp,
+ void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
index f4cb84511a..0c1783b140 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_compress.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
@@ -25,17 +25,36 @@
*/
/*
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H
-#include <sys/zio.h>
-
#ifdef __cplusplus
extern "C" {
#endif
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
+ ZIO_COMPRESS_LZ4,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
/* Common signature for all zio compress functions. */
typedef size_t zio_compress_func_t(void *src, void *dst,
size_t s_len, size_t d_len, int);
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 823822e44c..c3cf3bc6d5 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -4450,6 +4450,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
boolean_t large_block_ok = (zc->zc_flags & 0x2);
+ boolean_t compressok = (zc->zc_flags & 0x4);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4497,7 +4498,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
}
}
- error = dmu_send_estimate(tosnap, fromsnap,
+ error = dmu_send_estimate(tosnap, fromsnap, compressok,
&zc->zc_objset_type);
if (fromsnap != NULL)
@@ -4511,7 +4512,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
- zc->zc_fromobj, embedok, large_block_ok,
+ zc->zc_fromobj, embedok, large_block_ok, compressok,
zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
@@ -5444,6 +5445,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
* (optional) "resume_object" and "resume_offset" -> (uint64)
* if present, resume send stream from specified object and offset.
* }
@@ -5460,6 +5463,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
int fd;
boolean_t largeblockok;
boolean_t embedok;
+ boolean_t compressok;
uint64_t resumeobj = 0;
uint64_t resumeoff = 0;
@@ -5471,6 +5475,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
@@ -5480,8 +5485,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
return (SET_ERROR(EBADF));
off = fp->f_offset;
- error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
- resumeobj, resumeoff, fp->f_vnode, &off);
+ error = dmu_send(snapname, fromname, embedok, largeblockok, compressok,
+ fd, resumeobj, resumeoff, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5496,6 +5501,12 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* (optional) "from" -> full snap or bookmark name to send an incremental
* from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
* }
*
* outnvl: {
@@ -5509,6 +5520,11 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
dsl_dataset_t *tosnap;
int error;
char *fromname;
+ /* LINTED E_FUNC_SET_NOT_USED */
+ boolean_t largeblockok;
+ /* LINTED E_FUNC_SET_NOT_USED */
+ boolean_t embedok;
+ boolean_t compressok;
uint64_t space;
error = dsl_pool_hold(snapname, FTAG, &dp);
@@ -5521,6 +5537,10 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
return (error);
}
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
+
error = nvlist_lookup_string(innvl, "from", &fromname);
if (error == 0) {
if (strchr(fromname, '@') != NULL) {
@@ -5533,7 +5553,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
if (error != 0)
goto out;
- error = dmu_send_estimate(tosnap, fromsnap, &space);
+ error = dmu_send_estimate(tosnap, fromsnap, compressok,
+ &space);
dsl_dataset_rele(fromsnap, FTAG);
} else if (strchr(fromname, '#') != NULL) {
/*
@@ -5548,7 +5569,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
if (error != 0)
goto out;
error = dmu_send_estimate_from_txg(tosnap,
- frombm.zbm_creation_txg, &space);
+ frombm.zbm_creation_txg, compressok, &space);
} else {
/*
* from is not properly formatted as a snapshot or
@@ -5559,7 +5580,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
}
} else {
// If estimating the size of a full send, use dmu_send_estimate
- error = dmu_send_estimate(tosnap, NULL, &space);
+ error = dmu_send_estimate(tosnap, NULL, compressok, &space);
}
fnvlist_add_uint64(outnvl, "space", space);
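The comment blocks above spell out the optional innvl keys for the send ioctls: "largeblockok", "embedok", and the new "compressok" are presence-only booleans whose values are ignored, while "resume_object" and "resume_offset" are uint64s. As a rough userland sketch (nvlist construction only, using libnvpair's fnvlist helpers; the real dispatch to the ioctl lives in libzfs_core), an option list carrying all of them might be built like this:

#include <libnvpair.h>

/*
 * Build the optional send arguments documented above.  For the *ok keys
 * only presence matters; the caller frees the list with fnvlist_free().
 */
static nvlist_t *
build_send_opts(uint64_t resumeobj, uint64_t resumeoff)
{
	nvlist_t *innvl = fnvlist_alloc();

	fnvlist_add_boolean(innvl, "largeblockok");
	fnvlist_add_boolean(innvl, "embedok");
	fnvlist_add_boolean(innvl, "compressok");	/* new with compressed send */
	fnvlist_add_uint64(innvl, "resume_object", resumeobj);
	fnvlist_add_uint64(innvl, "resume_offset", resumeoff);
	return (innvl);
}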
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 4921034c08..9eaea7666f 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -520,21 +520,23 @@ zio_timestamp_compare(const void *x1, const void *x2)
*/
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- void *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_type_t type, zio_priority_t priority, enum zio_flag flags,
- vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
- enum zio_stage stage, enum zio_stage pipeline)
+ void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
+ void *private, zio_type_t type, zio_priority_t priority,
+ enum zio_flag flags, vdev_t *vd, uint64_t offset,
+ const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
{
zio_t *zio;
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
- ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+ ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
ASSERT(vd || stage == ZIO_STAGE_OPEN);
+ IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0);
+
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
@@ -545,6 +547,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
offsetof(zio_link_t, zl_parent_node));
list_create(&zio->io_child_list, sizeof (zio_link_t),
offsetof(zio_link_t, zl_child_node));
+ metaslab_trace_init(&zio->io_alloc_list);
if (vd != NULL)
zio->io_child_type = ZIO_CHILD_VDEV;
@@ -577,7 +580,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_vd = vd;
zio->io_offset = offset;
zio->io_orig_data = zio->io_data = data;
- zio->io_orig_size = zio->io_size = size;
+ zio->io_orig_size = zio->io_size = psize;
+ zio->io_lsize = lsize;
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
@@ -606,6 +610,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
static void
zio_destroy(zio_t *zio)
{
+ metaslab_trace_fini(&zio->io_alloc_list);
list_destroy(&zio->io_parent_list);
list_destroy(&zio->io_child_list);
mutex_destroy(&zio->io_lock);
@@ -619,7 +624,7 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
{
zio_t *zio;
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
@@ -724,7 +729,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zfs_blkptr_verify(spa, bp);
zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
- data, size, done, private,
+ data, size, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
@@ -734,7 +739,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, const zio_prop_t *zp,
+ void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
@@ -751,7 +756,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zp->zp_copies > 0 &&
zp->zp_copies <= spa_max_replication(spa));
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
@@ -781,7 +786,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
{
zio_t *zio;
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
@@ -861,8 +866,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
stage |= ZIO_STAGE_ISSUE_ASYNC;
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
- NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
- NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
+ BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
return (zio);
}
@@ -895,8 +900,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
- done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
- NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
ASSERT0(zio->io_queued_timestamp);
return (zio);
@@ -910,7 +915,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
int c;
if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
@@ -938,9 +943,9 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
ASSERT3U(offset + size, <=, vd->vdev_psize);
- zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
- ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
- NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
@@ -959,9 +964,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
ASSERT3U(offset + size, <=, vd->vdev_psize);
- zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
- NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
@@ -1037,7 +1042,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}
- zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
+ zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
@@ -1059,7 +1064,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
ASSERT(vd->vdev_ops->vdev_op_leaf);
zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
- data, size, done, private, type, priority,
+ data, size, size, done, private, type, priority,
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
vd, offset, NULL,
ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
@@ -1088,8 +1093,11 @@ zio_shrink(zio_t *zio, uint64_t size)
* Note, BP_IS_RAIDZ() assumes no compression.
*/
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- if (!BP_IS_RAIDZ(zio->io_bp))
- zio->io_orig_size = zio->io_size = size;
+ if (!BP_IS_RAIDZ(zio->io_bp)) {
+ /* we are not doing a raw write */
+ ASSERT3U(zio->io_size, ==, zio->io_lsize);
+ zio->io_orig_size = zio->io_size = zio->io_lsize = size;
+ }
}
/*
@@ -1198,10 +1206,12 @@ zio_write_compress(zio_t *zio)
zio_prop_t *zp = &zio->io_prop;
enum zio_compress compress = zp->zp_compress;
blkptr_t *bp = zio->io_bp;
- uint64_t lsize = zio->io_size;
- uint64_t psize = lsize;
+ uint64_t lsize = zio->io_lsize;
+ uint64_t psize = zio->io_size;
int pass = 1;
+ EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0);
+
/*
* If our children haven't all reached the ready stage,
* wait for them and then repeat this pipeline stage.
@@ -1250,7 +1260,8 @@ zio_write_compress(zio_t *zio)
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
- if (compress != ZIO_COMPRESS_OFF) {
+ /* If it's a compressed write that is not raw, compress the buffer. */
+ if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
void *cbuf = zio_buf_alloc(lsize);
psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
if (psize == 0 || psize == lsize) {
@@ -1301,6 +1312,8 @@ zio_write_compress(zio_t *zio)
zio->io_bp_override = NULL;
*bp = zio->io_bp_orig;
zio->io_pipeline = zio->io_orig_pipeline;
+ } else {
+ ASSERT3U(psize, !=, 0);
}
/*
@@ -2104,7 +2117,8 @@ zio_write_gang_block(zio_t *pio)
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
- bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio);
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
+ &pio->io_alloc_list, pio);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2159,8 +2173,8 @@ zio_write_gang_block(zio_t *pio)
zp.zp_nopwrite = B_FALSE;
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
- (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
- zio_write_gang_member_ready, NULL, NULL, NULL,
+ (char *)pio->io_data + (pio->io_size - resid), lsize, lsize,
+ &zp, zio_write_gang_member_ready, NULL, NULL, NULL,
&gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
@@ -2365,6 +2379,10 @@ static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
spa_t *spa = zio->io_spa;
+ boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW);
+
+ /* We should never get a raw, override zio */
+ ASSERT(!(zio->io_bp_override && do_raw));
/*
* Note: we compare the original data, not the transformed data,
@@ -2388,6 +2406,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (ddp->ddp_phys_birth != 0) {
arc_buf_t *abuf = NULL;
arc_flags_t aflags = ARC_FLAG_WAIT;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
blkptr_t blk = *zio->io_bp;
int error;
@@ -2395,10 +2414,26 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
ddt_exit(ddt);
+ /*
+ * Intuitively, it would make more sense to compare
+ * io_data than io_orig_data in the raw case since you
+ * don't want to look at any transformations that have
+ * happened to the data. However, for raw I/Os the
+ * data will actually be the same in io_data and
+ * io_orig_data, so all we have to do is issue this as
+ * a raw ARC read.
+ */
+ if (do_raw) {
+ zio_flags |= ZIO_FLAG_RAW;
+ ASSERT3U(zio->io_size, ==, zio->io_orig_size);
+ ASSERT0(bcmp(zio->io_data, zio->io_orig_data,
+ zio->io_size));
+ ASSERT3P(zio->io_transform_stack, ==, NULL);
+ }
+
error = arc_read(NULL, spa, &blk,
arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &zio->io_bookmark);
+ zio_flags, &aflags, &zio->io_bookmark);
if (error == 0) {
if (arc_buf_size(abuf) != zio->io_orig_size ||
@@ -2513,6 +2548,7 @@ zio_ddt_write(zio_t *zio)
ASSERT(BP_GET_DEDUP(bp));
ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+ ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE);
@@ -2533,7 +2569,9 @@ zio_ddt_write(zio_t *zio)
BP_ZERO(bp);
} else {
zp->zp_dedup = B_FALSE;
+ BP_SET_DEDUP(bp, B_FALSE);
}
+ ASSERT(!BP_GET_DEDUP(bp));
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (ZIO_PIPELINE_CONTINUE);
@@ -2566,7 +2604,7 @@ zio_ddt_write(zio_t *zio)
}
dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
- zio->io_orig_size, &czp, NULL, NULL,
+ zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
@@ -2588,7 +2626,7 @@ zio_ddt_write(zio_t *zio)
ddt_phys_addref(ddp);
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
- zio->io_orig_size, zp,
+ zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
@@ -2757,7 +2795,8 @@ zio_dva_allocate(zio_t *zio)
}
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio);
if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
@@ -2821,18 +2860,24 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
uint64_t size, boolean_t use_slog)
{
int error = 1;
+ zio_alloc_list_t io_alloc_list;
ASSERT(txg > spa_syncing_txg(spa));
+ metaslab_trace_init(&io_alloc_list);
+
if (use_slog) {
error = metaslab_alloc(spa, spa_log_class(spa), size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
+ new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
+ &io_alloc_list, NULL);
}
if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
+ new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
+ &io_alloc_list, NULL);
}
+ metaslab_trace_fini(&io_alloc_list);
if (error == 0) {
BP_SET_LSIZE(new_bp, size);
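Two related patterns recur in the zio.c hunks above: every metaslab_alloc() call now threads a zio_alloc_list_t through for allocation tracing, and zio_alloc_zil() shows the full shape of it — initialize the trace list, try the log (slog) class, fall back to the normal class on failure, and tear the list down regardless of the outcome. A toy userland analogue of that control flow (the list and allocator below are made up, not the kernel API):

#include <stdio.h>
#include <string.h>

/* Toy stand-ins for zio_alloc_list_t and metaslab_alloc(). */
typedef struct {
	int attempts;
} trace_list_t;

static void trace_init(trace_list_t *tl) { tl->attempts = 0; }
static void trace_fini(trace_list_t *tl) { tl->attempts = 0; }

static int
toy_alloc(const char *class, trace_list_t *tl)
{
	tl->attempts++;		/* every attempt is recorded for debugging */
	return (strcmp(class, "log") == 0 ? 1 : 0);	/* pretend the slog is full */
}

int
main(void)
{
	trace_list_t tl;
	int error;

	trace_init(&tl);
	error = toy_alloc("log", &tl);			/* preferred class first */
	if (error != 0)
		error = toy_alloc("normal", &tl);	/* fall back */
	(void) printf("attempts=%d error=%d\n", tl.attempts, error);
	trace_fini(&tl);				/* always torn down */
	return (error);
}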
diff --git a/usr/src/uts/common/nfs/export.h b/usr/src/uts/common/nfs/export.h
index b6d223627d..66b86cdf8f 100644
--- a/usr/src/uts/common/nfs/export.h
+++ b/usr/src/uts/common/nfs/export.h
@@ -539,7 +539,7 @@ typedef struct secinfo secinfo_t;
* a real export at the mount point (VROOT) which has a subtree shared
* has a visible list.
*
- * The exi_visible field is NULL for normal, non=pseudo filesystems
+ * The exi_visible field is NULL for normal, non-pseudo filesystems
* which do not have any subtree exported. If the field is non-null,
* it points to a list of visible entries, identified by vis_fid and/or
* vis_ino. The presence of a "visible" list means that if this export
@@ -568,6 +568,7 @@ struct exp_visible {
struct exp_visible *vis_next;
struct secinfo *vis_secinfo;
int vis_seccnt;
+ timespec_t vis_change;
};
typedef struct exp_visible exp_visible_t;
@@ -635,7 +636,8 @@ extern exportinfo_t *vis2exi(treenode_t *);
extern int treeclimb_export(struct exportinfo *);
extern void treeclimb_unexport(struct exportinfo *);
extern int nfs_visible(struct exportinfo *, vnode_t *, int *);
-extern int nfs_visible_inode(struct exportinfo *, ino64_t, int *);
+extern int nfs_visible_inode(struct exportinfo *, ino64_t,
+ struct exp_visible **);
extern int has_visible(struct exportinfo *, vnode_t *);
extern void free_visible(struct exp_visible *);
extern int nfs_exported(struct exportinfo *, vnode_t *);
@@ -643,6 +645,9 @@ extern struct exportinfo *pseudo_exportfs(vnode_t *, fid_t *,
struct exp_visible *, struct exportdata *);
extern int vop_fid_pseudo(vnode_t *, fid_t *);
extern int nfs4_vget_pseudo(struct exportinfo *, vnode_t **, fid_t *);
+extern bool_t nfs_visible_change(struct exportinfo *, vnode_t *,
+ timespec_t *);
+extern void tree_update_change(treenode_t *, timespec_t *);
/*
* Functions that handle the NFSv4 server namespace security flavors
* information.
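The export.h hunks give each visible entry a change timestamp (vis_change) plus helpers to query and propagate it (nfs_visible_change(), tree_update_change()), and nfs_visible_inode() now hands back the exp_visible entry itself. The propagation policy lives in NFS server code not shown here; purely as an assumption for illustration, a "keep the newest timespec" update would look like:

#include <stdio.h>
#include <time.h>

/* Return nonzero if a is strictly later than b. */
static int
timespec_later(const struct timespec *a, const struct timespec *b)
{
	return (a->tv_sec > b->tv_sec ||
	    (a->tv_sec == b->tv_sec && a->tv_nsec > b->tv_nsec));
}

int
main(void)
{
	struct timespec vis_change = { 0, 0 };
	struct timespec update = { 100, 5 };

	if (timespec_later(&update, &vis_change))
		vis_change = update;	/* remember the newer change time */

	(void) printf("vis_change = %lld.%09ld\n",
	    (long long)vis_change.tv_sec, vis_change.tv_nsec);
	return (0);
}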
diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c
index 845b93da6e..e87e759563 100644
--- a/usr/src/uts/common/os/timer.c
+++ b/usr/src/uts/common/os/timer.c
@@ -140,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
it->it_backend->clk_timer_delete(it);
- if (it->it_portev) {
+ if (it->it_flags & IT_PORT) {
mutex_enter(&it->it_mutex);
if (it->it_portev) {
port_kevent_t *pev;
@@ -237,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid)
* should not be held on entry; timer_release() will acquire p_lock but
* will drop it before returning.
*/
-static void
+void
timer_release(proc_t *p, itimer_t *it)
{
mutex_enter(&p->p_lock);
@@ -250,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it)
* p_lock should not be held on entry; timer_delete_grabbed() will acquire
* p_lock, but will drop it before returning.
*/
-static void
+void
timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it)
{
mutex_enter(&p->p_lock);
@@ -465,6 +465,9 @@ timer_fire(itimer_t *it)
it->it_pending = 1;
port_send_event((port_kevent_t *)it->it_portev);
mutex_exit(&it->it_mutex);
+ } else if (it->it_flags & IT_CALLBACK) {
+ it->it_cb_func(it);
+ ASSERT(MUTEX_NOT_HELD(&it->it_mutex));
} else if (it->it_flags & IT_SIGNAL) {
it->it_pending = 1;
mutex_exit(&it->it_mutex);
@@ -580,85 +583,27 @@ retry:
return (it);
}
+/*
+ * Set up a timer
+ *
+ * This allocates an itimer_t (including a timer_t ID and slot in the process),
+ * wires it up according to the provided sigevent, and associates it with the
+ * desired clock backend. Upon successful completion, the timer will be
+ * locked, preventing it from being armed via timer_settime() or deleted via
+ * timer_delete(). This gives the caller a chance to perform any last-minute
+ * manipulations (such as configuring the IT_CALLBACK functionality and/or
+ * copying the timer_t out to userspace) before using timer_release() to unlock
+ * it or timer_delete_grabbed() to delete it.
+ */
int
-timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
+timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp,
+ itimer_t **itp, timer_t *tidp)
{
- struct sigevent ev;
proc_t *p = curproc;
- clock_backend_t *backend;
+ int error = 0;
itimer_t *it;
sigqueue_t *sigq;
- cred_t *cr = CRED();
- int error = 0;
- timer_t i;
- port_notify_t tim_pnevp;
- port_kevent_t *pkevp = NULL;
-
- if ((backend = CLOCK_BACKEND(clock)) == NULL)
- return (set_errno(EINVAL));
-
- if (evp != NULL) {
- /*
- * short copyin() for binary compatibility
- * fetch oldsigevent to determine how much to copy in.
- */
- if (get_udatamodel() == DATAMODEL_NATIVE) {
- if (copyin(evp, &ev, sizeof (struct oldsigevent)))
- return (set_errno(EFAULT));
-
- if (ev.sigev_notify == SIGEV_PORT ||
- ev.sigev_notify == SIGEV_THREAD) {
- if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp,
- sizeof (port_notify_t)))
- return (set_errno(EFAULT));
- }
-#ifdef _SYSCALL32_IMPL
- } else {
- struct sigevent32 ev32;
- port_notify32_t tim_pnevp32;
-
- if (copyin(evp, &ev32, sizeof (struct oldsigevent32)))
- return (set_errno(EFAULT));
- ev.sigev_notify = ev32.sigev_notify;
- ev.sigev_signo = ev32.sigev_signo;
- /*
- * See comment in sigqueue32() on handling of 32-bit
- * sigvals in a 64-bit kernel.
- */
- ev.sigev_value.sival_int = ev32.sigev_value.sival_int;
- if (ev.sigev_notify == SIGEV_PORT ||
- ev.sigev_notify == SIGEV_THREAD) {
- if (copyin((void *)(uintptr_t)
- ev32.sigev_value.sival_ptr,
- (void *)&tim_pnevp32,
- sizeof (port_notify32_t)))
- return (set_errno(EFAULT));
- tim_pnevp.portnfy_port =
- tim_pnevp32.portnfy_port;
- tim_pnevp.portnfy_user =
- (void *)(uintptr_t)tim_pnevp32.portnfy_user;
- }
-#endif
- }
- switch (ev.sigev_notify) {
- case SIGEV_NONE:
- break;
- case SIGEV_SIGNAL:
- if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG)
- return (set_errno(EINVAL));
- break;
- case SIGEV_THREAD:
- case SIGEV_PORT:
- break;
- default:
- return (set_errno(EINVAL));
- }
- } else {
- /*
- * Use the clock's default sigevent (this is a structure copy).
- */
- ev = backend->clk_default;
- }
+ timer_t tid;
/*
* We'll allocate our sigqueue now, before we grab p_lock.
@@ -669,31 +614,29 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
/*
* Allocate a timer and choose a slot for it. This acquires p_lock.
*/
- it = timer_alloc(p, &i);
+ it = timer_alloc(p, &tid);
ASSERT(MUTEX_HELD(&p->p_lock));
if (it == NULL) {
mutex_exit(&p->p_lock);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(EAGAIN));
+ return (EAGAIN);
}
- ASSERT(i < p->p_itimer_sz && p->p_itimer[i] == NULL);
+ ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL);
+ ASSERT(evp != NULL);
/*
* If we develop other notification mechanisms, this will need
* to call into (yet another) backend.
*/
- sigq->sq_info.si_signo = ev.sigev_signo;
- if (evp == NULL)
- sigq->sq_info.si_value.sival_int = i;
- else
- sigq->sq_info.si_value = ev.sigev_value;
+ sigq->sq_info.si_signo = evp->sigev_signo;
+ sigq->sq_info.si_value = evp->sigev_value;
sigq->sq_info.si_code = SI_TIMER;
sigq->sq_info.si_pid = p->p_pid;
sigq->sq_info.si_ctid = PRCTID(p);
sigq->sq_info.si_zoneid = getzoneid();
- sigq->sq_info.si_uid = crgetruid(cr);
+ sigq->sq_info.si_uid = crgetruid(CRED());
sigq->sq_func = timer_signal;
sigq->sq_next = NULL;
sigq->sq_backptr = it;
@@ -701,9 +644,12 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
it->it_backend = backend;
it->it_lock = ITLK_LOCKED;
- if (ev.sigev_notify == SIGEV_THREAD ||
- ev.sigev_notify == SIGEV_PORT) {
+ if (evp->sigev_notify == SIGEV_THREAD ||
+ evp->sigev_notify == SIGEV_PORT) {
int port;
+ port_kevent_t *pkevp = NULL;
+
+ ASSERT(pnp != NULL);
/*
* This timer is programmed to use event port notification when
@@ -723,7 +669,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
*/
it->it_flags |= IT_PORT;
- port = tim_pnevp.portnfy_port;
+ port = pnp->portnfy_port;
/* associate timer as event source with the port */
error = port_associate_ksource(port, PORT_SOURCE_TIMER,
@@ -733,7 +679,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
mutex_exit(&p->p_lock);
kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(error));
+ return (error);
}
/* allocate an event structure/slot */
@@ -745,21 +691,21 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
mutex_exit(&p->p_lock);
kmem_cache_free(clock_timer_cache, it);
kmem_free(sigq, sizeof (sigqueue_t));
- return (set_errno(error));
+ return (error);
}
/* initialize event data */
- port_init_event(pkevp, i, tim_pnevp.portnfy_user,
+ port_init_event(pkevp, tid, pnp->portnfy_user,
timer_port_callback, it);
it->it_portev = pkevp;
it->it_portfd = port;
} else {
- if (ev.sigev_notify == SIGEV_SIGNAL)
+ if (evp->sigev_notify == SIGEV_SIGNAL)
it->it_flags |= IT_SIGNAL;
}
/* Populate the slot now that the timer is prepped. */
- p->p_itimer[i] = it;
+ p->p_itimer[tid] = it;
mutex_exit(&p->p_lock);
/*
@@ -772,17 +718,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid)
it->it_lwp = ttolwp(curthread);
it->it_proc = p;
- if (copyout(&i, tid, sizeof (timer_t)) != 0) {
- error = EFAULT;
- goto err;
- }
-
- /*
- * If we're here, then we have successfully created the timer; we
- * just need to release the timer and return.
- */
- timer_release(p, it);
-
+ *itp = it;
+ *tidp = tid;
return (0);
err:
@@ -793,11 +730,115 @@ err:
* impossible for a removal to be pending.
*/
ASSERT(!(it->it_lock & ITLK_REMOVE));
- timer_delete_grabbed(p, i, it);
+ timer_delete_grabbed(p, tid, it);
+
+ return (error);
+}
+
- return (set_errno(error));
+int
+timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp)
+{
+ int error = 0;
+ proc_t *p = curproc;
+ clock_backend_t *backend;
+ struct sigevent ev;
+ itimer_t *it;
+ timer_t tid;
+ port_notify_t tim_pnevp;
+
+ if ((backend = CLOCK_BACKEND(clock)) == NULL)
+ return (set_errno(EINVAL));
+
+ if (evp != NULL) {
+ /*
+ * short copyin() for binary compatibility
+ * fetch oldsigevent to determine how much to copy in.
+ */
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(evp, &ev, sizeof (struct oldsigevent)))
+ return (set_errno(EFAULT));
+
+ if (ev.sigev_notify == SIGEV_PORT ||
+ ev.sigev_notify == SIGEV_THREAD) {
+ if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp,
+ sizeof (port_notify_t)))
+ return (set_errno(EFAULT));
+ }
+#ifdef _SYSCALL32_IMPL
+ } else {
+ struct sigevent32 ev32;
+ port_notify32_t tim_pnevp32;
+
+ if (copyin(evp, &ev32, sizeof (struct oldsigevent32)))
+ return (set_errno(EFAULT));
+ ev.sigev_notify = ev32.sigev_notify;
+ ev.sigev_signo = ev32.sigev_signo;
+ /*
+ * See comment in sigqueue32() on handling of 32-bit
+ * sigvals in a 64-bit kernel.
+ */
+ ev.sigev_value.sival_int = ev32.sigev_value.sival_int;
+ if (ev.sigev_notify == SIGEV_PORT ||
+ ev.sigev_notify == SIGEV_THREAD) {
+ if (copyin((void *)(uintptr_t)
+ ev32.sigev_value.sival_ptr,
+ (void *)&tim_pnevp32,
+ sizeof (port_notify32_t)))
+ return (set_errno(EFAULT));
+ tim_pnevp.portnfy_port =
+ tim_pnevp32.portnfy_port;
+ tim_pnevp.portnfy_user =
+ (void *)(uintptr_t)tim_pnevp32.portnfy_user;
+ }
+#endif
+ }
+ switch (ev.sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG)
+ return (set_errno(EINVAL));
+ break;
+ case SIGEV_THREAD:
+ case SIGEV_PORT:
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+ } else {
+ /*
+ * Use the clock's default sigevent (this is a structure copy).
+ */
+ ev = backend->clk_default;
+ }
+
+ if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) {
+ return (set_errno(error));
+ }
+
+ /*
+ * Populate si_value with the timer ID if no sigevent was passed in.
+ */
+ if (evp == NULL) {
+ it->it_sigq->sq_info.si_value.sival_int = tid;
+ }
+
+ if (copyout(&tid, tidp, sizeof (timer_t)) != 0) {
+ timer_delete_grabbed(p, tid, it);
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * If we're here, then we have successfully created the timer; we
+ * just need to release the timer and return.
+ */
+ timer_release(p, it);
+
+ return (0);
}
+
int
timer_gettime(timer_t tid, itimerspec_t *val)
{
@@ -1065,7 +1106,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose)
for (tid = 0; tid < timer_max; tid++) {
if ((it = timer_grab(p, tid)) == NULL)
continue;
- if (it->it_portev) {
+ if (it->it_flags & IT_PORT) {
mutex_enter(&it->it_mutex);
if (it->it_portfd == port) {
port_kevent_t *pev;
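The timer.c rework above factors the old timer_create() body into timer_setup() — allocate the itimer_t, wire up the requested notification, and hand it back still locked — plus a thin timer_create() wrapper that keeps doing the sigevent copyin, the timer-ID copyout, and the set_errno() translation. The user-visible timer_create(3C) contract is unchanged; as a reminder of that contract, a small userland example using signal notification (standard POSIX API, not the kernel code above; link with -lrt where required):

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static void
on_timer(int sig)
{
	(void) sig;
	(void) write(STDOUT_FILENO, "timer fired\n", 12);
}

int
main(void)
{
	struct sigevent ev = { 0 };
	struct itimerspec its = { 0 };
	timer_t tid;

	(void) signal(SIGUSR1, on_timer);

	ev.sigev_notify = SIGEV_SIGNAL;		/* deliver SIGUSR1 on expiry */
	ev.sigev_signo = SIGUSR1;
	if (timer_create(CLOCK_REALTIME, &ev, &tid) != 0) {
		perror("timer_create");
		return (1);
	}

	its.it_value.tv_sec = 1;		/* one-shot, fires after 1s */
	if (timer_settime(tid, 0, &its, NULL) != 0) {
		perror("timer_settime");
		return (1);
	}

	(void) pause();				/* wait for the signal */
	(void) timer_delete(tid);
	return (0);
}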
diff --git a/usr/src/uts/common/sys/timer.h b/usr/src/uts/common/sys/timer.h
index 688a381ecc..748e0c0627 100644
--- a/usr/src/uts/common/sys/timer.h
+++ b/usr/src/uts/common/sys/timer.h
@@ -35,6 +35,8 @@
#include <sys/proc.h>
#include <sys/thread.h>
#include <sys/param.h>
+#include <sys/siginfo.h>
+#include <sys/port.h>
#ifdef __cplusplus
extern "C" {
@@ -63,6 +65,7 @@ extern int timer_max;
*/
#define IT_SIGNAL 0x01
#define IT_PORT 0x02 /* use event port notification */
+#define IT_CALLBACK 0x04 /* custom callback function */
struct clock_backend;
@@ -90,14 +93,27 @@ struct itimer {
struct clock_backend *it_backend;
void (*it_fire)(itimer_t *);
kmutex_t it_mutex;
- void *it_portev; /* port_kevent_t pointer */
- void *it_portsrc; /* port_source_t pointer */
- int it_portfd; /* port file descriptor */
+ union {
+ struct {
+ void *_it_portev; /* port_kevent_t pointer */
+ void *_it_portsrc; /* port_source_t pointer */
+ int _it_portfd; /* port file descriptor */
+ } _it_ev_port;
+ struct {
+ void (*_it_cb_func)(itimer_t *);
+ uintptr_t _it_cb_data[2];
+ } _it_ev_cb;
+ } _it_ev_data;
};
#define it_sigq __data.__proc.__it_sigq
#define it_lwp __data.__proc.__it_lwp
#define it_frontend __data.__it_frontend
+#define it_portev _it_ev_data._it_ev_port._it_portev
+#define it_portsrc _it_ev_data._it_ev_port._it_portsrc
+#define it_portfd _it_ev_data._it_ev_port._it_portfd
+#define it_cb_func _it_ev_data._it_ev_cb._it_cb_func
+#define it_cb_data _it_ev_data._it_ev_cb._it_cb_data
typedef struct clock_backend {
struct sigevent clk_default;
@@ -114,7 +130,11 @@ typedef struct clock_backend {
extern void clock_add_backend(clockid_t clock, clock_backend_t *backend);
extern clock_backend_t *clock_get_backend(clockid_t clock);
+extern void timer_release(struct proc *, itimer_t *);
+extern void timer_delete_grabbed(struct proc *, timer_t tid, itimer_t *it);
extern void timer_lwpbind();
+extern int timer_setup(clock_backend_t *, struct sigevent *, port_notify_t *,
+ itimer_t **, timer_t *);
extern void timer_func(sigqueue_t *);
extern void timer_exit(void);
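In the timer.h hunk above, the event-port fields and the new IT_CALLBACK function/argument fields become arms of a union (a given timer uses only one notification mechanism), and #defines keep the original it_portev/it_portsrc/it_portfd spellings working. A standalone illustration of that union-plus-accessor-macro pattern, with made-up names:

#include <stdio.h>

typedef struct toy_timer {
	int	tt_flags;
	union {
		struct {
			void	*_tt_portev;
			int	_tt_portfd;
		} _tt_port;
		struct {
			void	(*_tt_cb_func)(struct toy_timer *);
		} _tt_cb;
	} _tt_ev;
} toy_timer_t;

#define	tt_portev	_tt_ev._tt_port._tt_portev
#define	tt_portfd	_tt_ev._tt_port._tt_portfd
#define	tt_cb_func	_tt_ev._tt_cb._tt_cb_func

#define	TT_PORT		0x02
#define	TT_CALLBACK	0x04

static void
ding(struct toy_timer *t)
{
	(void) t;
	(void) printf("callback fired\n");
}

int
main(void)
{
	toy_timer_t t = { 0 };

	t.tt_flags = TT_CALLBACK;	/* the flags say which union arm is live */
	t.tt_cb_func = ding;

	if (t.tt_flags & TT_CALLBACK)
		t.tt_cb_func(&t);
	return (0);
}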
diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files
index aeddaa9203..5e819809fd 100644
--- a/usr/src/uts/intel/Makefile.files
+++ b/usr/src/uts/intel/Makefile.files
@@ -346,6 +346,7 @@ LX_BRAND_OBJS = \
lx_pipe.o \
lx_poll.o \
lx_prctl.o \
+ lx_priority.o \
lx_ptrace.o \
lx_rename.o \
lx_rlimit.o \