-rw-r--r--  usr/src/Makefile.master | 2
-rw-r--r--  usr/src/cmd/sgs/libld/common/unwind.c | 21
-rw-r--r--  usr/src/cmd/zdb/zdb.c | 59
-rw-r--r--  usr/src/cmd/zfs/zfs_main.c | 11
-rw-r--r--  usr/src/cmd/zstreamdump/zstreamdump.c | 19
-rw-r--r--  usr/src/cmd/ztest/ztest.c | 14
-rw-r--r--  usr/src/common/zfs/zfeature_common.c | 12
-rw-r--r--  usr/src/common/zfs/zfeature_common.h | 1
-rw-r--r--  usr/src/common/zfs/zfs_prop.c | 4
-rw-r--r--  usr/src/common/zfs/zpool_prop.c | 2
-rw-r--r--  usr/src/grub/grub-0.97/stage2/fsys_zfs.c | 12
-rw-r--r--  usr/src/grub/grub-0.97/stage2/zfs-include/spa.h | 7
-rw-r--r--  usr/src/lib/libzfs/common/libzfs.h | 3
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_dataset.c | 38
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_sendrecv.c | 15
-rw-r--r--  usr/src/lib/libzfs_core/common/libzfs_core.c | 6
-rw-r--r--  usr/src/lib/libzfs_core/common/libzfs_core.h | 3
-rw-r--r--  usr/src/lib/libzpool/common/taskq.c | 4
-rw-r--r--  usr/src/man/man1m/zfs.1m | 42
-rw-r--r--  usr/src/man/man5/zpool-features.5 | 28
-rw-r--r--  usr/src/uts/common/fs/zfs/bpobj.c | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/bptree.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c | 16
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_send.c | 83
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_traverse.c | 41
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c | 24
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode.c | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c | 105
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_deadlist.c | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_destroy.c | 7
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sa.c | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c | 26
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_history.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_objset.h | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_send.h | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dataset.h | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h | 22
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zap_impl.h | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_znode.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zil.h | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zil_impl.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_file.c | 12
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_mirror.c | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_missing.c | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/zap_micro.c | 16
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c | 54
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_log.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_znode.c | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c | 32
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c | 11
-rw-r--r--  usr/src/uts/common/sys/fs/zfs.h | 1
66 files changed, 702 insertions(+), 235 deletions(-)
diff --git a/usr/src/Makefile.master b/usr/src/Makefile.master
index 63a46e0468..988f34da46 100644
--- a/usr/src/Makefile.master
+++ b/usr/src/Makefile.master
@@ -984,7 +984,7 @@ STRIP_STABS= :
$(RELEASE_BUILD)STRIP_STABS= $(STRIP) -x $@
$(SRCDBGBLD)STRIP_STABS= :
-POST_PROCESS_O= $(PROCESS_COMMENT) $@
+POST_PROCESS_O=
POST_PROCESS_A=
POST_PROCESS_SO= $(PROCESS_COMMENT) $@ ; $(STRIP_STABS) ; \
$(ELFSIGN_OBJECT)
diff --git a/usr/src/cmd/sgs/libld/common/unwind.c b/usr/src/cmd/sgs/libld/common/unwind.c
index 1e10b4664d..adaf253287 100644
--- a/usr/src/cmd/sgs/libld/common/unwind.c
+++ b/usr/src/cmd/sgs/libld/common/unwind.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc.
*/
#include <string.h>
@@ -409,15 +410,10 @@ ld_unwind_make_hdr(Ofl_desc *ofl)
*/
if (id == 0) {
uint_t cieversion;
- /*
- * The only CIE version supported
- * is '1' - quick sanity check
- * here.
- */
cieversion = data[off + ndx];
ndx += 1;
/* BEGIN CSTYLED */
- if (cieversion != 1) {
+ if (cieversion != 1 && cieversion != 3) {
ld_eprintf(ofl, ERR_FATAL,
MSG_INTL(MSG_UNW_BADCIEVERS),
isp->is_file->ifl_name,
@@ -582,6 +578,7 @@ ld_unwind_populate_hdr(Ofl_desc *ofl)
if (id == 0) {
char *cieaugstr;
uint_t cieaugndx;
+ uint_t cieversion;
ciePflag = 0;
cieRflag = 0;
@@ -592,10 +589,8 @@ ld_unwind_populate_hdr(Ofl_desc *ofl)
* are encoded.
*/
- /*
- * burn through version
- */
- ndx++;
+ cieversion = data[off + ndx];
+ ndx += 1;
/*
* augstr
@@ -612,8 +607,10 @@ ld_unwind_populate_hdr(Ofl_desc *ofl)
/*
* retreg
*/
- ndx++;
-
+ if (cieversion == 1)
+ ndx++;
+ else
+ (void) uleb_extract(&data[off], &ndx);
/*
* we walk through the augmentation
* section now looking for the Rflag
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 63752a51de..36c4a8515b 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -77,9 +77,11 @@
#ifndef lint
extern boolean_t zfs_recover;
extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
+extern int zfs_vdev_async_read_max_active;
#else
boolean_t zfs_recover;
uint64_t zfs_arc_max, zfs_arc_meta_limit;
+int zfs_vdev_async_read_max_active;
#endif
const char cmdname[] = "zdb";
@@ -2118,6 +2120,8 @@ dump_label(const char *dev)
(void) close(fd);
}
+static uint64_t num_large_blocks;
+
/*ARGSUSED*/
static int
dump_one_dir(const char *dsname, void *arg)
@@ -2130,6 +2134,8 @@ dump_one_dir(const char *dsname, void *arg)
(void) printf("Could not open %s, error %d\n", dsname, error);
return (0);
}
+ if (dmu_objset_ds(os)->ds_large_blocks)
+ num_large_blocks++;
dump_dir(os);
dmu_objset_disown(os, FTAG);
fuid_table_destroy();
@@ -2140,7 +2146,7 @@ dump_one_dir(const char *dsname, void *arg)
/*
* Block statistics.
*/
-#define PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
+#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
typedef struct zdb_blkstats {
uint64_t zb_asize;
uint64_t zb_lsize;
@@ -2205,7 +2211,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zb->zb_lsize += BP_GET_LSIZE(bp);
zb->zb_psize += BP_GET_PSIZE(bp);
zb->zb_count++;
- zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
+
+ /*
+ * The histogram is only big enough to record blocks up to
+ * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+ * "other", bucket.
+ */
+ int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+ idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+ zb->zb_psize_histogram[idx]++;
zb->zb_gangs += BP_COUNT_GANG(bp);
@@ -2355,8 +2369,14 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zcb->zcb_readfails = 0;
- if (dump_opt['b'] < 5 &&
- gethrtime() > zcb->zcb_lastprint + NANOSEC) {
+ /* only call gethrtime() every 100 blocks */
+ static int iters;
+ if (++iters > 100)
+ iters = 0;
+ else
+ return (0);
+
+ if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
uint64_t now = gethrtime();
char buf[10];
uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
@@ -2465,6 +2485,14 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
(longlong_t)vd->vdev_ms_count);
msp->ms_ops = &zdb_metaslab_ops;
+
+ /*
+ * We don't want to spend the CPU
+ * manipulating the size-ordered
+ * tree, so clear the range_tree
+ * ops.
+ */
+ msp->ms_tree->rt_ops = NULL;
VERIFY0(space_map_load(msp->ms_sm,
msp->ms_tree, SM_ALLOC));
msp->ms_loaded = B_TRUE;
@@ -2901,6 +2929,7 @@ dump_zpool(spa_t *spa)
dump_metaslab_groups(spa);
if (dump_opt['d'] || dump_opt['i']) {
+ uint64_t refcount;
dump_dir(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
dump_bpobj(&spa->spa_deferred_bpobj,
@@ -2920,8 +2949,21 @@ dump_zpool(spa_t *spa)
}
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+ (void) feature_get_refcount(spa,
+ &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount);
+ if (num_large_blocks != refcount) {
+ (void) printf("large_blocks feature refcount mismatch: "
+ "expected %lld != actual %lld\n",
+ (longlong_t)num_large_blocks,
+ (longlong_t)refcount);
+ rc = 2;
+ } else {
+ (void) printf("Verified large_blocks feature refcount "
+ "is correct (%llu)\n", (longlong_t)refcount);
+ }
}
- if (dump_opt['b'] || dump_opt['c'])
+ if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
rc = dump_block_stats(spa);
if (rc == 0)
@@ -3478,6 +3520,13 @@ main(int argc, char **argv)
*/
zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
+ /*
+ * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
+ * "zdb -b" uses traversal prefetch which uses async reads.
+ * For good performance, let several of them be active at once.
+ */
+ zfs_vdev_async_read_max_active = 10;
+
kernel_init(FREAD);
g_zfs = libzfs_init();
ASSERT(g_zfs != NULL);
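
The histogram change above is worth a close look: once large blocks are possible, PSIZE can exceed 128K, so zdb keeps the histogram at its old size and funnels every oversized block into a final catch-all bucket. The following is a minimal standalone sketch of that bucketing, not zdb code; it assumes only the constants visible in the hunk (SPA_MINBLOCKSHIFT of 9 and a 128K SPA_OLD_MAXBLOCKSIZE), and psize_histo_index() is an illustrative helper.

#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9
#define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
#define	SPA_OLD_MAXBLOCKSIZE	(128ULL * 1024)
#define	PSIZE_HISTO_SIZE	(SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)

/* Map a physical block size to a histogram bucket, clamping to "other". */
static int
psize_histo_index(unsigned long long psize)
{
	int idx = (int)(psize >> SPA_MINBLOCKSHIFT);
	int other = (int)(SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);

	return (idx < other ? idx : other);
}

int
main(void)
{
	unsigned long long histo[PSIZE_HISTO_SIZE] = { 0 };

	histo[psize_histo_index(512)]++;		/* bucket 1 */
	histo[psize_histo_index(128 * 1024)]++;		/* bucket 256 */
	histo[psize_histo_index(1024 * 1024)]++;	/* "other" bucket */
	(void) printf("other bucket holds %llu block(s)\n",
	    histo[PSIZE_HISTO_SIZE - 1]);
	return (0);
}
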
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c
index 8d035f4253..4228f5c50a 100644
--- a/usr/src/cmd/zfs/zfs_main.c
+++ b/usr/src/cmd/zfs/zfs_main.c
@@ -256,9 +256,9 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
- return (gettext("\tsend [-DnPpRve] [-[iI] snapshot] "
+ return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
"<snapshot>\n"
- "\tsend [-e] [-i snapshot|bookmark] "
+ "\tsend [-Le] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"));
case HELP_SET:
return (gettext("\tset <property=value> "
@@ -3720,7 +3720,7 @@ zfs_do_send(int argc, char **argv)
boolean_t extraverbose = B_FALSE;
/* check options */
- while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
+ while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
switch (c) {
case 'i':
if (fromname)
@@ -3755,6 +3755,9 @@ zfs_do_send(int argc, char **argv)
case 'n':
flags.dryrun = B_TRUE;
break;
+ case 'L':
+ flags.largeblock = B_TRUE;
+ break;
case 'e':
flags.embed_data = B_TRUE;
break;
@@ -3811,6 +3814,8 @@ zfs_do_send(int argc, char **argv)
if (zhp == NULL)
return (1);
+ if (flags.largeblock)
+ lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags.embed_data)
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
diff --git a/usr/src/cmd/zstreamdump/zstreamdump.c b/usr/src/cmd/zstreamdump/zstreamdump.c
index dce1cb3d76..d99d8014f0 100644
--- a/usr/src/cmd/zstreamdump/zstreamdump.c
+++ b/usr/src/cmd/zstreamdump/zstreamdump.c
@@ -54,7 +54,6 @@ uint64_t total_stream_len = 0;
FILE *send_stream = 0;
boolean_t do_byteswap = B_FALSE;
boolean_t do_cksum = B_TRUE;
-#define INITIAL_BUFLEN (1<<20)
static void
usage(void)
@@ -67,6 +66,18 @@ usage(void)
exit(1);
}
+static void *
+safe_malloc(size_t size)
+{
+ void *rv = malloc(size);
+ if (rv == NULL) {
+ (void) fprintf(stderr, "ERROR: failed to allocate %zu bytes\n",
+ size);
+ abort();
+ }
+ return (rv);
+}
+
/*
* ssread - send stream read.
*
@@ -158,7 +169,7 @@ print_block(char *buf, int length)
int
main(int argc, char *argv[])
{
- char *buf = malloc(INITIAL_BUFLEN);
+ char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
uint64_t total_records = 0;
dmu_replay_record_t thedrr;
@@ -307,9 +318,9 @@ main(int argc, char *argv[])
nvlist_t *nv;
int sz = drr->drr_payloadlen;
- if (sz > INITIAL_BUFLEN) {
+ if (sz > SPA_MAXBLOCKSIZE) {
free(buf);
- buf = malloc(sz);
+ buf = safe_malloc(sz);
}
(void) ssread(buf, sz, &zc);
if (ferror(send_stream))
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 6a29b2f32c..a5f9600804 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -985,9 +985,15 @@ ztest_spa_get_ashift() {
static int
ztest_random_blocksize(void)
{
- // Choose a block size >= the ashift.
- uint64_t block_shift =
- ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
+ uint64_t block_shift;
+ /*
+ * Choose a block size >= the ashift.
+ * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
+ */
+ int maxbs = SPA_OLD_MAXBLOCKSHIFT;
+ if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+ maxbs = 20;
+ block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1);
return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}
@@ -4787,7 +4793,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
char path0[MAXPATHLEN];
char pathrand[MAXPATHLEN];
size_t fsize;
- int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
+ int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int iters = 1000;
int maxfaults;
int mirror_save;
diff --git a/usr/src/common/zfs/zfeature_common.c b/usr/src/common/zfs/zfeature_common.c
index 9b046ab07d..3358e5eb87 100644
--- a/usr/src/common/zfs/zfeature_common.c
+++ b/usr/src/common/zfs/zfeature_common.c
@@ -57,7 +57,8 @@ valid_char(char c, boolean_t after_colon)
{
return ((c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9') ||
- c == (after_colon ? '_' : '.'));
+ (after_colon && c == '_') ||
+ (!after_colon && (c == '.' || c == '-')));
}
/*
@@ -221,4 +222,13 @@ zpool_feature_init(void)
"com.delphix:embedded_data", "embedded_data",
"Blocks which compress very well use even less space.",
B_FALSE, B_TRUE, B_TRUE, NULL);
+
+ static const spa_feature_t large_blocks_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+ "org.open-zfs:large_blocks", "large_blocks",
+ "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
+ large_blocks_deps);
}
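
The valid_char() change above widens the set of legal GUID characters so that reverse-DNS prefixes containing a dash, such as the org.open-zfs prefix registered a few lines later, pass validation. Below is a small standalone illustration of the rule; valid_guid() is an invented wrapper for demonstration only, not the library's validation routine.

#include <stdio.h>

typedef enum { B_FALSE, B_TRUE } boolean_t;

/* Same predicate as the patched valid_char() above. */
static boolean_t
valid_char(char c, boolean_t after_colon)
{
	return ((c >= 'a' && c <= 'z') ||
	    (c >= '0' && c <= '9') ||
	    (after_colon && c == '_') ||
	    (!after_colon && (c == '.' || c == '-')));
}

/* Accept "reverse.dns:feature_name" with exactly one colon. */
static boolean_t
valid_guid(const char *name)
{
	boolean_t after_colon = B_FALSE;

	for (; *name != '\0'; name++) {
		if (*name == ':') {
			if (after_colon)
				return (B_FALSE);
			after_colon = B_TRUE;
		} else if (!valid_char(*name, after_colon)) {
			return (B_FALSE);
		}
	}
	return (after_colon);
}

int
main(void)
{
	(void) printf("org.open-zfs:large_blocks -> %d\n",
	    valid_guid("org.open-zfs:large_blocks"));	/* 1: '-' now legal */
	(void) printf("org.open_zfs:large_blocks -> %d\n",
	    valid_guid("org.open_zfs:large_blocks"));	/* 0: '_' before ':' */
	return (0);
}
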
diff --git a/usr/src/common/zfs/zfeature_common.h b/usr/src/common/zfs/zfeature_common.h
index be2111be91..3f54b392be 100644
--- a/usr/src/common/zfs/zfeature_common.h
+++ b/usr/src/common/zfs/zfeature_common.h
@@ -51,6 +51,7 @@ typedef enum spa_feature {
SPA_FEATURE_EMBEDDED_DATA,
SPA_FEATURE_BOOKMARKS,
SPA_FEATURE_FS_SS_LIMIT,
+ SPA_FEATURE_LARGE_BLOCKS,
SPA_FEATURES
} spa_feature_t;
diff --git a/usr/src/common/zfs/zfs_prop.c b/usr/src/common/zfs/zfs_prop.c
index e718baa32c..54aaa027dd 100644
--- a/usr/src/common/zfs/zfs_prop.c
+++ b/usr/src/common/zfs/zfs_prop.c
@@ -397,8 +397,8 @@ zfs_prop_init(void)
/* inherit number properties */
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
- SPA_MAXBLOCKSIZE, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+ SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
/* hidden properties */
zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
diff --git a/usr/src/common/zfs/zpool_prop.c b/usr/src/common/zfs/zpool_prop.c
index a400f821e2..4d906b02bc 100644
--- a/usr/src/common/zfs/zpool_prop.c
+++ b/usr/src/common/zfs/zpool_prop.c
@@ -127,6 +127,8 @@ zpool_prop_init(void)
/* hidden properties */
zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
}
/*
diff --git a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c
index 341b6cd971..91b4bcec6f 100644
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c
@@ -1048,6 +1048,7 @@ static const char *spa_feature_names[] = {
"com.delphix:hole_birth",
"com.delphix:extensible_dataset",
"com.delphix:embedded_data",
+ "org.open-zfs:large_blocks",
NULL
};
@@ -1837,6 +1838,17 @@ zfs_read(char *buf, int len)
blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT;
/*
+ * Note: for GRUB, SPA_MAXBLOCKSIZE is 128KB. There is not enough
+ * memory to allocate the new max blocksize (16MB), so while
+ * GRUB understands the large_blocks on-disk feature, it can't
+ * actually read large blocks.
+ */
+ if (blksz > SPA_MAXBLOCKSIZE) {
+ grub_printf("blocks larger than 128K are not supported\n");
+ return (0);
+ }
+
+ /*
* Entire Dnode is too big to fit into the space available. We
* will need to read it in chunks. This could be optimized to
* read in as large a chunk as there is space available, but for
diff --git a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
index 19fe52f13f..deb52cfda3 100644
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
@@ -56,17 +56,14 @@
BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
/*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * Note: GRUB can't actually read blocks larger than 128KB, due to lack
+ * of memory. Therefore its SPA_MAXBLOCKSIZE is still 128KB.
*/
#define SPA_MINBLOCKSHIFT 9
#define SPA_MAXBLOCKSHIFT 17
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
/*
* Size of block to hold the configuration data (a packed nvlist)
*/
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index ef5224c763..eb8ae6ceb7 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -593,6 +593,9 @@ typedef struct sendflags {
/* show progress (ie. -v) */
boolean_t progress;
+ /* large blocks (>128K) are permitted */
+ boolean_t largeblock;
+
/* WRITE_EMBEDDED records of type DATA are permitted */
boolean_t embed_data;
} sendflags_t;
diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c
index 75a391de0f..1172abc647 100644
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c
@@ -1052,21 +1052,36 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
break;
}
- case ZFS_PROP_RECORDSIZE:
case ZFS_PROP_VOLBLOCKSIZE:
- /* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
+ case ZFS_PROP_RECORDSIZE:
+ {
+ int maxbs = SPA_MAXBLOCKSIZE;
+ if (zhp != NULL) {
+ maxbs = zpool_get_prop_int(zhp->zpool_hdl,
+ ZPOOL_PROP_MAXBLOCKSIZE, NULL);
+ }
+ /*
+ * Volumes are limited to a volblocksize of 128KB,
+ * because they typically service workloads with
+ * small random writes, which incur a large performance
+ * penalty with large blocks.
+ */
+ if (prop == ZFS_PROP_VOLBLOCKSIZE)
+ maxbs = SPA_OLD_MAXBLOCKSIZE;
+ /*
+ * The value must be a power of two between
+ * SPA_MINBLOCKSIZE and maxbs.
+ */
if (intval < SPA_MINBLOCKSIZE ||
- intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) {
+ intval > maxbs || !ISP2(intval)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "'%s' must be power of 2 from %u "
- "to %uk"), propname,
- (uint_t)SPA_MINBLOCKSIZE,
- (uint_t)SPA_MAXBLOCKSIZE >> 10);
+ "'%s' must be power of 2 from 512B "
+ "to %uKB"), propname, maxbs >> 10);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
break;
-
+ }
case ZFS_PROP_MLSLABEL:
{
/*
@@ -1441,7 +1456,8 @@ zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
break;
case ERANGE:
- if (prop == ZFS_PROP_COMPRESSION) {
+ if (prop == ZFS_PROP_COMPRESSION ||
+ prop == ZFS_PROP_RECORDSIZE) {
(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property setting is not allowed on "
"bootable datasets"));
@@ -3156,9 +3172,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
case EDOM:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"volume block size must be power of 2 from "
- "%u to %uk"),
- (uint_t)SPA_MINBLOCKSIZE,
- (uint_t)SPA_MAXBLOCKSIZE >> 10);
+ "512B to 128KB"));
return (zfs_error(hdl, EZFS_BADPROP, errbuf));
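
The rewritten check above validates recordsize against the pool's reported maxblocksize property instead of a compile-time constant, while volblocksize stays capped at 128K because of the random-write penalty noted in the comment. A standalone sketch of just that predicate follows; blocksize_ok() is illustrative and not a libzfs function, and the constants are assumed inline.

#include <stdio.h>

#define	ISP2(x)			(((x) & ((x) - 1)) == 0)
#define	SPA_MINBLOCKSIZE	512
#define	SPA_OLD_MAXBLOCKSIZE	(128 * 1024)

/* Power of two between 512 and maxbs; volumes always capped at 128K. */
static int
blocksize_ok(unsigned long long val, unsigned long long maxbs, int is_volume)
{
	if (is_volume)
		maxbs = SPA_OLD_MAXBLOCKSIZE;
	return (val >= SPA_MINBLOCKSIZE && val <= maxbs && ISP2(val));
}

int
main(void)
{
	/* 1MB recordsize on a large_blocks pool is accepted */
	(void) printf("%d\n", blocksize_ok(1024 * 1024, 1024 * 1024, 0));
	/* 1MB volblocksize is rejected even on such a pool */
	(void) printf("%d\n", blocksize_ok(1024 * 1024, 1024 * 1024, 1));
	/* 192K is rejected: not a power of two */
	(void) printf("%d\n", blocksize_ok(192 * 1024, 1024 * 1024, 0));
	return (0);
}
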
diff --git a/usr/src/lib/libzfs/common/libzfs_sendrecv.c b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
index 6697b52831..c4944438aa 100644
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
@@ -206,7 +206,7 @@ static void *
cksummer(void *arg)
{
dedup_arg_t *dda = arg;
- char *buf = malloc(1<<20);
+ char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
dmu_replay_record_t thedrr;
dmu_replay_record_t *drr = &thedrr;
struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@@ -271,9 +271,9 @@ cksummer(void *arg)
DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
int sz = drr->drr_payloadlen;
- if (sz > 1<<20) {
- free(buf);
- buf = malloc(sz);
+ if (sz > SPA_MAXBLOCKSIZE) {
+ buf = zfs_realloc(dda->dedup_hdl, buf,
+ SPA_MAXBLOCKSIZE, sz);
}
(void) ssread(buf, sz, ofp);
if (ferror(stdin))
@@ -806,7 +806,7 @@ typedef struct send_dump_data {
char prevsnap[ZFS_MAXNAMELEN];
uint64_t prevsnap_obj;
boolean_t seenfrom, seento, replicate, doall, fromorigin;
- boolean_t verbose, dryrun, parsable, progress, embed_data;
+ boolean_t verbose, dryrun, parsable, progress, embed_data, large_block;
int outfd;
boolean_t err;
nvlist_t *fss;
@@ -1153,6 +1153,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
}
enum lzc_send_flags flags = 0;
+ if (sdd->large_block)
+ flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (sdd->embed_data)
flags |= LZC_SEND_FLAG_EMBED_DATA;
@@ -1501,6 +1503,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
sdd.parsable = flags->parsable;
sdd.progress = flags->progress;
sdd.dryrun = flags->dryrun;
+ sdd.large_block = flags->largeblock;
sdd.embed_data = flags->embed_data;
sdd.filter_cb = filter_func;
sdd.filter_cb_arg = cb_arg;
@@ -2506,7 +2509,7 @@ static int
recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
{
dmu_replay_record_t *drr;
- void *buf = malloc(1<<20);
+ void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE);
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.c b/usr/src/lib/libzfs_core/common/libzfs_core.c
index 6f36568667..06221fab4a 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.c
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.c
@@ -455,6 +455,10 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
*
* "fd" is the file descriptor to write the send stream to.
*
+ * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
+ * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
+ * records with drr_blksz > 128K.
+ *
* If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
* to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
* which the receiving system must support (as indicated by support
@@ -471,6 +475,8 @@ lzc_send(const char *snapname, const char *from, int fd,
fnvlist_add_int32(args, "fd", fd);
if (from != NULL)
fnvlist_add_string(args, "fromsnap", from);
+ if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+ fnvlist_add_boolean(args, "largeblockok");
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.h b/usr/src/lib/libzfs_core/common/libzfs_core.h
index d7d767055d..bdd6c951ee 100644
--- a/usr/src/lib/libzfs_core/common/libzfs_core.h
+++ b/usr/src/lib/libzfs_core/common/libzfs_core.h
@@ -53,7 +53,8 @@ int lzc_release(nvlist_t *, nvlist_t **);
int lzc_get_holds(const char *, nvlist_t **);
enum lzc_send_flags {
- LZC_SEND_FLAG_EMBED_DATA = 1 << 0
+ LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
+ LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
};
int lzc_send(const char *, const char *, int, enum lzc_send_flags);
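
With the new flag in place, a libzfs_core consumer opts in to large blocks by OR-ing LZC_SEND_FLAG_LARGE_BLOCK into the flags argument of lzc_send(), exactly as zfs_main.c does above for -L. A minimal hypothetical caller might look like the following; the dataset names are placeholders and error handling is kept to the bare minimum.

#include <libzfs_core.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int err;

	if ((err = libzfs_core_init()) != 0) {
		(void) fprintf(stderr, "libzfs_core_init: %d\n", err);
		return (1);
	}
	/* Incremental send to stdout, permitting large and embedded blocks. */
	err = lzc_send("tank/fs@snap", "tank/fs@prev", STDOUT_FILENO,
	    LZC_SEND_FLAG_LARGE_BLOCK | LZC_SEND_FLAG_EMBED_DATA);
	libzfs_core_fini();
	return (err == 0 ? 0 : 1);
}
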
diff --git a/usr/src/lib/libzpool/common/taskq.c b/usr/src/lib/libzpool/common/taskq.c
index 2c5dfd86dc..a4ab58963d 100644
--- a/usr/src/lib/libzpool/common/taskq.c
+++ b/usr/src/lib/libzpool/common/taskq.c
@@ -25,6 +25,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
+ * Copyright (c) 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -33,8 +34,10 @@ int taskq_now;
taskq_t *system_taskq;
#define TASKQ_ACTIVE 0x00010000
+#define TASKQ_NAMELEN 31
struct taskq {
+ char tq_name[TASKQ_NAMELEN + 1];
kmutex_t tq_lock;
krwlock_t tq_threadlock;
kcondvar_t tq_dispatch_cv;
@@ -247,6 +250,7 @@ taskq_create(const char *name, int nthreads, pri_t pri,
cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
+ (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1);
tq->tq_flags = flags | TASKQ_ACTIVE;
tq->tq_active = nthreads;
tq->tq_nthreads = nthreads;
diff --git a/usr/src/man/man1m/zfs.1m b/usr/src/man/man1m/zfs.1m
index 98fc8e7491..6795e8ecf2 100644
--- a/usr/src/man/man1m/zfs.1m
+++ b/usr/src/man/man1m/zfs.1m
@@ -176,12 +176,12 @@ zfs \- configures ZFS file systems
.LP
.nf
-\fBzfs\fR \fBsend\fR [\fB-DnPpRve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-DnPpRveL\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
.fi
.LP
.nf
-\fBzfs\fR \fBsend\fR [\fB-e\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-eL\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.fi
.LP
@@ -1244,7 +1244,9 @@ significant performance gains. Use of this property for general purpose file
systems is strongly discouraged, and may adversely affect performance.
.sp
The size specified must be a power of two greater than or equal to 512 and less
-than or equal to 128 Kbytes.
+than or equal to 128 Kbytes. If the \fBlarge_blocks\fR feature is enabled
+on the pool, the size may be up to 1 Mbyte. See \fBzpool-features\fR(5)
+for details on ZFS feature flags.
.sp
Changing the file system's \fBrecordsize\fR affects only files created
afterward; existing files are unaffected.
@@ -2946,7 +2948,7 @@ See \fBzpool-features\fR(5) for details on ZFS feature flags and the
.sp
.ne 2
.na
-\fBzfs send\fR [\fB-DnPpRrve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+\fBzfs send\fR [\fB-DnPpRrveL\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
.ad
.sp .6
.RS 4n
@@ -3034,6 +3036,21 @@ property information is only included if the \fB-p\fR flag is specified.
.sp
.ne 2
.na
+\fB\fB-L\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a stream which may contain blocks larger than 128KB. This flag
+has no effect if the \fBlarge_blocks\fR pool feature is disabled, or if
+the \fBrecordsize\fR property of this filesystem has never been set above
+128KB. The receiving system must have the \fBlarge_blocks\fR pool feature
+enabled as well. See \fBzpool-features\fR(5) for details on ZFS feature
+flags and the \fBlarge_blocks\fR feature.
+.RE
+
+.sp
+.ne 2
+.na
\fB\fB-e\fR\fR
.ad
.sp .6
@@ -3099,7 +3116,7 @@ on future versions of \fBZFS\fR.
.sp
.ne 2
.na
-\fBzfs send\fR [\fB-e\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs send\fR [\fB-eL\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.ad
.sp .6
.RS 4n
@@ -3130,6 +3147,21 @@ or the origin's origin, etc.
.sp
.ne 2
.na
+\fB\fB-L\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a stream which may contain blocks larger than 128KB. This flag
+has no effect if the \fBlarge_blocks\fR pool feature is disabled, or if
+the \fBrecordsize\fR property of this filesystem has never been set above
+128KB. The receiving system must have the \fBlarge_blocks\fR pool feature
+enabled as well. See \fBzpool-features\fR(5) for details on ZFS feature
+flags and the \fBlarge_blocks\fR feature.
+.RE
+
+.sp
+.ne 2
+.na
\fB\fB-e\fR\fR
.ad
.sp .6
diff --git a/usr/src/man/man5/zpool-features.5 b/usr/src/man/man5/zpool-features.5
index 24398124f0..fbefb37b80 100644
--- a/usr/src/man/man5/zpool-features.5
+++ b/usr/src/man/man5/zpool-features.5
@@ -18,7 +18,6 @@
.SH NAME
zpool\-features \- ZFS pool feature descriptions
.SH DESCRIPTION
-.sp
.LP
ZFS pool on\-disk format versions are specified via "features" which replace
the old on\-disk format numbers (the last supported on\-disk format number is
@@ -36,7 +35,6 @@ format of the pool is specified by the set of all features marked as
\fBactive\fR on the pool. If the pool was created by another software version
this set may include unsupported features.
.SS "Identifying features"
-.sp
.LP
Every feature has a guid of the form \fIcom.example:feature_name\fR. The reverse
DNS name ensures that the feature's guid is unique across all ZFS
@@ -51,7 +49,6 @@ name is the portion of its guid which follows the ':' (e.g.
however a feature's short name may differ across ZFS implementations if
following the convention would result in name conflicts.
.SS "Feature states"
-.sp
.LP
Features can be in one of three states:
.sp
@@ -97,7 +94,6 @@ cannot be disabled once they have been enabled.
The state of supported features is exposed through pool properties of the form
\fIfeature@short_name\fR.
.SS "Read\-only compatibility"
-.sp
.LP
Some features may make on\-disk format changes that do not interfere with other
software's ability to read from the pool. These features are referred to as
@@ -106,7 +102,6 @@ compatible, the pool can be imported in read\-only mode by setting the
\fBreadonly\fR property during import (see \fBzpool\fR(1M) for details on
importing pools).
.SS "Unsupported features"
-.sp
.LP
For each unsupported feature enabled on an imported pool a pool property
named \fIunsupported@feature_guid\fR will indicate why the import was allowed
@@ -133,13 +128,11 @@ read\-only mode.
.RE
.SS "Feature dependencies"
-.sp
.LP
Some features depend on other features being enabled in order to function
properly. Enabling a feature will automatically enable any features it
depends on.
.SH FEATURES
-.sp
.LP
The following features are supported on this system:
.sp
@@ -430,5 +423,26 @@ never return to being \fBenabled\fR.
.RE
+.sp
+.ne 2
+.na
+\fB\fBlarge_blocks\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID org.open-zfs:large_blocks
+READ\-ONLY COMPATIBLE no
+DEPENDENCIES extensible_dataset
+.TE
+
+The \fBlarge_blocks\fR feature allows the record size on a dataset to be
+set larger than 128KB.
+
+This feature becomes \fBactive\fR once a \fBrecordsize\fR property has been
+set larger than 128KB, and will return to being \fBenabled\fR once all
+filesystems that have ever had their recordsize set larger than 128KB are
+destroyed.
+.RE
+
.SH "SEE ALSO"
\fBzpool\fR(1M)
diff --git a/usr/src/uts/common/fs/zfs/bpobj.c b/usr/src/uts/common/fs/zfs/bpobj.c
index e75ae72f9e..da4d38a3a9 100644
--- a/usr/src/uts/common/fs/zfs/bpobj.c
+++ b/usr/src/uts/common/fs/zfs/bpobj.c
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
ASSERT0(dp->dp_empty_bpobj);
dp->dp_empty_bpobj =
- bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+ bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(os,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -396,7 +396,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpo->bpo_phys->bpo_subobjs == 0) {
bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
- DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
}
dmu_object_info_t doi;
diff --git a/usr/src/uts/common/fs/zfs/bptree.c b/usr/src/uts/common/fs/zfs/bptree.c
index c724ed0741..5f7d76f0e2 100644
--- a/usr/src/uts/common/fs/zfs/bptree.c
+++ b/usr/src/uts/common/fs/zfs/bptree.c
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
bptree_phys_t *bt;
obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
- SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
sizeof (bptree_phys_t), tx);
/*
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 040b1ac313..7d96048ca3 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -2033,10 +2033,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
return (SET_ERROR(ENOTSUP));
if (blksz == 0)
blksz = SPA_MINBLOCKSIZE;
- if (blksz > SPA_MAXBLOCKSIZE)
- blksz = SPA_MAXBLOCKSIZE;
- else
- blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 73b8e056cc..e7aeed17fb 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -255,6 +255,14 @@ logbias_changed_cb(void *arg, uint64_t newval)
zil_set_logbias(os->os_zil, newval);
}
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ os->os_recordsize = newval;
+}
+
void
dmu_objset_byteswap(void *buf, size_t size)
{
@@ -384,6 +392,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os);
+ }
}
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -642,6 +655,9 @@ dmu_objset_evict(objset_t *os)
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os));
+ VERIFY0(dsl_prop_unregister(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os));
}
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index fed1b86a1b..2c08e7075f 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -206,11 +206,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
- if (BP_IS_EMBEDDED(bp)) {
+ if (bp == NULL || BP_IS_EMBEDDED(bp)) {
/*
- * There's no pre-computed checksum of embedded BP's, so
- * (like fletcher4-checkummed blocks) userland will have
- * to compute a dedup-capable checksum itself.
+ * There's no pre-computed checksum for partial-block
+ * writes or embedded BP's, so (like
+ * fletcher4-checksummed blocks) userland will have to
+ * compute a dedup-capable checksum itself.
*/
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
@@ -372,6 +373,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
+ if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+ drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
@@ -491,6 +496,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
+ uint64_t offset;
ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level);
@@ -511,8 +517,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
- err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
- blksz, bp, abuf->b_data);
+ offset = zb->zb_blkid * blksz;
+
+ if (!(dsp->dsa_featureflags &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ blksz > SPA_OLD_MAXBLOCKSIZE) {
+ char *buf = abuf->b_data;
+ while (blksz > 0 && err == 0) {
+ int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+ err = dump_write(dsp, type, zb->zb_object,
+ offset, n, NULL, buf);
+ offset += n;
+ buf += n;
+ blksz -= n;
+ }
+ } else {
+ err = dump_write(dsp, type, zb->zb_object,
+ offset, blksz, bp, abuf->b_data);
+ }
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -526,7 +548,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
- int outfd, vnode_t *vp, offset_t *off)
+ boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
{
objset_t *os;
dmu_replay_record_t *drr;
@@ -561,6 +583,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
}
#endif
+ if (large_block_ok && ds->ds_large_blocks)
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -656,7 +680,8 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+ boolean_t embedok, boolean_t large_block_ok,
+ int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@@ -690,18 +715,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = fromds->ds_phys->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, vp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
}
int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -768,11 +794,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dsl_pool_rele(dp, FTAG);
return (err);
}
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, vp, off);
}
if (owned)
dsl_dataset_disown(ds, FTAG);
@@ -972,6 +998,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
@@ -1097,6 +1132,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+ if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !newds->ds_large_blocks) {
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+ newds->ds_large_blocks = B_TRUE;
+ }
+
dmu_buf_will_dirty(newds->ds_dbuf, tx);
newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1222,6 +1264,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
/* some things will require 8-byte alignment, so everything must */
ASSERT0(len % 8);
+ ASSERT3U(len, <=, ra->bufsize);
while (done < len) {
ssize_t resid;
@@ -1361,7 +1404,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
@@ -1634,7 +1677,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
- drrs->drr_length > SPA_MAXBLOCKSIZE)
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
return (SET_ERROR(EINVAL));
data = restore_read(ra, drrs->drr_length, NULL);
@@ -1721,7 +1764,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra.cksum = drc->drc_cksum;
ra.vp = vp;
ra.voff = *voffp;
- ra.bufsize = 1<<20;
+ ra.bufsize = SPA_MAXBLOCKSIZE;
ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
/* these were verified in dmu_recv_begin */
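
The most subtle part of the send-side change is the loop in backup_cb() above that splits an oversized block into SPA_OLD_MAXBLOCKSIZE chunks whenever the stream was not opened with the LARGE_BLOCKS feature flag, passing a NULL bp so that dump_write() skips the pre-computed checksum. The following standalone sketch replays just that loop; emit_write() is an illustrative stand-in for dump_write() and merely reports each record, and in the real code the data pointer advances in step with the offset.

#include <stdio.h>

#define	SPA_OLD_MAXBLOCKSIZE	(128 * 1024)
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

/* Stand-in for dump_write(): report one DRR_WRITE record. */
static int
emit_write(unsigned long long offset, int len)
{
	(void) printf("DRR_WRITE offset=%llu length=%d\n", offset, len);
	return (0);
}

int
main(void)
{
	int blksz = 1024 * 1024;	/* one logical 1MB block */
	unsigned long long offset = 0;	/* byte offset within the object */
	int err = 0;

	while (blksz > 0 && err == 0) {
		int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
		err = emit_write(offset, n);	/* eight 128K records total */
		offset += n;
		blksz -= n;
	}
	return (err);
}
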
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
index fb1ec0755c..5836549d20 100644
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -59,6 +59,7 @@ typedef struct traverse_data {
int td_flags;
prefetch_data_t *td_pfd;
boolean_t td_paused;
+ uint64_t td_hole_birth_enabled_txg;
blkptr_cb_t *td_func;
void *td_arg;
} traverse_data_t;
@@ -229,25 +230,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
}
if (bp->blk_birth == 0) {
- if (spa_feature_is_active(td->td_spa, SPA_FEATURE_HOLE_BIRTH)) {
- /*
- * Since this block has a birth time of 0 it must be a
- * hole created before the SPA_FEATURE_HOLE_BIRTH
- * feature was enabled. If SPA_FEATURE_HOLE_BIRTH
- * was enabled before the min_txg for this traveral we
- * know the hole must have been created before the
- * min_txg for this traveral, so we can skip it. If
- * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
- * for this traveral we cannot tell if the hole was
- * created before or after the min_txg for this
- * traversal, so we cannot skip it.
- */
- uint64_t hole_birth_enabled_txg;
- VERIFY(spa_feature_enabled_txg(td->td_spa,
- SPA_FEATURE_HOLE_BIRTH, &hole_birth_enabled_txg));
- if (hole_birth_enabled_txg < td->td_min_txg)
- return (0);
- }
+ /*
+ * Since this block has a birth time of 0 it must be a
+ * hole created before the SPA_FEATURE_HOLE_BIRTH
+ * feature was enabled. If SPA_FEATURE_HOLE_BIRTH
+ * was enabled before the min_txg for this traversal we
+ * know the hole must have been created before the
+ * min_txg for this traversal, so we can skip it. If
+ * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
+ * for this traversal we cannot tell if the hole was
+ * created before or after the min_txg for this
+ * traversal, so we cannot skip it.
+ */
+ if (td->td_hole_birth_enabled_txg < td->td_min_txg)
+ return (0);
} else if (bp->blk_birth <= td->td_min_txg) {
return (0);
}
@@ -523,6 +519,13 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td.td_flags = flags;
td.td_paused = B_FALSE;
+ if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ VERIFY(spa_feature_enabled_txg(spa,
+ SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
+ } else {
+ td.td_hole_birth_enabled_txg = 0;
+ }
+
pd.pd_blks_max = zfs_pd_blks_max;
pd.pd_flags = flags;
mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
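
The traversal change hoists the spa_feature_enabled_txg() lookup out of the per-block callback into traverse_impl(), so the hole-skip decision becomes a simple comparison against a cached txg (zero when hole_birth is inactive). Here is a standalone restatement of that decision; traverse_can_skip() and the txg values are illustrative only, not names from the patch.

#include <stdio.h>

/*
 * A birth time of zero means the hole predates the hole_birth feature,
 * so it can be skipped only if the feature was enabled before this
 * traversal's min_txg; otherwise the ordinary birth-time test applies.
 */
static int
traverse_can_skip(unsigned long long blk_birth,
    unsigned long long hole_birth_enabled_txg, unsigned long long min_txg)
{
	if (blk_birth == 0)
		return (hole_birth_enabled_txg < min_txg);
	return (blk_birth <= min_txg);
}

int
main(void)
{
	/* feature enabled before min_txg: old hole, safe to skip */
	(void) printf("%d\n", traverse_can_skip(0, 100, 200));		/* 1 */
	/* feature enabled after min_txg: can't tell, must visit */
	(void) printf("%d\n", traverse_can_skip(0, 300, 200));		/* 0 */
	/* ordinary block born after min_txg: must visit */
	(void) printf("%d\n", traverse_can_skip(400, 100, 200));	/* 0 */
	return (0);
}
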
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index b31e37cdb8..991de2b1bc 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -226,7 +226,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE);
min_bs = SPA_MINBLOCKSHIFT;
- max_bs = SPA_MAXBLOCKSHIFT;
+ max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
min_ibs = DN_MIN_INDBLKSHIFT;
max_ibs = DN_MAX_INDBLKSHIFT;
@@ -295,6 +295,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
*/
ASSERT(dn->dn_datablkshift != 0);
min_bs = max_bs = dn->dn_datablkshift;
+ } else {
+ /*
+ * The blocksize can increase up to the recordsize,
+ * or if it is already more than the recordsize,
+ * up to the next power of 2.
+ */
+ min_bs = highbit64(dn->dn_datablksz - 1);
+ max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
}
/*
@@ -752,11 +760,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
bp = &dn->dn_phys->dn_blkptr[0];
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += MZAP_MAX_BLKSZ;
if (!BP_IS_HOLE(bp))
- txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref += MZAP_MAX_BLKSZ;
return;
}
@@ -1545,18 +1553,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
/* If blkptr doesn't exist then add space to towrite */
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
} else {
blkptr_t *bp;
bp = &dn->dn_phys->dn_spill;
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
if (!BP_IS_HOLE(bp))
- txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
}
}
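
The dmu_tx_count_write() change derives the worst-case block shift from the objset's recordsize with highbit64() instead of assuming SPA_MAXBLOCKSHIFT. The sketch below spells out that shift arithmetic with a naive highbit64(); the real kernel routine is far more efficient, but the semantics assumed here (1-based index of the highest set bit, 0 for 0) are the same.

#include <stdio.h>

static int
highbit64(unsigned long long x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	/* 128K recordsize yields max_bs of 17 (1 << 17 == 128K) */
	(void) printf("%d\n", highbit64(128 * 1024) - 1);
	/* a 96K block may grow to the next power of two, shift 17 (128K) */
	(void) printf("%d\n", highbit64(96 * 1024 - 1));
	return (0);
}
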
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 175157714d..9c70c2bfb1 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -510,10 +510,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
{
int i;
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
blocksize = 1 << zfs_default_bs;
- else if (blocksize > SPA_MAXBLOCKSIZE)
- blocksize = SPA_MAXBLOCKSIZE;
else
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
@@ -594,7 +594,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
ASSERT0(blocksize % SPA_MINBLOCKSIZE);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
@@ -1347,10 +1348,9 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
dmu_buf_impl_t *db;
int err;
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (size == 0)
size = SPA_MINBLOCKSIZE;
- if (size > SPA_MAXBLOCKSIZE)
- size = SPA_MAXBLOCKSIZE;
else
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index f1b92f3eaa..e7ed750902 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -50,6 +50,17 @@
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+
#define SWITCH64(x, y) \
{ \
uint64_t __tmp = (x); \
@@ -59,8 +70,6 @@
#define DS_REF_MAX (1ULL << 62)
-#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
-
/*
* Figure out how much of this delta should be propogated to the dsl_dir
* layer. If there's a refreservation, that space has already been
@@ -110,6 +119,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
ds->ds_phys->ds_compressed_bytes += compressed;
ds->ds_phys->ds_uncompressed_bytes += uncompressed;
ds->ds_phys->ds_unique_bytes += used;
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+ ds->ds_need_large_blocks = B_TRUE;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -387,6 +398,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
offsetof(dmu_sendarg_t, dsa_link));
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ err = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
+ if (err == 0)
+ ds->ds_large_blocks = B_TRUE;
+ else
+ ASSERT3U(err, ==, ENOENT);
+ }
+
if (err == 0) {
err = dsl_dir_hold_obj(dp,
ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
@@ -700,6 +719,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dsphys->ds_flags |= origin->ds_phys->ds_flags &
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+ if (origin->ds_large_blocks)
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
dmu_buf_will_dirty(origin->ds_dbuf, tx);
origin->ds_phys->ds_num_children++;
@@ -1213,6 +1235,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsphys->ds_bp = ds->ds_phys->ds_bp;
dmu_buf_rele(dbuf, FTAG);
+ if (ds->ds_large_blocks)
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
if (ds->ds_prev) {
uint64_t next_clones_obj =
@@ -1486,6 +1511,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
dmu_objset_sync(ds->ds_objset, zio, tx);
+
+ if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
+ dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+ ds->ds_large_blocks = B_TRUE;
+ }
}
static void
@@ -3128,6 +3158,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
return (err);
}
+static int
+dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
+{
+ const char *dsname = arg;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error = 0;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+
+ ASSERT(spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_EXTENSIBLE_DATASET));
+
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_large_blocks)
+ error = EALREADY;
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
+}
+
+void
+dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
+ sizeof (zero), 1, &zero, tx));
+}
+
+static void
+dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
+{
+ const char *dsname = arg;
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
+
+ dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+ ASSERT(!ds->ds_large_blocks);
+ ds->ds_large_blocks = B_TRUE;
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_activate_large_blocks(const char *dsname)
+{
+ int error;
+
+ error = dsl_sync_task(dsname,
+ dsl_dataset_activate_large_blocks_check,
+ dsl_dataset_activate_large_blocks_sync, (void *)dsname,
+ 1, ZFS_SPACE_CHECK_RESERVED);
+
+ /*
+ * EALREADY indicates that this dataset already supports large blocks.
+ */
+ if (error == EALREADY)
+ error = 0;
+ return (error);
+}
+
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
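
Taken together, the dsl_dataset.c hunks implement a small state machine: dsl_dataset_block_born() marks a dataset as needing large blocks the first time a block over 128K is written, dsl_dataset_sync() activates the feature exactly once, and snapshots and clones inherit the activation. The toy model below compresses that flow into a few lines; ds_t and its fields are illustrative stand-ins, not the real dsl_dataset_t, and the pool-wide refcount is simplified to a plain counter.

#include <stdio.h>

#define	SPA_OLD_MAXBLOCKSIZE	(128 * 1024)

typedef struct {
	int need_large_blocks;	/* set when an oversized block is born */
	int large_blocks;	/* feature active on this dataset */
	int feature_refcount;	/* pool-wide refcount (simplified) */
} ds_t;

static void
block_born(ds_t *ds, unsigned long long lsize)
{
	if (lsize > SPA_OLD_MAXBLOCKSIZE)
		ds->need_large_blocks = 1;
}

static void
dataset_sync(ds_t *ds)
{
	if (ds->need_large_blocks && !ds->large_blocks) {
		ds->feature_refcount++;	/* spa_feature_incr() in the patch */
		ds->large_blocks = 1;
	}
}

int
main(void)
{
	ds_t ds = { 0, 0, 0 };

	block_born(&ds, 1024 * 1024);
	dataset_sync(&ds);
	dataset_sync(&ds);	/* second sync is a no-op */
	(void) printf("active=%d refcount=%d\n",
	    ds.large_blocks, ds.feature_refcount);	/* active=1 refcount=1 */
	return (0);
}
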
diff --git a/usr/src/uts/common/fs/zfs/dsl_deadlist.c b/usr/src/uts/common/fs/zfs/dsl_deadlist.c
index 4f39c397a0..8c8e3746ee 100644
--- a/usr/src/uts/common/fs/zfs/dsl_deadlist.c
+++ b/usr/src/uts/common/fs/zfs/dsl_deadlist.c
@@ -143,7 +143,7 @@ uint64_t
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
- return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+ return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
sizeof (dsl_deadlist_phys_t), tx));
}
@@ -180,7 +180,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
{
if (dle->dle_bpobj.bpo_object ==
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
- uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
bpobj_close(&dle->dle_bpobj);
bpobj_decr_empty(dl->dl_os, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -254,7 +254,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = mintxg;
- obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle);
@@ -338,7 +338,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
if (dle->dle_mintxg >= maxtxg)
break;
- obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx));
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_destroy.c b/usr/src/uts/common/fs/zfs/dsl_destroy.c
index f8a4546535..1237641583 100644
--- a/usr/src/uts/common/fs/zfs/dsl_destroy.c
+++ b/usr/src/uts/common/fs/zfs/dsl_destroy.c
@@ -264,6 +264,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
obj = ds->ds_object;
+ if (ds->ds_large_blocks) {
+ ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ }
if (ds->ds_phys->ds_prev_snap_obj != 0) {
ASSERT3P(ds->ds_prev, ==, NULL);
VERIFY0(dsl_dataset_hold_obj(dp,
@@ -720,6 +724,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT0(ds->ds_reserved);
}
+ if (ds->ds_large_blocks)
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+
dsl_scan_ds_destroyed(ds, tx);
obj = ds->ds_object;
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index f08f8559d6..2bb699d43e 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -368,7 +368,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
FREE_DIR_NAME, &dp->dp_free_dir));
/* create and open the free_bplist */
- obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -793,7 +793,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
* subobj support. So call dmu_object_alloc() directly.
*/
obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
- SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index a5d462cacf..3411fbf32c 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -130,7 +130,7 @@ int metaslab_debug_unload = 0;
* an allocation of this size then it switches to using more
* aggressive strategy (i.e search by size rather than offset).
*/
-uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
/*
* The minimum free space, in percent, which must be available
diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c
index bd2ebce5df..8b3963aed9 100644
--- a/usr/src/uts/common/fs/zfs/sa.c
+++ b/usr/src/uts/common/fs/zfs/sa.c
@@ -500,7 +500,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
if (size == 0) {
blocksize = SPA_MINBLOCKSIZE;
- } else if (size > SPA_MAXBLOCKSIZE) {
+ } else if (size > SPA_OLD_MAXBLOCKSIZE) {
ASSERT(0);
return (SET_ERROR(EFBIG));
} else {
@@ -675,7 +675,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
SA_BONUS, &i, &used, &spilling);
- if (used > SPA_MAXBLOCKSIZE)
+ if (used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -699,7 +699,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
attr_count - i, hdl->sa_spill, SA_SPILL, &i,
&spill_used, &dummy);
- if (spill_used > SPA_MAXBLOCKSIZE)
+ if (spill_used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
buf_space = hdl->sa_spill->db_size - spillhdrsize;
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index be308e2f87..634967c46f 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -267,6 +267,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
0, ZPROP_SRC_LOCAL);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+ }
+
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -481,7 +489,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (!error) {
objset_t *os;
- uint64_t compress;
+ uint64_t propval;
if (strval == NULL || strval[0] == '\0') {
objnum = zpool_prop_default_numeric(
@@ -492,15 +500,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (error = dmu_objset_hold(strval, FTAG, &os))
break;
- /* Must be ZPL and not gzip compressed. */
+ /*
+ * Must be ZPL, and its property settings
+ * must be supported by GRUB (compression
+ * is not gzip, and large blocks are not used).
+ */
if (dmu_objset_type(os) != DMU_OST_ZFS) {
error = SET_ERROR(ENOTSUP);
} else if ((error =
dsl_prop_get_int_ds(dmu_objset_ds(os),
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- &compress)) == 0 &&
- !BOOTFS_COMPRESS_VALID(compress)) {
+ &propval)) == 0 &&
+ !BOOTFS_COMPRESS_VALID(propval)) {
+ error = SET_ERROR(ENOTSUP);
+ } else if ((error =
+ dsl_prop_get_int_ds(dmu_objset_ds(os),
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ &propval)) == 0 &&
+ propval > SPA_OLD_MAXBLOCKSIZE) {
error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c
index cf72c8ad88..ce64f70b28 100644
--- a/usr/src/uts/common/fs/zfs/spa_history.c
+++ b/usr/src/uts/common/fs/zfs/spa_history.c
@@ -90,7 +90,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
ASSERT(spa->spa_history == 0);
spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
- SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
sizeof (spa_history_phys_t), tx);
VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 0504362726..1729ba0503 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -1963,3 +1963,12 @@ spa_debug_enabled(spa_t *spa)
{
return (spa->spa_debug);
}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SPA_MAXBLOCKSIZE);
+ else
+ return (SPA_OLD_MAXBLOCKSIZE);
+}
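
The new spa_maxblocksize() helper above is the pattern the rest of the patch leans on: callers ask the pool for its current ceiling instead of hard-coding SPA_MAXBLOCKSIZE, so a pool without the large_blocks feature stays capped at 128K. A minimal user-space sketch of the same decision, with the constants copied from sys/spa.h and a stand-in boolean in place of the real spa_t/feature machinery (fake_spa_t and fake_spa_maxblocksize() are illustrative names, not the real interfaces):

#include <stdio.h>
#include <stdint.h>

#define SPA_OLD_MAXBLOCKSIZE	(1ULL << 17)	/* 128K, pre-feature limit */
#define SPA_MAXBLOCKSIZE	(1ULL << 24)	/* 16M, with large_blocks */

/* Stand-in for spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS). */
typedef struct { int large_blocks_enabled; } fake_spa_t;

static uint64_t
fake_spa_maxblocksize(fake_spa_t *spa)
{
	return (spa->large_blocks_enabled ?
	    SPA_MAXBLOCKSIZE : SPA_OLD_MAXBLOCKSIZE);
}

int
main(void)
{
	fake_spa_t oldpool = { 0 }, newpool = { 1 };
	uint64_t blksz = 1ULL << 20;	/* a 1M block */

	/* Only the pool with the feature enabled accepts a 1M block. */
	printf("old pool accepts 1M block? %s\n",
	    blksz <= fake_spa_maxblocksize(&oldpool) ? "yes" : "no");
	printf("large_blocks pool accepts 1M block? %s\n",
	    blksz <= fake_spa_maxblocksize(&newpool) ? "yes" : "no");
	return (0);
}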
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index e9fa39d5d6..93165a9595 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -249,7 +249,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
* The maximum number of bytes that can be accessed as part of one
* operation, including metadata.
*/
-#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
#define DMU_USERUSED_OBJECT (-1ULL)
@@ -637,6 +637,7 @@ void xuio_stat_wbuf_copied();
void xuio_stat_wbuf_nocopy();
extern int zfs_prefetch_disable;
+extern int zfs_max_recordsize;
/*
* Asynchronously try to read in the data.
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index 23d88fd048..804f0c182b 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -95,6 +95,7 @@ struct objset {
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
+ int os_recordsize;
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
index dc183c02c3..3a8dc89abd 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
@@ -37,12 +37,14 @@ struct dsl_dataset;
struct drr_begin;
struct avl_tree;
-int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+int dmu_send(const char *tosnap, const char *fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
int outfd, struct vnode *vp, offset_t *off);
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
+ boolean_t embedok, boolean_t large_block_ok,
+ int outfd, struct vnode *vp, offset_t *off);
typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_ds;
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index d9552b2260..ff90f8b439 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -83,6 +83,13 @@ struct dsl_pool;
#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
/*
+ * This field is present (with value=0) if this dataset may contain large
+ * blocks (>128KB). If it is present, then this dataset
+ * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
+ */
+#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
@@ -135,6 +142,8 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
+ boolean_t ds_large_blocks;
+ boolean_t ds_need_large_blocks;
/* has internal locking: */
dsl_deadlist_t ds_deadlist;
@@ -244,6 +253,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
+int dsl_dataset_activate_large_blocks(const char *dsname);
+void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
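
The comment on DS_FIELD_LARGE_BLOCKS above is the bookkeeping rule that the dsl_destroy hunks earlier in this diff enforce: a dataset that may contain large blocks carries the (value 0) ZAP entry and contributes exactly one reference to SPA_FEATURE_LARGE_BLOCKS, which is dropped when the dataset is destroyed. A toy model of that refcounting, with a flag standing in for the ZAP entry and a plain counter for spa_feature_incr()/spa_feature_decr() (toy_dataset_t and the function names are invented for the sketch):

#include <stdio.h>

/* The flag models the DS_FIELD_LARGE_BLOCKS ZAP entry on a dataset. */
typedef struct { int has_large_blocks_field; } toy_dataset_t;

/* Models the pool-wide SPA_FEATURE_LARGE_BLOCKS refcount. */
static int feature_refcount;

static void
activate_large_blocks(toy_dataset_t *ds)
{
	if (!ds->has_large_blocks_field) {
		ds->has_large_blocks_field = 1;	/* add the (empty) ZAP entry */
		feature_refcount++;		/* spa_feature_incr() */
	}
}

static void
destroy_dataset(toy_dataset_t *ds)
{
	if (ds->has_large_blocks_field)
		feature_refcount--;		/* spa_feature_decr() */
}

int
main(void)
{
	toy_dataset_t a = { 0 }, b = { 0 };

	activate_large_blocks(&a);
	activate_large_blocks(&b);
	printf("refcount after activation: %d\n", feature_refcount);
	destroy_dataset(&a);
	destroy_dataset(&b);
	printf("refcount after destroy:    %d\n", feature_refcount);
	return (0);
}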
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 6c7d34cb82..e4731ae5a7 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -94,17 +94,26 @@ _NOTE(CONSTCOND) } while (0)
_NOTE(CONSTCOND) } while (0)
/*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
*/
#define SPA_MINBLOCKSHIFT 9
-#define SPA_MAXBLOCKSHIFT 17
+#define SPA_OLD_MAXBLOCKSHIFT 17
+#define SPA_MAXBLOCKSHIFT 24
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
/*
* Size of block to hold the configuration data (a packed nvlist)
*/
@@ -781,6 +790,7 @@ extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
extern int spa_mode(spa_t *spa);
extern uint64_t strtonum(const char *str, char **nptr);
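
The arithmetic behind the limits described in the spa.h comment above can be checked directly: the dnode records its data block size in dn_datablkszsec, a 16-bit count of 512-byte sectors, so the largest representable size is 65535 * 512 = 32MB - 512, and the largest power of two that fits is 16MB. A small sketch that prints the values implied by the shift macros (constants copied from the hunk; the uint16_t width of dn_datablkszsec is taken from the comment):

#include <stdio.h>
#include <stdint.h>

#define SPA_MINBLOCKSHIFT	9
#define SPA_OLD_MAXBLOCKSHIFT	17
#define SPA_MAXBLOCKSHIFT	24

int
main(void)
{
	uint64_t minblk = 1ULL << SPA_MINBLOCKSHIFT;
	uint64_t oldmax = 1ULL << SPA_OLD_MAXBLOCKSHIFT;
	uint64_t newmax = 1ULL << SPA_MAXBLOCKSHIFT;

	/* dn_datablkszsec is a uint16_t count of 512-byte sectors. */
	uint64_t dnode_cap = (uint64_t)UINT16_MAX << SPA_MINBLOCKSHIFT;

	printf("SPA_MINBLOCKSIZE     = %llu\n", (unsigned long long)minblk);
	printf("SPA_OLD_MAXBLOCKSIZE = %llu (128K)\n",
	    (unsigned long long)oldmax);
	printf("SPA_MAXBLOCKSIZE     = %llu (16M)\n",
	    (unsigned long long)newmax);
	printf("dn_datablkszsec cap  = %llu (32M - 512)\n",
	    (unsigned long long)dnode_cap);
	return (0);
}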
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 0326aa9c03..e717f131cc 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -60,7 +60,7 @@ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
uint64_t *ashift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef int vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef void vdev_hold_func_t(vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
index 466aab02ba..8b4a8b2b56 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -42,8 +42,7 @@ extern int fzap_default_block_shift;
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
-#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
-#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE
#define ZAP_NEED_CD (-1U)
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index bf9f83c376..62f6ff997d 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -85,13 +85,16 @@ typedef enum drr_headertype {
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
+/* flag #18 is reserved for a Delphix feature */
+#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
- DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
+ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
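
The DMU_BACKUP_FEATURE_MASK / DMU_STREAM_SUPPORTED pair above is a plain bitmask test: a receiver rejects any stream whose feature word carries a bit outside the mask it knows, which is how a system without this patch refuses a stream containing large-block records. A minimal sketch of that check with the flag values copied from the hunk (the other backup-feature bits are omitted here for brevity, so SUPPORTED_MASK is a trimmed stand-in for the real mask):

#include <stdio.h>
#include <stdint.h>

#define DMU_BACKUP_FEATURE_EMBED_DATA		(1 << 16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4	(1 << 17)
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS		(1 << 19)

/* Mask as an "old" receiver would have it: no large-blocks bit. */
#define SUPPORTED_MASK	(DMU_BACKUP_FEATURE_EMBED_DATA | \
	DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)

#define STREAM_SUPPORTED(x)	(!((x) & ~SUPPORTED_MASK))

int
main(void)
{
	uint64_t featureflags = DMU_BACKUP_FEATURE_EMBED_DATA |
	    DMU_BACKUP_FEATURE_LARGE_BLOCKS;

	/* The unknown LARGE_BLOCKS bit makes the stream unsupported here. */
	printf("stream supported by old receiver? %s\n",
	    STREAM_SUPPORTED(featureflags) ? "yes" : "no");
	return (0);
}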
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
index bc389b4a43..df08aad7b4 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -136,8 +136,6 @@ extern "C" {
#define ZFS_SHARES_DIR "SHARES"
#define ZFS_SA_ATTRS "SA_ATTRS"
-#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
-
/*
* Path component length
*
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index 15ef2aa8bf..895d632a26 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -90,7 +90,6 @@ typedef struct zil_chain {
} zil_chain_t;
#define ZIL_MIN_BLKSZ 4096ULL
-#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
/*
* The words of a log block checksum.
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
index 58566203b6..b5c666c02b 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -139,7 +139,7 @@ typedef struct zil_bp_node {
avl_node_t zn_node;
} zil_bp_node_t;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 248b08501c..6c1da96fe5 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -151,9 +151,6 @@ typedef enum zio_priority {
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
-#define ZIO_PIPELINE_CONTINUE 0x100
-#define ZIO_PIPELINE_STOP 0x101
-
enum zio_flag {
/*
* Flags inherited by gang, ddt, and vdev children,
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 7571b21a5f..67b58edeed 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -828,9 +828,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
/*
* Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the current "typical" blocksize.
- * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
- * or we will inconsistently account for existing bp's.
+ * in 128k (1 << 17) because it is the "typical" blocksize.
+ * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+ * otherwise it would inconsistently account for existing bp's.
*/
vd->vdev_deflate_ratio = (1 << 17) /
(vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index f584ca52a4..e4cc42452a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -725,7 +725,7 @@ vdev_disk_ioctl_done(void *zio_arg, int error)
zio_interrupt(zio);
}
-static int
+static void
vdev_disk_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -741,14 +741,16 @@ vdev_disk_io_start(zio_t *zio)
*/
if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
zio->io_error = ENXIO;
- return (ZIO_PIPELINE_CONTINUE);
+ zio_interrupt(zio);
+ return;
}
if (zio->io_type == ZIO_TYPE_IOCTL) {
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
- return (ZIO_PIPELINE_CONTINUE);
+ zio_interrupt(zio);
+ return;
}
switch (zio->io_cmd) {
@@ -779,7 +781,7 @@ vdev_disk_io_start(zio_t *zio)
* and will call vdev_disk_ioctl_done()
* upon completion.
*/
- return (ZIO_PIPELINE_STOP);
+ return;
}
if (error == ENOTSUP || error == ENOTTY) {
@@ -800,7 +802,8 @@ vdev_disk_io_start(zio_t *zio)
zio->io_error = SET_ERROR(ENOTSUP);
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
@@ -823,8 +826,6 @@ vdev_disk_io_start(zio_t *zio)
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
-
- return (ZIO_PIPELINE_STOP);
}
static void
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index a05abeb9d9..5dfc331d20 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -182,7 +182,7 @@ vdev_file_io_strategy(void *arg)
}
}
-static int
+static void
vdev_file_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -194,7 +194,8 @@ vdev_file_io_start(zio_t *zio)
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
- return (ZIO_PIPELINE_CONTINUE);
+ zio_interrupt(zio);
+ return;
}
switch (zio->io_cmd) {
@@ -206,7 +207,8 @@ vdev_file_io_start(zio_t *zio)
zio->io_error = SET_ERROR(ENOTSUP);
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
@@ -225,8 +227,6 @@ vdev_file_io_start(zio_t *zio)
VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
TQ_SLEEP), !=, 0);
-
- return (ZIO_PIPELINE_STOP);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index f62c1e3617..8749e539f4 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -260,7 +260,7 @@ vdev_mirror_child_select(zio_t *zio)
return (-1);
}
-static int
+static void
vdev_mirror_io_start(zio_t *zio)
{
mirror_map_t *mm;
@@ -285,7 +285,8 @@ vdev_mirror_io_start(zio_t *zio)
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
/*
* For normal reads just pick one child.
@@ -311,7 +312,7 @@ vdev_mirror_io_start(zio_t *zio)
c++;
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
static int
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index b9eb99d180..2287573342 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
/*
@@ -66,11 +66,11 @@ vdev_missing_close(vdev_t *vd)
}
/* ARGSUSED */
-static int
+static void
vdev_missing_io_start(zio_t *zio)
{
zio->io_error = SET_ERROR(ENOTSUP);
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 5d02f3e7ed..00246fff22 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -164,7 +164,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 480141dc63..085d1250a1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -1604,7 +1604,7 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
/*
* Don't write past the end of the block
*/
- VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+ VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
start = offset;
end = start + size;
@@ -1619,8 +1619,8 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
* KB size.
*/
rm = vdev_raidz_map_alloc(data - (offset - origoffset),
- SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
+ SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
+ vd->vdev_children, vd->vdev_nparity);
coloffset = origoffset;
@@ -1711,7 +1711,7 @@ vdev_raidz_child_done(zio_t *zio)
* vdevs have had errors, then create zio read operations to the parity
* columns' VDevs as well.
*/
-static int
+static void
vdev_raidz_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1759,7 +1759,8 @@ vdev_raidz_io_start(zio_t *zio)
ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1799,7 +1800,7 @@ vdev_raidz_io_start(zio_t *zio)
}
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
index 1152f9072f..59a9f97044 100644
--- a/usr/src/uts/common/fs/zfs/zap_micro.c
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -33,6 +33,7 @@
#include <sys/zap_leaf.h>
#include <sys/avl.h>
#include <sys/arc.h>
+#include <sys/dmu_objset.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
@@ -653,9 +654,9 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
- leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+ leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
indirect_blockshift >= SPA_MINBLOCKSHIFT &&
- indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+ indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
VERIFY(dmu_object_set_blocksize(os, obj,
1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -1345,7 +1346,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
zap_t *zap;
int err = 0;
-
/*
* Since, we don't have a name, we cannot figure out which blocks will
* be affected in this operation. So, account for the worst case :
@@ -1358,7 +1358,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
- *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
return (err);
}
@@ -1382,7 +1382,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
/*
* We treat this case as similar to (name == NULL)
*/
- *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
}
} else {
/*
@@ -1401,12 +1401,12 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* ptrtbl blocks
*/
if (dmu_buf_freeable(zap->zap_dbuf))
- *tooverwrite += SPA_MAXBLOCKSIZE;
+ *tooverwrite += MZAP_MAX_BLKSZ;
else
- *towrite += SPA_MAXBLOCKSIZE;
+ *towrite += MZAP_MAX_BLKSZ;
if (add) {
- *towrite += 4 * SPA_MAXBLOCKSIZE;
+ *towrite += 4 * MZAP_MAX_BLKSZ;
}
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index f04add92fe..ffb828c316 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -2414,7 +2414,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
uint64_t intval;
- int err;
+ int err = -1;
if (prop == ZPROP_INVAL) {
if (zfs_prop_userquota(propname))
@@ -3807,8 +3807,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
* the SPA supports it. We ignore any errors here since
* we'll catch them later.
*/
- if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(pair, &intval) == 0) {
+ if (nvpair_value_uint64(pair, &intval) == 0) {
if (intval >= ZIO_COMPRESS_GZIP_1 &&
intval <= ZIO_COMPRESS_GZIP_9 &&
zfs_earlier_version(dsname,
@@ -3859,6 +3858,42 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
return (SET_ERROR(ENOTSUP));
break;
+ case ZFS_PROP_RECORDSIZE:
+ /* Record sizes above 128k need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+
+ /*
+ * If this is a bootable dataset then
+ * we don't allow large (>128K) blocks,
+ * because GRUB doesn't support them.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ return (SET_ERROR(EDOM));
+ }
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (intval > zfs_max_recordsize ||
+ intval > SPA_MAXBLOCKSIZE)
+ return (SET_ERROR(EDOM));
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
@@ -4280,7 +4315,7 @@ out:
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
- * zc_flags if =1, WRITE_EMBEDDED records are permitted
+ * zc_flags lzc_send_flags
*
* outputs:
* zc_objset_type estimated size, if zc_guid is set
@@ -4292,6 +4327,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
+ boolean_t large_block_ok = (zc->zc_flags & 0x2);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4352,7 +4388,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
- zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
+ zc->zc_fromobj, embedok, large_block_ok,
+ zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5254,6 +5291,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* }
@@ -5268,6 +5307,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
offset_t off;
char *fromname = NULL;
int fd;
+ boolean_t largeblockok;
boolean_t embedok;
error = nvlist_lookup_int32(innvl, "fd", &fd);
@@ -5276,6 +5316,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+ largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
file_t *fp = getf(fd);
@@ -5283,7 +5324,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
return (SET_ERROR(EBADF));
off = fp->f_offset;
- error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
+ error = dmu_send(snapname, fromname, embedok, largeblockok,
+ fd, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
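
The ZFS_PROP_RECORDSIZE case added above layers three checks on any value over 128K: bootable datasets are refused outright (GRUB cannot read large blocks), the value must not exceed the zfs_max_recordsize tunable or SPA_MAXBLOCKSIZE, and the pool must have the large_blocks feature enabled. A user-space sketch of that decision order; check_recordsize() is an invented name, and the booleans stand in for zfs_is_bootfs() and spa_feature_is_enabled(), which need real pool state:

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define SPA_OLD_MAXBLOCKSIZE	(1ULL << 17)	/* 128K */
#define SPA_MAXBLOCKSIZE	(1ULL << 24)	/* 16M */

static uint64_t zfs_max_recordsize = 1ULL << 20;	/* 1M default tunable */

static int
check_recordsize(uint64_t intval, int is_bootfs, int large_blocks_enabled)
{
	if (intval <= SPA_OLD_MAXBLOCKSIZE)
		return (0);		/* legacy sizes are always allowed */
	if (is_bootfs)
		return (EDOM);		/* GRUB can't read large blocks */
	if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE)
		return (EDOM);		/* above the tunable or hard cap */
	if (!large_blocks_enabled)
		return (ENOTSUP);	/* feature not enabled on the pool */
	return (0);
}

int
main(void)
{
	printf("1M on plain fs, feature on : %d\n",
	    check_recordsize(1ULL << 20, 0, 1));
	printf("1M on bootfs               : %d\n",
	    check_recordsize(1ULL << 20, 1, 1));
	printf("1M, feature disabled       : %d\n",
	    check_recordsize(1ULL << 20, 0, 0));
	printf("4M with default tunable    : %d\n",
	    check_recordsize(1ULL << 22, 0, 1));
	return (0);
}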
diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c
index aeaba2233a..47d32a45c3 100644
--- a/usr/src/uts/common/fs/zfs/zfs_log.c
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c
@@ -485,7 +485,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
* If the write would overflow the largest block then split it.
*/
if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
- len = SPA_MAXBLOCKSIZE >> 1;
+ len = SPA_OLD_MAXBLOCKSIZE >> 1;
else
len = resid;
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 03f71cb841..f9b7986c56 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -273,10 +273,9 @@ static void
blksz_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
-
- if (newval < SPA_MINBLOCKSIZE ||
- newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
- newval = SPA_MAXBLOCKSIZE;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
zfsvfs->z_max_blksz = newval;
zfsvfs->z_vfs->vfs_bsize = newval;
@@ -907,7 +906,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
*/
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
zfsvfs->z_os = os;
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 05e37f0974..58468f2cbd 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -822,8 +822,14 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
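
The new_blksz logic above (and its twin in zfs_extend() below) only lets an already-oversized, non-power-of-2 block grow to the next power of two, instead of jumping straight to SPA_MAXBLOCKSIZE as the old code did; the rounding comes from 1 << highbit64(blksz). A standalone sketch with a local highbit64() written to mirror the illumos routine (1-based index of the highest set bit):

#include <stdio.h>
#include <stdint.h>

/* 1-based index of the highest set bit, 0 for 0 (mirrors illumos highbit64). */
static int
highbit64(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	/* A file whose block size is already 96K, which is not a power of 2. */
	uint64_t z_blksz = 96 * 1024;
	uint64_t end_size = 1ULL << 20;	/* the write extends the file to 1M */
	uint64_t roundup = 1ULL << highbit64(z_blksz);

	/* MIN(end_size, 1 << highbit64(z_blksz)), as in the hunk above. */
	uint64_t new_blksz = (end_size < roundup) ? end_size : roundup;

	printf("96K block grows to %llu bytes (128K), not 16M\n",
	    (unsigned long long)new_blksz);
	return (0);
}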
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 7577250408..4664899d13 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -58,6 +58,7 @@
#endif /* _KERNEL */
#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
@@ -1474,8 +1475,13 @@ zfs_extend(znode_t *zp, uint64_t end)
* We are growing the file past the current block size.
*/
if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
ASSERT(!ISP2(zp->z_blksz));
- newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
} else {
newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
}
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 6377285fe2..6beacadf71 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -220,6 +220,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
error = SET_ERROR(ECKSUM);
} else {
+ ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, len);
*end = (char *)dst + len;
*nbp = zilc->zc_next_blk;
@@ -234,6 +235,8 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
+ ASSERT3U(zilc->zc_nused, <=,
+ SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, zilc->zc_nused);
*end = (char *)dst + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
@@ -317,7 +320,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -364,7 +367,7 @@ done:
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
zil_bp_tree_fini(zilog);
- zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+ zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
@@ -896,7 +899,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
*
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
*/
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
@@ -978,7 +981,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
continue;
zil_blksz = zil_block_buckets[i];
if (zil_blksz == UINT64_MAX)
- zil_blksz = SPA_MAXBLOCKSIZE;
+ zil_blksz = SPA_OLD_MAXBLOCKSIZE;
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 5354f30e76..2bbb0c0684 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -65,6 +65,9 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
extern vmem_t *zio_alloc_arena;
#endif
+#define ZIO_PIPELINE_CONTINUE 0x100
+#define ZIO_PIPELINE_STOP 0x101
+
/*
* The following actions directly effect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action.
@@ -111,9 +114,8 @@ zio_init(void)
/*
* For small buffers, we want a cache for each multiple of
- * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
- * for each quarter-power of 2. For large buffers, we want
- * a cache for each multiple of PAGESIZE.
+ * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
+ * for each quarter-power of 2.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -136,10 +138,8 @@ zio_init(void)
#endif
if (size <= 4 * SPA_MINBLOCKSIZE) {
align = SPA_MINBLOCKSIZE;
- } else if (IS_P2ALIGNED(size, PAGESIZE)) {
- align = PAGESIZE;
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
- align = p2 >> 2;
+ align = MIN(p2 >> 2, PAGESIZE);
}
if (align != 0) {
@@ -2500,6 +2500,18 @@ zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
* Read and write to physical devices
* ==========================================================================
*/
+
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different than the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
static int
zio_vdev_io_start(zio_t *zio)
{
@@ -2517,7 +2529,8 @@ zio_vdev_io_start(zio_t *zio)
/*
* The mirror_ops handle multiple DVAs in a single BP.
*/
- return (vdev_mirror_ops.vdev_op_io_start(zio));
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return (ZIO_PIPELINE_STOP);
}
/*
@@ -2525,7 +2538,7 @@ zio_vdev_io_start(zio_t *zio)
* can quickly react to certain workloads. In particular, we care
* about non-scrubbing, top-level reads and writes with the following
* characteristics:
- * - synchronous writes of user data to non-slog devices
+ * - synchronous writes of user data to non-slog devices
* - any reads of user data
* When these conditions are met, adjust the timestamp of spa_last_io
* which allows the scan thread to adjust its workload accordingly.
@@ -2611,7 +2624,8 @@ zio_vdev_io_start(zio_t *zio)
}
}
- return (vd->vdev_ops->vdev_op_io_start(zio));
+ vd->vdev_ops->vdev_op_io_start(zio);
+ return (ZIO_PIPELINE_STOP);
}
static int
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 6fc43fa232..f681b1dc65 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -197,7 +197,7 @@ int
zvol_check_volblocksize(uint64_t volblocksize)
{
if (volblocksize < SPA_MINBLOCKSIZE ||
- volblocksize > SPA_MAXBLOCKSIZE ||
+ volblocksize > SPA_OLD_MAXBLOCKSIZE ||
!ISP2(volblocksize))
return (SET_ERROR(EDOM));
@@ -698,7 +698,7 @@ zvol_prealloc(zvol_state_t *zv)
while (resid != 0) {
int error;
- uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+ uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
@@ -1762,7 +1762,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
(void) strcpy(dki.dki_dname, "zvol");
dki.dki_ctype = DKC_UNKNOWN;
dki.dki_unit = getminor(dev);
- dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
+ dki.dki_maxtransfer =
+ 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
mutex_exit(&zfsdev_state_lock);
if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
error = SET_ERROR(EFAULT);
@@ -2079,14 +2080,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
&vbs, tx);
error = error ? error : dmu_object_set_blocksize(
- os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
+ os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
if (version >= SPA_VERSION_DEDUP) {
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
&dedup, tx);
}
if (error == 0)
- zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
+ zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
}
dmu_tx_commit(tx);
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 7155d9702b..569fae2091 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -192,6 +192,7 @@ typedef enum {
ZPOOL_PROP_FREEING,
ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED,
+ ZPOOL_PROP_MAXBLOCKSIZE,
ZPOOL_NUM_PROPS
} zpool_prop_t;