summaryrefslogtreecommitdiff
path: root/usr/src/uts/common
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common')
-rw-r--r--usr/src/uts/common/fs/zfs/bpobj.c5
-rw-r--r--usr/src/uts/common/fs/zfs/bptree.c2
-rw-r--r--usr/src/uts/common/fs/zfs/dbuf.c6
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_objset.c16
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_send.c83
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_traverse.c41
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_tx.c24
-rw-r--r--usr/src/uts/common/fs/zfs/dnode.c10
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_dataset.c105
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_deadlist.c8
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_destroy.c7
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_pool.c4
-rw-r--r--usr/src/uts/common/fs/zfs/metaslab.c2
-rw-r--r--usr/src/uts/common/fs/zfs/sa.c6
-rw-r--r--usr/src/uts/common/fs/zfs/spa.c26
-rw-r--r--usr/src/uts/common/fs/zfs/spa_history.c2
-rw-r--r--usr/src/uts/common/fs/zfs/spa_misc.c9
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu.h3
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_objset.h1
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_send.h6
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dsl_dataset.h11
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa.h22
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_impl.h2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zap_impl.h3
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h5
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zfs_znode.h2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zil.h1
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zil_impl.h2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio.h3
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c6
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_disk.c15
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_file.c12
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_mirror.c9
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_missing.c6
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_queue.c2
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_raidz.c15
-rw-r--r--usr/src/uts/common/fs/zfs/zap_micro.c16
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_ioctl.c54
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_log.c2
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_vfsops.c9
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_vnops.c8
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_znode.c8
-rw-r--r--usr/src/uts/common/fs/zfs/zil.c11
-rw-r--r--usr/src/uts/common/fs/zfs/zio.c32
-rw-r--r--usr/src/uts/common/fs/zfs/zvol.c11
-rw-r--r--usr/src/uts/common/sys/fs/zfs.h1
46 files changed, 467 insertions, 167 deletions
diff --git a/usr/src/uts/common/fs/zfs/bpobj.c b/usr/src/uts/common/fs/zfs/bpobj.c
index e75ae72f9e..da4d38a3a9 100644
--- a/usr/src/uts/common/fs/zfs/bpobj.c
+++ b/usr/src/uts/common/fs/zfs/bpobj.c
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
ASSERT0(dp->dp_empty_bpobj);
dp->dp_empty_bpobj =
- bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+ bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(os,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -396,7 +396,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpo->bpo_phys->bpo_subobjs == 0) {
bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
- DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
}
dmu_object_info_t doi;
diff --git a/usr/src/uts/common/fs/zfs/bptree.c b/usr/src/uts/common/fs/zfs/bptree.c
index c724ed0741..5f7d76f0e2 100644
--- a/usr/src/uts/common/fs/zfs/bptree.c
+++ b/usr/src/uts/common/fs/zfs/bptree.c
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
bptree_phys_t *bt;
obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
- SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
sizeof (bptree_phys_t), tx);
/*
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 040b1ac313..7d96048ca3 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -2033,10 +2033,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
return (SET_ERROR(ENOTSUP));
if (blksz == 0)
blksz = SPA_MINBLOCKSIZE;
- if (blksz > SPA_MAXBLOCKSIZE)
- blksz = SPA_MAXBLOCKSIZE;
- else
- blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 73b8e056cc..e7aeed17fb 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -255,6 +255,14 @@ logbias_changed_cb(void *arg, uint64_t newval)
zil_set_logbias(os->os_zil, newval);
}
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ os->os_recordsize = newval;
+}
+
void
dmu_objset_byteswap(void *buf, size_t size)
{
@@ -384,6 +392,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os);
+ }
}
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -642,6 +655,9 @@ dmu_objset_evict(objset_t *os)
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os));
+ VERIFY0(dsl_prop_unregister(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os));
}
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index fed1b86a1b..2c08e7075f 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -206,11 +206,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
- if (BP_IS_EMBEDDED(bp)) {
+ if (bp == NULL || BP_IS_EMBEDDED(bp)) {
/*
- * There's no pre-computed checksum of embedded BP's, so
- * (like fletcher4-checkummed blocks) userland will have
- * to compute a dedup-capable checksum itself.
+ * There's no pre-computed checksum for partial-block
+ * writes or embedded BP's, so (like
+ * fletcher4-checkummed blocks) userland will have to
+ * compute a dedup-capable checksum itself.
*/
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
@@ -372,6 +373,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
+ if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+ drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
@@ -491,6 +496,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
+ uint64_t offset;
ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level);
@@ -511,8 +517,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
- err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
- blksz, bp, abuf->b_data);
+ offset = zb->zb_blkid * blksz;
+
+ if (!(dsp->dsa_featureflags &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ blksz > SPA_OLD_MAXBLOCKSIZE) {
+ char *buf = abuf->b_data;
+ while (blksz > 0 && err == 0) {
+ int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+ err = dump_write(dsp, type, zb->zb_object,
+ offset, n, NULL, buf);
+ offset += n;
+ buf += n;
+ blksz -= n;
+ }
+ } else {
+ err = dump_write(dsp, type, zb->zb_object,
+ offset, blksz, bp, abuf->b_data);
+ }
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -526,7 +548,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
- int outfd, vnode_t *vp, offset_t *off)
+ boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
{
objset_t *os;
dmu_replay_record_t *drr;
@@ -561,6 +583,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
}
#endif
+ if (large_block_ok && ds->ds_large_blocks)
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -656,7 +680,8 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+ boolean_t embedok, boolean_t large_block_ok,
+ int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@@ -690,18 +715,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = fromds->ds_phys->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, vp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
}
int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -768,11 +794,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dsl_pool_rele(dp, FTAG);
return (err);
}
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, vp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
- outfd, vp, off);
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, vp, off);
}
if (owned)
dsl_dataset_disown(ds, FTAG);
@@ -972,6 +998,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
@@ -1097,6 +1132,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+ if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !newds->ds_large_blocks) {
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+ newds->ds_large_blocks = B_TRUE;
+ }
+
dmu_buf_will_dirty(newds->ds_dbuf, tx);
newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1222,6 +1264,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
/* some things will require 8-byte alignment, so everything must */
ASSERT0(len % 8);
+ ASSERT3U(len, <=, ra->bufsize);
while (done < len) {
ssize_t resid;
@@ -1361,7 +1404,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
@@ -1634,7 +1677,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
- drrs->drr_length > SPA_MAXBLOCKSIZE)
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
return (SET_ERROR(EINVAL));
data = restore_read(ra, drrs->drr_length, NULL);
@@ -1721,7 +1764,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra.cksum = drc->drc_cksum;
ra.vp = vp;
ra.voff = *voffp;
- ra.bufsize = 1<<20;
+ ra.bufsize = SPA_MAXBLOCKSIZE;
ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
/* these were verified in dmu_recv_begin */
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
index fb1ec0755c..5836549d20 100644
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -59,6 +59,7 @@ typedef struct traverse_data {
int td_flags;
prefetch_data_t *td_pfd;
boolean_t td_paused;
+ uint64_t td_hole_birth_enabled_txg;
blkptr_cb_t *td_func;
void *td_arg;
} traverse_data_t;
@@ -229,25 +230,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
}
if (bp->blk_birth == 0) {
- if (spa_feature_is_active(td->td_spa, SPA_FEATURE_HOLE_BIRTH)) {
- /*
- * Since this block has a birth time of 0 it must be a
- * hole created before the SPA_FEATURE_HOLE_BIRTH
- * feature was enabled. If SPA_FEATURE_HOLE_BIRTH
- * was enabled before the min_txg for this traveral we
- * know the hole must have been created before the
- * min_txg for this traveral, so we can skip it. If
- * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
- * for this traveral we cannot tell if the hole was
- * created before or after the min_txg for this
- * traversal, so we cannot skip it.
- */
- uint64_t hole_birth_enabled_txg;
- VERIFY(spa_feature_enabled_txg(td->td_spa,
- SPA_FEATURE_HOLE_BIRTH, &hole_birth_enabled_txg));
- if (hole_birth_enabled_txg < td->td_min_txg)
- return (0);
- }
+ /*
+ * Since this block has a birth time of 0 it must be a
+ * hole created before the SPA_FEATURE_HOLE_BIRTH
+ * feature was enabled. If SPA_FEATURE_HOLE_BIRTH
+ * was enabled before the min_txg for this traveral we
+ * know the hole must have been created before the
+ * min_txg for this traveral, so we can skip it. If
+ * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
+ * for this traveral we cannot tell if the hole was
+ * created before or after the min_txg for this
+ * traversal, so we cannot skip it.
+ */
+ if (td->td_hole_birth_enabled_txg < td->td_min_txg)
+ return (0);
} else if (bp->blk_birth <= td->td_min_txg) {
return (0);
}
@@ -523,6 +519,13 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td.td_flags = flags;
td.td_paused = B_FALSE;
+ if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ VERIFY(spa_feature_enabled_txg(spa,
+ SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
+ } else {
+ td.td_hole_birth_enabled_txg = 0;
+ }
+
pd.pd_blks_max = zfs_pd_blks_max;
pd.pd_flags = flags;
mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index b31e37cdb8..991de2b1bc 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -226,7 +226,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE);
min_bs = SPA_MINBLOCKSHIFT;
- max_bs = SPA_MAXBLOCKSHIFT;
+ max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
min_ibs = DN_MIN_INDBLKSHIFT;
max_ibs = DN_MAX_INDBLKSHIFT;
@@ -295,6 +295,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
*/
ASSERT(dn->dn_datablkshift != 0);
min_bs = max_bs = dn->dn_datablkshift;
+ } else {
+ /*
+ * The blocksize can increase up to the recordsize,
+ * or if it is already more than the recordsize,
+ * up to the next power of 2.
+ */
+ min_bs = highbit64(dn->dn_datablksz - 1);
+ max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
}
/*
@@ -752,11 +760,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
bp = &dn->dn_phys->dn_blkptr[0];
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += MZAP_MAX_BLKSZ;
if (!BP_IS_HOLE(bp))
- txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref += MZAP_MAX_BLKSZ;
return;
}
@@ -1545,18 +1553,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
/* If blkptr doesn't exist then add space to towrite */
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
} else {
blkptr_t *bp;
bp = &dn->dn_phys->dn_spill;
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
if (!BP_IS_HOLE(bp))
- txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
}
}
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 175157714d..9c70c2bfb1 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -510,10 +510,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
{
int i;
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
blocksize = 1 << zfs_default_bs;
- else if (blocksize > SPA_MAXBLOCKSIZE)
- blocksize = SPA_MAXBLOCKSIZE;
else
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
@@ -594,7 +594,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
ASSERT0(blocksize % SPA_MINBLOCKSIZE);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
@@ -1347,10 +1348,9 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
dmu_buf_impl_t *db;
int err;
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (size == 0)
size = SPA_MINBLOCKSIZE;
- if (size > SPA_MAXBLOCKSIZE)
- size = SPA_MAXBLOCKSIZE;
else
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index f1b92f3eaa..e7ed750902 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -50,6 +50,17 @@
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+
#define SWITCH64(x, y) \
{ \
uint64_t __tmp = (x); \
@@ -59,8 +70,6 @@
#define DS_REF_MAX (1ULL << 62)
-#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
-
/*
* Figure out how much of this delta should be propogated to the dsl_dir
* layer. If there's a refreservation, that space has already been
@@ -110,6 +119,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
ds->ds_phys->ds_compressed_bytes += compressed;
ds->ds_phys->ds_uncompressed_bytes += uncompressed;
ds->ds_phys->ds_unique_bytes += used;
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+ ds->ds_need_large_blocks = B_TRUE;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -387,6 +398,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
offsetof(dmu_sendarg_t, dsa_link));
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ err = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
+ if (err == 0)
+ ds->ds_large_blocks = B_TRUE;
+ else
+ ASSERT3U(err, ==, ENOENT);
+ }
+
if (err == 0) {
err = dsl_dir_hold_obj(dp,
ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
@@ -700,6 +719,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dsphys->ds_flags |= origin->ds_phys->ds_flags &
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+ if (origin->ds_large_blocks)
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
dmu_buf_will_dirty(origin->ds_dbuf, tx);
origin->ds_phys->ds_num_children++;
@@ -1213,6 +1235,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsphys->ds_bp = ds->ds_phys->ds_bp;
dmu_buf_rele(dbuf, FTAG);
+ if (ds->ds_large_blocks)
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
if (ds->ds_prev) {
uint64_t next_clones_obj =
@@ -1486,6 +1511,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
dmu_objset_sync(ds->ds_objset, zio, tx);
+
+ if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
+ dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+ ds->ds_large_blocks = B_TRUE;
+ }
}
static void
@@ -3128,6 +3158,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
return (err);
}
+static int
+dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
+{
+ const char *dsname = arg;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error = 0;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+
+ ASSERT(spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_EXTENSIBLE_DATASET));
+
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_large_blocks)
+ error = EALREADY;
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
+}
+
+void
+dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
+ sizeof (zero), 1, &zero, tx));
+}
+
+static void
+dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
+{
+ const char *dsname = arg;
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
+
+ dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+ ASSERT(!ds->ds_large_blocks);
+ ds->ds_large_blocks = B_TRUE;
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_activate_large_blocks(const char *dsname)
+{
+ int error;
+
+ error = dsl_sync_task(dsname,
+ dsl_dataset_activate_large_blocks_check,
+ dsl_dataset_activate_large_blocks_sync, (void *)dsname,
+ 1, ZFS_SPACE_CHECK_RESERVED);
+
+ /*
+ * EALREADY indicates that this dataset already supports large blocks.
+ */
+ if (error == EALREADY)
+ error = 0;
+ return (error);
+}
+
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
diff --git a/usr/src/uts/common/fs/zfs/dsl_deadlist.c b/usr/src/uts/common/fs/zfs/dsl_deadlist.c
index 4f39c397a0..8c8e3746ee 100644
--- a/usr/src/uts/common/fs/zfs/dsl_deadlist.c
+++ b/usr/src/uts/common/fs/zfs/dsl_deadlist.c
@@ -143,7 +143,7 @@ uint64_t
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
- return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+ return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
sizeof (dsl_deadlist_phys_t), tx));
}
@@ -180,7 +180,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
{
if (dle->dle_bpobj.bpo_object ==
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
- uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
bpobj_close(&dle->dle_bpobj);
bpobj_decr_empty(dl->dl_os, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -254,7 +254,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = mintxg;
- obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle);
@@ -338,7 +338,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
if (dle->dle_mintxg >= maxtxg)
break;
- obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx));
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_destroy.c b/usr/src/uts/common/fs/zfs/dsl_destroy.c
index f8a4546535..1237641583 100644
--- a/usr/src/uts/common/fs/zfs/dsl_destroy.c
+++ b/usr/src/uts/common/fs/zfs/dsl_destroy.c
@@ -264,6 +264,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
obj = ds->ds_object;
+ if (ds->ds_large_blocks) {
+ ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ }
if (ds->ds_phys->ds_prev_snap_obj != 0) {
ASSERT3P(ds->ds_prev, ==, NULL);
VERIFY0(dsl_dataset_hold_obj(dp,
@@ -720,6 +724,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT0(ds->ds_reserved);
}
+ if (ds->ds_large_blocks)
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+
dsl_scan_ds_destroyed(ds, tx);
obj = ds->ds_object;
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index f08f8559d6..2bb699d43e 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -368,7 +368,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
FREE_DIR_NAME, &dp->dp_free_dir));
/* create and open the free_bplist */
- obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -793,7 +793,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
* subobj support. So call dmu_object_alloc() directly.
*/
obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
- SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index a5d462cacf..3411fbf32c 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -130,7 +130,7 @@ int metaslab_debug_unload = 0;
* an allocation of this size then it switches to using more
* aggressive strategy (i.e search by size rather than offset).
*/
-uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
/*
* The minimum free space, in percent, which must be available
diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c
index bd2ebce5df..8b3963aed9 100644
--- a/usr/src/uts/common/fs/zfs/sa.c
+++ b/usr/src/uts/common/fs/zfs/sa.c
@@ -500,7 +500,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
if (size == 0) {
blocksize = SPA_MINBLOCKSIZE;
- } else if (size > SPA_MAXBLOCKSIZE) {
+ } else if (size > SPA_OLD_MAXBLOCKSIZE) {
ASSERT(0);
return (SET_ERROR(EFBIG));
} else {
@@ -675,7 +675,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
SA_BONUS, &i, &used, &spilling);
- if (used > SPA_MAXBLOCKSIZE)
+ if (used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -699,7 +699,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
attr_count - i, hdl->sa_spill, SA_SPILL, &i,
&spill_used, &dummy);
- if (spill_used > SPA_MAXBLOCKSIZE)
+ if (spill_used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
buf_space = hdl->sa_spill->db_size - spillhdrsize;
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index be308e2f87..634967c46f 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -267,6 +267,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
0, ZPROP_SRC_LOCAL);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+ }
+
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -481,7 +489,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (!error) {
objset_t *os;
- uint64_t compress;
+ uint64_t propval;
if (strval == NULL || strval[0] == '\0') {
objnum = zpool_prop_default_numeric(
@@ -492,15 +500,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (error = dmu_objset_hold(strval, FTAG, &os))
break;
- /* Must be ZPL and not gzip compressed. */
+ /*
+ * Must be ZPL, and its property settings
+ * must be supported by GRUB (compression
+ * is not gzip, and large blocks are not used).
+ */
if (dmu_objset_type(os) != DMU_OST_ZFS) {
error = SET_ERROR(ENOTSUP);
} else if ((error =
dsl_prop_get_int_ds(dmu_objset_ds(os),
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- &compress)) == 0 &&
- !BOOTFS_COMPRESS_VALID(compress)) {
+ &propval)) == 0 &&
+ !BOOTFS_COMPRESS_VALID(propval)) {
+ error = SET_ERROR(ENOTSUP);
+ } else if ((error =
+ dsl_prop_get_int_ds(dmu_objset_ds(os),
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ &propval)) == 0 &&
+ propval > SPA_OLD_MAXBLOCKSIZE) {
error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c
index cf72c8ad88..ce64f70b28 100644
--- a/usr/src/uts/common/fs/zfs/spa_history.c
+++ b/usr/src/uts/common/fs/zfs/spa_history.c
@@ -90,7 +90,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
ASSERT(spa->spa_history == 0);
spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
- SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
sizeof (spa_history_phys_t), tx);
VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 0504362726..1729ba0503 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -1963,3 +1963,12 @@ spa_debug_enabled(spa_t *spa)
{
return (spa->spa_debug);
}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SPA_MAXBLOCKSIZE);
+ else
+ return (SPA_OLD_MAXBLOCKSIZE);
+}
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index e9fa39d5d6..93165a9595 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -249,7 +249,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
* The maximum number of bytes that can be accessed as part of one
* operation, including metadata.
*/
-#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
#define DMU_USERUSED_OBJECT (-1ULL)
@@ -637,6 +637,7 @@ void xuio_stat_wbuf_copied();
void xuio_stat_wbuf_nocopy();
extern int zfs_prefetch_disable;
+extern int zfs_max_recordsize;
/*
* Asynchronously try to read in the data.
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index 23d88fd048..804f0c182b 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -95,6 +95,7 @@ struct objset {
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
+ int os_recordsize;
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
index dc183c02c3..3a8dc89abd 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
@@ -37,12 +37,14 @@ struct dsl_dataset;
struct drr_begin;
struct avl_tree;
-int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+int dmu_send(const char *tosnap, const char *fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
int outfd, struct vnode *vp, offset_t *off);
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
- boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
+ boolean_t embedok, boolean_t large_block_ok,
+ int outfd, struct vnode *vp, offset_t *off);
typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_ds;
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index d9552b2260..ff90f8b439 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -83,6 +83,13 @@ struct dsl_pool;
#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
/*
+ * This field is present (with value=0) if this dataset may contain large
+ * blocks (>128KB). If it is present, then this dataset
+ * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
+ */
+#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
@@ -135,6 +142,8 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
+ boolean_t ds_large_blocks;
+ boolean_t ds_need_large_blocks;
/* has internal locking: */
dsl_deadlist_t ds_deadlist;
@@ -244,6 +253,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
+int dsl_dataset_activate_large_blocks(const char *dsname);
+void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 6c7d34cb82..e4731ae5a7 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -94,17 +94,26 @@ _NOTE(CONSTCOND) } while (0)
_NOTE(CONSTCOND) } while (0)
/*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
*/
#define SPA_MINBLOCKSHIFT 9
-#define SPA_MAXBLOCKSHIFT 17
+#define SPA_OLD_MAXBLOCKSHIFT 17
+#define SPA_MAXBLOCKSHIFT 24
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
/*
* Size of block to hold the configuration data (a packed nvlist)
*/
@@ -781,6 +790,7 @@ extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
extern int spa_mode(spa_t *spa);
extern uint64_t strtonum(const char *str, char **nptr);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 0326aa9c03..e717f131cc 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -60,7 +60,7 @@ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
uint64_t *ashift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef int vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef void vdev_hold_func_t(vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
index 466aab02ba..8b4a8b2b56 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -42,8 +42,7 @@ extern int fzap_default_block_shift;
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
-#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
-#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE
#define ZAP_NEED_CD (-1U)
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index bf9f83c376..62f6ff997d 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -85,13 +85,16 @@ typedef enum drr_headertype {
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
+/* flag #18 is reserved for a Delphix feature */
+#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
- DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
+ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
index bc389b4a43..df08aad7b4 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -136,8 +136,6 @@ extern "C" {
#define ZFS_SHARES_DIR "SHARES"
#define ZFS_SA_ATTRS "SA_ATTRS"
-#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
-
/*
* Path component length
*
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index 15ef2aa8bf..895d632a26 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -90,7 +90,6 @@ typedef struct zil_chain {
} zil_chain_t;
#define ZIL_MIN_BLKSZ 4096ULL
-#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
/*
* The words of a log block checksum.
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
index 58566203b6..b5c666c02b 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -139,7 +139,7 @@ typedef struct zil_bp_node {
avl_node_t zn_node;
} zil_bp_node_t;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 248b08501c..6c1da96fe5 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -151,9 +151,6 @@ typedef enum zio_priority {
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
-#define ZIO_PIPELINE_CONTINUE 0x100
-#define ZIO_PIPELINE_STOP 0x101
-
enum zio_flag {
/*
* Flags inherited by gang, ddt, and vdev children,
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 7571b21a5f..67b58edeed 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -828,9 +828,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
/*
* Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the current "typical" blocksize.
- * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
- * or we will inconsistently account for existing bp's.
+ * in 128k (1 << 17) because it is the "typical" blocksize.
+ * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+ * otherwise it would inconsistently account for existing bp's.
*/
vd->vdev_deflate_ratio = (1 << 17) /
(vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index f584ca52a4..e4cc42452a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -725,7 +725,7 @@ vdev_disk_ioctl_done(void *zio_arg, int error)
zio_interrupt(zio);
}
-static int
+static void
vdev_disk_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -741,14 +741,16 @@ vdev_disk_io_start(zio_t *zio)
*/
if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
zio->io_error = ENXIO;
- return (ZIO_PIPELINE_CONTINUE);
+ zio_interrupt(zio);
+ return;
}
if (zio->io_type == ZIO_TYPE_IOCTL) {
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
- return (ZIO_PIPELINE_CONTINUE);
+ zio_interrupt(zio);
+ return;
}
switch (zio->io_cmd) {
@@ -779,7 +781,7 @@ vdev_disk_io_start(zio_t *zio)
* and will call vdev_disk_ioctl_done()
* upon completion.
*/
- return (ZIO_PIPELINE_STOP);
+ return;
}
if (error == ENOTSUP || error == ENOTTY) {
@@ -800,7 +802,8 @@ vdev_disk_io_start(zio_t *zio)
zio->io_error = SET_ERROR(ENOTSUP);
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
@@ -823,8 +826,6 @@ vdev_disk_io_start(zio_t *zio)
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
-
- return (ZIO_PIPELINE_STOP);
}
static void
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index a05abeb9d9..5dfc331d20 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -182,7 +182,7 @@ vdev_file_io_strategy(void *arg)
}
}
-static int
+static void
vdev_file_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -194,7 +194,8 @@ vdev_file_io_start(zio_t *zio)
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
- return (ZIO_PIPELINE_CONTINUE);
+ zio_interrupt(zio);
+ return;
}
switch (zio->io_cmd) {
@@ -206,7 +207,8 @@ vdev_file_io_start(zio_t *zio)
zio->io_error = SET_ERROR(ENOTSUP);
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
@@ -225,8 +227,6 @@ vdev_file_io_start(zio_t *zio)
VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
TQ_SLEEP), !=, 0);
-
- return (ZIO_PIPELINE_STOP);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index f62c1e3617..8749e539f4 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -260,7 +260,7 @@ vdev_mirror_child_select(zio_t *zio)
return (-1);
}
-static int
+static void
vdev_mirror_io_start(zio_t *zio)
{
mirror_map_t *mm;
@@ -285,7 +285,8 @@ vdev_mirror_io_start(zio_t *zio)
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
/*
* For normal reads just pick one child.
@@ -311,7 +312,7 @@ vdev_mirror_io_start(zio_t *zio)
c++;
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
static int
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
index b9eb99d180..2287573342 100644
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
/*
@@ -66,11 +66,11 @@ vdev_missing_close(vdev_t *vd)
}
/* ARGSUSED */
-static int
+static void
vdev_missing_io_start(zio_t *zio)
{
zio->io_error = SET_ERROR(ENOTSUP);
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 5d02f3e7ed..00246fff22 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -164,7 +164,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 480141dc63..085d1250a1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -1604,7 +1604,7 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
/*
* Don't write past the end of the block
*/
- VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+ VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
start = offset;
end = start + size;
@@ -1619,8 +1619,8 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
* KB size.
*/
rm = vdev_raidz_map_alloc(data - (offset - origoffset),
- SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
+ SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
+ vd->vdev_children, vd->vdev_nparity);
coloffset = origoffset;
@@ -1711,7 +1711,7 @@ vdev_raidz_child_done(zio_t *zio)
* vdevs have had errors, then create zio read operations to the parity
* columns' VDevs as well.
*/
-static int
+static void
vdev_raidz_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1759,7 +1759,8 @@ vdev_raidz_io_start(zio_t *zio)
ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1799,7 +1800,7 @@ vdev_raidz_io_start(zio_t *zio)
}
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
index 1152f9072f..59a9f97044 100644
--- a/usr/src/uts/common/fs/zfs/zap_micro.c
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -33,6 +33,7 @@
#include <sys/zap_leaf.h>
#include <sys/avl.h>
#include <sys/arc.h>
+#include <sys/dmu_objset.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
@@ -653,9 +654,9 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
- leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+ leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
indirect_blockshift >= SPA_MINBLOCKSHIFT &&
- indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+ indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
VERIFY(dmu_object_set_blocksize(os, obj,
1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -1345,7 +1346,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
zap_t *zap;
int err = 0;
-
/*
* Since, we don't have a name, we cannot figure out which blocks will
* be affected in this operation. So, account for the worst case :
@@ -1358,7 +1358,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
- *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
return (err);
}
@@ -1382,7 +1382,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
/*
* We treat this case as similar to (name == NULL)
*/
- *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
}
} else {
/*
@@ -1401,12 +1401,12 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* ptrtbl blocks
*/
if (dmu_buf_freeable(zap->zap_dbuf))
- *tooverwrite += SPA_MAXBLOCKSIZE;
+ *tooverwrite += MZAP_MAX_BLKSZ;
else
- *towrite += SPA_MAXBLOCKSIZE;
+ *towrite += MZAP_MAX_BLKSZ;
if (add) {
- *towrite += 4 * SPA_MAXBLOCKSIZE;
+ *towrite += 4 * MZAP_MAX_BLKSZ;
}
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index f04add92fe..ffb828c316 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -2414,7 +2414,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
uint64_t intval;
- int err;
+ int err = -1;
if (prop == ZPROP_INVAL) {
if (zfs_prop_userquota(propname))
@@ -3807,8 +3807,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
* the SPA supports it. We ignore any errors here since
* we'll catch them later.
*/
- if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(pair, &intval) == 0) {
+ if (nvpair_value_uint64(pair, &intval) == 0) {
if (intval >= ZIO_COMPRESS_GZIP_1 &&
intval <= ZIO_COMPRESS_GZIP_9 &&
zfs_earlier_version(dsname,
@@ -3859,6 +3858,42 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
return (SET_ERROR(ENOTSUP));
break;
+ case ZFS_PROP_RECORDSIZE:
+ /* Record sizes above 128k need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+
+ /*
+ * If this is a bootable dataset then
+ * the we don't allow large (>128K) blocks,
+ * because GRUB doesn't support them.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ return (SET_ERROR(EDOM));
+ }
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (intval > zfs_max_recordsize ||
+ intval > SPA_MAXBLOCKSIZE)
+ return (SET_ERROR(EDOM));
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
@@ -4280,7 +4315,7 @@ out:
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
- * zc_flags if =1, WRITE_EMBEDDED records are permitted
+ * zc_flags lzc_send_flags
*
* outputs:
* zc_objset_type estimated size, if zc_guid is set
@@ -4292,6 +4327,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
+ boolean_t large_block_ok = (zc->zc_flags & 0x2);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4352,7 +4388,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
- zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
+ zc->zc_fromobj, embedok, large_block_ok,
+ zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5254,6 +5291,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* }
@@ -5268,6 +5307,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
offset_t off;
char *fromname = NULL;
int fd;
+ boolean_t largeblockok;
boolean_t embedok;
error = nvlist_lookup_int32(innvl, "fd", &fd);
@@ -5276,6 +5316,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+ largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
file_t *fp = getf(fd);
@@ -5283,7 +5324,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
return (SET_ERROR(EBADF));
off = fp->f_offset;
- error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
+ error = dmu_send(snapname, fromname, embedok, largeblockok,
+ fd, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c
index aeaba2233a..47d32a45c3 100644
--- a/usr/src/uts/common/fs/zfs/zfs_log.c
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c
@@ -485,7 +485,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
* If the write would overflow the largest block then split it.
*/
if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
- len = SPA_MAXBLOCKSIZE >> 1;
+ len = SPA_OLD_MAXBLOCKSIZE >> 1;
else
len = resid;
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 03f71cb841..f9b7986c56 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -273,10 +273,9 @@ static void
blksz_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
-
- if (newval < SPA_MINBLOCKSIZE ||
- newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
- newval = SPA_MAXBLOCKSIZE;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
zfsvfs->z_max_blksz = newval;
zfsvfs->z_vfs->vfs_bsize = newval;
@@ -907,7 +906,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
*/
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
zfsvfs->z_os = os;
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 05e37f0974..58468f2cbd 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -822,8 +822,14 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 7577250408..4664899d13 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -58,6 +58,7 @@
#endif /* _KERNEL */
#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
@@ -1474,8 +1475,13 @@ zfs_extend(znode_t *zp, uint64_t end)
* We are growing the file past the current block size.
*/
if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
ASSERT(!ISP2(zp->z_blksz));
- newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
} else {
newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
}
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 6377285fe2..6beacadf71 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -220,6 +220,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
error = SET_ERROR(ECKSUM);
} else {
+ ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, len);
*end = (char *)dst + len;
*nbp = zilc->zc_next_blk;
@@ -234,6 +235,8 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
+ ASSERT3U(zilc->zc_nused, <=,
+ SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, zilc->zc_nused);
*end = (char *)dst + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
@@ -317,7 +320,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -364,7 +367,7 @@ done:
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
zil_bp_tree_fini(zilog);
- zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+ zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
@@ -896,7 +899,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
*
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
*/
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
@@ -978,7 +981,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
continue;
zil_blksz = zil_block_buckets[i];
if (zil_blksz == UINT64_MAX)
- zil_blksz = SPA_MAXBLOCKSIZE;
+ zil_blksz = SPA_OLD_MAXBLOCKSIZE;
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 5354f30e76..2bbb0c0684 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -65,6 +65,9 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
extern vmem_t *zio_alloc_arena;
#endif
+#define ZIO_PIPELINE_CONTINUE 0x100
+#define ZIO_PIPELINE_STOP 0x101
+
/*
* The following actions directly effect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action.
@@ -111,9 +114,8 @@ zio_init(void)
/*
* For small buffers, we want a cache for each multiple of
- * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
- * for each quarter-power of 2. For large buffers, we want
- * a cache for each multiple of PAGESIZE.
+ * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
+ * for each quarter-power of 2.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -136,10 +138,8 @@ zio_init(void)
#endif
if (size <= 4 * SPA_MINBLOCKSIZE) {
align = SPA_MINBLOCKSIZE;
- } else if (IS_P2ALIGNED(size, PAGESIZE)) {
- align = PAGESIZE;
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
- align = p2 >> 2;
+ align = MIN(p2 >> 2, PAGESIZE);
}
if (align != 0) {
@@ -2500,6 +2500,18 @@ zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
* Read and write to physical devices
* ==========================================================================
*/
+
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different than the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
static int
zio_vdev_io_start(zio_t *zio)
{
@@ -2517,7 +2529,8 @@ zio_vdev_io_start(zio_t *zio)
/*
* The mirror_ops handle multiple DVAs in a single BP.
*/
- return (vdev_mirror_ops.vdev_op_io_start(zio));
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return (ZIO_PIPELINE_STOP);
}
/*
@@ -2525,7 +2538,7 @@ zio_vdev_io_start(zio_t *zio)
* can quickly react to certain workloads. In particular, we care
* about non-scrubbing, top-level reads and writes with the following
* characteristics:
- * - synchronous writes of user data to non-slog devices
+ * - synchronous writes of user data to non-slog devices
* - any reads of user data
* When these conditions are met, adjust the timestamp of spa_last_io
* which allows the scan thread to adjust its workload accordingly.
@@ -2611,7 +2624,8 @@ zio_vdev_io_start(zio_t *zio)
}
}
- return (vd->vdev_ops->vdev_op_io_start(zio));
+ vd->vdev_ops->vdev_op_io_start(zio);
+ return (ZIO_PIPELINE_STOP);
}
static int
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 6fc43fa232..f681b1dc65 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -197,7 +197,7 @@ int
zvol_check_volblocksize(uint64_t volblocksize)
{
if (volblocksize < SPA_MINBLOCKSIZE ||
- volblocksize > SPA_MAXBLOCKSIZE ||
+ volblocksize > SPA_OLD_MAXBLOCKSIZE ||
!ISP2(volblocksize))
return (SET_ERROR(EDOM));
@@ -698,7 +698,7 @@ zvol_prealloc(zvol_state_t *zv)
while (resid != 0) {
int error;
- uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+ uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
@@ -1762,7 +1762,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
(void) strcpy(dki.dki_dname, "zvol");
dki.dki_ctype = DKC_UNKNOWN;
dki.dki_unit = getminor(dev);
- dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
+ dki.dki_maxtransfer =
+ 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
mutex_exit(&zfsdev_state_lock);
if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
error = SET_ERROR(EFAULT);
@@ -2079,14 +2080,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
&vbs, tx);
error = error ? error : dmu_object_set_blocksize(
- os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
+ os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
if (version >= SPA_VERSION_DEDUP) {
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
&dedup, tx);
}
if (error == 0)
- zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
+ zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
}
dmu_tx_commit(tx);
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 7155d9702b..569fae2091 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -192,6 +192,7 @@ typedef enum {
ZPOOL_PROP_FREEING,
ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED,
+ ZPOOL_PROP_MAXBLOCKSIZE,
ZPOOL_NUM_PROPS
} zpool_prop_t;