author		Jerry Jelinek <jerry.jelinek@joyent.com>	2017-05-24 11:46:08 +0000
committer	Jerry Jelinek <jerry.jelinek@joyent.com>	2017-05-24 11:46:08 +0000
commit		5afadcad0a6461d081c4e5bf91eef476df0632c3 (patch)
tree		0f08abbc25279097e044b4edcb3623a765544201
parent		2250f63cda40ce2b0b403556720b3513e937d810 (diff)
parent		bc83969fdbd1cb0d97ba00218c0a3de5c89fba92 (diff)
download	illumos-joyent-5afadcad0a6461d081c4e5bf91eef476df0632c3.tar.gz
[illumos-gate merge]
commit bc83969fdbd1cb0d97ba00218c0a3de5c89fba92
	8265 Reserve send stream flag for large dnode feature
commit d77f81966c796267d14a0cc81ff3491176405cc0
	8067 zdb should be able to dump literal embedded block pointer
commit c5ee46810f82e8a53d2cc5a487568a573f449039
	7578 Fix/improve some aspects of ZIL writing.
-rw-r--r--	usr/src/cmd/zdb/zdb.c				52
-rw-r--r--	usr/src/cmd/ztest/ztest.c			1
-rw-r--r--	usr/src/man/man1m/zdb.1m			12
-rw-r--r--	usr/src/uts/common/fs/zfs/blkptr.c		33
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/blkptr.h		1
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	1
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/zil.h		1
-rw-r--r--	usr/src/uts/common/fs/zfs/sys/zil_impl.h	20
-rw-r--r--	usr/src/uts/common/fs/zfs/zfs_log.c		37
-rw-r--r--	usr/src/uts/common/fs/zfs/zil.c			107
-rw-r--r--	usr/src/uts/common/fs/zfs/zio.c			17
-rw-r--r--	usr/src/uts/common/fs/zfs/zvol.c		50
13 files changed, 214 insertions(+), 120 deletions(-)
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 0137e6f448..1cf3e85ce0 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -61,6 +61,7 @@
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
+#include <sys/blkptr.h>
#include <zfs_comutil.h>
#undef verify
#include <libzfs.h>
@@ -134,10 +135,11 @@ usage(void)
"\t%s -O <dataset> <path>\n"
"\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
"\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
+ "\t%s -E [-A] word0:word1:...:word15\n"
"\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
"<poolname>\n\n",
cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
- cmdname);
+ cmdname, cmdname);
(void) fprintf(stderr, " Dataset name must include at least one "
"separator character '/' or '@'\n");
@@ -152,6 +154,8 @@ usage(void)
(void) fprintf(stderr, " -C config (or cachefile if alone)\n");
(void) fprintf(stderr, " -d dataset(s)\n");
(void) fprintf(stderr, " -D dedup statistics\n");
+ (void) fprintf(stderr, " -E decode and display block from an "
+ "embedded block pointer\n");
(void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -i intent logs\n");
(void) fprintf(stderr, " -l read label contents\n");
@@ -3623,6 +3627,33 @@ out:
free(dup);
}
+static void
+zdb_embedded_block(char *thing)
+{
+ blkptr_t bp = { 0 };
+ unsigned long long *words = (void *)&bp;
+ char buf[SPA_MAXBLOCKSIZE];
+ int err;
+
+ err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
+ "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
+ words + 0, words + 1, words + 2, words + 3,
+ words + 4, words + 5, words + 6, words + 7,
+ words + 8, words + 9, words + 10, words + 11,
+ words + 12, words + 13, words + 14, words + 15);
+ if (err != 16) {
+ (void) printf("invalid input format\n");
+ exit(1);
+ }
+ ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
+ err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
+ if (err != 0) {
+ (void) printf("decode failed: %u\n", err);
+ exit(1);
+ }
+ zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
+}
+
static boolean_t
pool_match(nvlist_t *cfg, char *tgt)
{
@@ -3741,13 +3772,14 @@ main(int argc, char **argv)
spa_config_path = spa_config_path_env;
while ((c = getopt(argc, argv,
- "AbcCdDeFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
+ "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
switch (c) {
case 'b':
case 'c':
case 'C':
case 'd':
case 'D':
+ case 'E':
case 'G':
case 'h':
case 'i':
@@ -3811,6 +3843,12 @@ main(int argc, char **argv)
break;
case 'U':
spa_config_path = optarg;
+ if (spa_config_path[0] != '/') {
+ (void) fprintf(stderr,
+ "cachefile must be an absolute path "
+ "(i.e. start with a slash)\n");
+ usage();
+ }
break;
case 'v':
verbose++;
@@ -3858,7 +3896,7 @@ main(int argc, char **argv)
verbose = MAX(verbose, 1);
for (c = 0; c < 256; c++) {
- if (dump_all && strchr("AeFlLOPRSX", c) == NULL)
+ if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
@@ -3872,6 +3910,14 @@ main(int argc, char **argv)
if (argc < 2 && dump_opt['R'])
usage();
+
+ if (dump_opt['E']) {
+ if (argc != 1)
+ usage();
+ zdb_embedded_block(argv[0]);
+ return (0);
+ }
+
if (argc < 1) {
if (!dump_opt['e'] && dump_opt['C']) {
dump_cachefile(spa_config_path);
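Note: the sixteen words consumed by the new -E option are simply the raw 128-byte blkptr_t read as native-endian 64-bit integers, mirroring the sscanf() in zdb_embedded_block() above. As a sketch of the inverse, under that same word view, here is a hypothetical helper (not part of this commit) that formats a blkptr_t into the word0:...:word15 string the option accepts:

	#include <stdio.h>

	/*
	 * Hypothetical: format a blkptr_t as "word0:word1:...:word15" for
	 * feeding back into "zdb -E".  Treats the bp as sixteen 64-bit
	 * words, exactly the layout zdb_embedded_block() parses.
	 */
	static void
	format_bp_words(const void *bp, char *out, size_t outlen)
	{
		const unsigned long long *words = bp;
		size_t off = 0;
		int i;

		for (i = 0; i < 16 && off < outlen; i++) {
			off += snprintf(out + off, outlen - off, "%llx%s",
			    words[i], (i < 15) ? ":" : "");
		}
	}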
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 16f79b52ef..91abd30e9c 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -1377,7 +1377,6 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
itx->itx_private = zd;
itx->itx_wr_state = write_state;
itx->itx_sync = (ztest_random(8) == 0);
- itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
sizeof (*lr) - sizeof (lr_t));
diff --git a/usr/src/man/man1m/zdb.1m b/usr/src/man/man1m/zdb.1m
index 1bd47bbfa4..f341df4e20 100644
--- a/usr/src/man/man1m/zdb.1m
+++ b/usr/src/man/man1m/zdb.1m
@@ -10,10 +10,10 @@
.\"
.\"
.\" Copyright 2012, Richard Lowe.
-.\" Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\"
-.Dd January 14, 2017
+.Dd April 14, 2017
.Dt ZDB 1M
.Os
.Sh NAME
@@ -39,6 +39,10 @@
.Op Fl A
.Op Fl U Ar cache
.Nm
+.Fl E
+.Op Fl A
+.Ar word0 Ns : Ns Ar word1 Ns :...: Ns Ar word15
+.Nm
.Fl l
.Op Fl Aqu
.Ar device
@@ -153,6 +157,10 @@ Display the statistics independently for each deduplication table.
Dump the contents of the deduplication tables describing duplicate blocks.
.It Fl DDDDD
Also dump the contents of the deduplication tables describing unique blocks.
+.It Fl E Ar word0 Ns : Ns Ar word1 Ns :...: Ns Ar word15
+Decode and display block from an embedded block pointer specified by the
+.Ar word
+arguments.
.It Fl h
Display pool history similar to
.Nm zpool Cm history ,
diff --git a/usr/src/uts/common/fs/zfs/blkptr.c b/usr/src/uts/common/fs/zfs/blkptr.c
index ff93ff4456..779539abbc 100644
--- a/usr/src/uts/common/fs/zfs/blkptr.c
+++ b/usr/src/uts/common/fs/zfs/blkptr.c
@@ -117,3 +117,36 @@ decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
}
}
+
+/*
+ * Fill in the buffer with the (decompressed) payload of the embedded
+ * blkptr_t. Takes into account compression and byteorder (the payload is
+ * treated as a stream of bytes).
+ * Return 0 on success, or ENOSPC if it won't fit in the buffer.
+ */
+int
+decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
+{
+ int lsize, psize;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ lsize = BPE_GET_LSIZE(bp);
+ psize = BPE_GET_PSIZE(bp);
+
+ if (lsize > buflen)
+ return (ENOSPC);
+ ASSERT3U(lsize, ==, buflen);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint8_t dstbuf[BPE_PAYLOAD_SIZE];
+ decode_embedded_bp_compressed(bp, dstbuf);
+ VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp),
+ dstbuf, buf, psize, buflen));
+ } else {
+ ASSERT3U(lsize, ==, psize);
+ decode_embedded_bp_compressed(bp, buf);
+ }
+
+ return (0);
+}
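Note the contract decode_embedded_bp() enforces: callers must size the buffer to exactly BPE_GET_LSIZE(bp); a shorter buffer gets ENOSPC, and a larger one trips the ASSERT on DEBUG builds. A minimal sketch of a conforming caller, assuming the <sys/blkptr.h> context above (read_embedded() is hypothetical; the macros are the ones the function itself uses):

	static int
	read_embedded(const blkptr_t *bp, void *buf, int buflen)
	{
		if (!BP_IS_EMBEDDED(bp))
			return (EINVAL);	/* not an embedded bp */
		if (BPE_GET_LSIZE(bp) > buflen)
			return (ENOSPC);	/* matches the check above */
		return (decode_embedded_bp(bp, buf, BPE_GET_LSIZE(bp)));
	}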
diff --git a/usr/src/uts/common/fs/zfs/sys/blkptr.h b/usr/src/uts/common/fs/zfs/sys/blkptr.h
index b720482a73..77b1b827ac 100644
--- a/usr/src/uts/common/fs/zfs/sys/blkptr.h
+++ b/usr/src/uts/common/fs/zfs/sys/blkptr.h
@@ -30,6 +30,7 @@ extern "C" {
void encode_embedded_bp_compressed(blkptr_t *, void *,
enum zio_compress, int, int);
void decode_embedded_bp_compressed(const blkptr_t *, void *);
+int decode_embedded_bp(const blkptr_t *, void *, int);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index d86e3b45f1..16514e6939 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -93,6 +93,7 @@ typedef enum drr_headertype {
#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
/* flag #21 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22)
+/* flag #23 is reserved for the large dnode feature */
/*
* Mask of all supported backup features
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index 319a24988f..23ef83dfe4 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -378,7 +378,6 @@ typedef struct itx {
void *itx_private; /* type-specific opaque data */
itx_wr_state_t itx_wr_state; /* write state */
uint8_t itx_sync; /* synchronous transaction */
- uint64_t itx_sod; /* record size on disk */
uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
index ac908bd322..1613033daf 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -42,6 +42,7 @@ extern "C" {
typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */
+ boolean_t lwb_slog; /* lwb_blk is on SLOG device */
int lwb_nused; /* # used bytes in buffer */
int lwb_sz; /* size of block and buffer */
char *lwb_buf; /* log write buffer */
@@ -62,7 +63,6 @@ typedef struct itxs {
typedef struct itxg {
kmutex_t itxg_lock; /* lock for this structure */
uint64_t itxg_txg; /* txg for this chain */
- uint64_t itxg_sod; /* total size on disk for this txg */
itxs_t *itxg_itxs; /* sync and async itxs */
} itxg_t;
@@ -120,7 +120,6 @@ struct zilog {
kcondvar_t zl_cv_batch[2]; /* batch condition variables */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
list_t zl_itx_commit_list; /* itx list to be committed */
- uint64_t zl_itx_list_sz; /* total size of records on list */
uint64_t zl_cur_used; /* current commit log size used */
list_t zl_lwb_list; /* in-flight log write list */
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
@@ -140,9 +139,26 @@ typedef struct zil_bp_node {
avl_node_t zn_node;
} zil_bp_node_t;
+/*
+ * Maximum amount of write data that can be put into single log block.
+ */
#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))
+/*
+ * Maximum amount of log space we agree to waste to reduce number of
+ * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
+ */
+#define ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8)
+
+/*
+ * Maximum amount of write data for WR_COPIED. Fall back to WR_NEED_COPY
+ * as more space efficient if we can't fit at least two log records into
+ * maximum sized log block.
+ */
+#define ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \
+ sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
+
#ifdef __cplusplus
}
#endif
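To put numbers on the new limits, assume a 64-bit build where sizeof (zil_chain_t) is 184 bytes and sizeof (lr_write_t) is 192 bytes (both figures are assumptions here; the real values depend on structure layout):

	/*
	 * Illustrative arithmetic only, under the assumed structure sizes:
	 *
	 *   ZIL_MAX_LOG_DATA    = 131072 - 184 - 192       = 130696 bytes
	 *   ZIL_MAX_WASTE_SPACE = 130696 / 8               =  16337 bytes
	 *   ZIL_MAX_COPIED_DATA = (131072 - 184) / 2 - 192 =  65252 bytes
	 */

In other words, a WR_COPIED record carrying more than roughly 64K falls back to WR_NEED_COPY, and up to roughly 16K of a log block may be deliberately left unused rather than splitting a WR_NEED_COPY chunk across blocks.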
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index b50df27774..9c2beafaa6 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -507,7 +507,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, enum zio_flag flags);
extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+ blkptr_t *old_bp, uint64_t size, boolean_t *slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);
diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c
index 99a3917289..fbac2d99c2 100644
--- a/usr/src/uts/common/fs/zfs/zfs_log.c
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c
@@ -454,20 +454,17 @@ void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
+ uint32_t blocksize = zp->z_blksz;
itx_wr_state_t write_state;
- boolean_t slogging;
uintptr_t fsync_cnt;
- ssize_t immediate_write_sz;
if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
- ? 0 : zfs_immediate_write_sz;
-
- slogging = spa_has_slogs(zilog->zl_spa) &&
- (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
- if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ resid >= zfs_immediate_write_sz)
write_state = WR_INDIRECT;
else if (ioflag & (FSYNC | FDSYNC))
write_state = WR_COPIED;
@@ -481,30 +478,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
while (resid) {
itx_t *itx;
lr_write_t *lr;
- ssize_t len;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = resid;
- /*
- * If the write would overflow the largest block then split it.
- */
- if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
- len = SPA_OLD_MAXBLOCKSIZE >> 1;
- else
- len = resid;
+ if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(off, blocksize), resid);
itx = zil_itx_create(txtype, sizeof (*lr) +
- (write_state == WR_COPIED ? len : 0));
+ (wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
- if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
+ if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
- write_state = WR_NEED_COPY;
+ wr_state = WR_NEED_COPY;
}
- itx->itx_wr_state = write_state;
- if (write_state == WR_NEED_COPY)
- itx->itx_sod += len;
+ itx->itx_wr_state = wr_state;
lr->lr_foid = zp->z_id;
lr->lr_offset = off;
lr->lr_length = len;
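The rewritten preamble collapses the old immediate_write_sz/slogging logic into one up-front write_state decision. A compact restatement, as a sketch rather than the authoritative code (choose_write_state() is hypothetical; the fields and tunables are the ones referenced above):

	static itx_wr_state_t
	choose_write_state(zilog_t *zilog, ssize_t resid, int ioflag)
	{
		/* Throughput bias always writes the data via dmu_sync(). */
		if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
			return (WR_INDIRECT);
		/* Without a SLOG, large writes are cheaper done indirect. */
		if (!spa_has_slogs(zilog->zl_spa) &&
		    resid >= zfs_immediate_write_sz)
			return (WR_INDIRECT);
		/* Synchronous writes copy their data into the itx now. */
		if (ioflag & (FSYNC | FDSYNC))
			return (WR_COPIED);
		/* Everything else defers the copy until commit time. */
		return (WR_NEED_COPY);
	}

Per chunk, the loop still demotes WR_COPIED to WR_NEED_COPY when the chunk exceeds ZIL_MAX_COPIED_DATA or the dmu_read() fails, and WR_INDIRECT chunks are clamped to block boundaries.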
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index c8c8db0885..a998411ef9 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -80,6 +80,13 @@ int zil_replay_disable = 0;
*/
boolean_t zfs_nocacheflush = B_FALSE;
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that will be executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by single active ZIL writer.
+ */
+uint64_t zil_slog_bulk = 768 * 1024;
+
static kmem_cache_t *zil_lwb_cache;
static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
@@ -431,13 +438,14 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
}
static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
{
lwb_t *lwb;
lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
lwb->lwb_zilog = zilog;
lwb->lwb_blk = *bp;
+ lwb->lwb_slog = slog;
lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
lwb->lwb_max_txg = txg;
lwb->lwb_zio = NULL;
@@ -521,6 +529,7 @@ zil_create(zilog_t *zilog)
dmu_tx_t *tx = NULL;
blkptr_t blk;
int error = 0;
+ boolean_t slog = FALSE;
/*
* Wait for any previous destroy to complete.
@@ -549,7 +558,7 @@ zil_create(zilog_t *zilog)
}
error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
- ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+ ZIL_MIN_BLKSZ, &slog);
if (error == 0)
zil_init_log_chain(zilog, &blk);
@@ -559,7 +568,7 @@ zil_create(zilog_t *zilog)
* Allocate a log write buffer (lwb) for the first log block.
*/
if (error == 0)
- lwb = zil_alloc_lwb(zilog, &blk, txg);
+ lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
/*
* If we just allocated the first log block, commit our transaction
@@ -891,6 +900,7 @@ static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
zbookmark_phys_t zb;
+ zio_priority_t prio;
SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -903,9 +913,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
if (lwb->lwb_zio == NULL) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
BP_GET_LSIZE(&lwb->lwb_blk));
+ if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+ prio = ZIO_PRIORITY_SYNC_WRITE;
+ else
+ prio = ZIO_PRIORITY_ASYNC_WRITE;
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
- zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
+ zil_lwb_write_done, lwb, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
}
}
@@ -925,16 +939,6 @@ uint64_t zil_block_buckets[] = {
};
/*
- * Use the slog as long as the logbias is 'latency' and the current commit size
- * is less than the limit or the total list size is less than 2X the limit.
- * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
- */
-uint64_t zil_slog_limit = 1024 * 1024;
-#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
- (((zilog)->zl_cur_used < zil_slog_limit) || \
- ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
-
-/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
@@ -949,6 +953,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
uint64_t txg;
uint64_t zil_blksz, wsz;
int i, error;
+ boolean_t slog;
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
zilc = (zil_chain_t *)lwb->lwb_buf;
@@ -1005,8 +1010,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
- USE_SLOG(zilog));
+ error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -1015,7 +1019,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
/*
* Allocate a new log write buffer (lwb).
*/
- nlwb = zil_alloc_lwb(zilog, bp, txg);
+ nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
/* Record the block for later vdev flushing */
zil_add_block(zilog, &lwb->lwb_blk);
@@ -1052,45 +1056,53 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
- lr_t *lrc = &itx->itx_lr; /* common log record */
- lr_write_t *lrw = (lr_write_t *)lrc;
+ lr_t *lrcb, *lrc;
+ lr_write_t *lrwb, *lrw;
char *lr_buf;
- uint64_t txg = lrc->lrc_txg;
- uint64_t reclen = lrc->lrc_reclen;
- uint64_t dlen = 0;
+ uint64_t dlen, dnow, lwb_sp, reclen, txg;
if (lwb == NULL)
return (NULL);
ASSERT(lwb->lwb_buf != NULL);
- if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+ lrc = &itx->itx_lr; /* Common log record inside itx. */
+ lrw = (lr_write_t *)lrc; /* Write log record inside itx. */
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
-
+ } else {
+ dlen = 0;
+ }
+ reclen = lrc->lrc_reclen;
zilog->zl_cur_used += (reclen + dlen);
+ txg = lrc->lrc_txg;
zil_lwb_write_init(zilog, lwb);
+cont:
/*
* If this record won't fit in the current log block, start a new one.
+ * For WR_NEED_COPY optimize layout for minimal number of chunks.
*/
- if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+ lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
+ lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
lwb = zil_lwb_write_start(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_init(zilog, lwb);
ASSERT(LWB_EMPTY(lwb));
- if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- return (lwb);
- }
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
}
+ dnow = MIN(dlen, lwb_sp - reclen);
lr_buf = lwb->lwb_buf + lwb->lwb_nused;
bcopy(lrc, lr_buf, reclen);
- lrc = (lr_t *)lr_buf;
- lrw = (lr_write_t *)lrc;
+ lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
+ lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
@@ -1102,16 +1114,19 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
char *dbuf;
int error;
- if (dlen) {
- ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+ if (itx->itx_wr_state == WR_NEED_COPY) {
dbuf = lr_buf + reclen;
- lrw->lr_common.lrc_reclen += dlen;
+ lrcb->lrc_reclen += dnow;
+ if (lrwb->lr_length > dnow)
+ lrwb->lr_length = dnow;
+ lrw->lr_offset += dnow;
+ lrw->lr_length -= dnow;
} else {
ASSERT(itx->itx_wr_state == WR_INDIRECT);
dbuf = NULL;
}
error = zilog->zl_get_data(
- itx->itx_private, lrw, dbuf, lwb->lwb_zio);
+ itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
if (error == EIO) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
@@ -1130,12 +1145,18 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
* equal to the itx sequence number because not all transactions
* are synchronous, and sometimes spa_sync() gets there first.
*/
- lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
- lwb->lwb_nused += reclen + dlen;
+ lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+ lwb->lwb_nused += reclen + dnow;
lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+ dlen -= dnow;
+ if (dlen > 0) {
+ zilog->zl_cur_used += reclen;
+ goto cont;
+ }
+
return (lwb);
}
@@ -1149,7 +1170,6 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
itx->itx_lr.lrc_txtype = txtype;
itx->itx_lr.lrc_reclen = lrsize;
- itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
itx->itx_lr.lrc_seq = 0; /* defensive */
itx->itx_sync = B_TRUE; /* default is synchronous */
@@ -1300,11 +1320,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
*/
zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
"txg %llu", itxg->itxg_txg);
- atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
- itxg->itxg_sod = 0;
clean = itxg->itxg_itxs;
}
- ASSERT(itxg->itxg_sod == 0);
itxg->itxg_txg = txg;
itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
@@ -1316,8 +1333,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
}
if (itx->itx_sync) {
list_insert_tail(&itxs->i_sync_list, itx);
- atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
- itxg->itxg_sod += itx->itx_sod;
} else {
avl_tree_t *t = &itxs->i_async_tree;
uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
@@ -1365,8 +1380,6 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
ASSERT3U(itxg->itxg_txg, <=, synced_txg);
ASSERT(itxg->itxg_txg != 0);
ASSERT(zilog->zl_clean_taskq != NULL);
- atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
- itxg->itxg_sod = 0;
clean_me = itxg->itxg_itxs;
itxg->itxg_itxs = NULL;
itxg->itxg_txg = 0;
@@ -1390,7 +1403,6 @@ zil_get_commit_list(zilog_t *zilog)
{
uint64_t otxg, txg;
list_t *commit_list = &zilog->zl_itx_commit_list;
- uint64_t push_sod = 0;
if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
otxg = ZILTEST_TXG;
@@ -1422,12 +1434,9 @@ zil_get_commit_list(zilog_t *zilog)
ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
- push_sod += itxg->itxg_sod;
- itxg->itxg_sod = 0;
mutex_exit(&itxg->itxg_lock);
}
- atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
}
/*
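Two zil.c changes are easy to miss in the churn. First, the WR_NEED_COPY path in zil_lwb_commit() now writes dnow = MIN(dlen, lwb_sp - reclen) bytes per log block and loops via the cont: label, carrying the remainder into fresh blocks instead of forcing a txg_wait_synced(). Second, each lwb write now gets a per-block zio priority; a sketch restating zil_lwb_write_init() (lwb_write_priority() is a hypothetical wrapper, the fields and tunable are the real ones):

	static zio_priority_t
	lwb_write_priority(const lwb_t *lwb, const zilog_t *zilog)
	{
		/*
		 * Main-pool log blocks always use sync priority.  SLOG
		 * blocks do too, until a single commit has pushed more
		 * than zil_slog_bulk bytes; past that they drop to async
		 * priority so one busy ZIL cannot monopolize the SLOG.
		 */
		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
			return (ZIO_PRIORITY_SYNC_WRITE);
		return (ZIO_PRIORITY_ASYNC_WRITE);
	}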
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index da09434078..f26e109267 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -2920,7 +2920,7 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
*/
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t size, boolean_t use_slog)
+ uint64_t size, boolean_t *slog)
{
int error = 1;
zio_alloc_list_t io_alloc_list;
@@ -2928,17 +2928,16 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
-
- if (use_slog) {
- error = metaslab_alloc(spa, spa_log_class(spa), size,
- new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
- &io_alloc_list, NULL);
- }
-
- if (error) {
+ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+ txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
+ if (error == 0) {
+ *slog = TRUE;
+ } else {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
&io_alloc_list, NULL);
+ if (error == 0)
+ *slog = FALSE;
}
metaslab_trace_fini(&io_alloc_list);
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 35ae5a8fd1..11c7dc4b14 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -1064,54 +1064,44 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
{
uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog;
- boolean_t slogging;
- ssize_t immediate_write_sz;
+ itx_wr_state_t write_state;
if (zil_replaying(zilog, tx))
return;
- immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
- ? 0 : zvol_immediate_write_sz;
-
- slogging = spa_has_slogs(zilog->zl_spa) &&
- (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ resid >= blocksize && blocksize > zvol_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (sync)
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
while (resid) {
itx_t *itx;
lr_write_t *lr;
- ssize_t len;
- itx_wr_state_t write_state;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = resid;
- /*
- * Unlike zfs_log_write() we can be called with
- * upto DMU_MAX_ACCESS/2 (5MB) writes.
- */
- if (blocksize > immediate_write_sz && !slogging &&
- resid >= blocksize && off % blocksize == 0) {
- write_state = WR_INDIRECT; /* uses dmu_sync */
- len = blocksize;
- } else if (sync) {
- write_state = WR_COPIED;
- len = MIN(ZIL_MAX_LOG_DATA, resid);
- } else {
- write_state = WR_NEED_COPY;
- len = MIN(ZIL_MAX_LOG_DATA, resid);
- }
+ if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(off, blocksize), resid);
itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
- (write_state == WR_COPIED ? len : 0));
+ (wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
- if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+ if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
- write_state = WR_NEED_COPY;
+ wr_state = WR_NEED_COPY;
}
- itx->itx_wr_state = write_state;
- if (write_state == WR_NEED_COPY)
- itx->itx_sod += len;
+ itx->itx_wr_state = wr_state;
lr->lr_foid = ZVOL_OBJ;
lr->lr_offset = off;
lr->lr_length = len;
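Both zvol_log_write() here and zfs_log_write() above now clamp WR_INDIRECT chunks with len = MIN(blocksize - P2PHASE(off, blocksize), resid), so each chunk ends on a volblocksize boundary. A self-contained worked example (standard C; P2PHASE and MIN are redefined locally to mirror the illumos macros for power-of-two sizes):

	#include <stdio.h>
	#include <stdint.h>

	#define	P2PHASE(x, align)	((x) & ((align) - 1))
	#define	MIN(a, b)		((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		uint64_t blocksize = 8192;	/* e.g. zv_volblocksize */
		uint64_t off = 10240;		/* 2048 bytes into block 1 */
		uint64_t resid = 20000;		/* bytes left to log */
		uint64_t len;

		/*
		 * P2PHASE(10240, 8192) = 10240 & 8191 = 2048, so the first
		 * chunk runs to the block boundary: 8192 - 2048 = 6144
		 * bytes.  The remaining 13856 bytes start block-aligned.
		 */
		len = MIN(blocksize - P2PHASE(off, blocksize), resid);
		(void) printf("first chunk: %llu bytes\n",
		    (unsigned long long)len);
		return (0);
	}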