author    Neil Perrin <Neil.Perrin@Sun.COM>  2009-10-15 11:39:49 -0600
committer Neil Perrin <Neil.Perrin@Sun.COM>  2009-10-15 11:39:49 -0600
commit    975c32a05c38c6fa808592dd35fa6dba183ca077 (patch)
tree      12a92c3406b435c65e688afb076dc7a14c6b84bb /usr/src
parent    c4cbca4f3a766d8c662ce2e0e36a6f1e41ff0a80 (diff)
download  illumos-gate-975c32a05c38c6fa808592dd35fa6dba183ca077.tar.gz
6880764 fsync on zfs is broken if writes are greater than 32kb on a hard crash and no log attached
6793430 zdb -ivvvv assertion failure: bp->blk_cksum.zc_word[2] == dmu_objset_id(zilog->zl_os)
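
The heart of the fix, shown below as a minimal standalone sketch (illustrative
only, not part of the patch; the variable names mirror lr_write_t fields but
the program is self-contained): a dmu_sync()ed TX_WRITE is replayed by
expanding the record to cover its whole block -- see zil_get_replay_data()
in the zil.c hunk -- and the file length is then trimmed back, so a later
TX_WRITE2 only has to extend the eof over data that is already on disk.

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t blksz = 131072;		/* 128K file block */
		uint64_t lr_offset = 98304;		/* sync write of 96K..100K */
		uint64_t lr_length = 4096;
		uint64_t eod = lr_offset + lr_length;	/* end of data: 100K */
		uint64_t orig_eof = 98304;		/* file size before replay: 96K */
		uint64_t new_size;

		/* Expand the record to its whole block, as zil_get_replay_data() does. */
		lr_offset -= lr_offset % blksz;		/* -> 0 */
		lr_length = blksz;			/* -> 128K */

		/* The whole-block write grows the file to 128K; trim it back to the */
		/* end of the original data, as zfs_replay_write() does, so a later  */
		/* TX_WRITE2 only has to extend the eof.                             */
		new_size = lr_offset + lr_length;	/* 128K after the replay write */
		if (orig_eof < new_size)		/* did the file length grow? */
			new_size = eod;

		(void) printf("replay [%llu, %llu), eof %llu\n",
		    (unsigned long long)lr_offset,
		    (unsigned long long)(lr_offset + lr_length),
		    (unsigned long long)new_size);
		return (0);
	}
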
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/cmd/zdb/zdb_il.c               |  18
-rw-r--r--  usr/src/cmd/ztest/ztest.c              |   7
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c        |  25
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h    |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zil.h    |   5
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_replay.c |  57
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c  |  16
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c        |  81
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c       |  22
9 files changed, 177 insertions, 58 deletions
diff --git a/usr/src/cmd/zdb/zdb_il.c b/usr/src/cmd/zdb/zdb_il.c
index cc08ef5148..1b3c18fab1 100644
--- a/usr/src/cmd/zdb/zdb_il.c
+++ b/usr/src/cmd/zdb/zdb_il.c
@@ -115,7 +115,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
(u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
(u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff);
- if (verbose < 5)
+ if (txtype == TX_WRITE2 || verbose < 5)
return;
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@@ -123,18 +123,19 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, "\t\t\t");
+ if (BP_IS_HOLE(bp)) {
+ (void) printf("\t\t\tLSIZE 0x%llx\n",
+ (u_longlong_t)BP_GET_LSIZE(bp));
+ }
if (bp->blk_birth == 0) {
bzero(buf, sizeof (buf));
} else {
zbookmark_t zb;
- ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==,
- dmu_objset_id(zilog->zl_os));
-
- zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ zb.zb_objset = dmu_objset_id(zilog->zl_os);
+ zb.zb_object = lr->lr_foid;
+ zb.zb_level = 0;
+ zb.zb_blkid = -1; /* unknown */
error = zio_wait(zio_read(NULL, zilog->zl_spa,
bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
@@ -251,6 +252,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
{ zil_prt_rec_create, "TX_MKDIR_ACL " },
{ zil_prt_rec_create, "TX_MKDIR_ATTR " },
{ zil_prt_rec_create, "TX_MKDIR_ACL_ATTR " },
+ { zil_prt_rec_write, "TX_WRITE2 " },
};
/* ARGSUSED */
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index fabe9ae345..730ad3eb2e 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -797,6 +797,13 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
NULL, /* TX_TRUNCATE */
NULL, /* TX_SETATTR */
NULL, /* TX_ACL */
+ NULL, /* TX_CREATE_ACL */
+ NULL, /* TX_CREATE_ATTR */
+ NULL, /* TX_CREATE_ACL_ATTR */
+ NULL, /* TX_MKDIR_ACL */
+ NULL, /* TX_MKDIR_ATTR */
+ NULL, /* TX_MKDIR_ACL_ATTR */
+ NULL, /* TX_WRITE2 */
};
/*
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 3f54c64ef7..aff4832767 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -869,15 +869,28 @@ static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
blkptr_t *bp = zio->io_bp;
+ dmu_sync_arg_t *in = varg;
+ dbuf_dirty_record_t *dr = in->dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
if (!BP_IS_HOLE(bp)) {
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
- dmu_buf_impl_t *db = dr->dr_dbuf;
ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
+ dr->dt.dl.dr_overridden_by = *zio->io_bp;
+ } else {
+ dr->dt.dl.dr_overridden_by = *zio->io_bp;
+ /*
+ * dmu_sync() can compress a block of zeros to a null blkptr
+ * but the block size still needs to be passed through to replay
+ */
+ BP_SET_LSIZE(bp, db->db.db_size);
}
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ mutex_exit(&db->db_mtx);
}
/* ARGSUSED */
@@ -889,13 +902,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
dmu_buf_impl_t *db = dr->dr_dbuf;
dmu_sync_cb_t *done = in->done;
- mutex_enter(&db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
- dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
- dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
-
if (done)
done(&(db->db), in->arg);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 7a1175a43c..0f44371fc3 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -208,8 +208,8 @@ typedef struct blkptr {
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \
- (BP_IS_HOLE(bp) ? 0 : \
- BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
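
For context on why the BP_IS_HOLE() special case can be dropped above:
BP_GET_LSIZE()/BP_SET_LSIZE() keep the logical size in a 16-bit, size-biased
field of blk_prop, so even a hole blkptr can carry the block size that
dmu_sync_ready() now stamps on it. A rough standalone sketch of that encoding,
assuming SPA_MINBLOCKSHIFT is 9 (512-byte units, biased by one); the helpers
below are stand-ins, not the real BF64_*_SB macros:

	#include <stdint.h>
	#include <stdio.h>

	#define MINBLOCKSHIFT	9	/* stand-in for SPA_MINBLOCKSHIFT */

	/* Store the logical size in bits 0..15, in 512-byte units, biased by one. */
	static void
	set_lsize(uint64_t *blk_prop, uint64_t lsize)
	{
		*blk_prop = (*blk_prop & ~0xffffULL) |
		    (((lsize >> MINBLOCKSHIFT) - 1) & 0xffff);
	}

	static uint64_t
	get_lsize(uint64_t blk_prop)
	{
		return (((blk_prop & 0xffff) + 1) << MINBLOCKSHIFT);
	}

	int
	main(void)
	{
		uint64_t blk_prop = 0;		/* a hole has no DVAs, but blk_prop */
						/* can still carry the block size   */

		set_lsize(&blk_prop, 131072);	/* 128K block */
		(void) printf("lsize = %llu\n",	/* prints 131072 */
		    (unsigned long long)get_lsize(blk_prop));
		return (0);
	}
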
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index acec5da2d9..edc049ef1c 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -139,7 +139,8 @@ typedef enum zil_create {
#define TX_MKDIR_ACL 17 /* mkdir with ACL */
#define TX_MKDIR_ATTR 18 /* mkdir with attr */
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
-#define TX_MAX_TYPE 20 /* Max transaction type */
+#define TX_WRITE2 20 /* dmu_sync EALREADY write */
+#define TX_MAX_TYPE 21 /* Max transaction type */
/*
* The transactions for mkdir, symlink, remove, rmdir, link, and rename
@@ -396,8 +397,8 @@ extern int zil_suspend(zilog_t *zilog);
extern void zil_resume(zilog_t *zilog);
extern void zil_add_block(zilog_t *zilog, blkptr_t *bp);
-
extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
+extern void zil_get_replay_data(zilog_t *zilog, lr_write_t *lr);
extern int zil_disable;
diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c
index 232422b0f8..0db4ba08f7 100644
--- a/usr/src/uts/common/fs/zfs/zfs_replay.c
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c
@@ -625,6 +625,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
znode_t *zp;
int error;
ssize_t resid;
+ uint64_t orig_eof, eod;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -639,10 +640,65 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
error = 0;
return (error);
}
+ orig_eof = zp->z_phys->zp_size;
+ eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
+
+ /* If it's a dmu_sync() block, get the data and write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
+ zil_get_replay_data(zfsvfs->z_log, lr);
error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+ /*
+ * This may be a write from a dmu_sync() for a whole block,
+ * and may extend beyond the current end of the file.
+ * We can't replay just what was logged for this TX_WRITE, because
+ * a future TX_WRITE2 may extend the eof and the data for that
+ * range must already be on disk. So we write the whole block and
+ * then reduce the eof.
+ */
+ if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
+ zp->z_phys->zp_size = eod;
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+}
+
+/*
+ * TX_WRITE2 records are only generated when dmu_sync() returns EALREADY,
+ * meaning the block is already being synced by the pool. Now that we
+ * always write out full blocks on replay, all we have to do here is
+ * expand the eof if the write grew the file.
+ */
+static int
+zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+ znode_t *zp;
+ int error;
+ uint64_t end;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log writes out of order, it's possible the
+ * file has been removed. In this case just drop the write
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ end = lr->lr_offset + lr->lr_length;
+ if (end > zp->z_phys->zp_size) {
+ ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
+ zp->z_phys->zp_size = end;
+ }
+
VN_RELE(ZTOV(zp));
return (error);
@@ -875,4 +931,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_create_acl, /* TX_MKDIR_ACL */
zfs_replay_create, /* TX_MKDIR_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
+ zfs_replay_write2, /* TX_WRITE2 */
};
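
The zdb, ztest, zfs_replay, and zvol tables touched in this patch are all
indexed directly by lrc_txtype and sized by TX_MAX_TYPE, which is why bumping
TX_MAX_TYPE to 21 for TX_WRITE2 forces padding entries even in consumers that
never replay the new record. A hedged sketch of that dispatch pattern
(illustrative names, not the real vectors):

	#include <stddef.h>
	#include <stdio.h>

	#define TX_WRITE2	20	/* new log record type */
	#define TX_MAX_TYPE	21	/* table size must track the highest type */

	typedef int replay_func_t(void *arg);

	static int
	replay_write2(void *arg)
	{
		(void) arg;
		(void) printf("TX_WRITE2 replayed\n");
		return (0);
	}

	/*
	 * Slots for record types a consumer doesn't handle stay NULL (or point
	 * at an error stub, as zvol_replay_err does in the zvol vector further
	 * down), but the table must still have TX_MAX_TYPE entries so indexing
	 * by lrc_txtype stays in bounds.
	 */
	static replay_func_t *replay_vector[TX_MAX_TYPE] = {
		[TX_WRITE2] = replay_write2,
	};

	int
	main(void)
	{
		int txtype = TX_WRITE2;
		replay_func_t *fn = replay_vector[txtype];

		return (fn != NULL ? fn(NULL) : -1);	/* dispatch by record type */
	}
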
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 6513640437..08bd0d378b 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -961,16 +961,28 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
lr->lr_common.lrc_txg, zfs_get_done, zgd);
ASSERT((error && error != EINPROGRESS) ||
lr->lr_length <= zp->z_blksz);
- if (error == 0)
+ if (error == 0) {
+ /*
+ * dmu_sync() can compress a block of zeros to a null
+ * blkptr but the block size still needs to be passed
+ * through to replay.
+ */
+ BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
+ }
+
/*
* If we get EINPROGRESS, then we need to wait for a
* write IO initiated by dmu_sync() to complete before
* we can release this dbuf. We will finish everything
* up in the zfs_get_done() callback.
*/
- if (error == EINPROGRESS)
+ if (error == EINPROGRESS) {
return (0);
+ } else if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ error = 0;
+ }
dmu_buf_rele(db, zgd);
kmem_free(zgd, sizeof (zgd_t));
}
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 625ec719fb..ff13d9ab6b 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -1454,6 +1454,53 @@ zil_resume(zilog_t *zilog)
mutex_exit(&zilog->zl_lock);
}
+/*
+ * Read in the data for the dmu_sync()ed block, and change the log
+ * record to write this whole block.
+ */
+void
+zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
+{
+ blkptr_t *wbp = &lr->lr_blkptr;
+ char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */
+ uint64_t blksz;
+
+ if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
+ blksz = BP_GET_LSIZE(&lr->lr_blkptr);
+ /*
+ * If the blksz is zero then we must be replaying a log
+ * from a version prior to setting the blksize of null blocks.
+ * So we just zero the actual write size requested.
+ */
+ if (blksz == 0) {
+ bzero(wbuf, lr->lr_length);
+ return;
+ }
+ bzero(wbuf, blksz);
+ } else {
+ /*
+ * A subsequent write may have overwritten this block, in which
+ * case wbp may have been freed and reallocated, and our
+ * read of wbp may fail with a checksum error. We can safely
+ * ignore this because the later write will provide the
+ * correct data.
+ */
+ zbookmark_t zb;
+
+ zb.zb_objset = dmu_objset_id(zilog->zl_os);
+ zb.zb_object = lr->lr_foid;
+ zb.zb_level = 0;
+ zb.zb_blkid = -1; /* unknown */
+
+ blksz = BP_GET_LSIZE(&lr->lr_blkptr);
+ (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz,
+ NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
+ }
+ lr->lr_offset -= lr->lr_offset % blksz;
+ lr->lr_length = blksz;
+}
+
typedef struct zil_replay_arg {
objset_t *zr_os;
zil_replay_func_t **zr_replay;
@@ -1505,40 +1552,6 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
byteswap_uint64_array(zr->zr_lrbuf, reclen);
/*
- * If this is a TX_WRITE with a blkptr, suck in the data.
- */
- if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
- lr_write_t *lrw = (lr_write_t *)lr;
- blkptr_t *wbp = &lrw->lr_blkptr;
- uint64_t wlen = lrw->lr_length;
- char *wbuf = zr->zr_lrbuf + reclen;
-
- if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
- bzero(wbuf, wlen);
- } else {
- /*
- * A subsequent write may have overwritten this block,
- * in which case wbp may have been been freed and
- * reallocated, and our read of wbp may fail with a
- * checksum error. We can safely ignore this because
- * the later write will provide the correct data.
- */
- zbookmark_t zb;
-
- zb.zb_objset = dmu_objset_id(zilog->zl_os);
- zb.zb_object = lrw->lr_foid;
- zb.zb_level = -1;
- zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
-
- (void) zio_wait(zio_read(NULL, zilog->zl_spa,
- wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
- ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
- (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
- }
- }
-
- /*
* We must now do two things atomically: replay this log record,
* and update the log header sequence number to reflect the fact that
* we did so. At the end of each replay function the sequence number
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 68b6b05117..5ca01514c9 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -369,6 +369,10 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
+ /* If it's a dmu_sync() block, get the data and write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
+ zil_get_replay_data(dmu_objset_zil(os), lr);
+
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
error = dmu_tx_assign(tx, TXG_WAIT);
@@ -407,6 +411,13 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* TX_TRUNCATE */
zvol_replay_err, /* TX_SETATTR */
zvol_replay_err, /* TX_ACL */
+ zvol_replay_err, /* TX_CREATE_ACL */
+ zvol_replay_err, /* TX_CREATE_ATTR */
+ zvol_replay_err, /* TX_CREATE_ACL_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL */
+ zvol_replay_err, /* TX_MKDIR_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
+ zvol_replay_err, /* TX_WRITE2 */
};
int
@@ -926,10 +937,19 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
zgd->zgd_rl = rl;
VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+
error = dmu_sync(zio, db, &lr->lr_blkptr,
lr->lr_common.lrc_txg, zvol_get_done, zgd);
- if (error == 0)
+ if (error == 0) {
+ /*
+ * dmu_sync() can compress a block of zeros to a null blkptr
+ * but the block size still needs to be passed through to
+ * replay.
+ */
+ BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
zil_add_block(zv->zv_zilog, &lr->lr_blkptr);
+ }
+
/*
* If we get EINPROGRESS, then we need to wait for a
* write IO initiated by dmu_sync() to complete before