summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs/zfs/zfs_replay.c
diff options
context:
space:
mode:
authorNeil Perrin <Neil.Perrin@Sun.COM>2009-10-15 11:39:49 -0600
committerNeil Perrin <Neil.Perrin@Sun.COM>2009-10-15 11:39:49 -0600
commit975c32a05c38c6fa808592dd35fa6dba183ca077 (patch)
tree12a92c3406b435c65e688afb076dc7a14c6b84bb /usr/src/uts/common/fs/zfs/zfs_replay.c
parentc4cbca4f3a766d8c662ce2e0e36a6f1e41ff0a80 (diff)
downloadillumos-gate-975c32a05c38c6fa808592dd35fa6dba183ca077.tar.gz
6880764 fsync on zfs is broken if writes are greater than 32kb on a hard crash and no log attached
6793430 zdb -ivvvv assertion failure: bp->blk_cksum.zc_word[2] == dmu_objset_id(zilog->zl_os)
Diffstat (limited to 'usr/src/uts/common/fs/zfs/zfs_replay.c')
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_replay.c57
1 files changed, 57 insertions, 0 deletions
diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c
index 232422b0f8..0db4ba08f7 100644
--- a/usr/src/uts/common/fs/zfs/zfs_replay.c
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c
@@ -625,6 +625,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
znode_t *zp;
int error;
ssize_t resid;
+ uint64_t orig_eof, eod;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -639,10 +640,65 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
error = 0;
return (error);
}
+ orig_eof = zp->z_phys->zp_size;
+ eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
+
+ /* If it's a dmu_sync() block get the data and write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
+ zil_get_replay_data(zfsvfs->z_log, lr);
error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+ /*
+ * This may be a write from a dmu_sync() for a whole block,
+ * and may extend beyond the current end of the file.
+ * We can't just replay what was written for this TX_WRITE as
+ * a future TX_WRITE2 may extend the eof and the data for that
+ * write needs to be there. So we write the whole block and
+ * reduce the eof.
+ */
+ if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
+ zp->z_phys->zp_size = eod;
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+}
+
+/*
+ * TX_WRITE2 are only generated when dmu_sync() returns EALREADY
+ * meaning the pool block is already being synced. So now that we always write
+ * out full blocks, all we have to do is expand the eof if
+ * the file is grown.
+ */
+static int
+zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+ znode_t *zp;
+ int error;
+ uint64_t end;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log writes out of order, it's possible the
+ * file has been removed. In this case just drop the write
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ end = lr->lr_offset + lr->lr_length;
+ if (end > zp->z_phys->zp_size) {
+ ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
+ zp->z_phys->zp_size = end;
+ }
+
VN_RELE(ZTOV(zp));
return (error);
@@ -875,4 +931,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_create_acl, /* TX_MKDIR_ACL */
zfs_replay_create, /* TX_MKDIR_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
+ zfs_replay_write2, /* TX_WRITE2 */
};