6880764 fsync on zfs is broken if writes are greater than 32kb on a hard crash and no log attached

6793430 zdb -ivvvv assertion failure: bp->blk_cksum.zc_word[2] == dmu_objset_id(zilog->zl_os)
author: Neil Perrin <Neil.Perrin@Sun.COM> 2009-10-15 11:39:49 -0600
committer: Neil Perrin <Neil.Perrin@Sun.COM> 2009-10-15 11:39:49 -0600
commit: 975c32a05c38c6fa808592dd35fa6dba183ca077 (patch)
tree: 12a92c3406b435c65e688afb076dc7a14c6b84bb /usr/src/uts/common/fs/zfs/zfs_replay.c
parent: c4cbca4f3a766d8c662ce2e0e36a6f1e41ff0a80 (diff)
download: illumos-gate-975c32a05c38c6fa808592dd35fa6dba183ca077.tar.gz
1 files changed, 57 insertions, 0 deletions
diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c
index 232422b0f8..0db4ba08f7 100644
--- a/usr/src/uts/common/fs/zfs/zfs_replay.c
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c
@@ -625,6 +625,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
 	znode_t	*zp;
 	int error;
 	ssize_t resid;
+	uint64_t orig_eof, eod;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
@@ -639,10 +640,65 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
 			error = 0;
 		return (error);
 	}
+	orig_eof = zp->z_phys->zp_size;
+	eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
+
+	/* If it's a dmu_sync() block get the data and write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
+		zil_get_replay_data(zfsvfs->z_log, lr);
 
 	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
 	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
+	/*
+	 * This may be a write from a dmu_sync() for a whole block,
+	 * and may extend beyond the current end of the file.
+	 * We can't just replay what was written for this TX_WRITE as
+	 * a future TX_WRITE2 may extend the eof and the data for that
+	 * write needs to be there. So we write the whole block and
+	 * then reduce the eof to the end of data for this write.
+	 */
+	if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
+		zp->z_phys->zp_size = eod;
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * TX_WRITE2 records are only generated when dmu_sync() returns EALREADY,
+ * meaning the pool block is already being synced. Since we now always
+ * write out full blocks, all we have to do here is expand the eof if
+ * the file has grown.
+ */
+static int
+zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+	znode_t	*zp;
+	int error;
+	uint64_t end;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+		/*
+		 * As we can log writes out of order, it's possible the
+		 * file has been removed. In this case just drop the write
+		 * and return success.
+		 */
+		if (error == ENOENT)
+			error = 0;
+		return (error);
+	}
+
+	end = lr->lr_offset + lr->lr_length;
+	if (end > zp->z_phys->zp_size) {
+		ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
+		zp->z_phys->zp_size = end;
+	}
+
 	VN_RELE(ZTOV(zp));
 
 	return (error);
@@ -875,4 +931,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
 	zfs_replay_create_acl,	/* TX_MKDIR_ACL */
 	zfs_replay_create,	/* TX_MKDIR_ATTR */
 	zfs_replay_create_acl,	/* TX_MKDIR_ACL_ATTR */
+	zfs_replay_write2,	/* TX_WRITE2 */
 };
author	Neil Perrin <Neil.Perrin@Sun.COM>	2009-10-15 11:39:49 -0600
committer	Neil Perrin <Neil.Perrin@Sun.COM>	2009-10-15 11:39:49 -0600
commit	975c32a05c38c6fa808592dd35fa6dba183ca077 (patch)
tree	12a92c3406b435c65e688afb076dc7a14c6b84bb /usr/src/uts/common/fs/zfs/zfs_replay.c
parent	c4cbca4f3a766d8c662ce2e0e36a6f1e41ff0a80 (diff)
download	illumos-gate-975c32a05c38c6fa808592dd35fa6dba183ca077.tar.gz