diff options
author | Neil Perrin <Neil.Perrin@Sun.COM> | 2009-10-15 11:39:49 -0600 |
---|---|---|
committer | Neil Perrin <Neil.Perrin@Sun.COM> | 2009-10-15 11:39:49 -0600 |
commit | 975c32a05c38c6fa808592dd35fa6dba183ca077 (patch) | |
tree | 12a92c3406b435c65e688afb076dc7a14c6b84bb /usr/src/uts/common/fs/zfs/zfs_replay.c | |
parent | c4cbca4f3a766d8c662ce2e0e36a6f1e41ff0a80 (diff) | |
download | illumos-gate-975c32a05c38c6fa808592dd35fa6dba183ca077.tar.gz |
6880764 fsync on zfs is broken if writes are greater than 32kb on a hard crash and no log attached
6793430 zdb -ivvvv assertion failure: bp->blk_cksum.zc_word[2] == dmu_objset_id(zilog->zl_os)
Diffstat (limited to 'usr/src/uts/common/fs/zfs/zfs_replay.c')
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_replay.c | 57 |
1 files changed, 57 insertions, 0 deletions
diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c index 232422b0f8..0db4ba08f7 100644 --- a/usr/src/uts/common/fs/zfs/zfs_replay.c +++ b/usr/src/uts/common/fs/zfs/zfs_replay.c @@ -625,6 +625,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) znode_t *zp; int error; ssize_t resid; + uint64_t orig_eof, eod; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -639,10 +640,65 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) error = 0; return (error); } + orig_eof = zp->z_phys->zp_size; + eod = lr->lr_offset + lr->lr_length; /* end of data for this write */ + + /* If it's a dmu_sync() block get the data and write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) + zil_get_replay_data(zfsvfs->z_log, lr); error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. + */ + if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */ + zp->z_phys->zp_size = eod; + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * TX_WRITE2 are only generated when dmu_sync() returns EALREADY + * meaning the pool block is already being synced. So now that we always write + * out full blocks, all we have to do is expand the eof if + * the file is grown. + */ +static int +zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + znode_t *zp; + int error; + uint64_t end; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log writes out of order, it's possible the + * file has been removed. In this case just drop the write + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + end = lr->lr_offset + lr->lr_length; + if (end > zp->z_phys->zp_size) { + ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz); + zp->z_phys->zp_size = end; + } + VN_RELE(ZTOV(zp)); return (error); @@ -875,4 +931,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create_acl, /* TX_MKDIR_ACL */ zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ + zfs_replay_write2, /* TX_WRITE2 */ }; |