diff options
author | Neil Perrin <Neil.Perrin@Sun.COM> | 2009-10-15 11:39:49 -0600 |
---|---|---|
committer | Neil Perrin <Neil.Perrin@Sun.COM> | 2009-10-15 11:39:49 -0600 |
commit | 975c32a05c38c6fa808592dd35fa6dba183ca077 (patch) | |
tree | 12a92c3406b435c65e688afb076dc7a14c6b84bb /usr/src | |
parent | c4cbca4f3a766d8c662ce2e0e36a6f1e41ff0a80 (diff) | |
download | illumos-gate-975c32a05c38c6fa808592dd35fa6dba183ca077.tar.gz |
6880764 fsync on zfs is broken if writes are greater than 32kb on a hard crash and no log attached
6793430 zdb -ivvvv assertion failure: bp->blk_cksum.zc_word[2] == dmu_objset_id(zilog->zl_os)
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/zdb/zdb_il.c | 18 | ||||
-rw-r--r-- | usr/src/cmd/ztest/ztest.c | 7 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/dmu.c | 25 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h | 4 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zil.h | 5 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_replay.c | 57 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_vnops.c | 16 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zil.c | 81 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zvol.c | 22 |
9 files changed, 177 insertions, 58 deletions
diff --git a/usr/src/cmd/zdb/zdb_il.c b/usr/src/cmd/zdb/zdb_il.c index cc08ef5148..1b3c18fab1 100644 --- a/usr/src/cmd/zdb/zdb_il.c +++ b/usr/src/cmd/zdb/zdb_il.c @@ -115,7 +115,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff); - if (verbose < 5) + if (txtype == TX_WRITE2 || verbose < 5) return; if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { @@ -123,18 +123,19 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) bp->blk_birth >= spa_first_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, "\t\t\t"); + if (BP_IS_HOLE(bp)) { + (void) printf("\t\t\tLSIZE 0x%llx\n", + (u_longlong_t)BP_GET_LSIZE(bp)); + } if (bp->blk_birth == 0) { bzero(buf, sizeof (buf)); } else { zbookmark_t zb; - ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==, - dmu_objset_id(zilog->zl_os)); - - zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + zb.zb_objset = dmu_objset_id(zilog->zl_os); + zb.zb_object = lr->lr_foid; + zb.zb_level = 0; + zb.zb_blkid = -1; /* unknown */ error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, BP_GET_LSIZE(bp), NULL, NULL, @@ -251,6 +252,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { { zil_prt_rec_create, "TX_MKDIR_ACL " }, { zil_prt_rec_create, "TX_MKDIR_ATTR " }, { zil_prt_rec_create, "TX_MKDIR_ACL_ATTR " }, + { zil_prt_rec_write, "TX_WRITE2 " }, }; /* ARGSUSED */ diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index fabe9ae345..730ad3eb2e 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -797,6 +797,13 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { NULL, /* TX_TRUNCATE */ NULL, /* TX_SETATTR */ NULL, /* TX_ACL */ + NULL, /* TX_CREATE_ACL */ + NULL, /* TX_CREATE_ATTR */ + NULL, /* TX_CREATE_ACL_ATTR */ + NULL, /* TX_MKDIR_ACL */ + NULL, /* 
TX_MKDIR_ATTR */ + NULL, /* TX_MKDIR_ACL_ATTR */ + NULL, /* TX_WRITE2 */ }; /* diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 3f54c64ef7..aff4832767 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -869,15 +869,28 @@ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { blkptr_t *bp = zio->io_bp; + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + + mutex_enter(&db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (!BP_IS_HOLE(bp)) { - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; - dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type); ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; + dr->dt.dl.dr_overridden_by = *zio->io_bp; + } else { + dr->dt.dl.dr_overridden_by = *zio->io_bp; + /* + * dmu_sync() can compress a block of zeros to a null blkptr + * but the block size still needs to be passed through to replay + */ + BP_SET_LSIZE(bp, db->db.db_size); } + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + mutex_exit(&db->db_mtx); } /* ARGSUSED */ @@ -889,13 +902,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) dmu_buf_impl_t *db = dr->dr_dbuf; dmu_sync_cb_t *done = in->done; - mutex_enter(&db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); - dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ - dr->dt.dl.dr_override_state = DR_OVERRIDDEN; cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - if (done) done(&(db->db), in->arg); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 7a1175a43c..0f44371fc3 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -208,8 +208,8 @@ typedef struct blkptr { #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ - (BP_IS_HOLE(bp) ? 
0 : \ - BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) + #define BP_SET_LSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h index acec5da2d9..edc049ef1c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zil.h +++ b/usr/src/uts/common/fs/zfs/sys/zil.h @@ -139,7 +139,8 @@ typedef enum zil_create { #define TX_MKDIR_ACL 17 /* mkdir with ACL */ #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ -#define TX_MAX_TYPE 20 /* Max transaction type */ +#define TX_WRITE2 20 /* dmu_sync EALREADY write */ +#define TX_MAX_TYPE 21 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -396,8 +397,8 @@ extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); extern void zil_add_block(zilog_t *zilog, blkptr_t *bp); - extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); +extern void zil_get_replay_data(zilog_t *zilog, lr_write_t *lr); extern int zil_disable; diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c index 232422b0f8..0db4ba08f7 100644 --- a/usr/src/uts/common/fs/zfs/zfs_replay.c +++ b/usr/src/uts/common/fs/zfs/zfs_replay.c @@ -625,6 +625,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) znode_t *zp; int error; ssize_t resid; + uint64_t orig_eof, eod; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -639,10 +640,65 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) error = 0; return (error); } + orig_eof = zp->z_phys->zp_size; + eod = lr->lr_offset + lr->lr_length; /* end of data for this write */ + + /* If it's a dmu_sync() block get the data and write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) + zil_get_replay_data(zfsvfs->z_log, lr); error = 
vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. + */ + if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */ + zp->z_phys->zp_size = eod; + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * TX_WRITE2 are only generated when dmu_sync() returns EALREADY + * meaning the pool block is already being synced. So now that we always write + * out full blocks, all we have to do is expand the eof if + * the file is grown. + */ +static int +zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + znode_t *zp; + int error; + uint64_t end; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log writes out of order, it's possible the + * file has been removed. In this case just drop the write + * and return success. 
+ */ + if (error == ENOENT) + error = 0; + return (error); + } + + end = lr->lr_offset + lr->lr_length; + if (end > zp->z_phys->zp_size) { + ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz); + zp->z_phys->zp_size = end; + } + VN_RELE(ZTOV(zp)); return (error); @@ -875,4 +931,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create_acl, /* TX_MKDIR_ACL */ zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ + zfs_replay_write2, /* TX_WRITE2 */ }; diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 6513640437..08bd0d378b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -961,16 +961,28 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT((error && error != EINPROGRESS) || lr->lr_length <= zp->z_blksz); - if (error == 0) + if (error == 0) { + /* + * dmu_sync() can compress a block of zeros to a null + * blkptr but the block size still needs to be passed + * through to replay. + */ + BP_SET_LSIZE(&lr->lr_blkptr, db->db_size); zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); + } + /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before * we can release this dbuf. We will finish everything * up in the zfs_get_done() callback. 
*/ - if (error == EINPROGRESS) + if (error == EINPROGRESS) { return (0); + } else if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + error = 0; + } dmu_buf_rele(db, zgd); kmem_free(zgd, sizeof (zgd_t)); } diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 625ec719fb..ff13d9ab6b 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -1454,6 +1454,53 @@ zil_resume(zilog_t *zilog) mutex_exit(&zilog->zl_lock); } +/* + * Read in the data for the dmu_sync()ed block, and change the log + * record to write this whole block. + */ +void +zil_get_replay_data(zilog_t *zilog, lr_write_t *lr) +{ + blkptr_t *wbp = &lr->lr_blkptr; + char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */ + uint64_t blksz; + + if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ + blksz = BP_GET_LSIZE(&lr->lr_blkptr); + /* + * If the blksz is zero then we must be replaying a log + * from a version prior to setting the blksize of null blocks. + * So we just zero the actual write size requested. + */ + if (blksz == 0) { + bzero(wbuf, lr->lr_length); + return; + } + bzero(wbuf, blksz); + } else { + /* + * A subsequent write may have overwritten this block, in which + * case wbp may have been freed and reallocated, and our + * read of wbp may fail with a checksum error. We can safely + * ignore this because the later write will provide the + * correct data. 
+ */ + zbookmark_t zb; + + zb.zb_objset = dmu_objset_id(zilog->zl_os); + zb.zb_object = lr->lr_foid; + zb.zb_level = 0; + zb.zb_blkid = -1; /* unknown */ + + blksz = BP_GET_LSIZE(&lr->lr_blkptr); + (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz, + NULL, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); + } + lr->lr_offset -= lr->lr_offset % blksz; + lr->lr_length = blksz; +} + typedef struct zil_replay_arg { objset_t *zr_os; zil_replay_func_t **zr_replay; @@ -1505,40 +1552,6 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) byteswap_uint64_array(zr->zr_lrbuf, reclen); /* - * If this is a TX_WRITE with a blkptr, suck in the data. - */ - if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { - lr_write_t *lrw = (lr_write_t *)lr; - blkptr_t *wbp = &lrw->lr_blkptr; - uint64_t wlen = lrw->lr_length; - char *wbuf = zr->zr_lrbuf + reclen; - - if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ - bzero(wbuf, wlen); - } else { - /* - * A subsequent write may have overwritten this block, - * in which case wbp may have been been freed and - * reallocated, and our read of wbp may fail with a - * checksum error. We can safely ignore this because - * the later write will provide the correct data. - */ - zbookmark_t zb; - - zb.zb_objset = dmu_objset_id(zilog->zl_os); - zb.zb_object = lrw->lr_foid; - zb.zb_level = -1; - zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); - - (void) zio_wait(zio_read(NULL, zilog->zl_spa, - wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); - (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); - } - } - - /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. 
At the end of each replay function the sequence number diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 68b6b05117..5ca01514c9 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -369,6 +369,10 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + /* If it's a dmu_sync() block get the data and write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) + zil_get_replay_data(dmu_objset_zil(os), lr); + tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); error = dmu_tx_assign(tx, TXG_WAIT); @@ -407,6 +411,13 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ + zvol_replay_err, /* TX_CREATE_ATTR */ + zvol_replay_err, /* TX_CREATE_ACL_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL */ + zvol_replay_err, /* TX_MKDIR_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ + zvol_replay_err, /* TX_WRITE2 */ }; int @@ -926,10 +937,19 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd->zgd_rl = rl; VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); + error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zvol_get_done, zgd); - if (error == 0) + if (error == 0) { + /* + * dmu_sync() can compress a block of zeros to a null blkptr + * but the block size still needs to be passed through to + * replay. + */ + BP_SET_LSIZE(&lr->lr_blkptr, db->db_size); zil_add_block(zv->zv_zilog, &lr->lr_blkptr); + } + /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before |