summary refs log tree commit diff
path: root/usr/src
diff options
context:
space:
mode:
author Neil Perrin <Neil.Perrin@Sun.COM> 2008-09-18 17:18:10 -0600
committer Neil Perrin <Neil.Perrin@Sun.COM> 2008-09-18 17:18:10 -0600
commit a6e57bd4c7a2bf9cc33be939d674d4c7d3e67cce (patch)
tree ad4ace2ad4611c1dba1a700fb23f196912165c5f /usr/src
parent ef18c5ec8528dd90b6150e9cd33c26cf8894be02 (diff)
download illumos-gate-a6e57bd4c7a2bf9cc33be939d674d4c7d3e67cce.tar.gz
6741237 zfs hang in txg_wait_open() on boot
Diffstat (limited to 'usr/src')
-rw-r--r-- usr/src/cmd/ztest/ztest.c 6
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zil.h 6
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_dir.c 20
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_vfsops.c 35
-rw-r--r-- usr/src/uts/common/fs/zfs/zil.c 7
-rw-r--r-- usr/src/uts/common/fs/zfs/zvol.c 2
6 files changed, 35 insertions, 41 deletions
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index db958c5c60..1ca474ffbd 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* The objective of this program is to provide a DMU/ZAP/SPA stress test
* that runs entirely in userland, is easy to use, and easy to extend.
@@ -1221,7 +1219,7 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za)
if (ztest_random(2) == 0 &&
dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) {
zr.zr_os = os;
- zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector);
+ zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL);
dmu_objset_close(os);
}
@@ -3247,7 +3245,7 @@ ztest_run(char *pool)
ztest_dmu_check_future_leak(&za[t]);
zr.zr_os = za[d].za_os;
zil_replay(zr.zr_os, &zr, &zr.zr_assign,
- ztest_replay_vector);
+ ztest_replay_vector, NULL);
za[d].za_zilog = zil_open(za[d].za_os, NULL);
}
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index aaba38536c..4d02d14f70 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -26,8 +26,6 @@
#ifndef _SYS_ZIL_H
#define _SYS_ZIL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/spa.h>
#include <sys/zio.h>
@@ -337,6 +335,7 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
uint64_t txg);
typedef int zil_replay_func_t();
+typedef void zil_replay_cleaner_t();
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
@@ -352,7 +351,8 @@ extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
extern void zil_close(zilog_t *zilog);
extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
- zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+ zil_replay_func_t *replay_func[TX_MAX_TYPE],
+ zil_replay_cleaner_t *replay_cleaner);
extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
index f1bebd467b..1ec4932646 100644
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -564,6 +562,24 @@ zfs_rmnode(znode_t *zp)
ASSERT(zp->z_phys->zp_links == 0);
/*
+ * If this is a ZIL replay then leave the object in the unlinked set.
+ * Otherwise we can get a deadlock, because the delete can be
+ * quite large and span multiple tx's and txgs, but each replay
+ * creates a tx to atomically run the replay function and mark the
+ * replay record as complete. We deadlock trying to start a tx in
+ * a new txg to further the deletion but can't because the replay
+ * tx hasn't finished.
+ *
+ * We actually delete the object if we get a failure to create an
+ * object in zil_replay_log_record(), or after calling zil_replay().
+ */
+ if (zfsvfs->z_assign >= TXG_INITIAL) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+
+ /*
* If this is an attribute directory, purge its contents.
*/
if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index ed23be8dd7..3ee726bb29 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
@@ -559,7 +557,6 @@ unregister:
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
- uint_t readonly;
int error;
error = zfs_register_callbacks(zfsvfs->z_vfs);
@@ -579,44 +576,22 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
* operations out since we closed the ZIL.
*/
if (mounting) {
+ boolean_t readonly;
+
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
- if (readonly != 0)
- zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
- else
- zfs_unlinked_drain(zfsvfs);
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
/*
* Parse and replay the intent log.
- *
- * Because of ziltest, this must be done after
- * zfs_unlinked_drain(). (Further note: ziltest doesn't
- * use readonly mounts, where zfs_unlinked_drain() isn't
- * called.) This is because ziltest causes spa_sync()
- * to think it's committed, but actually it is not, so
- * the intent log contains many txg's worth of changes.
- *
- * In particular, if object N is in the unlinked set in
- * the last txg to actually sync, then it could be
- * actually freed in a later txg and then reallocated in
- * a yet later txg. This would write a "create object
- * N" record to the intent log. Normally, this would be
- * fine because the spa_sync() would have written out
- * the fact that object N is free, before we could write
- * the "create object N" intent log record.
- *
- * But when we are in ziltest mode, we advance the "open
- * txg" without actually spa_sync()-ing the changes to
- * disk. So we would see that object N is still
- * allocated and in the unlinked set, and there is an
- * intent log record saying to allocate it.
*/
zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
- zfs_replay_vector);
+ zfs_replay_vector, zfs_unlinked_drain);
+ zfs_unlinked_drain(zfsvfs);
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
}
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index ebafdf0a7b..11416e7ec5 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -1453,6 +1453,7 @@ zil_resume(zilog_t *zilog)
typedef struct zil_replay_arg {
objset_t *zr_os;
zil_replay_func_t **zr_replay;
+ zil_replay_cleaner_t *zr_replay_cleaner;
void *zr_arg;
uint64_t *zr_txgp;
boolean_t zr_byteswap;
@@ -1583,6 +1584,8 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
* transaction.
*/
if (error != ERESTART && !sunk) {
+ if (zr->zr_replay_cleaner)
+ zr->zr_replay_cleaner(zr->zr_arg);
txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
sunk = B_TRUE;
continue; /* retry */
@@ -1621,7 +1624,8 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
*/
void
zil_replay(objset_t *os, void *arg, uint64_t *txgp,
- zil_replay_func_t *replay_func[TX_MAX_TYPE])
+ zil_replay_func_t *replay_func[TX_MAX_TYPE],
+ zil_replay_cleaner_t *replay_cleaner)
{
zilog_t *zilog = dmu_objset_zil(os);
const zil_header_t *zh = zilog->zl_header;
@@ -1634,6 +1638,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
zr.zr_os = os;
zr.zr_replay = replay_func;
+ zr.zr_replay_cleaner = replay_cleaner;
zr.zr_arg = arg;
zr.zr_txgp = txgp;
zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 2c7865fee4..b9916d9830 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -713,7 +713,7 @@ zvol_create_minor(const char *name, major_t maj)
ASSERT(error == 0);
zv->zv_volblocksize = doi.doi_data_block_size;
- zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
+ zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
zvol_size_changed(zv, maj);
/* XXX this should handle the possible i/o error */