diff options
Diffstat (limited to 'usr/src/uts/common')
| -rw-r--r-- | usr/src/uts/common/fs/zfs/dmu_objset.c | 2 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/dmu_traverse.c | 4 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/metaslab.c | 204 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/spa.c | 4 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c | 4 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/metaslab.h | 5 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h | 2 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zil.h | 15 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zil_impl.h | 13 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zio.h | 9 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 4 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_mirror.c | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_queue.c | 4 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/zil.c | 412 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 29 | 
15 files changed, 428 insertions, 290 deletions
| diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 7784049a23..248612e3cc 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -541,7 +541,7 @@ dmu_objset_destroy(const char *name)  	 */  	error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);  	if (error == 0) { -		zil_destroy(dmu_objset_zil(os)); +		zil_destroy(dmu_objset_zil(os), B_FALSE);  		dmu_objset_close(os);  	} diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index 950b2af548..3d2bc3e476 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -484,7 +484,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)  	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {  		zb->zb_object = 0; -		zb->zb_blkid = bp->blk_cksum.zc_word[3]; +		zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];  		bc->bc_blkptr = *bp;  		(void) traverse_callback(th, zseg, bc);  	} @@ -539,7 +539,7 @@ traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)  	zilog = zil_alloc(dp->dp_meta_objset, zh); -	zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, +	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,  	    claim_txg);  	zil_free(zilog); diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 8728f21d7e..07494dacd4 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -593,52 +593,6 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)  	mutex_exit(&msp->ms_lock);  } -/* - * Intent log support: upon opening the pool after a crash, notify the SPA - * of blocks that the intent log has allocated for immediate write, but - * which are still considered free by the SPA because the last transaction - * group didn't commit yet. - */ -int -metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg) -{ -	uint64_t vdev = DVA_GET_VDEV(dva); -	uint64_t offset = DVA_GET_OFFSET(dva); -	uint64_t size = DVA_GET_ASIZE(dva); -	vdev_t *vd; -	metaslab_t *msp; -	int error; - -	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) -		return (ENXIO); - -	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) -		return (ENXIO); - -	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - -	if (DVA_GET_GANG(dva)) -		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - -	mutex_enter(&msp->ms_lock); - -	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); -	if (error) { -		mutex_exit(&msp->ms_lock); -		return (error); -	} - -	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) -		vdev_dirty(vd, VDD_METASLAB, msp, txg); - -	space_map_claim(&msp->ms_map, offset, size); -	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); - -	mutex_exit(&msp->ms_lock); - -	return (0); -} -  static uint64_t  metaslab_distance(metaslab_t *msp, dva_t *dva)  { @@ -735,7 +689,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,   * Allocate a block for the specified i/o.   */  static int -metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d, +metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,      dva_t *hintdva, uint64_t txg)  {  	metaslab_group_t *mg, *rotor; @@ -747,6 +701,8 @@ metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d,  	uint64_t asize;  	uint64_t distance; +	ASSERT(!DVA_IS_VALID(&dva[d])); +  	mc = spa_metaslab_class_select(spa);  	/* @@ -854,41 +810,12 @@ top:  	return (ENOSPC);  } -int -metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies, -    uint64_t txg, blkptr_t *hintbp) -{ -	int d, error; -	dva_t *dva = bp->blk_dva; -	dva_t *hintdva = hintbp->blk_dva; - -	ASSERT(ncopies > 0 && ncopies <= spa_max_replication(spa)); -	ASSERT(BP_GET_NDVAS(bp) == 0); -	ASSERT(hintbp == NULL || ncopies <= BP_GET_NDVAS(hintbp)); - -	for (d = 0; d < ncopies; d++) { -		error = metaslab_alloc_one(spa, psize, dva, d, hintdva, txg); -		if (error) { -			for (d--; d >= 0; d--) { -				ASSERT(DVA_IS_VALID(&dva[d])); -				metaslab_free(spa, &dva[d], txg, B_TRUE); -				bzero(&dva[d], sizeof (dva_t)); -			} -			return (ENOSPC); -		} -	} -	ASSERT(error == 0); -	ASSERT(BP_GET_NDVAS(bp) == ncopies); - -	return (0); -} -  /*   * Free the block represented by DVA in the context of the specified   * transaction group.   */ -void -metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now) +static void +metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)  {  	uint64_t vdev = DVA_GET_VDEV(dva);  	uint64_t offset = DVA_GET_OFFSET(dva); @@ -896,19 +823,15 @@ metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now)  	vdev_t *vd;  	metaslab_t *msp; +	ASSERT(DVA_IS_VALID(dva)); +  	if (txg > spa_freeze_txg(spa))  		return; -	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { -		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu", -		    (u_longlong_t)vdev); -		ASSERT(0); -		return; -	} - -	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { -		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu", -		    (u_longlong_t)offset); +	if ((vd = vdev_lookup_top(spa, vdev)) == NULL || +	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { +		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", +		    (u_longlong_t)vdev, (u_longlong_t)offset);  		ASSERT(0);  		return;  	} @@ -932,3 +855,108 @@ metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now)  	mutex_exit(&msp->ms_lock);  } + +/* + * Intent log support: upon opening the pool after a crash, notify the SPA + * of blocks that the intent log has allocated for immediate write, but + * which are still considered free by the SPA because the last transaction + * group didn't commit yet. + */ +static int +metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ +	uint64_t vdev = DVA_GET_VDEV(dva); +	uint64_t offset = DVA_GET_OFFSET(dva); +	uint64_t size = DVA_GET_ASIZE(dva); +	vdev_t *vd; +	metaslab_t *msp; +	int error; + +	ASSERT(DVA_IS_VALID(dva)); + +	if ((vd = vdev_lookup_top(spa, vdev)) == NULL || +	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) +		return (ENXIO); + +	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + +	if (DVA_GET_GANG(dva)) +		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + +	mutex_enter(&msp->ms_lock); + +	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); +	if (error) { +		mutex_exit(&msp->ms_lock); +		return (error); +	} + +	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) +		vdev_dirty(vd, VDD_METASLAB, msp, txg); + +	space_map_claim(&msp->ms_map, offset, size); +	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + +	mutex_exit(&msp->ms_lock); + +	return (0); +} + +int +metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas, +    uint64_t txg, blkptr_t *hintbp) +{ +	dva_t *dva = bp->blk_dva; +	dva_t *hintdva = hintbp->blk_dva; +	int d; +	int error = 0; + +	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); +	ASSERT(BP_GET_NDVAS(bp) == 0); +	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); + +	for (d = 0; d < ndvas; d++) { +		error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, txg); +		if (error) { +			for (d--; d >= 0; d--) { +				metaslab_free_dva(spa, &dva[d], txg, B_TRUE); +				bzero(&dva[d], sizeof (dva_t)); +			} +			return (error); +		} +	} +	ASSERT(error == 0); +	ASSERT(BP_GET_NDVAS(bp) == ndvas); + +	return (0); +} + +void +metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) +{ +	const dva_t *dva = bp->blk_dva; +	int ndvas = BP_GET_NDVAS(bp); +	int d; + +	ASSERT(!BP_IS_HOLE(bp)); + +	for (d = 0; d < ndvas; d++) +		metaslab_free_dva(spa, &dva[d], txg, now); +} + +int +metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) +{ +	const dva_t *dva = bp->blk_dva; +	int ndvas = BP_GET_NDVAS(bp); +	int d, error; +	int last_error = 0; + +	ASSERT(!BP_IS_HOLE(bp)); + +	for (d = 0; d < ndvas; d++) +		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) +			last_error = error; + +	return (last_error); +} diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 95f633eac1..8de9585e2d 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -426,7 +426,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)  	error = zap_lookup(spa->spa_meta_objset,  	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,  	    sizeof (uint64_t), 1, &spa->spa_errlog_last); -	if (error != 0 &&error != ENOENT) { +	if (error != 0 && error != ENOENT) {  		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,  		    VDEV_AUX_CORRUPT_DATA);  		error = EIO; @@ -1530,7 +1530,7 @@ spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,  	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)  		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */ -	flags |= ZIO_FLAG_CANFAIL; +	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;  	zio_nowait(zio_read(NULL, spa, bp, data, size,  	    spa_scrub_io_done, NULL, priority, flags, zb)); diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 843b77d9ff..11267729d9 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -616,7 +616,7 @@ spa_get_random(uint64_t range)  }  void -sprintf_blkptr(char *buf, int len, blkptr_t *bp) +sprintf_blkptr(char *buf, int len, const blkptr_t *bp)  {  	int d; @@ -637,7 +637,7 @@ sprintf_blkptr(char *buf, int len, blkptr_t *bp)  	    (u_longlong_t)BP_GET_PSIZE(bp));  	for (d = 0; d < BP_GET_NDVAS(bp); d++) { -		dva_t *dva = &bp->blk_dva[d]; +		const dva_t *dva = &bp->blk_dva[d];  		(void) snprintf(buf + strlen(buf), len - strlen(buf),  		    "DVA[%d]=<%llu:%llx:%llx> ", d,  		    (u_longlong_t)DVA_GET_VDEV(dva), diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index c72b5ddf16..3811e636f3 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -49,8 +49,9 @@ extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);  extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,      int ncopies, uint64_t txg, blkptr_t *hintbp); -extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now); -extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg); +extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, +    boolean_t now); +extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);  extern metaslab_class_t *metaslab_class_create(void);  extern void metaslab_class_destroy(metaslab_class_t *mc); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 265d19f63a..a51cfd524f 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -407,7 +407,7 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);  extern char *spa_strdup(const char *);  extern void spa_strfree(char *);  extern uint64_t spa_get_random(uint64_t range); -extern void sprintf_blkptr(char *buf, int len, blkptr_t *bp); +extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);  extern void spa_freeze(spa_t *spa);  extern void spa_upgrade(spa_t *spa);  extern void spa_evict_all(void); diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h index 81ccb6beef..040c096f3f 100644 --- a/usr/src/uts/common/fs/zfs/sys/zil.h +++ b/usr/src/uts/common/fs/zfs/sys/zil.h @@ -57,7 +57,8 @@ typedef struct zil_header {  	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */  	uint64_t zh_replay_seq;	/* highest replayed sequence number */  	blkptr_t zh_log;	/* log chain */ -	uint64_t zit_pad[6]; +	uint64_t zh_claim_seq;	/* highest claimed sequence number */ +	uint64_t zh_pad[5];  } zil_header_t;  /* @@ -80,6 +81,14 @@ typedef struct zil_trailer {  #define	ZIL_BLK_DATA_SZ(lwb)	((lwb)->lwb_sz - sizeof (zil_trailer_t))  /* + * The words of a log block checksum. + */ +#define	ZIL_ZC_GUID_0	0 +#define	ZIL_ZC_GUID_1	1 +#define	ZIL_ZC_OBJSET	2 +#define	ZIL_ZC_SEQ	3 + +/*   * Intent log transaction types and record structures   */  #define	TX_CREATE	1		/* Create file */ @@ -208,7 +217,7 @@ typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,  typedef int zil_replay_func_t();  typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf); -extern void	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, +extern uint64_t	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,      zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);  extern void	zil_init(void); @@ -222,7 +231,7 @@ extern void	zil_close(zilog_t *zilog);  extern void	zil_replay(objset_t *os, void *arg, uint64_t *txgp,      zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_wait)(void *)); -extern void	zil_destroy(zilog_t *zilog); +extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);  extern itx_t	*zil_itx_create(int txtype, size_t lrsize);  extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h index 53951b809c..f36bd94352 100644 --- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h @@ -2,9 +2,8 @@   * CDDL HEADER START   *   * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License").  You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.   *   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE   * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2005 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -75,7 +74,7 @@ struct zilog {  	kmutex_t	zl_lock;	/* protects most zilog_t fields */  	struct dsl_pool	*zl_dmu_pool;	/* DSL pool */  	spa_t		*zl_spa;	/* handle for read/write log */ -	zil_header_t	*zl_header;	/* log header buffer */ +	const zil_header_t *zl_header;	/* log header buffer */  	objset_t	*zl_os;		/* object set we're logging */  	zil_get_data_t	*zl_get_data;	/* callback to get object content */  	uint64_t	zl_itx_seq;	/* itx sequence number */ @@ -85,6 +84,9 @@ struct zilog {  	uint32_t	zl_suspend;	/* log suspend count */  	kcondvar_t	zl_cv_write;	/* for waiting to write to log */  	kcondvar_t	zl_cv_seq;	/* for committing a sequence */ +	kcondvar_t	zl_cv_suspend;	/* log suspend completion */ +	uint8_t		zl_suspending;	/* log is currently suspending */ +	uint8_t		zl_keep_first;	/* keep first log block in destroy */  	uint8_t		zl_stop_replay;	/* don't replay any further */  	uint8_t		zl_stop_sync;	/* for debugging */  	uint8_t		zl_writer;	/* boolean: write setup in progress */ @@ -97,7 +99,6 @@ struct zilog {  	list_t		zl_vdev_list;	/* list of [vdev, seq] pairs */  	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */  	avl_tree_t	zl_dva_tree;	/* track DVAs during log parse */ -	kmutex_t	zl_destroy_lock; /* serializes zil_destroy() calls */  };  typedef struct zil_dva_node { diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 66c9a910ca..b4958ee3fd 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -125,7 +125,8 @@ enum zio_compress {  #define	ZIO_FLAG_RESILVER		0x01000  #define	ZIO_FLAG_SCRUB			0x02000 -#define	ZIO_FLAG_SUBBLOCK		0x04000 +#define	ZIO_FLAG_SCRUB_THREAD		0x04000 +#define	ZIO_FLAG_SUBBLOCK		0x08000  #define	ZIO_FLAG_NOBOOKMARK		0x10000 @@ -137,7 +138,8 @@ enum zio_compress {  	ZIO_FLAG_IO_REPAIR |		\  	ZIO_FLAG_SPECULATIVE |		\  	ZIO_FLAG_RESILVER |		\ -	ZIO_FLAG_SCRUB) +	ZIO_FLAG_SCRUB |		\ +	ZIO_FLAG_SCRUB_THREAD)  #define	ZIO_FLAG_VDEV_INHERIT		\  	(ZIO_FLAG_GANG_INHERIT |	\ @@ -282,8 +284,7 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,      uint64_t size, void *data, int checksum,      zio_done_func_t *done, void *private, int priority, int flags); -extern int zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, -    blkptr_t *bp, uint64_t txg); +extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg);  extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);  extern int zio_wait(zio_t *zio); diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 7836041872..1914d8d903 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -1502,7 +1502,7 @@ vdev_stat_update(zio_t *zio)  		if ((flags & ZIO_FLAG_IO_REPAIR) &&  		    zio->io_delegate_list == NULL) {  			mutex_enter(&vd->vdev_stat_lock); -			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) +			if (flags & ZIO_FLAG_SCRUB_THREAD)  				vs->vs_scrub_repaired += zio->io_size;  			else  				vs->vs_self_healed += zio->io_size; @@ -1530,7 +1530,7 @@ vdev_stat_update(zio_t *zio)  	if (type == ZIO_TYPE_WRITE) {  		if (txg == 0 || vd->vdev_children != 0)  			return; -		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { +		if (flags & ZIO_FLAG_SCRUB_THREAD) {  			ASSERT(flags & ZIO_FLAG_IO_REPAIR);  			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)  				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index d79c38a32e..eb3f0a862d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -389,7 +389,9 @@ vdev_mirror_io_done(zio_t *zio)  		ASSERT(zio->io_error != 0);  	if (good_copies && (spa_mode & FWRITE) && -	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { +	    (unexpected_errors || +	    (zio->io_flags & ZIO_FLAG_RESILVER) || +	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {  		zio_t *rio;  		/* @@ -415,7 +417,8 @@ vdev_mirror_io_done(zio_t *zio)  			if (mc->mc_error == 0) {  				if (mc->mc_tried)  					continue; -				if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, +				if (!(zio->io_flags & ZIO_FLAG_SCRUB) && +				    !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,  				    zio->io_txg, 1))  					continue;  				mc->mc_error = ESTALE; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index bb838fedd1..631948bb1b 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -118,7 +118,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)  	avl_add(&vq->vq_deadline_tree, zio);  	avl_add(zio->io_vdev_tree, zio); -	if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && +	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&  	    ++vq->vq_scrub_count >= vq->vq_scrub_limit)  		spa_scrub_throttle(zio->io_spa, 1);  } @@ -126,7 +126,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)  static void  vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)  { -	if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && +	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&  	    vq->vq_scrub_count-- >= vq->vq_scrub_limit)  		spa_scrub_throttle(zio->io_spa, -1); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 7a0f71dc31..397d410a24 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -127,76 +127,94 @@ zil_dva_tree_add(avl_tree_t *t, dva_t *dva)  	return (0);  } +static zil_header_t * +zil_header_in_syncing_context(zilog_t *zilog) +{ +	return ((zil_header_t *)zilog->zl_header); +} + +static void +zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) +{ +	zio_cksum_t *zc = &bp->blk_cksum; + +	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); +	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); +	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); +	zc->zc_word[ZIL_ZC_SEQ] = 1ULL; +} +  /*   * Read a log block, make sure it's valid, and byteswap it if necessary.   */  static int -zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf) +zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)  { -	uint64_t blksz = BP_GET_LSIZE(bp); -	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1; -	zio_cksum_t cksum; +	blkptr_t blk = *bp;  	zbookmark_t zb;  	int error; -	zb.zb_objset = bp->blk_cksum.zc_word[2]; +	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];  	zb.zb_object = 0;  	zb.zb_level = -1; -	zb.zb_blkid = bp->blk_cksum.zc_word[3]; +	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; -	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz, -	    NULL, NULL, ZIO_PRIORITY_SYNC_READ, -	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); -	if (error) { -		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ", -		    zilog, bp, error); -		return (error); -	} +	*abufpp = NULL; -	if (BP_SHOULD_BYTESWAP(bp)) -		byteswap_uint64_array(buf, blksz); +	error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array, +	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | +	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, ARC_WAIT, &zb); -	/* -	 * Sequence numbers should be... sequential.  The checksum verifier for -	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>. -	 */ -	cksum = bp->blk_cksum; -	cksum.zc_word[3]++; -	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) { -		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp); -		return (ESTALE); -	} +	if (error == 0) { +		char *data = (*abufpp)->b_data; +		uint64_t blksz = BP_GET_LSIZE(bp); +		zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1; +		zio_cksum_t cksum = bp->blk_cksum; -	if (BP_IS_HOLE(&ztp->zit_next_blk)) { -		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp); -		return (ENOENT); -	} +		/* +		 * Sequence numbers should be... sequential.  The checksum +		 * verifier for the next block should be bp's checksum plus 1. +		 */ +		cksum.zc_word[ZIL_ZC_SEQ]++; + +		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum))) +			error = ESTALE; +		else if (BP_IS_HOLE(&ztp->zit_next_blk)) +			error = ENOENT; +		else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) +			error = EOVERFLOW; -	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) { -		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp); -		return (EOVERFLOW); +		if (error) { +			VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); +			*abufpp = NULL; +		}  	} -	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp); +	dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid); -	return (0); +	return (error);  }  /*   * Parse the intent log, and call parse_func for each valid record within. + * Return the highest sequence number.   */ -void +uint64_t  zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,      zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)  { -	blkptr_t blk; +	const zil_header_t *zh = zilog->zl_header; +	uint64_t claim_seq = zh->zh_claim_seq; +	uint64_t seq = 0; +	uint64_t max_seq = 0; +	blkptr_t blk = zh->zh_log; +	arc_buf_t *abuf;  	char *lrbuf, *lrp;  	zil_trailer_t *ztp;  	int reclen, error; -	blk = zilog->zl_header->zh_log;  	if (BP_IS_HOLE(&blk)) -		return; +		return (max_seq);  	/*  	 * Starting at the block pointed to by zh_log we read the log chain. @@ -204,11 +222,20 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,  	 * ensure its validity.  We stop when an invalid block is found.  	 * For each block pointer in the chain we call parse_blk_func().  	 * For each record in each valid block we call parse_lr_func(). +	 * If the log has been claimed, stop if we encounter a sequence +	 * number greater than the highest claimed sequence number.  	 */  	zil_dva_tree_init(&zilog->zl_dva_tree); -	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);  	for (;;) { -		error = zil_read_log_block(zilog, &blk, lrbuf); +		seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + +		if (claim_seq != 0 && seq > claim_seq) +			break; + +		ASSERT(max_seq < seq); +		max_seq = seq; + +		error = zil_read_log_block(zilog, &blk, &abuf);  		if (parse_blk_func != NULL)  			parse_blk_func(zilog, &blk, arg, txg); @@ -216,11 +243,14 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,  		if (error)  			break; +		lrbuf = abuf->b_data;  		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;  		blk = ztp->zit_next_blk; -		if (parse_lr_func == NULL) +		if (parse_lr_func == NULL) { +			VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);  			continue; +		}  		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {  			lr_t *lr = (lr_t *)lrp; @@ -228,9 +258,11 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,  			ASSERT3U(reclen, >=, sizeof (lr_t));  			parse_lr_func(zilog, lr, arg, txg);  		} +		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);  	} -	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);  	zil_dva_tree_fini(&zilog->zl_dva_tree); + +	return (max_seq);  }  /* ARGSUSED */ @@ -240,8 +272,6 @@ zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)  	spa_t *spa = zilog->zl_spa;  	int err; -	dprintf_bp(bp, "first_txg %llu: ", first_txg); -  	/*  	 * Claim log block if not already committed and not already claimed.  	 */ @@ -291,44 +321,42 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)  static void  zil_create(zilog_t *zilog)  { +	const zil_header_t *zh = zilog->zl_header;  	lwb_t *lwb; -	uint64_t txg; -	dmu_tx_t *tx; +	uint64_t txg = 0; +	dmu_tx_t *tx = NULL;  	blkptr_t blk; -	int error; -	int no_blk; - -	ASSERT(zilog->zl_header->zh_claim_txg == 0); -	ASSERT(zilog->zl_header->zh_replay_seq == 0); +	int error = 0;  	/* -	 * Initialize the log header block. +	 * Wait for any previous destroy to complete.  	 */ -	tx = dmu_tx_create(zilog->zl_os); -	(void) dmu_tx_assign(tx, TXG_WAIT); -	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); -	txg = dmu_tx_get_txg(tx); +	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + +	ASSERT(zh->zh_claim_txg == 0); +	ASSERT(zh->zh_replay_seq == 0); + +	blk = zh->zh_log;  	/* -	 * If we don't have a log block already then -	 * allocate the first log block and assign its checksum verifier. +	 * If we don't already have an initial log block, allocate one now.  	 */ -	no_blk = BP_IS_HOLE(&zilog->zl_header->zh_log); -	if (no_blk) { -		error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG, -		    ZIL_MIN_BLKSZ, &blk, txg); -	} else { -		blk = zilog->zl_header->zh_log; -		error = 0; +	if (BP_IS_HOLE(&blk)) { +		tx = dmu_tx_create(zilog->zl_os); +		(void) dmu_tx_assign(tx, TXG_WAIT); +		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); +		txg = dmu_tx_get_txg(tx); + +		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, txg); + +		if (error == 0) +			zil_init_log_chain(zilog, &blk);  	} -	if (error == 0) { -		ZIO_SET_CHECKSUM(&blk.blk_cksum, -		    spa_get_random(-1ULL), spa_get_random(-1ULL), -		    dmu_objset_id(zilog->zl_os), 1ULL); -		/* -		 * Allocate a log write buffer (lwb) for the first log block. -		 */ +	/* +	 * Allocate a log write buffer (lwb) for the first log block. +	 */ +	if (error == 0) {  		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);  		lwb->lwb_zilog = zilog;  		lwb->lwb_blk = blk; @@ -343,43 +371,81 @@ zil_create(zilog_t *zilog)  		mutex_exit(&zilog->zl_lock);  	} -	dmu_tx_commit(tx); -	if (no_blk) +	/* +	 * If we just allocated the first log block, commit our transaction +	 * and wait for zil_sync() to stuff the block poiner into zh_log. +	 * (zh is part of the MOS, so we cannot modify it in open context.) +	 */ +	if (tx != NULL) { +		dmu_tx_commit(tx);  		txg_wait_synced(zilog->zl_dmu_pool, txg); +	} + +	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);  }  /*   * In one tx, free all log blocks and clear the log header. + * If keep_first is set, then we're replaying a log with no content. + * We want to keep the first block, however, so that the first + * synchronous transaction doesn't require a txg_wait_synced() + * in zil_create().  We don't need to txg_wait_synced() here either + * when keep_first is set, because both zil_create() and zil_destroy() + * will wait for any in-progress destroys to complete.   */  void -zil_destroy(zilog_t *zilog) +zil_destroy(zilog_t *zilog, boolean_t keep_first)  { +	const zil_header_t *zh = zilog->zl_header; +	lwb_t *lwb;  	dmu_tx_t *tx;  	uint64_t txg; -	mutex_enter(&zilog->zl_destroy_lock); +	/* +	 * Wait for any previous destroy to complete. +	 */ +	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); -	if (BP_IS_HOLE(&zilog->zl_header->zh_log)) { -		mutex_exit(&zilog->zl_destroy_lock); +	if (BP_IS_HOLE(&zh->zh_log))  		return; -	}  	tx = dmu_tx_create(zilog->zl_os);  	(void) dmu_tx_assign(tx, TXG_WAIT);  	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);  	txg = dmu_tx_get_txg(tx); -	zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, -	    zilog->zl_header->zh_claim_txg); -	/* -	 * zil_sync clears the zil header as soon as the zl_destroy_txg commits -	 */ +	mutex_enter(&zilog->zl_lock); + +	ASSERT3U(zilog->zl_destroy_txg, <, txg);  	zilog->zl_destroy_txg = txg; +	zilog->zl_keep_first = keep_first; + +	if (!list_is_empty(&zilog->zl_lwb_list)) { +		ASSERT(zh->zh_claim_txg == 0); +		ASSERT(!keep_first); +		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { +			list_remove(&zilog->zl_lwb_list, lwb); +			if (lwb->lwb_buf != NULL) +				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); +			zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg); +			kmem_cache_free(zil_lwb_cache, lwb); +		} +		mutex_exit(&zilog->zl_lock); +	} else { +		mutex_exit(&zilog->zl_lock); +		if (!keep_first) { +			(void) zil_parse(zilog, zil_free_log_block, +			    zil_free_log_record, tx, zh->zh_claim_txg); +		} +	}  	dmu_tx_commit(tx); -	txg_wait_synced(zilog->zl_dmu_pool, txg); -	mutex_exit(&zilog->zl_destroy_lock); +	if (keep_first)			/* no need to wait in this case */ +		return; + +	txg_wait_synced(zilog->zl_dmu_pool, txg); +	ASSERT(BP_IS_HOLE(&zh->zh_log));  }  void @@ -399,18 +465,23 @@ zil_claim(char *osname, void *txarg)  	}  	zilog = dmu_objset_zil(os); -	zh = zilog->zl_header; +	zh = zil_header_in_syncing_context(zilog);  	/* -	 * Claim all log blocks if we haven't already done so. +	 * Claim all log blocks if we haven't already done so, and remember +	 * the highest claimed sequence number.  This ensures that if we can +	 * read only part of the log now (e.g. due to a missing device), +	 * but we can read the entire log later, we will not try to replay +	 * or destroy beyond the last block we successfully claimed.  	 */  	ASSERT3U(zh->zh_claim_txg, <=, first_txg);  	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {  		zh->zh_claim_txg = first_txg; -		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, -		    tx, first_txg); +		zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block, +		    zil_claim_log_record, tx, first_txg);  		dsl_dataset_dirty(dmu_objset_ds(os), tx);  	} +  	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));  	dmu_objset_close(os);  } @@ -555,6 +626,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)  {  	lwb_t *nlwb;  	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; +	spa_t *spa = zilog->zl_spa; +	blkptr_t *bp = &ztp->zit_next_blk;  	uint64_t txg;  	uint64_t zil_blksz;  	zbookmark_t zb; @@ -583,8 +656,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)  	if (zil_blksz > ZIL_MAX_BLKSZ)  		zil_blksz = ZIL_MAX_BLKSZ; -	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG, -	    zil_blksz, &ztp->zit_next_blk, txg); +	error = zio_alloc_blk(spa, zil_blksz, bp, txg);  	if (error) {  		/*  		 * Reinitialise the lwb. @@ -599,12 +671,12 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)  		return (NULL);  	} -	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg); +	ASSERT3U(bp->blk_birth, ==, txg);  	ztp->zit_pad = 0;  	ztp->zit_nused = lwb->lwb_nused;  	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; -	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum; -	ztp->zit_next_blk.blk_cksum.zc_word[3]++; +	bp->blk_cksum = lwb->lwb_blk.blk_cksum; +	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;  	/*  	 * Allocate a new log write buffer (lwb). @@ -612,7 +684,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)  	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);  	nlwb->lwb_zilog = zilog; -	nlwb->lwb_blk = ztp->zit_next_blk; +	nlwb->lwb_blk = *bp;  	nlwb->lwb_nused = 0;  	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);  	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz); @@ -633,14 +705,12 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)  	/*  	 * write the old log block  	 */ -	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); - -	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2]; +	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];  	zb.zb_object = 0;  	zb.zb_level = -1; -	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3]; +	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; -	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0, +	zio_nowait(zio_rewrite(NULL, spa, ZIO_CHECKSUM_ZILOG, 0,  	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,  	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb)); @@ -949,21 +1019,40 @@ zil_commit(zilog_t *zilog, uint64_t seq, int ioflag)  void  zil_sync(zilog_t *zilog, dmu_tx_t *tx)  { +	zil_header_t *zh = zil_header_in_syncing_context(zilog);  	uint64_t txg = dmu_tx_get_txg(tx);  	spa_t *spa = zilog->zl_spa;  	lwb_t *lwb; +	mutex_enter(&zilog->zl_lock); +  	ASSERT(zilog->zl_stop_sync == 0); -	zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; +	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];  	if (zilog->zl_destroy_txg == txg) { -		bzero(zilog->zl_header, sizeof (zil_header_t)); +		blkptr_t blk = zh->zh_log; + +		ASSERT(list_head(&zilog->zl_lwb_list) == NULL); +		ASSERT(spa_sync_pass(spa) == 1); + +		bzero(zh, sizeof (zil_header_t));  		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); -		zilog->zl_destroy_txg = 0; + +		if (zilog->zl_keep_first) { +			/* +			 * If this block was part of log chain that couldn't +			 * be claimed because a device was missing during +			 * zil_claim(), but that device later returns, +			 * then this block could erroneously appear valid. +			 * To guard against this, assign a new GUID to the new +			 * log chain so it doesn't matter what blk points to. +			 */ +			zil_init_log_chain(zilog, &blk); +			zh->zh_log = blk; +		}  	} -	mutex_enter(&zilog->zl_lock);  	for (;;) {  		lwb = list_head(&zilog->zl_lwb_list);  		if (lwb == NULL) { @@ -976,7 +1065,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)  		zio_free_blk(spa, &lwb->lwb_blk, txg);  		kmem_cache_free(zil_lwb_cache, lwb);  	} -	zilog->zl_header->zh_log = lwb->lwb_blk; +	zh->zh_log = lwb->lwb_blk;  	mutex_exit(&zilog->zl_lock);  } @@ -1004,6 +1093,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)  	zilog->zl_os = os;  	zilog->zl_spa = dmu_objset_spa(os);  	zilog->zl_dmu_pool = dmu_objset_pool(os); +	zilog->zl_destroy_txg = TXG_INITIAL - 1;  	list_create(&zilog->zl_itx_list, sizeof (itx_t),  	    offsetof(itx_t, itx_node)); @@ -1051,18 +1141,17 @@ zil_free(zilog_t *zilog)  static int  zil_empty(zilog_t *zilog)  { -	blkptr_t blk; -	char *lrbuf; -	int error; +	const zil_header_t *zh = zilog->zl_header; +	arc_buf_t *abuf = NULL; -	blk = zilog->zl_header->zh_log; -	if (BP_IS_HOLE(&blk)) +	if (BP_IS_HOLE(&zh->zh_log))  		return (1); -	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); -	error = zil_read_log_block(zilog, &blk, lrbuf); -	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); -	return (error ? 1 : 0); +	if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) +		return (1); + +	VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); +	return (0);  }  /* @@ -1086,8 +1175,20 @@ zil_open(objset_t *os, zil_get_data_t *get_data)  void  zil_close(zilog_t *zilog)  { -	if (!zil_is_committed(zilog)) -		txg_wait_synced(zilog->zl_dmu_pool, 0); +	/* +	 * If the log isn't already committed, mark the objset dirty +	 * (so zil_sync() will be called) and wait for that txg to sync. +	 */ +	if (!zil_is_committed(zilog)) { +		uint64_t txg; +		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); +		(void) dmu_tx_assign(tx, TXG_WAIT); +		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); +		txg = dmu_tx_get_txg(tx); +		dmu_tx_commit(tx); +		txg_wait_synced(zilog->zl_dmu_pool, txg); +	} +  	taskq_destroy(zilog->zl_clean_taskq);  	zilog->zl_clean_taskq = NULL;  	zilog->zl_get_data = NULL; @@ -1105,38 +1206,55 @@ zil_close(zilog_t *zilog)  int  zil_suspend(zilog_t *zilog)  { +	const zil_header_t *zh = zilog->zl_header;  	lwb_t *lwb;  	mutex_enter(&zilog->zl_lock); -	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */ +	if (zh->zh_claim_txg != 0) {		/* unplayed log */  		mutex_exit(&zilog->zl_lock);  		return (EBUSY);  	} -	zilog->zl_suspend++; +	if (zilog->zl_suspend++ != 0) { +		/* +		 * Someone else already began a suspend. +		 * Just wait for them to finish. +		 */ +		while (zilog->zl_suspending) +			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); +		ASSERT(BP_IS_HOLE(&zh->zh_log)); +		mutex_exit(&zilog->zl_lock); +		return (0); +	} +	zilog->zl_suspending = B_TRUE;  	mutex_exit(&zilog->zl_lock);  	zil_commit(zilog, UINT64_MAX, FSYNC);  	mutex_enter(&zilog->zl_lock); -	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { -		if (lwb->lwb_buf != NULL) { -			/* -			 * Wait for the buffer if it's in the process of -			 * being written. -			 */ -			if ((lwb->lwb_seq != 0) && -			    (lwb->lwb_state != SEQ_COMPLETE)) { -				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock); -				continue; -			} -			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); -		} -		list_remove(&zilog->zl_lwb_list, lwb); -		kmem_cache_free(zil_lwb_cache, lwb); +	for (;;) { +		/* +		 * Wait for any in-flight log writes to complete. +		 */ +		for (lwb = list_head(&zilog->zl_lwb_list); lwb != NULL; +		    lwb = list_next(&zilog->zl_lwb_list, lwb)) +			if (lwb->lwb_seq != 0 && lwb->lwb_state != SEQ_COMPLETE) +				break; + +		if (lwb == NULL) +			break; + +		cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);  	} +  	mutex_exit(&zilog->zl_lock); -	zil_destroy(zilog); +	zil_destroy(zilog, B_FALSE); + +	mutex_enter(&zilog->zl_lock); +	ASSERT(BP_IS_HOLE(&zh->zh_log)); +	zilog->zl_suspending = B_FALSE; +	cv_broadcast(&zilog->zl_cv_suspend); +	mutex_exit(&zilog->zl_lock);  	return (0);  } @@ -1164,7 +1282,7 @@ static void  zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)  {  	zil_replay_arg_t *zr = zra; -	zil_header_t *zh = zilog->zl_header; +	const zil_header_t *zh = zilog->zl_header;  	uint64_t reclen = lr->lrc_reclen;  	uint64_t txtype = lr->lrc_txtype;  	int pass, error; @@ -1310,15 +1428,11 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,  	zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))  {  	zilog_t *zilog = dmu_objset_zil(os); -		zil_replay_arg_t zr; +	const zil_header_t *zh = zilog->zl_header; +	zil_replay_arg_t zr;  	if (zil_empty(zilog)) { -		/* -		 * Initialise the log header but don't free the log block -		 * which will get reused. -		 */ -		zilog->zl_header->zh_claim_txg = 0; -		zilog->zl_header->zh_replay_seq = 0; +		zil_destroy(zilog, B_TRUE);  		return;  	} @@ -1327,7 +1441,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,  	zr.zr_arg = arg;  	zr.zr_rm_sync = rm_sync;  	zr.zr_txgp = txgp; -	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log); +	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);  	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);  	/* @@ -1338,11 +1452,11 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,  	txg_wait_synced(zilog->zl_dmu_pool, 0);  	zilog->zl_stop_replay = 0; -	zil_parse(zilog, NULL, zil_replay_log_record, &zr, -	    zilog->zl_header->zh_claim_txg); +	(void) zil_parse(zilog, NULL, zil_replay_log_record, &zr, +	    zh->zh_claim_txg);  	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); -	zil_destroy(zilog); +	zil_destroy(zilog, B_FALSE);  }  /* @@ -1353,7 +1467,7 @@ zil_is_committed(zilog_t *zilog)  {  	lwb_t *lwb; -	if (zilog == NULL || list_head(&zilog->zl_itx_list)) +	if (!list_is_empty(&zilog->zl_itx_list))  		return (B_FALSE);  	/* diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 373d0c41d0..bf7c9791fe 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1263,13 +1263,8 @@ static void  zio_dva_free(zio_t *zio)  {  	blkptr_t *bp = zio->io_bp; -	dva_t *dva = bp->blk_dva; -	int d; - -	ASSERT(!BP_IS_HOLE(bp)); -	for (d = 0; d < BP_GET_NDVAS(bp); d++) -		metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE); +	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);  	BP_ZERO(bp); @@ -1279,18 +1274,7 @@ zio_dva_free(zio_t *zio)  static void  zio_dva_claim(zio_t *zio)  { -	blkptr_t *bp = zio->io_bp; -	dva_t *dva = bp->blk_dva; -	int error = 0; -	int d; - -	ASSERT(!BP_IS_HOLE(bp)); - -	for (d = 0; d < BP_GET_NDVAS(bp); d++) { -		error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg); -		if (error) -			zio->io_error = error; -	} +	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);  	zio_next_stage(zio);  } @@ -1669,8 +1653,7 @@ zio_next_stage_async(zio_t *zio)   * Try to allocate an intent log block.  Return 0 on success, errno on failure.   */  int -zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, -    uint64_t txg) +zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg)  {  	int error; @@ -1681,10 +1664,10 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,  	error = metaslab_alloc(spa, size, bp, 1, txg, NULL);  	if (error == 0) { -		BP_SET_CHECKSUM(bp, checksum);  		BP_SET_LSIZE(bp, size);  		BP_SET_PSIZE(bp, size);  		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); +		BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_ZILOG);  		BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);  		BP_SET_LEVEL(bp, 0);  		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); @@ -1705,11 +1688,9 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)  {  	ASSERT(!BP_IS_GANG(bp)); -	dprintf_bp(bp, "txg %llu: ", txg); -  	spa_config_enter(spa, RW_READER, FTAG); -	metaslab_free(spa, BP_IDENTITY(bp), txg, B_FALSE); +	metaslab_free(spa, bp, txg, B_FALSE);  	spa_config_exit(spa, FTAG);  } | 
