author:    bonwick <none@none>  2006-04-13 16:15:06 -0700
committer: bonwick <none@none>  2006-04-13 16:15:06 -0700
commit:    d80c45e0f58fa434ba37259ea2e2b12e0380c19a
tree:      857b0f3c3b06a61e15ee0a5788b0a4162784c946  /usr/src/uts/common
parent:    dc8169d4bd081d8011b71f02f6eb10df71d4d686
6410711 intent log blocks don't get invited to pool parties
Diffstat (limited to 'usr/src/uts/common')

 usr/src/uts/common/fs/zfs/dmu_objset.c   |   2
 usr/src/uts/common/fs/zfs/dmu_traverse.c |   4
 usr/src/uts/common/fs/zfs/metaslab.c     | 204
 usr/src/uts/common/fs/zfs/spa.c          |   4
 usr/src/uts/common/fs/zfs/spa_misc.c     |   4
 usr/src/uts/common/fs/zfs/sys/metaslab.h |   5
 usr/src/uts/common/fs/zfs/sys/spa.h      |   2
 usr/src/uts/common/fs/zfs/sys/zil.h      |  15
 usr/src/uts/common/fs/zfs/sys/zil_impl.h |  13
 usr/src/uts/common/fs/zfs/sys/zio.h      |   9
 usr/src/uts/common/fs/zfs/vdev.c         |   4
 usr/src/uts/common/fs/zfs/vdev_mirror.c  |   7
 usr/src/uts/common/fs/zfs/vdev_queue.c   |   4
 usr/src/uts/common/fs/zfs/zil.c          | 412
 usr/src/uts/common/fs/zfs/zio.c          |  29
15 files changed, 428 insertions, 290 deletions
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 7784049a23..248612e3cc 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -541,7 +541,7 @@ dmu_objset_destroy(const char *name)
 	 */
 	error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
 	if (error == 0) {
-		zil_destroy(dmu_objset_zil(os));
+		zil_destroy(dmu_objset_zil(os), B_FALSE);
 		dmu_objset_close(os);
 	}

diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
index 950b2af548..3d2bc3e476 100644
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -484,7 +484,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
 		zb->zb_object = 0;
-		zb->zb_blkid = bp->blk_cksum.zc_word[3];
+		zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
 		bc->bc_blkptr = *bp;
 		(void) traverse_callback(th, zseg, bc);
 	}
@@ -539,7 +539,7 @@ traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)

 	zilog = zil_alloc(dp->dp_meta_objset, zh);

-	zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
+	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
 	    claim_txg);

 	zil_free(zilog);
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 8728f21d7e..07494dacd4 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -593,52 +593,6 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	mutex_exit(&msp->ms_lock);
 }

-/*
- * Intent log support: upon opening the pool after a crash, notify the SPA
- * of blocks that the intent log has allocated for immediate write, but
- * which are still considered free by the SPA because the last transaction
- * group didn't commit yet.
- */
-int
-metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
-{
-	uint64_t vdev = DVA_GET_VDEV(dva);
-	uint64_t offset = DVA_GET_OFFSET(dva);
-	uint64_t size = DVA_GET_ASIZE(dva);
-	vdev_t *vd;
-	metaslab_t *msp;
-	int error;
-
-	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
-		return (ENXIO);
-
-	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
-		return (ENXIO);
-
-	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
-	if (DVA_GET_GANG(dva))
-		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
-	mutex_enter(&msp->ms_lock);
-
-	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
-	if (error) {
-		mutex_exit(&msp->ms_lock);
-		return (error);
-	}
-
-	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
-		vdev_dirty(vd, VDD_METASLAB, msp, txg);
-
-	space_map_claim(&msp->ms_map, offset, size);
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
-
-	mutex_exit(&msp->ms_lock);
-
-	return (0);
-}
-
 static uint64_t
 metaslab_distance(metaslab_t *msp, dva_t *dva)
 {
@@ -735,7 +689,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
  * Allocate a block for the specified i/o.
  */
 static int
-metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d,
+metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
     dva_t *hintdva, uint64_t txg)
 {
 	metaslab_group_t *mg, *rotor;
@@ -747,6 +701,8 @@ metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d,
 	uint64_t asize;
 	uint64_t distance;

+	ASSERT(!DVA_IS_VALID(&dva[d]));
+
 	mc = spa_metaslab_class_select(spa);

 	/*
@@ -854,41 +810,12 @@ top:
 	return (ENOSPC);
 }

-int
-metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies,
-    uint64_t txg, blkptr_t *hintbp)
-{
-	int d, error;
-	dva_t *dva = bp->blk_dva;
-	dva_t *hintdva = hintbp->blk_dva;
-
-	ASSERT(ncopies > 0 && ncopies <= spa_max_replication(spa));
-	ASSERT(BP_GET_NDVAS(bp) == 0);
-	ASSERT(hintbp == NULL || ncopies <= BP_GET_NDVAS(hintbp));
-
-	for (d = 0; d < ncopies; d++) {
-		error = metaslab_alloc_one(spa, psize, dva, d, hintdva, txg);
-		if (error) {
-			for (d--; d >= 0; d--) {
-				ASSERT(DVA_IS_VALID(&dva[d]));
-				metaslab_free(spa, &dva[d], txg, B_TRUE);
-				bzero(&dva[d], sizeof (dva_t));
-			}
-			return (ENOSPC);
-		}
-	}
-	ASSERT(error == 0);
-	ASSERT(BP_GET_NDVAS(bp) == ncopies);
-
-	return (0);
-}
-
 /*
  * Free the block represented by DVA in the context of the specified
  * transaction group.
  */
-void
-metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now)
+static void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
@@ -896,19 +823,15 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
 	vdev_t *vd;
 	metaslab_t *msp;

+	ASSERT(DVA_IS_VALID(dva));
+
 	if (txg > spa_freeze_txg(spa))
 		return;

-	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
-		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
-		    (u_longlong_t)vdev);
-		ASSERT(0);
-		return;
-	}
-
-	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
-		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
-		    (u_longlong_t)offset);
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
+		    (u_longlong_t)vdev, (u_longlong_t)offset);
 		ASSERT(0);
 		return;
 	}
@@ -932,3 +855,108 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)

 	mutex_exit(&msp->ms_lock);
 }
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t size = DVA_GET_ASIZE(dva);
+	vdev_t *vd;
+	metaslab_t *msp;
+	int error;
+
+	ASSERT(DVA_IS_VALID(dva));
+
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+		return (ENXIO);
+
+	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+	if (DVA_GET_GANG(dva))
+		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+	mutex_enter(&msp->ms_lock);
+
+	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+	if (error) {
+		mutex_exit(&msp->ms_lock);
+		return (error);
+	}
+
+	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+		vdev_dirty(vd, VDD_METASLAB, msp, txg);
+
+	space_map_claim(&msp->ms_map, offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+	mutex_exit(&msp->ms_lock);
+
+	return (0);
+}
+
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
+    uint64_t txg, blkptr_t *hintbp)
+{
+	dva_t *dva = bp->blk_dva;
+	dva_t *hintdva = hintbp->blk_dva;
+	int d;
+	int error = 0;
+
+	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+	ASSERT(BP_GET_NDVAS(bp) == 0);
+	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+
+	for (d = 0; d < ndvas; d++) {
+		error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, txg);
+		if (error) {
+			for (d--; d >= 0; d--) {
+				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+				bzero(&dva[d], sizeof (dva_t));
+			}
+			return (error);
+		}
+	}
+	ASSERT(error == 0);
+	ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+	return (0);
+}
+
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	int d;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	for (d = 0; d < ndvas; d++)
+		metaslab_free_dva(spa, &dva[d], txg, now);
+}
+
+int
+metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	int d, error;
+	int last_error = 0;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	for (d = 0; d < ndvas; d++)
+		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
+			last_error = error;
+
+	return (last_error);
+}
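Editorial aside: the new metaslab_claim() and metaslab_free() take a whole block pointer and iterate its DVAs, rather than forcing every caller to loop. Below is a minimal standalone sketch of the "claim every copy, remember the last error" shape of that loop; mini_dva_t, mini_bp_t, and claim_one() are hypothetical stand-ins for the real ZFS structures, not their actual definitions.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MINI_NDVAS 3

typedef struct { uint64_t vdev, offset; } mini_dva_t;
typedef struct { mini_dva_t dva[MINI_NDVAS]; int ndvas; } mini_bp_t;

/* Pretend one copy lives on a missing vdev. */
static int
claim_one(const mini_dva_t *dva)
{
	return (dva->vdev == 7 ? ENXIO : 0);
}

static int
claim_block(const mini_bp_t *bp)
{
	int d, error, last_error = 0;

	/*
	 * Keep going after a failure: copies on healthy vdevs still get
	 * claimed, and the caller sees the last error encountered.
	 */
	for (d = 0; d < bp->ndvas; d++)
		if ((error = claim_one(&bp->dva[d])) != 0)
			last_error = error;

	return (last_error);
}

int
main(void)
{
	mini_bp_t bp = { { { 0, 100 }, { 7, 200 } }, 2 };

	printf("claim_block: %d\n", claim_block(&bp));	/* prints ENXIO */
	return (0);
}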
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 95f633eac1..8de9585e2d 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -426,7 +426,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, &spa->spa_errlog_last);
-	if (error != 0 &&error != ENOENT) {
+	if (error != 0 && error != ENOENT) {
 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		error = EIO;
@@ -1530,7 +1530,7 @@ spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

-	flags |= ZIO_FLAG_CANFAIL;
+	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;

 	zio_nowait(zio_read(NULL, spa, bp, data, size, spa_scrub_io_done,
 	    NULL, priority, flags, zb));

diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 843b77d9ff..11267729d9 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -616,7 +616,7 @@ spa_get_random(uint64_t range)
 }

 void
-sprintf_blkptr(char *buf, int len, blkptr_t *bp)
+sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
 {
 	int d;

@@ -637,7 +637,7 @@ sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
 	    (u_longlong_t)BP_GET_PSIZE(bp));

 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-		dva_t *dva = &bp->blk_dva[d];
+		const dva_t *dva = &bp->blk_dva[d];
 		(void) snprintf(buf + strlen(buf), len - strlen(buf),
 		    "DVA[%d]=<%llu:%llx:%llx> ", d,
 		    (u_longlong_t)DVA_GET_VDEV(dva),

diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index c72b5ddf16..3811e636f3 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -49,8 +49,9 @@ extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);

 extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
     int ncopies, uint64_t txg, blkptr_t *hintbp);
-extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now);
-extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
+extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
+    boolean_t now);
+extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);

 extern metaslab_class_t *metaslab_class_create(void);
 extern void metaslab_class_destroy(metaslab_class_t *mc);

diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 265d19f63a..a51cfd524f 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -407,7 +407,7 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, blkptr_t *bp);
+extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern void spa_upgrade(spa_t *spa);
 extern void spa_evict_all(void);
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index 81ccb6beef..040c096f3f 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -57,7 +57,8 @@ typedef struct zil_header {
 	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
 	uint64_t zh_replay_seq;	/* highest replayed sequence number */
 	blkptr_t zh_log;	/* log chain */
-	uint64_t zit_pad[6];
+	uint64_t zh_claim_seq;	/* highest claimed sequence number */
+	uint64_t zh_pad[5];
 } zil_header_t;

 /*
@@ -80,6 +81,14 @@ typedef struct zil_trailer {
 #define	ZIL_BLK_DATA_SZ(lwb)	((lwb)->lwb_sz - sizeof (zil_trailer_t))

 /*
+ * The words of a log block checksum.
+ */
+#define	ZIL_ZC_GUID_0	0
+#define	ZIL_ZC_GUID_1	1
+#define	ZIL_ZC_OBJSET	2
+#define	ZIL_ZC_SEQ	3
+
+/*
  * Intent log transaction types and record structures
  */
 #define	TX_CREATE	1	/* Create file */
@@ -208,7 +217,7 @@ typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
 typedef int zil_replay_func_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf);

-extern void zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);

 extern void zil_init(void);
@@ -222,7 +231,7 @@ extern void zil_close(zilog_t *zilog);
 extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
     zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_wait)(void *));

-extern void zil_destroy(zilog_t *zilog);
+extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);

 extern itx_t *zil_itx_create(int txtype, size_t lrsize);
 extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
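Editorial aside: the four checksum words of a ZIL block now have symbolic names instead of magic indices. Here is a standalone sketch of how those words name a block, modeled on the ZIL_ZC_* indices and seeding values in the diff; the mini_cksum_t struct and main() are scaffolding, not the real zio_cksum_t.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ZIL_ZC_GUID_0	0	/* random guid word 0 */
#define ZIL_ZC_GUID_1	1	/* random guid word 1 */
#define ZIL_ZC_OBJSET	2	/* objset id of the log's owner */
#define ZIL_ZC_SEQ	3	/* block sequence number in the chain */

typedef struct {
	uint64_t zc_word[4];
} mini_cksum_t;

static void
init_log_chain(mini_cksum_t *zc, uint64_t guid0, uint64_t guid1,
    uint64_t objset)
{
	zc->zc_word[ZIL_ZC_GUID_0] = guid0;
	zc->zc_word[ZIL_ZC_GUID_1] = guid1;
	zc->zc_word[ZIL_ZC_OBJSET] = objset;
	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;		/* first block of the chain */
}

int
main(void)
{
	mini_cksum_t zc;

	init_log_chain(&zc, 0xdeadbeefULL, 0xfeedfaceULL, 42);
	printf("objset %" PRIu64 ", seq %" PRIu64 "\n",
	    zc.zc_word[ZIL_ZC_OBJSET], zc.zc_word[ZIL_ZC_SEQ]);
	return (0);
}

Because the guid words are random per chain, a block from an older chain can never match the expected checksum of a newer one, which is exactly what the keep_first logic in zil_sync() (later in this diff) relies on.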
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
index 53951b809c..f36bd94352 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -75,7 +74,7 @@ struct zilog {
 	kmutex_t	zl_lock;	/* protects most zilog_t fields */
 	struct dsl_pool	*zl_dmu_pool;	/* DSL pool */
 	spa_t		*zl_spa;	/* handle for read/write log */
-	zil_header_t	*zl_header;	/* log header buffer */
+	const zil_header_t *zl_header;	/* log header buffer */
 	objset_t	*zl_os;		/* object set we're logging */
 	zil_get_data_t	*zl_get_data;	/* callback to get object content */
 	uint64_t	zl_itx_seq;	/* itx sequence number */
@@ -85,6 +84,9 @@ struct zilog {
 	uint32_t	zl_suspend;	/* log suspend count */
 	kcondvar_t	zl_cv_write;	/* for waiting to write to log */
 	kcondvar_t	zl_cv_seq;	/* for committing a sequence */
+	kcondvar_t	zl_cv_suspend;	/* log suspend completion */
+	uint8_t		zl_suspending;	/* log is currently suspending */
+	uint8_t		zl_keep_first;	/* keep first log block in destroy */
 	uint8_t		zl_stop_replay;	/* don't replay any further */
 	uint8_t		zl_stop_sync;	/* for debugging */
 	uint8_t		zl_writer;	/* boolean: write setup in progress */
@@ -97,7 +99,6 @@ struct zilog {
 	list_t		zl_vdev_list;	/* list of [vdev, seq] pairs */
 	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */
 	avl_tree_t	zl_dva_tree;	/* track DVAs during log parse */
-	kmutex_t	zl_destroy_lock; /* serializes zil_destroy() calls */
 };

 typedef struct zil_dva_node {

diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 66c9a910ca..b4958ee3fd 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -125,7 +125,8 @@ enum zio_compress {
 #define	ZIO_FLAG_RESILVER	0x01000
 #define	ZIO_FLAG_SCRUB		0x02000
-#define	ZIO_FLAG_SUBBLOCK	0x04000
+#define	ZIO_FLAG_SCRUB_THREAD	0x04000
+#define	ZIO_FLAG_SUBBLOCK	0x08000

 #define	ZIO_FLAG_NOBOOKMARK	0x10000

@@ -137,7 +138,8 @@ enum zio_compress {
 	 ZIO_FLAG_IO_REPAIR |		\
 	 ZIO_FLAG_SPECULATIVE |		\
 	 ZIO_FLAG_RESILVER |		\
-	 ZIO_FLAG_SCRUB)
+	 ZIO_FLAG_SCRUB |		\
+	 ZIO_FLAG_SCRUB_THREAD)

 #define	ZIO_FLAG_VDEV_INHERIT		\
 	(ZIO_FLAG_GANG_INHERIT |	\
@@ -282,8 +284,7 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
     zio_done_func_t *done, void *private, int priority, int flags);

-extern int zio_alloc_blk(spa_t *spa, int checksum, uint64_t size,
-    blkptr_t *bp, uint64_t txg);
+extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg);
 extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);

 extern int zio_wait(zio_t *zio);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 7836041872..1914d8d903 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -1502,7 +1502,7 @@ vdev_stat_update(zio_t *zio)
 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
 		    zio->io_delegate_list == NULL) {
 			mutex_enter(&vd->vdev_stat_lock);
-			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
+			if (flags & ZIO_FLAG_SCRUB_THREAD)
 				vs->vs_scrub_repaired += zio->io_size;
 			else
 				vs->vs_self_healed += zio->io_size;
@@ -1530,7 +1530,7 @@ vdev_stat_update(zio_t *zio)
 	if (type == ZIO_TYPE_WRITE) {
 		if (txg == 0 || vd->vdev_children != 0)
 			return;
-		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+		if (flags & ZIO_FLAG_SCRUB_THREAD) {
 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);

diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index d79c38a32e..eb3f0a862d 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -389,7 +389,9 @@ vdev_mirror_io_done(zio_t *zio)
 		ASSERT(zio->io_error != 0);

 	if (good_copies && (spa_mode & FWRITE) &&
-	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+	    (unexpected_errors ||
+	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
+	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
 		zio_t *rio;

 		/*
@@ -415,7 +417,8 @@ vdev_mirror_io_done(zio_t *zio)
 			if (mc->mc_error == 0) {
 				if (mc->mc_tried)
 					continue;
-				if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
+				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+				    !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
 				    zio->io_txg, 1))
 					continue;
 				mc->mc_error = ESTALE;

diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index bb838fedd1..631948bb1b 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -118,7 +118,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 	avl_add(&vq->vq_deadline_tree, zio);
 	avl_add(zio->io_vdev_tree, zio);

-	if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
+	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&
 	    ++vq->vq_scrub_count >= vq->vq_scrub_limit)
 		spa_scrub_throttle(zio->io_spa, 1);
 }
@@ -126,7 +126,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
-	if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
+	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&
 	    vq->vq_scrub_count-- >= vq->vq_scrub_limit)
 		spa_scrub_throttle(zio->io_spa, -1);
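Editorial aside: the new ZIO_FLAG_SCRUB_THREAD distinguishes I/O issued by the scrub/resilver thread itself from ordinary reads that happen to repair a bad copy, so the stat counters and the scrub throttle stop miscounting self-healing traffic. A toy illustration of that accounting decision follows; the flag values and account() helper are hypothetical, only the shape of the test matches the diff.

#include <stdint.h>
#include <stdio.h>

#define FLAG_IO_REPAIR		0x1
#define FLAG_SCRUB		0x2	/* block is part of a scrub */
#define FLAG_SCRUB_THREAD	0x4	/* I/O issued by the scrub thread */

static const char *
account(uint32_t flags)
{
	if (!(flags & FLAG_IO_REPAIR))
		return ("not a repair");
	/* Only scrub-thread repairs count as "scrub repaired". */
	return ((flags & FLAG_SCRUB_THREAD) ? "scrub repaired" : "self healed");
}

int
main(void)
{
	printf("%s\n", account(FLAG_IO_REPAIR | FLAG_SCRUB_THREAD));
	printf("%s\n", account(FLAG_IO_REPAIR));  /* demand-read repair */
	return (0);
}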
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 7a0f71dc31..397d410a24 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -127,76 +127,94 @@ zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
 	return (0);
 }

+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+	return ((zil_header_t *)zilog->zl_header);
+}
+
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+	zio_cksum_t *zc = &bp->blk_cksum;
+
+	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
+
 /*
  * Read a log block, make sure it's valid, and byteswap it if necessary.
  */
 static int
-zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
 {
-	uint64_t blksz = BP_GET_LSIZE(bp);
-	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
-	zio_cksum_t cksum;
+	blkptr_t blk = *bp;
 	zbookmark_t zb;
 	int error;

-	zb.zb_objset = bp->blk_cksum.zc_word[2];
+	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
 	zb.zb_object = 0;
 	zb.zb_level = -1;
-	zb.zb_blkid = bp->blk_cksum.zc_word[3];
+	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];

-	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
-	    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
-	if (error) {
-		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
-		    zilog, bp, error);
-		return (error);
-	}
+	*abufpp = NULL;

-	if (BP_SHOULD_BYTESWAP(bp))
-		byteswap_uint64_array(buf, blksz);
+	error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
+	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, ARC_WAIT, &zb);

-	/*
-	 * Sequence numbers should be... sequential.  The checksum verifier for
-	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>.
-	 */
-	cksum = bp->blk_cksum;
-	cksum.zc_word[3]++;
-	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) {
-		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp);
-		return (ESTALE);
-	}
+	if (error == 0) {
+		char *data = (*abufpp)->b_data;
+		uint64_t blksz = BP_GET_LSIZE(bp);
+		zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
+		zio_cksum_t cksum = bp->blk_cksum;

-	if (BP_IS_HOLE(&ztp->zit_next_blk)) {
-		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp);
-		return (ENOENT);
-	}
+		/*
+		 * Sequence numbers should be... sequential.  The checksum
+		 * verifier for the next block should be bp's checksum plus 1.
+		 */
+		cksum.zc_word[ZIL_ZC_SEQ]++;
+
+		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)))
+			error = ESTALE;
+		else if (BP_IS_HOLE(&ztp->zit_next_blk))
+			error = ENOENT;
+		else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))
+			error = EOVERFLOW;

-	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) {
-		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp);
-		return (EOVERFLOW);
+		if (error) {
+			VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
+			*abufpp = NULL;
+		}
 	}

-	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp);
+	dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);

-	return (0);
+	return (error);
 }
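Editorial aside: the chain check above accepts a block only if its embedded "next" pointer carries this block's checksum words with the sequence word bumped by one; anything else is a stale leftover from an older chain. A minimal sketch of just that comparison, with mini_cksum_t as a stand-in for zio_cksum_t:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
	uint64_t zc_word[4];
} mini_cksum_t;

#define ZC_SEQ	3

static bool
next_block_is_valid(const mini_cksum_t *cur, const mini_cksum_t *next)
{
	mini_cksum_t expect = *cur;

	expect.zc_word[ZC_SEQ]++;	/* seq numbers must be sequential */

	/* Anything else is a stale block from an older chain. */
	return (memcmp(&expect, next, sizeof (expect)) == 0);
}

int
main(void)
{
	mini_cksum_t cur = { { 1, 2, 42, 7 } };
	mini_cksum_t good = { { 1, 2, 42, 8 } };
	mini_cksum_t stale = { { 1, 2, 42, 9 } };

	printf("%d %d\n", next_block_is_valid(&cur, &good),
	    next_block_is_valid(&cur, &stale));	/* prints 1 0 */
	return (0);
}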
 /*
  * Parse the intent log, and call parse_func for each valid record within.
+ * Return the highest sequence number.
  */
-void
+uint64_t
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
 {
-	blkptr_t blk;
+	const zil_header_t *zh = zilog->zl_header;
+	uint64_t claim_seq = zh->zh_claim_seq;
+	uint64_t seq = 0;
+	uint64_t max_seq = 0;
+	blkptr_t blk = zh->zh_log;
+	arc_buf_t *abuf;
 	char *lrbuf, *lrp;
 	zil_trailer_t *ztp;
 	int reclen, error;

-	blk = zilog->zl_header->zh_log;
 	if (BP_IS_HOLE(&blk))
-		return;
+		return (max_seq);

 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
@@ -204,11 +222,20 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
+	 * If the log has been claimed, stop if we encounter a sequence
+	 * number greater than the highest claimed sequence number.
 	 */
 	zil_dva_tree_init(&zilog->zl_dva_tree);
-	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
 	for (;;) {
-		error = zil_read_log_block(zilog, &blk, lrbuf);
+		seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+		if (claim_seq != 0 && seq > claim_seq)
+			break;
+
+		ASSERT(max_seq < seq);
+		max_seq = seq;
+
+		error = zil_read_log_block(zilog, &blk, &abuf);

 		if (parse_blk_func != NULL)
 			parse_blk_func(zilog, &blk, arg, txg);
@@ -216,11 +243,14 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
 		if (error)
 			break;

+		lrbuf = abuf->b_data;
 		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
 		blk = ztp->zit_next_blk;

-		if (parse_lr_func == NULL)
+		if (parse_lr_func == NULL) {
+			VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
 			continue;
+		}

 		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
@@ -228,9 +258,11 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
 			ASSERT3U(reclen, >=, sizeof (lr_t));
 			parse_lr_func(zilog, lr, arg, txg);
 		}
+		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
 	}
-	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
 	zil_dva_tree_fini(&zilog->zl_dva_tree);
+
+	return (max_seq);
 }

 /* ARGSUSED */
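Editorial aside: zil_parse() now performs a bounded walk, stopping either at the first invalid block or, once the log has been claimed, at the first sequence number beyond the claimed high-water mark; this is what the bug title is about. The sketch below reduces that cutoff to its essentials; the seqs array stands in for the real block chain and is hypothetical.

#include <stdint.h>
#include <stdio.h>

static uint64_t
walk_chain(const uint64_t *seqs, int nblocks, uint64_t claim_seq)
{
	uint64_t max_seq = 0;
	int i;

	for (i = 0; i < nblocks; i++) {
		uint64_t seq = seqs[i];

		/* Never walk past what was claimed at pool open. */
		if (claim_seq != 0 && seq > claim_seq)
			break;
		max_seq = seq;
	}
	return (max_seq);
}

int
main(void)
{
	uint64_t seqs[] = { 1, 2, 3, 4, 5 };

	/* Claimed through seq 3: blocks 4 and 5 are ignored. */
	printf("%llu\n", (unsigned long long)walk_chain(seqs, 5, 3));
	return (0);
}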
@@ -240,8 +272,6 @@ zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
 	spa_t *spa = zilog->zl_spa;
 	int err;

-	dprintf_bp(bp, "first_txg %llu: ", first_txg);
-
 	/*
 	 * Claim log block if not already committed and not already claimed.
 	 */
@@ -291,44 +321,42 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 static void
 zil_create(zilog_t *zilog)
 {
+	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
-	uint64_t txg;
-	dmu_tx_t *tx;
+	uint64_t txg = 0;
+	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
-	int error;
-	int no_blk;
-
-	ASSERT(zilog->zl_header->zh_claim_txg == 0);
-	ASSERT(zilog->zl_header->zh_replay_seq == 0);
+	int error = 0;

 	/*
-	 * Initialize the log header block.
+	 * Wait for any previous destroy to complete.
 	 */
-	tx = dmu_tx_create(zilog->zl_os);
-	(void) dmu_tx_assign(tx, TXG_WAIT);
-	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
-	txg = dmu_tx_get_txg(tx);
+	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+	ASSERT(zh->zh_claim_txg == 0);
+	ASSERT(zh->zh_replay_seq == 0);
+
+	blk = zh->zh_log;

 	/*
-	 * If we don't have a log block already then
-	 * allocate the first log block and assign its checksum verifier.
+	 * If we don't already have an initial log block, allocate one now.
 	 */
-	no_blk = BP_IS_HOLE(&zilog->zl_header->zh_log);
-	if (no_blk) {
-		error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
-		    ZIL_MIN_BLKSZ, &blk, txg);
-	} else {
-		blk = zilog->zl_header->zh_log;
-		error = 0;
+	if (BP_IS_HOLE(&blk)) {
+		tx = dmu_tx_create(zilog->zl_os);
+		(void) dmu_tx_assign(tx, TXG_WAIT);
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		txg = dmu_tx_get_txg(tx);
+
+		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, txg);
+
+		if (error == 0)
+			zil_init_log_chain(zilog, &blk);
 	}

-	if (error == 0) {
-		ZIO_SET_CHECKSUM(&blk.blk_cksum,
-		    spa_get_random(-1ULL), spa_get_random(-1ULL),
-		    dmu_objset_id(zilog->zl_os), 1ULL);
-
-		/*
-		 * Allocate a log write buffer (lwb) for the first log block.
-		 */
+	/*
+	 * Allocate a log write buffer (lwb) for the first log block.
+	 */
+	if (error == 0) {
 		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 		lwb->lwb_zilog = zilog;
 		lwb->lwb_blk = blk;
@@ -343,43 +371,81 @@ zil_create(zilog_t *zilog)
 		mutex_exit(&zilog->zl_lock);
 	}

-	dmu_tx_commit(tx);
-	if (no_blk)
+	/*
+	 * If we just allocated the first log block, commit our transaction
+	 * and wait for zil_sync() to stuff the block pointer into zh_log.
+	 * (zh is part of the MOS, so we cannot modify it in open context.)
+	 */
+	if (tx != NULL) {
+		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
+	}
+
+	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 }
 /*
  * In one tx, free all log blocks and clear the log header.
+ * If keep_first is set, then we're replaying a log with no content.
+ * We want to keep the first block, however, so that the first
+ * synchronous transaction doesn't require a txg_wait_synced()
+ * in zil_create().  We don't need to txg_wait_synced() here either
+ * when keep_first is set, because both zil_create() and zil_destroy()
+ * will wait for any in-progress destroys to complete.
  */
 void
-zil_destroy(zilog_t *zilog)
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
+	const zil_header_t *zh = zilog->zl_header;
+	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;

-	mutex_enter(&zilog->zl_destroy_lock);
+	/*
+	 * Wait for any previous destroy to complete.
+	 */
+	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

-	if (BP_IS_HOLE(&zilog->zl_header->zh_log)) {
-		mutex_exit(&zilog->zl_destroy_lock);
+	if (BP_IS_HOLE(&zh->zh_log))
 		return;
-	}

 	tx = dmu_tx_create(zilog->zl_os);
 	(void) dmu_tx_assign(tx, TXG_WAIT);
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);

-	zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx,
-	    zilog->zl_header->zh_claim_txg);
-	/*
-	 * zil_sync clears the zil header as soon as the zl_destroy_txg commits
-	 */
+	mutex_enter(&zilog->zl_lock);
+
+	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
+	zilog->zl_keep_first = keep_first;
+
+	if (!list_is_empty(&zilog->zl_lwb_list)) {
+		ASSERT(zh->zh_claim_txg == 0);
+		ASSERT(!keep_first);
+		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+			list_remove(&zilog->zl_lwb_list, lwb);
+			if (lwb->lwb_buf != NULL)
+				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+			zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+			kmem_cache_free(zil_lwb_cache, lwb);
+		}
+		mutex_exit(&zilog->zl_lock);
+	} else {
+		mutex_exit(&zilog->zl_lock);
+		if (!keep_first) {
+			(void) zil_parse(zilog, zil_free_log_block,
+			    zil_free_log_record, tx, zh->zh_claim_txg);
+		}
+	}

 	dmu_tx_commit(tx);
-	txg_wait_synced(zilog->zl_dmu_pool, txg);

-	mutex_exit(&zilog->zl_destroy_lock);
+	if (keep_first)			/* no need to wait in this case */
+		return;
+
+	txg_wait_synced(zilog->zl_dmu_pool, txg);
+	ASSERT(BP_IS_HOLE(&zh->zh_log));
 }

 void
@@ -399,18 +465,23 @@ zil_claim(char *osname, void *txarg)
 	}

 	zilog = dmu_objset_zil(os);
-	zh = zilog->zl_header;
+	zh = zil_header_in_syncing_context(zilog);

 	/*
-	 * Claim all log blocks if we haven't already done so.
+	 * Claim all log blocks if we haven't already done so, and remember
+	 * the highest claimed sequence number.  This ensures that if we can
+	 * read only part of the log now (e.g. due to a missing device),
+	 * but we can read the entire log later, we will not try to replay
+	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		zh->zh_claim_txg = first_txg;
-		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record,
-		    tx, first_txg);
+		zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+		    zil_claim_log_record, tx, first_txg);
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
+
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_close(os);
 }
@@ -555,6 +626,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 {
 	lwb_t *nlwb;
 	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+	spa_t *spa = zilog->zl_spa;
+	blkptr_t *bp = &ztp->zit_next_blk;
 	uint64_t txg;
 	uint64_t zil_blksz;
 	zbookmark_t zb;
@@ -583,8 +656,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	if (zil_blksz > ZIL_MAX_BLKSZ)
 		zil_blksz = ZIL_MAX_BLKSZ;

-	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
-	    zil_blksz, &ztp->zit_next_blk, txg);
+	error = zio_alloc_blk(spa, zil_blksz, bp, txg);
 	if (error) {
 		/*
 		 * Reinitialise the lwb.
@@ -599,12 +671,12 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 		return (NULL);
 	}

-	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
+	ASSERT3U(bp->blk_birth, ==, txg);
 	ztp->zit_pad = 0;
 	ztp->zit_nused = lwb->lwb_nused;
 	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
-	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
-	ztp->zit_next_blk.blk_cksum.zc_word[3]++;
+	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

 	/*
 	 * Allocate a new log write buffer (lwb).
@@ -612,7 +684,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	nlwb->lwb_zilog = zilog;
-	nlwb->lwb_blk = ztp->zit_next_blk;
+	nlwb->lwb_blk = *bp;
 	nlwb->lwb_nused = 0;
 	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
 	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
@@ -633,14 +705,12 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	/*
 	 * write the old log block
 	 */
-	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
-
-	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2];
+	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
 	zb.zb_object = 0;
 	zb.zb_level = -1;
-	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3];
+	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];

-	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
+	zio_nowait(zio_rewrite(NULL, spa, ZIO_CHECKSUM_ZILOG, 0,
 	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
 	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb));

@@ -949,21 +1019,40 @@ zil_commit(zilog_t *zilog, uint64_t seq, int ioflag)
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
+	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	lwb_t *lwb;

+	mutex_enter(&zilog->zl_lock);
+
 	ASSERT(zilog->zl_stop_sync == 0);

-	zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];

 	if (zilog->zl_destroy_txg == txg) {
-		bzero(zilog->zl_header, sizeof (zil_header_t));
+		blkptr_t blk = zh->zh_log;
+
+		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+		ASSERT(spa_sync_pass(spa) == 1);
+
+		bzero(zh, sizeof (zil_header_t));
 		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
-		zilog->zl_destroy_txg = 0;
+
+		if (zilog->zl_keep_first) {
+			/*
+			 * If this block was part of log chain that couldn't
+			 * be claimed because a device was missing during
+			 * zil_claim(), but that device later returns,
+			 * then this block could erroneously appear valid.
+			 * To guard against this, assign a new GUID to the new
+			 * log chain so it doesn't matter what blk points to.
+			 */
+			zil_init_log_chain(zilog, &blk);
+			zh->zh_log = blk;
+		}
 	}

-	mutex_enter(&zilog->zl_lock);
 	for (;;) {
 		lwb = list_head(&zilog->zl_lwb_list);
 		if (lwb == NULL) {
@@ -976,7 +1065,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		zio_free_blk(spa, &lwb->lwb_blk, txg);
 		kmem_cache_free(zil_lwb_cache, lwb);
 	}
-	zilog->zl_header->zh_log = lwb->lwb_blk;
+	zh->zh_log = lwb->lwb_blk;
 	mutex_exit(&zilog->zl_lock);
 }

@@ -1004,6 +1093,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
 	zilog->zl_os = os;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
+	zilog->zl_destroy_txg = TXG_INITIAL - 1;

 	list_create(&zilog->zl_itx_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
@@ -1051,18 +1141,17 @@ zil_free(zilog_t *zilog)
 static int
 zil_empty(zilog_t *zilog)
 {
-	blkptr_t blk;
-	char *lrbuf;
-	int error;
+	const zil_header_t *zh = zilog->zl_header;
+	arc_buf_t *abuf = NULL;

-	blk = zilog->zl_header->zh_log;
-	if (BP_IS_HOLE(&blk))
+	if (BP_IS_HOLE(&zh->zh_log))
 		return (1);

-	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
-	error = zil_read_log_block(zilog, &blk, lrbuf);
-	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
-	return (error ? 1 : 0);
+	if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
+		return (1);
+
+	VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+	return (0);
 }

 /*
@@ -1086,8 +1175,20 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 void
 zil_close(zilog_t *zilog)
 {
-	if (!zil_is_committed(zilog))
-		txg_wait_synced(zilog->zl_dmu_pool, 0);
+	/*
+	 * If the log isn't already committed, mark the objset dirty
+	 * (so zil_sync() will be called) and wait for that txg to sync.
+	 */
+	if (!zil_is_committed(zilog)) {
+		uint64_t txg;
+		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+		(void) dmu_tx_assign(tx, TXG_WAIT);
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		txg = dmu_tx_get_txg(tx);
+		dmu_tx_commit(tx);
+		txg_wait_synced(zilog->zl_dmu_pool, txg);
+	}
+
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
 	zilog->zl_get_data = NULL;
@@ -1105,38 +1206,55 @@ zil_close(zilog_t *zilog)
 int
 zil_suspend(zilog_t *zilog)
 {
+	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;

 	mutex_enter(&zilog->zl_lock);
-	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */
+	if (zh->zh_claim_txg != 0) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		return (EBUSY);
 	}
-	zilog->zl_suspend++;
+	if (zilog->zl_suspend++ != 0) {
+		/*
+		 * Someone else already began a suspend.
+		 * Just wait for them to finish.
+		 */
+		while (zilog->zl_suspending)
+			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+		ASSERT(BP_IS_HOLE(&zh->zh_log));
+		mutex_exit(&zilog->zl_lock);
+		return (0);
+	}
+	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);

 	zil_commit(zilog, UINT64_MAX, FSYNC);

 	mutex_enter(&zilog->zl_lock);
-	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
-		if (lwb->lwb_buf != NULL) {
-			/*
-			 * Wait for the buffer if it's in the process of
-			 * being written.
-			 */
-			if ((lwb->lwb_seq != 0) &&
-			    (lwb->lwb_state != SEQ_COMPLETE)) {
-				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
-				continue;
-			}
-			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-		}
-		list_remove(&zilog->zl_lwb_list, lwb);
-		kmem_cache_free(zil_lwb_cache, lwb);
+	for (;;) {
+		/*
+		 * Wait for any in-flight log writes to complete.
+		 */
+		for (lwb = list_head(&zilog->zl_lwb_list); lwb != NULL;
+		    lwb = list_next(&zilog->zl_lwb_list, lwb))
+			if (lwb->lwb_seq != 0 && lwb->lwb_state != SEQ_COMPLETE)
+				break;
+
+		if (lwb == NULL)
+			break;
+
+		cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
 	}
+
 	mutex_exit(&zilog->zl_lock);

-	zil_destroy(zilog);
+	zil_destroy(zilog, B_FALSE);
+
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(BP_IS_HOLE(&zh->zh_log));
+	zilog->zl_suspending = B_FALSE;
+	cv_broadcast(&zilog->zl_cv_suspend);
+	mutex_exit(&zilog->zl_lock);

 	return (0);
 }
@@ -1164,7 +1282,7 @@ static void
 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
-	zil_header_t *zh = zilog->zl_header;
+	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int pass, error;
@@ -1310,15 +1428,11 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
     zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))
 {
 	zilog_t *zilog = dmu_objset_zil(os);
-	zil_replay_arg_t zr;
+	const zil_header_t *zh = zilog->zl_header;
+	zil_replay_arg_t zr;

 	if (zil_empty(zilog)) {
-		/*
-		 * Initialise the log header but don't free the log block
-		 * which will get reused.
-		 */
-		zilog->zl_header->zh_claim_txg = 0;
-		zilog->zl_header->zh_replay_seq = 0;
+		zil_destroy(zilog, B_TRUE);
 		return;
 	}

@@ -1327,7 +1441,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
 	zr.zr_arg = arg;
 	zr.zr_rm_sync = rm_sync;
 	zr.zr_txgp = txgp;
-	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log);
+	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

 	/*
@@ -1338,11 +1452,11 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
 	txg_wait_synced(zilog->zl_dmu_pool, 0);

 	zilog->zl_stop_replay = 0;
-	zil_parse(zilog, NULL, zil_replay_log_record, &zr,
-	    zilog->zl_header->zh_claim_txg);
+	(void) zil_parse(zilog, NULL, zil_replay_log_record, &zr,
+	    zh->zh_claim_txg);
 	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);

-	zil_destroy(zilog);
+	zil_destroy(zilog, B_FALSE);
 }

 /*
@@ -1353,7 +1467,7 @@ zil_is_committed(zilog_t *zilog)
 {
 	lwb_t *lwb;

-	if (zilog == NULL || list_head(&zilog->zl_itx_list))
+	if (!list_is_empty(&zilog->zl_itx_list))
 		return (B_FALSE);

 	/*
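Editorial aside: zil_suspend() now uses a single-suspender handshake: the first caller does the commit-and-destroy work while zl_suspending is set, and later callers just wait on zl_cv_suspend until it drops. The sketch below models that handshake with pthreads standing in for the kernel's kmutex/kcondvar; every name here is illustrative, not the kernel API.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv_suspend = PTHREAD_COND_INITIALIZER;
static int suspend_count;
static bool suspending;

static void
do_suspend_work(void)
{
	/* committing and destroying the log would happen here, unlocked */
}

void
suspend(void)
{
	pthread_mutex_lock(&lock);
	if (suspend_count++ != 0) {
		/* Someone else began a suspend; wait for them to finish. */
		while (suspending)
			pthread_cond_wait(&cv_suspend, &lock);
		pthread_mutex_unlock(&lock);
		return;
	}
	suspending = true;
	pthread_mutex_unlock(&lock);

	do_suspend_work();

	pthread_mutex_lock(&lock);
	suspending = false;
	pthread_cond_broadcast(&cv_suspend);
	pthread_mutex_unlock(&lock);
}

The broadcast (rather than a signal) matters: any number of concurrent suspenders may be parked on the condvar, and all of them can proceed once the work is done.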
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 373d0c41d0..bf7c9791fe 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -1263,13 +1263,8 @@ static void
 zio_dva_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = bp->blk_dva;
-	int d;
-
-	ASSERT(!BP_IS_HOLE(bp));

-	for (d = 0; d < BP_GET_NDVAS(bp); d++)
-		metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE);
+	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);

 	BP_ZERO(bp);

@@ -1279,18 +1274,7 @@ zio_dva_free(zio_t *zio)
 static void
 zio_dva_claim(zio_t *zio)
 {
-	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = bp->blk_dva;
-	int error = 0;
-	int d;
-
-	ASSERT(!BP_IS_HOLE(bp));
-
-	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-		error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg);
-		if (error)
-			zio->io_error = error;
-	}
+	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);

 	zio_next_stage(zio);
 }
@@ -1669,8 +1653,7 @@ zio_next_stage_async(zio_t *zio)
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
 int
-zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
-    uint64_t txg)
+zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg)
 {
 	int error;

@@ -1681,10 +1664,10 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg)
 	error = metaslab_alloc(spa, size, bp, 1, txg, NULL);

 	if (error == 0) {
-		BP_SET_CHECKSUM(bp, checksum);
 		BP_SET_LSIZE(bp, size);
 		BP_SET_PSIZE(bp, size);
 		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+		BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_ZILOG);
 		BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
 		BP_SET_LEVEL(bp, 0);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
@@ -1705,11 +1688,9 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
 {
 	ASSERT(!BP_IS_GANG(bp));

-	dprintf_bp(bp, "txg %llu: ", txg);
-
 	spa_config_enter(spa, RW_READER, FTAG);

-	metaslab_free(spa, BP_IDENTITY(bp), txg, B_FALSE);
+	metaslab_free(spa, bp, txg, B_FALSE);

 	spa_config_exit(spa, FTAG);
 }
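Editorial aside: zio_alloc_blk() no longer takes a checksum argument; intent log blocks always get ZIO_CHECKSUM_ZILOG, stamped after the DVAs are allocated. A toy version of that property-stamping step follows; the setters and field layout are simplified stand-ins for the real BP_SET_* macros, which pack these values into bit fields of the block pointer.

#include <stdint.h>

enum { COMPRESS_OFF = 0, CHECKSUM_ZILOG = 1, OT_INTENT_LOG = 2 };

typedef struct {
	uint64_t lsize, psize;
	uint8_t compress, checksum, type, level;
} mini_bp_t;

static void
stamp_intent_log_block(mini_bp_t *bp, uint64_t size)
{
	bp->lsize = size;		/* logical size */
	bp->psize = size;		/* physical size: never compressed */
	bp->compress = COMPRESS_OFF;	/* log blocks are written raw */
	bp->checksum = CHECKSUM_ZILOG;	/* embedded, chain-aware checksum */
	bp->type = OT_INTENT_LOG;
	bp->level = 0;			/* always a leaf block */
}

int
main(void)
{
	mini_bp_t bp = { 0 };

	stamp_intent_log_block(&bp, 4096);
	return (bp.type == OT_INTENT_LOG ? 0 : 1);
}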