diff options
Diffstat (limited to 'usr/src/uts/common/fs')
60 files changed, 2869 insertions, 2026 deletions
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index f485fe9f7c..057f207bc6 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -437,6 +436,7 @@ struct arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; + void *b_thawed; arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; @@ -545,8 +545,8 @@ static buf_hash_table_t buf_hash_table; (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(buf) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; @@ -664,6 +664,15 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; + hdr->b_cksum0 = 0; +} + static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { @@ -797,7 +806,8 @@ buf_cons(void *vbuf, void *unused, int kmflag) arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); - rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -826,7 +836,8 @@ buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; - rw_destroy(&buf->b_lock); + mutex_destroy(&buf->b_evict_lock); + rw_destroy(&buf->b_data_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } @@ -941,6 +952,11 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) void arc_buf_thaw(arc_buf_t *buf) { + kmutex_t *hash_lock; + + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); @@ -954,18 +970,32 @@ arc_buf_thaw(arc_buf_t *buf) kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } + + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_thawed) + kmem_free(buf->b_hdr->b_thawed, 1); + buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); + } + mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(hash_lock); } void arc_buf_freeze(arc_buf_t *buf) { + kmutex_t *hash_lock; + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + hash_lock = HDR_LOCK(buf->b_hdr); + mutex_enter(hash_lock); + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); + mutex_exit(hash_lock); } static void @@ -1037,7 +1067,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(new_state != old_state); ASSERT(refcnt == 0 || ab->b_datacnt > 0); ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon); ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); from_delta = to_delta = ab->b_datacnt * ab->b_size; @@ -1059,7 +1088,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) /* * If prefetching out of the ghost cache, - * we will have a non-null datacnt. + * we will have a non-zero datacnt. */ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { /* ghost elements have a ghost size */ @@ -1095,9 +1124,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon) { + if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) buf_hash_remove(ab); - } /* adjust state sizes */ if (to_delta) @@ -1254,7 +1282,6 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; - rw_enter(&buf->b_lock, RW_WRITER); ASSERT(buf->b_data != NULL); hdr = buf->b_hdr; (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); @@ -1263,7 +1290,6 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) buf->b_private = NULL; atomic_add_64(&arc_loaned_bytes, hdr->b_size); - rw_exit(&buf->b_lock); } static arc_buf_t * @@ -1299,16 +1325,16 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) * must verify b_data != NULL to know if the add_ref * was successful. */ - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); if (buf->b_data == NULL) { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return; } - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); + hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); - rw_exit(&buf->b_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); @@ -1394,6 +1420,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; + buf->b_next = NULL; ASSERT(buf->b_efunc == NULL); @@ -1442,23 +1469,21 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); } while (hdr->b_buf) { arc_buf_t *buf = hdr->b_buf; if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); @@ -1468,6 +1493,10 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } + if (hdr->b_thawed) { + kmem_free(hdr->b_thawed, 1); + hdr->b_thawed = NULL; + } ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1488,6 +1517,9 @@ arc_buf_free(arc_buf_t *buf, void *tag) kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); @@ -1512,12 +1544,10 @@ arc_buf_free(arc_buf_t *buf, void *tag) if (destroy_hdr) arc_hdr_destroy(hdr); } else { - if (remove_reference(hdr, NULL, tag) > 0) { - ASSERT(HDR_IO_ERROR(hdr)); + if (remove_reference(hdr, NULL, tag) > 0) arc_buf_destroy(buf, FALSE, TRUE); - } else { + else arc_hdr_destroy(hdr); - } } } @@ -1535,6 +1565,8 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) } mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT(hdr->b_state != arc_anon); ASSERT(buf->b_data != NULL); @@ -1613,7 +1645,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; - if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } @@ -1635,9 +1667,9 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); } else { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } @@ -1854,9 +1886,9 @@ arc_do_user_evicts(void) while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) @@ -2438,7 +2470,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { - bcopy(buf->b_data, arg, buf->b_hdr->b_size); + if (zio == NULL || zio->io_error == 0) + bcopy(buf->b_data, arg, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg) == 1); } @@ -2452,6 +2485,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) *bufp = NULL; } else { *bufp = buf; + ASSERT(buf->b_data); } } @@ -2606,13 +2640,22 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, { int err; + if (pbuf == NULL) { + /* + * XXX This happens from traverse callback funcs, for + * the objset_phys_t block. + */ + return (arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb)); + } + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); - rw_enter(&pbuf->b_lock, RW_READER); + rw_enter(&pbuf->b_data_lock, RW_READER); err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); - rw_exit(&pbuf->b_lock); + rw_exit(&pbuf->b_data_lock); return (err); } @@ -2721,9 +2764,7 @@ top: if (exists) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } @@ -2901,14 +2942,14 @@ arc_buf_evict(arc_buf_t *buf) kmutex_t *hash_lock; arc_buf_t **bufp; - rw_enter(&buf->b_lock, RW_WRITER); + mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (0); } else if (buf->b_data == NULL) { arc_buf_t copy = *buf; /* structure assignment */ @@ -2917,14 +2958,15 @@ arc_buf_evict(arc_buf_t *buf) * but let arc_do_user_evicts() do the reaping. */ buf->b_efunc = NULL; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); VERIFY(copy.b_efunc(©) == 0); return (1); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(buf->b_hdr == hdr); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); @@ -2943,6 +2985,7 @@ arc_buf_evict(arc_buf_t *buf) arc_state_t *old_state = hdr->b_state; arc_state_t *evicted_state; + ASSERT(hdr->b_buf == NULL); ASSERT(refcount_is_zero(&hdr->b_refcnt)); evicted_state = @@ -2960,12 +3003,13 @@ arc_buf_evict(arc_buf_t *buf) mutex_exit(&old_state->arcs_mtx); } mutex_exit(hash_lock); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; buf->b_private = NULL; buf->b_hdr = NULL; + buf->b_next = NULL; kmem_cache_free(buf_cache, buf); return (1); } @@ -2980,12 +3024,17 @@ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; + kmutex_t *hash_lock = NULL; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; - boolean_t released = B_FALSE; - rw_enter(&buf->b_lock, RW_WRITER); + /* + * It would be nice to assert that if it's DMU metadata (level > + * 0 || it's the dnode file), then it must be syncing context. + * But we don't know that information at this level. + */ + + mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; /* this buffer is not on any list */ @@ -2993,15 +3042,12 @@ arc_release(arc_buf_t *buf, void *tag) if (hdr->b_state == arc_anon) { /* this buffer is already released */ - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); - ASSERT(BUF_EMPTY(hdr)); ASSERT(buf->b_efunc == NULL); - arc_buf_thaw(buf); - rw_exit(&buf->b_lock); - released = B_TRUE; } else { hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); } l2hdr = hdr->b_l2hdr; @@ -3011,9 +3057,6 @@ arc_release(arc_buf_t *buf, void *tag) buf_size = hdr->b_size; } - if (released) - goto out; - /* * Do we have more than one buf? */ @@ -3027,14 +3070,14 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* - * Pull the data off of this buf and attach it to - * a new anonymous buf. + * Pull the data off of this hdr and attach it to + * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; - *bufp = (*bufp)->b_next; + *bufp = buf->b_next; buf->b_next = NULL; ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); @@ -3062,26 +3105,25 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - arc_change_state(arc_anon, hdr, hash_lock); + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; - mutex_exit(hash_lock); + if (hash_lock) + mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; -out: if (l2hdr) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); @@ -3090,14 +3132,27 @@ out: } } +/* + * Release this buffer. If it does not match the provided BP, fill it + * with that block's contents. + */ +/* ARGSUSED */ +int +arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb) +{ + arc_release(buf, tag); + return (0); +} + int arc_released(arc_buf_t *buf) { int released; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (released); } @@ -3106,9 +3161,9 @@ arc_has_callback(arc_buf_t *buf) { int callback; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); callback = (buf->b_efunc != NULL); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (callback); } @@ -3118,9 +3173,9 @@ arc_referenced(arc_buf_t *buf) { int referenced; - rw_enter(&buf->b_lock, RW_READER); + mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_refcnt)); - rw_exit(&buf->b_lock); + mutex_exit(&buf->b_evict_lock); return (referenced); } #endif @@ -3173,8 +3228,8 @@ arc_write_done(zio_t *zio) /* * If the block to be written was all-zero, we may have * compressed it away. In this case no write was performed - * so there will be no dva/birth-date/checksum. The buffer - * must therefor remain anonymous (and uncached). + * so there will be no dva/birth/checksum. The buffer must + * therefore remain anonymous (and uncached). */ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; @@ -3278,9 +3333,7 @@ arc_free(spa_t *spa, const blkptr_t *bp) if (HDR_IN_HASH_TABLE(ab)) buf_hash_remove(ab); ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; + buf_discard_identity(ab); ab->b_buf->b_efunc = NULL; ab->b_buf->b_private = NULL; mutex_exit(hash_lock); @@ -3974,11 +4027,11 @@ l2arc_read_done(zio_t *zio) ASSERT(cb != NULL); buf = cb->l2rcb_buf; ASSERT(buf != NULL); - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); + hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); + hdr = buf->b_hdr; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * Check this survived the L2ARC journey. diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c index 830b6c1a42..8f8c5e1fcf 100644 --- a/usr/src/uts/common/fs/zfs/bplist.c +++ b/usr/src/uts/common/fs/zfs/bplist.c @@ -175,23 +175,26 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) return (err); } - if (*itorp >= bpl->bpl_phys->bpl_entries) { - mutex_exit(&bpl->bpl_lock); - return (ENOENT); - } + do { + if (*itorp >= bpl->bpl_phys->bpl_entries) { + mutex_exit(&bpl->bpl_lock); + return (ENOENT); + } - blk = *itorp >> bpl->bpl_bpshift; - off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); + blk = *itorp >> bpl->bpl_bpshift; + off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } + err = bplist_cache(bpl, blk); + if (err) { + mutex_exit(&bpl->bpl_lock); + return (err); + } + + bparray = bpl->bpl_cached_dbuf->db_data; + *bp = bparray[off]; + (*itorp)++; + } while (bp->blk_birth == 0); - bparray = bpl->bpl_cached_dbuf->db_data; - *bp = bparray[off]; - (*itorp)++; mutex_exit(&bpl->bpl_lock); return (0); } @@ -206,8 +209,10 @@ bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); mutex_enter(&bpl->bpl_lock); err = bplist_hold(bpl); - if (err) + if (err) { + mutex_exit(&bpl->bpl_lock); return (err); + } blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index c211ff79dd..e1cd431acb 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -363,6 +363,7 @@ dbuf_verify(dmu_buf_impl_t *db) } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* @@ -477,8 +478,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - dbuf_rele(db, NULL); + dbuf_rele_and_unlock(db, NULL); } static void @@ -549,7 +549,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) else pbuf = db->db_objset->os_phys_buf; - (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, + (void) dsl_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -727,7 +727,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* free this block */ if (!BP_IS_HOLE(bp)) - dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp); + zio_free(db->db_dnode->dn_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; /* @@ -921,6 +921,26 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dnode_willuse_space(db->db_dnode, size-osize, tx); } +void +dbuf_release_bp(dmu_buf_impl_t *db) +{ + objset_t *os = db->db_dnode->dn_objset; + zbookmark_t zb; + + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(arc_released(os->os_phys_buf) || + list_link_active(&os->os_dsl_dataset->ds_synced_link)); + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + + zb.zb_objset = os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : 0; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; + (void) arc_release_bp(db->db_buf, db, + db->db_blkptr, os->os_spa, &zb); +} + dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { @@ -1717,7 +1737,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) else pbuf = dn->dn_objset->os_phys_buf; - (void) arc_read(NULL, dn->dn_objset->os_spa, + (void) dsl_read(NULL, dn->dn_objset->os_spa, bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); @@ -2463,7 +2483,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { - arc_release(data, db); + dbuf_release_bp(db); } } } diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c index 852fd1cdc4..64cbcc1f92 100644 --- a/usr/src/uts/common/fs/zfs/ddt.c +++ b/usr/src/uts/common/fs/zfs/ddt.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -35,6 +34,7 @@ #include <sys/dsl_pool.h> #include <sys/zio_checksum.h> #include <sys/zio_compress.h> +#include <sys/dsl_scan.h> static const ddt_ops_t *ddt_ops[DDT_TYPES] = { &ddt_zap_ops, @@ -160,7 +160,7 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt->ddt_object[type][class], dde)); } -static int +int ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx) { @@ -245,12 +245,13 @@ ddt_bp_create(enum zio_checksum checksum, ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); bp->blk_cksum = ddk->ddk_cksum; + bp->blk_fill = 1; BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); BP_SET_CHECKSUM(bp, checksum); - BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_TYPE(bp, DMU_OT_DEDUP); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); @@ -996,10 +997,17 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_object_create(ddt, ntype, nclass, tx); VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); - if (dp->dp_scrub_func != SCRUB_FUNC_NONE && - oclass > nclass && - nclass <= dp->dp_scrub_ddt_class_max) - dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde); + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. (This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) + */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } } } @@ -1013,7 +1021,6 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) if (avl_numnodes(&ddt->ddt_tree) == 0) return; - ASSERT(spa_sync_pass(spa) == 1); ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { @@ -1081,6 +1088,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) ddb->ddb_type, ddb->ddb_class, &ddb->ddb_cursor, dde); } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; if (error == 0) return (0); if (error != ENOENT) diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 0be72aa4f2..582089b8e8 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -84,7 +84,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "FUID table" }, { byteswap_uint64_array, TRUE, "FUID table size" }, { zap_byteswap, TRUE, "DSL dataset next clones"}, - { zap_byteswap, TRUE, "scrub work queue" }, + { zap_byteswap, TRUE, "scan work queue" }, { zap_byteswap, TRUE, "ZFS user/group used" }, { zap_byteswap, TRUE, "ZFS user/group quota" }, { zap_byteswap, TRUE, "snapshot refcount tags"}, @@ -93,7 +93,10 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "System attributes" }, { zap_byteswap, TRUE, "SA master node" }, { zap_byteswap, TRUE, "SA attr registration" }, - { zap_byteswap, TRUE, "SA attr layouts" }, }; + { zap_byteswap, TRUE, "SA attr layouts" }, + { zap_byteswap, TRUE, "scan translations" }, + { byteswap_uint8_array, FALSE, "deduplicated block" }, +}; int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, @@ -1630,6 +1633,7 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { + zfs_dbgmsg_init(); dbuf_init(); dnode_init(); zfetch_init(); @@ -1649,4 +1653,5 @@ dmu_fini(void) l2arc_fini(); xuio_stat_fini(); sa_cache_fini(); + zfs_dbgmsg_fini(); } diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 210d693051..546cd98b84 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -259,11 +259,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dprintf_bp(os->os_rootbp, "reading %s", ""); /* - * NB: when bprewrite scrub can change the bp, + * XXX when bprewrite scrub can change the bp, * and this is called from dmu_objset_open_ds_os, the bp * could change, and we'll need a lock. */ - err = arc_read_nolock(NULL, spa, os->os_rootbp, + err = dsl_read_nolock(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { @@ -628,6 +628,7 @@ struct oscarg { const char *lastname; dmu_objset_type_t type; uint64_t flags; + cred_t *cr; }; /*ARGSUSED*/ @@ -659,7 +660,7 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; @@ -668,7 +669,7 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_origin, oa->flags, cr, tx); + oa->clone_origin, oa->flags, oa->cr, tx); if (oa->clone_origin == NULL) { dsl_dataset_t *ds; @@ -684,12 +685,12 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(os, oa->userarg, cr, tx); + oa->userfunc(os, oa->userarg, oa->cr, tx); dsl_dataset_rele(ds, FTAG); } - spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, - tx, cr, "dataset = %llu", dsobj); + spa_history_log_internal(LOG_DS_CREATE, dd->dd_pool->dp_spa, + tx, "dataset = %llu", dsobj); } int @@ -715,6 +716,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, oa.lastname = tail; oa.type = type; oa.flags = flags; + oa.cr = CRED(); err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); @@ -742,6 +744,7 @@ dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) oa.lastname = tail; oa.clone_origin = clone_origin; oa.flags = flags; + oa.cr = CRED(); err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); @@ -795,19 +798,19 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) { objset_t *os = arg1; dsl_dataset_t *ds = os->os_dsl_dataset; struct snaparg *sn = arg2; - dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx); + dsl_dataset_snapshot_sync(ds, sn->snapname, tx); if (sn->props) { dsl_props_arg_t pa; pa.pa_props = sn->props; pa.pa_source = ZPROP_SRC_LOCAL; - dsl_props_set_sync(ds->ds_prev, &pa, cr, tx); + dsl_props_set_sync(ds->ds_prev, &pa, tx); } } @@ -1016,11 +1019,11 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) /* * Create the root block IO */ - arc_release(os->os_phys_buf, &os->os_phys_buf); - SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf, + os->os_rootbp, os->os_spa, &zb)); dmu_write_policy(os, NULL, 0, 0, &zp); @@ -1082,7 +1085,7 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg) !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); } -static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; +objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) @@ -1649,7 +1652,7 @@ dmu_objset_prefetch(const char *name, void *arg) SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), + (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds), &ds->ds_phys->ds_bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 86c428b5f2..a675e28b15 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -301,7 +301,7 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) /* ARGSUSED */ static int -backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, +backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct backuparg *ba = arg; @@ -330,7 +330,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; - if (arc_read_nolock(NULL, spa, bp, + if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); @@ -361,7 +361,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); - if (arc_read_nolock(NULL, spa, bp, + if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); @@ -504,6 +504,7 @@ struct recvbeginsyncarg { uint64_t dsflags; char clonelastname[MAXNAMELEN]; dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ + cred_t *cr; }; /* ARGSUSED */ @@ -536,7 +537,7 @@ recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -545,7 +546,7 @@ recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* Create and open new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, - rbsa->origin, flags, cr, tx); + rbsa->origin, flags, rbsa->cr, tx); VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, B_TRUE, dmu_recv_tag, &rbsa->ds)); @@ -554,8 +555,8 @@ recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); } - spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, - dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj); + spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, + dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); } /* ARGSUSED */ @@ -630,7 +631,7 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ohds = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -641,7 +642,7 @@ recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* create and open the temporary clone */ dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, - ohds->ds_prev, flags, cr, tx); + ohds->ds_prev, flags, rbsa->cr, tx); VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); /* @@ -655,8 +656,8 @@ recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) rbsa->ds = cds; - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - dp->dp_spa, tx, cr, "dataset = %lld", dsobj); + spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, "dataset = %lld", dsobj); } @@ -701,6 +702,7 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, rbsa.type = drrb->drr_type; rbsa.tag = FTAG; rbsa.dsflags = 0; + rbsa.cr = CRED(); versioninfo = drrb->drr_versioninfo; flags = drrb->drr_flags; @@ -1466,12 +1468,12 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvendsyncarg *resa = arg2; - dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index 653c3a2d41..429c76ae11 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -77,7 +76,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); + (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); return (0); } @@ -102,7 +101,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, + (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); } return (0); @@ -140,7 +139,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, boolean_t hard = td->td_flags & TRAVERSE_HARD; if (bp->blk_birth == 0) { - err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, + td->td_arg); return (err); } @@ -160,7 +160,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, } if (td->td_flags & TRAVERSE_PRE) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + td->td_arg); if (err) return (err); } @@ -171,7 +172,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - err = arc_read(NULL, td->td_spa, bp, pbuf, + err = dsl_read(NULL, td->td_spa, bp, pbuf, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) @@ -195,7 +196,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - err = arc_read(NULL, td->td_spa, bp, pbuf, + err = dsl_read(NULL, td->td_spa, bp, pbuf, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) @@ -217,7 +218,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, objset_phys_t *osp; dnode_phys_t *dnp; - err = arc_read_nolock(NULL, td->td_spa, bp, + err = dsl_read_nolock(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) @@ -252,8 +253,10 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, if (buf) (void) arc_buf_remove_ref(buf, &buf); - if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { + err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + td->td_arg); + } return (err != 0 ? err : lasterr); } @@ -275,16 +278,17 @@ traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, break; lasterr = err; } - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - SET_BOOKMARK(&czb, objset, - object, 0, DMU_SPILL_BLKID); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_spill, &czb); - if (err) { - if (!hard) - break; - lasterr = err; - } + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, + object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_spill, &czb); + if (err) { + if (!hard) + return (err); + lasterr = err; } } return (err != 0 ? err : lasterr); @@ -293,7 +297,8 @@ traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, + void *arg) { struct prefetch_data *pfd = arg; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; @@ -314,7 +319,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); - (void) arc_read_nolock(NULL, spa, bp, NULL, NULL, + (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 523aad70da..fa5747c7aa 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -227,7 +227,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - arc_release(db->db_buf, db); + dbuf_release_bp(db); bp = (blkptr_t *)db->db.db_data; epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index ee0ccd7d01..c645d4d785 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -38,6 +38,7 @@ #include <sys/spa.h> #include <sys/zfs_znode.h> #include <sys/zvol.h> +#include <sys/dsl_scan.h> static char *dsl_reaper = "the grim reaper"; @@ -80,7 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; - dprintf_bp(bp, "born, ds=%p\n", ds); + dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ @@ -100,6 +101,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); @@ -150,7 +152,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int64_t delta; - dprintf_bp(bp, "freeing: %s", ""); + dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); @@ -191,7 +193,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ds->ds_prev->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (bp->blk_birth > ds->ds_origin_txg) { + if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -397,19 +399,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev); } - - if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { - dsl_dataset_t *origin; - - err = dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, - FTAG, &origin); - if (err == 0) { - ds->ds_origin_txg = - origin->ds_phys->ds_creation_txg; - dsl_dataset_rele(origin, FTAG); - } - } } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); @@ -876,10 +865,6 @@ dsl_snapshot_destroy_one(const char *name, void *arg) struct dsl_ds_destroyarg *dsda; dsl_dataset_make_exclusive(ds, da->dstg); - if (ds->ds_objset != NULL) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); dsda->ds = ds; dsda->defer = da->defer; @@ -989,11 +974,6 @@ dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) return (error); dsda->rm_origin = origin; dsl_dataset_make_exclusive(origin, tag); - - if (origin->ds_objset != NULL) { - dmu_objset_evict(origin->ds_objset); - origin->ds_objset = NULL; - } } return (0); @@ -1020,10 +1000,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) /* Destroying a snapshot is simpler */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_objset != NULL) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, @@ -1096,24 +1072,10 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) if (err) goto out; - if (ds->ds_objset) { - /* - * We need to sync out all in-flight IO before we try - * to evict (the dataset evict func is trying to clear - * the cached entries for this dataset in the ARC). - */ - txg_wait_synced(dd->dd_pool, 0); - } - /* * Blow away the dsl_dir + head dataset. */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - /* * If we're removing a clone, we might also need to remove its * origin. @@ -1220,7 +1182,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; - ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); + ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; @@ -1234,21 +1196,11 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ds->ds_phys->ds_unique_bytes = ds->ds_phys->ds_used_bytes - (mrs_used - dlused); - if (!DS_UNIQUE_IS_ACCURATE(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) >= + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } -static uint64_t -dsl_dataset_unique(dsl_dataset_t *ds) -{ - if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) - dsl_dataset_recalc_head_uniq(ds); - - return (ds->ds_phys->ds_unique_bytes); -} - struct killarg { dsl_dataset_t *ds; dmu_tx_t *tx; @@ -1256,7 +1208,7 @@ struct killarg { /* ARGSUSED */ static int -kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; @@ -1315,7 +1267,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_pool_t *dp = ds->ds_dir->dd_pool; @@ -1324,8 +1276,8 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, + "dataset = %llu", ds->ds_object); } static int @@ -1499,7 +1451,7 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) } void -dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_destroyarg *dsda = arg1; dsl_dataset_t *ds = dsda->ds; @@ -1531,6 +1483,11 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) cv_broadcast(&ds->ds_exclusive_cv); mutex_exit(&ds->ds_lock); + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + /* Remove our reservation */ if (ds->ds_reserved != 0) { dsl_prop_setarg_t psa; @@ -1541,13 +1498,13 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) &value); psa.psa_effective_value = 0; /* predict default value */ - dsl_dataset_set_reservation_sync(ds, &psa, cr, tx); + dsl_dataset_set_reservation_sync(ds, &psa, tx); ASSERT3U(ds->ds_reserved, ==, 0); } ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - dsl_pool_ds_destroyed(ds, tx); + dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; @@ -1596,7 +1553,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) } } - if (ds->ds_phys->ds_next_snap_obj != 0) { + if (dsl_dataset_is_snapshot(ds)) { blkptr_t bp; zio_t *pio; dsl_dataset_t *ds_next; @@ -1608,7 +1565,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); - old_unique = dsl_dataset_unique(ds_next); + old_unique = ds_next->ds_phys->ds_unique_bytes; dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = @@ -1664,7 +1621,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) ds_next->ds_phys->ds_deadlist_obj)); ds->ds_phys->ds_deadlist_obj = 0; - if (ds_next->ds_phys->ds_next_snap_obj != 0) { + if (dsl_dataset_is_snapshot(ds_next)) { /* * Update next's unique to include blocks which * were previously shared by only this snapshot @@ -1790,8 +1747,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dsl_dataset_rele(ds_prev, FTAG); spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, + "dataset = %llu", ds->ds_object); if (ds->ds_phys->ds_next_clones_obj != 0) { uint64_t count; @@ -1816,7 +1773,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) struct dsl_ds_destroyarg ndsda = {0}; ndsda.ds = dsda->rm_origin; - dsl_dataset_destroy_sync(&ndsda, tag, cr, tx); + dsl_dataset_destroy_sync(&ndsda, tag, tx); } } @@ -1833,7 +1790,8 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) * owned by the snapshot dataset must be accommodated by space * outside of the reservation. */ - asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); + asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) return (ENOSPC); @@ -1847,7 +1805,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ int dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -1888,7 +1845,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *snapname = arg2; @@ -1959,9 +1916,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) * since our unique space is going to zero. */ if (ds->ds_reserved) { - int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + int64_t delta; + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, - add, 0, 0, tx); + delta, 0, 0, tx); } bplist_close(&ds->ds_deadlist); @@ -1987,11 +1946,11 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); - dsl_pool_ds_snapshotted(ds, tx); + dsl_scan_ds_snapshotted(ds, tx); dsl_dir_snap_cmtime_update(ds->ds_dir); - spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, "dataset = %llu", dsobj); } @@ -2035,7 +1994,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, - dsl_dataset_unique(ds)); + ds->ds_phys->ds_unique_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ds->ds_object); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, @@ -2163,8 +2122,7 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, - cred_t *cr, dmu_tx_t *tx) +dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *newsnapname = arg2; @@ -2188,8 +2146,8 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, ds->ds_snapname, 8, 1, &ds->ds_object, tx); ASSERT3U(err, ==, 0); - spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, + "dataset = %llu", ds->ds_object); dsl_dataset_rele(hds, FTAG); } @@ -2371,14 +2329,14 @@ struct promotenode { struct promotearg { list_t shared_snaps, origin_snaps, clone_snaps; - dsl_dataset_t *origin_origin, *origin_head; + dsl_dataset_t *origin_origin; uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); +static boolean_t snaplist_unstable(list_t *l); -/* ARGSUSED */ static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -2479,19 +2437,19 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * Note, typically this will not be a clone of a clone, - * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so + * so dd_origin_txg will be < TXG_INITIAL, so * these snaplist_space() -> bplist_space_birthrange() * calls will be fast because they do not have to * iterate over all bps. */ snap = list_head(&pa->origin_snaps); err = snaplist_space(&pa->shared_snaps, - snap->ds->ds_origin_txg, &pa->cloneusedsnap); + snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); if (err) return (err); err = snaplist_space(&pa->clone_snaps, - snap->ds->ds_origin_txg, &space); + snap->ds->ds_dir->dd_origin_txg, &space); if (err) return (err); pa->cloneusedsnap += space; @@ -2510,7 +2468,7 @@ out: } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; @@ -2554,10 +2512,11 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; - hds->ds_origin_txg = origin_head->ds_origin_txg; + dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); odd->dd_phys->dd_origin_obj = origin_ds->ds_object; - origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; + origin_head->ds_dir->dd_origin_txg = + origin_ds->ds_phys->ds_creation_txg; /* move snapshots to this dir */ for (snap = list_head(&pa->shared_snaps); snap; @@ -2614,8 +2573,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) origin_ds->ds_phys->ds_unique_bytes = pa->unique; /* log history record */ - spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, - cr, "dataset = %llu", hds->ds_object); + spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, + "dataset = %llu", hds->ds_object); dsl_dir_close(odd, FTAG); } @@ -2862,7 +2821,7 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) { struct cloneswaparg *csa = arg1; dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; @@ -2937,9 +2896,9 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) * changing that affects the snapused). */ VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); + csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used)); VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, - csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); + csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used)); dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -2975,7 +2934,7 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, csa->ohds->ds_phys->ds_deadlist_obj)); - dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx); + dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); } /* @@ -3110,24 +3069,24 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } -extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); +extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); void -dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; - dsl_prop_set_sync(ds, psa, cr, tx); + dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); if (ds->ds_quota != effective_value) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_quota = effective_value; - spa_history_internal_log(LOG_DS_REFQUOTA, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", + spa_history_log_internal(LOG_DS_REFQUOTA, + ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ", (longlong_t)ds->ds_quota, ds->ds_object); } } @@ -3188,7 +3147,9 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); mutex_enter(&ds->ds_lock); - unique = dsl_dataset_unique(ds); + if (!DS_UNIQUE_IS_ACCURATE(ds)) + dsl_dataset_recalc_head_uniq(ds); + unique = ds->ds_phys->ds_unique_bytes; mutex_exit(&ds->ds_lock); if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { @@ -3205,10 +3166,8 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void -dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, - dmu_tx_t *tx) +dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; @@ -3216,14 +3175,15 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, uint64_t unique; int64_t delta; - dsl_prop_set_sync(ds, psa, cr, tx); + dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); - unique = dsl_dataset_unique(ds); + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + unique = ds->ds_phys->ds_unique_bytes; delta = MAX(0, (int64_t)(effective_value - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); ds->ds_reserved = effective_value; @@ -3232,8 +3192,8 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); - spa_history_internal_log(LOG_DS_REFRESERV, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", + spa_history_log_internal(LOG_DS_REFRESERV, + ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu", (longlong_t)effective_value, ds->ds_object); } @@ -3311,7 +3271,7 @@ dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct dsl_ds_holdarg *ha = arg2; @@ -3343,8 +3303,8 @@ dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) htag, &now, tx)); } - spa_history_internal_log(LOG_DS_USER_HOLD, - dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag, + spa_history_log_internal(LOG_DS_USER_HOLD, + dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, (int)ha->temphold, ds->ds_object); } @@ -3495,10 +3455,6 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) */ if (!ra->own) return (EBUSY); - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } } dsda.ds = ds; dsda.releasing = B_TRUE; @@ -3509,7 +3465,7 @@ dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) } static void -dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_releasearg *ra = arg1; dsl_dataset_t *ds = ra->ds; @@ -3520,6 +3476,11 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) uint64_t refs; int error; + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + mutex_enter(&ds->ds_lock); ds->ds_userrefs--; refs = ds->ds_userrefs; @@ -3536,11 +3497,11 @@ dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dsda.ds = ds; dsda.releasing = B_TRUE; /* We already did the destroy_check */ - dsl_dataset_destroy_sync(&dsda, tag, cr, tx); + dsl_dataset_destroy_sync(&dsda, tag, tx); } - spa_history_internal_log(LOG_DS_USER_RELEASE, - dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu", + spa_history_log_internal(LOG_DS_USER_RELEASE, + dp->dp_spa, tx, "<%s> %lld dataset = %llu", ra->htag, (longlong_t)refs, dsobj); } diff --git a/usr/src/uts/common/fs/zfs/dsl_deleg.c b/usr/src/uts/common/fs/zfs/dsl_deleg.c index 04053fdf20..85490c8d5f 100644 --- a/usr/src/uts/common/fs/zfs/dsl_deleg.c +++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -148,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) } static void -dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; nvlist_t *nvp = arg2; @@ -183,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(zap_update(mos, jumpobj, perm, 8, 1, &n, tx) == 0); - spa_history_internal_log(LOG_DS_PERM_UPDATE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_UPDATE, + dd->dd_pool->dp_spa, tx, "%s %s dataset = %llu", whokey, perm, dd->dd_phys->dd_head_dataset_obj); } @@ -192,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } static void -dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; nvlist_t *nvp = arg2; @@ -215,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) (void) zap_remove(mos, zapobj, whokey, tx); VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } - spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE, + dd->dd_pool->dp_spa, tx, "%s dataset = %llu", whokey, dd->dd_phys->dd_head_dataset_obj); continue; @@ -236,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } - spa_history_internal_log(LOG_DS_PERM_REMOVE, - dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PERM_REMOVE, + dd->dd_pool->dp_spa, tx, "%s %s dataset = %llu", whokey, perm, dd->dd_phys->dd_head_dataset_obj); } diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 0dfb05da2d..ac86da6590 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/dmu.h> @@ -40,8 +39,7 @@ #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, - cred_t *cr, dmu_tx_t *tx); +static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); /* ARGSUSED */ @@ -64,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) spa_close(dd->dd_pool->dp_spa, dd); /* - * The props callback list should be empty since they hold the - * dir open. + * The props callback list should have been cleaned up by + * objset_evict(). */ list_destroy(&dd->dd_prop_cbs); mutex_destroy(&dd->dd_lock); @@ -136,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } + if (dsl_dir_is_clone(dd)) { + dmu_buf_t *origin_bonus; + dsl_dataset_phys_t *origin_phys; + + /* + * We can't open the origin dataset, because + * that would require opening this dsl_dir. + * Just look at its phys directly instead. + */ + err = dmu_bonus_hold(dp->dp_meta_objset, + dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); + if (err) + goto errout; + origin_phys = origin_bonus->db_data; + dd->dd_origin_txg = + origin_phys->ds_creation_txg; + dmu_buf_rele(origin_bonus, FTAG); + } + winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, dsl_dir_evict); if (winner) { @@ -458,7 +475,7 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; @@ -477,7 +494,7 @@ dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) &value); psa.psa_effective_value = 0; /* predict default value */ - dsl_dir_set_reservation_sync(ds, &psa, cr, tx); + dsl_dir_set_reservation_sync(ds, &psa, tx); ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); @@ -652,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd, if (used > quota) { /* over quota */ myspace = 0; - - /* - * While it's OK to be a little over quota, if - * we think we are using more space than there - * is in the pool (which is already 1.6% more than - * dsl_pool_adjustedsize()), something is very - * wrong. - */ - ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa)); } else { /* * the lesser of the space provided by our parent and @@ -1033,18 +1041,17 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) return (err); } -extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); +extern dsl_syncfunc_t dsl_prop_set_sync; -/* ARGSUSED */ static void -dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; - dsl_prop_set_sync(ds, psa, cr, tx); + dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(dd, psa); dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -1053,8 +1060,8 @@ dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dd->dd_phys->dd_quota = effective_value; mutex_exit(&dd->dd_lock); - spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu ", + spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa, + tx, "%lld dataset = %llu ", (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } @@ -1141,9 +1148,8 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void -dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_dir_t *dd = ds->ds_dir; @@ -1152,7 +1158,7 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) uint64_t used; int64_t delta; - dsl_prop_set_sync(ds, psa, cr, tx); + dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(dd, psa); dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -1170,8 +1176,8 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } mutex_exit(&dd->dd_lock); - spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu", + spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, + tx, "%lld dataset = %llu", (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } @@ -1240,7 +1246,6 @@ struct renamearg { const char *mynewname; }; -/*ARGSUSED*/ static int dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -1287,7 +1292,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct renamearg *ra = arg2; @@ -1336,8 +1341,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dd->dd_myname, 8, 1, &dd->dd_object, tx); ASSERT3U(err, ==, 0); - spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, - tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); + spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, + tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); } int diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 30a5611365..77aa4af4e3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -19,14 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/dsl_pool.h> #include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> #include <sys/dsl_dir.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_scan.h> +#include <sys/dnode.h> #include <sys/dmu_tx.h> #include <sys/dmu_objset.h> #include <sys/arc.h> @@ -50,7 +52,7 @@ kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; -static int +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; @@ -88,7 +90,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) offsetof(dsl_dataset_t, ds_synced_link)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 1, 4, 0); @@ -150,64 +151,7 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - /* get scrub status */ - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, - &dp->dp_scrub_func); - if (err == 0) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, - &dp->dp_scrub_queue_obj); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_min_txg); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_max_txg); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark); - if (err && err != ENOENT) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max); - if (err && err != ENOENT) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &spa->spa_scrub_errors); - if (err) - goto out; - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { - /* - * A new-type scrub was in progress on an old - * pool. Restart from the beginning, since the - * old software may have changed the pool in the - * meantime. - */ - dsl_pool_scrub_restart(dp); - } - } else { - /* - * It's OK if there is no scrub in progress (and if - * there was an I/O error, ignore it). - */ - err = 0; - } + err = dsl_scan_init(dp, txg); out: rw_exit(&dp->dp_config_rwlock); @@ -247,9 +191,9 @@ dsl_pool_close(dsl_pool_t *dp) arc_flush(dp->dp_spa); txg_fini(dp); + dsl_scan_fini(dp); rw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); - mutex_destroy(&dp->dp_scrub_cancel_lock); taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); @@ -275,6 +219,9 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); ASSERT3U(err, ==, 0); + /* Initialize scan structures */ + VERIFY3U(0, ==, dsl_scan_init(dp, txg)); + /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, @@ -318,6 +265,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) uint64_t data_written; int err; + /* + * We need to copy dp_space_towrite() before doing + * dsl_sync_task_group_sync(), because + * dsl_dataset_snapshot_reserve_space() will increase + * dp_space_towrite but not actually write anything. + */ + data_written = dp->dp_space_towrite[txg & TXG_MASK]; + tx = dmu_tx_create_assigned(dp, txg); dp->dp_read_overhead = 0; @@ -347,7 +302,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) /* * Sync the datasets again to push out the changes due to - * userquota updates. This must be done before we process the + * userspace updates. This must be done before we process the * sync tasks, because that could cause a snapshot of a dataset * whose ds_bp will be rewritten when we do this 2nd sync. */ @@ -383,13 +338,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dir_sync(dd, tx); write_time += gethrtime() - start; - if (spa_sync_pass(dp->dp_spa) == 1) { - dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - dsl_pool_scrub_sync(dp, tx); - (void) zio_wait(dp->dp_scrub_prefetch_zio_root); - } - start = gethrtime(); if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { @@ -407,7 +355,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dmu_tx_commit(tx); - data_written = dp->dp_space_towrite[txg & TXG_MASK]; dp->dp_space_towrite[txg & TXG_MASK] = 0; ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); @@ -679,7 +626,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, tx); VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); + dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c index f27305c953..cedd777687 100644 --- a/usr/src/uts/common/fs/zfs/dsl_prop.c +++ b/usr/src/uts/common/fs/zfs/dsl_prop.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -260,11 +259,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, cbr->cbr_func(cbr->cbr_arg, value); - VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, - NULL, cbr, &dd)); if (need_rwlock) rw_exit(&dp->dp_config_rwlock); - /* Leave dir open until this callback is unregistered */ return (0); } @@ -464,8 +460,6 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); - /* Clean up from dsl_prop_register */ - dsl_dir_close(dd, cbr); return (0); } @@ -552,7 +546,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, } void -dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; @@ -707,9 +701,9 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } } - spa_history_internal_log((source == ZPROP_SRC_NONE || + spa_history_log_internal((source == ZPROP_SRC_NONE || source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : - LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, + LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, "%s=%s dataset = %llu", propname, (valstr == NULL ? "" : valstr), ds->ds_object); @@ -718,7 +712,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } void -dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_props_arg_t *pa = arg2; @@ -756,13 +750,13 @@ dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) psa.psa_numints = 1; psa.psa_value = &intval; } - dsl_prop_set_sync(ds, &psa, cr, tx); + dsl_prop_set_sync(ds, &psa, tx); } } void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - cred_t *cr, dmu_tx_t *tx) + dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t zapobj = dd->dd_phys->dd_props_zapobj; @@ -773,7 +767,7 @@ dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); - spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr, + spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, "%s=%llu dataset = %llu", name, (u_longlong_t)val, dd->dd_phys->dd_head_dataset_obj); } diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c new file mode 100644 index 0000000000..f3b401d602 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -0,0 +1,1660 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/dsl_scan.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/dnode.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zil_impl.h> +#include <sys/zio_checksum.h> +#include <sys/ddt.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#ifdef _KERNEL +#include <sys/zfs_vfsops.h> +#endif + +typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); + +static scan_cb_t dsl_scan_defrag_cb; +static scan_cb_t dsl_scan_scrub_cb; +static scan_cb_t dsl_scan_remove_cb; +static dsl_syncfunc_t dsl_scan_cancel_sync; +static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); + +int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ +boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ + +#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ + (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) + +extern int zfs_txg_timeout; + +/* the order has to match pool_scan_type */ +static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { + NULL, + dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ + dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ +}; + +int +dsl_scan_init(dsl_pool_t *dp, uint64_t txg) +{ + int err; + dsl_scan_t *scn; + spa_t *spa = dp->dp_spa; + uint64_t f; + + scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); + scn->scn_dp = dp; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_func", sizeof (uint64_t), 1, &f); + if (err == 0) { + /* + * There was an old-style scrub in progress. Restart a + * new-style scrub from the beginning. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("old-style scrub was in progress; " + "restarting new-style scrub in txg %llu", + scn->scn_restart_txg); + + /* + * Load the queue obj from the old location so that it + * can be freed by dsl_scan_done(). + */ + (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_queue", sizeof (uint64_t), 1, + &scn->scn_phys.scn_queue_obj); + } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys); + if (err == ENOENT) + return (0); + else if (err) + return (err); + + if (scn->scn_phys.scn_state == DSS_SCANNING && + spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { + /* + * A new-type scrub was in progress on an old + * pool, and the pool was accessed by old + * software. Restart from the beginning, since + * the old software may have changed the pool in + * the meantime. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("new-style scrub was modified " + "by old software; restarting in txg %llu", + scn->scn_restart_txg); + } + } + + spa_scan_stat_init(spa); + return (0); +} + +void +dsl_scan_fini(dsl_pool_t *dp) +{ + if (dp->dp_scan) { + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); + dp->dp_scan = NULL; + } +} + +/* ARGSUSED */ +static int +dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + if (scn->scn_phys.scn_state == DSS_SCANNING) + return (EBUSY); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + pool_scan_func_t *funcp = arg2; + dmu_object_type_t ot = 0; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = *funcp; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_restart_txg = 0; + spa_scan_stat_init(spa); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; + + /* rewrite all disk labels */ + vdev_config_dirty(spa->spa_root_vdev); + + if (vdev_resilver_needed(spa->spa_root_vdev, + &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); + } else { + spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); + } + + spa->spa_scrub_started = B_TRUE; + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. + */ + if (scn->scn_phys.scn_min_txg > TXG_INITIAL) + scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + + } + + /* back to the generic stuff */ + + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, + ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); + + dsl_scan_sync_state(scn, tx); + + spa_history_log_internal(LOG_POOL_SCAN, spa, tx, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); +} + +/* ARGSUSED */ +static void +dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + static const char *old_names[] = { + "scrub_bookmark", + "scrub_ddt_bookmark", + "scrub_ddt_class_max", + "scrub_queue", + "scrub_min_txg", + "scrub_max_txg", + "scrub_func", + "scrub_errors", + NULL + }; + + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + int i; + + /* Remove any remnants of an old-style scrub. */ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } + + /* + * If we were "restarted" from a stopped state, don't bother + * with anything else. + */ + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (complete) + scn->scn_phys.scn_state = DSS_FINISHED; + else + scn->scn_phys.scn_state = DSS_CANCELED; + + spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx, + "complete=%u", complete); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + spa->spa_scrub_started = B_FALSE; + spa->spa_scrub_active = B_FALSE; + + /* + * If the scrub/resilver completed, update all DTLs to + * reflect this. Whether it succeeded or not, vacate + * all temporary scrub DTLs. + */ + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); + if (complete) { + spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + } + spa_errlog_rotate(spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + } + + scn->scn_phys.scn_end_time = gethrestime_sec(); +} + +/* ARGSUSED */ +static int +dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return (ENOENT); + return (0); +} + +/* ARGSUSED */ +static void +dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg1; + + dsl_scan_done(scn, B_FALSE, tx); + dsl_scan_sync_state(scn, tx); +} + +int +dsl_scan_cancel(dsl_pool_t *dp) +{ + boolean_t complete = B_FALSE; + int err; + + err = dsl_sync_task_do(dp, dsl_scan_cancel_check, + dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); + return (err); +} + +static void dsl_scan_visitbp(blkptr_t *bp, + const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, + dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, + dmu_tx_t *tx); +static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, + dmu_objset_type_t ostype, + dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx); + +void +dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) +{ + zio_free(dp->dp_spa, txg, bp); +} + +void +dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) +{ + ASSERT(dsl_pool_sync_context(dp)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); +} + +int +dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + return (arc_read(pio, spa, bpp, pbuf, done, private, + priority, zio_flags, arc_flags, zb)); +} + +int +dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + return (arc_read_nolock(pio, spa, bpp, done, private, + priority, zio_flags, arc_flags, zb)); +} + +static boolean_t +bookmark_is_zero(const zbookmark_t *zb) +{ + return (zb->zb_objset == 0 && zb->zb_object == 0 && + zb->zb_level == 0 && zb->zb_blkid == 0); +} + +/* dnp is the dnode for zb1->zb_object */ +static boolean_t +bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == DMU_DEADLIST_OBJECT) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == DMU_META_DNODE_OBJECT) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + +static uint64_t +dsl_scan_ds_maxtxg(dsl_dataset_t *ds) +{ + uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; + if (dsl_dataset_is_snapshot(ds)) + return (MIN(smt, ds->ds_phys->ds_creation_txg)); + return (smt); +} + +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); +} + +static boolean_t +dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) +{ + uint64_t elapsed_nanosecs; + int mintime; + + /* we never skip user/group accounting objects */ + if (zb && (int64_t)zb->zb_object < 0) + return (B_FALSE); + + if (scn->scn_pausing) + return (B_TRUE); /* we're already pausing */ + + if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb && zb->zb_level != 0) + return (B_FALSE); + + mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scan_min_time_ms; + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > mintime && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + scn->scn_phys.scn_bookmark = *zb; + } + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + scn->scn_pausing = B_TRUE; + return (B_TRUE); + } + return (B_FALSE); +} + +typedef struct zil_scan_arg { + dsl_pool_t *zsa_dp; + zil_header_t *zsa_zh; +} zil_scan_arg_t; + +/* ARGSUSED */ +static int +dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + zbookmark_t zb; + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * One block ("stubby") can be allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * scrub there's nothing to do to it). + */ + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + return (0); +} + +/* ARGSUSED */ +static int +dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + } + return (0); +} + +static void +dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_scan_arg_t zsa = { dp, zh }; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, + claim_txg); + + zil_free(zilog); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, + uint64_t objset, uint64_t object, uint64_t blkid) +{ + zbookmark_t czb; + uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + + if (zfs_no_scrub_prefetch) + return; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + return; + + SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + + /* + * XXX need to make sure all of these arc_read() prefetches are + * done before setting xlateall (similar to dsl_read()) + */ + (void) arc_read(scn->scn_prefetch_zio_root, scn->scn_dp->dp_spa, bp, + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &flags, &czb); +} + +static boolean_t +dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, + const zbookmark_t *zb) +{ + /* + * We never skip over user/group accounting objects (obj<0) + */ + if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && + (int64_t)zb->zb_object >= 0) { + /* + * If we already visited this bp & everything below (in + * a prior txg sync), don't bother doing it again. + */ + if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + return (B_TRUE); + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for pausing + * again. + */ + if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); + } + } + return (B_FALSE); +} + +/* + * Return nonzero on i/o error. + * Return new buf to write out in *bufp. + */ +static int +dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) +{ + dsl_pool_t *dp = scn->scn_dp; + int err; + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { + dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset, + zb->zb_object, zb->zb_blkid * epb + i); + } + for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + dsl_scan_visitbp(cbp, &czb, dnp, + *bufp, ds, scn, ostype, tx); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) { + uint32_t flags = ARC_WAIT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + dnode_phys_t *cdnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { + for (j = 0; j < cdnp->dn_nblkptr; j++) { + blkptr_t *cbp = &cdnp->dn_blkptr[j]; + dsl_scan_prefetch(scn, *bufp, cbp, + zb->zb_objset, zb->zb_blkid * epb + i, j); + } + } + for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { + dsl_scan_visitdnode(scn, ds, ostype, + cdnp, *bufp, zb->zb_blkid * epb + i, tx); + } + + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, bufp, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + + osp = (*bufp)->b_data; + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) + dsl_scan_zil(dp, &osp->os_zil_header); + + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx); + + if (OBJSET_BUF_HAS_USERUSED(*bufp)) { + /* + * We also always visit user/group accounting + * objects, and never skip them, even if we are + * pausing. This is necessary so that the space + * deltas from this txg get integrated. + */ + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_groupused_dnode, *bufp, + DMU_GROUPUSED_OBJECT, tx); + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_userused_dnode, *bufp, + DMU_USERUSED_OBJECT, tx); + } + } + + return (0); +} + +static void +dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, + dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t object, dmu_tx_t *tx) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + dnp->dn_nlevels - 1, j); + dsl_scan_visitbp(&dnp->dn_blkptr[j], + &czb, dnp, buf, ds, scn, ostype, tx); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zbookmark_t czb; + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + 0, DMU_SPILL_BLKID); + dsl_scan_visitbp(&dnp->dn_spill, + &czb, dnp, buf, ds, scn, ostype, tx); + } +} + +/* + * The arguments are in this order because mdb can only print the + * first 5; we want them to be useful. + */ +static void +dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, + dnode_phys_t *dnp, arc_buf_t *pbuf, + dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + arc_buf_t *buf = NULL; + blkptr_t bp_toread = *bp; + + /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ + + if (dsl_scan_check_pause(scn, zb)) + return; + + if (dsl_scan_check_resume(scn, dnp, zb)) + return; + + if (bp->blk_birth == 0) + return; + + scn->scn_visited_this_txg++; + + dprintf_bp(bp, + "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p", + ds, ds ? ds->ds_object : 0, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + pbuf, bp); + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return; + + if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { + /* + * For non-user-accounting blocks, we need to read the + * new bp (from a deleted snapshot, found in + * check_existing_xlation). If we used the old bp, + * pointers inside this block from before we resumed + * would be untranslated. + * + * For user-accounting blocks, we need to read the old + * bp, because we will apply the entire space delta to + * it (original untranslated -> translations from + * deleted snap -> now). + */ + bp_toread = *bp; + } + + if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, + &buf) != 0) + return; + + /* + * If dsl_scan_ddt() has aready visited this block, it will have + * already done any translations or scrubbing, so don't call the + * callback again. + */ + if (ddt_class_contains(dp->dp_spa, + scn->scn_phys.scn_ddt_class_max, bp)) { + ASSERT(buf == NULL); + return; + } + + /* + * If this block is from the future (after cur_max_txg), then we + * are doing this on behalf of a deleted snapshot, and we will + * revisit the future block on the next pass of this dataset. + * Don't scan it now unless we need to because something + * under it was modified. + */ + if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) { + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + } + if (buf) + (void) arc_buf_remove_ref(buf, &buf); +} + +static void +dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, + dmu_tx_t *tx) +{ + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + dsl_scan_visitbp(bp, &zb, NULL, NULL, + ds, scn, DMU_OST_NONE, tx); + + dprintf_ds(ds, "finished scan%s", ""); +} + +void +dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + if (dsl_dataset_is_snapshot(ds)) { + /* Note, scn_cur_{min,max}_txg stays the same. */ + scn->scn_phys.scn_bookmark.zb_objset = + ds->ds_phys->ds_next_snap_obj; + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; + } else { + SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset bookmark to -1,0,0,0", + (u_longlong_t)ds->ds_object); + } + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + if (dsl_dataset_is_snapshot(ds)) { + /* + * We keep the same mintxg; it could be > + * ds_creation_txg if the previous snapshot was + * deleted too. + */ + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("destroying ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_next_snap_obj); + } else { + zfs_dbgmsg("destroying ds %llu; in queue; removing", + (u_longlong_t)ds->ds_object); + } + } else { + zfs_dbgmsg("destroying ds %llu; ignoring", + (u_longlong_t)ds->ds_object); + } + + /* + * dsl_scan_sync() should be called after this, and should sync + * out our changed state, but just to be safe, do it here. + */ + dsl_scan_sync_state(scn, tx); +} + +void +dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = + ds->ds_phys->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("snapshotting ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); + } + dsl_scan_sync_state(scn, tx); +} + +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds1->ds_object, &mintxg) == 0) { + int err; + + ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); + err = zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); + VERIFY(err == 0 || err == EEXIST); + if (err == EEXIST) { + /* Both were there to begin with */ + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds1->ds_object, mintxg, tx)); + } + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { + ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + dsl_scan_sync_state(scn, tx); +} + +struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; +}; + +/* ARGSUSED */ +static int +enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + /* + * Iterate over the bps in this ds. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx); + + char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " + "pausing=%u", + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_cur_max_txg, + (int)scn->scn_pausing); + kmem_free(dsname, ZFS_MAXNAMELEN); + + if (scn->scn_pausing) + goto out; + + /* + * We've finished this pass over this dataset. + */ + + /* + * If we did not completely visit this dataset, do another pass. + */ + if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { + zfs_dbgmsg("incomplete pass; visiting again"); + scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + scn->scn_phys.scn_cur_max_txg, tx) == 0); + goto out; + } + + /* + * Add descendent datasets to work queue. + */ + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj, + ds->ds_phys->ds_creation_txg, tx) == 0); + } + if (ds->ds_phys->ds_num_children > 1) { + boolean_t usenext = B_FALSE; + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct. + */ + int err = zap_count(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, &count); + if (err == 0 && + count == ds->ds_phys->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + VERIFY(zap_join_key(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + scn->scn_phys.scn_queue_obj, + ds->ds_phys->ds_creation_txg, tx) == 0); + } else { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, + NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. + * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can so we must account for changes + * in a block's replication class. + * + * Here's an example of what can occur: + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed. + * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to + * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. + */ +static void +dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) +{ + ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; + ddt_entry_t dde = { 0 }; + int error; + uint64_t n = 0; + + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { + ddt_t *ddt; + + if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) + break; + dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + + /* There should be no pending changes to the dedup table */ + ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; + ASSERT(avl_first(&ddt->ddt_tree) == NULL); + + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); + n++; + + if (dsl_scan_check_pause(scn, NULL)) + break; + } + + zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", + (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_pausing); + + ASSERT(error == 0 || error == ENOENT); + ASSERT(error != ENOENT || + ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); +} + +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_t zb = { 0 }; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + +static void +dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + zap_cursor_t zc; + zap_attribute_t za; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_ddt(scn, tx); + if (scn->scn_pausing) + return; + } + + if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { + /* First do the MOS & ORIGIN */ + + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_visit_rootbp(scn, NULL, + &dp->dp_meta_rootbp, tx); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); + if (scn->scn_pausing) + return; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + dsl_scan_visitds(scn, + dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!scn->scn_pausing); + } else if (scn->scn_phys.scn_bookmark.zb_objset != + ZB_DESTROYED_OBJSET) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset may + * be -1, so we will skip this and find a new objset + * below. + */ + dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); + if (scn->scn_pausing) + return; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. + */ + bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + dsl_dataset_t *ds; + uint64_t dsobj; + + dsobj = strtonum(za.za_name, NULL); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, dsobj, tx)); + + /* Set up min/max txg */ + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (za.za_first_integer != 0) { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + za.za_first_integer); + } else { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + ds->ds_phys->ds_prev_snap_txg); + } + scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); + dsl_dataset_rele(ds, FTAG); + + dsl_scan_visitds(scn, dsobj, tx); + zap_cursor_fini(&zc); + if (scn->scn_pausing) + return; + } + zap_cursor_fini(&zc); +} + +void +dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + + /* + * Check for scn_restart_txg before checking spa_load_state, so + * that we can restart an old-style scan while the pool is being + * imported (see dsl_scan_init). + */ + if (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_done(scn, B_FALSE, tx); + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + func = POOL_SCAN_RESILVER; + zfs_dbgmsg("restarting scan func=%u txg=%llu", + func, tx->tx_txg); + dsl_scan_setup_sync(scn, &func, tx); + } + + if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE || + spa_shutting_down(spa) || + spa_sync_pass(dp->dp_spa) > 1 || + scn->scn_phys.scn_state != DSS_SCANNING) + return; + + scn->scn_visited_this_txg = 0; + scn->scn_pausing = B_FALSE; + scn->scn_sync_start_time = gethrtime(); + spa->spa_scrub_active = B_TRUE; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); + } else { + zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, + (longlong_t)scn->scn_phys.scn_bookmark.zb_object, + (longlong_t)scn->scn_phys.scn_bookmark.zb_level, + (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); + } + + scn->scn_prefetch_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_scan_visit(scn, tx); + (void) zio_wait(scn->scn_prefetch_zio_root); + scn->scn_prefetch_zio_root = NULL; + + zfs_dbgmsg("visited %llu blocks in %llums", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); + + if (!scn->scn_pausing) { + /* finished with scan. */ + zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg); + dsl_scan_done(scn, B_TRUE, tx); + } + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + } + + dsl_scan_sync_state(scn, tx); +} + +/* + * This will start a new scan, or restart an existing one. + */ +void +dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +{ + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", txg); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +/* + * scrub consumers + */ + +static void +count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ + int i; + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } +} + +static void +dsl_scan_scrub_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + } + mutex_exit(&spa->spa_scrub_lock); +} + +static int +dsl_scan_scrub_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_t *zb) +{ + dsl_scan_t *scn = dp->dp_scan; + size_t size = BP_GET_PSIZE(bp); + spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + int zio_priority; + + if (phys_birth <= scn->scn_phys.scn_min_txg || + phys_birth >= scn->scn_phys.scn_max_txg) + return (0); + + count_block(dp->dp_blkstats, bp); + + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); + if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { + zio_flags |= ZIO_FLAG_SCRUB; + zio_priority = ZIO_PRIORITY_SCRUB; + needs_io = B_TRUE; + } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + zio_flags |= ZIO_FLAG_RESILVER; + zio_priority = ZIO_PRIORITY_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. + */ + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + phys_birth, 1); + } + } + } + + if (needs_io && !zfs_no_scrub_io) { + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_scan_scrub_done, NULL, zio_priority, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); +} + +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + return (dsl_sync_task_do(dp, dsl_scan_setup_check, + dsl_scan_setup_sync, dp->dp_scan, &func, 0)); +} diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c deleted file mode 100644 index b16ff66586..0000000000 --- a/usr/src/uts/common/fs/zfs/dsl_scrub.c +++ /dev/null @@ -1,1214 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/dsl_pool.h> -#include <sys/dsl_dataset.h> -#include <sys/dsl_prop.h> -#include <sys/dsl_dir.h> -#include <sys/dsl_synctask.h> -#include <sys/dnode.h> -#include <sys/dmu_tx.h> -#include <sys/dmu_objset.h> -#include <sys/arc.h> -#include <sys/zap.h> -#include <sys/zio.h> -#include <sys/zfs_context.h> -#include <sys/fs/zfs.h> -#include <sys/zfs_znode.h> -#include <sys/spa_impl.h> -#include <sys/vdev_impl.h> -#include <sys/zil_impl.h> -#include <sys/zio_checksum.h> -#include <sys/ddt.h> -#include <sys/sa.h> -#include <sys/sa_impl.h> - -typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); - -static scrub_cb_t dsl_pool_scrub_clean_cb; -static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; -static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, - uint64_t objset, uint64_t object); - -int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ -int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ -boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ -enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; - -extern int zfs_txg_timeout; - -static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { - NULL, - dsl_pool_scrub_clean_cb -}; - -/* ARGSUSED */ -static void -dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = arg1; - enum scrub_func *funcp = arg2; - dmu_object_type_t ot = 0; - boolean_t complete = B_FALSE; - - dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); - - ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); - ASSERT(*funcp > SCRUB_FUNC_NONE); - ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); - - dp->dp_scrub_min_txg = 0; - dp->dp_scrub_max_txg = tx->tx_txg; - dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; - - if (*funcp == SCRUB_FUNC_CLEAN) { - vdev_t *rvd = dp->dp_spa->spa_root_vdev; - - /* rewrite all disk labels */ - vdev_config_dirty(rvd); - - if (vdev_resilver_needed(rvd, - &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { - spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_RESILVER_START); - dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, - tx->tx_txg); - } else { - spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_SCRUB_START); - } - - /* zero out the scrub stats in all vdev_stat_t's */ - vdev_scrub_stat_update(rvd, - dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : - POOL_SCRUB_EVERYTHING, B_FALSE); - - /* - * If this is an incremental scrub, limit the DDT scrub phase - * to just the auto-ditto class (for correctness); the rest - * of the scrub should go faster using top-down pruning. - */ - if (dp->dp_scrub_min_txg > TXG_INITIAL) - dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; - - dp->dp_spa->spa_scrub_started = B_TRUE; - } - - /* back to the generic stuff */ - - if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); - } - bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); - - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) - ot = DMU_OT_ZAP_OTHER; - - dp->dp_scrub_func = *funcp; - dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, - ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); - dp->dp_scrub_restart = B_FALSE; - dp->dp_spa->spa_scrub_errors = 0; - - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, - &dp->dp_scrub_func, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, - &dp->dp_scrub_queue_obj, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_min_txg, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_max_txg, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &dp->dp_spa->spa_scrub_errors, tx)); - - spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, - "func=%u mintxg=%llu maxtxg=%llu", - *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); -} - -int -dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) -{ - return (dsl_sync_task_do(dp, NULL, - dsl_pool_scrub_setup_sync, dp, &func, 0)); -} - -/* ARGSUSED */ -static void -dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = arg1; - boolean_t *completep = arg2; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - mutex_enter(&dp->dp_scrub_cancel_lock); - - if (dp->dp_scrub_restart) { - dp->dp_scrub_restart = B_FALSE; - *completep = B_FALSE; - } - - /* XXX this is scrub-clean specific */ - mutex_enter(&dp->dp_spa->spa_scrub_lock); - while (dp->dp_spa->spa_scrub_inflight > 0) { - cv_wait(&dp->dp_spa->spa_scrub_io_cv, - &dp->dp_spa->spa_scrub_lock); - } - mutex_exit(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_active = B_FALSE; - - dp->dp_scrub_func = SCRUB_FUNC_NONE; - VERIFY(0 == dmu_object_free(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, tx)); - dp->dp_scrub_queue_obj = 0; - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); - - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, tx)); - - (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, tx); - (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); - - spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, - "complete=%u", *completep); - - /* below is scrub-clean specific */ - vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, - *completep); - /* - * If the scrub/resilver completed, update all DTLs to reflect this. - * Whether it succeeded or not, vacate all temporary scrub DTLs. - */ - vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, - *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); - dp->dp_spa->spa_scrub_started = B_FALSE; - if (*completep) - spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); - spa_errlog_rotate(dp->dp_spa); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); - - dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; - mutex_exit(&dp->dp_scrub_cancel_lock); -} - -int -dsl_pool_scrub_cancel(dsl_pool_t *dp) -{ - boolean_t complete = B_FALSE; - - return (dsl_sync_task_do(dp, NULL, - dsl_pool_scrub_cancel_sync, dp, &complete, 3)); -} - -void -dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) -{ - /* - * This function will be used by bp-rewrite wad to intercept frees. - */ - zio_free(dp->dp_spa, txg, bpp); -} - -void -dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) -{ - ASSERT(dsl_pool_sync_context(dp)); - zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); -} - -static boolean_t -bookmark_is_zero(const zbookmark_t *zb) -{ - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); -} - -/* dnp is the dnode for zb1->zb_object */ -static boolean_t -bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); -} - -static boolean_t -scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) -{ - uint64_t elapsed_nanosecs; - int mintime; - - if (dp->dp_scrub_pausing) - return (B_TRUE); /* we're already pausing */ - - if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) - return (B_FALSE); /* we're resuming */ - - /* We only know how to resume from level-0 blocks. */ - if (zb != NULL && zb->zb_level != 0) - return (B_FALSE); - - mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time_ms : - zfs_scrub_min_time_ms; - elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time; - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) { - if (zb) { - dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - dp->dp_scrub_bookmark = *zb; - } - if (ddb) { - dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)ddb->ddb_class, - (longlong_t)ddb->ddb_type, - (longlong_t)ddb->ddb_checksum, - (longlong_t)ddb->ddb_cursor); - ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); - } - dp->dp_scrub_pausing = B_TRUE; - return (B_TRUE); - } - return (B_FALSE); -} - -typedef struct zil_traverse_arg { - dsl_pool_t *zta_dp; - zil_header_t *zta_zh; -} zil_traverse_arg_t; - -/* ARGSUSED */ -static int -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zil_traverse_arg_t *zta = arg; - dsl_pool_t *dp = zta->zta_dp; - zil_header_t *zh = zta->zta_zh; - zbookmark_t zb; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return (0); - - /* - * One block ("stubby") can be allocated a long time ago; we - * want to visit that one because it has been allocated - * (on-disk) even if it hasn't been claimed (even though for - * plain scrub there's nothing to do to it). - */ - if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); - return (0); -} - -/* ARGSUSED */ -static int -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - if (lrc->lrc_txtype == TX_WRITE) { - zil_traverse_arg_t *zta = arg; - dsl_pool_t *dp = zta->zta_dp; - zil_header_t *zh = zta->zta_zh; - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_t zb; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return (0); - - /* - * birth can be < claim_txg if this record's txg is - * already txg sync'ed (but this log block contains - * other records that are not synced) - */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - lr->lr_foid, ZB_ZIL_LEVEL, - lr->lr_offset / BP_GET_LSIZE(bp)); - - VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); - } - return (0); -} - -static void -traverse_zil(dsl_pool_t *dp, zil_header_t *zh) -{ - uint64_t claim_txg = zh->zh_claim_txg; - zil_traverse_arg_t zta = { dp, zh }; - zilog_t *zilog; - - /* - * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). - */ - if (claim_txg == 0 && spa_writeable(dp->dp_spa)) - return; - - zilog = zil_alloc(dp->dp_meta_objset, zh); - - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, - claim_txg); - - zil_free(zilog); -} - -static void -scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, - uint64_t object, uint64_t blkid) -{ - zbookmark_t czb; - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; - - if (zfs_no_scrub_prefetch) - return; - - if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) - return; - - SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); - - (void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp, - buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &flags, &czb); -} - -static void -scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, - arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) -{ - int err; - arc_buf_t *buf = NULL; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; - - if (scrub_pause(dp, zb, NULL)) - return; - - if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { - /* - * If we already visited this bp & everything below (in - * a prior txg), don't bother doing it again. - */ - if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) - return; - - /* - * If we found the block we're trying to resume from, or - * we went past it to a different object, zero it out to - * indicate that it's OK to start checking for pausing - * again. - */ - if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || - zb->zb_object > dp->dp_scrub_bookmark.zb_object) { - dprintf("resuming at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); - } - } - - /* - * If dsl_pool_scrub_ddt() has aready scrubbed this block, - * don't scrub it again. - */ - if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) - (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); - - if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - - err = arc_read(NULL, dp->dp_spa, bp, pbuf, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - scrub_prefetch(dp, buf, cbp, zb->zb_objset, - zb->zb_object, zb->zb_blkid * epb + i); - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - scrub_visitbp(dp, dnp, buf, cbp, &czb); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; - dnode_phys_t *cdnp; - int i, j; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - - err = arc_read(NULL, dp->dp_spa, bp, pbuf, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { - for (j = 0; j < cdnp->dn_nblkptr; j++) { - blkptr_t *cbp = &cdnp->dn_blkptr[j]; - scrub_prefetch(dp, buf, cbp, zb->zb_objset, - zb->zb_blkid * epb + i, j); - } - } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { - scrub_visitdnode(dp, cdnp, buf, zb->zb_objset, - zb->zb_blkid * epb + i); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; - objset_phys_t *osp; - - err = arc_read_nolock(NULL, dp->dp_spa, bp, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - - osp = buf->b_data; - - traverse_zil(dp, &osp->os_zil_header); - - scrub_visitdnode(dp, &osp->os_meta_dnode, - buf, zb->zb_objset, DMU_META_DNODE_OBJECT); - if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - scrub_visitdnode(dp, &osp->os_userused_dnode, - buf, zb->zb_objset, DMU_USERUSED_OBJECT); - scrub_visitdnode(dp, &osp->os_groupused_dnode, - buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); - } - } - - if (buf) - (void) arc_buf_remove_ref(buf, &buf); -} - -static void -scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, - uint64_t objset, uint64_t object) -{ - int j; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - zbookmark_t czb; - SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - scrub_visitbp(dp, dnp, buf, &dnp->dn_spill, &czb); - } - } -} - -static void -scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) -{ - zbookmark_t zb; - - SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - scrub_visitbp(dp, NULL, NULL, bp, &zb); -} - -void -dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - SET_BOOKMARK(&dp->dp_scrub_bookmark, - ZB_DESTROYED_OBJSET, 0, 0, 0); - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) != 0) { - return; - } - - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_next_snap_obj, tx) == 0); - } - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); -} - -void -dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); - - if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - dp->dp_scrub_bookmark.zb_objset = - ds->ds_phys->ds_prev_snap_obj; - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) == 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_prev_snap_obj, tx) == 0); - } -} - -void -dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { - dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; - } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { - dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; - } - - if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds1->ds_object, tx) == 0) { - int err = zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds2->ds_object, tx); - VERIFY(err == 0 || err == EEXIST); - if (err == EEXIST) { - /* Both were there to begin with */ - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds1->ds_object, tx)); - } - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds2->ds_object, tx) == 0) { - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds1->ds_object, tx)); - } -} - -struct enqueue_clones_arg { - dmu_tx_t *tx; - uint64_t originobj; -}; - -/* ARGSUSED */ -static int -enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - struct enqueue_clones_arg *eca = arg; - dsl_dataset_t *ds; - int err; - dsl_pool_t *dp; - - err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); - if (err) - return (err); - dp = ds->ds_dir->dd_pool; - - if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { - while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - ds = prev; - } - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, eca->tx) == 0); - } - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) -{ - dsl_dataset_t *ds; - uint64_t min_txg_save; - - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - - /* - * Iterate over the bps in this ds. - */ - min_txg_save = dp->dp_scrub_min_txg; - dp->dp_scrub_min_txg = - MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); - scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); - dp->dp_scrub_min_txg = min_txg_save; - - if (dp->dp_scrub_pausing) - goto out; - - /* - * Add descendent datasets to work queue. - */ - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_next_snap_obj, tx) == 0); - } - if (ds->ds_phys->ds_num_children > 1) { - boolean_t usenext = B_FALSE; - if (ds->ds_phys->ds_next_clones_obj != 0) { - uint64_t count; - /* - * A bug in a previous version of the code could - * cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a - * missing entry. Therefore we can only use the - * next_clones_obj when its count is correct. - */ - int err = zap_count(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, &count); - if (err == 0 && - count == ds->ds_phys->ds_num_children - 1) - usenext = B_TRUE; - } - - if (usenext) { - VERIFY(zap_join(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - dp->dp_scrub_queue_obj, tx) == 0); - } else { - struct enqueue_clones_arg eca; - eca.tx = tx; - eca.originobj = ds->ds_object; - - (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, - NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); - } - } - -out: - dsl_dataset_rele(ds, FTAG); -} - -/* ARGSUSED */ -static int -enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - dmu_tx_t *tx = arg; - dsl_dataset_t *ds; - int err; - dsl_pool_t *dp; - - err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); - if (err) - return (err); - - dp = ds->ds_dir->dd_pool; - - while (ds->ds_phys->ds_prev_snap_obj != 0) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - FTAG, &prev); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - /* - * If this is a clone, we don't need to worry about it for now. - */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_rele(ds, FTAG); - dsl_dataset_rele(prev, FTAG); - return (0); - } - dsl_dataset_rele(ds, FTAG); - ds = prev; - } - - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) == 0); - dsl_dataset_rele(ds, FTAG); - return (0); -} - -/* - * Scrub/dedup interaction. - * - * If there are N references to a deduped block, we don't want to scrub it - * N times -- ideally, we should scrub it exactly once. - * - * We leverage the fact that the dde's replication class (enum ddt_class) - * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest - * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. - * - * To prevent excess scrubbing, the scrub begins by walking the DDT - * to find all blocks with refcnt > 1, and scrubs each of these once. - * Since there are two replication classes which contain blocks with - * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. - * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. - * - * There would be nothing more to say if a block's refcnt couldn't change - * during a scrub, but of course it can so we must account for changes - * in a block's replication class. - * - * Here's an example of what can occur: - * - * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 - * when visited during the top-down scrub phase, it will be scrubbed twice. - * This negates our scrub optimization, but is otherwise harmless. - * - * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 - * on each visit during the top-down scrub phase, it will never be scrubbed. - * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's - * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to - * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 - * while a scrub is in progress, it scrubs the block right then. - */ -static void -dsl_pool_scrub_ddt(dsl_pool_t *dp) -{ - ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; - ddt_entry_t dde; - int error; - - while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { - if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) - return; - dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); - if (scrub_pause(dp, NULL, ddb)) - return; - } - ASSERT(error == ENOENT); - ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); -} - -void -dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, - const ddt_entry_t *dde) -{ - const ddt_key_t *ddk = &dde->dde_key; - const ddt_phys_t *ddp = dde->dde_phys; - blkptr_t blk; - zbookmark_t zb = { 0 }; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(checksum, ddk, ddp, &blk); - scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); - } -} - -void -dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) -{ - spa_t *spa = dp->dp_spa; - zap_cursor_t zc; - zap_attribute_t za; - boolean_t complete = B_TRUE; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - /* - * If the pool is not loaded, or is trying to unload, leave it alone. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa)) - return; - - if (dp->dp_scrub_restart) { - enum scrub_func func = dp->dp_scrub_func; - dp->dp_scrub_restart = B_FALSE; - dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); - } - - if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { - /* - * We must have resumed after rebooting; reset the vdev - * stats to know that we're doing a scrub (although it - * will think we're just starting now). - */ - vdev_scrub_stat_update(spa->spa_root_vdev, - dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : - POOL_SCRUB_EVERYTHING, B_FALSE); - } - - dp->dp_scrub_pausing = B_FALSE; - dp->dp_scrub_start_time = gethrtime(); - dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); - spa->spa_scrub_active = B_TRUE; - - if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { - dsl_pool_scrub_ddt(dp); - if (dp->dp_scrub_pausing) - goto out; - } - - if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { - /* First do the MOS & ORIGIN */ - scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); - if (dp->dp_scrub_pausing) - goto out; - - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(spa, - NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); - } else { - scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); - } - ASSERT(!dp->dp_scrub_pausing); - } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { - /* - * If we were paused, continue from here. Note if the ds - * we were paused on was destroyed, the zb_objset will be - * ZB_DESTROYED_OBJSET, so we will skip this and find a new - * objset below. - */ - scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); - if (dp->dp_scrub_pausing) - goto out; - } - - /* - * In case we were paused right at the end of the ds, zero the - * bookmark so we don't think that we're still trying to resume. - */ - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - - /* keep pulling things out of the zap-object-as-queue */ - while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), - zap_cursor_retrieve(&zc, &za) == 0) { - VERIFY(0 == zap_remove(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, za.za_name, tx)); - scrub_visitds(dp, za.za_first_integer, tx); - if (dp->dp_scrub_pausing) - break; - zap_cursor_fini(&zc); - } - zap_cursor_fini(&zc); - if (dp->dp_scrub_pausing) - goto out; - - /* done. */ - - dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); - return; -out: - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &spa->spa_scrub_errors, tx)); -} - -void -dsl_pool_scrub_restart(dsl_pool_t *dp) -{ - mutex_enter(&dp->dp_scrub_cancel_lock); - dp->dp_scrub_restart = B_TRUE; - mutex_exit(&dp->dp_scrub_cancel_lock); -} - -/* - * scrub consumers - */ - -static void -count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) -{ - int i; - - /* - * If we resume after a reboot, zab will be NULL; don't record - * incomplete stats in that case. - */ - if (zab == NULL) - return; - - for (i = 0; i < 4; i++) { - int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; - int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; - zfs_blkstat_t *zb = &zab->zab_type[l][t]; - int equal; - - zb->zb_count++; - zb->zb_asize += BP_GET_ASIZE(bp); - zb->zb_lsize += BP_GET_LSIZE(bp); - zb->zb_psize += BP_GET_PSIZE(bp); - zb->zb_gangs += BP_COUNT_GANG(bp); - - switch (BP_GET_NDVAS(bp)) { - case 2: - if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) - zb->zb_ditto_2_of_2_samevdev++; - break; - case 3: - equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + - (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[2])) + - (DVA_GET_VDEV(&bp->blk_dva[1]) == - DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal == 1) - zb->zb_ditto_2_of_3_samevdev++; - else if (equal == 3) - zb->zb_ditto_3_of_3_samevdev++; - break; - } - } -} - -static void -dsl_pool_scrub_clean_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - zio_data_buf_free(zio->io_data, zio->io_size); - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; - cv_broadcast(&spa->spa_scrub_io_cv); - - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) - spa->spa_scrub_errors++; - mutex_exit(&spa->spa_scrub_lock); -} - -static int -dsl_pool_scrub_clean_cb(dsl_pool_t *dp, - const blkptr_t *bp, const zbookmark_t *zb) -{ - size_t size = BP_GET_PSIZE(bp); - spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); - boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int zio_priority; - - if (phys_birth <= dp->dp_scrub_min_txg || - phys_birth >= dp->dp_scrub_max_txg) - return (0); - - count_block(dp->dp_blkstats, bp); - - if (dp->dp_scrub_isresilver == 0) { - /* It's a scrub */ - zio_flags |= ZIO_FLAG_SCRUB; - zio_priority = ZIO_PRIORITY_SCRUB; - needs_io = B_TRUE; - } else { - /* It's a resilver */ - zio_flags |= ZIO_FLAG_RESILVER; - zio_priority = ZIO_PRIORITY_RESILVER; - needs_io = B_FALSE; - } - - /* If it's an intent log block, failure is expected. */ - if (zb->zb_level == ZB_ZIL_LEVEL) - zio_flags |= ZIO_FLAG_SPECULATIVE; - - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. - */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_examined += - DVA_GET_ASIZE(&bp->blk_dva[d]); - mutex_exit(&vd->vdev_stat_lock); - - /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. - */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - phys_birth, 1); - } - } - } - - if (needs_io && !zfs_no_scrub_io) { - void *data = zio_data_buf_alloc(size); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; - mutex_exit(&spa->spa_scrub_lock); - - zio_nowait(zio_read(NULL, spa, bp, data, size, - dsl_pool_scrub_clean_done, NULL, zio_priority, - zio_flags, zb)); - } - - /* do not relocate this block */ - return (0); -} - -int -dsl_pool_scrub_clean(dsl_pool_t *dp) -{ - spa_t *spa = dp->dp_spa; - - /* - * Purge all vdev caches and probe all devices. We do this here - * rather than in sync context because this requires a writer lock - * on the spa_config lock, which we can't do from sync context. The - * spa_scrub_reopen flag indicates that vdev_open() should not - * attempt to start another scrub. - */ - spa_vdev_state_enter(spa, SCL_NONE); - spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - (void) spa_vdev_state_exit(spa, NULL, 0); - - return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); -} diff --git a/usr/src/uts/common/fs/zfs/dsl_synctask.c b/usr/src/uts/common/fs/zfs/dsl_synctask.c index 81c6334cc8..832685b0fc 100644 --- a/usr/src/uts/common/fs/zfs/dsl_synctask.c +++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/dmu.h> @@ -29,7 +28,6 @@ #include <sys/dsl_dir.h> #include <sys/dsl_synctask.h> #include <sys/metaslab.h> -#include <sys/cred.h> #define DST_AVG_BLKSHIFT 14 @@ -49,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp) list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), offsetof(dsl_sync_task_t, dst_node)); dstg->dstg_pool = dp; - dstg->dstg_cr = CRED(); return (dstg); } @@ -136,7 +133,6 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) uint64_t txg; dstg->dstg_nowaiter = B_TRUE; - dstg->dstg_cr = NULL; /* it won't be valid by the time we sync */ txg = dmu_tx_get_txg(tx); /* * We don't generally have many sync tasks, so pay the price of @@ -200,8 +196,7 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) */ for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, - dstg->dstg_cr, tx); + dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); } } rw_exit(&dp->dp_config_rwlock); diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c index f1b3b23fe2..8358b4ceeb 100644 --- a/usr/src/uts/common/fs/zfs/refcount.c +++ b/usr/src/uts/common/fs/zfs/refcount.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/refcount.h> diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index dd0fbccc7e..510472a515 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -59,6 +59,7 @@ #include <sys/systeminfo.h> #include <sys/spa_boot.h> #include <sys/zfs_ioctl.h> +#include <sys/dsl_scan.h> #ifdef _KERNEL #include <sys/bootprops.h> @@ -110,7 +111,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; -static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); +static dsl_syncfunc_t spa_sync_props; static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, @@ -1105,7 +1106,7 @@ spa_load_spares(spa_t *spa) KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, - spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); + spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) @@ -1231,7 +1232,7 @@ spa_load_l2cache(spa_t *spa) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, - sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); + sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: @@ -1429,7 +1430,7 @@ spa_load_verify_done(zio_t *zio) /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { if (bp != NULL) { zio_t *rio = arg; @@ -1780,6 +1781,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) @@ -1851,6 +1853,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, + &spa->spa_creation_version); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* * Load the persistent error log. If we have an older pool, this will * not be present. @@ -2076,7 +2083,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Check all DTLs to see if anything needs resilvering. */ - if (vdev_resilver_needed(rvd, NULL, NULL)) + if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(rvd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* @@ -2375,7 +2383,7 @@ spa_add_spares(spa_t *spa, nvlist_t *config) if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( - spares[i], ZPOOL_CONFIG_STATS, + spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; @@ -2432,7 +2440,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], - ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); vdev_get_stats(vd, vs); } } @@ -2819,6 +2828,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, + sizeof (uint64_t), 1, &version, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool version"); + } + /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; @@ -2861,7 +2876,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(spa, props, CRED(), tx); + spa_sync_props(spa, props, tx); } dmu_tx_commit(tx); @@ -3219,8 +3234,6 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) return (error); } - spa_async_resume(spa); - /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. @@ -3271,6 +3284,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } + spa_async_resume(spa); + /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. @@ -3455,7 +3470,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; - spa->spa_final_txg = spa_last_synced_txg(spa) + 1; + spa->spa_final_txg = spa_last_synced_txg(spa) + + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } @@ -3635,7 +3651,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { - uint64_t txg, open_txg; + uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; @@ -3771,13 +3787,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_config_dirty(tvd); /* - * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate - * upward when spa_vdev_exit() calls vdev_dtl_reassess(). + * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account + * for any dmu_sync-ed blocks. It will propagate upward when + * spa_vdev_exit() calls vdev_dtl_reassess(). */ - open_txg = txg + TXG_CONCURRENT_STATES - 1; + dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, open_txg - TXG_INITIAL + 1); + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -3793,10 +3810,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_dirty(tvd, VDD_DTL, newvd, txg); - (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + /* + * Restart the resilver + */ + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + + /* + * Commit the config + */ + (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); - spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, - CRED(), "%s vdev=%s %s vdev=%s", + spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, + "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); @@ -3804,11 +3829,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) spa_strfree(oldvdpath); spa_strfree(newvdpath); - /* - * Kick off a resilver to update newvd. - */ - VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); - return (0); } @@ -4004,7 +4024,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) error = spa_vdev_exit(spa, vd, txg, 0); - spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), + spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, "vdev=%s", vdpath); spa_strfree(vdpath); @@ -4024,7 +4044,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + (void) spa_vdev_remove(spa, unspare_guid, + B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } @@ -4266,8 +4287,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, if (vml[c] != NULL) { vdev_split(vml[c]); if (error == 0) - spa_history_internal_log(LOG_POOL_VDEV_DETACH, - spa, tx, CRED(), "vdev=%s", + spa_history_log_internal(LOG_POOL_VDEV_DETACH, + spa, tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } @@ -4283,7 +4304,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ - spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), + spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, "split new pool %s from pool %s", newname, spa_name(spa)); kmem_free(vml, children * sizeof (vdev_t *)); @@ -4359,21 +4380,13 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, } /* - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. - */ - -/* * Evacuate the device. */ -int +static int spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) { - int error = 0; uint64_t txg; + int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); @@ -4386,14 +4399,12 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) * should no longer have any blocks allocated on it. */ if (vd->vdev_islog) { - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN); + if (vd->vdev_stat.vs_alloc != 0) + error = spa_offline_log(spa); } else { - error = ENOTSUP; /* until we have bp rewrite */ + error = ENOTSUP; } - txg_wait_synced(spa_get_dsl(spa), 0); - if (error) return (error); @@ -4401,6 +4412,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ + ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); txg = spa_vdev_config_enter(spa); vd->vdev_removing = B_TRUE; vdev_dirty(vd, 0, NULL, txg); @@ -4413,7 +4425,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) /* * Complete the removal by cleaning up the namespace. */ -void +static void spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; @@ -4424,6 +4436,12 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vd == vd->vdev_top); + /* + * Only remove any devices which are empty. + */ + if (vd->vdev_stat.vs_alloc != 0) + return; + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) @@ -4439,15 +4457,19 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); } - vdev_config_dirty(rvd); - - /* - * Reassess the health of our root vdev. - */ - vdev_reopen(rvd); } /* + * Remove a device from the pool - + * + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. + */ + +/* * Remove a device from the pool. Currently, this supports removing only hot * spares, slogs, and level 2 ARC devices. */ @@ -4690,40 +4712,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) /* * ========================================================================== - * SPA Scrubbing + * SPA Scanning * ========================================================================== */ int -spa_scrub(spa_t *spa, pool_scrub_type_t type) +spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + if (dsl_scan_resilvering(spa->spa_dsl_pool)) + return (EBUSY); + return (dsl_scan_cancel(spa->spa_dsl_pool)); +} - if ((uint_t)type >= POOL_SCRUB_TYPES) +int +spa_scan(spa_t *spa, pool_scan_func_t func) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + + if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (ENOTSUP); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ - if (type == POOL_SCRUB_RESILVER && + if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } - if (type == POOL_SCRUB_EVERYTHING && - spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && - spa->spa_dsl_pool->dp_scrub_isresilver) - return (EBUSY); - - if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { - return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); - } else if (type == POOL_SCRUB_NONE) { - return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); - } else { - return (EINVAL); - } + return (dsl_scan(spa->spa_dsl_pool, func)); } /* @@ -4829,8 +4849,8 @@ spa_async_thread(spa_t *spa) * then log an internal history event. */ if (new_space != old_space) { - spa_history_internal_log(LOG_POOL_VDEV_ONLINE, - spa, NULL, CRED(), + spa_history_log_internal(LOG_POOL_VDEV_ONLINE, + spa, NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), new_space, new_space - old_space); } @@ -4874,7 +4894,7 @@ spa_async_thread(spa_t *spa) * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); + dsl_resilver_restart(spa->spa_dsl_pool, 0); /* * Let the world know that we're done. @@ -4920,6 +4940,7 @@ spa_async_dispatch(spa_t *spa) void spa_async_request(spa_t *spa, int task) { + zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); @@ -5024,7 +5045,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], - B_FALSE, B_FALSE, B_TRUE); + B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) @@ -5064,7 +5085,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) * Set zpool properties. */ static void -spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; @@ -5176,8 +5197,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* log internal history if this is not a zpool create */ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && tx->tx_txg != TXG_INITIAL) { - spa_history_internal_log(LOG_POOL_PROPSET, - spa, tx, cr, "%s %lld %s", + spa_history_log_internal(LOG_POOL_PROPSET, + spa, tx, "%s %lld %s", nvpair_name(elem), intval, spa_name(spa)); } } @@ -5272,13 +5293,17 @@ spa_sync(spa_t *spa, uint64_t txg) } /* - * If anything has changed in this txg, push the deferred frees - * from the previous txg. If not, leave them alone so that we - * don't generate work on an otherwise idle system. + * If anything has changed in this txg, or if someone is waiting + * for this txg to sync (eg, spa_vdev_remove()), push the + * deferred frees from the previous txg. If not, leave them + * alone so that we don't generate work on an otherwise idle + * system. */ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || !txg_list_empty(&dp->dp_dirty_dirs, txg) || - !txg_list_empty(&dp->dp_sync_tasks, txg)) + !txg_list_empty(&dp->dp_sync_tasks, txg) || + ((dp->dp_scan->scn_phys.scn_state == DSS_SCANNING || + txg_sync_waiting(dp)) && !spa_shutting_down(spa))) spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); /* @@ -5304,11 +5329,7 @@ spa_sync(spa_t *spa, uint64_t txg) } ddt_sync(spa, txg); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - mutex_exit(&spa->spa_scrub_lock); + dsl_scan_sync(dp, tx); while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) vdev_sync(vd, txg); diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index 68a40bec89..cdeda3f93c 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/spa.h> @@ -419,7 +418,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) split_guid) == 0); } - nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); + nvroot = vdev_config_generate(spa, vd, getstats, 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c index 4c834e2d4e..282140b3bd 100644 --- a/usr/src/uts/common/fs/zfs/spa_errlog.c +++ b/usr/src/uts/common/fs/zfs/spa_errlog.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -54,36 +53,6 @@ #include <sys/zap.h> #include <sys/zio.h> -/* - * This is a stripped-down version of strtoull, suitable only for converting - * lowercase hexidecimal numbers that don't overflow. - */ -uint64_t -strtonum(const char *str, char **nptr) -{ - uint64_t val = 0; - char c; - int digit; - - while ((c = *str) != '\0') { - if (c >= '0' && c <= '9') - digit = c - '0'; - else if (c >= 'a' && c <= 'f') - digit = 10 + c - 'a'; - else - break; - - val *= 16; - val += digit; - - str++; - } - - if (nptr) - *nptr = (char *)str; - - return (val); -} /* * Convert a bookmark to a string. diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c index 18d4836bc7..212abae5b8 100644 --- a/usr/src/uts/common/fs/zfs/spa_history.c +++ b/usr/src/uts/common/fs/zfs/spa_history.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/spa.h> @@ -33,6 +32,7 @@ #include <sys/utsname.h> #include <sys/cmn_err.h> #include <sys/sunddi.h> +#include "zfs_comutil.h" #ifdef _KERNEL #include <sys/zone.h> #endif @@ -189,7 +189,7 @@ spa_history_zone() */ /*ARGSUSED*/ static void -spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; history_arg_t *hap = arg2; @@ -244,6 +244,8 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) hap->ha_log_type == LOG_CMD_NORMAL) { VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0); + + zfs_dbgmsg("command: %s", history_str); } else { VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, hap->ha_event) == 0); @@ -251,6 +253,11 @@ spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) tx->tx_txg) == 0); VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, history_str) == 0); + + zfs_dbgmsg("internal %s pool:%s txg:%llu %s", + zfs_history_event_names[hap->ha_event], spa_name(spa), + (longlong_t)tx->tx_txg, history_str); + } VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); @@ -418,7 +425,7 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) static void log_internal(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx) + dmu_tx_t *tx, const char *fmt, va_list adx) { history_arg_t *ha; @@ -441,7 +448,7 @@ log_internal(history_internal_events_t event, spa_t *spa, ha->ha_uid = 0; if (dmu_tx_is_syncing(tx)) { - spa_history_log_sync(spa, ha, cr, tx); + spa_history_log_sync(spa, ha, tx); } else { dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, spa_history_log_sync, spa, ha, 0, tx); @@ -450,8 +457,8 @@ log_internal(history_internal_events_t event, spa_t *spa, } void -spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +spa_history_log_internal(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, const char *fmt, ...) { dmu_tx_t *htx = tx; va_list adx; @@ -466,7 +473,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, } va_start(adx, fmt); - log_internal(event, spa, htx, cr, fmt, adx); + log_internal(event, spa, htx, fmt, adx); va_end(adx); /* if we didn't get a tx from the caller, commit the one we made */ @@ -481,7 +488,7 @@ spa_history_log_version(spa_t *spa, history_internal_events_t event) uint64_t current_vers = spa_version(spa); if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { - spa_history_internal_log(event, spa, NULL, CRED(), + spa_history_log_internal(event, spa, NULL, "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, utsname.nodename, utsname.release, utsname.version, diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index c815cd6113..8faa84a1b0 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -40,6 +40,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_dir.h> #include <sys/dsl_prop.h> +#include <sys/dsl_scan.h> #include <sys/fs/zfs.h> #include <sys/metaslab_impl.h> #include <sys/arc.h> @@ -888,10 +889,10 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); /* - * If the config changed, notify the scrub thread that it must restart. + * If the config changed, notify the scrub that it must restart. + * This will initiate a resilver if needed. */ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { - dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; spa->spa_config_generation++; } @@ -1078,7 +1079,6 @@ spa_rename(const char *name, const char *newname) return (0); } - /* * Determine whether a pool with given pool_guid exists. If device_guid is * non-zero, determine whether the pool exists *and* contains a device with the @@ -1209,6 +1209,37 @@ zfs_panic_recover(const char *fmt, ...) } /* + * This is a stripped-down version of strtoull, suitable only for converting + * lowercase hexidecimal numbers that don't overflow. + */ +uint64_t +strtonum(const char *str, char **nptr) +{ + uint64_t val = 0; + char c; + int digit; + + while ((c = *str) != '\0') { + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (c >= 'a' && c <= 'f') + digit = 10 + c - 'a'; + else + break; + + val *= 16; + val += digit; + + str++; + } + + if (nptr) + *nptr = (char *)str; + + return (val); +} + +/* * ========================================================================== * Accessor functions * ========================================================================== @@ -1390,6 +1421,12 @@ spa_max_replication(spa_t *spa) return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } +int +spa_prev_software_version(spa_t *spa) +{ + return (spa->spa_prev_software_version); +} + uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { @@ -1584,3 +1621,45 @@ spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } + +/* + * Reset pool scan stat per scan pass (or reboot). + */ +void +spa_scan_stat_init(spa_t *spa) +{ + /* data not stored on disk */ + spa->spa_scan_pass_start = gethrestime_sec(); + spa->spa_scan_pass_exam = 0; + vdev_scan_stat_init(spa->spa_root_vdev); +} + +/* + * Get scan stats for zpool status reports + */ +int +spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) +{ + dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; + + if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + return (ENOENT); + bzero(ps, sizeof (pool_scan_stat_t)); + + /* data stored on disk */ + ps->pss_func = scn->scn_phys.scn_func; + ps->pss_start_time = scn->scn_phys.scn_start_time; + ps->pss_end_time = scn->scn_phys.scn_end_time; + ps->pss_to_examine = scn->scn_phys.scn_to_examine; + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_processed = scn->scn_phys.scn_processed; + ps->pss_errors = scn->scn_phys.scn_errors; + ps->pss_state = scn->scn_phys.scn_state; + + /* data not stored on disk */ + ps->pss_pass_start = spa->spa_scan_pass_start; + ps->pss_pass_exam = spa->spa_scan_pass_exam; + + return (0); +} diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index c528fac1a6..f0ad04ff49 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ARC_H @@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; - krwlock_t b_lock; + kmutex_t b_evict_lock; + krwlock_t b_data_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; @@ -92,6 +92,8 @@ void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); +int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, + zbookmark_t *zb); int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 54686ba32d..4c05806e3e 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DBUF_H @@ -278,6 +277,7 @@ void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, dmu_tx_t *tx); +void dbuf_release_bp(dmu_buf_impl_t *db); void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h index 5eab6a2fb2..bd446acafa 100644 --- a/usr/src/uts/common/fs/zfs/sys/ddt.h +++ b/usr/src/uts/common/fs/zfs/sys/ddt.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DDT_H @@ -232,6 +231,8 @@ extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); +extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx); extern const ddt_ops_t ddt_zap_ops; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 9c9369f8a7..7df5d48321 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -118,7 +118,7 @@ typedef enum dmu_object_type { DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ - DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ @@ -128,6 +128,8 @@ typedef enum dmu_object_type { DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ + DMU_OT_SCAN_XLATE, /* ZAP */ + DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -220,23 +222,8 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" - -/* 4x8 zbookmark_t */ -#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" -/* 4x8 ddt_bookmark_t */ -#define DMU_POOL_SCRUB_DDT_BOOKMARK "scrub_ddt_bookmark" -/* 1x8 max_class */ -#define DMU_POOL_SCRUB_DDT_CLASS_MAX "scrub_ddt_class_max" -/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ -#define DMU_POOL_SCRUB_QUEUE "scrub_queue" -/* 1x8 txg */ -#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg" -/* 1x8 txg */ -#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg" -/* 1x4 enum scrub_func */ -#define DMU_POOL_SCRUB_FUNC "scrub_func" -/* 1x8 count */ -#define DMU_POOL_SCRUB_ERRORS "scrub_errors" +#define DMU_POOL_CREATION_VERSION "creation_version" +#define DMU_POOL_SCAN "scan" /* * Allocate an object from this objset. The range of object numbers diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index ef1782b74d..5c5119a207 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -46,6 +46,9 @@ struct dmu_tx; #define OBJSET_PHYS_SIZE 2048 #define OBJSET_OLD_PHYS_SIZE 1024 +#define OBJSET_BUF_HAS_USERUSED(buf) \ + (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) + #define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h index 5b0821253d..844e7f1aeb 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DMU_TRAVERSE_H @@ -37,9 +36,11 @@ extern "C" { struct dnode_phys; struct dsl_dataset; struct zilog; +struct arc_buf; typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); + struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp, + void *arg); #define TRAVERSE_PRE (1<<0) #define TRAVERSE_POST (1<<1) diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index 6eb7505ea5..a21971679c 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_DATASET_H @@ -113,7 +112,6 @@ typedef struct dsl_dataset { /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; - uint64_t ds_origin_txg; /* has internal locking: */ bplist_t ds_deadlist; @@ -235,8 +233,7 @@ int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t *ref_rsrv); int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota); -void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, - dmu_tx_t *tx); +dsl_syncfunc_t dsl_dataset_set_quota_sync; int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, uint64_t reservation); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h index 14a64e019e..327cda6eec 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_DIR_H @@ -90,6 +89,7 @@ struct dsl_dir { kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ + uint64_t dd_origin_txg; /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; @@ -142,6 +142,7 @@ timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" #define ORIGIN_DIR_NAME "$ORIGIN" +#define XLATION_DIR_NAME "$XLATION" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) do { \ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h index 97541ad2f1..187040e700 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -32,6 +32,7 @@ #include <sys/zio.h> #include <sys/dnode.h> #include <sys/ddt.h> +#include <sys/arc.h> #ifdef __cplusplus extern "C" { @@ -42,12 +43,7 @@ struct dsl_dir; struct dsl_dataset; struct dsl_pool; struct dmu_tx; - -enum scrub_func { - SCRUB_FUNC_NONE, - SCRUB_FUNC_CLEAN, - SCRUB_FUNC_NUMFUNCS -}; +struct dsl_scan; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE @@ -87,25 +83,13 @@ typedef struct dsl_pool { uint64_t dp_write_limit; uint64_t dp_tmp_userrefs_obj; + struct dsl_scan *dp_scan; + /* Uses dp_lock */ kmutex_t dp_lock; uint64_t dp_space_towrite[TXG_SIZE]; uint64_t dp_tempreserved[TXG_SIZE]; - enum scrub_func dp_scrub_func; - uint64_t dp_scrub_queue_obj; - uint64_t dp_scrub_min_txg; - uint64_t dp_scrub_max_txg; - uint64_t dp_scrub_start_time; - uint64_t dp_scrub_ddt_class_max; - zbookmark_t dp_scrub_bookmark; - ddt_bookmark_t dp_scrub_ddt_bookmark; - boolean_t dp_scrub_pausing; - boolean_t dp_scrub_isresilver; - boolean_t dp_scrub_restart; - kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ - zio_t *dp_scrub_prefetch_zio_root; - /* Has its own locking */ tx_state_t dp_tx; txg_list_t dp_dirty_datasets; @@ -138,20 +122,15 @@ void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); -void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, - struct dmu_tx *tx); +int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); +int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); -int dsl_pool_scrub_cancel(dsl_pool_t *dp); -int dsl_pool_scrub_clean(dsl_pool_t *dp); -void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_scrub_restart(dsl_pool_t *dp); -void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, - const ddt_entry_t *dde); - taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, @@ -159,6 +138,7 @@ extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx); extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h index d8a8ab2d64..a636ad3509 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_PROP_H @@ -91,7 +90,7 @@ int dsl_prop_set(const char *ddname, const char *propname, zprop_source_t source, int intsz, int numints, const void *buf); int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - cred_t *cr, dmu_tx_t *tx); + dmu_tx_t *tx); void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, zprop_source_t source, uint64_t *value); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h new file mode 100644 index 0000000000..c0eaa49567 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_DSL_SCAN_H +#define _SYS_DSL_SCAN_H + +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/ddt.h> +#include <sys/bplist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +/* + * All members of this structure must be uint64_t, for byteswap + * purposes. + */ +typedef struct dsl_scan_phys { + uint64_t scn_func; /* pool_scan_func_t */ + uint64_t scn_state; /* dsl_scan_state_t */ + uint64_t scn_queue_obj; + uint64_t scn_min_txg; + uint64_t scn_max_txg; + uint64_t scn_cur_min_txg; + uint64_t scn_cur_max_txg; + uint64_t scn_start_time; + uint64_t scn_end_time; + uint64_t scn_to_examine; /* total bytes to be scanned */ + uint64_t scn_examined; /* bytes scanned so far */ + uint64_t scn_to_process; + uint64_t scn_processed; + uint64_t scn_errors; /* scan I/O error count */ + uint64_t scn_ddt_class_max; + ddt_bookmark_t scn_ddt_bookmark; + zbookmark_t scn_bookmark; + uint64_t scn_flags; /* dsl_scan_flags_t */ +} dsl_scan_phys_t; + +#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) + +typedef enum dsl_scan_flags { + DSF_VISIT_DS_AGAIN = 1<<0, +} dsl_scan_flags_t; + +typedef struct dsl_scan { + struct dsl_pool *scn_dp; + + boolean_t scn_pausing; + uint64_t scn_restart_txg; + uint64_t scn_sync_start_time; + zio_t *scn_prefetch_zio_root; + + /* for debugging / information */ + uint64_t scn_visited_this_txg; + + dsl_scan_phys_t scn_phys; +} dsl_scan_t; + +int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +void dsl_scan_fini(struct dsl_pool *dp); +void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); +int dsl_scan_cancel(struct dsl_pool *); +int dsl_scan(struct dsl_pool *, pool_scan_func_t); +void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); +boolean_t dsl_scan_resilvering(struct dsl_pool *dp); +boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); +void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx); +void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, + struct dmu_tx *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SCAN_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h b/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h index 4995bfe5ac..9126290cdb 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_synctask.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_DSL_SYNCTASK_H #define _SYS_DSL_SYNCTASK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/txg.h> #include <sys/zfs_context.h> @@ -38,7 +35,7 @@ extern "C" { struct dsl_pool; typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); typedef struct dsl_sync_task { list_node_t dst_node; @@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group { txg_node_t dstg_node; list_t dstg_tasks; struct dsl_pool *dstg_pool; - cred_t *dstg_cr; uint64_t dstg_txg; int dstg_err; int dstg_space; diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index 5ce6251ddb..583d6303bd 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_METASLAB_H diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h index d3fe7b1f89..bc3ade80f1 100644 --- a/usr/src/uts/common/fs/zfs/sys/refcount.h +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_REFCOUNT_H #define _SYS_REFCOUNT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/inttypes.h> #include <sys/list.h> #include <sys/zfs_context.h> @@ -91,6 +88,11 @@ typedef struct refcount { atomic_add_64_nv(&(rc)->rc_count, number) #define refcount_remove_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, -number) +#define refcount_transfer(dst, src) { \ + uint64_t __tmp = (src)->rc_count; \ + atomic_add_64(&(src)->rc_count, -__tmp); \ + atomic_add_64(&(dst)->rc_count, __tmp); \ +} #define refcount_init() #define refcount_fini() diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index a26a55cb42..4cc9244cd0 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SPA_H @@ -262,7 +261,7 @@ typedef struct blkptr { #define BP_GET_UCSIZE(bp) \ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)); + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ @@ -432,6 +431,8 @@ extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); +extern void spa_scan_stat_init(spa_t *spa); +extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 @@ -439,6 +440,14 @@ extern void spa_inject_delref(spa_t *spa); #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 +#define SPA_ASYNC_REMOVE_DONE 0x40 +#define SPA_ASYNC_REMOVE_STOP 0x80 + +/* + * Controls the behavior of spa_vdev_remove(). + */ +#define SPA_REMOVE_UNSPARE 0x01 +#define SPA_REMOVE_DONE 0x02 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); @@ -447,6 +456,7 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, @@ -465,14 +475,19 @@ extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); -/* scrubbing */ -extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); +/* scanning */ +extern int spa_scan(spa_t *spa, pool_scan_func_t func); +extern int spa_scan_stop(spa_t *spa); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); -#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ +/* + * DEFERRED_FREE must be large enough that regular blocks are not + * deferred. XXX so can't we change it back to 1? + */ +#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */ #define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ #define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ @@ -577,6 +592,7 @@ extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern int spa_max_replication(spa_t *spa); +extern int spa_prev_software_version(spa_t *spa); extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t spa_suspended(spa_t *spa); @@ -632,8 +648,8 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, history_log_type_t what); -extern void spa_history_internal_log(history_internal_events_t event, - spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_log_internal(history_internal_events_t event, + spa_t *spa, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); /* error handling */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 9daec092b4..f857f733d2 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -150,13 +149,14 @@ struct spa { kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ - uint64_t spa_scrub_errors; /* scrub I/O error count */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ + uint64_t spa_scan_pass_start; /* start time per pass/reboot */ + uint64_t spa_scan_pass_exam; /* examined bytes per pass */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ @@ -212,7 +212,8 @@ struct spa { uint64_t spa_did; /* if procp != p0, did of t1 */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ - + uint64_t spa_creation_version; /* version at pool creation */ + uint64_t spa_prev_software_version; /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h index c135df9b10..6ab6aa3135 100644 --- a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_UBERBLOCK_IMPL_H @@ -52,6 +51,9 @@ struct uberblock { uint64_t ub_guid_sum; /* sum of all vdev guids */ uint64_t ub_timestamp; /* UTC time of last sync */ blkptr_t ub_rootbp; /* MOS objset_phys_t */ + + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; }; #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index b37516a984..941f234dc6 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_VDEV_H @@ -83,8 +82,7 @@ extern void vdev_split(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); -extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, - boolean_t complete); +extern void vdev_scan_stat_init(vdev_t *vd); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); @@ -126,9 +124,15 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); +typedef enum vdev_config_flag { + VDEV_CONFIG_SPARE = 1 << 0, + VDEV_CONFIG_L2CACHE = 1 << 1, + VDEV_CONFIG_REMOVING = 1 << 2 +} vdev_config_flag_t; + extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, boolean_t isspare, boolean_t isl2cache); + boolean_t getstats, vdev_config_flag_t flags); /* * Label routines diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index f78ec5084e..2b886bc588 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -151,7 +151,7 @@ struct vdev { txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ - boolean_t vdev_removing; /* device is being removed? */ + uint64_t vdev_removing; /* device is being removed? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h index 3b9de2a2f9..b44fb8fbba 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap.h +++ b/usr/src/uts/common/fs/zfs/sys/zap.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_H @@ -259,7 +258,6 @@ int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, */ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); - /* * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string @@ -276,6 +274,14 @@ int zap_value_search(objset_t *os, uint64_t zapobj, */ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); +/* Same as zap_join, but set the values to 'value'. */ +int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx); + +/* Same as zap_join, but add together any duplicated entries. */ +int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx); + /* * Manipulate entries where the name + value are the "same" (the name is * a stringified version of the value). @@ -286,6 +292,21 @@ int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx); +/* Here the key is an int and the value is a different int. */ +int zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t *valuep); + +/* + * They name is a stringified version of key; increment its value by + * delta. Zero values will be zap_remove()-ed. + */ +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); +int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx); + struct zap; struct zap_leaf; typedef struct zap_cursor { diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h index 5aa0efc98d..739a380b75 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_IMPL_H @@ -67,9 +66,12 @@ typedef struct mzap_ent { avl_node_t mze_node; int mze_chunkid; uint64_t mze_hash; - mzap_ent_phys_t mze_phys; + uint32_t mze_cd; /* copy from mze_phys->mze_cd */ } mzap_ent_t; +#define MZE_PHYS(zap, mze) \ + (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid]) + /* * The (fat) zap is stored in one object. It is an array of * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h index 173b6b195e..3a33636741 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h +++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h @@ -19,13 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZAP_LEAF_H #define _SYS_ZAP_LEAF_H +#include <sys/zap.h> + #ifdef __cplusplus extern "C" { #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h index 450ac1c81b..50ecf9b362 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H #define _SYS_ZFS_DEBUG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -68,6 +65,16 @@ extern void __dprintf(const char *file, const char *func, extern void zfs_panic_recover(const char *fmt, ...); +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +extern void zfs_dbgmsg_init(void); +extern void zfs_dbgmsg_fini(void); +extern void zfs_dbgmsg(const char *fmt, ...); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index f5884f2925..991b9f2f59 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -28,6 +27,7 @@ #include <sys/dmu_impl.h> #include <sys/dmu_tx.h> #include <sys/dsl_pool.h> +#include <sys/dsl_scan.h> #include <sys/callb.h> /* @@ -136,7 +136,7 @@ txg_sync_start(dsl_pool_t *dp) * 32-bit x86. This is due in part to nested pools and * scrub_visitbp() recursion. */ - tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread, + tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, dp, 0, &p0, TS_RUN, minclsyspri); mutex_exit(&tx->tx_sync_lock); @@ -366,12 +366,12 @@ txg_sync_thread(dsl_pool_t *dp) uint64_t txg; /* - * We sync when we're scrubbing, there's someone waiting + * We sync when we're scanning, there's someone waiting * on us, or the quiesce thread has handed off a txg to * us, or we have reached our timeout. */ timer = (delta >= timeout ? 0 : timeout - delta); - while ((dp->dp_scrub_func == SCRUB_FUNC_NONE || + while ((dp->dp_scan->scn_phys.scn_state != DSS_SCANNING || spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa)) && !tx->tx_exiting && timer > 0 && diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c index 34d7e0c3ac..692cda137f 100644 --- a/usr/src/uts/common/fs/zfs/uberblock.c +++ b/usr/src/uts/common/fs/zfs/uberblock.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/uberblock_impl.h> #include <sys/vdev_impl.h> @@ -58,6 +55,7 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) ub->ub_txg = txg; ub->ub_guid_sum = rvd->vdev_guid_sum; ub->ub_timestamp = gethrestime_sec(); + ub->ub_software_version = SPA_VERSION; return (ub->ub_rootbp.blk_birth == txg); } diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 2b77c45574..a61f29b8e7 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -39,6 +39,7 @@ #include <sys/fs/zfs.h> #include <sys/arc.h> #include <sys/zil.h> +#include <sys/dsl_scan.h> /* * Virtual device management. @@ -486,6 +487,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, + &vd->vdev_removing); } if (parent && !parent->vdev_parent) { @@ -860,7 +863,12 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); - if (oldc == 0) + /* + * If the vdev is being removed we don't activate + * the metaslabs since we want to ensure that no new + * allocations are performed on this device. + */ + if (oldc == 0 && !vd->vdev_removing) metaslab_group_activate(vd->vdev_mg); if (txg == 0) @@ -1648,9 +1656,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) return; if (vd->vdev_ops->vdev_op_leaf) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && - (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { + (spa->spa_scrub_started || + (scn && scn->scn_phys.scn_errors == 0))) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl @@ -2029,7 +2040,10 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } - if (vd->vdev_removing) + /* + * Remove the metadata associated with this vdev once it's empty. + */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove(vd, txg); while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { @@ -2403,7 +2417,7 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing); + !vd->vdev_cant_write && !vd->vdev_ishole); } boolean_t @@ -2433,7 +2447,6 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); @@ -2455,7 +2468,7 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } - vs->vs_scrub_examined += cvs->vs_scrub_examined; + cvs->vs_scan_removing = cvd->vdev_removing; mutex_exit(&vd->vdev_stat_lock); } } @@ -2472,6 +2485,19 @@ vdev_clear_stats(vdev_t *vd) } void +vdev_scan_stat_init(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_scan_stat_init(vd->vdev_child[c]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_scan_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; @@ -2515,8 +2541,17 @@ vdev_stat_update(zio_t *zio, uint64_t psize) mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += psize; + if (flags & ZIO_FLAG_SCRUB_THREAD) { + dsl_scan_phys_t *scn_phys = + &spa->spa_dsl_pool->dp_scan->scn_phys; + uint64_t *processed = &scn_phys->scn_processed; + + /* XXX cleanup? */ + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(processed, psize); + vs->vs_scan_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -2602,35 +2637,6 @@ vdev_stat_update(zio_t *zio, uint64_t psize) } } -void -vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) -{ - vdev_stat_t *vs = &vd->vdev_stat; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_scrub_stat_update(vd->vdev_child[c], type, complete); - - mutex_enter(&vd->vdev_stat_lock); - - if (type == POOL_SCRUB_NONE) { - /* - * Update completion and end time. Leave everything else alone - * so we can report what happened during the previous scrub. - */ - vs->vs_scrub_complete = complete; - vs->vs_scrub_end = gethrestime_sec(); - } else { - vs->vs_scrub_type = type; - vs->vs_scrub_complete = 0; - vs->vs_scrub_examined = 0; - vs->vs_scrub_repaired = 0; - vs->vs_scrub_start = gethrestime_sec(); - vs->vs_scrub_end = 0; - } - - mutex_exit(&vd->vdev_stat_lock); -} - /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. @@ -2730,7 +2736,7 @@ vdev_config_dirty(vdev_t *vd) * sketchy, but it will work. */ nvlist_free(aux[c]); - aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index d11b3df7c6..75ec545345 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -141,6 +140,7 @@ #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/zio.h> +#include <sys/dsl_scan.h> #include <sys/fs/zfs.h> /* @@ -208,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - boolean_t isspare, boolean_t isl2cache) + vdev_config_flag_t flags) { nvlist_t *nv = NULL; @@ -216,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); - if (!isspare && !isl2cache) + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); @@ -270,7 +270,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); - if (!isspare && !isl2cache && vd == vd->vdev_top) { + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && + vd == vd->vdev_top) { VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -281,6 +282,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_asize) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog) == 0); + if (vd->vdev_removing) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, + vd->vdev_removing) == 0); } if (vd->vdev_dtl_smo.smo_object != 0) @@ -293,28 +297,52 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (getstats) { vdev_stat_t vs; + pool_scan_stat_t ps; + vdev_get_stats(vd, &vs); - VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS, + VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); + + /* provide either current or previous scan information */ + if (spa_scan_get_stats(spa, &ps) == 0) { + VERIFY(nvlist_add_uint64_array(nv, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)) + == 0); + } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; - int c; + int c, idx; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - for (c = 0; c < vd->vdev_children; c++) - child[c] = vdev_config_generate(spa, vd->vdev_child[c], - getstats, isspare, isl2cache); + for (c = 0, idx = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + /* + * If we're generating an nvlist of removing + * vdevs then skip over any device which is + * not being removed. + */ + if ((flags & VDEV_CONFIG_REMOVING) && + !cvd->vdev_removing) + continue; - VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, vd->vdev_children) == 0); + child[idx++] = vdev_config_generate(spa, cvd, + getstats, flags); + } + + if (idx) { + VERIFY(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx) == 0); + } - for (c = 0; c < vd->vdev_children; c++) + for (c = 0; c < idx; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); @@ -375,12 +403,11 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) { vdev_t *rvd = spa->spa_root_vdev; uint64_t *array; - uint_t idx; + uint_t c, idx; array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); - idx = 0; - for (int c = 0; c < rvd->vdev_children; c++) { + for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ishole) diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 30415c8abb..4b0f5602c1 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -1604,6 +1603,7 @@ vdev_raidz_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } + /* * Report a checksum error for a child of a RAID-Z device. */ diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c index 6be63ef6b0..df5bd46739 100644 --- a/usr/src/uts/common/fs/zfs/zap.c +++ b/usr/src/uts/common/fs/zfs/zap.c @@ -978,6 +978,56 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) } int +zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + err = zap_add(os, intoobj, za.za_name, + 8, 1, &value, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int +zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + uint64_t delta = 0; + + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + + err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); + if (err != 0 && err != ENOENT) + return (err); + delta += za.za_first_integer; + err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; @@ -1005,17 +1055,34 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) } int -zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx) +zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) { char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_lookup(os, obj, name, 8, 1, valuep)); +} + +int +zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx) +{ uint64_t value = 0; int err; if (delta == 0) return (0); - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); @@ -1027,6 +1094,15 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, return (err); } +int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_increment(os, obj, name, delta, tx)); +} /* * Routines for iterating over the attributes. @@ -1101,7 +1177,6 @@ again: return (err); } - static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { diff --git a/usr/src/uts/common/fs/zfs/zap_leaf.c b/usr/src/uts/common/fs/zfs/zap_leaf.c index 285d9c5674..19a795db82 100644 --- a/usr/src/uts/common/fs/zfs/zap_leaf.c +++ b/usr/src/uts/common/fs/zfs/zap_leaf.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -37,6 +36,7 @@ #include <sys/zap.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> +#include <sys/arc.h> static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); @@ -538,14 +538,6 @@ zap_entry_update(zap_entry_handle_t *zeh, if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) return (EAGAIN); - /* - * We should search other chained leaves (via - * zap_entry_remove,create?) otherwise returning EAGAIN will - * just send us into an infinite loop if we have to chain - * another leaf block, rather than being able to split this - * block. - */ - zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c index 2de5812fa2..3fc92ff122 100644 --- a/usr/src/uts/common/fs/zfs/zap_micro.c +++ b/usr/src/uts/common/fs/zfs/zap_micro.c @@ -31,6 +31,7 @@ #include <sys/zap_impl.h> #include <sys/zap_leaf.h> #include <sys/avl.h> +#include <sys/arc.h> #ifdef _KERNEL #include <sys/sunddi.h> @@ -254,26 +255,26 @@ mze_compare(const void *arg1, const void *arg2) return (+1); if (mze1->mze_hash < mze2->mze_hash) return (-1); - if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) + if (mze1->mze_cd > mze2->mze_cd) return (+1); - if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) + if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) +mze_insert(zap_t *zap, int chunkid, uint64_t hash) { mzap_ent_t *mze; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(mzep->mze_cd < zap_maxcd(zap)); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; - mze->mze_phys = *mzep; + mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; + ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); avl_add(&zap->zap_m.zap_avl, mze); } @@ -289,14 +290,15 @@ mze_find(zap_name_t *zn) ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); mze_tofind.mze_hash = zn->zn_hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { - if (zap_match(zn, mze->mze_phys.mze_name)) + ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); + if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } if (zn->zn_matchtype == MT_BEST) { @@ -319,12 +321,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; cd = 0; for (mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (mze->mze_phys.mze_cd != cd) + if (mze->mze_cd != cd) break; cd++; } @@ -408,7 +410,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_m.zap_num_entries++; zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); - mze_insert(zap, i, zn->zn_hash, mze); + mze_insert(zap, i, zn->zn_hash); zap_name_free(zn); } } @@ -727,11 +729,11 @@ again: other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { - zn = zap_name_alloc(zap, mze->mze_phys.mze_name, + zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_FIRST); allocdzn = B_TRUE; } - if (zap_match(zn, other->mze_phys.mze_name)) { + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); @@ -793,9 +795,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, } else if (integer_size != 8) { err = EINVAL; } else { - *(uint64_t *)buf = mze->mze_phys.mze_value; + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; (void) strlcpy(realname, - mze->mze_phys.mze_name, rn_len); + MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); @@ -932,7 +935,7 @@ again: if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; - mze_insert(zap, i, zn->zn_hash, mze); + mze_insert(zap, i, zn->zn_hash); return; } } @@ -1017,10 +1020,20 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, { zap_t *zap; mzap_ent_t *mze; + uint64_t oldval; const uint64_t *intval = val; zap_name_t *zn; int err; +#ifdef ZFS_DEBUG + /* + * If there is an old value, it shouldn't change across the + * lockdir (eg, due to bprewrite's xlation). + */ + if (integer_size == 8 && num_integers == 1) + (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); +#endif + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); @@ -1044,9 +1057,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } else { mze = mze_find(zn); if (mze != NULL) { - mze->mze_phys.mze_value = *intval; - zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid].mze_value = *intval; + ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); + MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } @@ -1245,7 +1257,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) err = ENOENT; mze_tofind.mze_hash = zc->zc_hash; - mze_tofind.mze_phys.mze_cd = zc->zc_cd; + mze_tofind.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { @@ -1253,18 +1265,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) idx, AVL_AFTER); } if (mze) { - ASSERT(0 == bcmp(&mze->mze_phys, - &zc->zc_zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid], sizeof (mze->mze_phys))); - + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; - za->za_first_integer = mze->mze_phys.mze_value; - (void) strcpy(za->za_name, mze->mze_phys.mze_name); + za->za_first_integer = mzep->mze_value; + (void) strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; + zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; @@ -1313,7 +1323,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) goto out; } zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; + zc->zc_cd = mze->mze_cd; } out: diff --git a/usr/src/uts/common/fs/zfs/zfs_debug.c b/usr/src/uts/common/fs/zfs/zfs_debug.c new file mode 100644 index 0000000000..d0f411a993 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_debug.c @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/zfs_context.h> + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */ + +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +zfs_dbgmsg_fini(void) +{ + zfs_dbgmsg_t *zdm; + + while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { + int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_destroy(&zfs_dbgmsgs_lock); + ASSERT3U(zfs_dbgmsg_size, ==, 0); +} + +/* + * Print these messages by running: + * echo ::zfs_dbgmsg | mdb -k + * + * Monitor these messages by running: + * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + */ +void +zfs_dbgmsg(const char *fmt, ...) +{ + int size; + va_list adx; + zfs_dbgmsg_t *zdm; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx); + va_end(adx); + + /* + * There is one byte of string in sizeof (zfs_dbgmsg_t), used + * for the terminating null. + */ + zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); + zdm->zdm_timestamp = gethrestime_sec(); + + va_start(adx, fmt); + (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); + va_end(adx); + + DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; + while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { + zdm = list_remove_head(&zfs_dbgmsgs); + size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_exit(&zfs_dbgmsgs_lock); +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index d280125152..de5fb1e4ce 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/types.h> @@ -62,6 +61,7 @@ #include <sys/zfs_ctldir.h> #include <sys/zfs_dir.h> #include <sys/zvol.h> +#include <sys/dsl_scan.h> #include <sharefs/share.h> #include <sys/dmu_objset.h> @@ -117,7 +117,7 @@ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; - char buf[256]; + char buf[512]; va_list adx; /* @@ -1237,8 +1237,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_cookie scan func (pool_scan_func_t) + */ static int -zfs_ioc_pool_scrub(zfs_cmd_t *zc) +zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; @@ -1246,7 +1251,10 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_scrub(spa, zc->zc_cookie); + if (zc->zc_cookie == POOL_SCAN_NONE) + error = spa_scan_stop(spa); + else + error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); @@ -1402,6 +1410,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_nvlist_conf nvlist of devices to remove + * zc_cookie to stop the remove? + */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { @@ -4250,7 +4264,7 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { B_FALSE }, { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE, + { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_TRUE }, { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, B_FALSE }, diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index da9163c963..f68dde85f8 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -560,34 +560,6 @@ unregister: } -static void -uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, - int64_t delta, dmu_tx_t *tx) -{ - uint64_t used = 0; - char buf[32]; - int err; - uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; - - if (delta == 0) - return; - - (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); - err = zap_lookup(os, obj, buf, 8, 1, &used); - - ASSERT(err == 0 || err == ENOENT); - /* no underflow/overflow */ - ASSERT(delta > 0 || used >= -delta); - ASSERT(delta < 0 || used + delta > used); - used += delta; - if (used == 0) - err = zap_remove(os, obj, buf, tx); - else - err = zap_update(os, obj, buf, 8, 1, &used, tx); - ASSERT(err == 0); - -} - static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) @@ -2239,9 +2211,8 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) sa_register_update_callback(os, zfs_sa_upgrade); } - spa_history_internal_log(LOG_DS_UPGRADE, - dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", + spa_history_log_internal(LOG_DS_UPGRADE, + dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", zfsvfs->z_version, newvers, dmu_objset_id(os)); dmu_tx_commit(tx); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index e4c435341f..4aa4d10b07 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -36,6 +36,7 @@ #include <sys/dsl_dataset.h> #include <sys/vdev.h> #include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> /* * The zfs intent log (ZIL) saves transaction records of system calls @@ -179,7 +180,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 4e481b16b7..181d07fbdb 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/zfs_context.h> @@ -661,6 +660,9 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; + dprintf_bp(bp, "freeing in txg %llu, pass %u", + (longlong_t)txg, spa->spa_sync_pass); + ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); @@ -2073,6 +2075,8 @@ zio_ddt_write(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } +ddt_entry_t *freedde; /* for debugging */ + static int zio_ddt_free(zio_t *zio) { @@ -2086,7 +2090,7 @@ zio_ddt_free(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + freedde = dde = ddt_lookup(ddt, bp, B_TRUE); ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); ddt_exit(ddt); diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 3481fb0586..7b4577a9f3 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -249,7 +249,7 @@ struct maparg { /*ARGSUSED*/ static int -zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, +zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct maparg *ma = arg; |