Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/ztest/ztest.c | 43
-rw-r--r-- | usr/src/uts/common/fs/zfs/arc.c | 102
-rw-r--r-- | usr/src/uts/common/fs/zfs/dbuf.c | 38
-rw-r--r-- | usr/src/uts/common/fs/zfs/dsl_pool.c | 2
-rw-r--r-- | usr/src/uts/common/fs/zfs/metaslab.c | 39
-rw-r--r-- | usr/src/uts/common/fs/zfs/spa_misc.c | 12
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/arc.h | 3
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/spa.h | 1
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/txg.h | 3
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 7
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zio.h | 45
-rw-r--r-- | usr/src/uts/common/fs/zfs/txg.c | 20
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_queue.c | 16
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_debug.c | 7
-rw-r--r-- | usr/src/uts/common/fs/zfs/zio.c | 30
-rw-r--r-- | usr/src/uts/common/klm/klmops.c | 4
-rw-r--r-- | usr/src/uts/common/klm/nlm_service.c | 23
17 files changed, 250 insertions, 145 deletions
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index c12eb098a3..38dfaaba36 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -808,7 +808,7 @@ static uint64_t ztest_get_ashift(void) { if (ztest_opts.zo_ashift == 0) - return (SPA_MINBLOCKSHIFT + ztest_random(3)); + return (SPA_MINBLOCKSHIFT + ztest_random(5)); return (ztest_opts.zo_ashift); } @@ -967,11 +967,28 @@ ztest_random_spa_version(uint64_t initial_version) return (version); } +/* + * Find the largest ashift used + */ +static uint64_t +ztest_spa_get_ashift() { + uint64_t i; + uint64_t ashift = SPA_MINBLOCKSHIFT; + vdev_t *rvd = ztest_spa->spa_root_vdev; + + for (i = 0; i < rvd->vdev_children; i++) { + ashift = MAX(ashift, rvd->vdev_child[i]->vdev_ashift); + } + return (ashift); +} + static int ztest_random_blocksize(void) { - return (1 << (SPA_MINBLOCKSHIFT + - ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); + // Choose a block size >= the ashift. + uint64_t block_shift = + ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1); + return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int @@ -5765,16 +5782,30 @@ ztest_freeze(void) spa_freeze(spa); /* + * Because it is hard to predict how much space a write will actually + * require beforehand, we leave ourselves some fudge space to write over + * capacity. + */ + uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; + + /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. + * + * Run a random number of times less than zo_maxloops and ensure we do + * not run out of space on the pool. */ while (ztest_random(10) != 0 && - numloops++ < ztest_opts.zo_maxloops) { - ztest_dmu_write_parallel(zd, 0); - ztest_dmu_object_alloc_free(zd, 0); + numloops++ < ztest_opts.zo_maxloops && + metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { + ztest_od_t od; + ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); + ztest_io(zd, od.od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 168355ddda..911cde9717 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -105,7 +105,7 @@ * with the buffer may be evicted prior to the callback. The callback * must be made with *no locks held* (to prevent deadlock). Additionally, * the users of callbacks must ensure that their private data is - * protected from simultaneous callbacks from arc_buf_evict() + * protected from simultaneous callbacks from arc_clear_callback() * and arc_do_user_evicts(). * * Note that the majority of the performance stats are manipulated @@ -1498,8 +1498,12 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) } } +/* + * Free up buf->b_data and if 'remove' is set, then pull the + * arc_buf_t off of the the arc_buf_hdr_t's list and free it. 
+ */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) +arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) { arc_buf_t **bufp; @@ -1550,7 +1554,7 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) } /* only remove the buf if requested */ - if (!all) + if (!remove) return; /* remove the buf from the hdr list */ @@ -2124,7 +2128,7 @@ arc_do_user_evicts(void) mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) - VERIFY(buf->b_efunc(buf) == 0); + VERIFY0(buf->b_efunc(buf->b_private)); buf->b_efunc = NULL; buf->b_private = NULL; @@ -3263,16 +3267,25 @@ arc_freed(spa_t *spa, const blkptr_t *bp) } /* - * This is used by the DMU to let the ARC know that a buffer is - * being evicted, so the ARC should clean up. If this arc buf - * is not yet in the evicted state, it will be put there. + * Clear the user eviction callback set by arc_set_callback(), first calling + * it if it exists. Because the presence of a callback keeps an arc_buf cached + * clearing the callback may result in the arc_buf being destroyed. However, + * it will not result in the *last* arc_buf being destroyed, hence the data + * will remain cached in the ARC. We make a copy of the arc buffer here so + * that we can process the callback without holding any locks. + * + * It's possible that the callback is already in the process of being cleared + * by another thread. In this case we can not clear the callback. + * + * Returns B_TRUE if the callback was successfully called and cleared. */ -int -arc_buf_evict(arc_buf_t *buf) +boolean_t +arc_clear_callback(arc_buf_t *buf) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - arc_buf_t **bufp; + arc_evict_func_t *efunc = buf->b_efunc; + void *private = buf->b_private; mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; @@ -3282,17 +3295,16 @@ arc_buf_evict(arc_buf_t *buf) */ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); - return (0); + return (B_FALSE); } else if (buf->b_data == NULL) { - arc_buf_t copy = *buf; /* structure assignment */ /* * We are on the eviction list; process this buffer now * but let arc_do_user_evicts() do the reaping. */ buf->b_efunc = NULL; mutex_exit(&buf->b_evict_lock); - VERIFY(copy.b_efunc(©) == 0); - return (1); + VERIFY0(efunc(private)); + return (B_TRUE); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); @@ -3302,48 +3314,21 @@ arc_buf_evict(arc_buf_t *buf) ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); - /* - * Pull this buffer off of the hdr - */ - bufp = &hdr->b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; - - ASSERT(buf->b_data != NULL); - arc_buf_destroy(buf, FALSE, FALSE); - - if (hdr->b_datacnt == 0) { - arc_state_t *old_state = hdr->b_state; - arc_state_t *evicted_state; - - ASSERT(hdr->b_buf == NULL); - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - - evicted_state = - (old_state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; - - mutex_enter(&old_state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); - - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + buf->b_efunc = NULL; + buf->b_private = NULL; - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&old_state->arcs_mtx); + if (hdr->b_datacnt > 1) { + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy(buf, FALSE, TRUE); + } else { + ASSERT(buf == hdr->b_buf); + hdr->b_flags |= ARC_BUF_AVAILABLE; + mutex_exit(&buf->b_evict_lock); } - mutex_exit(hash_lock); - mutex_exit(&buf->b_evict_lock); - VERIFY(buf->b_efunc(buf) == 0); - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_hdr = NULL; - buf->b_next = NULL; - kmem_cache_free(buf_cache, buf); - return (1); + mutex_exit(hash_lock); + VERIFY0(efunc(private)); + return (B_TRUE); } /* @@ -3489,17 +3474,6 @@ arc_released(arc_buf_t *buf) return (released); } -int -arc_has_callback(arc_buf_t *buf) -{ - int callback; - - mutex_enter(&buf->b_evict_lock); - callback = (buf->b_efunc != NULL); - mutex_exit(&buf->b_evict_lock); - return (callback); -} - #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 3480c64ec0..83afd52df4 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -181,8 +181,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) } /* - * Remove an entry from the hash table. This operation will - * fail if there are any existing holds on the db. + * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) @@ -194,7 +193,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) dmu_buf_impl_t *dbf, **dbp; /* - * We musn't hold db_mtx to maintin lock ordering: + * We musn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); @@ -431,7 +430,6 @@ static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); db->db_buf = buf; if (buf != NULL) { ASSERT(buf->b_data != NULL); @@ -1555,12 +1553,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) * when we are not holding the dn_dbufs_mtx, we can't clear the * entry in the dn_dbufs list. We have to wait until dbuf_destroy() * in this case. For callers from the DMU we will usually see: - * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() + * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_buf_evict() + * DMU: dbuf_clear()->arc_clear_callback() * ARC: dbuf_do_evict()->dbuf_destroy() + * + * This routine will dissociate the dbuf from the arc, by calling + * arc_clear_callback(), but will not evict the data from the ARC. 
*/ void dbuf_clear(dmu_buf_impl_t *db) @@ -1568,7 +1569,7 @@ dbuf_clear(dmu_buf_impl_t *db) dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; - int dbuf_gone = FALSE; + boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); @@ -1614,7 +1615,7 @@ dbuf_clear(dmu_buf_impl_t *db) } if (db->db_buf) - dbuf_gone = arc_buf_evict(db->db_buf); + dbuf_gone = arc_clear_callback(db->db_buf); if (!dbuf_gone) mutex_exit(&db->db_mtx); @@ -1782,8 +1783,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, static int dbuf_do_evict(void *private) { - arc_buf_t *buf = private; - dmu_buf_impl_t *db = buf->b_private; + dmu_buf_impl_t *db = private; if (!MUTEX_HELD(&db->db_mtx)) mutex_enter(&db->db_mtx); @@ -2146,11 +2146,23 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * block on-disk. If so, then we simply evict * ourselves. */ - if (!DBUF_IS_CACHEABLE(db) || - arc_buf_eviction_needed(db->db_buf)) + if (!DBUF_IS_CACHEABLE(db)) { + if (db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + spa_t *spa = + dmu_objset_spa(db->db_objset); + blkptr_t bp = *db->db_blkptr; + dbuf_clear(db); + arc_freed(spa, &bp); + } else { + dbuf_clear(db); + } + } else if (arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); - else + } else { mutex_exit(&db->db_mtx); + } } } else { mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 5aef3977b4..27e3c959a4 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 2e2c972387..6e9ae37ebf 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -67,6 +67,21 @@ int zfs_condense_pct = 200; int zfs_condense_never = 0; /* + * Condensing a metaslab is not guaranteed to actually reduce the amount of + * space used on disk. In particular, a space map uses data in increments of + * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the + * same number of blocks after condensing. Since the goal of condensing is to + * reduce the number of IOPs required to read the space map, we only want to + * condense when we can be sure we will reduce the number of blocks used by the + * space map. Unfortunately, we cannot precisely compute whether or not this is + * the case in metaslab_should_condense since we are holding ms_lock. Instead, + * we apply the following heuristic: do not condense a spacemap unless the + * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold + * blocks. + */ +int zfs_metaslab_condense_block_threshold = 4; + +/* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * a free space. Metaslab groups that have more free space than @@ -1284,6 +1299,8 @@ metaslab_group_preload(metaslab_group_t *mg) * times the size than the free space range tree representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). * + * 3. The on-disk size of the space map should actually decrease. 
+ * * Checking the first condition is tricky since we don't want to walk * the entire AVL tree calculating the estimated on-disk size. Instead we * use the size-ordered range tree in the metaslab and calculate the @@ -1294,13 +1311,21 @@ metaslab_group_preload(metaslab_group_t *mg) * To determine the second criterion we use a best-case estimate and assume * each segment can be represented on-disk as a single 64-bit entry. We refer * to this best-case estimate as the space map's minimal form. + * + * Unfortunately, we cannot compute the on-disk size of the space map in this + * context because we cannot accurately compute the effects of compression, etc. + * Instead, we apply the heuristic described in the block comment for + * zfs_metaslab_condense_block_threshold - we only condense if the space used + * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; range_seg_t *rs; - uint64_t size, entries, segsz; + uint64_t size, entries, segsz, object_size, optimal_size, record_size; + dmu_object_info_t doi; + uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); @@ -1327,9 +1352,15 @@ metaslab_should_condense(metaslab_t *msp) entries = size / (MIN(size, SM_RUN_MAX)); segsz = entries * sizeof (uint64_t); - return (segsz <= space_map_length(msp->ms_sm) && - space_map_length(msp->ms_sm) >= (zfs_condense_pct * - sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100); + optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); + object_size = space_map_length(msp->ms_sm); + + dmu_object_info_from_db(sm->sm_dbuf, &doi); + record_size = MAX(doi.doi_data_block_size, vdev_blocksize); + + return (segsz <= object_size && + object_size >= (optimal_size * zfs_condense_pct / 100) && + object_size > zfs_metaslab_condense_block_threshold * record_size); } /* diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 14bd2a0176..0504362726 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -1876,6 +1876,16 @@ spa_writeable(spa_t *spa) return (!!(spa->spa_mode & FWRITE)); } +/* + * Returns true if there is a pending sync task in any of the current + * syncing txg, the current quiescing txg, or the current open txg. 
+ */ +boolean_t +spa_has_pending_synctask(spa_t *spa) +{ + return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks)); +} + int spa_mode(spa_t *spa) { diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 15ef0f58d0..04a20c13ee 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -95,7 +95,6 @@ boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); -int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); boolean_t arc_buf_eviction_needed(arc_buf_t *buf); @@ -114,7 +113,7 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); -int arc_buf_evict(arc_buf_t *buf); +boolean_t arc_clear_callback(arc_buf_t *buf); void arc_flush(spa_t *spa); void arc_tempreserve_clear(uint64_t reserve); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 36b6c93bde..6c7d34cb82 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -780,6 +780,7 @@ extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); +extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_mode(spa_t *spa); extern uint64_t strtonum(const char *str, char **nptr); diff --git a/usr/src/uts/common/fs/zfs/sys/txg.h b/usr/src/uts/common/fs/zfs/sys/txg.h index e96c1fa8aa..77e2dba251 100644 --- a/usr/src/uts/common/fs/zfs/sys/txg.h +++ b/usr/src/uts/common/fs/zfs/sys/txg.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_H @@ -112,6 +112,7 @@ extern boolean_t txg_sync_waiting(struct dsl_pool *dp); extern void txg_list_create(txg_list_t *tl, size_t offset); extern void txg_list_destroy(txg_list_t *tl); extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); +extern boolean_t txg_all_lists_empty(txg_list_t *tl); extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 9961a373ab..0326aa9c03 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -230,8 +230,11 @@ struct vdev { #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) +/* The largest uberblock we support is 8k. 
*/ +#define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ - MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) + MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ + MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 464ecc75fa..248b08501c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -165,19 +165,20 @@ enum zio_flag { ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_SCRUB = 1 << 4, ZIO_FLAG_SCAN_THREAD = 1 << 5, + ZIO_FLAG_PHYSICAL = 1 << 6, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ - ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ - ZIO_FLAG_SPECULATIVE = 1 << 7, - ZIO_FLAG_CONFIG_WRITER = 1 << 8, - ZIO_FLAG_DONT_RETRY = 1 << 9, - ZIO_FLAG_DONT_CACHE = 1 << 10, - ZIO_FLAG_NODATA = 1 << 11, - ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, + ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1 << 8, + ZIO_FLAG_CONFIG_WRITER = 1 << 9, + ZIO_FLAG_DONT_RETRY = 1 << 10, + ZIO_FLAG_DONT_CACHE = 1 << 11, + ZIO_FLAG_NODATA = 1 << 12, + ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -185,27 +186,27 @@ enum zio_flag { /* * Flags inherited by vdev children. */ - ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 14, - ZIO_FLAG_TRYHARD = 1 << 15, - ZIO_FLAG_OPTIONAL = 1 << 16, + ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 15, + ZIO_FLAG_TRYHARD = 1 << 16, + ZIO_FLAG_OPTIONAL = 1 << 17, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ - ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 18, - ZIO_FLAG_IO_BYPASS = 1 << 19, - ZIO_FLAG_IO_REWRITE = 1 << 20, - ZIO_FLAG_RAW = 1 << 21, - ZIO_FLAG_GANG_CHILD = 1 << 22, - ZIO_FLAG_DDT_CHILD = 1 << 23, - ZIO_FLAG_GODFATHER = 1 << 24, - ZIO_FLAG_NOPWRITE = 1 << 25, - ZIO_FLAG_REEXECUTED = 1 << 26, - ZIO_FLAG_DELEGATED = 1 << 27, + ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 19, + ZIO_FLAG_IO_BYPASS = 1 << 20, + ZIO_FLAG_IO_REWRITE = 1 << 21, + ZIO_FLAG_RAW = 1 << 22, + ZIO_FLAG_GANG_CHILD = 1 << 23, + ZIO_FLAG_DDT_CHILD = 1 << 24, + ZIO_FLAG_GODFATHER = 1 << 25, + ZIO_FLAG_NOPWRITE = 1 << 26, + ZIO_FLAG_REEXECUTED = 1 << 27, + ZIO_FLAG_DELEGATED = 1 << 28, }; #define ZIO_FLAG_MUSTSUCCEED 0 diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 5a7b3e85ee..915c9bb4b2 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -721,6 +721,24 @@ txg_list_empty(txg_list_t *tl, uint64_t txg) } /* + * Returns true if all txg lists are empty. + * + * Warning: this is inherently racy (an item could be added immediately after this + * function returns). We don't bother with the lock because it wouldn't change the + * semantics. 
+ */ +boolean_t +txg_all_lists_empty(txg_list_t *tl) +{ + for (int i = 0; i < TXG_SIZE; i++) { + if (!txg_list_empty(tl, i)) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +/* * Add an entry to the list (unless it's already on the list). * Returns B_TRUE if it was actually added. */ diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index da50548bcb..5d02f3e7ed 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -356,14 +356,23 @@ vdev_queue_class_min_active(zio_priority_t p) } static int -vdev_queue_max_async_writes(uint64_t dirty) +vdev_queue_max_async_writes(spa_t *spa) { int writes; + uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint64_t max_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_max_dirty_percent / 100; + /* + * Sync tasks correspond to interactive user actions. To reduce the + * execution time of those actions we push data out as fast as possible. + */ + if (spa_has_pending_synctask(spa)) { + return (zfs_vdev_async_write_max_active); + } + if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); if (dirty > max_bytes) @@ -396,8 +405,7 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes( - spa->spa_dsl_pool->dp_dirty_total)); + return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_max_active); default: diff --git a/usr/src/uts/common/fs/zfs/zfs_debug.c b/usr/src/uts/common/fs/zfs/zfs_debug.c index 85fa7600d9..a9cbe4dfe3 100644 --- a/usr/src/uts/common/fs/zfs/zfs_debug.c +++ b/usr/src/uts/common/fs/zfs/zfs_debug.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -57,7 +57,10 @@ zfs_dbgmsg_fini(void) * echo ::zfs_dbgmsg | mdb -k * * Monitor these messages by running: - * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * + * When used with libzpool, monitor with: + * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' */ void zfs_dbgmsg(const char *fmt, ...) 
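
Reviewer note (not part of the patch): a worked illustration of the condensing heuristic that metaslab_should_condense() gains earlier in this diff. The helper below is a hypothetical, simplified restatement (it omits the segsz <= object_size check); the numbers assume ashift = 12 (4K space-map records), zfs_condense_pct = 200 as in the surrounding context, and the new zfs_metaslab_condense_block_threshold = 4, so the on-disk space map must exceed 16K and be at least twice its minimal size before condensing is attempted.

/*
 * Hypothetical restatement of the new metaslab_should_condense() checks,
 * for illustration only; not taken verbatim from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int
should_condense(uint64_t object_size,	/* space_map_length() */
    uint64_t optimal_size,		/* 8 bytes per range-tree segment */
    uint64_t sm_block_size,		/* doi.doi_data_block_size */
    uint64_t vdev_blocksize,		/* 1 << vdev_ashift */
    int condense_pct,			/* zfs_condense_pct */
    int block_threshold)		/* zfs_metaslab_condense_block_threshold */
{
	uint64_t record_size = MAX(sm_block_size, vdev_blocksize);

	return (object_size >= optimal_size * condense_pct / 100 &&
	    object_size > (uint64_t)block_threshold * record_size);
}

int
main(void)
{
	/* 12K space map, 4K optimal: 3x the optimal size but only 3 blocks -> no. */
	printf("%d\n", should_condense(12 << 10, 4 << 10, 4096, 4096, 200, 4));
	/* 40K space map, 4K optimal: 10x the optimal size and 10 blocks -> yes. */
	printf("%d\n", should_condense(40 << 10, 4 << 10, 4096, 4096, 200, 4));
	return (0);
}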
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 93b5194caa..91be6c7741 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -849,8 +849,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_READ, priority, flags, vd, offset, NULL, - ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, + NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -870,8 +870,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, - ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, + NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -2542,7 +2542,9 @@ zio_vdev_io_start(zio_t *zio) align = 1ULL << vd->vdev_top->vdev_ashift; - if (P2PHASE(zio->io_size, align) != 0) { + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && + P2PHASE(zio->io_size, align) != 0) { + /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); char *abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); @@ -2553,8 +2555,22 @@ zio_vdev_io_start(zio_t *zio) zio_push_transform(zio, abuf, asize, asize, zio_subblock); } - ASSERT(P2PHASE(zio->io_offset, align) == 0); - ASSERT(P2PHASE(zio->io_size, align) == 0); + /* + * If this is not a physical io, make sure that it is properly aligned + * before proceeding. + */ + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { + ASSERT0(P2PHASE(zio->io_offset, align)); + ASSERT0(P2PHASE(zio->io_size, align)); + } else { + /* + * For physical writes, we allow 512b aligned writes and assume + * the device will perform a read-modify-write as necessary. + */ + ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); + ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); + } + VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* diff --git a/usr/src/uts/common/klm/klmops.c b/usr/src/uts/common/klm/klmops.c index a8adbe76c1..2036ead94d 100644 --- a/usr/src/uts/common/klm/klmops.c +++ b/usr/src/uts/common/klm/klmops.c @@ -123,9 +123,7 @@ lm4_shrlock(struct vnode *vp, int cmd, } /* - * Helper for lm_frlock, lm4_frlock, nfs_lockrelease - * After getting a lock from a remote lock manager, - * register the lock locally. + * Helper for nfs_lockrelease. */ void lm_register_lock_locally(struct vnode *vp, struct lm_sysid *ls, diff --git a/usr/src/uts/common/klm/nlm_service.c b/usr/src/uts/common/klm/nlm_service.c index e3e940ca12..d87e80448d 100644 --- a/usr/src/uts/common/klm/nlm_service.c +++ b/usr/src/uts/common/klm/nlm_service.c @@ -167,17 +167,18 @@ nlm_init_flock(struct flock64 *fl, struct nlm4_lock *nl, * This is just like nfs_fhtovp() but without the exportinfo argument. 
*/ static vnode_t * -lm_fhtovp(fhandle_t *fh) +lm_fhtovp(fhandle3_t *fh) { vfs_t *vfsp; vnode_t *vp; int error; - vfsp = getvfs(&fh->fh_fsid); + vfsp = getvfs(&fh->_fh3_fsid); if (vfsp == NULL) return (NULL); - error = VFS_VGET(vfsp, &vp, (fid_t *)&(fh->fh_len)); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + error = VFS_VGET(vfsp, &vp, (fid_t *)&(fh->_fh3_len)); VFS_RELE(vfsp); if (error || vp == NULL) return (NULL); @@ -193,28 +194,26 @@ lm_fhtovp(fhandle_t *fh) static vnode_t * nlm_fh_to_vp(struct netobj *fh) { - fhandle_t *fhp; + fhandle3_t *fhp; /* * Get a vnode pointer for the given NFS file handle. - * Note that it could be an NFSv2 for NFSv3 handle, + * Note that it could be an NFSv2 or NFSv3 handle, * which means the size might vary. (don't copy) */ - if (fh->n_len > MAX_NETOBJ_SZ || fh->n_len < sizeof (*fhp)) + if (fh->n_len < sizeof (fhandle_t)) return (NULL); /* We know this is aligned (kmem_alloc) */ /* LINTED E_BAD_PTR_CAST_ALIGN */ - fhp = (fhandle_t *)fh->n_bytes; + fhp = (fhandle3_t *)fh->n_bytes; /* * See the comment for NFS_FH3MAXDATA in uts/common/nfs/nfs.h for - * converting fhandles. Check the NFSv3 file handle size in case there - * is some unknown compatability issue here, even though we're only - * using the "legacy" fhandle_t struct. The lockmgr is not used for - * NFS v4. + * converting fhandles. Check the NFSv3 file handle size. The lockmgr + * is not used for NFS v4. */ - if (fhp->fh_len > NFS_FH3MAXDATA || fhp->fh_len == 0) + if (fhp->_fh3_len > NFS_FH3MAXDATA || fhp->_fh3_len == 0) return (NULL); return (lm_fhtovp(fhp)); |
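
Reviewer note (not part of the patch): the MAX_UBERBLOCK_SHIFT cap added to vdev_impl.h above lines up with the ztest change at the top of this diff, which now chooses an ashift up to SPA_MINBLOCKSHIFT + 4 = 13. A rough sketch of the arithmetic, assuming UBERBLOCK_SHIFT is 10 (1K minimum uberblock size): capping the per-uberblock size at 8K keeps a useful number of slots in the 128K uberblock ring even on large-ashift vdevs.

/*
 * Illustration only. UBERBLOCK_SHIFT == 10 is an assumption; the other
 * constants are taken from the diff.
 */
#include <stdio.h>

#define	UBERBLOCK_SHIFT		10
#define	MAX_UBERBLOCK_SHIFT	13
#define	VDEV_UBERBLOCK_RING	(128 << 10)

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	for (int ashift = 9; ashift <= 16; ashift++) {
		int old_shift = MAX(ashift, UBERBLOCK_SHIFT);
		int new_shift = MIN(old_shift, MAX_UBERBLOCK_SHIFT);
		printf("ashift %2d: %3d slots without cap, %3d with cap\n",
		    ashift, VDEV_UBERBLOCK_RING >> old_shift,
		    VDEV_UBERBLOCK_RING >> new_shift);
	}
	return (0);
}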