diff options
author | Serapheim Dimitropoulos <serapheim@delphix.com> | 2017-08-04 09:30:49 -0700 |
---|---|---|
committer | Prakash Surya <prakash.surya@delphix.com> | 2018-04-02 09:16:05 -0700 |
commit | 17f11284b49b98353b5119463254074fd9bc0a28 (patch) | |
tree | 7efffb8c98a425199065e78b99ba894b9127795f /usr/src/uts/common/fs/zfs/metaslab.c | |
parent | 1c10ae76c0cb31326c320e7cef1d3f24a1f47125 (diff) | |
download | illumos-gate-17f11284b49b98353b5119463254074fd9bc0a28.tar.gz |
9238 ZFS Spacemap Encoding V2
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@zfsmail.com>
Approved by: Gordon Ross <gwr@nexenta.com>
Diffstat (limited to 'usr/src/uts/common/fs/zfs/metaslab.c')
-rw-r--r-- | usr/src/uts/common/fs/zfs/metaslab.c | 58 |
1 files changed, 17 insertions, 41 deletions
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 25f5cc9649..d6fbefb320 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -2091,17 +2091,6 @@ metaslab_group_preload(metaslab_group_t *mg) * * 3. The on-disk size of the space map should actually decrease. * - * Checking the first condition is tricky since we don't want to walk - * the entire AVL tree calculating the estimated on-disk size. Instead we - * use the size-ordered range tree in the metaslab and calculate the - * size required to write out the largest segment in our free tree. If the - * size required to represent that segment on disk is larger than the space - * map object then we avoid condensing this map. - * - * To determine the second criterion we use a best-case estimate and assume - * each segment can be represented on-disk as a single 64-bit entry. We refer - * to this best-case estimate as the space map's minimal form. - * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for @@ -2112,9 +2101,6 @@ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; - range_seg_t *rs; - uint64_t size, entries, segsz, object_size, optimal_size, record_size; - dmu_object_info_t doi; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); @@ -2140,34 +2126,22 @@ metaslab_should_condense(metaslab_t *msp) msp->ms_condense_checked_txg = current_txg; /* - * Use the ms_allocatable_by_size range tree, which is ordered by - * size, to obtain the largest segment in the free tree. We always - * condense metaslabs that are empty and metaslabs for which a - * condense request has been made. + * We always condense metaslabs that are empty and metaslabs for + * which a condense request has been made. */ - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || msp->ms_condense_wanted) + if (avl_is_empty(&msp->ms_allocatable_by_size) || + msp->ms_condense_wanted) return (B_TRUE); - /* - * Calculate the number of 64-bit entries this segment would - * require when written to disk. If this single segment would be - * larger on-disk than the entire current on-disk structure, then - * clearly condensing will increase the on-disk structure size. - */ - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - entries = size / (MIN(size, SM_RUN_MAX)); - segsz = entries * sizeof (uint64_t); - - optimal_size = - sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root); - object_size = space_map_length(msp->ms_sm); + uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t optimal_size = space_map_estimate_optimal_size(sm, + msp->ms_allocatable, SM_NO_VDEVID); + dmu_object_info_t doi; dmu_object_info_from_db(sm->sm_dbuf, &doi); - record_size = MAX(doi.doi_data_block_size, vdev_blocksize); + uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (segsz <= object_size && - object_size >= (optimal_size * zfs_condense_pct / 100) && + return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } @@ -2242,11 +2216,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * optimal, this is typically close to optimal, and much cheaper to * compute. */ - space_map_write(sm, condense_tree, SM_ALLOC, tx); + space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_allocatable, SM_FREE, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2358,8 +2332,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); - space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); - space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, + SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } @@ -2374,7 +2350,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, - msp->ms_checkpointing, SM_FREE, tx); + msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); space_map_update(vd->vdev_checkpoint_sm); |