author    George Wilson <George.Wilson@Sun.COM>  2009-04-27 21:16:23 -0700
committer George Wilson <George.Wilson@Sun.COM>  2009-04-27 21:16:23 -0700
commit    d6e555bdd793b8bc8fe57d5f12c3d69c813d0661
tree      698ef7698b2a19cc28ec8cd8679f02cc122b4474
parent    7f84ffd07dc204d03bf141de7c435181b5a4d039
download  illumos-gate-d6e555bdd793b8bc8fe57d5f12c3d69c813d0661.tar.gz
6596237 Stop looking and start ganging
-rw-r--r--  usr/src/cmd/mdb/common/modules/zfs/zfs.c       |   2
-rw-r--r--  usr/src/cmd/zdb/Makefile.com                   |   4
-rw-r--r--  usr/src/cmd/zdb/zdb.c                          |  69
-rw-r--r--  usr/src/cmd/ztest/ztest.c                      |   7
-rw-r--r--  usr/src/lib/libzpool/common/llib-lzpool        |   1
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c           | 242
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c                |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/space_map.c          |  37
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab.h       |   6
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab_impl.h  |   5
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/space_map.h      |   6
11 files changed, 316 insertions(+), 67 deletions(-)
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index 8a9be77c5f..d55ce1d0c5 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -377,6 +377,8 @@ zfs_params(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"zil_disable",
"zfs_nocacheflush",
"metaslab_gang_bang",
+ "metaslab_df_alloc_threshold",
+ "metaslab_df_free_pct",
"zio_injection_enabled",
"zvol_immediate_write_sz",
};
diff --git a/usr/src/cmd/zdb/Makefile.com b/usr/src/cmd/zdb/Makefile.com
index f7aacf97ce..de7f5a4a24 100644
--- a/usr/src/cmd/zdb/Makefile.com
+++ b/usr/src/cmd/zdb/Makefile.com
@@ -20,7 +20,7 @@
#
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -33,7 +33,7 @@ include ../../Makefile.cmd
INCS += -I../../../lib/libzpool/common
INCS += -I../../../uts/common/fs/zfs
-LDLIBS += -lzpool -lumem -lnvpair -lzfs
+LDLIBS += -lzpool -lumem -lnvpair -lzfs -lavl
C99MODE= -xc99=%all
C99LMODE= -Xc99=%all
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 74546c2cf0..9c84410aa1 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -102,6 +102,7 @@ usage(void)
(void) fprintf(stderr, " -C cached pool configuration\n");
(void) fprintf(stderr, " -i intent logs\n");
(void) fprintf(stderr, " -b block statistics\n");
+ (void) fprintf(stderr, " -m metaslabs\n");
(void) fprintf(stderr, " -c checksum all metadata (twice for "
"all data) blocks\n");
(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
@@ -473,6 +474,21 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
}
static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+ char maxbuf[5];
+ space_map_t *sm = &msp->ms_map;
+ avl_tree_t *t = sm->sm_pp_root;
+ int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+ nicenum(space_map_maxsize(sm), maxbuf);
+
+ (void) printf("\t %20s %10lu %7s %6s %4s %4d%%\n",
+ "segments", avl_numnodes(t), "maxsize", maxbuf,
+ "freepct", free_pct);
+}
+
+static void
dump_metaslab(metaslab_t *msp)
{
char freebuf[5];
@@ -482,22 +498,28 @@ dump_metaslab(metaslab_t *msp)
nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
- if (dump_opt['d'] <= 5) {
- (void) printf("\t%10llx %10llu %5s\n",
- (u_longlong_t)msp->ms_map.sm_start,
- (u_longlong_t)smo->smo_object,
- freebuf);
- return;
- }
-
(void) printf(
- "\tvdev %llu offset %08llx spacemap %4llu free %5s\n",
+ "\tvdev %5llu offset %12llx spacemap %6llu free %5s\n",
(u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
(u_longlong_t)smo->smo_object, freebuf);
- ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+ if (dump_opt['m'] > 1) {
+ mutex_enter(&msp->ms_lock);
+ VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
+ SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
+ dump_metaslab_stats(msp);
+ space_map_unload(&msp->ms_map);
+ mutex_exit(&msp->ms_lock);
+ }
+
+ if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
+ ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+
+ mutex_enter(&msp->ms_lock);
+ dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+ mutex_exit(&msp->ms_lock);
+ }
- dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
}
static void
@@ -512,14 +534,12 @@ dump_metaslabs(spa_t *spa)
for (c = 0; c < rvd->vdev_children; c++) {
vd = rvd->vdev_child[c];
- (void) printf("\n vdev %llu\n\n", (u_longlong_t)vd->vdev_id);
+ (void) printf("\t%-10s %-19s %-15s %-10s\n",
+ "vdev", "offset", "spacemap", "free");
+ (void) printf("\t%10s %19s %15s %10s\n",
+ "----------", "-------------------",
+ "---------------", "-------------");
- if (dump_opt['d'] <= 5) {
- (void) printf("\t%10s %10s %5s\n",
- "offset", "spacemap", "free");
- (void) printf("\t%10s %10s %5s\n",
- "------", "--------", "----");
- }
for (m = 0; m < vd->vdev_ms_count; m++)
dump_metaslab(vd->vdev_ms[m]);
(void) printf("\n");
@@ -1419,7 +1439,8 @@ static space_map_ops_t zdb_space_map_ops = {
zdb_space_map_unload,
NULL, /* alloc */
zdb_space_map_claim,
- NULL /* free */
+ NULL, /* free */
+ NULL /* maxsize */
};
static void
@@ -1809,14 +1830,17 @@ dump_zpool(spa_t *spa)
if (dump_opt['u'])
dump_uberblock(&spa->spa_uberblock);
- if (dump_opt['d'] || dump_opt['i']) {
+ if (dump_opt['d'] || dump_opt['i'] || dump_opt['m']) {
dump_dir(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
dump_bplist(dp->dp_meta_objset,
spa->spa_sync_bplist_obj, "Deferred frees");
dump_dtl(spa->spa_root_vdev, 0);
- dump_metaslabs(spa);
}
+
+ if (dump_opt['d'] >= 3 || dump_opt['m'])
+ dump_metaslabs(spa);
+
(void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL,
DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
}
@@ -2292,13 +2316,14 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:t:")) != -1) {
+ while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) {
switch (c) {
case 'u':
case 'd':
case 'i':
case 'b':
case 'c':
+ case 'm':
case 's':
case 'C':
case 'l':
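A note on the new -m option added above: zdb increments dump_opt[] once per repetition of a flag, so, judging from the dump_opt['m'] > 1 and dump_opt['m'] > 2 tests in dump_metaslab(), an invocation such as "zdb -m pool" should print the per-vdev metaslab table, "zdb -mm pool" should additionally load each space map and print the segments/maxsize/freepct statistics, and "zdb -mmm pool" should also dump the space maps themselves. These example invocations are inferred from the code above rather than taken from the commit text.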
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index c9beb00b20..304024c951 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -248,9 +248,11 @@ static ztest_shared_t *ztest_shared;
static int ztest_random_fd;
static int ztest_dump_core = 1;
+static uint64_t metaslab_sz;
static boolean_t ztest_exiting;
extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
#define ZTEST_DIROBJ 1
#define ZTEST_MICROZAP_OBJ 2
@@ -3767,6 +3769,8 @@ ztest_init(char *pool)
if (error)
fatal(0, "spa_open() = %d", error);
+ metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+
if (zopt_verbose >= 3)
show_pool_stats(spa);
@@ -3858,6 +3862,9 @@ main(int argc, char **argv)
zi->zi_call_time = 0;
}
+ /* Set the allocation switch size */
+ metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1;
+
pid = fork();
if (pid == -1)
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index 276c2eb6ad..44a7d4a28a 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -49,3 +49,4 @@
#include <sys/arc.h>
extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 412832968d..77556ac5d7 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,18 +36,35 @@ uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size, it switches to a more aggressive
+ * strategy (i.e., searching by size rather than by offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space_map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 30;
+
+/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(void)
+metaslab_class_create(space_map_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
mc->mc_rotor = NULL;
+ mc->mc_ops = ops;
return (mc);
}
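To make the policy described by the two tunables above concrete, here is a minimal standalone sketch (not part of this commit). The use_best_fit() helper and its sample numbers are hypothetical; in the commit itself the equivalent decision is made inside metaslab_df_alloc(), further down in this diff.

/*
 * Illustrative sketch only: the switch policy described by the
 * metaslab_df_alloc_threshold / metaslab_df_free_pct comments above,
 * reduced to a pure function.
 */
#include <stdint.h>
#include <stdio.h>

#define	SPA_MAXBLOCKSIZE	(128ULL << 10)	/* 128K at the time of this commit */

static uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
static int metaslab_df_free_pct = 30;

/*
 * Return nonzero when the allocator should abandon first-fit (offset
 * ordered, cursor based) searching and fall back to best-fit (size
 * ordered) searching.
 */
static int
use_best_fit(uint64_t max_free_seg, uint64_t sm_space, uint64_t sm_size)
{
	int free_pct = (int)(sm_space * 100 / sm_size);

	return (max_free_seg < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct);
}

int
main(void)
{
	/* A 1GB metaslab, 200MB free, largest free segment 64K. */
	(void) printf("best-fit? %d\n",
	    use_best_fit(64ULL << 10, 200ULL << 20, 1ULL << 30));
	return (0);
}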
@@ -202,30 +219,14 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
}
/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
*/
-static void
-metaslab_ff_load(space_map_t *sm)
-{
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
-}
-
static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+ uint64_t align)
{
- avl_tree_t *t = &sm->sm_root;
- uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
space_seg_t *ss, ssearch;
avl_index_t where;
@@ -254,7 +255,37 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size)
return (-1ULL);
*cursor = 0;
- return (metaslab_ff_alloc(sm, size));
+ return (metaslab_block_picker(t, cursor, size, align));
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static void
+metaslab_ff_load(space_map_t *sm)
+{
+ ASSERT(sm->sm_ppd == NULL);
+ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+ sm->sm_pp_root = NULL;
+}
+
+static void
+metaslab_ff_unload(space_map_t *sm)
+{
+ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+ sm->sm_ppd = NULL;
+}
+
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+
+ return (metaslab_block_picker(t, cursor, size, align));
}
/* ARGSUSED */
@@ -276,9 +307,136 @@ static space_map_ops_t metaslab_ff_ops = {
metaslab_ff_unload,
metaslab_ff_alloc,
metaslab_ff_claim,
- metaslab_ff_free
+ metaslab_ff_free,
+ NULL /* maxsize */
+};
+
+/*
+ * Dynamic block allocator -
+ * Uses the first-fit allocation scheme until space gets low and then
+ * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ */
+
+uint64_t
+metaslab_df_maxsize(space_map_t *sm)
+{
+ avl_tree_t *t = sm->sm_pp_root;
+ space_seg_t *ss;
+
+ if (t == NULL || (ss = avl_last(t)) == NULL)
+ return (0ULL);
+
+ return (ss->ss_end - ss->ss_start);
+}
+
+static int
+metaslab_df_seg_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+ uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+ uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+ if (ss_size1 < ss_size2)
+ return (-1);
+ if (ss_size1 > ss_size2)
+ return (1);
+
+ if (s1->ss_start < s2->ss_start)
+ return (-1);
+ if (s1->ss_start > s2->ss_start)
+ return (1);
+
+ return (0);
+}
+
+static void
+metaslab_df_load(space_map_t *sm)
+{
+ space_seg_t *ss;
+
+ ASSERT(sm->sm_ppd == NULL);
+ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+
+ sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
+ sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+ avl_add(sm->sm_pp_root, ss);
+}
+
+static void
+metaslab_df_unload(space_map_t *sm)
+{
+ void *cookie = NULL;
+
+ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+ sm->sm_ppd = NULL;
+
+ while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+ /* tear down the tree */
+ }
+
+ avl_destroy(sm->sm_pp_root);
+ kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+ sm->sm_pp_root = NULL;
+}
+
+static uint64_t
+metaslab_df_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+ uint64_t max_size = metaslab_df_maxsize(sm);
+ int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ /*
+	 * If we're running low on space, switch to using the
+	 * size-sorted AVL tree (best-fit).
+ */
+ if (max_size < metaslab_df_alloc_threshold ||
+ free_pct < metaslab_df_free_pct) {
+ t = sm->sm_pp_root;
+ *cursor = 0;
+ }
+
+ return (metaslab_block_picker(t, cursor, size, 1ULL));
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_df_ops = {
+ metaslab_df_load,
+ metaslab_df_unload,
+ metaslab_df_alloc,
+ metaslab_df_claim,
+ metaslab_df_free,
+ metaslab_df_maxsize
};
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+
/*
* ==========================================================================
* Metaslabs
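For illustration (not part of this commit), the ordering imposed by metaslab_df_seg_compare() above, size first with the starting offset as the tie-breaker, can be demonstrated with qsort() on a plain array standing in for the AVL tree; seg_t and seg_size_compare() below are hypothetical names. Because the largest segment always sorts last, metaslab_df_maxsize() can answer "what is the biggest free chunk?" with a single avl_last() lookup.

/*
 * Illustrative sketch only: the same size-then-offset ordering that
 * metaslab_df_seg_compare() imposes on sm_pp_root, shown with qsort()
 * on a plain array instead of an AVL tree.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct seg {
	uint64_t ss_start;
	uint64_t ss_end;	/* non-inclusive */
} seg_t;

static int
seg_size_compare(const void *x1, const void *x2)
{
	const seg_t *s1 = x1;
	const seg_t *s2 = x2;
	uint64_t sz1 = s1->ss_end - s1->ss_start;
	uint64_t sz2 = s2->ss_end - s2->ss_start;

	if (sz1 != sz2)
		return (sz1 < sz2 ? -1 : 1);
	/* Ties are broken by starting offset, as in the commit. */
	if (s1->ss_start != s2->ss_start)
		return (s1->ss_start < s2->ss_start ? -1 : 1);
	return (0);
}

int
main(void)
{
	seg_t segs[] = {
		{ 0x10000, 0x18000 },	/* 32K */
		{ 0x40000, 0x60000 },	/* 128K */
		{ 0x00000, 0x08000 },	/* 32K, earlier offset */
	};
	size_t n = sizeof (segs) / sizeof (segs[0]);

	qsort(segs, n, sizeof (seg_t), seg_size_compare);

	/* The "maxsize" of this map is the size of the last element. */
	(void) printf("maxsize = %llu\n",
	    (unsigned long long)(segs[n - 1].ss_end - segs[n - 1].ss_start));
	return (0);
}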
@@ -414,20 +572,28 @@ metaslab_weight(metaslab_t *msp)
}
static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
space_map_t *sm = &msp->ms_map;
+ space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = space_map_load(sm, &metaslab_ff_ops,
- SM_FREE, &msp->ms_smo,
+ int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
if (error) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
+
+ /*
+ * If we were able to load the map then make sure
+ * that this map is still able to satisfy our request.
+ */
+ if (msp->ms_weight < size)
+ return (ENOSPC);
+
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
@@ -636,11 +802,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
int i;
activation_weight = METASLAB_WEIGHT_PRIMARY;
- for (i = 0; i < d; i++)
- if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+ for (i = 0; i < d; i++) {
+ if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
activation_weight = METASLAB_WEIGHT_SECONDARY;
+ break;
+ }
+ }
for (;;) {
+ boolean_t was_active;
+
mutex_enter(&mg->mg_lock);
for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
if (msp->ms_weight < size) {
@@ -648,6 +819,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
return (-1ULL);
}
+ was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
@@ -673,7 +845,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
- if (msp->ms_weight < size) {
+ if (msp->ms_weight < size || (was_active &&
+ !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY)) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -686,7 +860,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
continue;
}
- if (metaslab_activate(msp, activation_weight) != 0) {
+ if (metaslab_activate(msp, activation_weight, size) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -869,7 +1043,7 @@ next:
goto top;
}
- if (!zio_lock) {
+ if (!allocatable && !zio_lock) {
dshift = 3;
zio_lock = B_TRUE;
goto top;
@@ -955,7 +1129,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock);
return (error);
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 79f4bc91aa..b852bd039f 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -541,8 +541,8 @@ spa_activate(spa_t *spa, int mode)
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
- spa->spa_normal_class = metaslab_class_create();
- spa->spa_log_class = metaslab_class_create();
+ spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
index 1cdacc81da..4aa2394138 100644
--- a/usr/src/uts/common/fs/zfs/space_map.c
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -116,12 +116,23 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
if (merge_before && merge_after) {
avl_remove(&sm->sm_root, ss_before);
+ if (sm->sm_pp_root) {
+ avl_remove(sm->sm_pp_root, ss_before);
+ avl_remove(sm->sm_pp_root, ss_after);
+ }
ss_after->ss_start = ss_before->ss_start;
kmem_free(ss_before, sizeof (*ss_before));
+ ss = ss_after;
} else if (merge_before) {
ss_before->ss_end = end;
+ if (sm->sm_pp_root)
+ avl_remove(sm->sm_pp_root, ss_before);
+ ss = ss_before;
} else if (merge_after) {
ss_after->ss_start = start;
+ if (sm->sm_pp_root)
+ avl_remove(sm->sm_pp_root, ss_after);
+ ss = ss_after;
} else {
ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
ss->ss_start = start;
@@ -129,6 +140,9 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
avl_insert(&sm->sm_root, ss, where);
}
+ if (sm->sm_pp_root)
+ avl_add(sm->sm_pp_root, ss);
+
sm->sm_space += size;
}
@@ -163,12 +177,17 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
left_over = (ss->ss_start != start);
right_over = (ss->ss_end != end);
+ if (sm->sm_pp_root)
+ avl_remove(sm->sm_pp_root, ss);
+
if (left_over && right_over) {
newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
newseg->ss_start = end;
newseg->ss_end = ss->ss_end;
ss->ss_end = start;
avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+ if (sm->sm_pp_root)
+ avl_add(sm->sm_pp_root, newseg);
} else if (left_over) {
ss->ss_end = start;
} else if (right_over) {
@@ -176,8 +195,12 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
} else {
avl_remove(&sm->sm_root, ss);
kmem_free(ss, sizeof (*ss));
+ ss = NULL;
}
+ if (sm->sm_pp_root && ss != NULL)
+ avl_add(sm->sm_pp_root, ss);
+
sm->sm_space -= size;
}
@@ -315,6 +338,9 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
if (ops != NULL)
ops->smop_load(sm);
} else {
+ if (ops != NULL)
+ ops->smop_unload(sm);
+ sm->sm_ops = NULL;
space_map_vacate(sm, NULL, NULL);
}
@@ -342,6 +368,15 @@ space_map_unload(space_map_t *sm)
}
uint64_t
+space_map_maxsize(space_map_t *sm)
+{
+ if (sm->sm_loaded && sm->sm_ops != NULL)
+ return (sm->sm_ops->smop_max(sm));
+ else
+ return (-1ULL);
+}
+
+uint64_t
space_map_alloc(space_map_t *sm, uint64_t size)
{
uint64_t start;
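One design point worth calling out in the space_map.c changes above: space_map_add() and space_map_remove() always take a segment out of the size-ordered sm_pp_root tree before resizing it and re-insert it afterwards, because mutating a node's sort key while it is still linked into the tree would silently break the ordering. The standalone sketch below (not part of this commit, with a sorted array as a hypothetical stand-in for the AVL tree) shows the failure mode the remove/re-add dance avoids.

/*
 * Illustrative sketch only: resizing an entry of a size-ordered index
 * in place breaks the ordering that a later "largest segment" lookup
 * depends on; removing, resizing, and re-adding keeps it correct.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct seg {
	uint64_t ss_start;
	uint64_t ss_end;
} seg_t;

static int
by_size(const void *a, const void *b)
{
	uint64_t sa = ((const seg_t *)a)->ss_end - ((const seg_t *)a)->ss_start;
	uint64_t sb = ((const seg_t *)b)->ss_end - ((const seg_t *)b)->ss_start;

	return (sa < sb ? -1 : (sa > sb ? 1 : 0));
}

int
main(void)
{
	seg_t idx[3] = {
		{ 0x0000, 0x2000 },	/* 8K */
		{ 0x8000, 0xc000 },	/* 16K */
		{ 0x4000, 0x7000 },	/* 12K */
	};

	qsort(idx, 3, sizeof (seg_t), by_size);	/* now 8K, 12K, 16K */

	/*
	 * Grow the first (smallest) segment in place to 32K, the way a
	 * merge in space_map_add() grows ss_before/ss_after.  Without a
	 * remove + re-add, the "largest segment" answer is now stale.
	 */
	idx[0].ss_end = idx[0].ss_start + 0x8000;
	(void) printf("in-place:     largest = %llu (wrong)\n",
	    (unsigned long long)(idx[2].ss_end - idx[2].ss_start));

	/* Re-establishing the ordering (the re-add) fixes the answer. */
	qsort(idx, 3, sizeof (seg_t), by_size);
	(void) printf("after re-add: largest = %llu\n",
	    (unsigned long long)(idx[2].ss_end - idx[2].ss_start));
	return (0);
}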
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index 1c9d89e8fd..5d3e11c971 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,6 +39,8 @@ extern "C" {
typedef struct metaslab_class metaslab_class_t;
typedef struct metaslab_group metaslab_group_t;
+extern space_map_ops_t *zfs_metaslab_ops;
+
extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
uint64_t start, uint64_t size, uint64_t txg);
extern void metaslab_fini(metaslab_t *msp);
@@ -55,7 +57,7 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern metaslab_class_t *metaslab_class_create(void);
+extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc);
extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index 5980cbc843..d67dea7e97 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_METASLAB_IMPL_H
#define _SYS_METASLAB_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/metaslab.h>
#include <sys/space_map.h>
#include <sys/vdev.h>
@@ -41,6 +39,7 @@ extern "C" {
struct metaslab_class {
metaslab_group_t *mc_rotor;
uint64_t mc_allocated;
+ space_map_ops_t *mc_ops;
};
struct metaslab_group {
diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h
index 8d7860660c..a682bbd409 100644
--- a/usr/src/uts/common/fs/zfs/sys/space_map.h
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -46,12 +46,14 @@ typedef struct space_map {
uint8_t sm_loading; /* map loading? */
kcondvar_t sm_load_cv; /* map load completion */
space_map_ops_t *sm_ops; /* space map block picker ops vector */
+ avl_tree_t *sm_pp_root; /* picker-private AVL tree */
void *sm_ppd; /* picker-private data */
kmutex_t *sm_lock; /* pointer to lock that protects map */
} space_map_t;
typedef struct space_seg {
avl_node_t ss_node; /* AVL node */
+ avl_node_t ss_pp_node; /* AVL picker-private node */
uint64_t ss_start; /* starting offset of this segment */
uint64_t ss_end; /* ending offset (non-inclusive) */
} space_seg_t;
@@ -74,6 +76,7 @@ struct space_map_ops {
uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
+ uint64_t (*smop_max)(space_map_t *sm);
};
/*
@@ -152,6 +155,7 @@ extern void space_map_unload(space_map_t *sm);
extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
+extern uint64_t space_map_maxsize(space_map_t *sm);
extern void space_map_sync(space_map_t *sm, uint8_t maptype,
space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);