author     Ryan Zezeski <rpz@joyent.com>  2019-11-14 09:39:53 -0700
committer  Ryan Zezeski <rpz@joyent.com>  2019-11-14 09:39:53 -0700
commit     074bf480b3d9701c3c55056fe6105028504135b6 (patch)
tree       9bd8568e7caa13fc1b13146260ba82af1827ebbc
parent     27bc3ef3b6dd5a071a0607d96af5eec24ca5d276 (diff)
parent     43ef85afe5649116d876156ca6eb797e144c9795 (diff)
Merge remote-tracking branch 'origin/master' into cr6990-OS-8027
-rw-r--r--  usr/src/boot/Makefile.version | 2
-rw-r--r--  usr/src/boot/lib/libstand/zfs/zfsimpl.c | 256
-rw-r--r--  usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h | 97
-rw-r--r--  usr/src/cmd/mdb/common/modules/zfs/zfs.c | 46
-rw-r--r--  usr/src/cmd/zdb/zdb.c | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c | 17
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c | 791
-rw-r--r--  usr/src/uts/common/fs/zfs/range_tree.c | 30
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_log_spacemap.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab.h | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/metaslab_impl.h | 21
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/range_tree.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_initialize.c | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_trim.c | 19
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c | 2
-rw-r--r--  usr/src/uts/common/io/mac/mac_datapath_setup.c | 6
-rw-r--r--  usr/src/uts/common/io/mac/mac_soft_ring.c | 3
-rw-r--r--  usr/src/uts/common/io/usb/usba/usbai.c | 12
-rw-r--r--  usr/src/uts/common/sys/mac_soft_ring.h | 2
-rw-r--r--  usr/src/uts/common/xen/io/evtchn_dev.c | 5
-rw-r--r--  usr/src/uts/i86pc/io/immu_intrmap.c | 15
-rw-r--r--  usr/src/uts/i86pc/io/immu_regs.c | 6
-rw-r--r--  usr/src/uts/i86pc/sys/immu.h | 14
-rw-r--r--  usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c | 4
-rw-r--r--  usr/src/uts/i86xpv/io/psm/xpv_psm.c | 10
27 files changed, 1066 insertions(+), 328 deletions(-)
diff --git a/usr/src/boot/Makefile.version b/usr/src/boot/Makefile.version
index 9d40ee8993..a161b24487 100644
--- a/usr/src/boot/Makefile.version
+++ b/usr/src/boot/Makefile.version
@@ -33,4 +33,4 @@ LOADER_VERSION = 1.1
# Use date like formatting here, YYYY.MM.DD.XX, without leading zeroes.
# The version is processed from left to right, the version number can only
# be increased.
-BOOT_VERSION = $(LOADER_VERSION)-2019.11.04.1
+BOOT_VERSION = $(LOADER_VERSION)-2019.11.05.1
diff --git a/usr/src/boot/lib/libstand/zfs/zfsimpl.c b/usr/src/boot/lib/libstand/zfs/zfsimpl.c
index e595273c9b..fba9f1fc59 100644
--- a/usr/src/boot/lib/libstand/zfs/zfsimpl.c
+++ b/usr/src/boot/lib/libstand/zfs/zfsimpl.c
@@ -1534,71 +1534,104 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offset)
}
static int
-vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
+{
+ unsigned int seq1 = 0;
+ unsigned int seq2 = 0;
+ int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+
+ if (cmp != 0)
+ return (cmp);
+
+ cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+ if (cmp != 0)
+ return (cmp);
+
+ if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+ seq1 = MMP_SEQ(ub1);
+
+ if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+ seq2 = MMP_SEQ(ub2);
+
+ return (AVL_CMP(seq1, seq2));
+}
+
+static int
+uberblock_verify(uberblock_t *ub)
+{
+ if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
+ byteswap_uint64_array(ub, sizeof (uberblock_t));
+ }
+
+ if (ub->ub_magic != UBERBLOCK_MAGIC ||
+ !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
+ return (EINVAL);
+
+ return (0);
+}
+
+static int
+vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
+ size_t size)
{
- vdev_t vtmp;
- vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
- vdev_phys_t *tmp_label;
- spa_t *spa;
- vdev_t *vdev, *top_vdev, *pool_vdev;
- off_t off;
blkptr_t bp;
- const unsigned char *nvlist = NULL;
- uint64_t val;
- uint64_t guid;
- uint64_t best_txg = 0;
- uint64_t pool_txg, pool_guid;
- const char *pool_name;
- const unsigned char *vdevs;
- const unsigned char *features;
- int i, l, rc, is_newer;
- char *upbuf;
- const struct uberblock *up;
+ off_t off;
- /*
- * Load the vdev label and figure out which
- * uberblock is most current.
- */
- memset(&vtmp, 0, sizeof(vtmp));
- vtmp.v_phys_read = phys_read;
- vtmp.v_read_priv = read_priv;
- vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
- (uint64_t)sizeof (vdev_label_t));
+ off = vdev_label_offset(vd->v_psize, l, offset);
- /* Test for minimum device size. */
- if (vtmp.v_psize < SPA_MINDEVSIZE)
- return (EIO);
+ BP_ZERO(&bp);
+ BP_SET_LSIZE(&bp, size);
+ BP_SET_PSIZE(&bp, size);
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
+ ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
- tmp_label = zfs_alloc(sizeof (vdev_phys_t));
+ return (vdev_read_phys(vd, &bp, buf, off, size));
+}
- for (l = 0; l < VDEV_LABELS; l++) {
- off = vdev_label_offset(vtmp.v_psize, l,
- offsetof(vdev_label_t, vl_vdev_phys));
+static unsigned char *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+ vdev_phys_t *label;
+ uint64_t best_txg = 0;
+ uint64_t label_txg = 0;
+ uint64_t asize;
+ unsigned char *nvl;
+ size_t nvl_size;
+ int error;
+
+ label = malloc(sizeof (vdev_phys_t));
+ if (label == NULL)
+ return (NULL);
- BP_ZERO(&bp);
- BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
- BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
- BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
- BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
- DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
- ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+ nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
+ nvl = malloc(nvl_size);
+ if (nvl == NULL)
+ goto done;
- if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
- continue;
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ const unsigned char *nvlist;
- if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
+ if (vdev_label_read(vd, l, label,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t)))
continue;
- nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
- if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
- DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
+ if (label->vp_nvlist[0] != NV_ENCODE_XDR)
continue;
- if (best_txg <= pool_txg) {
- uint64_t asize;
+ nvlist = (const unsigned char *) label->vp_nvlist + 4;
+ error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
+ DATA_TYPE_UINT64, NULL, &label_txg);
+ if (error != 0 || label_txg == 0) {
+ memcpy(nvl, nvlist, nvl_size);
+ goto done;
+ }
- best_txg = pool_txg;
- memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
+ if (label_txg <= txg && label_txg > best_txg) {
+ best_txg = label_txg;
+ memcpy(nvl, nvlist, nvl_size);
/*
* Use asize from pool config. We need this
@@ -1606,30 +1639,89 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
*/
if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
DATA_TYPE_UINT64, NULL, &asize) == 0) {
- vtmp.v_psize = asize +
+ vd->v_psize = asize +
VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
}
}
}
- zfs_free(tmp_label, sizeof (vdev_phys_t));
+ if (best_txg == 0) {
+ free(nvl);
+ nvl = NULL;
+ }
+done:
+ free(label);
+ return (nvl);
+}
+
+static void
+vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
+{
+ uberblock_t *buf;
+
+ buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
+ if (buf == NULL)
+ return;
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ if (vdev_label_read(vd, l, buf,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd)))
+ continue;
+ if (uberblock_verify(buf) != 0)
+ continue;
+
+ if (vdev_uberblock_compare(buf, ub) > 0)
+ *ub = *buf;
+ }
+ }
+ free(buf);
+}
+
+static int
+vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
+{
+ vdev_t vtmp;
+ spa_t *spa;
+ vdev_t *vdev, *top_vdev, *pool_vdev;
+ unsigned char *nvlist;
+ uint64_t val;
+ uint64_t guid;
+ uint64_t pool_txg, pool_guid;
+ const char *pool_name;
+ const unsigned char *vdevs;
+ const unsigned char *features;
+ int rc, is_newer;
- if (best_txg == 0)
- return (EIO);
+ /*
+ * Load the vdev label and figure out which
+ * uberblock is most current.
+ */
+ memset(&vtmp, 0, sizeof (vtmp));
+ vtmp.v_phys_read = phys_read;
+ vtmp.v_read_priv = read_priv;
+ vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
+ (uint64_t)sizeof (vdev_label_t));
- if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
+ /* Test for minimum device size. */
+ if (vtmp.v_psize < SPA_MINDEVSIZE)
return (EIO);
- nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
+ nvlist = vdev_label_read_config(&vtmp, UINT64_MAX);
+ if (nvlist == NULL)
+ return (EIO);
if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
NULL, &val) != 0) {
+ free(nvlist);
return (EIO);
}
if (!SPA_VERSION_IS_SUPPORTED(val)) {
printf("ZFS: unsupported ZFS version %u (should be %u)\n",
(unsigned) val, (unsigned) SPA_VERSION);
+ free(nvlist);
return (EIO);
}
@@ -1637,16 +1729,19 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
DATA_TYPE_NVLIST, NULL, &features) == 0 &&
nvlist_check_features_for_read(features) != 0) {
+ free(nvlist);
return (EIO);
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
NULL, &val) != 0) {
+ free(nvlist);
return (EIO);
}
if (val == POOL_STATE_DESTROYED) {
/* We don't boot only from destroyed pools. */
+ free(nvlist);
return (EIO);
}
@@ -1660,12 +1755,13 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
* Cache and spare devices end up here - just ignore
* them.
*/
- /*printf("ZFS: can't find pool details\n");*/
+ free(nvlist);
return (EIO);
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
NULL, &val) == 0 && val != 0) {
+ free(nvlist);
return (EIO);
}
@@ -1675,8 +1771,10 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
spa = spa_find_by_guid(pool_guid);
if (spa == NULL) {
spa = spa_create(pool_guid, pool_name);
- if (spa == NULL)
+ if (spa == NULL) {
+ free(nvlist);
return (ENOMEM);
+ }
}
if (pool_txg > spa->spa_txg) {
spa->spa_txg = pool_txg;
@@ -1693,18 +1791,24 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
*/
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &guid) != 0) {
+ free(nvlist);
return (EIO);
}
vdev = vdev_find(guid);
- if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
+ /* Has this vdev already been inited? */
+ if (vdev && vdev->v_phys_read) {
+ free(nvlist);
return (EIO);
+ }
if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &vdevs)) {
+ free(nvlist);
return (EIO);
}
rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
+ free(nvlist);
if (rc != 0)
return (rc);
@@ -1714,6 +1818,7 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
if (top_vdev == pool_vdev)
break;
+
if (!pool_vdev && top_vdev) {
top_vdev->spa = spa;
STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
@@ -1748,36 +1853,7 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
* the best uberblock and then we can actually access
* the contents of the pool.
*/
- upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
- up = (const struct uberblock *)upbuf;
- for (l = 0; l < VDEV_LABELS; l++) {
- for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
- off = vdev_label_offset(vdev->v_psize, l,
- VDEV_UBERBLOCK_OFFSET(vdev, i));
- BP_ZERO(&bp);
- DVA_SET_OFFSET(&bp.blk_dva[0], off);
- BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
- BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
- BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
- BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
- ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
-
- if (vdev_read_phys(vdev, &bp, upbuf, off, 0) != 0)
- continue;
-
- if (up->ub_magic != UBERBLOCK_MAGIC)
- continue;
- if (up->ub_txg < spa->spa_txg)
- continue;
- if (up->ub_txg > spa->spa_uberblock.ub_txg ||
- (up->ub_txg == spa->spa_uberblock.ub_txg &&
- up->ub_timestamp >
- spa->spa_uberblock.ub_timestamp)) {
- spa->spa_uberblock = *up;
- }
- }
- }
- zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
+ vdev_uberblock_load(vdev, &spa->spa_uberblock);
vdev->spa = spa;
if (spap != NULL)
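
[Editor's note] The refactored label code above funnels every label and uberblock read through the new vdev_label_read(), which in turn relies on vdev_label_offset() (unchanged by this diff) to turn a label index into an absolute device offset. As a reminder of the layout that helper assumes (two labels at the front of the device, two at the end), here is a minimal sketch; the name example_label_offset is illustrative, and the body mirrors the stock ZFS helper rather than code introduced by this commit:

/*
 * Sketch of the label-placement math: labels 0 and 1 sit at the start
 * of the device, labels 2 and 3 at the end, each one
 * sizeof (vdev_label_t) bytes long.
 */
static uint64_t
example_label_offset(uint64_t psize, int l, uint64_t offset)
{
	return (offset + l * sizeof (vdev_label_t) +
	    (l < VDEV_LABELS / 2 ? 0 :
	    psize - VDEV_LABELS * sizeof (vdev_label_t)));
}
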
diff --git a/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h b/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h
index 2a71fcb067..8f45983761 100644
--- a/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h
+++ b/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h
@@ -66,6 +66,14 @@
#define _NOTE(s)
+/*
+ * AVL comparator helpers
+ */
+#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define AVL_PCMP(a, b) \
+ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
/* CRC64 table */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
@@ -492,8 +500,16 @@ typedef struct zio_gbh {
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define MMP_BLOCKS_PER_LABEL 1
+
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT (13)
#define VDEV_UBERBLOCK_SHIFT(vd) \
- MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT)
+ MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \
@@ -843,15 +859,88 @@ typedef enum pool_state {
*/
#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
#define UBERBLOCK_SHIFT 10 /* up to 1K */
-
-struct uberblock {
+#define MMP_MAGIC 0xa11cea11 /* all-see-all */
+
+#define MMP_INTERVAL_VALID_BIT 0x01
+#define MMP_SEQ_VALID_BIT 0x02
+#define MMP_FAIL_INT_VALID_BIT 0x04
+
+#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
+ ubp->ub_mmp_magic == MMP_MAGIC)
+#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_INTERVAL_VALID_BIT))
+#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_SEQ_VALID_BIT))
+#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_FAIL_INT_VALID_BIT))
+
+#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+ >> 8)
+#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+ >> 32)
+#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+ >> 48)
+
+typedef struct uberblock {
uint64_t ub_magic; /* UBERBLOCK_MAGIC */
uint64_t ub_version; /* SPA_VERSION */
uint64_t ub_txg; /* txg of last sync */
uint64_t ub_guid_sum; /* sum of all vdev guids */
uint64_t ub_timestamp; /* UTC time of last sync */
blkptr_t ub_rootbp; /* MOS objset_phys_t */
-};
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
+ /* Maybe missing in uberblocks we read, but always written */
+ uint64_t ub_mmp_magic;
+ /*
+ * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
+ * Otherwise, nanosec since last MMP write.
+ */
+ uint64_t ub_mmp_delay;
+
+ /*
+ * The ub_mmp_config contains the multihost write interval, multihost
+ * fail intervals, sequence number for sub-second granularity, and
+ * valid bit mask. This layout is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * This allows a write_interval of (2^24/1000)s, over 4.5 hours
+ *
+ * VALID Bits:
+ * - 0x01 - Write Interval (ms)
+ * - 0x02 - Sequence number exists
+ * - 0x04 - Fail Intervals
+ * - 0xf8 - Reserved
+ */
+ uint64_t ub_mmp_config;
+
+ /*
+ * ub_checkpoint_txg indicates two things about the current uberblock:
+ *
+ * 1] If it is not zero then this uberblock is a checkpoint. If it is
+ * zero, then this uberblock is not a checkpoint.
+ *
+ * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+ * the ub_txg that the uberblock had at the time we moved it to
+ * the MOS config.
+ *
+ * The field is set when we checkpoint the uberblock and continues to
+ * hold that value even after we've rewound (unlike the ub_txg that
+ * is reset to a higher value).
+ *
+ * Besides checks used to determine whether we are reopening the
+ * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+ * the value of the field is used to determine which ZIL blocks have
+ * been allocated according to the ms_sm when we are rewinding to a
+ * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+ * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+ */
+ uint64_t ub_checkpoint_txg;
+} uberblock_t;
/*
* Flags.
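
[Editor's note] The bit layout documented for ub_mmp_config above maps directly onto the MMP_* accessor macros added earlier in this header. A minimal sketch of how a consumer might decode the field, assuming the loader's printf() is available; the function name and output format are illustrative, not part of the header:

void
example_dump_mmp_config(const uberblock_t *ub)
{
	/* Nothing to report unless both magics check out. */
	if (!MMP_VALID(ub))
		return;

	if (MMP_INTERVAL_VALID(ub))	/* bits 8-31: write interval (ms) */
		printf("mmp write interval: %llu ms\n",
		    (unsigned long long)MMP_INTERVAL(ub));
	if (MMP_SEQ_VALID(ub))		/* bits 32-47: sub-second sequence */
		printf("mmp seq: %llu\n",
		    (unsigned long long)MMP_SEQ(ub));
	if (MMP_FAIL_INT_VALID(ub))	/* bits 48-63: fail intervals */
		printf("mmp fail intervals: %llu\n",
		    (unsigned long long)MMP_FAIL_INT(ub));
}
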
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index 2c32e1a191..7cc12ccf0a 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -3098,25 +3098,25 @@ reference_cb(uintptr_t addr, const void *ignored, void *arg)
return (WALK_NEXT);
}
-typedef struct mdb_refcount {
+typedef struct mdb_zfs_refcount {
uint64_t rc_count;
-} mdb_refcount_t;
+} mdb_zfs_refcount_t;
-typedef struct mdb_refcount_removed {
+typedef struct mdb_zfs_refcount_removed {
uint64_t rc_removed_count;
-} mdb_refcount_removed_t;
+} mdb_zfs_refcount_removed_t;
-typedef struct mdb_refcount_tracked {
+typedef struct mdb_zfs_refcount_tracked {
boolean_t rc_tracked;
-} mdb_refcount_tracked_t;
+} mdb_zfs_refcount_tracked_t;
/* ARGSUSED */
static int
-refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+zfs_refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
- mdb_refcount_t rc;
- mdb_refcount_removed_t rcr;
- mdb_refcount_tracked_t rct;
+ mdb_zfs_refcount_t rc;
+ mdb_zfs_refcount_removed_t rcr;
+ mdb_zfs_refcount_tracked_t rct;
int off;
boolean_t released = B_FALSE;
@@ -3128,30 +3128,30 @@ refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
NULL) != argc)
return (DCMD_USAGE);
- if (mdb_ctf_vread(&rc, "refcount_t", "mdb_refcount_t", addr,
+ if (mdb_ctf_vread(&rc, "zfs_refcount_t", "mdb_zfs_refcount_t", addr,
0) == -1)
return (DCMD_ERR);
- if (mdb_ctf_vread(&rcr, "refcount_t", "mdb_refcount_removed_t", addr,
- MDB_CTF_VREAD_QUIET) == -1) {
- mdb_printf("refcount_t at %p has %llu holds (untracked)\n",
+ if (mdb_ctf_vread(&rcr, "zfs_refcount_t", "mdb_zfs_refcount_removed_t",
+ addr, MDB_CTF_VREAD_QUIET) == -1) {
+ mdb_printf("zfs_refcount_t at %p has %llu holds (untracked)\n",
addr, (longlong_t)rc.rc_count);
return (DCMD_OK);
}
- if (mdb_ctf_vread(&rct, "refcount_t", "mdb_refcount_tracked_t", addr,
- MDB_CTF_VREAD_QUIET) == -1) {
+ if (mdb_ctf_vread(&rct, "zfs_refcount_t", "mdb_zfs_refcount_tracked_t",
+ addr, MDB_CTF_VREAD_QUIET) == -1) {
/* If this is an old target, it might be tracked. */
rct.rc_tracked = B_TRUE;
}
- mdb_printf("refcount_t at %p has %llu current holds, "
+ mdb_printf("zfs_refcount_t at %p has %llu current holds, "
"%llu recently released holds\n",
addr, (longlong_t)rc.rc_count, (longlong_t)rcr.rc_removed_count);
if (rct.rc_tracked && rc.rc_count > 0)
mdb_printf("current holds:\n");
- off = mdb_ctf_offsetof_by_name("refcount_t", "rc_list");
+ off = mdb_ctf_offsetof_by_name("zfs_refcount_t", "rc_list");
if (off == -1)
return (DCMD_ERR);
mdb_pwalk("list", reference_cb, (void*)B_FALSE, addr + off);
@@ -3159,7 +3159,7 @@ refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
if (released && rcr.rc_removed_count > 0) {
mdb_printf("released holds:\n");
- off = mdb_ctf_offsetof_by_name("refcount_t", "rc_removed");
+ off = mdb_ctf_offsetof_by_name("zfs_refcount_t", "rc_removed");
if (off == -1)
return (DCMD_ERR);
mdb_pwalk("list", reference_cb, (void*)B_TRUE, addr + off);
@@ -3797,12 +3797,12 @@ rrwlock(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
}
mdb_printf("anonymous references:\n");
- (void) mdb_call_dcmd("refcount", addr +
+ (void) mdb_call_dcmd("zfs_refcount", addr +
mdb_ctf_offsetof_by_name(ZFS_STRUCT "rrwlock", "rr_anon_rcount"),
DCMD_ADDRSPEC, 0, NULL);
mdb_printf("linked references:\n");
- (void) mdb_call_dcmd("refcount", addr +
+ (void) mdb_call_dcmd("zfs_refcount", addr +
mdb_ctf_offsetof_by_name(ZFS_STRUCT "rrwlock", "rr_linked_rcount"),
DCMD_ADDRSPEC, 0, NULL);
@@ -4345,9 +4345,9 @@ static const mdb_dcmd_t dcmds[] = {
"given a spa_t, print block type stats from last scrub",
zfs_blkstats },
{ "zfs_params", "", "print zfs tunable parameters", zfs_params },
- { "refcount", ":[-r]\n"
+ { "zfs_refcount", ":[-r]\n"
"\t-r display recently removed references",
- "print refcount_t holders", refcount },
+ "print zfs_refcount_t holders", zfs_refcount },
{ "zap_leaf", "", "print zap_leaf_phys_t", zap_leaf },
{ "zfs_aces", ":[-v]", "print all ACEs from a zfs_acl_t",
zfs_acl_dump },
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 13fd33522a..61cfd74df3 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
@@ -901,7 +901,7 @@ dump_metaslab_stats(metaslab_t *msp)
/* make sure nicenum has enough space */
CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
- zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf));
+ zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
"segments", avl_numnodes(t), "maxsize", maxbuf,
@@ -928,7 +928,7 @@ dump_metaslab(metaslab_t *msp)
if (dump_opt['m'] > 2 && !dump_opt['L']) {
mutex_enter(&msp->ms_lock);
- VERIFY0(metaslab_load(msp, 0));
+ VERIFY0(metaslab_load(msp));
range_tree_stat_verify(msp->ms_allocatable);
dump_metaslab_stats(msp);
metaslab_unload(msp);
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index eb574105a7..654b62db6a 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2019, Joyent, Inc.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
@@ -296,6 +296,7 @@
#include <zfs_fletcher.h>
#include <sys/aggsum.h>
#include <sys/cityhash.h>
+#include <sys/param.h>
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
@@ -1268,6 +1269,20 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
+/*
+ * The arc_all_memory function is a ZoL enhancement that lives in their OSL
+ * code. In user-space code, which is used primarily for testing, we return
+ * half of all memory.
+ */
+uint64_t
+arc_all_memory(void)
+{
+#ifdef _KERNEL
+ return (ptob(physmem));
+#else
+ return ((sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES)) / 2);
+#endif
+}
/*
* We use Cityhash for this. It's fast, and has good hash properties without
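
[Editor's note] arc_all_memory() is added so the metaslab changes below can bound how much memory loaded metaslabs consume via the new zfs_metaslab_mem_limit tunable. The eviction trigger in metaslab_potentially_evict() boils down to the comparison sketched here; this is a paraphrase for illustration (the function name is hypothetical), not code from this commit:

/*
 * Returns B_TRUE when the range_seg cache is using more than
 * limit_pct percent of all memory, i.e. when eviction should start.
 *   allmem -- arc_all_memory()
 *   inuse  -- kmem_cache_stat(range_seg_cache, "buf_inuse")
 *   size   -- kmem_cache_stat(range_seg_cache, "buf_size")
 */
static boolean_t
example_over_metaslab_mem_limit(uint64_t allmem, uint64_t inuse,
    uint64_t size, int limit_pct)
{
	return (allmem * limit_pct / 100 < inuse * size ? B_TRUE : B_FALSE);
}
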
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index b950ed26d6..bc6b45ec7f 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -199,28 +199,20 @@ uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
int metaslab_load_pct = 50;
/*
- * Determines how many txgs a metaslab may remain loaded without having any
- * allocations from it. As long as a metaslab continues to be used we will
- * keep it loaded.
+ * These tunables control how long a metaslab will remain loaded after the
+ * last allocation from it. A metaslab can't be unloaded until at least
+ * metaslab_unload_delay TXGs and metaslab_unload_delay_ms milliseconds
+ * have elapsed. However, zfs_metaslab_mem_limit may cause it to be
+ * unloaded sooner. These settings are intended to be generous -- to keep
+ * metaslabs loaded for a long time, reducing the rate of metaslab loading.
*/
-int metaslab_unload_delay = TXG_SIZE * 2;
-
-/*
- * Tunables used to reduce metaslab load/unload thrashing when selection
- * algorithm is allocating across metaslabs very evenly. In addition to
- * tracking when the slab was used for allocation (ms_selected_txg), we also
- * track when it was loaded (ms_loaded_txg). If the slab would be unloaded,
- * but the load txg is within the window of
- * metaslab_unload_delay + metaslab_load_window
- * then we ramp up metaslab_unload_delay instead of unloading the metaslab.
- */
-int metaslab_load_window = 10;
-int metaslab_unload_delay_max = 256;
+int metaslab_unload_delay = 32;
+int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
/*
* Max number of metaslabs per group to preload.
*/
-int metaslab_preload_limit = SPA_DVAS_PER_BP;
+int metaslab_preload_limit = 10;
/*
* Enable/disable preloading of metaslab.
@@ -281,6 +273,19 @@ uint64_t metaslab_trace_max_entries = 5000;
*/
int max_disabled_ms = 3;
+/*
+ * Maximum percentage of memory to use on storing loaded metaslabs. If loading
+ * a metaslab would take it over this percentage, the oldest selected metaslab
+ * is automatically unloaded.
+ */
+int zfs_metaslab_mem_limit = 25;
+
+/*
+ * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
+ * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ */
+unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+
static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -288,6 +293,8 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
+static unsigned int metaslab_idx_func(multilist_t *, void *);
+static void metaslab_evict(metaslab_t *, uint64_t);
kmem_cache_t *metaslab_alloc_trace_cache;
@@ -307,6 +314,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
+ offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
sizeof (zfs_refcount_t), KM_SLEEP);
mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
@@ -333,6 +342,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
sizeof (uint64_t));
mutex_destroy(&mc->mc_lock);
+ multilist_destroy(mc->mc_metaslab_txg_list);
kmem_free(mc, sizeof (metaslab_class_t));
}
@@ -523,6 +533,51 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
return (space);
}
+void
+metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
+{
+ multilist_t *ml = mc->mc_metaslab_txg_list;
+ for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ metaslab_t *msp = multilist_sublist_head(mls);
+ multilist_sublist_unlock(mls);
+ while (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If the metaslab has been removed from the list
+ * (which could happen if we were at the memory limit
+ * and it was evicted during this loop), then we can't
+ * proceed and we should restart the sublist.
+ */
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ mutex_exit(&msp->ms_lock);
+ i--;
+ break;
+ }
+ mls = multilist_sublist_lock(ml, i);
+ metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+ multilist_sublist_unlock(mls);
+ if (txg >
+ msp->ms_selected_txg + metaslab_unload_delay &&
+ gethrtime() > msp->ms_selected_time +
+ (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
+ metaslab_evict(msp, txg);
+ } else {
+ /*
+ * Once we've hit a metaslab selected too
+ * recently to evict, we're done evicting for
+ * now.
+ */
+ mutex_exit(&msp->ms_lock);
+ break;
+ }
+ mutex_exit(&msp->ms_lock);
+ msp = next_msp;
+ }
+ }
+}
+
static int
metaslab_compare(const void *x1, const void *x2)
{
@@ -1002,6 +1057,14 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp);
+
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+
msp->ms_group = NULL;
mutex_exit(&mg->mg_lock);
}
@@ -1009,8 +1072,10 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
static void
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(MUTEX_HELD(&mg->mg_lock));
ASSERT(msp->ms_group == mg);
+
avl_remove(&mg->mg_metaslab_tree, msp);
msp->ms_weight = weight;
avl_add(&mg->mg_metaslab_tree, msp);
@@ -1211,17 +1276,83 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
* Return the maximum contiguous segment within the metaslab.
*/
uint64_t
-metaslab_block_maxsize(metaslab_t *msp)
+metaslab_largest_allocatable(metaslab_t *msp)
{
avl_tree_t *t = &msp->ms_allocatable_by_size;
range_seg_t *rs;
- if (t == NULL || (rs = avl_last(t)) == NULL)
- return (0ULL);
+ if (t == NULL)
+ return (0);
+ rs = avl_last(t);
+ if (rs == NULL)
+ return (0);
return (rs->rs_end - rs->rs_start);
}
+/*
+ * Return the maximum contiguous segment within the unflushed frees of this
+ * metaslab.
+ */
+uint64_t
+metaslab_largest_unflushed_free(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if (msp->ms_unflushed_frees == NULL)
+ return (0);
+
+ range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
+ if (rs == NULL)
+ return (0);
+
+ /*
+ * When a range is freed from the metaslab, that range is added to
+ * both the unflushed frees and the deferred frees. While the block
+ * will eventually be usable, if the metaslab were loaded the range
+ * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
+ * txgs had passed. As a result, when attempting to estimate an upper
+ * bound for the largest currently-usable free segment in the
+ * metaslab, we need to not consider any ranges currently in the defer
+ * trees. This algorithm approximates the largest available chunk in
+ * the largest range in the unflushed_frees tree by taking the first
+ * chunk. While this may be a poor estimate, it should only remain so
+ * briefly and should eventually self-correct as frees are no longer
+ * deferred. Similar logic applies to the ms_freed tree. See
+ * metaslab_load() for more details.
+ *
+ * There are two primary sources of inaccuracy in this estimate. Both
+ * are tolerated for performance reasons. The first source is that we
+ * only check the largest segment for overlaps. Smaller segments may
+ * have more favorable overlaps with the other trees, resulting in
+ * larger usable chunks. Second, we only look at the first chunk in
+ * the largest segment; there may be other usable chunks in the
+ * largest segment, but we ignore them.
+ */
+ uint64_t rstart = rs->rs_start;
+ uint64_t rsize = rs->rs_end - rstart;
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ uint64_t start = 0;
+ uint64_t size = 0;
+ boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
+ rsize, &start, &size);
+ if (found) {
+ if (rstart == start)
+ return (0);
+ rsize = start - rstart;
+ }
+ }
+
+ uint64_t start = 0;
+ uint64_t size = 0;
+ boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
+ rsize, &start, &size);
+ if (found)
+ rsize = start - rstart;
+
+ return (rsize);
+}
+
static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
@@ -1311,7 +1442,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
* If we're running low on space, find a segment based on size,
* rather than iterating based on offset.
*/
- if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold ||
+ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
offset = -1;
} else {
@@ -1409,7 +1540,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
range_seg_t *rs, rsearch;
uint64_t hbit = highbit64(size);
uint64_t *cursor = &msp->ms_lbas[hbit - 1];
- uint64_t max_size = metaslab_block_maxsize(msp);
+ uint64_t max_size = metaslab_largest_allocatable(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(avl_numnodes(t), ==,
@@ -1479,6 +1610,13 @@ metaslab_flush_wait(metaslab_t *msp)
cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
}
+static unsigned int
+metaslab_idx_func(multilist_t *ml, void *arg)
+{
+ metaslab_t *msp = arg;
+ return (msp->ms_id % multilist_get_num_sublists(ml));
+}
+
uint64_t
metaslab_allocated_space(metaslab_t *msp)
{
@@ -1537,6 +1675,8 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)
allocating +=
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
+ ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
+ msp->ms_allocating_total);
ASSERT3U(msp->ms_deferspace, ==,
range_tree_space(msp->ms_defer[0]) +
@@ -1725,7 +1865,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp)
msp->ms_weight = 0;
msp->ms_fragmentation = 0;
- msp->ms_max_size = 0;
/*
* This function is used for verification purposes. Regardless of
@@ -1753,6 +1892,87 @@ metaslab_verify_weight_and_frag(metaslab_t *msp)
VERIFY3U(msp->ms_weight, ==, weight);
}
+/*
+ * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
+ * this class that was used longest ago, and attempt to unload it. We don't
+ * want to spend too much time in this loop to prevent performance
+ * degredation, and we expect that most of the time this operation will
+ * degradation, and we expect that most of the time this operation will
+ * we expect this to keep the metaslab memory usage under control.
+ */
+static void
+metaslab_potentially_evict(metaslab_class_t *mc)
+{
+#ifdef _KERNEL
+ uint64_t allmem = arc_all_memory();
+ extern kmem_cache_t *range_seg_cache;
+ uint64_t inuse = kmem_cache_stat(range_seg_cache, "buf_inuse");
+ uint64_t size = kmem_cache_stat(range_seg_cache, "buf_size");
+ int tries = 0;
+ for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
+ tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
+ tries++) {
+ unsigned int idx = multilist_get_random_index(
+ mc->mc_metaslab_txg_list);
+ multilist_sublist_t *mls =
+ multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
+ metaslab_t *msp = multilist_sublist_head(mls);
+ multilist_sublist_unlock(mls);
+ while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
+ inuse * size) {
+ VERIFY3P(mls, ==, multilist_sublist_lock(
+ mc->mc_metaslab_txg_list, idx));
+ ASSERT3U(idx, ==,
+ metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
+
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ multilist_sublist_unlock(mls);
+ break;
+ }
+ metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+ multilist_sublist_unlock(mls);
+ /*
+ * If the metaslab is currently loading there are two
+ * cases. If it's the metaslab we're evicting, we
+ * can't continue on or we'll panic when we attempt to
+ * recursively lock the mutex. If it's another
+ * metaslab that's loading, it can be safely skipped,
+ * since we know it's very new and therefore not a
+ * good eviction candidate. We check later once the
+ * lock is held that the metaslab is fully loaded
+ * before actually unloading it.
+ */
+ if (msp->ms_loading) {
+ msp = next_msp;
+ inuse = kmem_cache_stat(range_seg_cache,
+ "buf_inuse");
+ continue;
+ }
+ /*
+ * We can't unload metaslabs with no spacemap because
+ * they're not ready to be unloaded yet. We can't
+ * unload metaslabs with outstanding allocations
+ * because doing so could cause the metaslab's weight
+ * to decrease while it's unloaded, which violates an
+ * invariant that we use to prevent unnecessary
+ * loading. We also don't unload metaslabs that are
+ * currently active because they are high-weight
+ * metaslabs that are likely to be used in the near
+ * future.
+ */
+ mutex_enter(&msp->ms_lock);
+ if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
+ msp->ms_allocating_total == 0) {
+ metaslab_unload(msp);
+ }
+ mutex_exit(&msp->ms_lock);
+ msp = next_msp;
+ inuse = kmem_cache_stat(range_seg_cache, "buf_inuse");
+ }
+ }
+#endif
+}
+
static int
metaslab_load_impl(metaslab_t *msp)
{
@@ -1915,18 +2135,21 @@ metaslab_load_impl(metaslab_t *msp)
* comment for ms_synchist and ms_deferhist[] for more info]
*/
uint64_t weight = msp->ms_weight;
+ uint64_t max_size = msp->ms_max_size;
metaslab_recalculate_weight_and_sort(msp);
if (!WEIGHT_IS_SPACEBASED(weight))
ASSERT3U(weight, <=, msp->ms_weight);
- msp->ms_max_size = metaslab_block_maxsize(msp);
-
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ ASSERT3U(max_size, <=, msp->ms_max_size);
hrtime_t load_end = gethrtime();
+ msp->ms_load_time = load_end;
if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
"ms_id %llu, smp_length %llu, "
"unflushed_allocs %llu, unflushed_frees %llu, "
"freed %llu, defer %llu + %llu, "
- "loading_time %lld ms",
+ "loading_time %lld ms, ms_max_size %llu, "
+ "max size error %llu",
spa_syncing_txg(spa), spa_name(spa),
msp->ms_group->mg_vd->vdev_id, msp->ms_id,
space_map_length(msp->ms_sm),
@@ -1935,7 +2158,8 @@ metaslab_load_impl(metaslab_t *msp)
range_tree_space(msp->ms_freed),
range_tree_space(msp->ms_defer[0]),
range_tree_space(msp->ms_defer[1]),
- (longlong_t)((load_end - load_start) / 1000000));
+ (longlong_t)((load_end - load_start) / 1000000),
+ msp->ms_max_size, msp->ms_max_size - max_size);
}
metaslab_verify_space(msp, spa_syncing_txg(spa));
@@ -1944,7 +2168,7 @@ metaslab_load_impl(metaslab_t *msp)
}
int
-metaslab_load(metaslab_t *msp, uint64_t txg)
+metaslab_load(metaslab_t *msp)
{
kstat_t *ksp;
ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1988,11 +2212,20 @@ metaslab_load(metaslab_t *msp, uint64_t txg)
*/
ASSERT(!msp->ms_loaded);
+ /*
+ * If we're loading a metaslab in the normal class, consider evicting
+ * another one to keep our memory usage under the limit defined by the
+ * zfs_metaslab_mem_limit tunable.
+ */
+ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
+ msp->ms_group->mg_class) {
+ metaslab_potentially_evict(msp->ms_group->mg_class);
+ }
+
int error = metaslab_load_impl(msp);
ASSERT(MUTEX_HELD(&msp->ms_lock));
msp->ms_loading = B_FALSE;
- msp->ms_loaded_txg = txg;
cv_broadcast(&msp->ms_load_cv);
return (error);
@@ -2003,14 +2236,29 @@ metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
- metaslab_verify_weight_and_frag(msp);
+ /*
+ * This can happen if a metaslab is selected for eviction (in
+ * metaslab_potentially_evict) and then unloaded during spa_sync (via
+ * metaslab_class_evict_old).
+ */
+ if (!msp->ms_loaded)
+ return;
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
- msp->ms_loaded_txg = 0;
+ msp->ms_unload_time = gethrtime();
+ msp->ms_activation_weight = 0;
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
- msp->ms_max_size = 0;
+
+ if (msp->ms_group != NULL) {
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+ }
/*
* We explicitly recalculate the metaslab's weight based on its space
@@ -2029,6 +2277,21 @@ metaslab_unload(metaslab_t *msp)
}
void
+metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ msp->ms_selected_txg = txg;
+ msp->ms_selected_time = gethrtime();
+ multilist_sublist_insert_tail(mls, msp);
+ multilist_sublist_unlock(mls);
+}
+
+void
metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta)
{
@@ -2056,6 +2319,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&ms->ms_class_txg_node);
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
@@ -2349,7 +2613,6 @@ metaslab_space_weight(metaslab_t *msp)
uint64_t weight, space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(!vd->vdev_removing);
/*
* The baseline weight is the metaslab's free space.
@@ -2568,13 +2831,19 @@ metaslab_segment_weight(metaslab_t *msp)
* weights we rely on the entire weight (excluding the weight-type bit).
*/
boolean_t
-metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
{
- if (msp->ms_loaded) {
+ /*
+ * If the metaslab is loaded, ms_max_size is definitive and we can use
+ * the fast check. If it's not, the ms_max_size is a lower bound (once
+ * set), and we should use the fast check as long as we're not in
+ * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
+ * seconds since the metaslab was unloaded.
+ */
+ if (msp->ms_loaded ||
+ (msp->ms_max_size != 0 && !try_hard && gethrtime() <
+ msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
return (msp->ms_max_size >= asize);
- } else {
- ASSERT0(msp->ms_max_size);
- }
boolean_t should_allocate;
if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
@@ -2590,6 +2859,7 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
should_allocate = (asize <=
(msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
}
+
return (should_allocate);
}
@@ -2602,24 +2872,24 @@ metaslab_weight(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock));
- /*
- * If this vdev is in the process of being removed, there is nothing
- * for us to do here.
- */
- if (vd->vdev_removing)
- return (0);
-
metaslab_set_fragmentation(msp);
/*
- * Update the maximum size if the metaslab is loaded. This will
+ * Update the maximum size. If the metaslab is loaded, this will
* ensure that we get an accurate maximum size if newly freed space
- * has been added back into the free tree.
+ * has been added back into the free tree. If the metaslab is
+ * unloaded, we check if there's a larger free segment in the
+ * unflushed frees. This is a lower bound on the largest allocatable
+ * segment size. Coalescing of adjacent entries may reveal larger
+ * allocatable segments, but we aren't aware of those until loading
+ * the space map into a range tree.
*/
- if (msp->ms_loaded)
- msp->ms_max_size = metaslab_block_maxsize(msp);
- else
- ASSERT0(msp->ms_max_size);
+ if (msp->ms_loaded) {
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ } else {
+ msp->ms_max_size = MAX(msp->ms_max_size,
+ metaslab_largest_unflushed_free(msp));
+ }
/*
* Segment-based weighting requires space map histogram support.
@@ -2638,6 +2908,8 @@ metaslab_weight(metaslab_t *msp)
void
metaslab_recalculate_weight_and_sort(metaslab_t *msp)
{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
/* note: we preserve the mask (e.g. indication of primary, etc..) */
uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
metaslab_group_sort(msp->ms_group, msp,
@@ -2648,16 +2920,23 @@ static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
/*
* If we're activating for the claim code, we don't want to actually
* set the metaslab up for a specific allocator.
*/
- if (activation_weight == METASLAB_WEIGHT_CLAIM)
+ if (activation_weight == METASLAB_WEIGHT_CLAIM) {
+ ASSERT0(msp->ms_activation_weight);
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort(mg, msp, msp->ms_weight |
+ activation_weight);
return (0);
+ }
+
metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
mg->mg_primaries : mg->mg_secondaries);
- ASSERT(MUTEX_HELD(&msp->ms_lock));
mutex_enter(&mg->mg_lock);
if (arr[allocator] != NULL) {
mutex_exit(&mg->mg_lock);
@@ -2668,39 +2947,88 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
ASSERT3S(msp->ms_allocator, ==, -1);
msp->ms_allocator = allocator;
msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+
+ ASSERT0(msp->ms_activation_weight);
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort_impl(mg, msp,
+ msp->ms_weight | activation_weight);
+
mutex_exit(&mg->mg_lock);
return (0);
}
static int
-metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight,
- uint64_t txg)
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
- if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = metaslab_load(msp, txg);
- if (error != 0) {
- metaslab_group_sort(msp->ms_group, msp, 0);
- return (error);
- }
- if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
- /*
- * The metaslab was activated for another allocator
- * while we were waiting, we should reselect.
- */
+ /*
+ * The current metaslab is already activated for us so there
+ * is nothing to do. Already activated though, doesn't mean
+ * that this metaslab is activated for our allocator nor our
+ * requested activation weight. The metaslab could have started
+ * as an active one for our allocator but changed allocators
+ * while we were waiting to grab its ms_lock or we stole it
+ * [see find_valid_metaslab()]. This means that there is a
+ * possibility of passivating a metaslab of another allocator
+ * or from a different activation mask, from this thread.
+ */
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ ASSERT(msp->ms_loaded);
+ return (0);
+ }
+
+ int error = metaslab_load(msp);
+ if (error != 0) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+
+ /*
+ * When entering metaslab_load() we may have dropped the
+ * ms_lock because we were loading this metaslab, or we
+ * were waiting for another thread to load it for us. In
+ * that scenario, we recheck the weight of the metaslab
+ * to see if it was activated by another thread.
+ *
+ * If the metaslab was activated for another allocator or
+ * it was activated with a different activation weight (e.g.
+ * we wanted to make it a primary but it was activated as
+ * secondary) we return error (EBUSY).
+ *
+ * If the metaslab was activated for the same allocator
+ * and requested activation mask, skip activating it.
+ */
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ if (msp->ms_allocator != allocator)
return (EBUSY);
- }
- if ((error = metaslab_activate_allocator(msp->ms_group, msp,
- allocator, activation_weight)) != 0) {
- return (error);
- }
- msp->ms_activation_weight = msp->ms_weight;
- metaslab_group_sort(msp->ms_group, msp,
- msp->ms_weight | activation_weight);
+ if ((msp->ms_weight & activation_weight) == 0)
+ return (EBUSY);
+
+ EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
+ msp->ms_primary);
+ return (0);
+ }
+
+ /*
+ * If the metaslab has literally 0 space, it will have weight 0. In
+ * that case, don't bother activating it. This can happen if the
+ * metaslab had space during find_valid_metaslab, but another thread
+ * loaded it and used all that space while we were waiting to grab the
+ * lock.
+ */
+ if (msp->ms_weight == 0) {
+ ASSERT0(range_tree_space(msp->ms_allocatable));
+ return (SET_ERROR(ENOSPC));
}
+
+ if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+ allocator, activation_weight)) != 0) {
+ return (error);
+ }
+
ASSERT(msp->ms_loaded);
ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
@@ -2712,6 +3040,8 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
uint64_t weight)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+
if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
metaslab_group_sort(mg, msp, weight);
return;
@@ -2719,15 +3049,16 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
mutex_enter(&mg->mg_lock);
ASSERT3P(msp->ms_group, ==, mg);
+ ASSERT3S(0, <=, msp->ms_allocator);
+ ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+
if (msp->ms_primary) {
- ASSERT3U(0, <=, msp->ms_allocator);
- ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
mg->mg_primaries[msp->ms_allocator] = NULL;
} else {
- ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
mg->mg_secondaries[msp->ms_allocator] = NULL;
}
msp->ms_allocator = -1;
@@ -2749,9 +3080,10 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
range_tree_is_empty(msp->ms_allocatable));
ASSERT0(weight & METASLAB_ACTIVE_MASK);
+ ASSERT(msp->ms_activation_weight != 0);
msp->ms_activation_weight = 0;
metaslab_passivate_allocator(msp->ms_group, msp, weight);
- ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
+ ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
}
/*
@@ -2790,13 +3122,14 @@ static void
metaslab_preload(void *arg)
{
metaslab_t *msp = arg;
- spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ spa_t *spa = mc->mc_spa;
ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
mutex_enter(&msp->ms_lock);
- (void) metaslab_load(msp, spa_syncing_txg(spa));
- msp->ms_selected_txg = spa_syncing_txg(spa);
+ (void) metaslab_load(msp);
+ metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
mutex_exit(&msp->ms_lock);
}
@@ -3249,12 +3582,19 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* Normally, we don't want to process a metaslab if there are no
* allocations or frees to perform. However, if the metaslab is being
- * forced to condense and it's loaded, we need to let it through.
+ * forced to condense, it's loaded and we're not beyond the final
+ * dirty txg, we need to let it through. Not condensing beyond the
+ * final dirty txg prevents an issue where metaslabs that need to be
+ * condensed but were loaded for other reasons could cause a panic
+ * here. By only checking the txg in that branch of the conditional,
+ * we preserve the utility of the VERIFY statements in all other
+ * cases.
*/
if (range_tree_is_empty(alloctree) &&
range_tree_is_empty(msp->ms_freeing) &&
range_tree_is_empty(msp->ms_checkpointing) &&
- !(msp->ms_loaded && msp->ms_condense_wanted))
+ !(msp->ms_loaded && msp->ms_condense_wanted &&
+ txg <= spa_final_dirty_txg(spa)))
return;
@@ -3507,6 +3847,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
dmu_tx_commit(tx);
}
+static void
+metaslab_evict(metaslab_t *msp, uint64_t txg)
+{
+ if (!msp->ms_loaded || msp->ms_disabled != 0)
+ return;
+
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
+ }
+ if (msp->ms_allocator != -1)
+ metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
+}
+
/*
* Called after a transaction group has completely synced to mark
* all of the metaslab's free space as usable.
@@ -3553,7 +3910,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
- msp->ms_unflushed_frees = range_tree_create(NULL, NULL);
+ msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
+ &msp->ms_unflushed_frees_by_size,
+ metaslab_rangesize_compare, 0);
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
@@ -3658,41 +4017,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
/*
* If the metaslab is loaded and we've not tried to load or allocate
* from it in 'metaslab_unload_delay' txgs, then we normally unload it.
- * However, to prevent thrashing, if the metaslab was recently loaded,
- * then instead of unloading it, we increase the unload delay (only up
- * to the maximum).
*/
if (msp->ms_loaded &&
msp->ms_disabled == 0 &&
msp->ms_selected_txg + metaslab_unload_delay < txg) {
- if (msp->ms_loaded_txg != 0 && msp->ms_loaded_txg +
- metaslab_unload_delay + metaslab_load_window >= txg) {
- if (metaslab_unload_delay + metaslab_load_window <=
- metaslab_unload_delay_max) {
- metaslab_unload_delay += metaslab_load_window;
- }
- DTRACE_PROBE1(zfs__metaslab__delay__unload,
- metaslab_t *, msp);
- } else {
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
- VERIFY0(range_tree_space(
- msp->ms_allocating[(txg + t) & TXG_MASK]));
- }
- if (msp->ms_allocator != -1) {
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_ACTIVE_MASK);
- }
-
- if (!metaslab_debug_unload)
- metaslab_unload(msp);
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
}
+ if (msp->ms_allocator != -1) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
+
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
}
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_freed));
ASSERT0(range_tree_space(msp->ms_checkpointing));
-
+ msp->ms_allocating_total -= msp->ms_allocated_this_txg;
msp->ms_allocated_this_txg = 0;
mutex_exit(&msp->ms_lock);
}
@@ -3946,6 +4292,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
+ msp->ms_allocating_total += size;
metaslab_verify_space(msp, txg);
}
@@ -3954,7 +4301,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
* Now that we've attempted the allocation we need to update the
* metaslab's maximum block size since it may have changed.
*/
- msp->ms_max_size = metaslab_block_maxsize(msp);
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
return (start);
}
@@ -3972,7 +4319,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
- zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+ boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
+ boolean_t *was_active)
{
avl_index_t idx;
avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -3982,7 +4330,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
int i;
- if (!metaslab_should_allocate(msp, asize)) {
+ if (!metaslab_should_allocate(msp, asize, try_hard)) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL, allocator);
continue;
@@ -4024,17 +4372,51 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
return (msp);
}
+void
+metaslab_active_mask_verify(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
+ return;
+
+ if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
+ VERIFY3S(msp->ms_allocator, !=, -1);
+ VERIFY(msp->ms_primary);
+ return;
+ }
+
+ if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
+ VERIFY3S(msp->ms_allocator, !=, -1);
+ VERIFY(!msp->ms_primary);
+ return;
+ }
+
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ VERIFY3S(msp->ms_allocator, ==, -1);
+ return;
+ }
+}
+
/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
- int d, int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
+ int allocator, boolean_t try_hard)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
- uint64_t activation_weight;
- activation_weight = METASLAB_WEIGHT_PRIMARY;
+ uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
for (int i = 0; i < d; i++) {
if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
@@ -4075,15 +4457,37 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
mg->mg_primaries[allocator] != NULL) {
msp = mg->mg_primaries[allocator];
+
+ /*
+ * Even though we don't hold the ms_lock for the
+ * primary metaslab, those fields should not
+			 * change while we hold the mg_lock. Thus it is
+ * safe to make assertions on them.
+ */
+ ASSERT(msp->ms_primary);
+ ASSERT3S(msp->ms_allocator, ==, allocator);
+ ASSERT(msp->ms_loaded);
+
was_active = B_TRUE;
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
mg->mg_secondaries[allocator] != NULL) {
msp = mg->mg_secondaries[allocator];
+
+ /*
+ * See comment above about the similar assertions
+ * for the primary metaslab.
+ */
+ ASSERT(!msp->ms_primary);
+ ASSERT3S(msp->ms_allocator, ==, allocator);
+ ASSERT(msp->ms_loaded);
+
was_active = B_TRUE;
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
- want_unique, asize, allocator, zal, search,
- &was_active);
+ want_unique, asize, allocator, try_hard, zal,
+ search, &was_active);
}
mutex_exit(&mg->mg_lock);
@@ -4091,59 +4495,106 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
kmem_free(search, sizeof (*search));
return (-1ULL);
}
-
mutex_enter(&msp->ms_lock);
+
+ metaslab_active_mask_verify(msp);
+
+ /*
+		 * This code is disabled because of issues with
+		 * tracepoints in non-GPL kernel modules.
+ */
+#if 0
+ DTRACE_PROBE3(ms__activation__attempt,
+ metaslab_t *, msp, uint64_t, activation_weight,
+ boolean_t, was_active);
+#endif
+
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
* another thread may have changed the weight while we
* were blocked on the metaslab lock. We check the
 		 * active status first to see if we need to reselect
* a new metaslab.
*/
if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
+ ASSERT3S(msp->ms_allocator, ==, -1);
mutex_exit(&msp->ms_lock);
continue;
}
/*
- * If the metaslab is freshly activated for an allocator that
- * isn't the one we're allocating from, or if it's a primary and
- * we're seeking a secondary (or vice versa), we go back and
- * select a new metaslab.
+ * If the metaslab was activated for another allocator
+		 * while we were waiting on the ms_lock above, or it's
+ * a primary and we're seeking a secondary (or vice versa),
+ * we go back and select a new metaslab.
*/
if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
(msp->ms_allocator != -1) &&
(msp->ms_allocator != allocator || ((activation_weight ==
METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
+ ASSERT(msp->ms_loaded);
+ ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
+ msp->ms_allocator != -1);
mutex_exit(&msp->ms_lock);
continue;
}
+ /*
+ * This metaslab was used for claiming regions allocated
+ * by the ZIL during pool import. Once these regions are
+ * claimed we don't need to keep the CLAIM bit set
+ * anymore. Passivate this metaslab to zero its activation
+ * mask.
+ */
if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
activation_weight != METASLAB_WEIGHT_CLAIM) {
+ ASSERT(msp->ms_loaded);
+ ASSERT3S(msp->ms_allocator, ==, -1);
metaslab_passivate(msp, msp->ms_weight &
~METASLAB_WEIGHT_CLAIM);
mutex_exit(&msp->ms_lock);
continue;
}
- if (metaslab_activate(msp, allocator, activation_weight,
- txg) != 0) {
+ metaslab_set_selected_txg(msp, txg);
+
+ int activation_error =
+ metaslab_activate(msp, allocator, activation_weight);
+ metaslab_active_mask_verify(msp);
+
+ /*
+ * If the metaslab was activated by another thread for
+ * another allocator or activation_weight (EBUSY), or it
+ * failed because another metaslab was assigned as primary
+		 * for this allocator (EEXIST), we continue using this
+ * metaslab for our allocation, rather than going on to a
+ * worse metaslab (we waited for that metaslab to be loaded
+ * after all).
+ *
+ * If the activation failed due to an I/O error or ENOSPC we
+ * skip to the next metaslab.
+ */
+ boolean_t activated;
+ if (activation_error == 0) {
+ activated = B_TRUE;
+ } else if (activation_error == EBUSY ||
+ activation_error == EEXIST) {
+ activated = B_FALSE;
+ } else {
mutex_exit(&msp->ms_lock);
continue;
}
-
- msp->ms_selected_txg = txg;
+ ASSERT(msp->ms_loaded);
/*
* Now that we have the lock, recheck to see if we should
* continue to use this metaslab for this allocation. The
- * the metaslab is now loaded so metaslab_should_allocate() can
- * accurately determine if the allocation attempt should
+		 * metaslab is now loaded so metaslab_should_allocate()
+ * can accurately determine if the allocation attempt should
* proceed.
*/
- if (!metaslab_should_allocate(msp, asize)) {
+ if (!metaslab_should_allocate(msp, asize, try_hard)) {
/* Passivate this metaslab and select a new one. */
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL, allocator);
@@ -4151,8 +4602,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
}
/*
- * If this metaslab is currently condensing then pick again as
- * we can't manipulate this metaslab until it's committed
+ * If this metaslab is currently condensing then pick again
+ * as we can't manipulate this metaslab until it's committed
* to disk. If this metaslab is being initialized, we shouldn't
* allocate from it since the allocated region might be
* overwritten after allocation.
@@ -4160,15 +4611,19 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
if (msp->ms_condensing) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_CONDENSING, allocator);
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_ACTIVE_MASK);
+ if (activated) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
mutex_exit(&msp->ms_lock);
continue;
} else if (msp->ms_disabled > 0) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_DISABLED, allocator);
- metaslab_passivate(msp, msp->ms_weight &
- ~METASLAB_ACTIVE_MASK);
+ if (activated) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
mutex_exit(&msp->ms_lock);
continue;
}
@@ -4178,13 +4633,23 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
if (offset != -1ULL) {
/* Proactively passivate the metaslab, if needed */
- metaslab_segment_may_passivate(msp);
+ if (activated)
+ metaslab_segment_may_passivate(msp);
break;
}
next:
ASSERT(msp->ms_loaded);
/*
+		 * This code is disabled because of issues with
+		 * tracepoints in non-GPL kernel modules.
+ */
+#if 0
+ DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
+ uint64_t, asize);
+#endif
+
+ /*
* We were unable to allocate from this metaslab so determine
* a new weight for this metaslab. Now that we have loaded
* the metaslab we can provide a better hint to the metaslab
@@ -4205,14 +4670,33 @@ next:
* currently available for allocation and is accurate
* even within a sync pass.
*/
+ uint64_t weight;
if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
- uint64_t weight = metaslab_block_maxsize(msp);
+ weight = metaslab_largest_allocatable(msp);
WEIGHT_SET_SPACEBASED(weight);
+ } else {
+ weight = metaslab_weight_from_range_tree(msp);
+ }
+
+ if (activated) {
metaslab_passivate(msp, weight);
} else {
- metaslab_passivate(msp,
- metaslab_weight_from_range_tree(msp));
+ /*
+ * For the case where we use the metaslab that is
+ * active for another allocator we want to make
+ * sure that we retain the activation mask.
+ *
+ * Note that we could attempt to use something like
+ * metaslab_recalculate_weight_and_sort() that
+ * retains the activation mask here. That function
+			 * uses metaslab_weight() to set the weight, though,
+ * which is not as accurate as the calculations
+ * above.
+ */
+ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(mg, msp, weight);
}
+ metaslab_active_mask_verify(msp);
/*
* We have just failed an allocation attempt, check
@@ -4220,7 +4704,7 @@ next:
* we may end up in an infinite loop retrying the same
* metaslab.
*/
- ASSERT(!metaslab_should_allocate(msp, asize));
+ ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
mutex_exit(&msp->ms_lock);
}
@@ -4231,14 +4715,14 @@ next:
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
- int d, int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
+ int allocator, boolean_t try_hard)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
- dva, d, allocator);
+ dva, d, allocator, try_hard);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
@@ -4408,7 +4892,7 @@ top:
* allow any metaslab to be used (unique=false).
*/
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
- !try_hard, dva, d, allocator);
+ !try_hard, dva, d, allocator, try_hard);
if (offset != -1ULL) {
/*
@@ -4731,6 +5215,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
offset, size);
+ msp->ms_allocating_total -= size;
VERIFY(!msp->ms_condensing);
VERIFY3U(offset, >=, msp->ms_start);
@@ -4836,7 +5321,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
mutex_enter(&msp->ms_lock);
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
- error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM, txg);
+ error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
/*
* No need to fail in that case; someone else has activated the
* metaslab, but that doesn't preclude us from using it.
@@ -4862,10 +5347,20 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
range_tree_clear(msp->ms_trim, offset, size);
if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
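+		/*
+		 * Record this metaslab on its class's list of recently
+		 * selected metaslabs so metaslab_class_evict_old() can
+		 * consider it once it goes idle.
+		 */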
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ msp->ms_selected_txg = txg;
+ multilist_sublist_insert_head(mls, msp);
+ }
+ multilist_sublist_unlock(mls);
+
if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
vdev_dirty(vd, VDD_METASLAB, msp, txg);
range_tree_add(msp->ms_allocating[txg & TXG_MASK],
offset, size);
+ msp->ms_allocating_total += size;
}
mutex_exit(&msp->ms_lock);
@@ -5226,7 +5721,7 @@ metaslab_disable(metaslab_t *msp)
}
void
-metaslab_enable(metaslab_t *msp, boolean_t sync)
+metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
{
metaslab_group_t *mg = msp->ms_group;
spa_t *spa = mg->mg_vd->vdev_spa;
@@ -5244,6 +5739,8 @@ metaslab_enable(metaslab_t *msp, boolean_t sync)
if (--msp->ms_disabled == 0) {
mg->mg_ms_disabled--;
cv_broadcast(&mg->mg_ms_disabled_cv);
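+		/*
+		 * The caller may request that the metaslab be unloaded once
+		 * it is re-enabled, e.g. when it was loaded only on the
+		 * caller's behalf and should not stay cached.
+		 */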
+ if (unload)
+ metaslab_unload(msp);
}
mutex_exit(&msp->ms_lock);
mutex_exit(&mg->mg_ms_disabled_lock);
diff --git a/usr/src/uts/common/fs/zfs/range_tree.c b/usr/src/uts/common/fs/zfs/range_tree.c
index 0ce251126b..92726c3f71 100644
--- a/usr/src/uts/common/fs/zfs/range_tree.c
+++ b/usr/src/uts/common/fs/zfs/range_tree.c
@@ -525,6 +525,36 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
}
/*
+ * Returns the first subset of the given range which overlaps with the range
+ * tree. Returns true if there is a segment in the range, and false if there
+ * isn't.
+ */
+boolean_t
+range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
+ uint64_t *ostart, uint64_t *osize)
+{
+ range_seg_t rsearch;
+ rsearch.rs_start = start;
+ rsearch.rs_end = start + 1;
+
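+	/*
+	 * First look for a segment that contains 'start' itself; if one
+	 * exists, the overlap begins at 'start' and is capped at 'size'.
+	 */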
+ avl_index_t where;
+ range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where);
+ if (rs != NULL) {
+ *ostart = start;
+ *osize = MIN(size, rs->rs_end - start);
+ return (B_TRUE);
+ }
+
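+	/*
+	 * No segment contains 'start'; check the first segment beginning
+	 * after it. If that segment starts beyond 'start + size' there is
+	 * no overlap to report.
+	 */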
+ rs = avl_nearest(&rt->rt_root, where, AVL_AFTER);
+ if (rs == NULL || rs->rs_start > start + size)
+ return (B_FALSE);
+
+ *ostart = rs->rs_start;
+ *osize = MIN(start + size, rs->rs_end) - rs->rs_start;
+ return (B_TRUE);
+}
+
+/*
* Ensure that this range is not in the tree, regardless of whether
* it is currently in the tree.
*/
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index c213c860bd..054e773b3f 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -8618,6 +8618,10 @@ spa_sync(spa_t *spa, uint64_t txg)
while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
!= NULL)
vdev_sync_done(vd, txg);
+
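+	/*
+	 * Give each class a chance to unload metaslabs that have not been
+	 * selected for allocation recently.
+	 */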
+ metaslab_class_evict_old(spa->spa_normal_class, txg);
+ metaslab_class_evict_old(spa->spa_log_class, txg);
+
spa_sync_close_syncing_log_sm(spa);
spa_update_dspace(spa);
diff --git a/usr/src/uts/common/fs/zfs/spa_log_spacemap.c b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c
index bbb6eda845..e0c369d13c 100644
--- a/usr/src/uts/common/fs/zfs/spa_log_spacemap.c
+++ b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c
@@ -1191,7 +1191,8 @@ out:
metaslab_unflushed_changes_memused(m);
if (metaslab_debug_load && m->ms_sm != NULL) {
- VERIFY0(metaslab_load(m, spa_syncing_txg(spa)));
+ VERIFY0(metaslab_load(m));
+ metaslab_set_selected_txg(m, 0);
}
mutex_exit(&m->ms_lock);
}
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index f636d3dcf2..1ef3bb79ca 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -236,6 +236,7 @@ void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
+uint64_t arc_all_memory(void);
uint64_t arc_max_bytes(void);
void arc_init(void);
void arc_fini(void);
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
index 10705a84bc..069c5ab79a 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -56,7 +56,7 @@ uint64_t metaslab_estimated_condensed_size(metaslab_t *);
int metaslab_sort_by_flushed(const void *, const void *);
uint64_t metaslab_unflushed_changes_memused(metaslab_t *);
-int metaslab_load(metaslab_t *, uint64_t);
+int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);
boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);
@@ -65,7 +65,7 @@ uint64_t metaslab_allocated_space(metaslab_t *);
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
-uint64_t metaslab_block_maxsize(metaslab_t *);
+uint64_t metaslab_largest_allocatable(metaslab_t *);
/*
* metaslab alloc flags
@@ -107,7 +107,7 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
-
+void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
@@ -130,7 +130,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
void metaslab_disable(metaslab_t *);
-void metaslab_enable(metaslab_t *, boolean_t);
+void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
+void metaslab_set_selected_txg(metaslab_t *, uint64_t);
extern int metaslab_debug_load;
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
index 5920b3113c..a413eef490 100644
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -36,6 +36,7 @@
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>
+#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
@@ -194,6 +195,12 @@ struct metaslab_class {
uint64_t mc_space; /* total space (alloc + free) */
uint64_t mc_dspace; /* total deflated space */
uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ /*
+ * List of all loaded metaslabs in the class, sorted in order of most
+ * recent use.
+ */
+ multilist_t *mc_metaslab_txg_list;
};
/*
@@ -387,6 +394,7 @@ struct metaslab {
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
uint64_t ms_allocated_this_txg;
+ uint64_t ms_allocating_total;
/*
* The following range trees are accessed only from syncing context.
@@ -484,7 +492,13 @@ struct metaslab {
* stay cached.
*/
uint64_t ms_selected_txg;
- uint64_t ms_loaded_txg; /* track when metaslab was loaded */
+ /*
+ * ms_load/unload_time can be used for performance monitoring
+ * (e.g. by dtrace or mdb).
+ */
+ hrtime_t ms_load_time; /* time last loaded */
+ hrtime_t ms_unload_time; /* time last unloaded */
+ hrtime_t ms_selected_time; /* time last allocated from */
uint64_t ms_max_size; /* maximum allocatable size */
@@ -504,12 +518,17 @@ struct metaslab {
* segment sizes.
*/
avl_tree_t ms_allocatable_by_size;
+ avl_tree_t ms_unflushed_frees_by_size;
uint64_t ms_lbas[MAX_LBAS];
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
+ /*
+ * Node in metaslab class's selected txg list
+ */
+ multilist_node_t ms_class_txg_node;
/*
* Allocs and frees that are committed to the vdev log spacemap but
diff --git a/usr/src/uts/common/fs/zfs/sys/range_tree.h b/usr/src/uts/common/fs/zfs/sys/range_tree.h
index d450ff7f16..716aaf3b90 100644
--- a/usr/src/uts/common/fs/zfs/sys/range_tree.h
+++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h
@@ -88,6 +88,8 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
+ uint64_t *ostart, uint64_t *osize);
void range_tree_verify_not_present(range_tree_t *rt,
uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
diff --git a/usr/src/uts/common/fs/zfs/vdev_initialize.c b/usr/src/uts/common/fs/zfs/vdev_initialize.c
index af18983c44..2079df133c 100644
--- a/usr/src/uts/common/fs/zfs/vdev_initialize.c
+++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@@ -350,7 +350,7 @@ vdev_initialize_calculate_progress(vdev_t *vd)
* metaslab. Load it and walk the free tree for more accurate
* progress estimation.
*/
- VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa)));
+ VERIFY0(metaslab_load(msp));
for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
@@ -474,6 +474,7 @@ vdev_initialize_thread(void *arg)
for (uint64_t i = 0; !vd->vdev_detached &&
i < vd->vdev_top->vdev_ms_count; i++) {
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ boolean_t unload_when_done = B_FALSE;
/*
* If we've expanded the top-level vdev or it's our
@@ -487,14 +488,16 @@ vdev_initialize_thread(void *arg)
spa_config_exit(spa, SCL_CONFIG, FTAG);
metaslab_disable(msp);
mutex_enter(&msp->ms_lock);
- VERIFY0(metaslab_load(msp, spa_syncing_txg(spa)));
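+		/*
+		 * Remember whether the metaslab was loaded on our behalf so
+		 * that it can be unloaded again once initializing is done.
+		 */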
+ if (!msp->ms_loaded && !msp->ms_loading)
+ unload_when_done = B_TRUE;
+ VERIFY0(metaslab_load(msp));
range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
vd);
mutex_exit(&msp->ms_lock);
error = vdev_initialize_ranges(vd, deadbeef);
- metaslab_enable(msp, B_TRUE);
+ metaslab_enable(msp, B_TRUE, unload_when_done);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
diff --git a/usr/src/uts/common/fs/zfs/vdev_trim.c b/usr/src/uts/common/fs/zfs/vdev_trim.c
index a60d11814b..4be11bcb51 100644
--- a/usr/src/uts/common/fs/zfs/vdev_trim.c
+++ b/usr/src/uts/common/fs/zfs/vdev_trim.c
@@ -622,7 +622,7 @@ vdev_trim_calculate_progress(vdev_t *vd)
* metaslab. Load it and walk the free tree for more
* accurate progress estimation.
*/
- VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa)));
+ VERIFY0(metaslab_load(msp));
for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
@@ -730,7 +730,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
*/
if (zfs_flags & ZFS_DEBUG_TRIM) {
metaslab_t *msp = ta->trim_msp;
- VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa)));
+ VERIFY0(metaslab_load(msp));
VERIFY3B(msp->ms_loaded, ==, B_TRUE);
VERIFY(range_tree_find(msp->ms_allocatable, start, size));
}
@@ -842,7 +842,7 @@ vdev_trim_thread(void *arg)
spa_config_exit(spa, SCL_CONFIG, FTAG);
metaslab_disable(msp);
mutex_enter(&msp->ms_lock);
- VERIFY0(metaslab_load(msp, spa_syncing_txg(spa)));
+ VERIFY0(metaslab_load(msp));
/*
* If a partial TRIM was requested skip metaslabs which have
@@ -850,7 +850,7 @@ vdev_trim_thread(void *arg)
*/
if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
mutex_exit(&msp->ms_lock);
- metaslab_enable(msp, B_FALSE);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
vdev_trim_calculate_progress(vd);
continue;
@@ -862,7 +862,7 @@ vdev_trim_thread(void *arg)
mutex_exit(&msp->ms_lock);
error = vdev_trim_ranges(&ta);
- metaslab_enable(msp, B_TRUE);
+ metaslab_enable(msp, B_TRUE, B_FALSE);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
range_tree_vacate(ta.trim_tree, NULL, NULL);
@@ -1167,7 +1167,7 @@ vdev_autotrim_thread(void *arg)
if (msp->ms_sm == NULL ||
range_tree_is_empty(msp->ms_trim)) {
mutex_exit(&msp->ms_lock);
- metaslab_enable(msp, B_FALSE);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
continue;
}
@@ -1183,7 +1183,7 @@ vdev_autotrim_thread(void *arg)
*/
if (msp->ms_disabled > 1) {
mutex_exit(&msp->ms_lock);
- metaslab_enable(msp, B_FALSE);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
continue;
}
@@ -1291,8 +1291,7 @@ vdev_autotrim_thread(void *arg)
*/
if (zfs_flags & ZFS_DEBUG_TRIM) {
mutex_enter(&msp->ms_lock);
- VERIFY0(metaslab_load(msp,
- spa_syncing_txg(spa)));
+ VERIFY0(metaslab_load(msp));
VERIFY3P(tap[0].trim_msp, ==, msp);
range_tree_walk(trim_tree,
vdev_trim_range_verify, &tap[0]);
@@ -1302,7 +1301,7 @@ vdev_autotrim_thread(void *arg)
range_tree_vacate(trim_tree, NULL, NULL);
range_tree_destroy(trim_tree);
- metaslab_enable(msp, issued_trim);
+ metaslab_enable(msp, issued_trim, B_FALSE);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
for (uint64_t c = 0; c < children; c++) {
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 72e18d5305..b24d83496c 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -5706,7 +5706,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc)
objset_t *os = NULL;
int error;
- error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error != 0)
return (error);
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index 3697d888e7..656c598e53 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -1716,10 +1716,8 @@ mac_srs_create_proto_softrings(int id, uint16_t type, pri_t pri,
bzero(&mrf, sizeof (mac_rx_fifo_t));
mrf.mrf_type = MAC_RX_FIFO;
mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
- mrf.mrf_intr_enable =
- (mac_intr_enable_t)mac_soft_ring_intr_enable;
- mrf.mrf_intr_disable =
- (mac_intr_disable_t)mac_soft_ring_intr_disable;
+ mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable;
+ mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable;
mrf.mrf_flow_priority = pri;
softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c
index 4655631dc1..c8a16e6fd3 100644
--- a/usr/src/uts/common/io/mac/mac_soft_ring.c
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c
@@ -494,7 +494,7 @@ done:
 * Enabling allows the processing thread to send packets to the
* client while disabling does the opposite.
*/
-void
+int
mac_soft_ring_intr_enable(void *arg)
{
mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
@@ -503,6 +503,7 @@ mac_soft_ring_intr_enable(void *arg)
if (ringp->s_ring_first != NULL)
mac_soft_ring_worker_wakeup(ringp);
mutex_exit(&ringp->s_ring_lock);
+ return (0);
}
boolean_t
diff --git a/usr/src/uts/common/io/usb/usba/usbai.c b/usr/src/uts/common/io/usb/usba/usbai.c
index f6ac391bd8..e1a6b4dfcd 100644
--- a/usr/src/uts/common/io/usb/usba/usbai.c
+++ b/usr/src/uts/common/io/usb/usba/usbai.c
@@ -1040,7 +1040,7 @@ usb_register_hotplug_cbs(dev_info_t *dip,
}
}
if (ddi_add_event_handler(dip, usba_device->rm_cookie,
- (peh_t)disconnect_event_handler,
+ (peh_t)(uintptr_t)disconnect_event_handler,
NULL, &evdata->ev_rm_cb_id) != DDI_SUCCESS) {
USB_DPRINTF_L2(DPRINT_MASK_USBAI, usbai_log_handle,
"usb_register_hotplug_cbs: add disconnect handler failed");
@@ -1058,7 +1058,7 @@ usb_register_hotplug_cbs(dev_info_t *dip,
}
}
if (ddi_add_event_handler(dip, usba_device->ins_cookie,
- (peh_t)reconnect_event_handler,
+ (peh_t)(uintptr_t)reconnect_event_handler,
NULL, &evdata->ev_ins_cb_id) != DDI_SUCCESS) {
USB_DPRINTF_L2(DPRINT_MASK_USBAI, usbai_log_handle,
"usb_register_hotplug_cbs: add reconnect handler failed");
@@ -1129,7 +1129,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata,
}
}
if (ddi_add_event_handler(dip, usba_device->rm_cookie,
- (peh_t)usb_evdata->disconnect_event_handler,
+ (peh_t)(uintptr_t)usb_evdata->disconnect_event_handler,
NULL, &evdata->ev_rm_cb_id) != DDI_SUCCESS) {
goto fail;
@@ -1144,7 +1144,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata,
}
}
if (ddi_add_event_handler(dip, usba_device->ins_cookie,
- (peh_t)usb_evdata->reconnect_event_handler,
+ (peh_t)(uintptr_t)usb_evdata->reconnect_event_handler,
NULL, &evdata->ev_ins_cb_id) != DDI_SUCCESS) {
goto fail;
@@ -1159,7 +1159,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata,
}
}
if (ddi_add_event_handler(dip, usba_device->resume_cookie,
- (peh_t)usb_evdata->post_resume_event_handler,
+ (peh_t)(uintptr_t)usb_evdata->post_resume_event_handler,
NULL, &evdata->ev_resume_cb_id) != DDI_SUCCESS) {
goto fail;
@@ -1174,7 +1174,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata,
}
}
if (ddi_add_event_handler(dip, usba_device->suspend_cookie,
- (peh_t)usb_evdata->pre_suspend_event_handler,
+ (peh_t)(uintptr_t)usb_evdata->pre_suspend_event_handler,
NULL, &evdata->ev_suspend_cb_id) != DDI_SUCCESS) {
goto fail;
diff --git a/usr/src/uts/common/sys/mac_soft_ring.h b/usr/src/uts/common/sys/mac_soft_ring.h
index 581e18d06e..5a41899e60 100644
--- a/usr/src/uts/common/sys/mac_soft_ring.h
+++ b/usr/src/uts/common/sys/mac_soft_ring.h
@@ -691,7 +691,7 @@ extern void mac_srs_update_drv(struct mac_client_impl_s *);
extern void mac_update_srs_priority(mac_soft_ring_set_t *, pri_t);
extern void mac_client_update_classifier(mac_client_impl_t *, boolean_t);
-extern void mac_soft_ring_intr_enable(void *);
+extern int mac_soft_ring_intr_enable(void *);
extern boolean_t mac_soft_ring_intr_disable(void *);
extern mac_soft_ring_t *mac_soft_ring_create(int, clock_t, uint16_t,
pri_t, mac_client_impl_t *, mac_soft_ring_set_t *,
diff --git a/usr/src/uts/common/xen/io/evtchn_dev.c b/usr/src/uts/common/xen/io/evtchn_dev.c
index b4ba63b436..7a8d50eb33 100644
--- a/usr/src/uts/common/xen/io/evtchn_dev.c
+++ b/usr/src/uts/common/xen/io/evtchn_dev.c
@@ -112,8 +112,8 @@ static int evtchndrv_detach(dev_info_t *, ddi_detach_cmd_t);
static struct evtsoftdata *port_user[NR_EVENT_CHANNELS];
static kmutex_t port_user_lock;
-void
-evtchn_device_upcall()
+uint_t
+evtchn_device_upcall(caddr_t arg __unused, caddr_t arg1 __unused)
{
struct evtsoftdata *ep;
int port;
@@ -154,6 +154,7 @@ evtchn_device_upcall()
done:
mutex_exit(&port_user_lock);
+ return (DDI_INTR_CLAIMED);
}
/* ARGSUSED */
diff --git a/usr/src/uts/i86pc/io/immu_intrmap.c b/usr/src/uts/i86pc/io/immu_intrmap.c
index ab9f9bcbe7..737eed2efa 100644
--- a/usr/src/uts/i86pc/io/immu_intrmap.c
+++ b/usr/src/uts/i86pc/io/immu_intrmap.c
@@ -63,7 +63,7 @@ typedef struct intrmap_rte {
(p))
typedef enum {
- SVT_NO_VERIFY = 0, /* no verification */
+ SVT_NO_VERIFY = 0, /* no verification */
SVT_ALL_VERIFY, /* using sid and sq to verify */
SVT_BUS_VERIFY, /* verify #startbus and #endbus */
SVT_RSVD
@@ -224,7 +224,7 @@ bitset_find_multi_free(bitset_t *b, uint_t post, uint_t count)
}
}
- return (INTRMAP_IDX_FULL); /* no free index */
+ return (INTRMAP_IDX_FULL); /* no free index */
}
/* alloc one interrupt remapping table entry */
@@ -495,11 +495,12 @@ intrmap_enable(immu_t *immu)
/*
* immu_intr_handler()
- * the fault event handler for a single immu unit
+ * the fault event handler for a single immu unit
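+ *	called in the uint_t (caddr_t, caddr_t) form used by add_avintr(),
+ *	with the immu_t recovered from the first argument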
*/
-int
-immu_intr_handler(immu_t *immu)
+uint_t
+immu_intr_handler(caddr_t arg, caddr_t arg1 __unused)
{
+ immu_t *immu = (immu_t *)arg;
uint32_t status;
int index, fault_reg_offset;
int max_fault_index;
@@ -995,10 +996,10 @@ immu_intr_register(immu_t *immu)
"%s-intr-handler", immu->immu_name);
(void) add_avintr((void *)NULL, IMMU_INTR_IPL,
- (avfunc)(immu_intr_handler), intr_handler_name, irq,
+ immu_intr_handler, intr_handler_name, irq,
(caddr_t)immu, NULL, NULL, NULL);
immu_regs_intr_enable(immu, msi_addr, msi_data, uaddr);
- (void) immu_intr_handler(immu);
+ (void) immu_intr_handler((caddr_t)immu, NULL);
}
diff --git a/usr/src/uts/i86pc/io/immu_regs.c b/usr/src/uts/i86pc/io/immu_regs.c
index dc43b0f49a..d6b184416a 100644
--- a/usr/src/uts/i86pc/io/immu_regs.c
+++ b/usr/src/uts/i86pc/io/immu_regs.c
@@ -253,7 +253,7 @@ gaw2agaw(int gaw)
/*
* set_immu_agaw()
- * calculate agaw for a IOMMU unit
+ *	calculate agaw for an IOMMU unit
*/
static int
set_agaw(immu_t *immu)
@@ -481,7 +481,7 @@ immu_regs_resume(immu_t *immu)
immu_regs_intr_enable(immu, immu->immu_regs_intr_msi_addr,
immu->immu_regs_intr_msi_data, immu->immu_regs_intr_uaddr);
- (void) immu_intr_handler(immu);
+ (void) immu_intr_handler((caddr_t)immu, NULL);
immu_regs_intrmap_enable(immu, immu->immu_intrmap_irta_reg);
@@ -638,7 +638,7 @@ immu_regs_wbf_flush(immu_t *immu)
/*
* immu_regs_cpu_flush()
- * flush the cpu cache line after CPU memory writes, so
+ * flush the cpu cache line after CPU memory writes, so
* IOMMU can see the writes
*/
void
diff --git a/usr/src/uts/i86pc/sys/immu.h b/usr/src/uts/i86pc/sys/immu.h
index 70193d26e6..22ae9ad3bf 100644
--- a/usr/src/uts/i86pc/sys/immu.h
+++ b/usr/src/uts/i86pc/sys/immu.h
@@ -130,11 +130,11 @@ typedef struct drhd {
kmutex_t dr_lock; /* protects the dmar field */
struct immu *dr_immu;
dev_info_t *dr_dip;
- uint16_t dr_seg;
- uint64_t dr_regs;
+ uint16_t dr_seg;
+ uint64_t dr_regs;
boolean_t dr_include_all;
- list_t dr_scope_list;
- list_node_t dr_node;
+ list_t dr_scope_list;
+ list_node_t dr_node;
} drhd_t;
typedef struct rmrr {
@@ -638,7 +638,7 @@ typedef struct immu {
* Enough space to hold the decimal number of any device instance.
* Used for device/cache names.
*/
-#define IMMU_ISTRLEN 11 /* log10(2^31) + 1 */
+#define IMMU_ISTRLEN 11 /* log10(2^31) + 1 */
/* properties that control DVMA */
#define DDI_DVMA_MAPTYPE_ROOTNEX_PROP "immu-dvma-mapping"
@@ -677,7 +677,7 @@ typedef struct domain {
/* list node for list of domains off immu */
list_node_t dom_immu_node;
- mod_hash_t *dom_cookie_hash;
+ mod_hash_t *dom_cookie_hash;
/* topmost device in domain; usually the device itself (non-shared) */
dev_info_t *dom_dip;
@@ -944,7 +944,7 @@ void immu_intrmap_destroy(list_t *immu_list);
/* registers interrupt handler for IOMMU unit */
void immu_intr_register(immu_t *immu);
-int immu_intr_handler(immu_t *immu);
+uint_t immu_intr_handler(caddr_t, caddr_t);
/* immu_qinv.c interfaces */
diff --git a/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c b/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c
index 8bc46f8e3e..a7745fd3f2 100644
--- a/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c
+++ b/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c
@@ -74,7 +74,7 @@ static gcpu_poll_trace_ctl_t gcpu_xpv_poll_trace_ctl;
#define GCPU_XPV_MCH_POLL_NO_REARM NULL
static uint_t
-gcpu_xpv_virq_intr(void)
+gcpu_xpv_virq_intr(caddr_t arg __unused, caddr_t arg1 __unused)
{
int types[] = { XEN_MC_URGENT, XEN_MC_NONURGENT };
uint64_t fetch_id;
@@ -194,7 +194,7 @@ gcpu_mca_poll_start(cmi_hdl_t hdl)
*/
gcpu_xpv_virq_vect = ec_bind_virq_to_irq(VIRQ_MCA, 0);
(void) add_avintr(NULL, gcpu_xpv_virq_level,
- (avfunc)gcpu_xpv_virq_intr, "MCA", gcpu_xpv_virq_vect,
+ gcpu_xpv_virq_intr, "MCA", gcpu_xpv_virq_vect,
NULL, NULL, NULL, NULL);
}
}
diff --git a/usr/src/uts/i86xpv/io/psm/xpv_psm.c b/usr/src/uts/i86xpv/io/psm/xpv_psm.c
index bc0ab7748d..94308c3f2f 100644
--- a/usr/src/uts/i86xpv/io/psm/xpv_psm.c
+++ b/usr/src/uts/i86xpv/io/psm/xpv_psm.c
@@ -223,14 +223,13 @@ xen_psm_hrtimeinit(void)
}
/* xen_psm NMI handler */
-/*ARGSUSED*/
-static void
-xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
+static uint_t
+xen_psm_nmi_intr(caddr_t arg __unused, caddr_t arg1 __unused)
{
xen_psm_num_nmis++;
if (!lock_try(&xen_psm_nmi_lock))
- return;
+ return (DDI_INTR_UNCLAIMED);
if (xen_psm_kmdb_on_nmi && psm_debugger()) {
debug_enter("NMI received: entering kmdb\n");
@@ -247,6 +246,7 @@ xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
}
lock_clear(&xen_psm_nmi_lock);
+ return (DDI_INTR_CLAIMED);
}
static void
@@ -294,7 +294,7 @@ xen_psm_picinit()
/* add nmi handler - least priority nmi handler */
LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
- if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr,
+ if (!psm_add_nmintr(0, xen_psm_nmi_intr,
"xVM_psm NMI handler", (caddr_t)NULL))
cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
}