author     Ryan Zezeski <rpz@joyent.com>    2019-11-14 09:39:53 -0700
committer  Ryan Zezeski <rpz@joyent.com>    2019-11-14 09:39:53 -0700
commit     074bf480b3d9701c3c55056fe6105028504135b6 (patch)
tree       9bd8568e7caa13fc1b13146260ba82af1827ebbc
parent     27bc3ef3b6dd5a071a0607d96af5eec24ca5d276 (diff)
parent     43ef85afe5649116d876156ca6eb797e144c9795 (diff)
download   illumos-joyent-cr6990-OS-8027.tar.gz
Merge remote-tracking branch 'origin/master' into cr6990-OS-8027
27 files changed, 1066 insertions, 328 deletions
diff --git a/usr/src/boot/Makefile.version b/usr/src/boot/Makefile.version index 9d40ee8993..a161b24487 100644 --- a/usr/src/boot/Makefile.version +++ b/usr/src/boot/Makefile.version @@ -33,4 +33,4 @@ LOADER_VERSION = 1.1 # Use date like formatting here, YYYY.MM.DD.XX, without leading zeroes. # The version is processed from left to right, the version number can only # be increased. -BOOT_VERSION = $(LOADER_VERSION)-2019.11.04.1 +BOOT_VERSION = $(LOADER_VERSION)-2019.11.05.1 diff --git a/usr/src/boot/lib/libstand/zfs/zfsimpl.c b/usr/src/boot/lib/libstand/zfs/zfsimpl.c index e595273c9b..fba9f1fc59 100644 --- a/usr/src/boot/lib/libstand/zfs/zfsimpl.c +++ b/usr/src/boot/lib/libstand/zfs/zfsimpl.c @@ -1534,71 +1534,104 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offset) } static int -vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) +{ + unsigned int seq1 = 0; + unsigned int seq2 = 0; + int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + + if (cmp != 0) + return (cmp); + + cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); + if (cmp != 0) + return (cmp); + + if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) + seq1 = MMP_SEQ(ub1); + + if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) + seq2 = MMP_SEQ(ub2); + + return (AVL_CMP(seq1, seq2)); +} + +static int +uberblock_verify(uberblock_t *ub) +{ + if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) { + byteswap_uint64_array(ub, sizeof (uberblock_t)); + } + + if (ub->ub_magic != UBERBLOCK_MAGIC || + !SPA_VERSION_IS_SUPPORTED(ub->ub_version)) + return (EINVAL); + + return (0); +} + +static int +vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset, + size_t size) { - vdev_t vtmp; - vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; - vdev_phys_t *tmp_label; - spa_t *spa; - vdev_t *vdev, *top_vdev, *pool_vdev; - off_t off; blkptr_t bp; - const unsigned char *nvlist = NULL; - uint64_t val; - uint64_t guid; - uint64_t best_txg = 0; - uint64_t pool_txg, pool_guid; - const char *pool_name; - const unsigned char *vdevs; - const unsigned char *features; - int i, l, rc, is_newer; - char *upbuf; - const struct uberblock *up; + off_t off; - /* - * Load the vdev label and figure out which - * uberblock is most current. - */ - memset(&vtmp, 0, sizeof(vtmp)); - vtmp.v_phys_read = phys_read; - vtmp.v_read_priv = read_priv; - vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv), - (uint64_t)sizeof (vdev_label_t)); + off = vdev_label_offset(vd->v_psize, l, offset); - /* Test for minimum device size. 
*/ - if (vtmp.v_psize < SPA_MINDEVSIZE) - return (EIO); + BP_ZERO(&bp); + BP_SET_LSIZE(&bp, size); + BP_SET_PSIZE(&bp, size); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + DVA_SET_OFFSET(BP_IDENTITY(&bp), off); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); - tmp_label = zfs_alloc(sizeof (vdev_phys_t)); + return (vdev_read_phys(vd, &bp, buf, off, size)); +} - for (l = 0; l < VDEV_LABELS; l++) { - off = vdev_label_offset(vtmp.v_psize, l, - offsetof(vdev_label_t, vl_vdev_phys)); +static unsigned char * +vdev_label_read_config(vdev_t *vd, uint64_t txg) +{ + vdev_phys_t *label; + uint64_t best_txg = 0; + uint64_t label_txg = 0; + uint64_t asize; + unsigned char *nvl; + size_t nvl_size; + int error; + + label = malloc(sizeof (vdev_phys_t)); + if (label == NULL) + return (NULL); - BP_ZERO(&bp); - BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); - BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - DVA_SET_OFFSET(BP_IDENTITY(&bp), off); - ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4; + nvl = malloc(nvl_size); + if (nvl == NULL) + goto done; - if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0)) - continue; + for (int l = 0; l < VDEV_LABELS; l++) { + const unsigned char *nvlist; - if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR) + if (vdev_label_read(vd, l, label, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t))) continue; - nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4; - if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, - DATA_TYPE_UINT64, NULL, &pool_txg) != 0) + if (label->vp_nvlist[0] != NV_ENCODE_XDR) continue; - if (best_txg <= pool_txg) { - uint64_t asize; + nvlist = (const unsigned char *) label->vp_nvlist + 4; + error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, + DATA_TYPE_UINT64, NULL, &label_txg); + if (error != 0 || label_txg == 0) { + memcpy(nvl, nvlist, nvl_size); + goto done; + } - best_txg = pool_txg; - memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t)); + if (label_txg <= txg && label_txg > best_txg) { + best_txg = label_txg; + memcpy(nvl, nvlist, nvl_size); /* * Use asize from pool config. 
We need this @@ -1606,30 +1639,89 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) */ if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64, NULL, &asize) == 0) { - vtmp.v_psize = asize + + vd->v_psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; } } } - zfs_free(tmp_label, sizeof (vdev_phys_t)); + if (best_txg == 0) { + free(nvl); + nvl = NULL; + } +done: + free(label); + return (nvl); +} + +static void +vdev_uberblock_load(vdev_t *vd, uberblock_t *ub) +{ + uberblock_t *buf; + + buf = malloc(VDEV_UBERBLOCK_SIZE(vd)); + if (buf == NULL) + return; + + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + if (vdev_label_read(vd, l, buf, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd))) + continue; + if (uberblock_verify(buf) != 0) + continue; + + if (vdev_uberblock_compare(buf, ub) > 0) + *ub = *buf; + } + } + free(buf); +} + +static int +vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) +{ + vdev_t vtmp; + spa_t *spa; + vdev_t *vdev, *top_vdev, *pool_vdev; + unsigned char *nvlist; + uint64_t val; + uint64_t guid; + uint64_t pool_txg, pool_guid; + const char *pool_name; + const unsigned char *vdevs; + const unsigned char *features; + int rc, is_newer; - if (best_txg == 0) - return (EIO); + /* + * Load the vdev label and figure out which + * uberblock is most current. + */ + memset(&vtmp, 0, sizeof (vtmp)); + vtmp.v_phys_read = phys_read; + vtmp.v_read_priv = read_priv; + vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv), + (uint64_t)sizeof (vdev_label_t)); - if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) + /* Test for minimum device size. */ + if (vtmp.v_psize < SPA_MINDEVSIZE) return (EIO); - nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4; + nvlist = vdev_label_read_config(&vtmp, UINT64_MAX); + if (nvlist == NULL) + return (EIO); if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64, NULL, &val) != 0) { + free(nvlist); return (EIO); } if (!SPA_VERSION_IS_SUPPORTED(val)) { printf("ZFS: unsupported ZFS version %u (should be %u)\n", (unsigned) val, (unsigned) SPA_VERSION); + free(nvlist); return (EIO); } @@ -1637,16 +1729,19 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ, DATA_TYPE_NVLIST, NULL, &features) == 0 && nvlist_check_features_for_read(features) != 0) { + free(nvlist); return (EIO); } if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64, NULL, &val) != 0) { + free(nvlist); return (EIO); } if (val == POOL_STATE_DESTROYED) { /* We don't boot only from destroyed pools. */ + free(nvlist); return (EIO); } @@ -1660,12 +1755,13 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) * Cache and spare devices end up here - just ignore * them. 
*/ - /*printf("ZFS: can't find pool details\n");*/ + free(nvlist); return (EIO); } if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL, &val) == 0 && val != 0) { + free(nvlist); return (EIO); } @@ -1675,8 +1771,10 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) spa = spa_find_by_guid(pool_guid); if (spa == NULL) { spa = spa_create(pool_guid, pool_name); - if (spa == NULL) + if (spa == NULL) { + free(nvlist); return (ENOMEM); + } } if (pool_txg > spa->spa_txg) { spa->spa_txg = pool_txg; @@ -1693,18 +1791,24 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) */ if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, NULL, &guid) != 0) { + free(nvlist); return (EIO); } vdev = vdev_find(guid); - if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */ + /* Has this vdev already been inited? */ + if (vdev && vdev->v_phys_read) { + free(nvlist); return (EIO); + } if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, NULL, &vdevs)) { + free(nvlist); return (EIO); } rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer); + free(nvlist); if (rc != 0) return (rc); @@ -1714,6 +1818,7 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink) if (top_vdev == pool_vdev) break; + if (!pool_vdev && top_vdev) { top_vdev->spa = spa; STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink); @@ -1748,36 +1853,7 @@ vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap) * the best uberblock and then we can actually access * the contents of the pool. */ - upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev)); - up = (const struct uberblock *)upbuf; - for (l = 0; l < VDEV_LABELS; l++) { - for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) { - off = vdev_label_offset(vdev->v_psize, l, - VDEV_UBERBLOCK_OFFSET(vdev, i)); - BP_ZERO(&bp); - DVA_SET_OFFSET(&bp.blk_dva[0], off); - BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); - BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); - - if (vdev_read_phys(vdev, &bp, upbuf, off, 0) != 0) - continue; - - if (up->ub_magic != UBERBLOCK_MAGIC) - continue; - if (up->ub_txg < spa->spa_txg) - continue; - if (up->ub_txg > spa->spa_uberblock.ub_txg || - (up->ub_txg == spa->spa_uberblock.ub_txg && - up->ub_timestamp > - spa->spa_uberblock.ub_timestamp)) { - spa->spa_uberblock = *up; - } - } - } - zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev)); + vdev_uberblock_load(vdev, &spa->spa_uberblock); vdev->spa = spa; if (spap != NULL) diff --git a/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h b/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h index 2a71fcb067..8f45983761 100644 --- a/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h +++ b/usr/src/boot/sys/cddl/boot/zfs/zfsimpl.h @@ -66,6 +66,14 @@ #define _NOTE(s) +/* + * AVL comparator helpers + */ +#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define AVL_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ @@ -492,8 +500,16 @@ typedef struct zio_gbh { #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) +/* + * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock + * ring when MMP is enabled. 
+ */ +#define MMP_BLOCKS_PER_LABEL 1 + +/* The largest uberblock we support is 8k. */ +#define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ - MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT) + MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ @@ -843,15 +859,88 @@ typedef enum pool_state { */ #define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ #define UBERBLOCK_SHIFT 10 /* up to 1K */ - -struct uberblock { +#define MMP_MAGIC 0xa11cea11 /* all-see-all */ + +#define MMP_INTERVAL_VALID_BIT 0x01 +#define MMP_SEQ_VALID_BIT 0x02 +#define MMP_FAIL_INT_VALID_BIT 0x04 + +#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ + ubp->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_INTERVAL_VALID_BIT)) +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_SEQ_VALID_BIT)) +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_FAIL_INT_VALID_BIT)) + +#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ + >> 8) +#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ + >> 32) +#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ + >> 48) + +typedef struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ uint64_t ub_version; /* SPA_VERSION */ uint64_t ub_txg; /* txg of last sync */ uint64_t ub_guid_sum; /* sum of all vdev guids */ uint64_t ub_timestamp; /* UTC time of last sync */ blkptr_t ub_rootbp; /* MOS objset_phys_t */ -}; + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; + /* Maybe missing in uberblocks we read, but always written */ + uint64_t ub_mmp_magic; + /* + * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off. + * Otherwise, nanosec since last MMP write. + */ + uint64_t ub_mmp_delay; + + /* + * The ub_mmp_config contains the multihost write interval, multihost + * fail intervals, sequence number for sub-second granularity, and + * valid bit mask. This layout is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * This allows a write_interval of (2^24/1000)s, over 4.5 hours + * + * VALID Bits: + * - 0x01 - Write Interval (ms) + * - 0x02 - Sequence number exists + * - 0x04 - Fail Intervals + * - 0xf8 - Reserved + */ + uint64_t ub_mmp_config; + + /* + * ub_checkpoint_txg indicates two things about the current uberblock: + * + * 1] If it is not zero then this uberblock is a checkpoint. If it is + * zero, then this uberblock is not a checkpoint. + * + * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is + * the ub_txg that the uberblock had at the time we moved it to + * the MOS config. + * + * The field is set when we checkpoint the uberblock and continues to + * hold that value even after we've rewound (unlike the ub_txg that + * is reset to a higher value). + * + * Besides checks used to determine whether we are reopening the + * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], + * the value of the field is used to determine which ZIL blocks have + * been allocated according to the ms_sm when we are rewinding to a + * checkpoint. 
Specifically, if blk_birth > ub_checkpoint_txg, then + * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. + */ + uint64_t ub_checkpoint_txg; +} uberblock_t; /* * Flags. diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index 2c32e1a191..7cc12ccf0a 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -3098,25 +3098,25 @@ reference_cb(uintptr_t addr, const void *ignored, void *arg) return (WALK_NEXT); } -typedef struct mdb_refcount { +typedef struct mdb_zfs_refcount { uint64_t rc_count; -} mdb_refcount_t; +} mdb_zfs_refcount_t; -typedef struct mdb_refcount_removed { +typedef struct mdb_zfs_refcount_removed { uint64_t rc_removed_count; -} mdb_refcount_removed_t; +} mdb_zfs_refcount_removed_t; -typedef struct mdb_refcount_tracked { +typedef struct mdb_zfs_refcount_tracked { boolean_t rc_tracked; -} mdb_refcount_tracked_t; +} mdb_zfs_refcount_tracked_t; /* ARGSUSED */ static int -refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +zfs_refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - mdb_refcount_t rc; - mdb_refcount_removed_t rcr; - mdb_refcount_tracked_t rct; + mdb_zfs_refcount_t rc; + mdb_zfs_refcount_removed_t rcr; + mdb_zfs_refcount_tracked_t rct; int off; boolean_t released = B_FALSE; @@ -3128,30 +3128,30 @@ refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) NULL) != argc) return (DCMD_USAGE); - if (mdb_ctf_vread(&rc, "refcount_t", "mdb_refcount_t", addr, + if (mdb_ctf_vread(&rc, "zfs_refcount_t", "mdb_zfs_refcount_t", addr, 0) == -1) return (DCMD_ERR); - if (mdb_ctf_vread(&rcr, "refcount_t", "mdb_refcount_removed_t", addr, - MDB_CTF_VREAD_QUIET) == -1) { - mdb_printf("refcount_t at %p has %llu holds (untracked)\n", + if (mdb_ctf_vread(&rcr, "zfs_refcount_t", "mdb_zfs_refcount_removed_t", + addr, MDB_CTF_VREAD_QUIET) == -1) { + mdb_printf("zfs_refcount_t at %p has %llu holds (untracked)\n", addr, (longlong_t)rc.rc_count); return (DCMD_OK); } - if (mdb_ctf_vread(&rct, "refcount_t", "mdb_refcount_tracked_t", addr, - MDB_CTF_VREAD_QUIET) == -1) { + if (mdb_ctf_vread(&rct, "zfs_refcount_t", "mdb_zfs_refcount_tracked_t", + addr, MDB_CTF_VREAD_QUIET) == -1) { /* If this is an old target, it might be tracked. 
*/ rct.rc_tracked = B_TRUE; } - mdb_printf("refcount_t at %p has %llu current holds, " + mdb_printf("zfs_refcount_t at %p has %llu current holds, " "%llu recently released holds\n", addr, (longlong_t)rc.rc_count, (longlong_t)rcr.rc_removed_count); if (rct.rc_tracked && rc.rc_count > 0) mdb_printf("current holds:\n"); - off = mdb_ctf_offsetof_by_name("refcount_t", "rc_list"); + off = mdb_ctf_offsetof_by_name("zfs_refcount_t", "rc_list"); if (off == -1) return (DCMD_ERR); mdb_pwalk("list", reference_cb, (void*)B_FALSE, addr + off); @@ -3159,7 +3159,7 @@ refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (released && rcr.rc_removed_count > 0) { mdb_printf("released holds:\n"); - off = mdb_ctf_offsetof_by_name("refcount_t", "rc_removed"); + off = mdb_ctf_offsetof_by_name("zfs_refcount_t", "rc_removed"); if (off == -1) return (DCMD_ERR); mdb_pwalk("list", reference_cb, (void*)B_TRUE, addr + off); @@ -3797,12 +3797,12 @@ rrwlock(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } mdb_printf("anonymous references:\n"); - (void) mdb_call_dcmd("refcount", addr + + (void) mdb_call_dcmd("zfs_refcount", addr + mdb_ctf_offsetof_by_name(ZFS_STRUCT "rrwlock", "rr_anon_rcount"), DCMD_ADDRSPEC, 0, NULL); mdb_printf("linked references:\n"); - (void) mdb_call_dcmd("refcount", addr + + (void) mdb_call_dcmd("zfs_refcount", addr + mdb_ctf_offsetof_by_name(ZFS_STRUCT "rrwlock", "rr_linked_rcount"), DCMD_ADDRSPEC, 0, NULL); @@ -4345,9 +4345,9 @@ static const mdb_dcmd_t dcmds[] = { "given a spa_t, print block type stats from last scrub", zfs_blkstats }, { "zfs_params", "", "print zfs tunable parameters", zfs_params }, - { "refcount", ":[-r]\n" + { "zfs_refcount", ":[-r]\n" "\t-r display recently removed references", - "print refcount_t holders", refcount }, + "print zfs_refcount_t holders", zfs_refcount }, { "zap_leaf", "", "print zap_leaf_phys_t", zap_leaf }, { "zfs_aces", ":[-v]", "print all ACEs from a zfs_acl_t", zfs_acl_dump }, diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 13fd33522a..61cfd74df3 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. @@ -901,7 +901,7 @@ dump_metaslab_stats(metaslab_t *msp) /* max sure nicenum has enough space */ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); - zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf)); + zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, @@ -928,7 +928,7 @@ dump_metaslab(metaslab_t *msp) if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp, 0)); + VERIFY0(metaslab_load(msp)); range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); metaslab_unload(msp); diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index eb574105a7..654b62db6a 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright (c) 2019, Joyent, Inc. 
* Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. @@ -296,6 +296,7 @@ #include <zfs_fletcher.h> #include <sys/aggsum.h> #include <sys/cityhash.h> +#include <sys/param.h> #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ @@ -1268,6 +1269,20 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); +/* + * The arc_all_memory function is a ZoL enhancement that lives in their OSL + * code. In user-space code, which is used primarily for testing, we return + * half of all memory. + */ +uint64_t +arc_all_memory(void) +{ +#ifdef _KERNEL + return (ptob(physmem)); +#else + return ((sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES)) / 2); +#endif +} /* * We use Cityhash for this. It's fast, and has good hash properties without diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index b950ed26d6..bc6b45ec7f 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -199,28 +199,20 @@ uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; int metaslab_load_pct = 50; /* - * Determines how many txgs a metaslab may remain loaded without having any - * allocations from it. As long as a metaslab continues to be used we will - * keep it loaded. + * These tunables control how long a metaslab will remain loaded after the + * last allocation from it. A metaslab can't be unloaded until at least + * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds + * have elapsed. However, zfs_metaslab_mem_limit may cause it to be + * unloaded sooner. These settings are intended to be generous -- to keep + * metaslabs loaded for a long time, reducing the rate of metaslab loading. */ -int metaslab_unload_delay = TXG_SIZE * 2; - -/* - * Tunables used to reduce metaslab load/unload thrashing when selection - * algorithm is allocating across metaslabs very evenly. In addition to - * tracking when the slab was used for allocation (ms_selected_txg), we also - * track when it was loaded (ms_loaded_txg). If the slab would be unloaded, - * but the load txg is within the window of - * metaslab_unload_delay + metaslab_load_window - * then we ramp up metaslab_unload_delay instead of unloading the metaslab. - */ -int metaslab_load_window = 10; -int metaslab_unload_delay_max = 256; +int metaslab_unload_delay = 32; +int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ /* * Max number of metaslabs per group to preload. */ -int metaslab_preload_limit = SPA_DVAS_PER_BP; +int metaslab_preload_limit = 10; /* * Enable/disable preloading of metaslab. @@ -281,6 +273,19 @@ uint64_t metaslab_trace_max_entries = 5000; */ int max_disabled_ms = 3; +/* + * Maximum percentage of memory to use on storing loaded metaslabs. If loading + * a metaslab would take it over this percentage, the oldest selected metaslab + * is automatically unloaded. + */ +int zfs_metaslab_mem_limit = 25; + +/* + * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. + * To avoid 64-bit overflow, don't set above UINT32_MAX. 
+ */ +unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ + static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -288,6 +293,8 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); +static unsigned int metaslab_idx_func(multilist_t *, void *); +static void metaslab_evict(metaslab_t *, uint64_t); kmem_cache_t *metaslab_alloc_trace_cache; @@ -307,6 +314,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); + mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t), + offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * sizeof (zfs_refcount_t), KM_SLEEP); mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * @@ -333,6 +342,7 @@ metaslab_class_destroy(metaslab_class_t *mc) kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); + multilist_destroy(mc->mc_metaslab_txg_list); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -523,6 +533,51 @@ metaslab_class_expandable_space(metaslab_class_t *mc) return (space); } +void +metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) +{ + multilist_t *ml = mc->mc_metaslab_txg_list; + for (int i = 0; i < multilist_get_num_sublists(ml); i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + metaslab_t *msp = multilist_sublist_head(mls); + multilist_sublist_unlock(mls); + while (msp != NULL) { + mutex_enter(&msp->ms_lock); + + /* + * If the metaslab has been removed from the list + * (which could happen if we were at the memory limit + * and it was evicted during this loop), then we can't + * proceed and we should restart the sublist. + */ + if (!multilist_link_active(&msp->ms_class_txg_node)) { + mutex_exit(&msp->ms_lock); + i--; + break; + } + mls = multilist_sublist_lock(ml, i); + metaslab_t *next_msp = multilist_sublist_next(mls, msp); + multilist_sublist_unlock(mls); + if (txg > + msp->ms_selected_txg + metaslab_unload_delay && + gethrtime() > msp->ms_selected_time + + (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { + metaslab_evict(msp, txg); + } else { + /* + * Once we've hit a metaslab selected too + * recently to evict, we're done evicting for + * now. 
+ */ + mutex_exit(&msp->ms_lock); + break; + } + mutex_exit(&msp->ms_lock); + msp = next_msp; + } + } +} + static int metaslab_compare(const void *x1, const void *x2) { @@ -1002,6 +1057,14 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); + + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + multilist_sublist_unlock(mls); + msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } @@ -1009,8 +1072,10 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); @@ -1211,17 +1276,83 @@ metaslab_rangesize_compare(const void *x1, const void *x2) * Return the maximum contiguous segment within the metaslab. */ uint64_t -metaslab_block_maxsize(metaslab_t *msp) +metaslab_largest_allocatable(metaslab_t *msp) { avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; - if (t == NULL || (rs = avl_last(t)) == NULL) - return (0ULL); + if (t == NULL) + return (0); + rs = avl_last(t); + if (rs == NULL) + return (0); return (rs->rs_end - rs->rs_start); } +/* + * Return the maximum contiguous segment within the unflushed frees of this + * metaslab. + */ +uint64_t +metaslab_largest_unflushed_free(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if (msp->ms_unflushed_frees == NULL) + return (0); + + range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); + if (rs == NULL) + return (0); + + /* + * When a range is freed from the metaslab, that range is added to + * both the unflushed frees and the deferred frees. While the block + * will eventually be usable, if the metaslab were loaded the range + * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE + * txgs had passed. As a result, when attempting to estimate an upper + * bound for the largest currently-usable free segment in the + * metaslab, we need to not consider any ranges currently in the defer + * trees. This algorithm approximates the largest available chunk in + * the largest range in the unflushed_frees tree by taking the first + * chunk. While this may be a poor estimate, it should only remain so + * briefly and should eventually self-correct as frees are no longer + * deferred. Similar logic applies to the ms_freed tree. See + * metaslab_load() for more details. + * + * There are two primary sources of innacuracy in this estimate. Both + * are tolerated for performance reasons. The first source is that we + * only check the largest segment for overlaps. Smaller segments may + * have more favorable overlaps with the other trees, resulting in + * larger usable chunks. Second, we only look at the first chunk in + * the largest segment; there may be other usable chunks in the + * largest segment, but we ignore them. 
+ */ + uint64_t rstart = rs->rs_start; + uint64_t rsize = rs->rs_end - rstart; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, + rsize, &start, &size); + if (found) { + if (rstart == start) + return (0); + rsize = start - rstart; + } + } + + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_freed, rstart, + rsize, &start, &size); + if (found) + rsize = start - rstart; + + return (rsize); +} + static range_seg_t * metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { @@ -1311,7 +1442,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) * If we're running low on space, find a segment based on size, * rather than iterating based on offset. */ - if (metaslab_block_maxsize(msp) < metaslab_df_alloc_threshold || + if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { offset = -1; } else { @@ -1409,7 +1540,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_block_maxsize(msp); + uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, @@ -1479,6 +1610,13 @@ metaslab_flush_wait(metaslab_t *msp) cv_wait(&msp->ms_flush_cv, &msp->ms_lock); } +static unsigned int +metaslab_idx_func(multilist_t *ml, void *arg) +{ + metaslab_t *msp = arg; + return (msp->ms_id % multilist_get_num_sublists(ml)); +} + uint64_t metaslab_allocated_space(metaslab_t *msp) { @@ -1537,6 +1675,8 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } + ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, + msp->ms_allocating_total); ASSERT3U(msp->ms_deferspace, ==, range_tree_space(msp->ms_defer[0]) + @@ -1725,7 +1865,6 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) msp->ms_weight = 0; msp->ms_fragmentation = 0; - msp->ms_max_size = 0; /* * This function is used for verification purposes. Regardless of @@ -1753,6 +1892,87 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) VERIFY3U(msp->ms_weight, ==, weight); } +/* + * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from + * this class that was used longest ago, and attempt to unload it. We don't + * want to spend too much time in this loop to prevent performance + * degredation, and we expect that most of the time this operation will + * succeed. Between that and the normal unloading processing during txg sync, + * we expect this to keep the metaslab memory usage under control. 
+ */ +static void +metaslab_potentially_evict(metaslab_class_t *mc) +{ +#ifdef _KERNEL + uint64_t allmem = arc_all_memory(); + extern kmem_cache_t *range_seg_cache; + uint64_t inuse = kmem_cache_stat(range_seg_cache, "buf_inuse"); + uint64_t size = kmem_cache_stat(range_seg_cache, "buf_size"); + int tries = 0; + for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && + tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; + tries++) { + unsigned int idx = multilist_get_random_index( + mc->mc_metaslab_txg_list); + multilist_sublist_t *mls = + multilist_sublist_lock(mc->mc_metaslab_txg_list, idx); + metaslab_t *msp = multilist_sublist_head(mls); + multilist_sublist_unlock(mls); + while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < + inuse * size) { + VERIFY3P(mls, ==, multilist_sublist_lock( + mc->mc_metaslab_txg_list, idx)); + ASSERT3U(idx, ==, + metaslab_idx_func(mc->mc_metaslab_txg_list, msp)); + + if (!multilist_link_active(&msp->ms_class_txg_node)) { + multilist_sublist_unlock(mls); + break; + } + metaslab_t *next_msp = multilist_sublist_next(mls, msp); + multilist_sublist_unlock(mls); + /* + * If the metaslab is currently loading there are two + * cases. If it's the metaslab we're evicting, we + * can't continue on or we'll panic when we attempt to + * recursively lock the mutex. If it's another + * metaslab that's loading, it can be safely skipped, + * since we know it's very new and therefore not a + * good eviction candidate. We check later once the + * lock is held that the metaslab is fully loaded + * before actually unloading it. + */ + if (msp->ms_loading) { + msp = next_msp; + inuse = kmem_cache_stat(range_seg_cache, + "buf_inuse"); + continue; + } + /* + * We can't unload metaslabs with no spacemap because + * they're not ready to be unloaded yet. We can't + * unload metaslabs with outstanding allocations + * because doing so could cause the metaslab's weight + * to decrease while it's unloaded, which violates an + * invariant that we use to prevent unnecessary + * loading. We also don't unload metaslabs that are + * currently active because they are high-weight + * metaslabs that are likely to be used in the near + * future. 
+ */ + mutex_enter(&msp->ms_lock); + if (msp->ms_allocator == -1 && msp->ms_sm != NULL && + msp->ms_allocating_total == 0) { + metaslab_unload(msp); + } + mutex_exit(&msp->ms_lock); + msp = next_msp; + inuse = kmem_cache_stat(range_seg_cache, "buf_inuse"); + } + } +#endif +} + static int metaslab_load_impl(metaslab_t *msp) { @@ -1915,18 +2135,21 @@ metaslab_load_impl(metaslab_t *msp) * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; + uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); - msp->ms_max_size = metaslab_block_maxsize(msp); - + msp->ms_max_size = metaslab_largest_allocatable(msp); + ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); + msp->ms_load_time = load_end; if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, " - "loading_time %lld ms", + "loading_time %lld ms, ms_max_size %llu, " + "max size error %llu", spa_syncing_txg(spa), spa_name(spa), msp->ms_group->mg_vd->vdev_id, msp->ms_id, space_map_length(msp->ms_sm), @@ -1935,7 +2158,8 @@ metaslab_load_impl(metaslab_t *msp) range_tree_space(msp->ms_freed), range_tree_space(msp->ms_defer[0]), range_tree_space(msp->ms_defer[1]), - (longlong_t)((load_end - load_start) / 1000000)); + (longlong_t)((load_end - load_start) / 1000000), + msp->ms_max_size, msp->ms_max_size - max_size); } metaslab_verify_space(msp, spa_syncing_txg(spa)); @@ -1944,7 +2168,7 @@ metaslab_load_impl(metaslab_t *msp) } int -metaslab_load(metaslab_t *msp, uint64_t txg) +metaslab_load(metaslab_t *msp) { kstat_t *ksp; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -1988,11 +2212,20 @@ metaslab_load(metaslab_t *msp, uint64_t txg) */ ASSERT(!msp->ms_loaded); + /* + * If we're loading a metaslab in the normal class, consider evicting + * another one to keep our memory usage under the limit defined by the + * zfs_metaslab_mem_limit tunable. + */ + if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == + msp->ms_group->mg_class) { + metaslab_potentially_evict(msp->ms_group->mg_class); + } + int error = metaslab_load_impl(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; - msp->ms_loaded_txg = txg; cv_broadcast(&msp->ms_load_cv); return (error); @@ -2003,14 +2236,29 @@ metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - metaslab_verify_weight_and_frag(msp); + /* + * This can happen if a metaslab is selected for eviction (in + * metaslab_potentially_evict) and then unloaded during spa_sync (via + * metaslab_class_evict_old). 
+ */ + if (!msp->ms_loaded) + return; range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; - msp->ms_loaded_txg = 0; + msp->ms_unload_time = gethrtime(); + msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; - msp->ms_max_size = 0; + + if (msp->ms_group != NULL) { + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + multilist_sublist_unlock(mls); + } /* * We explicitly recalculate the metaslab's weight based on its space @@ -2029,6 +2277,21 @@ metaslab_unload(metaslab_t *msp) } void +metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + msp->ms_selected_txg = txg; + msp->ms_selected_time = gethrtime(); + multilist_sublist_insert_tail(mls, msp); + multilist_sublist_unlock(mls); +} + +void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { @@ -2056,6 +2319,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); + multilist_link_init(&ms->ms_class_txg_node); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; @@ -2349,7 +2613,6 @@ metaslab_space_weight(metaslab_t *msp) uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!vd->vdev_removing); /* * The baseline weight is the metaslab's free space. @@ -2568,13 +2831,19 @@ metaslab_segment_weight(metaslab_t *msp) * weights we rely on the entire weight (excluding the weight-type bit). */ boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize) +metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { - if (msp->ms_loaded) { + /* + * If the metaslab is loaded, ms_max_size is definitive and we can use + * the fast check. If it's not, the ms_max_size is a lower bound (once + * set), and we should use the fast check as long as we're not in + * try_hard and it's been less than zfs_metaslab_max_size_cache_sec + * seconds since the metaslab was unloaded. + */ + if (msp->ms_loaded || + (msp->ms_max_size != 0 && !try_hard && gethrtime() < + msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); - } else { - ASSERT0(msp->ms_max_size); - } boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { @@ -2590,6 +2859,7 @@ metaslab_should_allocate(metaslab_t *msp, uint64_t asize) should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } + return (should_allocate); } @@ -2602,24 +2872,24 @@ metaslab_weight(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); - /* - * If this vdev is in the process of being removed, there is nothing - * for us to do here. - */ - if (vd->vdev_removing) - return (0); - metaslab_set_fragmentation(msp); /* - * Update the maximum size if the metaslab is loaded. This will + * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space - * has been added back into the free tree. 
+ * has been added back into the free tree. If the metaslab is + * unloaded, we check if there's a larger free segment in the + * unflushed frees. This is a lower bound on the largest allocatable + * segment size. Coalescing of adjacent entries may reveal larger + * allocatable segments, but we aren't aware of those until loading + * the space map into a range tree. */ - if (msp->ms_loaded) - msp->ms_max_size = metaslab_block_maxsize(msp); - else - ASSERT0(msp->ms_max_size); + if (msp->ms_loaded) { + msp->ms_max_size = metaslab_largest_allocatable(msp); + } else { + msp->ms_max_size = MAX(msp->ms_max_size, + metaslab_largest_unflushed_free(msp)); + } /* * Segment-based weighting requires space map histogram support. @@ -2638,6 +2908,8 @@ metaslab_weight(metaslab_t *msp) void metaslab_recalculate_weight_and_sort(metaslab_t *msp) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); + /* note: we preserve the mask (e.g. indication of primary, etc..) */ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(msp->ms_group, msp, @@ -2648,16 +2920,23 @@ static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { + ASSERT(MUTEX_HELD(&msp->ms_lock)); + /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ - if (activation_weight == METASLAB_WEIGHT_CLAIM) + if (activation_weight == METASLAB_WEIGHT_CLAIM) { + ASSERT0(msp->ms_activation_weight); + msp->ms_activation_weight = msp->ms_weight; + metaslab_group_sort(mg, msp, msp->ms_weight | + activation_weight); return (0); + } + metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? mg->mg_primaries : mg->mg_secondaries); - ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); if (arr[allocator] != NULL) { mutex_exit(&mg->mg_lock); @@ -2668,39 +2947,88 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); + + ASSERT0(msp->ms_activation_weight); + msp->ms_activation_weight = msp->ms_weight; + metaslab_group_sort_impl(mg, msp, + msp->ms_weight | activation_weight); + mutex_exit(&mg->mg_lock); return (0); } static int -metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight, - uint64_t txg) +metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = metaslab_load(msp, txg); - if (error != 0) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); - } - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { - /* - * The metaslab was activated for another allocator - * while we were waiting, we should reselect. - */ + /* + * The current metaslab is already activated for us so there + * is nothing to do. Already activated though, doesn't mean + * that this metaslab is activated for our allocator nor our + * requested activation weight. The metaslab could have started + * as an active one for our allocator but changed allocators + * while we were waiting to grab its ms_lock or we stole it + * [see find_valid_metaslab()]. This means that there is a + * possibility of passivating a metaslab of another allocator + * or from a different activation mask, from this thread. 
+ */ + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + ASSERT(msp->ms_loaded); + return (0); + } + + int error = metaslab_load(msp); + if (error != 0) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + + /* + * When entering metaslab_load() we may have dropped the + * ms_lock because we were loading this metaslab, or we + * were waiting for another thread to load it for us. In + * that scenario, we recheck the weight of the metaslab + * to see if it was activated by another thread. + * + * If the metaslab was activated for another allocator or + * it was activated with a different activation weight (e.g. + * we wanted to make it a primary but it was activated as + * secondary) we return error (EBUSY). + * + * If the metaslab was activated for the same allocator + * and requested activation mask, skip activating it. + */ + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + if (msp->ms_allocator != allocator) return (EBUSY); - } - if ((error = metaslab_activate_allocator(msp->ms_group, msp, - allocator, activation_weight)) != 0) { - return (error); - } - msp->ms_activation_weight = msp->ms_weight; - metaslab_group_sort(msp->ms_group, msp, - msp->ms_weight | activation_weight); + if ((msp->ms_weight & activation_weight) == 0) + return (EBUSY); + + EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), + msp->ms_primary); + return (0); + } + + /* + * If the metaslab has literally 0 space, it will have weight 0. In + * that case, don't bother activating it. This can happen if the + * metaslab had space during find_valid_metaslab, but another thread + * loaded it and used all that space while we were waiting to grab the + * lock. + */ + if (msp->ms_weight == 0) { + ASSERT0(range_tree_space(msp->ms_allocatable)); + return (SET_ERROR(ENOSPC)); } + + if ((error = metaslab_activate_allocator(msp->ms_group, msp, + allocator, activation_weight)) != 0) { + return (error); + } + ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); @@ -2712,6 +3040,8 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(msp->ms_loaded); + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; @@ -2719,15 +3049,16 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); + ASSERT3S(0, <=, msp->ms_allocator); + ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); + if (msp->ms_primary) { - ASSERT3U(0, <=, msp->ms_allocator); - ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mg->mg_primaries[msp->ms_allocator] = NULL; } else { - ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); + ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); mg->mg_secondaries[msp->ms_allocator] = NULL; } msp->ms_allocator = -1; @@ -2749,9 +3080,10 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) range_tree_is_empty(msp->ms_allocatable)); ASSERT0(weight & METASLAB_ACTIVE_MASK); + ASSERT(msp->ms_activation_weight != 0); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); - ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); + ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); } /* @@ -2790,13 +3122,14 @@ static void metaslab_preload(void *arg) { metaslab_t *msp = arg; - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 
+ metaslab_class_t *mc = msp->ms_group->mg_class; + spa_t *spa = mc->mc_spa; ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); - (void) metaslab_load(msp, spa_syncing_txg(spa)); - msp->ms_selected_txg = spa_syncing_txg(spa); + (void) metaslab_load(msp); + metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_lock); } @@ -3249,12 +3582,19 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being - * forced to condense and it's loaded, we need to let it through. + * forced to condense, it's loaded and we're not beyond the final + * dirty txg, we need to let it through. Not condensing beyond the + * final dirty txg prevents an issue where metaslabs that need to be + * condensed but were loaded for other reasons could cause a panic + * here. By only checking the txg in that branch of the conditional, + * we preserve the utility of the VERIFY statements in all other + * cases. */ if (range_tree_is_empty(alloctree) && range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing) && - !(msp->ms_loaded && msp->ms_condense_wanted)) + !(msp->ms_loaded && msp->ms_condense_wanted && + txg <= spa_final_dirty_txg(spa))) return; @@ -3507,6 +3847,23 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_tx_commit(tx); } +static void +metaslab_evict(metaslab_t *msp, uint64_t txg) +{ + if (!msp->ms_loaded || msp->ms_disabled != 0) + return; + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_allocating[(txg + t) & TXG_MASK])); + } + if (msp->ms_allocator != -1) + metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); + + if (!metaslab_debug_unload) + metaslab_unload(msp); +} + /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. @@ -3553,7 +3910,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create(NULL, NULL); + msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, + &msp->ms_unflushed_frees_by_size, + metaslab_rangesize_compare, 0); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } @@ -3658,41 +4017,28 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If the metaslab is loaded and we've not tried to load or allocate * from it in 'metaslab_unload_delay' txgs, then we normally unload it. - * However, to prevent thrashing, if the metaslab was recently loaded, - * then instead of unloading it, we increase the unload delay (only up - * to the maximum). 
*/ if (msp->ms_loaded && msp->ms_disabled == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { - if (msp->ms_loaded_txg != 0 && msp->ms_loaded_txg + - metaslab_unload_delay + metaslab_load_window >= txg) { - if (metaslab_unload_delay + metaslab_load_window <= - metaslab_unload_delay_max) { - metaslab_unload_delay += metaslab_load_window; - } - DTRACE_PROBE1(zfs__metaslab__delay__unload, - metaslab_t *, msp); - } else { - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - VERIFY0(range_tree_space( - msp->ms_allocating[(txg + t) & TXG_MASK])); - } - if (msp->ms_allocator != -1) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - } - - if (!metaslab_debug_unload) - metaslab_unload(msp); + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_allocating[(txg + t) & TXG_MASK])); } + if (msp->ms_allocator != -1) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } + + if (!metaslab_debug_unload) + metaslab_unload(msp); } ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); - + msp->ms_allocating_total -= msp->ms_allocated_this_txg; msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } @@ -3946,6 +4292,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); + msp->ms_allocating_total += size; metaslab_verify_space(msp, txg); } @@ -3954,7 +4301,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. 
*/ - msp->ms_max_size = metaslab_block_maxsize(msp); + msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } @@ -3972,7 +4319,8 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, - zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) + boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, + boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -3982,7 +4330,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; @@ -4024,17 +4372,51 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, return (msp); } +void +metaslab_active_mask_verify(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) + return; + + if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); + VERIFY3S(msp->ms_allocator, !=, -1); + VERIFY(msp->ms_primary); + return; + } + + if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); + VERIFY3S(msp->ms_allocator, !=, -1); + VERIFY(!msp->ms_primary); + return; + } + + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + VERIFY3S(msp->ms_allocator, ==, -1); + return; + } +} + /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; - uint64_t activation_weight; - activation_weight = METASLAB_WEIGHT_PRIMARY; + uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { @@ -4075,15 +4457,37 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (activation_weight == METASLAB_WEIGHT_PRIMARY && mg->mg_primaries[allocator] != NULL) { msp = mg->mg_primaries[allocator]; + + /* + * Even though we don't hold the ms_lock for the + * primary metaslab, those fields should not + * change while we hold the mg_lock. Thus is is + * safe to make assertions on them. + */ + ASSERT(msp->ms_primary); + ASSERT3S(msp->ms_allocator, ==, allocator); + ASSERT(msp->ms_loaded); + was_active = B_TRUE; + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mg->mg_secondaries[allocator] != NULL) { msp = mg->mg_secondaries[allocator]; + + /* + * See comment above about the similar assertions + * for the primary metaslab. 
+ */ + ASSERT(!msp->ms_primary); + ASSERT3S(msp->ms_allocator, ==, allocator); + ASSERT(msp->ms_loaded); + was_active = B_TRUE; + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, zal, search, - &was_active); + want_unique, asize, allocator, try_hard, zal, + search, &was_active); } mutex_exit(&mg->mg_lock); @@ -4091,59 +4495,106 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, kmem_free(search, sizeof (*search)); return (-1ULL); } - mutex_enter(&msp->ms_lock); + + metaslab_active_mask_verify(msp); + + /* + * This code is disabled out because of issues with + * tracepoints in non-gpl kernel modules. + */ +#if 0 + DTRACE_PROBE3(ms__activation__attempt, + metaslab_t *, msp, uint64_t, activation_weight, + boolean_t, was_active); +#endif + /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the - * active status first to see if we need to reselect + * active status first to see if we need to set_selected_txg * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { + ASSERT3S(msp->ms_allocator, ==, -1); mutex_exit(&msp->ms_lock); continue; } /* - * If the metaslab is freshly activated for an allocator that - * isn't the one we're allocating from, or if it's a primary and - * we're seeking a secondary (or vice versa), we go back and - * select a new metaslab. + * If the metaslab was activated for another allocator + * while we were waiting in the ms_lock above, or it's + * a primary and we're seeking a secondary (or vice versa), + * we go back and select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { + ASSERT(msp->ms_loaded); + ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || + msp->ms_allocator != -1); mutex_exit(&msp->ms_lock); continue; } + /* + * This metaslab was used for claiming regions allocated + * by the ZIL during pool import. Once these regions are + * claimed we don't need to keep the CLAIM bit set + * anymore. Passivate this metaslab to zero its activation + * mask. + */ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { + ASSERT(msp->ms_loaded); + ASSERT3S(msp->ms_allocator, ==, -1); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } - if (metaslab_activate(msp, allocator, activation_weight, - txg) != 0) { + metaslab_set_selected_txg(msp, txg); + + int activation_error = + metaslab_activate(msp, allocator, activation_weight); + metaslab_active_mask_verify(msp); + + /* + * If the metaslab was activated by another thread for + * another allocator or activation_weight (EBUSY), or it + * failed because another metaslab was assigned as primary + * for this allocator (EEXIST) we continue using this + * metaslab for our allocation, rather than going on to a + * worse metaslab (we waited for that metaslab to be loaded + * after all). + * + * If the activation failed due to an I/O error or ENOSPC we + * skip to the next metaslab. 
+ */ + boolean_t activated; + if (activation_error == 0) { + activated = B_TRUE; + } else if (activation_error == EBUSY || + activation_error == EEXIST) { + activated = B_FALSE; + } else { mutex_exit(&msp->ms_lock); continue; } - - msp->ms_selected_txg = txg; + ASSERT(msp->ms_loaded); /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The - * the metaslab is now loaded so metaslab_should_allocate() can - * accurately determine if the allocation attempt should + * the metaslab is now loaded so metaslab_should_allocate() + * can accurately determine if the allocation attempt should * proceed. */ - if (!metaslab_should_allocate(msp, asize)) { + if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -4151,8 +4602,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, } /* - * If this metaslab is currently condensing then pick again as - * we can't manipulate this metaslab until it's committed + * If this metaslab is currently condensing then pick again + * as we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. @@ -4160,15 +4611,19 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); + if (activated) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); + if (activated) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } mutex_exit(&msp->ms_lock); continue; } @@ -4178,13 +4633,23 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ - metaslab_segment_may_passivate(msp); + if (activated) + metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); /* + * This code is disabled out because of issues with + * tracepoints in non-gpl kernel modules. + */ +#if 0 + DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, + uint64_t, asize); +#endif + + /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab @@ -4205,14 +4670,33 @@ next: * currently available for allocation and is accurate * even within a sync pass. */ + uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - uint64_t weight = metaslab_block_maxsize(msp); + weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); + } else { + weight = metaslab_weight_from_range_tree(msp); + } + + if (activated) { metaslab_passivate(msp, weight); } else { - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); + /* + * For the case where we use the metaslab that is + * active for another allocator we want to make + * sure that we retain the activation mask. 
+ * + * Note that we could attempt to use something like + * metaslab_recalculate_weight_and_sort() that + * retains the activation mask here. That function + * uses metaslab_weight() to set the weight though + * which is not as accurate as the calculations + * above. + */ + weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; + metaslab_group_sort(mg, msp, weight); } + metaslab_active_mask_verify(msp); /* * We have just failed an allocation attempt, check @@ -4220,7 +4704,7 @@ next: * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize)); + ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } @@ -4231,14 +4715,14 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator); + dva, d, allocator, try_hard); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { @@ -4408,7 +4892,7 @@ top: * allow any metaslab to be used (unique=false). */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator); + !try_hard, dva, d, allocator, try_hard); if (offset != -1ULL) { /* @@ -4731,6 +5215,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); + msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); @@ -4836,7 +5321,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM, txg); + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); /* * No need to fail in that case; someone else has activated the * metaslab, but that doesn't preclude us from using it. 
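For illustration, a minimal sketch of the activation pattern the metaslab.c hunks above establish: metaslab_activate() no longer takes a txg, the selected txg is recorded separately with metaslab_set_selected_txg(), and EBUSY or EEXIST from activation means the metaslab is still usable even though this thread did not activate it. The wrapper name example_try_activate() is hypothetical and not part of the commit; only the calls it makes appear in the diff.

/*
 * Sketch only; mirrors the activation handling shown in the hunks above.
 * Returns B_TRUE and sets *activated when the metaslab may be used,
 * B_FALSE when the caller should skip to the next metaslab.
 */
static boolean_t
example_try_activate(metaslab_t *msp, int allocator,
    uint64_t activation_weight, uint64_t txg, boolean_t *activated)
{
	int error;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/* Record recency before activating so eviction sees this use. */
	metaslab_set_selected_txg(msp, txg);

	error = metaslab_activate(msp, allocator, activation_weight);
	metaslab_active_mask_verify(msp);

	if (error == 0) {
		*activated = B_TRUE;	/* this thread owns the activation */
		return (B_TRUE);
	}
	if (error == EBUSY || error == EEXIST) {
		*activated = B_FALSE;	/* active for another thread/allocator */
		return (B_TRUE);
	}
	return (B_FALSE);		/* I/O error or ENOSPC */
}

When *activated comes back B_FALSE the caller must not drop the activation it does not own, which is why the hunks above guard metaslab_passivate() and metaslab_segment_may_passivate() with the activated flag.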
@@ -4862,10 +5347,20 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, range_tree_clear(msp->ms_trim, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (!multilist_link_active(&msp->ms_class_txg_node)) { + msp->ms_selected_txg = txg; + multilist_sublist_insert_head(mls, msp); + } + multilist_sublist_unlock(mls); + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); + msp->ms_allocating_total += size; } mutex_exit(&msp->ms_lock); @@ -5226,7 +5721,7 @@ metaslab_disable(metaslab_t *msp) } void -metaslab_enable(metaslab_t *msp, boolean_t sync) +metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; @@ -5244,6 +5739,8 @@ metaslab_enable(metaslab_t *msp, boolean_t sync) if (--msp->ms_disabled == 0) { mg->mg_ms_disabled--; cv_broadcast(&mg->mg_ms_disabled_cv); + if (unload) + metaslab_unload(msp); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); diff --git a/usr/src/uts/common/fs/zfs/range_tree.c b/usr/src/uts/common/fs/zfs/range_tree.c index 0ce251126b..92726c3f71 100644 --- a/usr/src/uts/common/fs/zfs/range_tree.c +++ b/usr/src/uts/common/fs/zfs/range_tree.c @@ -525,6 +525,36 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) } /* + * Returns the first subset of the given range which overlaps with the range + * tree. Returns true if there is a segment in the range, and false if there + * isn't. + */ +boolean_t +range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize) +{ + range_seg_t rsearch; + rsearch.rs_start = start; + rsearch.rs_end = start + 1; + + avl_index_t where; + range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where); + if (rs != NULL) { + *ostart = start; + *osize = MIN(size, rs->rs_end - start); + return (B_TRUE); + } + + rs = avl_nearest(&rt->rt_root, where, AVL_AFTER); + if (rs == NULL || rs->rs_start > start + size) + return (B_FALSE); + + *ostart = rs->rs_start; + *osize = MIN(start + size, rs->rs_end) - rs->rs_start; + return (B_TRUE); +} + +/* * Ensure that this range is not in the tree, regardless of whether * it is currently in the tree. 
*/ diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index c213c860bd..054e773b3f 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -8618,6 +8618,10 @@ spa_sync(spa_t *spa, uint64_t txg) while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); + + metaslab_class_evict_old(spa->spa_normal_class, txg); + metaslab_class_evict_old(spa->spa_log_class, txg); + spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); diff --git a/usr/src/uts/common/fs/zfs/spa_log_spacemap.c b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c index bbb6eda845..e0c369d13c 100644 --- a/usr/src/uts/common/fs/zfs/spa_log_spacemap.c +++ b/usr/src/uts/common/fs/zfs/spa_log_spacemap.c @@ -1191,7 +1191,8 @@ out: metaslab_unflushed_changes_memused(m); if (metaslab_debug_load && m->ms_sm != NULL) { - VERIFY0(metaslab_load(m, spa_syncing_txg(spa))); + VERIFY0(metaslab_load(m)); + metaslab_set_selected_txg(m, 0); } mutex_exit(&m->ms_lock); } diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index f636d3dcf2..1ef3bb79ca 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -236,6 +236,7 @@ void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); +uint64_t arc_all_memory(void); uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h index 10705a84bc..069c5ab79a 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h @@ -56,7 +56,7 @@ uint64_t metaslab_estimated_condensed_size(metaslab_t *); int metaslab_sort_by_flushed(const void *, const void *); uint64_t metaslab_unflushed_changes_memused(metaslab_t *); -int metaslab_load(metaslab_t *, uint64_t); +int metaslab_load(metaslab_t *); void metaslab_unload(metaslab_t *); boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *); @@ -65,7 +65,7 @@ uint64_t metaslab_allocated_space(metaslab_t *); void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); -uint64_t metaslab_block_maxsize(metaslab_t *); +uint64_t metaslab_largest_allocatable(metaslab_t *); /* * metaslab alloc flags @@ -107,7 +107,7 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *); boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); - +void metaslab_class_evict_old(metaslab_class_t *, uint64_t); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); @@ -130,7 +130,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); -void metaslab_enable(metaslab_t *, boolean_t); +void metaslab_enable(metaslab_t *, boolean_t, boolean_t); +void metaslab_set_selected_txg(metaslab_t *, uint64_t); extern int metaslab_debug_load; diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index 5920b3113c..a413eef490 100644 --- 
a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -36,6 +36,7 @@ #include <sys/vdev.h> #include <sys/txg.h> #include <sys/avl.h> +#include <sys/multilist.h> #ifdef __cplusplus extern "C" { @@ -194,6 +195,12 @@ struct metaslab_class { uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + + /* + * List of all loaded metaslabs in the class, sorted in order of most + * recent use. + */ + multilist_t *mc_metaslab_txg_list; }; /* @@ -387,6 +394,7 @@ struct metaslab { range_tree_t *ms_allocating[TXG_SIZE]; range_tree_t *ms_allocatable; uint64_t ms_allocated_this_txg; + uint64_t ms_allocating_total; /* * The following range trees are accessed only from syncing context. @@ -484,7 +492,13 @@ struct metaslab { * stay cached. */ uint64_t ms_selected_txg; - uint64_t ms_loaded_txg; /* track when metaslab was loaded */ + /* + * ms_load/unload_time can be used for performance monitoring + * (e.g. by dtrace or mdb). + */ + hrtime_t ms_load_time; /* time last loaded */ + hrtime_t ms_unload_time; /* time last unloaded */ + hrtime_t ms_selected_time; /* time last allocated from */ uint64_t ms_max_size; /* maximum allocatable size */ @@ -504,12 +518,17 @@ struct metaslab { * segment sizes. */ avl_tree_t ms_allocatable_by_size; + avl_tree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ + /* + * Node in metaslab class's selected txg list + */ + multilist_node_t ms_class_txg_node; /* * Allocs and frees that are committed to the vdev log spacemap but diff --git a/usr/src/uts/common/fs/zfs/sys/range_tree.h b/usr/src/uts/common/fs/zfs/sys/range_tree.h index d450ff7f16..716aaf3b90 100644 --- a/usr/src/uts/common/fs/zfs/sys/range_tree.h +++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h @@ -88,6 +88,8 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); diff --git a/usr/src/uts/common/fs/zfs/vdev_initialize.c b/usr/src/uts/common/fs/zfs/vdev_initialize.c index af18983c44..2079df133c 100644 --- a/usr/src/uts/common/fs/zfs/vdev_initialize.c +++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -350,7 +350,7 @@ vdev_initialize_calculate_progress(vdev_t *vd) * metaslab. Load it and walk the free tree for more accurate * progress estimation. 
*/ - VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa))); + VERIFY0(metaslab_load(msp)); for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { @@ -474,6 +474,7 @@ vdev_initialize_thread(void *arg) for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + boolean_t unload_when_done = B_FALSE; /* * If we've expanded the top-level vdev or it's our @@ -487,14 +488,16 @@ vdev_initialize_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp, spa_syncing_txg(spa))); + if (!msp->ms_loaded && !msp->ms_loading) + unload_when_done = B_TRUE; + VERIFY0(metaslab_load(msp)); range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, vd); mutex_exit(&msp->ms_lock); error = vdev_initialize_ranges(vd, deadbeef); - metaslab_enable(msp, B_TRUE); + metaslab_enable(msp, B_TRUE, unload_when_done); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); diff --git a/usr/src/uts/common/fs/zfs/vdev_trim.c b/usr/src/uts/common/fs/zfs/vdev_trim.c index a60d11814b..4be11bcb51 100644 --- a/usr/src/uts/common/fs/zfs/vdev_trim.c +++ b/usr/src/uts/common/fs/zfs/vdev_trim.c @@ -622,7 +622,7 @@ vdev_trim_calculate_progress(vdev_t *vd) * metaslab. Load it and walk the free tree for more * accurate progress estimation. */ - VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa))); + VERIFY0(metaslab_load(msp)); for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { @@ -730,7 +730,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) */ if (zfs_flags & ZFS_DEBUG_TRIM) { metaslab_t *msp = ta->trim_msp; - VERIFY0(metaslab_load(msp, spa_syncing_txg(vd->vdev_spa))); + VERIFY0(metaslab_load(msp)); VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY(range_tree_find(msp->ms_allocatable, start, size)); } @@ -842,7 +842,7 @@ vdev_trim_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp, spa_syncing_txg(spa))); + VERIFY0(metaslab_load(msp)); /* * If a partial TRIM was requested skip metaslabs which have @@ -850,7 +850,7 @@ vdev_trim_thread(void *arg) */ if (msp->ms_sm == NULL && vd->vdev_trim_partial) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_calculate_progress(vd); continue; @@ -862,7 +862,7 @@ vdev_trim_thread(void *arg) mutex_exit(&msp->ms_lock); error = vdev_trim_ranges(&ta); - metaslab_enable(msp, B_TRUE); + metaslab_enable(msp, B_TRUE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(ta.trim_tree, NULL, NULL); @@ -1167,7 +1167,7 @@ vdev_autotrim_thread(void *arg) if (msp->ms_sm == NULL || range_tree_is_empty(msp->ms_trim)) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); continue; } @@ -1183,7 +1183,7 @@ vdev_autotrim_thread(void *arg) */ if (msp->ms_disabled > 1) { mutex_exit(&msp->ms_lock); - metaslab_enable(msp, B_FALSE); + metaslab_enable(msp, B_FALSE, B_FALSE); continue; } @@ -1291,8 +1291,7 @@ vdev_autotrim_thread(void *arg) */ if (zfs_flags & ZFS_DEBUG_TRIM) { mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp, - spa_syncing_txg(spa))); + VERIFY0(metaslab_load(msp)); 
VERIFY3P(tap[0].trim_msp, ==, msp); range_tree_walk(trim_tree, vdev_trim_range_verify, &tap[0]); @@ -1302,7 +1301,7 @@ vdev_autotrim_thread(void *arg) range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); - metaslab_enable(msp, issued_trim); + metaslab_enable(msp, issued_trim, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 72e18d5305..b24d83496c 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -5706,7 +5706,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) objset_t *os = NULL; int error; - error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 3697d888e7..656c598e53 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -1716,10 +1716,8 @@ mac_srs_create_proto_softrings(int id, uint16_t type, pri_t pri, bzero(&mrf, sizeof (mac_rx_fifo_t)); mrf.mrf_type = MAC_RX_FIFO; mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll; - mrf.mrf_intr_enable = - (mac_intr_enable_t)mac_soft_ring_intr_enable; - mrf.mrf_intr_disable = - (mac_intr_disable_t)mac_soft_ring_intr_disable; + mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable; + mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable; mrf.mrf_flow_priority = pri; softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait, diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index 4655631dc1..c8a16e6fd3 100644 --- a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -494,7 +494,7 @@ done: * Enabling is allow the processing thread to send packets to the * client while disabling does the opposite. 
*/ -void +int mac_soft_ring_intr_enable(void *arg) { mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg; @@ -503,6 +503,7 @@ mac_soft_ring_intr_enable(void *arg) if (ringp->s_ring_first != NULL) mac_soft_ring_worker_wakeup(ringp); mutex_exit(&ringp->s_ring_lock); + return (0); } boolean_t diff --git a/usr/src/uts/common/io/usb/usba/usbai.c b/usr/src/uts/common/io/usb/usba/usbai.c index f6ac391bd8..e1a6b4dfcd 100644 --- a/usr/src/uts/common/io/usb/usba/usbai.c +++ b/usr/src/uts/common/io/usb/usba/usbai.c @@ -1040,7 +1040,7 @@ usb_register_hotplug_cbs(dev_info_t *dip, } } if (ddi_add_event_handler(dip, usba_device->rm_cookie, - (peh_t)disconnect_event_handler, + (peh_t)(uintptr_t)disconnect_event_handler, NULL, &evdata->ev_rm_cb_id) != DDI_SUCCESS) { USB_DPRINTF_L2(DPRINT_MASK_USBAI, usbai_log_handle, "usb_register_hotplug_cbs: add disconnect handler failed"); @@ -1058,7 +1058,7 @@ usb_register_hotplug_cbs(dev_info_t *dip, } } if (ddi_add_event_handler(dip, usba_device->ins_cookie, - (peh_t)reconnect_event_handler, + (peh_t)(uintptr_t)reconnect_event_handler, NULL, &evdata->ev_ins_cb_id) != DDI_SUCCESS) { USB_DPRINTF_L2(DPRINT_MASK_USBAI, usbai_log_handle, "usb_register_hotplug_cbs: add reconnect handler failed"); @@ -1129,7 +1129,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, } } if (ddi_add_event_handler(dip, usba_device->rm_cookie, - (peh_t)usb_evdata->disconnect_event_handler, + (peh_t)(uintptr_t)usb_evdata->disconnect_event_handler, NULL, &evdata->ev_rm_cb_id) != DDI_SUCCESS) { goto fail; @@ -1144,7 +1144,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, } } if (ddi_add_event_handler(dip, usba_device->ins_cookie, - (peh_t)usb_evdata->reconnect_event_handler, + (peh_t)(uintptr_t)usb_evdata->reconnect_event_handler, NULL, &evdata->ev_ins_cb_id) != DDI_SUCCESS) { goto fail; @@ -1159,7 +1159,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, } } if (ddi_add_event_handler(dip, usba_device->resume_cookie, - (peh_t)usb_evdata->post_resume_event_handler, + (peh_t)(uintptr_t)usb_evdata->post_resume_event_handler, NULL, &evdata->ev_resume_cb_id) != DDI_SUCCESS) { goto fail; @@ -1174,7 +1174,7 @@ usb_register_event_cbs(dev_info_t *dip, usb_event_t *usb_evdata, } } if (ddi_add_event_handler(dip, usba_device->suspend_cookie, - (peh_t)usb_evdata->pre_suspend_event_handler, + (peh_t)(uintptr_t)usb_evdata->pre_suspend_event_handler, NULL, &evdata->ev_suspend_cb_id) != DDI_SUCCESS) { goto fail; diff --git a/usr/src/uts/common/sys/mac_soft_ring.h b/usr/src/uts/common/sys/mac_soft_ring.h index 581e18d06e..5a41899e60 100644 --- a/usr/src/uts/common/sys/mac_soft_ring.h +++ b/usr/src/uts/common/sys/mac_soft_ring.h @@ -691,7 +691,7 @@ extern void mac_srs_update_drv(struct mac_client_impl_s *); extern void mac_update_srs_priority(mac_soft_ring_set_t *, pri_t); extern void mac_client_update_classifier(mac_client_impl_t *, boolean_t); -extern void mac_soft_ring_intr_enable(void *); +extern int mac_soft_ring_intr_enable(void *); extern boolean_t mac_soft_ring_intr_disable(void *); extern mac_soft_ring_t *mac_soft_ring_create(int, clock_t, uint16_t, pri_t, mac_client_impl_t *, mac_soft_ring_set_t *, diff --git a/usr/src/uts/common/xen/io/evtchn_dev.c b/usr/src/uts/common/xen/io/evtchn_dev.c index b4ba63b436..7a8d50eb33 100644 --- a/usr/src/uts/common/xen/io/evtchn_dev.c +++ b/usr/src/uts/common/xen/io/evtchn_dev.c @@ -112,8 +112,8 @@ static int evtchndrv_detach(dev_info_t *, ddi_detach_cmd_t); static struct evtsoftdata *port_user[NR_EVENT_CHANNELS]; static 
kmutex_t port_user_lock; -void -evtchn_device_upcall() +uint_t +evtchn_device_upcall(caddr_t arg __unused, caddr_t arg1 __unused) { struct evtsoftdata *ep; int port; @@ -154,6 +154,7 @@ evtchn_device_upcall() done: mutex_exit(&port_user_lock); + return (DDI_INTR_CLAIMED); } /* ARGSUSED */ diff --git a/usr/src/uts/i86pc/io/immu_intrmap.c b/usr/src/uts/i86pc/io/immu_intrmap.c index ab9f9bcbe7..737eed2efa 100644 --- a/usr/src/uts/i86pc/io/immu_intrmap.c +++ b/usr/src/uts/i86pc/io/immu_intrmap.c @@ -63,7 +63,7 @@ typedef struct intrmap_rte { (p)) typedef enum { - SVT_NO_VERIFY = 0, /* no verification */ + SVT_NO_VERIFY = 0, /* no verification */ SVT_ALL_VERIFY, /* using sid and sq to verify */ SVT_BUS_VERIFY, /* verify #startbus and #endbus */ SVT_RSVD @@ -224,7 +224,7 @@ bitset_find_multi_free(bitset_t *b, uint_t post, uint_t count) } } - return (INTRMAP_IDX_FULL); /* no free index */ + return (INTRMAP_IDX_FULL); /* no free index */ } /* alloc one interrupt remapping table entry */ @@ -495,11 +495,12 @@ intrmap_enable(immu_t *immu) /* * immu_intr_handler() - * the fault event handler for a single immu unit + * the fault event handler for a single immu unit */ -int -immu_intr_handler(immu_t *immu) +uint_t +immu_intr_handler(caddr_t arg, caddr_t arg1 __unused) { + immu_t *immu = (immu_t *)arg; uint32_t status; int index, fault_reg_offset; int max_fault_index; @@ -995,10 +996,10 @@ immu_intr_register(immu_t *immu) "%s-intr-handler", immu->immu_name); (void) add_avintr((void *)NULL, IMMU_INTR_IPL, - (avfunc)(immu_intr_handler), intr_handler_name, irq, + immu_intr_handler, intr_handler_name, irq, (caddr_t)immu, NULL, NULL, NULL); immu_regs_intr_enable(immu, msi_addr, msi_data, uaddr); - (void) immu_intr_handler(immu); + (void) immu_intr_handler((caddr_t)immu, NULL); } diff --git a/usr/src/uts/i86pc/io/immu_regs.c b/usr/src/uts/i86pc/io/immu_regs.c index dc43b0f49a..d6b184416a 100644 --- a/usr/src/uts/i86pc/io/immu_regs.c +++ b/usr/src/uts/i86pc/io/immu_regs.c @@ -253,7 +253,7 @@ gaw2agaw(int gaw) /* * set_immu_agaw() - * calculate agaw for a IOMMU unit + * calculate agaw for a IOMMU unit */ static int set_agaw(immu_t *immu) @@ -481,7 +481,7 @@ immu_regs_resume(immu_t *immu) immu_regs_intr_enable(immu, immu->immu_regs_intr_msi_addr, immu->immu_regs_intr_msi_data, immu->immu_regs_intr_uaddr); - (void) immu_intr_handler(immu); + (void) immu_intr_handler((caddr_t)immu, NULL); immu_regs_intrmap_enable(immu, immu->immu_intrmap_irta_reg); @@ -638,7 +638,7 @@ immu_regs_wbf_flush(immu_t *immu) /* * immu_regs_cpu_flush() - * flush the cpu cache line after CPU memory writes, so + * flush the cpu cache line after CPU memory writes, so * IOMMU can see the writes */ void diff --git a/usr/src/uts/i86pc/sys/immu.h b/usr/src/uts/i86pc/sys/immu.h index 70193d26e6..22ae9ad3bf 100644 --- a/usr/src/uts/i86pc/sys/immu.h +++ b/usr/src/uts/i86pc/sys/immu.h @@ -130,11 +130,11 @@ typedef struct drhd { kmutex_t dr_lock; /* protects the dmar field */ struct immu *dr_immu; dev_info_t *dr_dip; - uint16_t dr_seg; - uint64_t dr_regs; + uint16_t dr_seg; + uint64_t dr_regs; boolean_t dr_include_all; - list_t dr_scope_list; - list_node_t dr_node; + list_t dr_scope_list; + list_node_t dr_node; } drhd_t; typedef struct rmrr { @@ -638,7 +638,7 @@ typedef struct immu { * Enough space to hold the decimal number of any device instance. * Used for device/cache names. 
*/ -#define IMMU_ISTRLEN 11 /* log10(2^31) + 1 */ +#define IMMU_ISTRLEN 11 /* log10(2^31) + 1 */ /* properties that control DVMA */ #define DDI_DVMA_MAPTYPE_ROOTNEX_PROP "immu-dvma-mapping" @@ -677,7 +677,7 @@ typedef struct domain { /* list node for list of domains off immu */ list_node_t dom_immu_node; - mod_hash_t *dom_cookie_hash; + mod_hash_t *dom_cookie_hash; /* topmost device in domain; usually the device itself (non-shared) */ dev_info_t *dom_dip; @@ -944,7 +944,7 @@ void immu_intrmap_destroy(list_t *immu_list); /* registers interrupt handler for IOMMU unit */ void immu_intr_register(immu_t *immu); -int immu_intr_handler(immu_t *immu); +uint_t immu_intr_handler(caddr_t, caddr_t); /* immu_qinv.c interfaces */ diff --git a/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c b/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c index 8bc46f8e3e..a7745fd3f2 100644 --- a/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c +++ b/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c @@ -74,7 +74,7 @@ static gcpu_poll_trace_ctl_t gcpu_xpv_poll_trace_ctl; #define GCPU_XPV_MCH_POLL_NO_REARM NULL static uint_t -gcpu_xpv_virq_intr(void) +gcpu_xpv_virq_intr(caddr_t arg __unused, caddr_t arg1 __unused) { int types[] = { XEN_MC_URGENT, XEN_MC_NONURGENT }; uint64_t fetch_id; @@ -194,7 +194,7 @@ gcpu_mca_poll_start(cmi_hdl_t hdl) */ gcpu_xpv_virq_vect = ec_bind_virq_to_irq(VIRQ_MCA, 0); (void) add_avintr(NULL, gcpu_xpv_virq_level, - (avfunc)gcpu_xpv_virq_intr, "MCA", gcpu_xpv_virq_vect, + gcpu_xpv_virq_intr, "MCA", gcpu_xpv_virq_vect, NULL, NULL, NULL, NULL); } } diff --git a/usr/src/uts/i86xpv/io/psm/xpv_psm.c b/usr/src/uts/i86xpv/io/psm/xpv_psm.c index bc0ab7748d..94308c3f2f 100644 --- a/usr/src/uts/i86xpv/io/psm/xpv_psm.c +++ b/usr/src/uts/i86xpv/io/psm/xpv_psm.c @@ -223,14 +223,13 @@ xen_psm_hrtimeinit(void) } /* xen_psm NMI handler */ -/*ARGSUSED*/ -static void -xen_psm_nmi_intr(caddr_t arg, struct regs *rp) +static uint_t +xen_psm_nmi_intr(caddr_t arg __unused, caddr_t arg1 __unused) { xen_psm_num_nmis++; if (!lock_try(&xen_psm_nmi_lock)) - return; + return (DDI_INTR_UNCLAIMED); if (xen_psm_kmdb_on_nmi && psm_debugger()) { debug_enter("NMI received: entering kmdb\n"); @@ -247,6 +246,7 @@ xen_psm_nmi_intr(caddr_t arg, struct regs *rp) } lock_clear(&xen_psm_nmi_lock); + return (DDI_INTR_CLAIMED); } static void @@ -294,7 +294,7 @@ xen_psm_picinit() /* add nmi handler - least priority nmi handler */ LOCK_INIT_CLEAR(&xen_psm_nmi_lock); - if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr, + if (!psm_add_nmintr(0, xen_psm_nmi_intr, "xVM_psm NMI handler", (caddr_t)NULL)) cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler"); } |
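The interrupt-handler hunks above (evtchn_device_upcall(), immu_intr_handler(), gcpu_xpv_virq_intr(), xen_psm_nmi_intr()) all converge on the common uint_t (*)(caddr_t, caddr_t) shape, returning DDI_INTR_CLAIMED or DDI_INTR_UNCLAIMED, so the (avfunc) casts at the add_avintr() and psm_add_nmintr() call sites can be dropped. A minimal sketch of that pattern follows, assuming a hypothetical driver; the example_* names, soft-state type, IPL, and vector are placeholders, not part of the commit.

/* Hypothetical driver state and helpers, for the sketch only. */
typedef struct example_softc example_softc_t;
extern boolean_t example_intr_pending(example_softc_t *);
extern void example_intr_service(example_softc_t *);

/*
 * Sketch only: a handler in the signature the hunks above use.
 * Unused arguments are tagged __unused instead of relying on ARGSUSED.
 */
static uint_t
example_intr(caddr_t arg, caddr_t arg1 __unused)
{
	example_softc_t *sc = (example_softc_t *)arg;

	if (!example_intr_pending(sc))
		return (DDI_INTR_UNCLAIMED);

	example_intr_service(sc);
	return (DDI_INTR_CLAIMED);
}

static void
example_intr_register(example_softc_t *sc, int ipl, int vect)
{
	/* No (avfunc) cast is needed once the handler has the avintr type. */
	(void) add_avintr(NULL, ipl, example_intr, "example_intr", vect,
	    (caddr_t)sc, NULL, NULL, NULL);
}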