Diffstat (limited to 'usr/src/uts/common')
62 files changed, 1364 insertions, 382 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index ce7b7a3e6a..720701371d 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -110,6 +110,7 @@ GENUNIX_OBJS += \ bio.o \ bitmap.o \ blabel.o \ + bootbanner.o \ brandsys.o \ bz2blocksort.o \ bz2compress.o \ diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 8a906a2e25..bb80ca63c4 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -26,6 +26,7 @@ # Copyright 2020 Joyent, Inc. # Copyright 2018 Nexenta Systems, Inc. # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2020 Oxide Computer Company # # @@ -1563,6 +1564,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/krtld/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/bootbanner/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(COMMONBASE)/list/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1591,6 +1596,13 @@ $(OBJS_DIR)/%.o: $(COMMONBASE)/refhash/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/bootbanner.o := CPPFLAGS += \ + -DBOOTBANNER1='"$(BOOTBANNER1)"' \ + -DBOOTBANNER2='"$(BOOTBANNER2)"' \ + -DBOOTBANNER3='"$(BOOTBANNER3)"' \ + -DBOOTBANNER4='"$(BOOTBANNER4)"' \ + -DBOOTBANNER5='"$(BOOTBANNER5)"' + $(OBJS_DIR)/%.o: $(UTSBASE)/common/os/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c index fed6be37cf..c7e5351778 100644 --- a/usr/src/uts/common/brand/lx/os/lx_brand.c +++ b/usr/src/uts/common/brand/lx/os/lx_brand.c @@ -25,7 +25,7 @@ */ /* - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. */ /* @@ -1402,8 +1402,15 @@ lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, if (p->p_brand == NULL) return (ENOSYS); - VERIFY(p->p_brand == &lx_brand); - VERIFY(p->p_brand_data != NULL); + /* + * Certain native applications may wish to start the lx_lockd process. + * Every other process that's not branded should be denied. + */ + if (p->p_brand != &lx_brand && cmd != B_START_NFS_LOCKD) + return (ENOSYS); + + if (cmd != B_START_NFS_LOCKD) + VERIFY(p->p_brand_data != NULL); switch (cmd) { case B_REGISTER: diff --git a/usr/src/uts/common/brand/lx/os/lx_lockd.c b/usr/src/uts/common/brand/lx/os/lx_lockd.c index d6d965398a..37b744b0e8 100644 --- a/usr/src/uts/common/brand/lx/os/lx_lockd.c +++ b/usr/src/uts/common/brand/lx/os/lx_lockd.c @@ -297,6 +297,18 @@ lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host) * as we pass to monitor, so that is also handled here by this same * brand hook. */ + + /* + * If the NLM was set up to be "v4 only" (i.e. no RPC call handlers + * to localhost at configure time), the semaphore is uninitialized, + * and will indefinitely hang. FURTHERMORE if just the semaphore + * was initialized, we'd still panic with a NULL nsm->ns_handle. 
+ */ + if (g->nlm_v4_only) { + stat = RPC_SYSTEMERROR; + goto bail; + } + nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj); nsm = &g->nlm_nsm; @@ -327,6 +339,7 @@ lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host) } sema_v(&nsm->ns_sem); +bail: if (stat != RPC_SUCCESS) { NLM_WARN("Failed to contact local statd, stat=%d", stat); if (op == SM_MON) { diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h index 90d87d78a8..85aa5e34bd 100644 --- a/usr/src/uts/common/brand/lx/sys/lx_brand.h +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -94,6 +94,7 @@ extern "C" { #define B_LPID_TO_SPAIR 128 #define B_GET_CURRENT_CONTEXT 129 #define B_EMULATION_DONE 130 +/* Some native programs use B_START_NFS_LOCKD, so don't change this. */ #define B_START_NFS_LOCKD 131 #define B_BLOCK_ALL_SIGS 132 #define B_UNBLOCK_ALL_SIGS 133 diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index b619719ba9..fa7b9fb2fc 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -24,7 +24,7 @@ * Copyright 2016 Gary Mills * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2019 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. */ #include <sys/dsl_scan.h> @@ -549,6 +549,22 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", (longlong_t)scn->scn_restart_txg); + } else if (dsl_scan_resilvering(dp)) { + /* + * If a resilver is in progress and there are already + * errors, restart it instead of finishing this scan and + * then restarting it. If there haven't been any errors + * then remember that the incore DTL is valid. + */ + if (scn->scn_phys.scn_errors > 0) { + scn->scn_restart_txg = txg; + zfs_dbgmsg("resilver can't excise DTL_MISSING " + "when finished; restarting in txg %llu", + (u_longlong_t)scn->scn_restart_txg); + } else { + /* it's safe to excise DTL when finished */ + spa->spa_scrub_started = B_TRUE; + } } } @@ -599,6 +615,13 @@ dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) } boolean_t +dsl_scan_resilver_scheduled(dsl_pool_t *dp) +{ + return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) || + (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER)); +} + +boolean_t dsl_scan_scrubbing(const dsl_pool_t *dp) { dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; @@ -794,7 +817,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_RESILVER) { - dsl_resilver_restart(spa->spa_dsl_pool, 0); + dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); return (0); } @@ -813,41 +836,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } -/* - * Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns - * B_TRUE if we have devices that need to be resilvered and are available to - * accept resilver I/Os. 
- */ -static boolean_t -dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx) -{ - boolean_t resilver_needed = B_FALSE; - spa_t *spa = vd->vdev_spa; - - for (int c = 0; c < vd->vdev_children; c++) { - resilver_needed |= - dsl_scan_clear_deferred(vd->vdev_child[c], tx); - } - - if (vd == spa->spa_root_vdev && - spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { - spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); - vdev_config_dirty(vd); - spa->spa_resilver_deferred = B_FALSE; - return (resilver_needed); - } - - if (!vdev_is_concrete(vd) || vd->vdev_aux || - !vd->vdev_ops->vdev_op_leaf) - return (resilver_needed); - - if (vd->vdev_resilver_deferred) - vd->vdev_resilver_deferred = B_FALSE; - - return (!vdev_is_dead(vd) && !vd->vdev_offline && - vdev_resilver_needed(vd, NULL, NULL)); -} - /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) @@ -915,7 +903,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) "errors=%llu", spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* @@ -943,30 +930,33 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) spa_errlog_rotate(spa); /* + * Don't clear flag until after vdev_dtl_reassess to ensure that + * DTL_MISSING will get updated when possible. + */ + spa->spa_scrub_started = B_FALSE; + + /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* - * Clear any deferred_resilver flags in the config. + * Clear any resilver_deferred flags in the config. * If there are drives that need resilvering, kick * off an asynchronous request to start resilver. - * dsl_scan_clear_deferred() may update the config + * vdev_clear_resilver_deferred() may update the config * before the resilver can restart. In the event of * a crash during this period, the spa loading code * will find the drives that need to be resilvered - * when the machine reboots and start the resilver then. + * and start the resilver then. */ - if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - boolean_t resilver_needed = - dsl_scan_clear_deferred(spa->spa_root_vdev, tx); - if (resilver_needed) { - spa_history_log_internal(spa, - "starting deferred resilver", tx, - "errors=%llu", spa_get_errlog_size(spa)); - spa_async_request(spa, SPA_ASYNC_RESILVER); - } + if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) && + vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { + spa_history_log_internal(spa, + "starting deferred resilver", tx, "errors=%llu", + (u_longlong_t)spa_get_errlog_size(spa)); + spa_async_request(spa, SPA_ASYNC_RESILVER); } } @@ -1073,7 +1063,7 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) /* start a new scan, or restart an existing one. 
*/ void -dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; @@ -1221,10 +1211,13 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) static boolean_t dsl_scan_should_clear(dsl_scan_t *scn) { + spa_t *spa = scn->scn_dp->dp_spa; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - uint64_t mlim_hard, mlim_soft, mused; - uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( - scn->scn_dp->dp_spa)); + uint64_t alloc, mlim_hard, mlim_soft, mused; + + alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + alloc += metaslab_class_get_alloc(spa_special_class(spa)); + alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, zfs_scan_mem_lim_min); @@ -4208,3 +4201,33 @@ dsl_scan_freed(spa_t *spa, const blkptr_t *bp) for (int i = 0; i < BP_GET_NDVAS(bp); i++) dsl_scan_freed_dva(spa, bp, i); } + +/* + * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has + * not started, start it. Otherwise, only restart if max txg in DTL range is + * greater than the max txg in the current scan. If the DTL max is less than + * the scan max, then the vdev has not missed any new data since the resilver + * started, so a restart is not needed. + */ +void +dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) +{ + uint64_t min, max; + + if (!vdev_resilver_needed(vd, &min, &max)) + return; + + if (!dsl_scan_resilvering(dp)) { + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); + return; + } + + if (max <= dp->dp_scan->scn_phys.scn_max_txg) + return; + + /* restart is needed, check if it can be deferred */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) + vdev_defer_resilver(vd); + else + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); +} diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 547fa1e2bb..fc08eebbc0 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -27,9 +27,9 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome <tsoome@me.com> + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2017 Datto Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. * Copyright 2020 Joshua M. 
Clulow <josh@sysmgr.org> */ @@ -6397,9 +6397,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, newvd); + vdev_defer_resilver(newvd); else - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -7637,7 +7637,7 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_RESILVER && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) - dsl_resilver_restart(dp, 0); + dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); @@ -7753,6 +7753,12 @@ spa_async_request(spa_t *spa, int task) mutex_exit(&spa->spa_async_lock); } +int +spa_async_tasks(spa_t *spa) +{ + return (spa->spa_async_tasks); +} + /* * ========================================================================== * SPA syncing routines diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h index 1b600405ae..4693293290 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. */ #ifndef _SYS_DSL_SCAN_H @@ -164,10 +164,12 @@ void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); int dsl_scan_cancel(struct dsl_pool *); int dsl_scan(struct dsl_pool *, pool_scan_func_t); +void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); -void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); +void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); boolean_t dsl_scan_resilvering(struct dsl_pool *dp); +boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 31faac4f77..33cdfbeb4b 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -26,7 +26,7 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2019 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright 2020 Joshua M. 
Clulow <josh@sysmgr.org> */ @@ -775,6 +775,7 @@ extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); +extern int spa_async_tasks(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index a6de7e6f2c..b8c2ee5c9e 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Datto Inc. All rights reserved. */ #ifndef _SYS_VDEV_H @@ -153,6 +154,8 @@ extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); extern void vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd); +extern void vdev_defer_resilver(vdev_t *vd); +extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx); typedef enum vdev_config_flag { VDEV_CONFIG_SPARE = 1 << 0, diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 9947bedf54..60d4d6805f 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_ZFS_IOCTL_H @@ -389,6 +390,10 @@ typedef struct zinject_record { #define ZI_NO_DVA (-1) +/* scaled frequency ranges */ +#define ZI_PERCENTAGE_MIN 4294UL +#define ZI_PERCENTAGE_MAX UINT32_MAX + typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 01e892f4c4..9773ec7960 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -27,6 +27,7 @@ * Copyright 2016 Toomas Soome <tsoome@me.com> * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Datto Inc. All rights reserved. */ #include <sys/zfs_context.h> @@ -98,6 +99,12 @@ boolean_t vdev_validate_skip = B_FALSE; int zfs_vdev_dtl_sm_blksz = (1 << 12); /* + * Ignore errors during scrub/resilver. Allows to work around resilver + * upon import when there are pool errors. + */ +int zfs_scan_ignore_errors = 0; + +/* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. @@ -772,7 +779,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_resilver_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, vd); + vdev_defer_resilver(vd); /* * When importing a pool, we want to ignore the persistent fault @@ -1764,18 +1771,12 @@ vdev_open(vdev_t *vd) } /* - * If a leaf vdev has a DTL, and seems healthy, then kick off a - * resilver. But don't do this if we are doing a reopen for a scrub, - * since this would just restart the scrub we are already doing. + * If this is a leaf vdev, assess whether a resilver is needed. 
+ * But don't do this if we are doing a reopen for a scrub, since + * this would just restart the scrub we are already doing. */ - if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && - vdev_resilver_needed(vd, NULL, NULL)) { - if (dsl_scan_resilvering(spa->spa_dsl_pool) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, vd); - else - spa_async_request(spa, SPA_ASYNC_RESILVER); - } + if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) + dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } @@ -2470,7 +2471,6 @@ vdev_dtl_should_excise(vdev_t *vd) spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) @@ -2520,10 +2520,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* + * If requested, pretend the scan completed cleanly. + */ + if (zfs_scan_ignore_errors && scn) + scn->scn_phys.scn_errors = 0; + + if (scrub_txg != 0 && + !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + wasempty = B_FALSE; + zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " + "dtl:%llu/%llu errors:%llu", + (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, + (u_longlong_t)scrub_txg, spa->spa_scrub_started, + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd), + (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); + } + + /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during @@ -2559,6 +2578,14 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); + + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + zfs_dbgmsg("update DTL_MISSING:%llu/%llu", + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd)); + } else if (!wasempty) { + zfs_dbgmsg("DTL_MISSING is now empty"); + } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], @@ -3543,14 +3570,11 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); - if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) { - if (dsl_scan_resilvering(spa->spa_dsl_pool) && - spa_feature_is_enabled(spa, - SPA_FEATURE_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, vd); - else - spa_async_request(spa, SPA_ASYNC_RESILVER); - } + /* If a resilver isn't required, check if vdevs can be culled */ + if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && + !dsl_scan_resilvering(spa->spa_dsl_pool) && + !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } @@ -4559,18 +4583,46 @@ vdev_deadman(vdev_t *vd) } void -vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) +vdev_defer_resilver(vdev_t *vd) { - for (uint64_t i = 0; i < vd->vdev_children; i++) - vdev_set_deferred_resilver(spa, vd->vdev_child[i]); + ASSERT(vd->vdev_ops->vdev_op_leaf); - if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { - return; + vd->vdev_resilver_deferred = B_TRUE; + vd->vdev_spa->spa_resilver_deferred = B_TRUE; +} + +/* + * Clears the resilver 
deferred flag on all leaf devs under vd. Returns + * B_TRUE if we have devices that need to be resilvered and are available to + * accept resilver I/Os. + */ +boolean_t +vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) +{ + boolean_t resilver_needed = B_FALSE; + spa_t *spa = vd->vdev_spa; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } - vd->vdev_resilver_deferred = B_TRUE; - spa->spa_resilver_deferred = B_TRUE; + if (vd == spa->spa_root_vdev && + spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { + spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); + vdev_config_dirty(vd); + spa->spa_resilver_deferred = B_FALSE; + return (resilver_needed); + } + + if (!vdev_is_concrete(vd) || vd->vdev_aux || + !vd->vdev_ops->vdev_op_leaf) + return (resilver_needed); + + vd->vdev_resilver_deferred = B_FALSE; + + return (!vdev_is_dead(vd) && !vd->vdev_offline && + vdev_resilver_needed(vd, NULL, NULL)); } /* diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index a65721d175..e332da9672 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. */ /* @@ -100,6 +101,26 @@ static kmutex_t inject_delay_mtx; static int inject_next_id = 1; /* + * Test if the requested frequency was triggered + */ +static boolean_t +freq_triggered(uint32_t frequency) +{ + /* + * zero implies always (100%) + */ + if (frequency == 0) + return (B_TRUE); + + /* + * Note: we still handle legacy (unscaled) frequency values + */ + uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX; + + return (spa_get_random(maximum) < frequency); +} + +/* * Returns true if the given record matches the I/O in progress. */ static boolean_t @@ -114,8 +135,7 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva, record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); + return (freq_triggered(record->zi_freq)); else return (B_FALSE); } @@ -130,8 +150,7 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva, zb->zb_blkid <= record->zi_end && (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && error == record->zi_error) { - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); + return (freq_triggered(record->zi_freq)); } return (B_FALSE); @@ -360,6 +379,12 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) if (handler->zi_record.zi_error == error) { /* + * limit error injection if requested + */ + if (!freq_triggered(handler->zi_record.zi_freq)) + continue; + + /* * For a failed open, pretend like the device * has gone away.
*/ @@ -527,6 +552,9 @@ zio_handle_io_delay(zio_t *zio) if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) continue; + if (!freq_triggered(handler->zi_record.zi_freq)) + continue; + if (vd->vdev_guid != handler->zi_record.zi_guid) continue; diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index 9aeba33d30..b16fc9bf5f 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -731,6 +731,7 @@ ipf_hook_protocol_notify(hook_notify_cmd_t command, void *arg, hook_hint_t hint; boolean_t out; int ret = 0; + const boolean_t gz = ifs->ifs_gz_controlled; /* We currently only care about viona hooks notifications */ @@ -2438,42 +2439,6 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg) return ipf_hook6(info, 1, FI_NOCKSUM, arg); } -/* ------------------------------------------------------------------------ */ -/* Function: ipf_hookvndl3_in */ -/* Returns: int - 0 == packet ok, else problem, free packet if not done */ -/* Parameters: event(I) - pointer to event */ -/* info(I) - pointer to hook information for firewalling */ -/* */ -/* The vnd hooks are private hooks to ON. They represents a layer 2 */ -/* datapath generally used to implement virtual machines. The driver sends */ -/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ -/* them is in the upper 16 bits while the remaining bits are the */ -/* traditional packet hook flags. */ -/* */ -/* They end up calling the appropriate traditional ip hooks. */ -/* ------------------------------------------------------------------------ */ -/*ARGSUSED*/ -int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) -{ - return ipf_hook4_in(token, info, arg); -} - -int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) -{ - return ipf_hook6_in(token, info, arg); -} - -/*ARGSUSED*/ -int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) -{ - return ipf_hook4_out(token, info, arg); -} - -int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) -{ - return ipf_hook6_out(token, info, arg); -} - /* Static constants used by ipf_hook_ether */ static uint8_t ipf_eth_bcast_addr[ETHERADDRL] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF @@ -2569,6 +2534,42 @@ int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg, } /* ------------------------------------------------------------------------ */ +/* Function: ipf_hookvndl3_in */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The vnd hooks are private hooks to ON. They represents a layer 2 */ +/* datapath generally used to implement virtual machines. The driver sends */ +/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ +/* them is in the upper 16 bits while the remaining bits are the */ +/* traditional packet hook flags. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. 
*/ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hookviona_{in,out} */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ diff --git a/usr/src/uts/common/io/audio/impl/audio_grc3.h b/usr/src/uts/common/io/audio/impl/audio_grc3.h index 0003dc1574..4472307edf 100644 --- a/usr/src/uts/common/io/audio/impl/audio_grc3.h +++ b/usr/src/uts/common/io/audio/impl/audio_grc3.h @@ -53,7 +53,7 @@ typedef struct grc3state { int32_t *historyptr; int32_t dummy_pad1; - int32_t history[GRC3_MAXHISTORY * 2]; + int32_t history[GRC3_MAXHISTORY * 2 + 1]; uint32_t outsz; } grc3state_t; diff --git a/usr/src/uts/common/io/bge/bge_main2.c b/usr/src/uts/common/io/bge/bge_main2.c index ab511c068d..81b6528c7c 100644 --- a/usr/src/uts/common/io/bge/bge_main2.c +++ b/usr/src/uts/common/io/bge/bge_main2.c @@ -1437,8 +1437,49 @@ bge_unicst_find(bge_t *bgep, const uint8_t *mac_addr) } /* - * Programs the classifier to start steering packets matching 'mac_addr' to the - * specified ring 'arg'. + * The job of bge_addmac() is to set up everything in hardware for the mac + * address indicated to map to the specified group. + * + * For this to make sense, we need to first understand how most of the bge chips + * work. A given packet reaches a ring in two distinct logical steps: + * + * 1) The device must accept the packet. + * 2) The device must steer an accepted packet to a specific ring. + * + * For step 1, the device has four global MAC address filtering registers. We + * must either add the address here or put the device in promiscuous mode. + * Because there are only four of these and up to four groups, each group is + * only allowed to program a single entry. Note, this is not explicitly done in + * the driver. Rather, it is implicitly done by how we implement step 2. These + * registers start at 0x410 and are referred to as the 'EMAC MAC Addresses' in + * the manuals. + * + * For step 2, the device has eight sets of rule registers that are used to + * control how a packet in step 1 is mapped to a specific ring. Each set is + * comprised of a control register and a mask register. These start at 0x480 and + * are referred to as the 'Receive Rules Control Registers' and 'Receive Rules + * Value/Mask Registers'. These can be used to check for a 16-bit or 32-bit + * value at an offset in the packet. In addition, two sets can be combined to + * create a single conditional rule. + * + * For our purposes, we need to use this mechanism to steer a mac address to a + * specific ring. This requires that we use two of the sets of registers per MAC + * address that comes in here. The data about this is stored in 'mac_addr_rule' + * member of the 'recv_ring_t'. + * + * A reasonable question to ask is why are we storing this on the ring, when it + * relates to the group. 
The answer is that the current implementation of the + * driver assumes that each group is comprised of a single ring. While some + * parts may support additional rings, the driver doesn't take advantage of + * that. + * + * A result of all this is that the driver will support up to 4 groups today. + * Each group has a single ring. We want to make sure that each group can have a + * single MAC address programmed into it. This results in the check for a rule + * being assigned in the 'mac_addr_rule' member of the recv_ring_t below. If a + * future part were to support more global MAC address filters in part 1 and + * more rule registers needed for part 2, then we could relax this constraint + * and allow a group to have more than one MAC address assigned to it. */ static int bge_addmac(void *arg, const uint8_t * mac_addr) @@ -1461,7 +1502,10 @@ bge_addmac(void *arg, const uint8_t * mac_addr) } /* - * First add the unicast address to a available slot. + * The driver only supports a MAC address being programmed to be + * received by one ring in step 2. We check the global table of MAC + * addresses to see if this address has already been claimed by another + * group as a way to determine that. */ slot = bge_unicst_find(bgep, mac_addr); if (slot != -1) { @@ -1469,6 +1513,17 @@ bge_addmac(void *arg, const uint8_t * mac_addr) return (EEXIST); } + /* + * Check to see if this group has already used its hardware resources + * for step 2. If so, we have to return ENOSPC to MAC to indicate that + * this group cannot handle an additional MAC address and that MAC will + * need to use software classification on the default group. + */ + if (rrp->mac_addr_rule != NULL) { + mutex_exit(bgep->genlock); + return (ENOSPC); + } + for (slot = 0; slot < bgep->unicst_addr_total; slot++) { if (!bgep->curr_addr[slot].set) { bgep->curr_addr[slot].set = B_TRUE; @@ -1483,12 +1538,6 @@ bge_addmac(void *arg, const uint8_t * mac_addr) if ((err = bge_unicst_set(bgep, mac_addr, slot)) != 0) goto fail; - /* A rule is already here. Deny this. */ - if (rrp->mac_addr_rule != NULL) { - err = ether_cmp(mac_addr, rrp->mac_addr_val) ? EEXIST : EBUSY; - goto fail; - } - /* * Allocate a bge_rule_info_t to keep track of which rule slots * are being used. 
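The two-step model described in the bge_addmac() comment above comes down to a small piece of bookkeeping per receive group. The following is a minimal sketch of that constraint (hypothetical names and types, not the actual bge code): a group whose rule-register pair is already claimed reports ENOSPC, which tells the MAC layer to fall back to software classification on the default group.

#include <stdint.h>
#include <errno.h>
#include <stddef.h>

#define	SKETCH_MAX_UNICAST	4	/* four global EMAC MAC address filters */

typedef struct sketch_ring {
	void	*sr_mac_addr_rule;	/* non-NULL once this group's rule pair is claimed */
	uint8_t	sr_mac_addr_val[6];	/* the one MAC address steered to this ring */
} sketch_ring_t;

/*
 * Step 1 would claim one of the four global filters; step 2 would program a
 * pair of receive-rule registers to steer the address to this group's ring.
 * Only the resource-exhaustion check is shown here.
 */
static int
sketch_addmac(sketch_ring_t *srp, const uint8_t *mac_addr)
{
	if (srp->sr_mac_addr_rule != NULL)
		return (ENOSPC);	/* group already steers one address */

	/* ... find a free filter slot, program the filter and rule pair ... */
	(void) mac_addr;
	return (0);
}

Returning ENOSPC here, rather than EEXIST or EBUSY, is the point of the change above: it is the code MAC interprets as "this group is out of hardware classification resources".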
diff --git a/usr/src/uts/common/io/bnx/bnx.h b/usr/src/uts/common/io/bnx/bnx.h index e1d53fa9d7..9ef282678e 100644 --- a/usr/src/uts/common/io/bnx/bnx.h +++ b/usr/src/uts/common/io/bnx/bnx.h @@ -55,12 +55,6 @@ extern "C" { -/* - */ -#pragma weak hcksum_retrieve -#pragma weak hcksum_assoc - - #include "listq.h" #include "lm5706.h" #include "54xx_reg.h" diff --git a/usr/src/uts/common/io/bnx/bnxsnd.c b/usr/src/uts/common/io/bnx/bnxsnd.c index 16f1b03c10..f6e154c056 100644 --- a/usr/src/uts/common/io/bnx/bnxsnd.c +++ b/usr/src/uts/common/io/bnx/bnxsnd.c @@ -611,7 +611,7 @@ bnx_xmit_ring_xmit_mblk(um_device_t * const umdevice, umpacket->frag_list.cnt = 0; umpacket->mp = mp; - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags); + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags); bnx_xmit_pkt_cpy(umdevice, umpacket); diff --git a/usr/src/uts/common/io/chxge/pe.c b/usr/src/uts/common/io/chxge/pe.c index 652edba984..48a796470a 100644 --- a/usr/src/uts/common/io/chxge/pe.c +++ b/usr/src/uts/common/io/chxge/pe.c @@ -414,12 +414,12 @@ pe_start(ch_t *sa, mblk_t *mp, uint32_t flg) lseg = ch_bind_dvma_handle(sa, len, (void *)mp->b_rptr, &hmp[nseg], mseg - nseg); - if (lseg == NULL) { + if (lseg == 0) { sa->sge->intr_cnt.tx_no_dvma1++; if ((lseg = ch_bind_dma_handle(sa, len, (void *)mp->b_rptr, &hmp[nseg], - mseg - nseg)) == NULL) { + mseg - nseg)) == 0) { sa->sge->intr_cnt.tx_no_dma1++; /* @@ -444,7 +444,7 @@ pe_start(ch_t *sa, mblk_t *mp, uint32_t flg) lseg = ch_bind_dma_handle(sa, len, (void *)mp->b_rptr, &hmp[nseg], mseg - nseg); - if (lseg == NULL) { + if (lseg == 0) { sa->sge->intr_cnt.tx_no_dma1++; /* @@ -512,12 +512,12 @@ pe_start(ch_t *sa, mblk_t *mp, uint32_t flg) nseg = ch_bind_dvma_handle(sa, len, (void *)mp->b_rptr, &hmp[0], 16); - if (nseg == NULL) { + if (nseg == 0) { sa->sge->intr_cnt.tx_no_dvma2++; nseg = ch_bind_dma_handle(sa, len, (void *)mp->b_rptr, &hmp[0], 16); - if (nseg == NULL) { + if (nseg == 0) { sa->sge->intr_cnt.tx_no_dma2++; /* @@ -530,7 +530,7 @@ pe_start(ch_t *sa, mblk_t *mp, uint32_t flg) } else { nseg = ch_bind_dma_handle(sa, len, (void *)mp->b_rptr, &hmp[0], 16); - if (nseg == NULL) { + if (nseg == 0) { sa->sge->intr_cnt.tx_no_dma2++; /* diff --git a/usr/src/uts/common/io/cmlb.c b/usr/src/uts/common/io/cmlb.c index 6275948465..f4ae9f3ed5 100644 --- a/usr/src/uts/common/io/cmlb.c +++ b/usr/src/uts/common/io/cmlb.c @@ -1514,7 +1514,7 @@ cmlb_create_minor_nodes(struct cmlb_lun *cl) if (cl->cl_alter_behavior & CMLB_CREATE_P0_MINOR_NODE) { if (cmlb_create_minor(CMLB_DEVINFO(cl), "q", S_IFBLK, (instance << CMLBUNIT_FORCE_P0_SHIFT) | P0_RAW_DISK, - cl->cl_node_type, NULL, internal) == DDI_FAILURE) { + cl->cl_node_type, 0, internal) == DDI_FAILURE) { ddi_remove_minor_node(CMLB_DEVINFO(cl), NULL); return (ENXIO); } @@ -1522,7 +1522,7 @@ cmlb_create_minor_nodes(struct cmlb_lun *cl) if (cmlb_create_minor(CMLB_DEVINFO(cl), "q,raw", S_IFCHR, (instance << CMLBUNIT_FORCE_P0_SHIFT) | P0_RAW_DISK, - cl->cl_node_type, NULL, internal) == DDI_FAILURE) { + cl->cl_node_type, 0, internal) == DDI_FAILURE) { ddi_remove_minor_node(CMLB_DEVINFO(cl), NULL); return (ENXIO); } diff --git a/usr/src/uts/common/io/cxgbe/common/common.h b/usr/src/uts/common/io/cxgbe/common/common.h index c7de2c4ebf..b8d77ebda3 100644 --- a/usr/src/uts/common/io/cxgbe/common/common.h +++ b/usr/src/uts/common/io/cxgbe/common/common.h @@ -20,6 +20,10 @@ * release for licensing terms and conditions. */ +/* + * Copyright 2020 RackTop Systems, Inc. 
+ */ + #ifndef __CHELSIO_COMMON_H #define __CHELSIO_COMMON_H @@ -103,9 +107,16 @@ enum { typedef unsigned char cc_pause_t; enum { - FEC_AUTO = 1 << 0, /* IEEE 802.3 "automatic" */ - FEC_RS = 1 << 1, /* Reed-Solomon */ - FEC_BASER_RS = 1 << 2, /* BaseR/Reed-Solomon */ + FEC_RS = 1 << 0, /* Reed-Solomon */ + FEC_BASER_RS = 1 << 1, /* Base-R, aka Firecode */ + FEC_NONE = 1 << 2, /* no FEC */ + + /* + * Pseudo FECs that translate to real FECs. The firmware knows nothing + * about these and they start at M_FW_PORT_CAP32_FEC + 1. AUTO should + * be set all by itself. + */ + FEC_AUTO = 1 << 5, }; typedef unsigned char cc_fec_t; diff --git a/usr/src/uts/common/io/cxgbe/common/t4_hw.c b/usr/src/uts/common/io/cxgbe/common/t4_hw.c index ae88f36f15..4bb48f1b3a 100644 --- a/usr/src/uts/common/io/cxgbe/common/t4_hw.c +++ b/usr/src/uts/common/io/cxgbe/common/t4_hw.c @@ -20,6 +20,10 @@ * release for licensing terms and conditions. */ +/* + * Copyright 2020 RackTop Systems, Inc. + */ + #include "common.h" #include "t4_regs.h" #include "t4_regs_values.h" @@ -4645,20 +4649,57 @@ static inline cc_fec_t fwcap_to_cc_fec(fw_port_cap32_t fw_fec) if (fw_fec & FW_PORT_CAP32_FEC_BASER_RS) cc_fec |= FEC_BASER_RS; - return cc_fec; + if (cc_fec == 0) + cc_fec = FEC_NONE; + + return (cc_fec); } /* Translate Common Code Forward Error Correction specification to Firmware */ -static inline fw_port_cap32_t cc_to_fwcap_fec(cc_fec_t cc_fec) +static inline boolean_t +cc_to_fwcap_fec(fw_port_cap32_t *fw_fecp, cc_fec_t cc_fec, + struct link_config *lc) { fw_port_cap32_t fw_fec = 0; - if (cc_fec & FEC_RS) + if ((cc_fec & FEC_AUTO) != 0) { + if ((lc->pcaps & FW_PORT_CAP32_SPEED_100G) == 0) + fw_fec |= FW_PORT_CAP32_FEC_BASER_RS; + + if ((lc->pcaps & FW_PORT_CAP32_FORCE_FEC) != 0) + fw_fec |= FW_PORT_CAP32_FEC_NO_FEC; + + fw_fec |= FW_PORT_CAP32_FEC_RS; + + *fw_fecp = fw_fec; + return (B_TRUE); + } + + if ((cc_fec & FEC_RS) != 0) fw_fec |= FW_PORT_CAP32_FEC_RS; - if (cc_fec & FEC_BASER_RS) + + if ((cc_fec & FEC_BASER_RS) != 0 && + (lc->pcaps & FW_PORT_CAP32_SPEED_100G) == 0) fw_fec |= FW_PORT_CAP32_FEC_BASER_RS; - return fw_fec; + if ((cc_fec & FEC_NONE) != 0) { + if ((lc->pcaps & FW_PORT_CAP32_FORCE_FEC) != 0) { + fw_fec |= FW_PORT_CAP32_FORCE_FEC; + fw_fec |= FW_PORT_CAP32_FEC_NO_FEC; + } + + *fw_fecp = fw_fec; + return (B_TRUE); + } + + if (fw_fec == 0) + return (B_FALSE); + + if ((lc->pcaps & FW_PORT_CAP32_FORCE_FEC) != 0) + fw_fec |= FW_PORT_CAP32_FORCE_FEC; + + *fw_fecp = fw_fec; + return (B_TRUE); } /** @@ -4692,11 +4733,18 @@ fw_port_cap32_t t4_link_acaps(struct adapter *adapter, unsigned int port, * the Transceiver Module EPROM FEC parameters. Otherwise we * use whatever is in the current Requested FEC settings. */ - if (lc->requested_fec & FEC_AUTO) - cc_fec = fwcap_to_cc_fec(lc->def_acaps); - else - cc_fec = lc->requested_fec; - fw_fec = cc_to_fwcap_fec(cc_fec); + if (fec_supported(lc->pcaps)) { + if (lc->requested_fec & FEC_AUTO) + cc_fec = fwcap_to_cc_fec(lc->def_acaps); + else + cc_fec = lc->requested_fec; + + if (!cc_to_fwcap_fec(&fw_fec, cc_fec, lc)) + return (0); + } else { + fw_fec = 0; + cc_fec = FEC_NONE; + } /* Figure out what our Requested Port Capabilities are going to be. * Note parallel structure in t4_handle_get_port_info() and @@ -9641,12 +9689,17 @@ static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps, lc->speed = 0; lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX; - /* - * For Forward Error Control, we default to whatever the Firmware - * tells us the Link is currently advertising. 
- */ - lc->requested_fec = FEC_AUTO; - lc->fec = fwcap_to_cc_fec(lc->def_acaps); + if (fec_supported(pcaps)) { + /* + * For Forward Error Control, we default to whatever the Firmware + * tells us the Link is currently advertising. + */ + lc->requested_fec = FEC_AUTO; + lc->fec = fwcap_to_cc_fec(lc->def_acaps); + } else { + lc->requested_fec = FEC_NONE; + lc->fec = FEC_NONE; + } /* If the Port is capable of Auto-Negtotiation, initialize it as * "enabled" and copy over all of the Physical Port Capabilities diff --git a/usr/src/uts/common/io/cxgbe/firmware/t4fw_interface.h b/usr/src/uts/common/io/cxgbe/firmware/t4fw_interface.h index d705c73891..b998e85bae 100644 --- a/usr/src/uts/common/io/cxgbe/firmware/t4fw_interface.h +++ b/usr/src/uts/common/io/cxgbe/firmware/t4fw_interface.h @@ -11,6 +11,10 @@ * release for licensing terms and conditions. */ +/* + * Copyright 2020 RackTop Systems, Inc. + */ + #ifndef _T4FW_INTERFACE_H_ #define _T4FW_INTERFACE_H_ @@ -7204,11 +7208,12 @@ enum fw_port_mdi { #define FW_PORT_CAP32_MDISTRAIGHT 0x00400000UL #define FW_PORT_CAP32_FEC_RS 0x00800000UL #define FW_PORT_CAP32_FEC_BASER_RS 0x01000000UL -#define FW_PORT_CAP32_FEC_RESERVED1 0x02000000UL +#define FW_PORT_CAP32_FEC_NO_FEC 0x02000000UL #define FW_PORT_CAP32_FEC_RESERVED2 0x04000000UL #define FW_PORT_CAP32_FEC_RESERVED3 0x08000000UL #define FW_PORT_CAP32_FORCE_PAUSE 0x10000000UL -#define FW_PORT_CAP32_RESERVED2 0xe0000000UL +#define FW_PORT_CAP32_FORCE_FEC 0x20000000UL +#define FW_PORT_CAP32_RESERVED2 0xc0000000UL #define S_FW_PORT_CAP32_SPEED 0 #define M_FW_PORT_CAP32_SPEED 0xfff @@ -7254,7 +7259,7 @@ enum fw_port_mdi32 { (((x) >> S_FW_PORT_CAP32_MDI) & M_FW_PORT_CAP32_MDI) #define S_FW_PORT_CAP32_FEC 23 -#define M_FW_PORT_CAP32_FEC 0x1f +#define M_FW_PORT_CAP32_FEC 0x5f #define V_FW_PORT_CAP32_FEC(x) ((x) << S_FW_PORT_CAP32_FEC) #define G_FW_PORT_CAP32_FEC(x) \ (((x) >> S_FW_PORT_CAP32_FEC) & M_FW_PORT_CAP32_FEC) @@ -7269,6 +7274,15 @@ enum fw_port_mdi32 { #define CAP32_FC(__cap32) \ (V_FW_PORT_CAP32_FC(M_FW_PORT_CAP32_FC) & __cap32) +#ifdef _KERNEL +static inline boolean_t +fec_supported(uint32_t caps) +{ + return ((caps & (FW_PORT_CAP32_SPEED_25G | FW_PORT_CAP32_SPEED_50G | + FW_PORT_CAP32_SPEED_100G)) != 0); +} +#endif + enum fw_port_action { FW_PORT_ACTION_L1_CFG = 0x0001, FW_PORT_ACTION_L2_CFG = 0x0002, diff --git a/usr/src/uts/common/io/cxgbe/shared/shared.c b/usr/src/uts/common/io/cxgbe/shared/shared.c index 07dd78f189..e86272134a 100644 --- a/usr/src/uts/common/io/cxgbe/shared/shared.c +++ b/usr/src/uts/common/io/cxgbe/shared/shared.c @@ -32,17 +32,19 @@ static int rxbuf_ctor(void *, void *, int); static void rxbuf_dtor(void *, void *); -void +int cxgb_printf(dev_info_t *dip, int level, char *f, ...) 
{ va_list list; char fmt[128]; + int rv; - (void) snprintf(fmt, sizeof (fmt), "%s%d: %s", ddi_driver_name(dip), + rv = snprintf(fmt, sizeof (fmt), "%s%d: %s", ddi_driver_name(dip), ddi_get_instance(dip), f); va_start(list, f); vcmn_err(level, fmt, list); va_end(list); + return (rv); } kmem_cache_t * diff --git a/usr/src/uts/common/io/cxgbe/shared/shared.h b/usr/src/uts/common/io/cxgbe/shared/shared.h index 5838416838..d3171c224b 100644 --- a/usr/src/uts/common/io/cxgbe/shared/shared.h +++ b/usr/src/uts/common/io/cxgbe/shared/shared.h @@ -66,7 +66,7 @@ struct rxbuf_cache_params { size_t buf_size; }; -void cxgb_printf(dev_info_t *dip, int level, char *f, ...); +int cxgb_printf(dev_info_t *dip, int level, char *f, ...); kmem_cache_t *rxbuf_cache_create(struct rxbuf_cache_params *p); void rxbuf_cache_destroy(kmem_cache_t *cache); struct rxbuf *rxbuf_alloc(kmem_cache_t *cache, int kmflags, uint_t ref_cnt); diff --git a/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h b/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h index cb21451e5c..e86de21085 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h +++ b/usr/src/uts/common/io/cxgbe/t4nex/cudbg.h @@ -318,7 +318,7 @@ static struct el ATTRIBUTE_UNUSED entity_list[] = { }; #ifdef _KERNEL -typedef int (*cudbg_print_cb) (dev_info_t *dip, ...); +typedef int (*cudbg_print_cb) (dev_info_t *dip, int, char *, ...); #else typedef int (*cudbg_print_cb) (char *, ...); #endif diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c index ee28c8a2ba..85d79e6201 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c +++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_ioctl.c @@ -1706,7 +1706,7 @@ get_cudbg(struct adapter *sc, void *data, int flags) cudbg = cudbg_get_init(handle); cudbg->adap = sc; - cudbg->print = (cudbg_print_cb)(uintptr_t)cxgb_printf; + cudbg->print = cxgb_printf; memcpy(cudbg->dbg_bitmap, dump.bitmap, sizeof(cudbg->dbg_bitmap)); diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c index 59c0ddde8d..9b4ffd8325 100644 --- a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c +++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c @@ -20,6 +20,10 @@ * release for licensing terms and conditions. */ +/* + * Copyright 2020 RackTop Systems, Inc. 
+ */ + #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/dlpi.h> @@ -930,6 +934,62 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data) return (status); } +static link_fec_t +fec_to_link_fec(cc_fec_t cc_fec) +{ + link_fec_t link_fec = 0; + + if ((cc_fec & (FEC_RS | FEC_BASER_RS)) == (FEC_RS | FEC_BASER_RS)) + return (LINK_FEC_AUTO); + + if ((cc_fec & FEC_NONE) != 0) + link_fec |= LINK_FEC_NONE; + + if ((cc_fec & FEC_AUTO) != 0) + link_fec |= LINK_FEC_AUTO; + + if ((cc_fec & FEC_RS) != 0) + link_fec |= LINK_FEC_RS; + + if ((cc_fec & FEC_BASER_RS) != 0) + link_fec |= LINK_FEC_BASE_R; + + return (link_fec); +} + +static int +link_fec_to_fec(int v) +{ + int fec = 0; + + if ((v & LINK_FEC_AUTO) != 0) { + fec = FEC_AUTO; + v &= ~LINK_FEC_AUTO; + } else { + if ((v & LINK_FEC_NONE) != 0) { + fec = FEC_NONE; + v &= ~LINK_FEC_NONE; + } + + if ((v & LINK_FEC_RS) != 0) { + fec |= FEC_RS; + v &= ~LINK_FEC_RS; + } + + if ((v & LINK_FEC_BASE_R) != 0) { + fec |= FEC_BASER_RS; + v &= ~LINK_FEC_BASE_R; + } + } + + if (v != 0) + return (-1); + + ASSERT3S(fec, !=, 0); + + return (fec); +} + /* ARGSUSED */ static int t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, @@ -941,7 +1001,9 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, uint8_t v8 = *(uint8_t *)val; uint32_t v32 = *(uint32_t *)val; int old, new = 0, relink = 0, rx_mode = 0, rc = 0; + boolean_t down_link = B_TRUE; link_flowctrl_t fc; + link_fec_t fec; /* * Save a copy of link_config. This can be used to restore link_config @@ -1009,6 +1071,30 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, } break; + case MAC_PROP_EN_FEC_CAP: + if (!fec_supported(lc->pcaps)) { + rc = ENOTSUP; + break; + } + + fec = *(link_fec_t *)val; + new = link_fec_to_fec(fec); + if (new < 0) { + rc = EINVAL; + } else if (new != lc->requested_fec) { + lc->requested_fec = new; + relink = 1; + /* + * For fec, do not preemptively force the link + * down. If changing fec causes the link state + * to transition, then appropriate asynchronous + * events are generated which correctly reflect + * the link state. 
+ */ + down_link = B_FALSE; + } + break; + case MAC_PROP_EN_10GFDX_CAP: if (lc->pcaps & FW_PORT_CAP32_ANEG && is_10G_port(pi)) { old = lc->acaps & FW_PORT_CAP32_SPEED_10G; @@ -1062,7 +1148,8 @@ t4_mc_setprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, if (isset(&sc->open_device_map, pi->port_id) != 0) { if (relink != 0) { - t4_os_link_changed(pi->adapter, pi->port_id, 0); + if (down_link) + t4_os_link_changed(pi->adapter, pi->port_id, 0); rc = begin_synchronized_op(pi, 1, 1); if (rc != 0) return (rc); @@ -1143,6 +1230,20 @@ t4_mc_getprop(void *arg, const char *name, mac_prop_id_t id, uint_t size, *(link_flowctrl_t *)val = LINK_FLOWCTRL_NONE; break; + case MAC_PROP_ADV_FEC_CAP: + if (!fec_supported(lc->pcaps)) + return (ENOTSUP); + + *(link_fec_t *)val = fec_to_link_fec(lc->fec); + break; + + case MAC_PROP_EN_FEC_CAP: + if (!fec_supported(lc->pcaps)) + return (ENOTSUP); + + *(link_fec_t *)val = fec_to_link_fec(lc->requested_fec); + break; + case MAC_PROP_ADV_100GFDX_CAP: case MAC_PROP_EN_100GFDX_CAP: *u = !!(lc->acaps & FW_PORT_CAP32_SPEED_100G); @@ -1212,6 +1313,15 @@ t4_mc_propinfo(void *arg, const char *name, mac_prop_id_t id, mac_prop_info_set_default_link_flowctrl(ph, LINK_FLOWCTRL_BI); break; + case MAC_PROP_EN_FEC_CAP: + mac_prop_info_set_default_fec(ph, LINK_FEC_AUTO); + break; + + case MAC_PROP_ADV_FEC_CAP: + mac_prop_info_set_perm(ph, MAC_PROP_PERM_READ); + mac_prop_info_set_default_fec(ph, LINK_FEC_AUTO); + break; + case MAC_PROP_EN_10GFDX_CAP: if (lc->pcaps & FW_PORT_CAP32_ANEG && lc->pcaps & FW_PORT_CAP32_SPEED_10G) diff --git a/usr/src/uts/common/io/e1000g/e1000g_alloc.c b/usr/src/uts/common/io/e1000g/e1000g_alloc.c index c7496cd164..8a460fd45a 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_alloc.c +++ b/usr/src/uts/common/io/e1000g/e1000g_alloc.c @@ -830,7 +830,7 @@ e1000g_free_dvma_buffer(dma_buffer_t *buf) return; } - buf->dma_address = NULL; + buf->dma_address = 0; if (buf->address != NULL) { kmem_free(buf->address, buf->size); diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index d698862d81..4ce359f87b 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. * Copyright 2015 Garrett D'Amore <garrett@damore.org> + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -3341,6 +3342,10 @@ mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range) case MAC_PROP_FLOWCTRL: minsize = sizeof (link_flowctrl_t); break; + case MAC_PROP_ADV_FEC_CAP: + case MAC_PROP_EN_FEC_CAP: + minsize = sizeof (link_fec_t); + break; case MAC_PROP_ADV_5000FDX_CAP: case MAC_PROP_EN_5000FDX_CAP: case MAC_PROP_ADV_2500FDX_CAP: @@ -3529,6 +3534,28 @@ mac_set_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val, break; } + case MAC_PROP_ADV_FEC_CAP: + case MAC_PROP_EN_FEC_CAP: { + link_fec_t fec; + + ASSERT(valsize >= sizeof (link_fec_t)); + + /* + * fec cannot be zero, and auto must be set exclusively. 
*/ + bcopy(val, &fec, sizeof (link_fec_t)); + if (fec == 0) + return (EINVAL); + if ((fec & LINK_FEC_AUTO) != 0 && (fec & ~LINK_FEC_AUTO) != 0) + return (EINVAL); + + if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) { + err = mip->mi_callbacks->mc_setprop(mip->mi_driver, + name, id, valsize, val); + } + break; + } + default: /* For other driver properties, call driver's callback */ if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) { @@ -4741,7 +4768,7 @@ mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp) * The bridge may place this mblk on a provider's Tx * path, a mac's Rx path, or both. Since we don't have * enough information at this point, we can't be sure - * that the desination(s) are capable of handling the + * that the destination(s) are capable of handling the * hardware offloads requested by the mblk. We emulate * them here as it is the safest choice. In the * future, if bridge performance becomes a priority, diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index dcfb4803d6..b166e7987a 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -4243,7 +4243,7 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, mpip->mpi_type == MAC_CLIENT_PROMISC_ALL || is_mcast) { mac_promisc_dispatch_one(mpip, mp, is_sender, - local); + local); } } } @@ -4274,7 +4274,7 @@ mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain) if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED && !is_mcast) { mac_promisc_dispatch_one(mpip, mp, B_FALSE, - B_FALSE); + B_FALSE); } } } @@ -4352,12 +4352,27 @@ i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) { return (B_TRUE); - else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) - return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); - else + } else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) { + boolean_t res; + + res = mip->mi_getcapab(mip->mi_driver, cap, cap_data); + /* + * Until we have support for TSOv6 emulation in the MAC + * loopback path, do not allow the TSOv6 capability to be + * advertised to consumers. + */ + if (res && cap == MAC_CAPAB_LSO) { + mac_capab_lso_t *cap_lso = cap_data; + + cap_lso->lso_flags &= ~LSO_TX_BASIC_TCP_IPV6; + cap_lso->lso_basic_tcp_ipv6.lso_max = 0; + } + return (res); + } else { + return (B_FALSE); + } } /* diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index 7f193f68eb..bcca602589 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -23,6 +23,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2020 RackTop Systems, Inc.
*/ #include <sys/types.h> @@ -1530,6 +1531,22 @@ mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph, } void +mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val) +{ + mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; + + /* nothing to do if the caller doesn't want the default value */ + if (pr->pr_default == NULL) + return; + + ASSERT(pr->pr_default_size >= sizeof (link_fec_t)); + + bcopy(&val, pr->pr_default, sizeof (val)); + + pr->pr_flags |= MAC_PROP_INFO_DEFAULT; +} + +void mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min, uint32_t max) { diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index 94ec8add16..8f983e50e4 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -4443,9 +4443,9 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, mac_hw_emul(&mp, NULL, NULL, MAC_ALL_EMULS); if (mp != NULL) { (dst_flow_ent->fe_cb_fn)( - dst_flow_ent->fe_cb_arg1, - dst_flow_ent->fe_cb_arg2, - mp, do_switch); + dst_flow_ent->fe_cb_arg1, + dst_flow_ent->fe_cb_arg2, + mp, do_switch); } } diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index 6e33fb7f56..03da3a3504 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -258,7 +258,7 @@ bail: static boolean_t mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err) { - ip6_t* ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset); + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset); const uint8_t proto = ip6h->ip6_nxt; const uint16_t *iphs = (uint16_t *)ip6h; /* ULP offset from start of L2. */ diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c index c90fa0969b..2aefac33db 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx.c @@ -273,11 +273,16 @@ * before making a WQE for it. * * After a completion event occurs, the packet is either discarded (and the - * buffer_t returned to the free list), or it is readied for loaning to MAC. + * buffer_t returned to the free list), or it is readied for loaning to MAC + * and placed on the "loaned" list in the mlxcx_buffer_shard_t. * * Once MAC and the rest of the system have finished with the packet, they call - * freemsg() on its mblk, which will call mlxcx_buf_mp_return and return the - * buffer_t to the free list. + * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point + * the fate of the buffer_t is determined by the state of the + * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t + * will be returned to the free list, potentially to be recycled and used + * again. But if the shard is draining (E.g. after a ring stop) there will be + * no recycling and the buffer_t is immediately destroyed. * * At detach/teardown time, buffers are only every destroyed from the free list. * @@ -289,18 +294,18 @@ * v * +----+----+ * | created | - * +----+----+ - * | - * | - * | mlxcx_buf_return - * | - * v - * mlxcx_buf_destroy +----+----+ - * +---------| free |<---------------+ - * | +----+----+ | + * +----+----+ +------+ + * | | dead | + * | +------+ + * | mlxcx_buf_return ^ + * | | + * v | mlxcx_buf_destroy + * mlxcx_buf_destroy +----+----+ +-----------+ | + * +---------| free |<------no-| draining? 
|-yes-+ + * | +----+----+ +-----------+ + * | | ^ * | | | - * | | | mlxcx_buf_return - * v | mlxcx_buf_take | + * v | mlxcx_buf_take | mlxcx_buf_return * +---+--+ v | * | dead | +---+---+ | * +------+ | on WQ |- - - - - - - - >O @@ -759,13 +764,19 @@ mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s) mlxcx_buffer_t *buf; mutex_enter(&s->mlbs_mtx); + while (!list_is_empty(&s->mlbs_busy)) cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); - while ((buf = list_head(&s->mlbs_free)) != NULL) { + + while (!list_is_empty(&s->mlbs_loaned)) + cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); + + while ((buf = list_head(&s->mlbs_free)) != NULL) mlxcx_buf_destroy(mlxp, buf); - } + list_destroy(&s->mlbs_free); list_destroy(&s->mlbs_busy); + list_destroy(&s->mlbs_loaned); mutex_exit(&s->mlbs_mtx); cv_destroy(&s->mlbs_free_nonempty); @@ -1336,6 +1347,8 @@ mlxcx_mlbs_create(mlxcx_t *mlxp) offsetof(mlxcx_buffer_t, mlb_entry)); list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t), offsetof(mlxcx_buffer_t, mlb_entry)); + list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t), + offsetof(mlxcx_buffer_t, mlb_entry)); cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL); list_insert_tail(&mlxp->mlx_buf_shards, s); @@ -1743,6 +1756,11 @@ mlxcx_setup_ports(mlxcx_t *mlxp) mutex_exit(&p->mlp_mtx); goto err; } + if (!mlxcx_cmd_query_port_fec(mlxp, p)) { + mutex_exit(&p->mlp_mtx); + goto err; + } + p->mlp_fec_requested = LINK_FEC_AUTO; mutex_exit(&p->mlp_mtx); } diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index da048b4ac3..06277d033c 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -346,6 +346,8 @@ typedef struct mlxcx_port { mlxcx_eth_proto_t mlp_max_proto; mlxcx_eth_proto_t mlp_admin_proto; mlxcx_eth_proto_t mlp_oper_proto; + mlxcx_pplm_fec_active_t mlp_fec_active; + link_fec_t mlp_fec_requested; mlxcx_eth_inline_mode_t mlp_wqe_min_inline; @@ -424,11 +426,18 @@ typedef enum { MLXCX_BUFFER_ON_CHAIN, } mlxcx_buffer_state_t; +typedef enum { + MLXCX_SHARD_READY, + MLXCX_SHARD_DRAINING, +} mlxcx_shard_state_t; + typedef struct mlxcx_buf_shard { + mlxcx_shard_state_t mlbs_state; list_node_t mlbs_entry; kmutex_t mlbs_mtx; list_t mlbs_busy; list_t mlbs_free; + list_t mlbs_loaned; kcondvar_t mlbs_free_nonempty; } mlxcx_buf_shard_t; @@ -1171,6 +1180,8 @@ extern boolean_t mlxcx_buf_loan(mlxcx_t *, mlxcx_buffer_t *); extern void mlxcx_buf_return(mlxcx_t *, mlxcx_buffer_t *); extern void mlxcx_buf_return_chain(mlxcx_t *, mlxcx_buffer_t *, boolean_t); extern void mlxcx_buf_destroy(mlxcx_t *, mlxcx_buffer_t *); +extern void mlxcx_shard_ready(mlxcx_buf_shard_t *); +extern void mlxcx_shard_draining(mlxcx_buf_shard_t *); extern uint_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *, mblk_t *, size_t, mlxcx_buffer_t **); @@ -1311,7 +1322,12 @@ extern boolean_t mlxcx_cmd_access_register(mlxcx_t *, mlxcx_cmd_reg_opmod_t, mlxcx_register_id_t, mlxcx_register_data_t *); extern boolean_t mlxcx_cmd_query_port_mtu(mlxcx_t *, mlxcx_port_t *); extern boolean_t mlxcx_cmd_query_port_status(mlxcx_t *, mlxcx_port_t *); +extern boolean_t mlxcx_cmd_modify_port_status(mlxcx_t *, mlxcx_port_t *, + mlxcx_port_status_t); extern boolean_t mlxcx_cmd_query_port_speed(mlxcx_t *, mlxcx_port_t *); +extern boolean_t mlxcx_cmd_query_port_fec(mlxcx_t *, mlxcx_port_t *); +extern boolean_t mlxcx_cmd_modify_port_fec(mlxcx_t *, mlxcx_port_t *, + mlxcx_pplm_fec_caps_t); extern boolean_t mlxcx_cmd_set_port_mtu(mlxcx_t *, mlxcx_port_t *); diff --git 
a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c index 30fb7ca8ef..f059b856a6 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -1594,6 +1595,8 @@ mlxcx_reg_name(mlxcx_register_id_t rid) return ("MCIA"); case MLXCX_REG_PPCNT: return ("PPCNT"); + case MLXCX_REG_PPLM: + return ("PPLM"); default: return ("???"); } @@ -1640,6 +1643,9 @@ mlxcx_cmd_access_register(mlxcx_t *mlxp, mlxcx_cmd_reg_opmod_t opmod, case MLXCX_REG_PPCNT: dsize = sizeof (mlxcx_reg_ppcnt_t); break; + case MLXCX_REG_PPLM: + dsize = sizeof (mlxcx_reg_pplm_t); + break; default: dsize = 0; VERIFY(0); @@ -1776,6 +1782,25 @@ mlxcx_cmd_query_port_status(mlxcx_t *mlxp, mlxcx_port_t *mlp) } boolean_t +mlxcx_cmd_modify_port_status(mlxcx_t *mlxp, mlxcx_port_t *mlp, + mlxcx_port_status_t status) +{ + mlxcx_register_data_t data; + boolean_t ret; + + ASSERT(mutex_owned(&mlp->mlp_mtx)); + bzero(&data, sizeof (data)); + data.mlrd_paos.mlrd_paos_local_port = mlp->mlp_num + 1; + data.mlrd_paos.mlrd_paos_admin_status = status; + set_bit32(&data.mlrd_paos.mlrd_paos_flags, MLXCX_PAOS_ADMIN_ST_EN); + + ret = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_WRITE, + MLXCX_REG_PAOS, &data); + + return (ret); +} + +boolean_t mlxcx_cmd_query_port_speed(mlxcx_t *mlxp, mlxcx_port_t *mlp) { mlxcx_register_data_t data; @@ -1809,6 +1834,82 @@ mlxcx_cmd_query_port_speed(mlxcx_t *mlxp, mlxcx_port_t *mlp) } boolean_t +mlxcx_cmd_query_port_fec(mlxcx_t *mlxp, mlxcx_port_t *mlp) +{ + mlxcx_register_data_t data; + boolean_t ret; + + ASSERT(mutex_owned(&mlp->mlp_mtx)); + bzero(&data, sizeof (data)); + data.mlrd_pplm.mlrd_pplm_local_port = mlp->mlp_num + 1; + + ret = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ, + MLXCX_REG_PPLM, &data); + + if (ret) { + mlp->mlp_fec_active = + from_be24(data.mlrd_pplm.mlrd_pplm_fec_mode_active); + } + + return (ret); +} + +boolean_t +mlxcx_cmd_modify_port_fec(mlxcx_t *mlxp, mlxcx_port_t *mlp, + mlxcx_pplm_fec_caps_t fec) +{ + mlxcx_register_data_t data_in, data_out; + mlxcx_pplm_fec_caps_t caps; + mlxcx_reg_pplm_t *pplm_in, *pplm_out; + boolean_t ret; + + ASSERT(mutex_owned(&mlp->mlp_mtx)); + bzero(&data_in, sizeof (data_in)); + pplm_in = &data_in.mlrd_pplm; + pplm_in->mlrd_pplm_local_port = mlp->mlp_num + 1; + + ret = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ, + MLXCX_REG_PPLM, &data_in); + + if (!ret) + return (B_FALSE); + + bzero(&data_out, sizeof (data_out)); + pplm_out = &data_out.mlrd_pplm; + pplm_out->mlrd_pplm_local_port = mlp->mlp_num + 1; + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_56G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_56G, fec & caps); + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_100G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_100G, fec & caps); + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_50G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_50G, fec & caps); + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_25G); + set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_25G, fec & caps); + + caps = get_bits32(pplm_in->mlrd_pplm_fec_override_cap, + MLXCX_PPLM_CAP_10_40G); + 
set_bits32(&pplm_out->mlrd_pplm_fec_override_admin, + MLXCX_PPLM_CAP_10_40G, fec & caps); + + ret = mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_WRITE, + MLXCX_REG_PPLM, &data_out); + + return (ret); +} + +boolean_t mlxcx_cmd_modify_nic_vport_ctx(mlxcx_t *mlxp, mlxcx_port_t *mlp, mlxcx_modify_nic_vport_ctx_fields_t fields) { diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index a08cec3980..2521641a00 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -80,6 +80,53 @@ mlxcx_speed_to_bits(mlxcx_eth_proto_t v) } } +static link_fec_t +mlxcx_fec_to_link_fec(mlxcx_pplm_fec_active_t mlxcx_fec) +{ + if ((mlxcx_fec & MLXCX_PPLM_FEC_ACTIVE_NONE) != 0) + return (LINK_FEC_NONE); + + if ((mlxcx_fec & MLXCX_PPLM_FEC_ACTIVE_FIRECODE) != 0) + return (LINK_FEC_BASE_R); + + if ((mlxcx_fec & (MLXCX_PPLM_FEC_ACTIVE_RS528 | + MLXCX_PPLM_FEC_ACTIVE_RS271 | MLXCX_PPLM_FEC_ACTIVE_RS544 | + MLXCX_PPLM_FEC_ACTIVE_RS272)) != 0) + return (LINK_FEC_RS); + + return (LINK_FEC_NONE); +} + +static boolean_t +mlxcx_link_fec_cap(link_fec_t fec, mlxcx_pplm_fec_caps_t *pfecp) +{ + mlxcx_pplm_fec_caps_t pplm_fec = 0; + + if ((fec & LINK_FEC_AUTO) != 0) { + pplm_fec = MLXCX_PPLM_FEC_CAP_AUTO; + fec &= ~LINK_FEC_AUTO; + } else if ((fec & LINK_FEC_NONE) != 0) { + pplm_fec = MLXCX_PPLM_FEC_CAP_NONE; + fec &= ~LINK_FEC_NONE; + } else if ((fec & LINK_FEC_RS) != 0) { + pplm_fec |= MLXCX_PPLM_FEC_CAP_RS; + fec &= ~LINK_FEC_RS; + } else if ((fec & LINK_FEC_BASE_R) != 0) { + pplm_fec |= MLXCX_PPLM_FEC_CAP_FIRECODE; + fec &= ~LINK_FEC_BASE_R; + } + + /* + * Only one fec option is allowed. + */ + if (fec != 0) + return (B_FALSE); + + *pfecp = pplm_fec; + + return (B_TRUE); +} + static int mlxcx_mac_stat_rfc_2863(mlxcx_t *mlxp, mlxcx_port_t *port, uint_t stat, uint64_t *val) @@ -451,7 +498,8 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) return (NULL); } - if (sq->mlwq_state & MLXCX_WQ_TEARDOWN) { + if ((sq->mlwq_state & (MLXCX_WQ_TEARDOWN | MLXCX_WQ_STARTED)) != + MLXCX_WQ_STARTED) { mutex_exit(&sq->mlwq_mtx); mlxcx_buf_return_chain(mlxp, b, B_FALSE); return (NULL); @@ -725,8 +773,28 @@ mlxcx_mac_ring_stop(mac_ring_driver_t rh) mlxcx_buf_shard_t *s; mlxcx_buffer_t *buf; + /* + * To prevent deadlocks and sleeping whilst holding either the + * CQ mutex or WQ mutex, we split the stop processing into two + * parts. + * + * With the CQ and WQ mutexes held the appropriate WQ is stopped. + * The Q in the HCA is set to Reset state and flagged as no + * longer started. Atomic with changing this WQ state, the buffer + * shards are flagged as draining. + * + * Now, any requests for buffers and attempts to submit messages + * will fail, and once we're in this state it is safe to relinquish + * the CQ and WQ mutexes, allowing us to complete the ring stop + * by waiting for the buffer lists, with the exception of + * the loaned list, to drain. Buffers on the loaned list are + * not under our control; we will get them back when the mblk tied + * to the buffer is freed.
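mlxcx_cmd_modify_port_fec() above is a read-modify-write of the PPLM register: for each speed group it reads the FEC capability nibble, intersects it with the requested modes, and writes the result into the matching admin-override nibble, so modes a group cannot do are silently dropped for that group. A self-contained model of the bitdef_t accessor pattern (a sketch: plain uint32_t stands in for the driver's big-endian bits32_t, and the helpers are re-implemented here for illustration):

#include <stdint.h>

typedef struct {
	uint32_t bd_shift;
	uint32_t bd_mask;
} bitdef_t;

/* e.g. the 100G group occupies bits 12..15 of the override words */
static const bitdef_t PPLM_CAP_100G = { 12, 0x0000f000 };

static uint32_t
get_bits32(uint32_t reg, bitdef_t def)
{
	return ((reg & def.bd_mask) >> def.bd_shift);
}

static void
set_bits32(uint32_t *reg, bitdef_t def, uint32_t val)
{
	*reg = (*reg & ~def.bd_mask) | ((val << def.bd_shift) & def.bd_mask);
}

/* Intersect the requested FEC modes with one group's capabilities. */
static void
pplm_apply_group(uint32_t cap_word, uint32_t *admin_word, bitdef_t group,
    uint32_t fec_request)
{
	set_bits32(admin_word, group,
	    fec_request & get_bits32(cap_word, group));
}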
+ */ + + mutex_enter(&cq->mlcq_mtx); mutex_enter(&wq->mlwq_mtx); + if (wq->mlwq_state & MLXCX_WQ_STARTED) { if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ && !mlxcx_cmd_stop_rq(mlxp, wq)) { @@ -743,7 +811,15 @@ mlxcx_mac_ring_stop(mac_ring_driver_t rh) } ASSERT0(wq->mlwq_state & MLXCX_WQ_STARTED); + mlxcx_shard_draining(wq->mlwq_bufs); + if (wq->mlwq_foreign_bufs != NULL) + mlxcx_shard_draining(wq->mlwq_foreign_bufs); + + if (wq->mlwq_state & MLXCX_WQ_BUFFERS) { + mutex_exit(&wq->mlwq_mtx); + mutex_exit(&cq->mlcq_mtx); + /* Return any outstanding buffers to the free pool. */ while ((buf = list_remove_head(&cq->mlcq_buffers)) != NULL) { mlxcx_buf_return_chain(mlxp, buf, B_FALSE); @@ -775,12 +851,13 @@ mlxcx_mac_ring_stop(mac_ring_driver_t rh) mutex_exit(&s->mlbs_mtx); } + mutex_enter(&wq->mlwq_mtx); wq->mlwq_state &= ~MLXCX_WQ_BUFFERS; + mutex_exit(&wq->mlwq_mtx); + } else { + mutex_exit(&wq->mlwq_mtx); + mutex_exit(&cq->mlcq_mtx); } - ASSERT0(wq->mlwq_state & MLXCX_WQ_BUFFERS); - - mutex_exit(&wq->mlwq_mtx); - mutex_exit(&cq->mlcq_mtx); } static int @@ -1061,6 +1138,14 @@ mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, 1); break; + case MAC_PROP_ADV_FEC_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + mac_prop_info_set_default_fec(prh, LINK_FEC_AUTO); + break; + case MAC_PROP_EN_FEC_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + mac_prop_info_set_default_fec(prh, LINK_FEC_AUTO); + break; case MAC_PROP_ADV_100GFDX_CAP: case MAC_PROP_EN_100GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); @@ -1120,6 +1205,9 @@ mlxcx_mac_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, uint32_t new_mtu, new_hw_mtu, old_mtu; mlxcx_buf_shard_t *sh; boolean_t allocd = B_FALSE; + boolean_t relink = B_FALSE; + link_fec_t fec; + mlxcx_pplm_fec_caps_t cap_fec; mutex_enter(&port->mlp_mtx); @@ -1137,7 +1225,8 @@ mlxcx_mac_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, for (; sh != NULL; sh = list_next(&mlxp->mlx_buf_shards, sh)) { mutex_enter(&sh->mlbs_mtx); if (!list_is_empty(&sh->mlbs_free) || - !list_is_empty(&sh->mlbs_busy)) { + !list_is_empty(&sh->mlbs_busy) || + !list_is_empty(&sh->mlbs_loaned)) { allocd = B_TRUE; mutex_exit(&sh->mlbs_mtx); break; @@ -1167,11 +1256,57 @@ mlxcx_mac_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } break; + + case MAC_PROP_EN_FEC_CAP: + bcopy(pr_val, &fec, sizeof (fec)); + if (!mlxcx_link_fec_cap(fec, &cap_fec)) { + ret = EINVAL; + break; + } + + /* + * Don't change the FEC if it is already at the requested + * setting AND the port is up. + * When the port is down, always set the FEC and attempt + * to retrain the link. + */ + if (fec == port->mlp_fec_requested && + fec == mlxcx_fec_to_link_fec(port->mlp_fec_active) && + port->mlp_oper_status != MLXCX_PORT_STATUS_DOWN) + break; + + /* + * The most likely cause of this failing is an invalid + * or unsupported fec option. + */ + if (!mlxcx_cmd_modify_port_fec(mlxp, port, cap_fec)) { + ret = EINVAL; + break; + } + + port->mlp_fec_requested = fec; + + /* + * For FEC to become effective, the link needs to go back + * to training and negotiation state. This happens when + * the link transitions from down to up, so force a relink.
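The guard above makes the property write idempotent only while the link is up: the request is skipped when it matches both the administratively requested and the operationally active FEC, while a down link always takes the write-and-retrain path. The condition as a standalone predicate (a sketch with simplified types):

#include <stdbool.h>

typedef unsigned int link_fec_t;

static bool
fec_setprop_is_noop(link_fec_t req, link_fec_t requested_now,
    link_fec_t active_now, bool link_up)
{
	return (req == requested_now && req == active_now && link_up);
}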
+ */ + relink = B_TRUE; + break; + default: ret = ENOTSUP; break; } + if (relink) { + if (!mlxcx_cmd_modify_port_status(mlxp, port, + MLXCX_PORT_STATUS_DOWN) || + !mlxcx_cmd_modify_port_status(mlxp, port, + MLXCX_PORT_STATUS_UP)) { + ret = EIO; + } + } mutex_exit(&port->mlp_mtx); return (ret); @@ -1229,6 +1364,21 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, } *(uint8_t *)pr_val = port->mlp_autoneg; break; + case MAC_PROP_ADV_FEC_CAP: + if (pr_valsize < sizeof (link_fec_t)) { + ret = EOVERFLOW; + break; + } + *(link_fec_t *)pr_val = + mlxcx_fec_to_link_fec(port->mlp_fec_active); + break; + case MAC_PROP_EN_FEC_CAP: + if (pr_valsize < sizeof (link_fec_t)) { + ret = EOVERFLOW; + break; + } + *(link_fec_t *)pr_val = port->mlp_fec_requested; + break; case MAC_PROP_MTU: if (pr_valsize < sizeof (uint32_t)) { ret = EOVERFLOW; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c index 4dc4291b08..aed691897b 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c @@ -355,6 +355,7 @@ mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port) mutex_enter(&port->mlp_mtx); (void) mlxcx_cmd_query_port_status(mlxp, port); (void) mlxcx_cmd_query_port_speed(mlxp, port); + (void) mlxcx_cmd_query_port_fec(mlxp, port); switch (port->mlp_oper_status) { case MLXCX_PORT_STATUS_UP: diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index 6d09abea5c..abd717842d 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -2464,6 +2464,59 @@ typedef struct { } mlxcx_reg_ppcnt_t; typedef enum { + MLXCX_PPLM_FEC_CAP_AUTO = 0, + MLXCX_PPLM_FEC_CAP_NONE = (1 << 0), + MLXCX_PPLM_FEC_CAP_FIRECODE = (1 << 1), + MLXCX_PPLM_FEC_CAP_RS = (1 << 2), +} mlxcx_pplm_fec_caps_t; + +typedef enum { + MLXCX_PPLM_FEC_ACTIVE_NONE = (1 << 0), + MLXCX_PPLM_FEC_ACTIVE_FIRECODE = (1 << 1), + MLXCX_PPLM_FEC_ACTIVE_RS528 = (1 << 2), + MLXCX_PPLM_FEC_ACTIVE_RS271 = (1 << 3), + MLXCX_PPLM_FEC_ACTIVE_RS544 = (1 << 7), + MLXCX_PPLM_FEC_ACTIVE_RS272 = (1 << 9), +} mlxcx_pplm_fec_active_t; + +/* CSTYLED */ +#define MLXCX_PPLM_CAP_56G (bitdef_t){ 16, 0x000f0000 } +/* CSTYLED */ +#define MLXCX_PPLM_CAP_100G (bitdef_t){ 12, 0x0000f000 } +/* CSTYLED */ +#define MLXCX_PPLM_CAP_50G (bitdef_t){ 8, 0x00000f00 } +/* CSTYLED */ +#define MLXCX_PPLM_CAP_25G (bitdef_t){ 4, 0x000000f0 } +/* CSTYLED */ +#define MLXCX_PPLM_CAP_10_40G (bitdef_t){ 0, 0x0000000f } + +typedef struct { + uint8_t mlrd_pplm_rsvd; + uint8_t mlrd_pplm_local_port; + uint8_t mlrd_pplm_rsvd1[11]; + uint24be_t mlrd_pplm_fec_mode_active; + bits32_t mlrd_pplm_fec_override_cap; + bits32_t mlrd_pplm_fec_override_admin; + uint16be_t mlrd_pplm_fec_override_cap_400g_8x; + uint16be_t mlrd_pplm_fec_override_cap_200g_4x; + uint16be_t mlrd_pplm_fec_override_cap_100g_2x; + uint16be_t mlrd_pplm_fec_override_cap_50g_1x; + uint16be_t mlrd_pplm_fec_override_admin_400g_8x; + uint16be_t mlrd_pplm_fec_override_admin_200g_4x; + uint16be_t mlrd_pplm_fec_override_admin_100g_2x; + uint16be_t mlrd_pplm_fec_override_admin_50g_1x; + uint8_t mlrd_pplm_rsvd2[8]; + uint16be_t mlrd_pplm_fec_override_cap_hdr; + uint16be_t mlrd_pplm_fec_override_cap_edr; + uint16be_t mlrd_pplm_fec_override_cap_fdr; + uint16be_t mlrd_pplm_fec_override_cap_fdr10; + uint16be_t mlrd_pplm_fec_override_admin_hdr; + uint16be_t mlrd_pplm_fec_override_admin_edr; + uint16be_t mlrd_pplm_fec_override_admin_fdr; + uint16be_t 
mlrd_pplm_fec_override_admin_fdr10; +} mlxcx_reg_pplm_t; + +typedef enum { MLXCX_REG_PMTU = 0x5003, MLXCX_REG_PTYS = 0x5004, MLXCX_REG_PAOS = 0x5006, @@ -2472,6 +2525,7 @@ typedef enum { MLXCX_REG_MLCR = 0x902B, MLXCX_REG_MCIA = 0x9014, MLXCX_REG_PPCNT = 0x5008, + MLXCX_REG_PPLM = 0x5023, } mlxcx_register_id_t; typedef union { @@ -2482,6 +2536,7 @@ typedef union { mlxcx_reg_pmaos_t mlrd_pmaos; mlxcx_reg_mcia_t mlrd_mcia; mlxcx_reg_ppcnt_t mlrd_ppcnt; + mlxcx_reg_pplm_t mlrd_pplm; } mlxcx_register_data_t; typedef enum { diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 492f8fd8a5..da98a5cf40 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -1213,6 +1213,8 @@ mlxcx_rx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g, ASSERT0(rq->mlwq_state & MLXCX_WQ_BUFFERS); rq->mlwq_state |= MLXCX_WQ_BUFFERS; + mlxcx_shard_ready(rq->mlwq_bufs); + for (j = 0; j < rq->mlwq_nents; ++j) { if (!mlxcx_buf_create(mlxp, rq->mlwq_bufs, &b)) break; @@ -1409,6 +1411,9 @@ mlxcx_tx_ring_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g, } sq->mlwq_state |= MLXCX_WQ_BUFFERS; + mlxcx_shard_ready(sq->mlwq_bufs); + mlxcx_shard_ready(sq->mlwq_foreign_bufs); + if (!mlxcx_cmd_start_sq(mlxp, sq)) { mutex_exit(&sq->mlwq_mtx); mutex_exit(&cq->mlcq_mtx); @@ -1799,22 +1804,29 @@ mlxcx_rq_refill_task(void *arg) mlxcx_completion_queue_t *cq = wq->mlwq_cq; mlxcx_t *mlxp = wq->mlwq_mlx; mlxcx_buf_shard_t *s = wq->mlwq_bufs; - boolean_t refill; + boolean_t refill, draining; do { /* - * Wait until there are some free buffers. + * Wait here until one of 3 conditions: + * 1. The shard is draining, or + * 2. There are buffers on the free list, or + * 3. The WQ is being shut down. */ mutex_enter(&s->mlbs_mtx); - while (list_is_empty(&s->mlbs_free) && - (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) + while (s->mlbs_state != MLXCX_SHARD_DRAINING && + list_is_empty(&s->mlbs_free) && + (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) { cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); + } + + draining = (s->mlbs_state == MLXCX_SHARD_DRAINING); mutex_exit(&s->mlbs_mtx); mutex_enter(&cq->mlcq_mtx); mutex_enter(&wq->mlwq_mtx); - if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) { + if (draining || (cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) { refill = B_FALSE; wq->mlwq_state &= ~MLXCX_WQ_REFILLING; } else { @@ -1851,7 +1863,10 @@ mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) target = mlwq->mlwq_nents - MLXCX_RQ_REFILL_STEP; cq = mlwq->mlwq_cq; - if (cq->mlcq_state & MLXCX_CQ_TEARDOWN) + if ((mlwq->mlwq_state & MLXCX_WQ_STARTED) == 0) + return; + + if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) return; current = cq->mlcq_bufcnt; @@ -1883,7 +1898,7 @@ mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) return; } - if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) { + if ((mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) != 0) { for (i = 0; i < n; ++i) mlxcx_buf_return(mlxp, b[i]); return; @@ -2058,7 +2073,6 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, wqe_index = buf->mlb_wqe_index; if (!mlxcx_buf_loan(mlxp, buf)) { - mlxcx_warn(mlxp, "!loan failed, dropping packet"); mlxcx_buf_return(mlxp, buf); return (NULL); } @@ -2101,16 +2115,11 @@ mlxcx_buf_mp_return(caddr_t arg) mlxcx_buffer_t *b = (mlxcx_buffer_t *)arg; mlxcx_t *mlxp = b->mlb_mlx; - if (b->mlb_state != MLXCX_BUFFER_ON_LOAN) { - b->mlb_mp = NULL; - return; - } - /* - * The mblk for this buffer_t (in its mlb_mp field) has been used now, - * so NULL it out. 
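mlxcx_rq_refill_task() above now sleeps on one condition variable until any of three events occurs; because all three conditions are re-evaluated as the while test under the mutex, cv_broadcast() wake-ups and spurious returns are both harmless. A user-level model of the same discipline (a sketch: pthreads stand in for kmutex/kcondvar, and the field names are illustrative):

#include <pthread.h>
#include <stdbool.h>

typedef struct {
	pthread_mutex_t mtx;
	pthread_cond_t nonempty;
	bool draining;		/* shard is being torn down */
	bool teardown;		/* completion queue is going away */
	int nfree;		/* buffers on the free list */
} shard_t;

/* Returns true when refilling should proceed, false on shutdown. */
static bool
shard_wait_for_buffers(shard_t *s)
{
	bool proceed;

	pthread_mutex_lock(&s->mtx);
	while (!s->draining && !s->teardown && s->nfree == 0)
		pthread_cond_wait(&s->nonempty, &s->mtx);
	proceed = !s->draining && !s->teardown;
	pthread_mutex_unlock(&s->mtx);
	return (proceed);
}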
- */ + /* The mblk has been used now, so NULL it out. */ b->mlb_mp = NULL; - mlxcx_buf_return(mlxp, b); + + if (b->mlb_state == MLXCX_BUFFER_ON_LOAN) + mlxcx_buf_return(mlxp, b); } boolean_t @@ -2177,6 +2186,11 @@ mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq) mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs; mutex_enter(&s->mlbs_mtx); + if (s->mlbs_state != MLXCX_SHARD_READY) { + mutex_exit(&s->mlbs_mtx); + return (NULL); + } + if ((b = list_remove_head(&s->mlbs_free)) != NULL) { ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); ASSERT(b->mlb_foreign); @@ -2345,6 +2359,11 @@ mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq) mlxcx_buf_shard_t *s = wq->mlwq_bufs; mutex_enter(&s->mlbs_mtx); + if (s->mlbs_state != MLXCX_SHARD_READY) { + mutex_exit(&s->mlbs_mtx); + return (NULL); + } + if ((b = list_remove_head(&s->mlbs_free)) != NULL) { ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); b->mlb_state = MLXCX_BUFFER_ON_WQ; @@ -2366,6 +2385,11 @@ mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, s = wq->mlwq_bufs; mutex_enter(&s->mlbs_mtx); + if (s->mlbs_state != MLXCX_SHARD_READY) { + mutex_exit(&s->mlbs_mtx); + return (0); + } + while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) { ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); b->mlb_state = MLXCX_BUFFER_ON_WQ; @@ -2379,6 +2403,8 @@ mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, boolean_t mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b) { + mlxcx_buf_shard_t *s = b->mlb_shard; + VERIFY3U(b->mlb_state, ==, MLXCX_BUFFER_ON_WQ); ASSERT3P(b->mlb_mlx, ==, mlxp); @@ -2391,6 +2417,12 @@ mlxcx_buf_loan(mlxcx_t *mlxp, mlxcx_buffer_t *b) b->mlb_state = MLXCX_BUFFER_ON_LOAN; b->mlb_wqe_index = 0; + + mutex_enter(&s->mlbs_mtx); + list_remove(&s->mlbs_busy, b); + list_insert_tail(&s->mlbs_loaned, b); + mutex_exit(&s->mlbs_mtx); + return (B_TRUE); } @@ -2453,7 +2485,23 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) break; case MLXCX_BUFFER_ON_LOAN: ASSERT(!b->mlb_foreign); - list_remove(&s->mlbs_busy, b); + list_remove(&s->mlbs_loaned, b); + if (s->mlbs_state == MLXCX_SHARD_DRAINING) { + /* + * When we're draining, e.g. during mac_stop(), + * we destroy the buffer immediately rather than + * recycling it. Otherwise we risk leaving it + * on the free list and leaking it. + */ + list_insert_tail(&s->mlbs_free, b); + mlxcx_buf_destroy(mlxp, b); + /* + * Teardown might be waiting for loaned list to empty.
+ */ + cv_broadcast(&s->mlbs_free_nonempty); + mutex_exit(&s->mlbs_mtx); + return; + } break; case MLXCX_BUFFER_FREE: VERIFY(0); @@ -2466,7 +2514,7 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) } list_insert_tail(&s->mlbs_free, b); - cv_signal(&s->mlbs_free_nonempty); + cv_broadcast(&s->mlbs_free_nonempty); mutex_exit(&s->mlbs_mtx); @@ -2484,9 +2532,11 @@ void mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b) { mlxcx_buf_shard_t *s = b->mlb_shard; + VERIFY(b->mlb_state == MLXCX_BUFFER_FREE || b->mlb_state == MLXCX_BUFFER_INIT); ASSERT(mutex_owned(&s->mlbs_mtx)); + if (b->mlb_state == MLXCX_BUFFER_FREE) list_remove(&s->mlbs_free, b); @@ -2506,3 +2556,20 @@ mlxcx_buf_destroy(mlxcx_t *mlxp, mlxcx_buffer_t *b) kmem_cache_free(mlxp->mlx_bufs_cache, b); } + +void +mlxcx_shard_ready(mlxcx_buf_shard_t *s) +{ + mutex_enter(&s->mlbs_mtx); + s->mlbs_state = MLXCX_SHARD_READY; + mutex_exit(&s->mlbs_mtx); +} + +void +mlxcx_shard_draining(mlxcx_buf_shard_t *s) +{ + mutex_enter(&s->mlbs_mtx); + s->mlbs_state = MLXCX_SHARD_DRAINING; + cv_broadcast(&s->mlbs_free_nonempty); + mutex_exit(&s->mlbs_mtx); +} diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index 55fd87db45..288f77ae47 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -839,7 +839,7 @@ frnop_func(void *arg) */ static mblk_t * gesballoc(unsigned char *base, size_t size, uint32_t db_rtfu, frtn_t *frp, - void (*lastfree)(mblk_t *, dblk_t *), int kmflags) + void (*lastfree)(mblk_t *, dblk_t *), int kmflags) { dblk_t *dbp; mblk_t *mp; diff --git a/usr/src/uts/common/klm/nlm_impl.h b/usr/src/uts/common/klm/nlm_impl.h index 68604309a2..9caae1a8c7 100644 --- a/usr/src/uts/common/klm/nlm_impl.h +++ b/usr/src/uts/common/klm/nlm_impl.h @@ -28,7 +28,7 @@ */ /* - * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2016 Joyent, Inc. */ @@ -112,7 +112,7 @@ struct _kthread; * We pass three callback functions to nlm_do_lock: * nlm_reply_cb: send a normal RPC reply * nlm_res_cb: do a _res (message style) RPC (call) - * nlm_testargs_cb: do a "granted" RPC call (after blocking) + * nlm_granted_cb: do a "granted" RPC call (after blocking) * Only one of the 1st or 2nd is used. * The 3rd is used only for blocking * @@ -123,7 +123,7 @@ struct _kthread; */ typedef bool_t (*nlm_reply_cb)(SVCXPRT *, nlm4_res *); typedef enum clnt_stat (*nlm_res_cb)(nlm4_res *, void *, CLIENT *); -typedef enum clnt_stat (*nlm_testargs_cb)(nlm4_testargs *, void *, CLIENT *); +typedef enum clnt_stat (*nlm_granted_cb)(nlm4_testargs *, nlm4_res *, CLIENT *); typedef enum clnt_stat (*nlm_testres_cb)(nlm4_testres *, void *, CLIENT *); /* @@ -624,7 +624,7 @@ void nlm_do_notify2(nlm_sm_status *, void *, struct svc_req *); void nlm_do_test(nlm4_testargs *, nlm4_testres *, struct svc_req *, nlm_testres_cb); void nlm_do_lock(nlm4_lockargs *, nlm4_res *, struct svc_req *, - nlm_reply_cb, nlm_res_cb, nlm_testargs_cb); + nlm_reply_cb, nlm_res_cb, nlm_granted_cb); void nlm_do_cancel(nlm4_cancargs *, nlm4_res *, struct svc_req *, nlm_res_cb); void nlm_do_unlock(nlm4_unlockargs *, nlm4_res *, diff --git a/usr/src/uts/common/klm/nlm_rpc_handle.c b/usr/src/uts/common/klm/nlm_rpc_handle.c index 9ddf56856c..b022acc380 100644 --- a/usr/src/uts/common/klm/nlm_rpc_handle.c +++ b/usr/src/uts/common/klm/nlm_rpc_handle.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -130,6 +130,7 @@ update_host_rpcbinding(struct nlm_host *hostp, int vers) static int refresh_nlm_rpc(struct nlm_host *hostp, nlm_rpc_t *rpcp) { + uint32_t zero = 0; int ret; if (rpcp->nr_handle == NULL) { @@ -175,6 +176,12 @@ refresh_nlm_rpc(struct nlm_host *hostp, nlm_rpc_t *rpcp) if (NLM_STALE_CLNT(stat)) { ret = ESTALE; } + /* + * Need to reset the XID after the null call above, + * otherwise we'll reuse the XID from that call. + */ + (void) CLNT_CONTROL(rpcp->nr_handle, CLSET_XID, + (char *)&zero); } } @@ -209,7 +216,8 @@ again: rc = cv_wait_sig(&hostp->nh_rpcb_cv, &hostp->nh_lock); if (rc == 0) { mutex_exit(&hostp->nh_lock); - return (EINTR); + rc = EINTR; + goto errout; } } @@ -229,7 +237,8 @@ again: */ if (hostp->nh_rpcb_ustat != RPC_SUCCESS) { mutex_exit(&hostp->nh_lock); - return (ENOENT); + rc = ENOENT; + goto errout; } } @@ -263,7 +272,7 @@ again: } destroy_rpch(rpcp); - return (rc); + goto errout; } DTRACE_PROBE2(end, struct nlm_host *, hostp, @@ -271,6 +280,10 @@ again: *rpcpp = rpcp; return (0); + +errout: + NLM_ERR("Can't get RPC client handle for: %s", hostp->nh_name); + return (rc); } void diff --git a/usr/src/uts/common/klm/nlm_rpc_svc.c b/usr/src/uts/common/klm/nlm_rpc_svc.c index 2911b31877..1f04e3f036 100644 --- a/usr/src/uts/common/klm/nlm_rpc_svc.c +++ b/usr/src/uts/common/klm/nlm_rpc_svc.c @@ -26,7 +26,7 @@ */ /* - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -63,7 +63,7 @@ * 32-bit lock ranges. */ static void -nlm_convert_to_nlm_lock(struct nlm_lock *dst, struct nlm4_lock *src) +nlm_convert_to_nlm_lock(struct nlm_lock *dst, const struct nlm4_lock *src) { dst->caller_name = src->caller_name; dst->fh = src->fh; @@ -76,12 +76,22 @@ nlm_convert_to_nlm_lock(struct nlm_lock *dst, struct nlm4_lock *src) } /* + * Up-convert for v1 granted response + */ +static void +nlm_convert_to_nlm4_res(struct nlm4_res *dst, const struct nlm_res *src) +{ + dst->cookie = src->cookie; + dst->stat.stat = (nlm4_stats) src->stat.stat; +} + +/* * Up-convert for v1 svc functions with a 32-bit lock range arg. * Note that lock range checks (like overflow) are done later, * in nlm_init_flock(). */ static void -nlm_convert_to_nlm4_lock(struct nlm4_lock *dst, struct nlm_lock *src) +nlm_convert_to_nlm4_lock(struct nlm4_lock *dst, const struct nlm_lock *src) { dst->caller_name = src->caller_name; @@ -93,7 +103,7 @@ nlm_convert_to_nlm4_lock(struct nlm4_lock *dst, struct nlm_lock *src) } static void -nlm_convert_to_nlm4_share(struct nlm4_share *dst, struct nlm_share *src) +nlm_convert_to_nlm4_share(struct nlm4_share *dst, const struct nlm_share *src) { dst->caller_name = src->caller_name; @@ -113,7 +123,7 @@ nlm_convert_to_nlm4_share(struct nlm4_share *dst, struct nlm_share *src) * valid 32-bit lock range. 
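The const-correctness cleanups above all sit on the v1/v4 boundary: NLM v1 carries 32-bit lock ranges while NLM4 carries 64-bit ones, so up-conversion always succeeds and down-conversion is only performed once a range is known to fit. The asymmetry in miniature (a sketch with simplified types; the real converters also carry cookies, owner handles, and file handles):

#include <stdbool.h>
#include <stdint.h>

struct range_v1 { uint32_t offset; uint32_t len; };
struct range_v4 { uint64_t offset; uint64_t len; };

/* v1 -> v4: every 32-bit range is representable, cannot fail */
static void
range_up(struct range_v4 *dst, const struct range_v1 *src)
{
	dst->offset = src->offset;
	dst->len = src->len;
}

/* v4 -> v1: only valid when the range fits in 32 bits */
static bool
range_down(struct range_v1 *dst, const struct range_v4 *src)
{
	if (src->offset > UINT32_MAX || src->len > UINT32_MAX)
		return (false);
	dst->offset = (uint32_t)src->offset;
	dst->len = (uint32_t)src->len;
	return (true);
}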
*/ static void -nlm_convert_to_nlm_holder(struct nlm_holder *dst, struct nlm4_holder *src) +nlm_convert_to_nlm_holder(struct nlm_holder *dst, const struct nlm4_holder *src) { dst->exclusive = src->exclusive; dst->svid = src->svid; @@ -133,7 +143,7 @@ nlm_convert_to_nlm_stats(enum nlm4_stats src) } static void -nlm_convert_to_nlm_res(struct nlm_res *dst, struct nlm4_res *src) +nlm_convert_to_nlm_res(struct nlm_res *dst, const struct nlm4_res *src) { dst->cookie = src->cookie; dst->stat.stat = nlm_convert_to_nlm_stats(src->stat.stat); @@ -175,7 +185,7 @@ nlm_test_1_svc(struct nlm_testargs *argp, nlm_testres *resp, * Callback functions for nlm_lock_1_svc */ static bool_t nlm_lock_1_reply(SVCXPRT *, nlm4_res *); -static enum clnt_stat nlm_granted_1_cb(nlm4_testargs *, void *, CLIENT *); +static enum clnt_stat nlm_granted_1_cb(nlm4_testargs *, nlm4_res *, CLIENT *); bool_t nlm_lock_1_svc(nlm_lockargs *argp, nlm_res *resp, @@ -215,7 +225,7 @@ nlm_lock_1_reply(SVCXPRT *transp, nlm4_res *resp) } static enum clnt_stat -nlm_granted_1_cb(nlm4_testargs *argp, void *resp, CLIENT *clnt) +nlm_granted_1_cb(nlm4_testargs *argp, nlm4_res *resp, CLIENT *clnt) { nlm_testargs args1; nlm_res res1; @@ -229,9 +239,7 @@ nlm_granted_1_cb(nlm4_testargs *argp, void *resp, CLIENT *clnt) rv = nlm_granted_1(&args1, &res1, clnt); - /* NB: We have a result our caller will not free. */ - xdr_free((xdrproc_t)xdr_nlm_res, (void *)&res1); - (void) resp; + nlm_convert_to_nlm4_res(resp, &res1); return (rv); } @@ -355,7 +363,8 @@ nlm_test_res_1_cb(nlm4_testres *res4, void *null, CLIENT *clnt) * Callback functions for nlm_lock_msg_1_svc */ static enum clnt_stat nlm_lock_res_1_cb(nlm4_res *, void *, CLIENT *); -static enum clnt_stat nlm_granted_msg_1_cb(nlm4_testargs *, void *, CLIENT *); +static enum clnt_stat nlm_granted_msg_1_cb(nlm4_testargs *, nlm4_res *, + CLIENT *); bool_t nlm_lock_msg_1_svc(nlm_lockargs *argp, void *resp, @@ -396,16 +405,22 @@ nlm_lock_res_1_cb(nlm4_res *resp, void *null, CLIENT *clnt) } static enum clnt_stat -nlm_granted_msg_1_cb(nlm4_testargs *argp, void *null, CLIENT *clnt) +nlm_granted_msg_1_cb(nlm4_testargs *argp, nlm4_res *resp, CLIENT *clnt) { nlm_testargs args1; + int rv; args1.cookie = argp->cookie; args1.exclusive = argp->exclusive; nlm_convert_to_nlm_lock(&args1.alock, &argp->alock); - return (nlm_granted_msg_1(&args1, null, clnt)); + rv = nlm_granted_msg_1(&args1, NULL, clnt); + + /* MSG call doesn't fill in *resp, so do it here. */ + if (rv != RPC_SUCCESS) + resp->stat.stat = nlm4_failed; + return (rv); } @@ -693,7 +708,6 @@ nlm4_test_4_svc(nlm4_testargs *argp, nlm4_testres *resp, struct svc_req *sr) * Callback functions for nlm4_lock_4_svc */ static bool_t nlm4_lock_4_reply(SVCXPRT *, nlm4_res *); -static enum clnt_stat nlm4_granted_4_cb(nlm4_testargs *, void *, CLIENT *); bool_t nlm4_lock_4_svc(nlm4_lockargs *argp, nlm4_res *resp, @@ -703,7 +717,7 @@ nlm4_lock_4_svc(nlm4_lockargs *argp, nlm4_res *resp, /* NLM4_LOCK */ nlm_do_lock(argp, resp, sr, nlm4_lock_4_reply, NULL, - nlm4_granted_4_cb); + nlm4_granted_4); /* above does its own reply */ return (FALSE); @@ -715,22 +729,6 @@ nlm4_lock_4_reply(SVCXPRT *transp, nlm4_res *resp) return (svc_sendreply(transp, xdr_nlm4_res, (char *)resp)); } -static enum clnt_stat -nlm4_granted_4_cb(nlm4_testargs *argp, void *resp, CLIENT *clnt) -{ - nlm4_res res4; - int rv; - - bzero(&res4, sizeof (res4)); - rv = nlm4_granted_4(argp, &res4, clnt); - - /* NB: We have a result our caller will not free. 
*/ - xdr_free((xdrproc_t)xdr_nlm4_res, (void *)&res4); - (void) resp; - - return (rv); -} - bool_t nlm4_cancel_4_svc(nlm4_cancargs *argp, nlm4_res *resp, struct svc_req *sr) { @@ -773,6 +771,8 @@ nlm4_test_msg_4_svc(nlm4_testargs *argp, void *resp, struct svc_req *sr) * Callback functions for nlm4_lock_msg_4_svc * (using the RPC client stubs directly) */ +static enum clnt_stat nlm4_granted_msg_4_cb(nlm4_testargs *, nlm4_res *, + CLIENT *); bool_t nlm4_lock_msg_4_svc(nlm4_lockargs *argp, void *resp, @@ -784,7 +784,7 @@ nlm4_lock_msg_4_svc(nlm4_lockargs *argp, void *resp, bzero(&res4, sizeof (res4)); nlm_do_lock(argp, &res4, sr, NULL, nlm4_lock_res_4, - nlm4_granted_msg_4); + nlm4_granted_msg_4_cb); /* NB: We have a result our caller will not free. */ xdr_free((xdrproc_t)xdr_nlm4_res, (void *)&res4); @@ -794,6 +794,20 @@ nlm4_lock_msg_4_svc(nlm4_lockargs *argp, void *resp, return (FALSE); } +static enum clnt_stat +nlm4_granted_msg_4_cb(nlm4_testargs *argp, nlm4_res *resp, CLIENT *clnt) +{ + int rv; + + rv = nlm4_granted_msg_4(argp, NULL, clnt); + + /* MSG call doesn't fill in *resp, so do it here. */ + if (rv != RPC_SUCCESS) + resp->stat.stat = nlm4_failed; + + return (rv); +} + bool_t nlm4_cancel_msg_4_svc(nlm4_cancargs *argp, void *resp, struct svc_req *sr) { diff --git a/usr/src/uts/common/klm/nlm_service.c b/usr/src/uts/common/klm/nlm_service.c index dceabaf53f..f4f733443e 100644 --- a/usr/src/uts/common/klm/nlm_service.c +++ b/usr/src/uts/common/klm/nlm_service.c @@ -27,7 +27,7 @@ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. * Copyright 2014 Joyent, Inc. All rights reserved. */ @@ -81,6 +81,7 @@ struct nlm_block_cb_data { struct nlm_host *hostp; struct nlm_vhold *nvp; struct flock64 *flp; + bool_t registered; }; /* @@ -107,9 +108,9 @@ static void nlm_block( nlm4_lockargs *lockargs, struct nlm_host *host, struct nlm_vhold *nvp, - nlm_rpc_t *rpcp, struct flock64 *fl, - nlm_testargs_cb grant_cb); + nlm_granted_cb grant_cb, + rpcvers_t); static vnode_t *nlm_fh_to_vp(struct netobj *); static struct nlm_vhold *nlm_fh_to_vhold(struct nlm_host *, struct netobj *); @@ -314,6 +315,11 @@ nlm_do_notify2(nlm_sm_status *argp, void *res, struct svc_req *sr) * NLM_TEST, NLM_TEST_MSG, * NLM4_TEST, NLM4_TEST_MSG, * Client inquiry about locks, non-blocking. + * + * Arg cb is NULL for NLM_TEST, NLM4_TEST, and + * non-NULL for NLM_TEST_MSG, NLM4_TEST_MSG + * The MSG forms use the cb to send the reply, + * and don't return a reply for this call. */ void nlm_do_test(nlm4_testargs *argp, nlm4_testres *resp, @@ -455,10 +461,19 @@ out: * We also have to keep a list of locks (pending + granted) * both to handle retransmitted requests, and to keep the * vnodes for those locks active. + * + * Callback arguments: + * reply_cb Used to send a normal RPC reply just as if + * we had filled in a response for our caller. + * Needed because we do work after the reply. + * res_cb Used for the MSG calls, where there's no + * regular RPC response. + * grant_cb Used to CALL the client informing them of a + * granted lock after a "blocked" reply. 
*/ void nlm_do_lock(nlm4_lockargs *argp, nlm4_res *resp, struct svc_req *sr, - nlm_reply_cb reply_cb, nlm_res_cb res_cb, nlm_testargs_cb grant_cb) + nlm_reply_cb reply_cb, nlm_res_cb res_cb, nlm_granted_cb grant_cb) { struct nlm_globals *g; struct flock64 fl; @@ -492,20 +507,18 @@ nlm_do_lock(nlm4_lockargs *argp, nlm4_res *resp, struct svc_req *sr, struct nlm_host *, host, nlm4_lockargs *, argp); /* - * If we may need to do _msg_ call needing an RPC - * callback, get the RPC client handle now, - * so we know if we can bind to the NLM service on - * this client. - * - * Note: host object carries transport type. - * One client using multiple transports gets - * separate sysids for each of its transports. + * If this is a MSG call (NLM_LOCK_MSG, NLM4_LOCK_MSG) + * we'll have res_cb != NULL, and we know we'll need an + * RPC client handle _now_ so we can send the response. + * If we can't get an rpc handle (rpcp) then we have + * no way to respond, and the client will time out. */ - if (res_cb != NULL || (grant_cb != NULL && argp->block == TRUE)) { + if (res_cb != NULL) { error = nlm_host_get_rpc(host, sr->rq_vers, &rpcp); if (error != 0) { + ASSERT(rpcp == NULL); status = nlm4_denied_nolocks; - goto doreply; + goto out; } } @@ -584,6 +597,8 @@ nlm_do_lock(nlm4_lockargs *argp, nlm4_res *resp, struct svc_req *sr, /* * OK, can detach this thread, so this call * will block below (after we reply). + * The "blocked" reply tells the client to + * expect a "granted" call-back later. */ status = nlm4_blocked; do_blocking = TRUE; @@ -655,11 +670,12 @@ doreply: * "detach" it from the RPC SVC pool, allowing it * to block indefinitely if needed. */ - ASSERT(rpcp != NULL); + ASSERT(grant_cb != NULL); (void) svc_detach_thread(sr->rq_xprt); - nlm_block(argp, host, nvp, rpcp, &fl, grant_cb); + nlm_block(argp, host, nvp, &fl, grant_cb, sr->rq_vers); } +out: DTRACE_PROBE3(lock__end, struct nlm_globals *, g, struct nlm_host *, host, nlm4_res *, resp); @@ -679,25 +695,26 @@ static void nlm_block(nlm4_lockargs *lockargs, struct nlm_host *host, struct nlm_vhold *nvp, - nlm_rpc_t *rpcp, struct flock64 *flp, - nlm_testargs_cb grant_cb) + nlm_granted_cb grant_cb, + rpcvers_t vers) { nlm4_testargs args; + nlm4_res res; int error; flk_callback_t flk_cb; struct nlm_block_cb_data cb_data; + nlm_rpc_t *rpcp = NULL; + enum clnt_stat status; /* * Keep a list of blocked locks on nh_pending, and use it * to cancel these threads in nlm_destroy_client_pending. * - * Check to see if this lock is already in the list - * and if not, add an entry for it. Allocate first, - * then if we don't insert, free the new one. - * Caller already has vp held. + * Check to see if this lock is already in the list. If so, + * some earlier call is already blocked getting this lock, + * so there's nothing more this call needs to do. */ - error = nlm_slreq_register(host, nvp, flp); if (error != 0) { /* @@ -710,9 +727,22 @@ nlm_block(nlm4_lockargs *lockargs, return; } + /* + * Make sure we can get an RPC client handle we can use to + * deliver the "granted" callback if/when we get the lock. + * If we can't, there's no point blocking to get the lock + * for them because they'll never find out about it. 
+ */ + error = nlm_host_get_rpc(host, vers, &rpcp); + if (error != 0) { + (void) nlm_slreq_unregister(host, nvp, flp); + return; + } + cb_data.hostp = host; cb_data.nvp = nvp; cb_data.flp = flp; + cb_data.registered = TRUE; flk_init_callback(&flk_cb, nlm_block_callback, &cb_data); /* BSD: VOP_ADVLOCK(vp, NULL, F_SETLK, fl, F_REMOTE); */ @@ -720,23 +750,60 @@ nlm_block(nlm4_lockargs *lockargs, F_REMOTELOCK | FREAD | FWRITE, (u_offset_t)0, &flk_cb, CRED(), NULL); + /* + * If the nlm_block_callback didn't already do it... + */ + if (cb_data.registered) + (void) nlm_slreq_unregister(host, nvp, flp); + if (error != 0) { /* * We failed getting the lock, but have no way to * tell the client about that. Let 'em time out. */ - (void) nlm_slreq_unregister(host, nvp, flp); return; } - /* + * ... else we got the lock on behalf of this client. + * + * We MUST either tell the client about this lock + * (via the "granted" callback RPC) or unlock. + * * Do the "granted" call-back to the client. */ + bzero(&args, sizeof (args)); args.cookie = lockargs->cookie; args.exclusive = lockargs->exclusive; args.alock = lockargs->alock; + bzero(&res, sizeof (res)); + + /* + * Not using the NLM_INVOKE_CALLBACK() macro because + * we need to take actions on errors. + */ + status = (*grant_cb)(&args, &res, (rpcp)->nr_handle); + if (status != RPC_SUCCESS) { + struct rpc_err err; + + CLNT_GETERR((rpcp)->nr_handle, &err); + NLM_ERR("NLM: %s callback failed: " + "stat %d, err %d\n", "grant", status, + err.re_errno); + res.stat.stat = nlm4_failed; + } + if (res.stat.stat != nlm4_granted) { + /* + * Failed to deliver the granted callback, so + * the client doesn't know about this lock. + * Unlock the lock. The client will time out. + */ + (void) nlm_vop_frlock(nvp->nv_vp, F_UNLCK, flp, + F_REMOTELOCK | FREAD | FWRITE, + (u_offset_t)0, NULL, CRED(), NULL); + } + xdr_free((xdrproc_t)xdr_nlm4_res, (void *)&res); - NLM_INVOKE_CALLBACK("grant", rpcp, &args, grant_cb); + nlm_host_rele_rpc(host, rpcp); } /* @@ -756,6 +823,7 @@ nlm_block_callback(flk_cb_when_t when, void *data) if (when == FLK_AFTER_SLEEP) { (void) nlm_slreq_unregister(cb_data->hostp, cb_data->nvp, cb_data->flp); + cb_data->registered = FALSE; } return (0); diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 9e58a7bb56..2543bdf17e 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -20,6 +20,7 @@ */ /* + * Copyright 2020 Oxide Computer Company * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. @@ -43,6 +44,7 @@ #include <sys/utsname.h> #include <sys/id_space.h> #include <sys/zone.h> +#include <sys/bootbanner.h> log_zone_t log_global; queue_t *log_consq; @@ -182,6 +184,14 @@ log_zonefree(zoneid_t zoneid, void *arg) kmem_free(lzp, sizeof (log_zone_t)); } +static void +log_bootbanner_print(const char *line, uint_t num) +{ + const char *pfx = (num == 0) ? "\r" : ""; + + printf("%s%s\n", pfx, line); +} + void log_init(void) { @@ -246,11 +256,15 @@ log_init(void) log_update(&log_backlog, log_backlogq, SL_CONSOLE, log_console); /* - * Now that logging is enabled, emit the SunOS banner. + * Now that logging is enabled, emit the boot banner. 
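A bookkeeping rule falls out of the reworked nlm_block() above: the sleeping-lock entry must be unregistered exactly once, either by the FLK_AFTER_SLEEP callback or, if that never ran, by nlm_block() itself, which is what the new registered flag tracks. Both paths execute in the blocking thread, so a plain flag is enough (a sketch with hypothetical names):

#include <stdbool.h>

struct block_cb_data {
	bool registered;	/* sleeping-lock entry still registered */
};

/*
 * Called from whichever path gets there first; a second caller
 * sees registered == false and does nothing.
 */
static void
unregister_pending_once(struct block_cb_data *cb)
{
	if (!cb->registered)
		return;
	cb->registered = false;
	/* ... remove the sleeping-lock request from the host's list ... */
}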
*/ +#ifdef LEGACY_BANNER printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); printf("Copyright 2010-2020 Joyent, Inc.\n"); +#else + bootbanner_print(log_bootbanner_print, KM_SLEEP); +#endif #ifdef DEBUG printf("DEBUG enabled\n"); #endif diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index ac1ee2d1ce..1e18a0ce9e 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -28,6 +28,7 @@ * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 3664f0096b..24fdd94c11 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -96,6 +96,7 @@ CHKHDRS= \ bofi.h \ bofi_impl.h \ bpp_io.h \ + bootbanner.h \ bootstat.h \ brand.h \ buf.h \ diff --git a/usr/src/uts/common/sys/bootbanner.h b/usr/src/uts/common/sys/bootbanner.h new file mode 100644 index 0000000000..93ba1b9e79 --- /dev/null +++ b/usr/src/uts/common/sys/bootbanner.h @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Oxide Computer Company + */ + +#ifndef _SYS_BOOTBANNER_H +#define _SYS_BOOTBANNER_H + +/* + * Rendering of the boot banner, used on the system and zone consoles. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +extern void bootbanner_print(void (*)(const char *, uint_t), int kmflag); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BOOTBANNER_H */ diff --git a/usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h b/usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h index ab4b4b4e6b..406c90303b 100644 --- a/usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h +++ b/usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h @@ -2140,7 +2140,7 @@ typedef struct #define SLI_FW_TYPE_101 SLI_FW_TYPE_SHIFT(0xb) /* LP101 */ -enum emlxs_prog_type +typedef enum emlxs_prog_type { TEST_PROGRAM, /* 0 */ UTIL_PROGRAM, /* 1 */ diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h index d1c5f674f1..676193fcfa 100644 --- a/usr/src/uts/common/sys/fs/sdev_impl.h +++ b/usr/src/uts/common/sys/fs/sdev_impl.h @@ -39,6 +39,7 @@ extern "C" { #include <sys/nvpair.h> #include <sys/fs/sdev_plugin.h> #include <sys/sunddi.h> +#include <sys/fs/sdev_plugin.h> /* * sdev_nodes are the file-system specific part of the diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 1d7ddf9648..a5974f6d7d 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2015 Garrett D'Amore <garrett@damore.org> + * Copyright 2020 RackTop Systems, Inc. 
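The new sys/bootbanner.h above keeps the interface to a single entry point: bootbanner_print() renders each banner line and hands it, with its zero-based index, to a caller-supplied output routine, which is how log_init() gets its carriage-return-on-the-first-line behaviour. Another consumer might look like this (a sketch, not code from this change; the kmflag argument follows the usual KM_SLEEP/KM_NOSLEEP convention):

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/bootbanner.h>

/* Tag each banner line, e.g. for an in-memory debug log. */
static void
banner_to_debuglog(const char *line, uint_t num)
{
	printf("banner[%u]: %s\n", num, line);
}

static void
emit_banner(void)
{
	bootbanner_print(banner_to_debuglog, KM_SLEEP);
}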
*/ #ifndef _SYS_MAC_H @@ -88,6 +89,13 @@ typedef enum { } link_flowctrl_t; typedef enum { + LINK_FEC_NONE = 1 << 0, + LINK_FEC_AUTO = 1 << 1, + LINK_FEC_RS = 1 << 2, + LINK_FEC_BASE_R = 1 << 3 +} link_fec_t; + +typedef enum { LINK_TAGMODE_VLANONLY = 0, LINK_TAGMODE_NORMAL } link_tagmode_t; @@ -239,6 +247,8 @@ typedef enum { MAC_PROP_EN_25GFDX_CAP, MAC_PROP_ADV_50GFDX_CAP, MAC_PROP_EN_50GFDX_CAP, + MAC_PROP_EN_FEC_CAP, + MAC_PROP_ADV_FEC_CAP, MAC_PROP_PRIVATE = -1 } mac_prop_id_t; diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 3c103c073a..21f2c10a8e 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -290,54 +290,6 @@ struct mac_group_s { #define GROUP_INTR_ENABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_enable #define GROUP_INTR_DISABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_disable -#define MAC_RING_TX(mhp, rh, mp, rest) { \ - mac_ring_handle_t mrh = rh; \ - mac_impl_t *mimpl = (mac_impl_t *)mhp; \ - /* \ - * Send packets through a selected tx ring, or through the \ - * default handler if there is no selected ring. \ - */ \ - if (mrh == NULL) \ - mrh = mimpl->mi_default_tx_ring; \ - if (mrh == NULL) { \ - rest = mimpl->mi_tx(mimpl->mi_driver, mp); \ - } else { \ - rest = mac_hwring_tx(mrh, mp); \ - } \ -} - -/* - * This is the final stop before reaching the underlying driver - * or aggregation, so this is where the bridging hook is implemented. - * Packets that are bridged will return through mac_bridge_tx(), with - * rh nulled out if the bridge chooses to send output on a different - * link due to forwarding. - */ -#define MAC_TX(mip, rh, mp, src_mcip) { \ - mac_ring_handle_t rhandle = (rh); \ - /* \ - * If there is a bound Hybrid I/O share, send packets through \ - * the default tx ring. (When there's a bound Hybrid I/O share, \ - * the tx rings of this client are mapped in the guest domain \ - * and not accessible from here.) \ - */ \ - _NOTE(CONSTANTCONDITION) \ - if ((src_mcip)->mci_state_flags & MCIS_SHARE_BOUND) \ - rhandle = (mip)->mi_default_tx_ring; \ - if (mip->mi_promisc_list != NULL) \ - mac_promisc_dispatch(mip, mp, src_mcip); \ - /* \ - * Grab the proper transmit pointer and handle. Special \ - * optimization: we can test mi_bridge_link itself atomically, \ - * and if that indicates no bridge send packets through tx ring.\ - */ \ - if (mip->mi_bridge_link == NULL) { \ - MAC_RING_TX(mip, rhandle, mp, mp); \ - } else { \ - mp = mac_bridge_tx(mip, rhandle, mp); \ - } \ -} - /* mci_tx_flag */ #define MCI_TX_QUIESCE 0x1 diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 04c20d6aac..fc0866f2d1 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. 
*/ #ifndef _SYS_MAC_PROVIDER_H @@ -631,6 +632,8 @@ extern void mac_prop_info_set_default_uint32( mac_prop_info_handle_t, uint32_t); extern void mac_prop_info_set_default_link_flowctrl( mac_prop_info_handle_t, link_flowctrl_t); +extern void mac_prop_info_set_default_fec( + mac_prop_info_handle_t, link_fec_t); extern void mac_prop_info_set_range_uint32( mac_prop_info_handle_t, uint32_t, uint32_t); diff --git a/usr/src/uts/common/sys/smbios.h b/usr/src/uts/common/sys/smbios.h index 34281898e0..55048d549d 100644 --- a/usr/src/uts/common/sys/smbios.h +++ b/usr/src/uts/common/sys/smbios.h @@ -1315,11 +1315,12 @@ typedef struct smbios_memdevice { #define SMB_MTECH_NVDIMM_P 0x06 /* NVDIMM-P */ #define SMB_MTECH_INTCPM 0x07 /* Intel Optane DC Persistent Memory */ -#define SMB_MOMC_OTHER 0x01 /* other */ -#define SMB_MOMC_UNKNOWN 0x02 /* unknown */ -#define SMB_MOMC_VOLATILE 0x04 /* Volatile memory */ -#define SMB_MOMC_BYTE_PM 0x08 /* Byte-accessible persistent memory */ -#define SMB_MOMC_BLOCK_PM 0x10 /* Block-accessible persistent memory */ +#define SMB_MOMC_RESERVED 0x01 /* reserved */ +#define SMB_MOMC_OTHER 0x02 /* other */ +#define SMB_MOMC_UNKNOWN 0x04 /* unknown */ +#define SMB_MOMC_VOLATILE 0x08 /* Volatile memory */ +#define SMB_MOMC_BYTE_PM 0x10 /* Byte-accessible persistent memory */ +#define SMB_MOMC_BLOCK_PM 0x20 /* Block-accessible persistent memory */ /* * SMBIOS Memory Array Mapped Address. See DSP0134 Section 7.20 for more diff --git a/usr/src/uts/common/sys/sysconfig.h b/usr/src/uts/common/sys/sysconfig.h index d5b65ef78c..7e87d7a983 100644 --- a/usr/src/uts/common/sys/sysconfig.h +++ b/usr/src/uts/common/sys/sysconfig.h @@ -101,8 +101,9 @@ extern int mach_sysconfig(int); #define _CONFIG_SYMLOOP_MAX 46 /* maximum # of symlinks in pathname */ #define _CONFIG_EPHID_MAX 47 /* maximum ephemeral uid */ +#define _CONFIG_UADDR_MAX 48 /* maximum user address */ -#define _CONFIG_NPROC_NCPU 48 /* NCPU (sometimes > NPROC_MAX) */ +#define _CONFIG_NPROC_NCPU 49 /* NCPU (sometimes > NPROC_MAX) */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/systeminfo.h b/usr/src/uts/common/sys/systeminfo.h index a664a19b9e..951d799a70 100644 --- a/usr/src/uts/common/sys/systeminfo.h +++ b/usr/src/uts/common/sys/systeminfo.h @@ -21,6 +21,7 @@ /* * Copyright 2014 Garrett D'Amore <garrett@damore.org> * Copyright 2017 RackTop Systems. + * Copyright 2020 Oxide Computer Company * * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -56,8 +57,8 @@ extern char platform[]; * 257 - 512 Unix International assigned numbers for `set' style commands * where the value is selected to be the value for the * corresponding `get' command plus 256. - * 513 - 768 Solaris specific `get' style commands. - * 769 - 1024 Solaris specific `set' style commands where the value is + * 513 - 768 illumos-defined `get' style commands. + * 769 - 1024 illumos-defined `set' style commands where the value is * selected to be the value for the corresponding `get' command * plus 256. 
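The SMB_MOMC_* renumbering above brings the constants back into line with DSP0134, which reserves bit 0 of the memory device operating-mode capability field; every prior value was off by one bit position. Because the field is a bitmask rather than an enumeration, consumers test each bit independently, for example (a sketch using the corrected values):

#include <stdio.h>

#define SMB_MOMC_VOLATILE 0x08	/* volatile memory */
#define SMB_MOMC_BYTE_PM 0x10	/* byte-accessible persistent memory */
#define SMB_MOMC_BLOCK_PM 0x20	/* block-accessible persistent memory */

static void
print_momc(unsigned int momc)
{
	if (momc & SMB_MOMC_VOLATILE)
		printf("volatile ");
	if (momc & SMB_MOMC_BYTE_PM)
		printf("byte-pm ");
	if (momc & SMB_MOMC_BLOCK_PM)
		printf("block-pm ");
	printf("\n");
}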
* @@ -69,7 +70,7 @@ extern char platform[]; /* UI defined `get' commands (1-256) */ #define SI_SYSNAME 1 /* return name of operating system */ #define SI_HOSTNAME 2 /* return name of node */ -#define SI_RELEASE 3 /* return release of operating system */ +#define SI_RELEASE 3 /* return release of operating system */ #define SI_VERSION 4 /* return version field of utsname */ #define SI_MACHINE 5 /* return kind of machine */ #define SI_ARCHITECTURE 6 /* return instruction set arch */ @@ -81,7 +82,7 @@ extern char platform[]; #define SI_SET_HOSTNAME 258 /* set name of node */ #define SI_SET_SRPC_DOMAIN 265 /* set secure RPC domain */ -/* Solaris defined `get' commands (513-768) */ +/* illumos-defined `get' commands (513-768) */ #define SI_PLATFORM 513 /* return platform identifier */ #define SI_ISALIST 514 /* return supported isa list */ #define SI_DHCP_CACHE 515 /* return kernel-cached DHCPACK */ @@ -89,8 +90,9 @@ extern char platform[]; #define SI_ARCHITECTURE_64 517 /* basic 64-bit SI_ARCHITECTURE */ #define SI_ARCHITECTURE_K 518 /* kernel SI_ARCHITECTURE equivalent */ #define SI_ARCHITECTURE_NATIVE 519 /* SI_ARCHITECTURE of the caller */ +#define SI_ADDRESS_WIDTH 520 /* number of bits in native address */ -/* Solaris defined `set' commands (769-1024) (none currently assigned) */ +/* illumos-defined `set' commands (769-1024) (none currently assigned) */ #define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ diff --git a/usr/src/uts/common/sys/unistd.h b/usr/src/uts/common/sys/unistd.h index f366e99f6a..591a3426f5 100644 --- a/usr/src/uts/common/sys/unistd.h +++ b/usr/src/uts/common/sys/unistd.h @@ -106,9 +106,9 @@ extern "C" { #define _SC_ARG_MAX 1 #define _SC_CHILD_MAX 2 #define _SC_CLK_TCK 3 -#define _SC_NGROUPS_MAX 4 +#define _SC_NGROUPS_MAX 4 #define _SC_OPEN_MAX 5 -#define _SC_JOB_CONTROL 6 +#define _SC_JOB_CONTROL 6 #define _SC_SAVED_IDS 7 #define _SC_VERSION 8 /* SVR4 names */ @@ -151,21 +151,21 @@ extern "C" { #define _SC_TIMER_MAX 44 /* XPG4 names */ #define _SC_2_C_BIND 45 -#define _SC_2_C_DEV 46 +#define _SC_2_C_DEV 46 #define _SC_2_C_VERSION 47 -#define _SC_2_FORT_DEV 48 -#define _SC_2_FORT_RUN 49 +#define _SC_2_FORT_DEV 48 +#define _SC_2_FORT_RUN 49 #define _SC_2_LOCALEDEF 50 -#define _SC_2_SW_DEV 51 +#define _SC_2_SW_DEV 51 #define _SC_2_UPE 52 #define _SC_2_VERSION 53 #define _SC_BC_BASE_MAX 54 -#define _SC_BC_DIM_MAX 55 +#define _SC_BC_DIM_MAX 55 #define _SC_BC_SCALE_MAX 56 #define _SC_BC_STRING_MAX 57 #define _SC_COLL_WEIGHTS_MAX 58 #define _SC_EXPR_NEST_MAX 59 -#define _SC_LINE_MAX 60 +#define _SC_LINE_MAX 60 #define _SC_RE_DUP_MAX 61 #define _SC_XOPEN_CRYPT 62 #define _SC_XOPEN_ENH_I18N 63 @@ -210,6 +210,7 @@ extern "C" { #define _SC_NPROCESSORS_MAX 516 /* maximum # of processors */ #define _SC_CPUID_MAX 517 /* maximum CPU id */ #define _SC_EPHID_MAX 518 /* maximum ephemeral id */ +#define _SC_UADDR_MAX 519 /* maximum user address */ /* * POSIX.1c (pthreads) names. 
These values are defined above @@ -351,7 +352,7 @@ extern "C" { #ifdef _XPG6 #define _POSIX_VERSION 200112L /* Supports IEEE Std 1003.1-2001 */ #else -#define _POSIX_VERSION 199506L /* Supports POSIX-1c DIS */ +#define _POSIX_VERSION 199506L /* Supports POSIX-1c DIS */ #endif #endif /* _POSIX_VERSION */ @@ -359,7 +360,7 @@ extern "C" { #ifdef _XPG6 #define _POSIX2_VERSION 200112L /* Supports IEEE Std 1003.1-2001 */ #else -#define _POSIX2_VERSION 199209L /* Supports ISO POSIX-2 DIS */ +#define _POSIX2_VERSION 199209L /* Supports ISO POSIX-2 DIS */ #endif #endif /* _POSIX2_VERSION */ @@ -395,14 +396,14 @@ extern "C" { #define _POSIX2_FORT_RUN 200112L /* Supports FORTRAN runtime */ #define _POSIX2_LOCALEDEF 200112L /* Supports creation of locales */ #define _POSIX2_SW_DEV 200112L /* Supports S/W Development Utility */ -#define _POSIX2_UPE 200112L /* Supports User Portability Utility */ +#define _POSIX2_UPE 200112L /* Supports User Portability Utility */ #else #define _POSIX2_C_BIND 1 /* Supports C Language Bindings */ #define _POSIX2_C_DEV 1 /* Supports C language dev utility */ #define _POSIX2_FORT_RUN 1 /* Supports FORTRAN runtime */ #define _POSIX2_LOCALEDEF 1 /* Supports creation of locales */ #define _POSIX2_SW_DEV 1 /* Supports S/W Development Utility */ -#define _POSIX2_UPE 1 /* Supports User Portability Utility */ +#define _POSIX2_UPE 1 /* Supports User Portability Utility */ #endif /* _XPG6 */ /* UNIX 03 names */ diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index e09f4e85a2..96535fdd08 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -47,6 +47,7 @@ #include <sys/timer.h> #include <sys/zone.h> #include <sys/vm_usage.h> +#include <vm/as.h> extern rctl_hndl_t rc_process_sigqueue; @@ -208,6 +209,9 @@ sysconfig(int which) case _CONFIG_EPHID_MAX: return (MAXEPHUID); + case _CONFIG_UADDR_MAX: + return ((long)(uintptr_t)curproc->p_as->a_userlimit); + case _CONFIG_SYMLOOP_MAX: return (MAXSYMLINKS); } diff --git a/usr/src/uts/common/syscall/systeminfo.c b/usr/src/uts/common/syscall/systeminfo.c index 21b5ac08ba..00d11e5aba 100644 --- a/usr/src/uts/common/syscall/systeminfo.c +++ b/usr/src/uts/common/syscall/systeminfo.c @@ -19,6 +19,7 @@ * CDDL HEADER END */ /* + * Copyright 2020 Oxide Computer Company * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -26,6 +27,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All rights reserved. */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -81,6 +83,9 @@ systeminfo(int command, char *buf, long count) kstr = utsname.machine; break; #ifdef _LP64 + case SI_ADDRESS_WIDTH: + kstr = "64"; + break; case SI_ARCHITECTURE_64: case SI_ARCHITECTURE_K: kstr = architecture; @@ -94,6 +99,9 @@ systeminfo(int command, char *buf, long count) architecture : architecture_32; break; #else + case SI_ADDRESS_WIDTH: + kstr = "32"; + break; case SI_ARCHITECTURE_K: case SI_ARCHITECTURE_32: case SI_ARCHITECTURE: diff --git a/usr/src/uts/common/vm/seg_spt.c b/usr/src/uts/common/vm/seg_spt.c index 1a9ef5223f..1308935159 100644 --- a/usr/src/uts/common/vm/seg_spt.c +++ b/usr/src/uts/common/vm/seg_spt.c @@ -63,7 +63,7 @@ size_t spt_used; * See spt_setminfree(). 
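The _CONFIG_UADDR_MAX and SI_ADDRESS_WIDTH plumbing above gives userland two matching probes: sysconf(_SC_UADDR_MAX) reports the highest user address (p_as->a_userlimit) and sysinfo(SI_ADDRESS_WIDTH) reports the native address width as a decimal string. Typical usage (a sketch; both calls fail on a kernel without this change):

#include <stdio.h>
#include <unistd.h>
#include <sys/systeminfo.h>

int
main(void)
{
	char width[8];
	long umax = sysconf(_SC_UADDR_MAX);

	if (umax != -1)
		printf("user address limit: 0x%lx\n", (unsigned long)umax);
	if (sysinfo(SI_ADDRESS_WIDTH, width, sizeof (width)) != -1)
		printf("address width: %s bits\n", width);
	return (0);
}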
*/ pgcnt_t segspt_minfree = 0; -size_t segspt_minfree_clamp = (1UL << 30); /* 1Gb in bytes */ +size_t segspt_minfree_clamp = (1UL << 30); /* 1GB in bytes */ static int segspt_create(struct seg **segpp, void *argsp); static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize); @@ -317,7 +317,7 @@ static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, * * The traditional default value of 5% of total memory is used, except on * systems where that quickly gets ridiculous: in that case we clamp at a rather - * arbitrary value of 1Gb. + * arbitrary value of 1GB. * * Note that since this is called lazily on the first sptcreate(), in theory, * this could represent a very small value if the system is heavily loaded
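With the clamp folded in, the reservation this comment describes is simply the smaller of five percent of physical memory and 1GB expressed in pages (a sketch; spt_setminfree() computes the real value lazily at the first sptcreate()):

#include <sys/types.h>

/* Model of the clamped reservation: min(5% of memory, 1GB of pages). */
static pgcnt_t
minfree_pages(pgcnt_t physmem_pages, size_t pagesize)
{
	pgcnt_t five_pct = physmem_pages / 20;
	pgcnt_t clamp = (pgcnt_t)((1UL << 30) / pagesize);

	return (five_pct < clamp ? five_pct : clamp);
}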