Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/man/man7d/mlxcx.7d               |  39
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.c      |  75
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.conf   |  20
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.h      |  56
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_gld.c  |  82
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_intr.c | 353
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_reg.h  |  23
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 380
8 files changed, 706 insertions, 322 deletions
diff --git a/usr/src/man/man7d/mlxcx.7d b/usr/src/man/man7d/mlxcx.7d
index 5373b5bec5..d7b0cf8ad9 100644
--- a/usr/src/man/man7d/mlxcx.7d
+++ b/usr/src/man/man7d/mlxcx.7d
@@ -11,7 +11,7 @@
.\"
.\" Copyright 2020 the University of Queensland
.\"
-.Dd January 17, 2020
+.Dd April 9, 2020
.Dt MLXCX 7D
.Os
.Sh NAME
@@ -94,8 +94,11 @@ property determines the number of entries on Completion Queues for the device.
The number of entries is calculated as
.Li (1 << cq_size_shift) ,
so a value of 9 would mean 512 entries are created on each Completion Queue.
-The default value is
-.Sy 10 .
+The default value is device dependent,
+.Sy 10
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 12
+for devices with higher supported speeds.
This should be kept very close to the value set for
.Sy rq_size_shift
and
@@ -116,8 +119,11 @@ The number of descriptors is calculated as
.Dv (1 << rq_size_shift) ,
so a value of 9 would mean 512 descriptors are created on each Receive Queue.
This sets the number of packets on RX rings advertised to MAC.
-The default value is
-.Sy 10 .
+The default value is device dependent,
+.Sy 10
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 12
+for devices with higher supported speeds.
.Ed
.It Sy sq_size_shift
.Bd -filled -compact
@@ -134,8 +140,11 @@ The number of descriptors is calculated as
.Dv (1 << sq_size_shift) ,
so a value of 9 would mean 512 descriptors are created on each Send Queue.
This sets the number of packets on TX rings advertised to MAC.
-The default value is
-.Sy 11 .
+The default value is device dependent,
+.Sy 11
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 13
+for devices with higher supported speeds.
Note that large packets often occupy more than one descriptor slot on the SQ,
so it is sometimes a good idea to increase this if using a large MTU.
.Ed
@@ -325,6 +334,22 @@ is seldom worth using them for small packets.
The default value is
.Sy 2048 .
.Ed
+.It Sy rx_limit_per_completion
+.Bd -filled -compact
+Minimum:
+.Sy 16 |
+Maximum:
+.Sy 4096
+.Ed
+.Bd -filled
+The
+.Sy rx_limit_per_completion
+property determines the maximum number of packets that
+will be processed on a given completion ring during a single interrupt.
+This is done to try and guarantee some amount of liveness in the system.
+The default value is
+.Sy 256 .
+.Ed
.El
.Sh FILES
.Bl -tag -width Pa
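For reference, the new speed-dependent defaults described above work out to the following ring sizes; this is a minimal standalone sketch, not driver code, and ring_size() is a hypothetical helper:

/*
 * Illustrative only: ring sizes implied by the speed-dependent defaults
 * documented in the manual page above.
 */
#include <stdio.h>

static unsigned int
ring_size(unsigned int size_shift)
{
	return (1U << size_shift);
}

int
main(void)
{
	/* Defaults for ports of 10Gb/s or less: cq=10, rq=10, sq=11. */
	printf("<=10G: cq/rq = %u entries, sq = %u entries\n",
	    ring_size(10), ring_size(11));
	/* Defaults for faster ports: cq=12, rq=12, sq=13. */
	printf(">10G:  cq/rq = %u entries, sq = %u entries\n",
	    ring_size(12), ring_size(13));
	return (0);
}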
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c
index 12a8d52b3f..c90fa0969b 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.c
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -453,23 +454,68 @@ uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
static void
-mlxcx_load_props(mlxcx_t *mlxp)
+mlxcx_load_prop_defaults(mlxcx_t *mlxp)
{
mlxcx_drv_props_t *p = &mlxp->mlx_props;
+ mlxcx_port_t *port = &mlxp->mlx_ports[0];
+
+ VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
+ VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
+
+ /*
+ * Currently we have different queue size defaults for two
+ * categories of queues. One set for devices which support a
+ * maximum speed of 10Gb/s, and another for those above that.
+ */
+ if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
+ MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) {
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
+ } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
+ MLXCX_PROTO_10G)) != 0) {
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
+ } else {
+ mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
+ "recognize. Proto: 0x%x", port->mlp_max_proto);
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
+ }
+}
+
+/*
+ * Properties which may have different defaults based on hardware
+ * characteristics.
+ */
+static void
+mlxcx_load_model_props(mlxcx_t *mlxp)
+{
+ mlxcx_drv_props_t *p = &mlxp->mlx_props;
+
+ mlxcx_load_prop_defaults(mlxp);
- p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
- DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
- MLXCX_EQ_SIZE_SHIFT_DFLT);
p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
- MLXCX_CQ_SIZE_SHIFT_DFLT);
+ p->mldp_cq_size_shift_default);
p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
- MLXCX_SQ_SIZE_SHIFT_DFLT);
+ p->mldp_sq_size_shift_default);
p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
- MLXCX_RQ_SIZE_SHIFT_DFLT);
+ p->mldp_rq_size_shift_default);
+}
+
+static void
+mlxcx_load_props(mlxcx_t *mlxp)
+{
+ mlxcx_drv_props_t *p = &mlxp->mlx_props;
+ p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
+ DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
+ MLXCX_EQ_SIZE_SHIFT_DFLT);
p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
MLXCX_CQEMOD_PERIOD_USEC_DFLT);
@@ -521,6 +567,19 @@ mlxcx_load_props(mlxcx_t *mlxp)
p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
"wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
+
+ p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
+ DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
+ MLXCX_RX_PER_CQ_DEFAULT);
+
+ if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
+ p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
+ mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
+ "out of range. Defaulting to: %d. Valid values are from "
+ "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
+ MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
+ p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
+ }
}
void
@@ -2595,6 +2654,8 @@ mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
}
mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
+ mlxcx_load_model_props(mlxp);
+
/*
* Set up, enable and arm the rest of the interrupt EQs which will
* service events from CQs.
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.conf b/usr/src/uts/common/io/mlxcx/mlxcx.conf
index 3569c4e5f5..321820a47b 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.conf
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.conf
@@ -12,6 +12,7 @@
#
# Copyright 2018, Joyent, Inc.
# Copyright 2020, The University of Queensland
+# Copyright 2020 RackTop Systems, Inc.
#
#
@@ -23,10 +24,15 @@
# Sizing of event and completion queues.
#
# The number of entries on each queue will be (1 << *_size_shift) -- so
-# a value of 9 would mean 512 entries.
+# a value of 10 would mean 1024 entries.
#
#eq_size_shift = 9;
+
+# The default for devices with a maximum supported speed up to 10Gb/s
#cq_size_shift = 10;
+#
+# The default for devices with a maximum supported speed above 10Gb/s
+#cq_size_shift = 12;
#
# Sizing of send and receive queues.
@@ -35,8 +41,13 @@
# advertise to MAC. It also determines how many packet buffers we will allocate
# when starting the interface.
#
+# The defaults for devices with a maximum supported speed up to 10Gb/s
#sq_size_shift = 11;
#rq_size_shift = 10;
+#
+# The defaults for devices with a maximum supported speed above 10Gb/s
+#sq_size_shift = 13;
+#rq_size_shift = 12;
#
# Number and configuration of TX groups and rings.
@@ -99,3 +110,10 @@
#eq_check_interval_sec = 30;
#cq_check_interval_sec = 300;
#wq_check_interval_sec = 300;
+
+#
+# To provide some level of moderation and to keep latency in check, after
+# "rx_limit_per_completion" packets are received in a single completion
+# event, the interrupt handler will pass the chain up the receive stack.
+#
+#rx_limit_per_completion = 256;
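A minimal, self-contained sketch of the budget behaviour described in the comment above; next_completion() and deliver() are illustrative stand-ins for the real CQ processing and the hand-off to the receive stack in mlxcx_intr.c:

#include <stdio.h>

#define	RX_LIMIT_PER_COMPLETION	256	/* the default documented above */

static int
next_completion(int *pending)
{
	if (*pending == 0)
		return (0);
	(*pending)--;
	return (1);
}

static void
deliver(unsigned int frames)
{
	printf("delivering %u frames to the receive stack\n", frames);
}

int
main(void)
{
	int pending = 1000;		/* pretend 1000 packets are queued */
	unsigned int frames = 0;

	/* Stop once the per-event budget is consumed; the rest waits. */
	while (frames < RX_LIMIT_PER_COMPLETION && next_completion(&pending))
		frames++;

	deliver(frames);		/* 256 now; 744 on a later pass */
	return (0);
}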
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h
index 3b58989961..bf07691095 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.h
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -32,6 +33,7 @@
#include <sys/ddifm.h>
#include <sys/id_space.h>
#include <sys/list.h>
+#include <sys/taskq_impl.h>
#include <sys/stddef.h>
#include <sys/stream.h>
#include <sys/strsun.h>
@@ -89,18 +91,36 @@ extern "C" {
* Queues will be sized to (1 << *Q_SIZE_SHIFT) entries long.
*/
#define MLXCX_EQ_SIZE_SHIFT_DFLT 9
+
+/*
+ * The CQ, SQ and RQ sizes can affect throughput on higher speed interfaces.
+ * EQ less so, as it only takes a single EQ entry to indicate there are
+ * multiple completions on the CQ.
+ *
+ * Particularly on the Rx side, the RQ (and corresponding CQ) would run
+ * low on available entries. A symptom of this is the refill taskq running
+ * frequently. A larger RQ (and CQ) alleviates this, and as there is a
+ * close relationship between SQ and CQ size, the SQ is increased too.
+ */
#define MLXCX_CQ_SIZE_SHIFT_DFLT 10
+#define MLXCX_CQ_SIZE_SHIFT_25G 12
/*
* Default to making SQs bigger than RQs for 9k MTU, since most packets will
* spill over into more than one slot. RQ WQEs are always 1 slot.
*/
#define MLXCX_SQ_SIZE_SHIFT_DFLT 11
+#define MLXCX_SQ_SIZE_SHIFT_25G 13
+
#define MLXCX_RQ_SIZE_SHIFT_DFLT 10
+#define MLXCX_RQ_SIZE_SHIFT_25G 12
#define MLXCX_CQ_HWM_GAP 16
#define MLXCX_CQ_LWM_GAP 24
+#define MLXCX_WQ_HWM_GAP MLXCX_CQ_HWM_GAP
+#define MLXCX_WQ_LWM_GAP MLXCX_CQ_LWM_GAP
+
#define MLXCX_RQ_REFILL_STEP 64
/*
@@ -135,6 +155,14 @@ extern "C" {
#define MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT 300
#define MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT 30
+/*
+ * After this many packets, the packets received so far are passed to
+ * the mac layer.
+ */
+#define MLXCX_RX_PER_CQ_DEFAULT 256
+#define MLXCX_RX_PER_CQ_MIN 16
+#define MLXCX_RX_PER_CQ_MAX 4096
+
#define MLXCX_DOORBELL_TRIES_DFLT 3
extern uint_t mlxcx_doorbell_tries;
@@ -417,6 +445,11 @@ typedef struct mlxcx_buffer {
size_t mlb_used;
mblk_t *mlb_tx_mp;
+ /*
+ * The number of work queue basic blocks this buf uses.
+ */
+ uint_t mlb_wqebbs;
+
mlxcx_t *mlb_mlx;
mlxcx_buffer_state_t mlb_state;
uint_t mlb_wqe_index;
@@ -495,6 +528,8 @@ typedef enum {
MLXCX_WQ_DESTROYED = 1 << 3,
MLXCX_WQ_TEARDOWN = 1 << 4,
MLXCX_WQ_BUFFERS = 1 << 5,
+ MLXCX_WQ_REFILLING = 1 << 6,
+ MLXCX_WQ_BLOCKED_MAC = 1 << 7
} mlxcx_workq_state_t;
typedef enum {
@@ -540,12 +575,18 @@ struct mlxcx_work_queue {
};
uint64_t mlwq_pc; /* producer counter */
+ uint64_t mlwq_wqebb_used;
+ size_t mlwq_bufhwm;
+ size_t mlwq_buflwm;
+
mlxcx_dma_buffer_t mlwq_doorbell_dma;
mlxcx_workq_doorbell_t *mlwq_doorbell;
mlxcx_buf_shard_t *mlwq_bufs;
mlxcx_buf_shard_t *mlwq_foreign_bufs;
+ taskq_ent_t mlwq_tqe;
+
boolean_t mlwq_fm_repd_qstate;
};
@@ -773,6 +814,8 @@ struct mlxcx_ring_group {
mlxcx_flow_group_t *mlg_rx_vlan_promisc_fg;
list_t mlg_rx_vlans;
+ taskq_t *mlg_refill_tq;
+
/*
* Flow table for separating out by protocol before hashing
*/
@@ -856,8 +899,11 @@ typedef struct {
typedef struct {
uint_t mldp_eq_size_shift;
uint_t mldp_cq_size_shift;
+ uint_t mldp_cq_size_shift_default;
uint_t mldp_rq_size_shift;
+ uint_t mldp_rq_size_shift_default;
uint_t mldp_sq_size_shift;
+ uint_t mldp_sq_size_shift_default;
uint_t mldp_cqemod_period_usec;
uint_t mldp_cqemod_count;
uint_t mldp_intrmod_period_usec;
@@ -865,6 +911,7 @@ typedef struct {
uint_t mldp_rx_ngroups_small;
uint_t mldp_rx_nrings_per_large_group;
uint_t mldp_rx_nrings_per_small_group;
+ uint_t mldp_rx_per_cq;
uint_t mldp_tx_ngroups;
uint_t mldp_tx_nrings_per_group;
uint_t mldp_ftbl_root_size_shift;
@@ -1098,6 +1145,7 @@ extern boolean_t mlxcx_intr_setup(mlxcx_t *);
extern void mlxcx_intr_teardown(mlxcx_t *);
extern void mlxcx_arm_eq(mlxcx_t *, mlxcx_event_queue_t *);
extern void mlxcx_arm_cq(mlxcx_t *, mlxcx_completion_queue_t *);
+extern void mlxcx_update_cqci(mlxcx_t *, mlxcx_completion_queue_t *);
extern mblk_t *mlxcx_rx_poll(mlxcx_t *, mlxcx_completion_queue_t *, size_t);
@@ -1109,8 +1157,6 @@ extern boolean_t mlxcx_register_mac(mlxcx_t *);
/*
* From mlxcx_ring.c
*/
-extern boolean_t mlxcx_cq_alloc_dma(mlxcx_t *, mlxcx_completion_queue_t *);
-extern void mlxcx_cq_rele_dma(mlxcx_t *, mlxcx_completion_queue_t *);
extern boolean_t mlxcx_wq_alloc_dma(mlxcx_t *, mlxcx_work_queue_t *);
extern void mlxcx_wq_rele_dma(mlxcx_t *, mlxcx_work_queue_t *);
@@ -1118,7 +1164,7 @@ extern boolean_t mlxcx_buf_create(mlxcx_t *, mlxcx_buf_shard_t *,
mlxcx_buffer_t **);
extern boolean_t mlxcx_buf_create_foreign(mlxcx_t *, mlxcx_buf_shard_t *,
mlxcx_buffer_t **);
-extern void mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **);
+extern mlxcx_buffer_t *mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *);
extern size_t mlxcx_buf_take_n(mlxcx_t *, mlxcx_work_queue_t *,
mlxcx_buffer_t **, size_t);
extern boolean_t mlxcx_buf_loan(mlxcx_t *, mlxcx_buffer_t *);
@@ -1126,8 +1172,8 @@ extern void mlxcx_buf_return(mlxcx_t *, mlxcx_buffer_t *);
extern void mlxcx_buf_return_chain(mlxcx_t *, mlxcx_buffer_t *, boolean_t);
extern void mlxcx_buf_destroy(mlxcx_t *, mlxcx_buffer_t *);
-extern boolean_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *,
- mblk_t *, size_t, mlxcx_buffer_t **);
+extern mlxcx_buffer_t *mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *,
+ mblk_t *, size_t);
extern boolean_t mlxcx_rx_group_setup(mlxcx_t *, mlxcx_ring_group_t *);
extern boolean_t mlxcx_tx_group_setup(mlxcx_t *, mlxcx_ring_group_t *);
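The blocked/unblocked transitions driven by the new water mark values above can be sketched as a simple hysteresis loop; this is illustrative userland code, not the driver's, and the usage numbers are made up:

/*
 * TX is blocked once usage crosses the high water mark and only resumes
 * after it drops below the lower mark, avoiding block/unblock flapping
 * near the limit.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define	NENTS	(1u << 13)	/* sq_size_shift = 13 on a fast port */
#define	HWM_GAP	16		/* MLXCX_WQ_HWM_GAP */
#define	LWM_GAP	24		/* MLXCX_WQ_LWM_GAP */

int
main(void)
{
	unsigned int hwm = NENTS - HWM_GAP;	/* 8176 */
	unsigned int lwm = NENTS - LWM_GAP;	/* 8168 */
	unsigned int used[] = { 8100, 8180, 8175, 8170, 8167, 8100 };
	bool blocked = false;

	for (size_t i = 0; i < sizeof (used) / sizeof (used[0]); i++) {
		if (!blocked && used[i] >= hwm)
			blocked = true;		/* MLXCX_WQ_BLOCKED_MAC */
		else if (blocked && used[i] < lwm)
			blocked = false;	/* mac_tx_ring_update() */
		printf("used=%u blocked=%d\n", used[i], blocked);
	}
	return (0);
}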
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
index 7b01702376..a1d50659c1 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
@@ -430,15 +430,10 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
}
}
- if (!mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b)) {
- /*
- * Something went really wrong, and we probably will never be
- * able to TX again (all our buffers are broken and DMA is
- * failing). Drop the packet on the floor -- FMA should be
- * reporting this error elsewhere.
- */
- freemsg(mp);
- return (NULL);
+ b = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take);
+ if (b == NULL) {
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ return (mp);
}
mutex_enter(&sq->mlwq_mtx);
@@ -467,18 +462,20 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
*/
if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm) {
atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
- mutex_exit(&sq->mlwq_mtx);
- mlxcx_buf_return_chain(mlxp, b, B_TRUE);
- return (mp);
+ goto blocked;
+ }
+
+ if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) {
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ goto blocked;
}
ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen,
chkflags, b);
if (!ok) {
atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
- mutex_exit(&sq->mlwq_mtx);
- mlxcx_buf_return_chain(mlxp, b, B_TRUE);
- return (mp);
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ goto blocked;
}
/*
@@ -493,6 +490,11 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
mutex_exit(&sq->mlwq_mtx);
return (NULL);
+
+blocked:
+ mutex_exit(&sq->mlwq_mtx);
+ mlxcx_buf_return_chain(mlxp, b, B_TRUE);
+ return (mp);
}
static int
@@ -862,9 +864,8 @@ mlxcx_mac_ring_intr_disable(mac_intr_handle_t intrh)
{
mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh;
- atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING);
mutex_enter(&cq->mlcq_mtx);
- VERIFY(cq->mlcq_state & MLXCX_CQ_POLLING);
+ atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING);
mutex_exit(&cq->mlcq_mtx);
return (0);
@@ -1061,56 +1062,43 @@ mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
case MAC_PROP_EN_100GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 |
- MLXCX_PROTO_100GBASE_KR4)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_100G) != 0);
break;
case MAC_PROP_ADV_50GFDX_CAP:
case MAC_PROP_EN_50GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 |
- MLXCX_PROTO_50GBASE_SR2)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_50G) != 0);
break;
case MAC_PROP_ADV_40GFDX_CAP:
case MAC_PROP_EN_40GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 |
- MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4))
- != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_40G) != 0);
break;
case MAC_PROP_ADV_25GFDX_CAP:
case MAC_PROP_EN_25GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR |
- MLXCX_PROTO_25GBASE_SR)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_25G) != 0);
break;
case MAC_PROP_ADV_10GFDX_CAP:
case MAC_PROP_EN_10GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 |
- MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR |
- MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_10G) != 0);
break;
case MAC_PROP_ADV_1000FDX_CAP:
case MAC_PROP_EN_1000FDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto & (MLXCX_PROTO_1000BASE_KX |
- MLXCX_PROTO_SGMII)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_1G) != 0);
break;
case MAC_PROP_ADV_100FDX_CAP:
case MAC_PROP_EN_100FDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto & MLXCX_PROTO_SGMII_100BASE) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_100M) != 0);
break;
default:
break;
@@ -1252,8 +1240,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 |
- MLXCX_PROTO_100GBASE_KR4)) != 0;
+ MLXCX_PROTO_100G) != 0;
break;
case MAC_PROP_ADV_50GFDX_CAP:
case MAC_PROP_EN_50GFDX_CAP:
@@ -1262,8 +1249,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 |
- MLXCX_PROTO_50GBASE_SR2)) != 0;
+ MLXCX_PROTO_50G) != 0;
break;
case MAC_PROP_ADV_40GFDX_CAP:
case MAC_PROP_EN_40GFDX_CAP:
@@ -1272,8 +1258,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 |
- MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4)) != 0;
+ MLXCX_PROTO_40G) != 0;
break;
case MAC_PROP_ADV_25GFDX_CAP:
case MAC_PROP_EN_25GFDX_CAP:
@@ -1282,8 +1267,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR |
- MLXCX_PROTO_25GBASE_SR)) != 0;
+ MLXCX_PROTO_25G) != 0;
break;
case MAC_PROP_ADV_10GFDX_CAP:
case MAC_PROP_EN_10GFDX_CAP:
@@ -1292,9 +1276,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 |
- MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR |
- MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0;
+ MLXCX_PROTO_10G) != 0;
break;
case MAC_PROP_ADV_1000FDX_CAP:
case MAC_PROP_EN_1000FDX_CAP:
@@ -1303,7 +1285,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII)) != 0;
+ MLXCX_PROTO_1G) != 0;
break;
case MAC_PROP_ADV_100FDX_CAP:
case MAC_PROP_EN_100FDX_CAP:
@@ -1312,7 +1294,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- MLXCX_PROTO_SGMII_100BASE) != 0;
+ MLXCX_PROTO_100M) != 0;
break;
default:
ret = ENOTSUP;
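The TX changes above lean on the ring-tx convention of returning the unsent message to signal backpressure and NULL once a packet is consumed; here is a minimal sketch with illustrative types and names, not the GLD interfaces themselves:

#include <stdio.h>

typedef struct pkt { int id; } pkt_t;
typedef struct ring { unsigned int used, nents; } ring_t;

static pkt_t *
ring_tx_sketch(ring_t *r, pkt_t *p)
{
	if (r->used >= r->nents)
		return (p);	/* blocked: caller keeps the packet and
				   retries after the driver reports space */
	r->used++;		/* stand-in for posting a send WQE */
	return (NULL);		/* consumed: driver owns it from here */
}

int
main(void)
{
	ring_t r = { 0, 2 };
	pkt_t a = { 1 }, b = { 2 }, c = { 3 };

	printf("%s %s %s\n",
	    ring_tx_sketch(&r, &a) == NULL ? "sent" : "blocked",
	    ring_tx_sketch(&r, &b) == NULL ? "sent" : "blocked",
	    ring_tx_sketch(&r, &c) == NULL ? "sent" : "blocked");
	return (0);
}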
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
index 0516f86d6b..4dc4291b08 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
@@ -11,6 +11,7 @@
/*
* Copyright (c) 2020, the University of Queensland
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -26,6 +27,11 @@
#include <mlxcx.h>
+/*
+ * CTASSERT(s) to cover bad values which would induce bugs.
+ */
+CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
+
void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
@@ -190,6 +196,31 @@ mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
}
void
+mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
+{
+ ddi_fm_error_t err;
+ uint_t try = 0;
+
+ mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
+
+retry:
+ MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
+ ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
+ DDI_FME_VERSION);
+ if (err.fme_status != DDI_FM_OK) {
+ if (try++ < mlxcx_doorbell_tries) {
+ ddi_fm_dma_err_clear(
+ mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
+ DDI_FME_VERSION);
+ goto retry;
+ } else {
+ ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
+ return;
+ }
+ }
+}
+
+void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
bits32_t dbval = new_bits32();
@@ -538,14 +569,15 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
!(mleq->mleq_state & MLXCX_EQ_CREATED) ||
(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
- mlxcx_warn(mlxp, "int0 on bad eq state");
+ mlxcx_warn(mlxp, "int %d on bad eq state",
+ mleq->mleq_intr_index);
mutex_exit(&mleq->mleq_mtx);
return (DDI_INTR_UNCLAIMED);
}
ent = mlxcx_eq_next(mleq);
if (ent == NULL) {
- mlxcx_warn(mlxp, "spurious int 0?");
+ mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index);
mutex_exit(&mleq->mleq_mtx);
return (DDI_INTR_UNCLAIMED);
}
@@ -574,8 +606,8 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
break;
default:
- mlxcx_warn(mlxp, "unhandled event 0x%x on int0",
- ent->mleqe_event_type);
+ mlxcx_warn(mlxp, "unhandled event 0x%x on int %d",
+ ent->mleqe_event_type, mleq->mleq_intr_index);
}
}
@@ -591,46 +623,56 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
return (DDI_INTR_CLAIMED);
}
-mblk_t *
-mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
+static boolean_t
+mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
+ size_t bytelim)
{
- mlxcx_buffer_t *buf;
- mblk_t *mp, *cmp, *nmp;
+ mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
mlxcx_completionq_ent_t *cent;
+ mblk_t *mp, *cmp, *nmp;
+ mlxcx_buffer_t *buf;
+ boolean_t found, added;
size_t bytes = 0;
- boolean_t found;
-
- ASSERT(mutex_owned(&mlcq->mlcq_mtx));
+ uint_t rx_frames = 0;
+ uint_t comp_cnt = 0;
+ int64_t wqebbs, bufcnt;
- ASSERT(mlcq->mlcq_wq != NULL);
- ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
+ *mpp = NULL;
if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
!(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
(mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
(mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
- return (NULL);
+ return (B_FALSE);
}
- ASSERT(mlcq->mlcq_state & MLXCX_CQ_POLLING);
-
nmp = cmp = mp = NULL;
- cent = mlxcx_cq_next(mlcq);
- for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) {
+ wqebbs = 0;
+ bufcnt = 0;
+ for (cent = mlxcx_cq_next(mlcq); cent != NULL;
+ cent = mlxcx_cq_next(mlcq)) {
/*
* Teardown and ring stop can atomic_or this flag
* into our state if they want us to stop early.
*/
if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
- break;
+ return (B_FALSE);
+ comp_cnt++;
if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
/* NOP */
+ atomic_dec_64(&wq->mlwq_wqebb_used);
goto nextcq;
}
+lookagain:
+ /*
+ * Generally the buffer we're looking for will be
+ * at the front of the list, so this loop won't
+ * need to look far.
+ */
buf = list_head(&mlcq->mlcq_buffers);
found = B_FALSE;
while (buf != NULL) {
@@ -641,36 +683,118 @@ mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
}
buf = list_next(&mlcq->mlcq_buffers, buf);
}
+
if (!found) {
+ /*
+ * If there's any buffers waiting on the
+ * buffers_b list, then merge those into
+ * the main list and have another look.
+ *
+ * The wq enqueue routines push new buffers
+ * into buffers_b so that they can avoid
+ * taking the mlcq_mtx and blocking us for
+ * every single packet.
+ */
+ added = B_FALSE;
+ mutex_enter(&mlcq->mlcq_bufbmtx);
+ if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
+ list_move_tail(&mlcq->mlcq_buffers,
+ &mlcq->mlcq_buffers_b);
+ added = B_TRUE;
+ }
+ mutex_exit(&mlcq->mlcq_bufbmtx);
+ if (added)
+ goto lookagain;
+
buf = list_head(&mlcq->mlcq_buffers);
mlxcx_warn(mlxp, "got completion on CQ %x but "
"no buffer matching wqe found: %x (first "
"buffer counter = %x)", mlcq->mlcq_num,
from_be16(cent->mlcqe_wqe_counter),
- buf == NULL ? UINT32_MAX : buf->mlb_wqe_index);
+ buf == NULL ? UINT32_MAX :
+ buf->mlb_wqe_index);
mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
goto nextcq;
}
+
+ /*
+ * The buf is likely to be freed below, count this now.
+ */
+ wqebbs += buf->mlb_wqebbs;
+
list_remove(&mlcq->mlcq_buffers, buf);
- atomic_dec_64(&mlcq->mlcq_bufcnt);
+ bufcnt++;
- nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
- if (nmp != NULL) {
+ switch (mlcq->mlcq_wq->mlwq_type) {
+ case MLXCX_WQ_TYPE_SENDQ:
+ mlxcx_tx_completion(mlxp, mlcq, cent, buf);
+ break;
+ case MLXCX_WQ_TYPE_RECVQ:
+ nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
bytes += from_be32(cent->mlcqe_byte_cnt);
- if (cmp != NULL) {
- cmp->b_next = nmp;
- cmp = nmp;
- } else {
- mp = cmp = nmp;
+ if (nmp != NULL) {
+ if (cmp != NULL) {
+ cmp->b_next = nmp;
+ cmp = nmp;
+ } else {
+ mp = cmp = nmp;
+ }
+
+ rx_frames++;
}
+ break;
}
-nextcq:
- mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
- if (bytelim != 0 && bytes > bytelim)
+ /*
+ * Update the consumer index with what has been processed,
+ * followed by driver counters. It is important to tell the
+ * hardware first, otherwise when we throw more packets at
+ * it, it may get an overflow error.
+ * We do this whenever we've processed enough to bridge the
+ * high->low water mark.
+ */
+ if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
+ mlxcx_update_cqci(mlxp, mlcq);
+ /*
+ * Both these variables are incremented using
+ * atomics as they are modified in other code paths
+ * (Eg during tx) which hold different locks.
+ */
+ atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
+ atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
+ wqebbs = 0;
+ bufcnt = 0;
+ comp_cnt = 0;
+ }
+nextcq:
+ if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
+ (bytelim != 0 && bytes > bytelim))
break;
}
+ if (comp_cnt > 0) {
+ mlxcx_update_cqci(mlxp, mlcq);
+ atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
+ atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
+ }
+
+ *mpp = mp;
+ return (B_TRUE);
+}
+
+
+mblk_t *
+mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
+{
+ mblk_t *mp = NULL;
+
+ ASSERT(mutex_owned(&mlcq->mlcq_mtx));
+
+ ASSERT(mlcq->mlcq_wq != NULL);
+ ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
+
+ (void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
+
return (mp);
}
@@ -680,11 +804,10 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
mlxcx_t *mlxp = (mlxcx_t *)arg;
mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
mlxcx_eventq_ent_t *ent;
- mlxcx_completionq_ent_t *cent;
mlxcx_completion_queue_t *mlcq, probe;
- mlxcx_buffer_t *buf;
- mblk_t *mp, *cmp, *nmp;
- boolean_t found, tellmac = B_FALSE, added;
+ mlxcx_work_queue_t *mlwq;
+ mblk_t *mp = NULL;
+ boolean_t tellmac = B_FALSE;
mutex_enter(&mleq->mleq_mtx);
@@ -729,10 +852,12 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
if (mlcq == NULL)
continue;
+ mlwq = mlcq->mlcq_wq;
+
/*
* The polling function might have the mutex and stop us from
- * getting the lock here, so we increment the event counter
- * atomically from outside.
+ * getting the lock in mlxcx_process_cq(), so we increment
+ * the event counter atomically from outside.
*
* This way at the end of polling when we go back to interrupts
* from this CQ, the event counter is still correct.
@@ -746,145 +871,57 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
/*
- * If we failed to take the mutex because the polling
- * function has it, just move on. We don't want to
- * block other CQs behind this one.
+ * If we failed to take the mutex because the
+ * polling function has it, just move on.
+ * We don't want to block other CQs behind
+ * this one.
*/
if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
- continue;
+ goto update_eq;
+
/* Otherwise we will wait. */
mutex_enter(&mlcq->mlcq_mtx);
}
- if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
- !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
- (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
- (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) ||
- (mlcq->mlcq_state & MLXCX_CQ_POLLING)) {
- mutex_exit(&mlcq->mlcq_mtx);
- continue;
- }
-
- nmp = cmp = mp = NULL;
- tellmac = B_FALSE;
-
- cent = mlxcx_cq_next(mlcq);
- for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) {
+ if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
+ mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
/*
- * Teardown and ring stop can atomic_or this flag
- * into our state if they want us to stop early.
+ * The ring is not in polling mode and we processed
+ * some completion queue entries.
*/
- if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
- break;
- if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
- break;
-
- if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
- cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
- /* NOP */
- goto nextcq;
+ if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
+ mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
+ atomic_and_uint(&mlcq->mlcq_state,
+ ~MLXCX_CQ_BLOCKED_MAC);
+ tellmac = B_TRUE;
}
-lookagain:
- /*
- * Generally the buffer we're looking for will be
- * at the front of the list, so this loop won't
- * need to look far.
- */
- buf = list_head(&mlcq->mlcq_buffers);
- found = B_FALSE;
- while (buf != NULL) {
- if ((buf->mlb_wqe_index & UINT16_MAX) ==
- from_be16(cent->mlcqe_wqe_counter)) {
- found = B_TRUE;
- break;
- }
- buf = list_next(&mlcq->mlcq_buffers, buf);
- }
- if (!found) {
- /*
- * If there's any buffers waiting on the
- * buffers_b list, then merge those into
- * the main list and have another look.
- *
- * The wq enqueue routines push new buffers
- * into buffers_b so that they can avoid
- * taking the mlcq_mtx and blocking us for
- * every single packet.
- */
- added = B_FALSE;
- mutex_enter(&mlcq->mlcq_bufbmtx);
- if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
- list_move_tail(&mlcq->mlcq_buffers,
- &mlcq->mlcq_buffers_b);
- added = B_TRUE;
- }
- mutex_exit(&mlcq->mlcq_bufbmtx);
- if (added)
- goto lookagain;
- }
- if (!found) {
- buf = list_head(&mlcq->mlcq_buffers);
- mlxcx_warn(mlxp, "got completion on CQ %x but "
- "no buffer matching wqe found: %x (first "
- "buffer counter = %x)", mlcq->mlcq_num,
- from_be16(cent->mlcqe_wqe_counter),
- buf == NULL ? UINT32_MAX :
- buf->mlb_wqe_index);
- mlxcx_fm_ereport(mlxp,
- DDI_FM_DEVICE_INVAL_STATE);
- goto nextcq;
+ if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
+ mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
+ atomic_and_uint(&mlwq->mlwq_state,
+ ~MLXCX_WQ_BLOCKED_MAC);
+ tellmac = B_TRUE;
}
- list_remove(&mlcq->mlcq_buffers, buf);
- atomic_dec_64(&mlcq->mlcq_bufcnt);
- switch (mlcq->mlcq_wq->mlwq_type) {
- case MLXCX_WQ_TYPE_SENDQ:
- mlxcx_tx_completion(mlxp, mlcq, cent, buf);
- break;
- case MLXCX_WQ_TYPE_RECVQ:
- nmp = mlxcx_rx_completion(mlxp, mlcq, cent,
- buf);
- if (nmp != NULL) {
- if (cmp != NULL) {
- cmp->b_next = nmp;
- cmp = nmp;
- } else {
- mp = cmp = nmp;
- }
- }
- break;
- }
+ mlxcx_arm_cq(mlxp, mlcq);
-nextcq:
- /*
- * Update the "doorbell" consumer counter for the queue
- * every time. Unlike a UAR write, this is relatively
- * cheap and doesn't require us to go out on the bus
- * straight away (since it's our memory).
- */
- mlcq->mlcq_doorbell->mlcqd_update_ci =
- to_be24(mlcq->mlcq_cc);
+ mutex_exit(&mlcq->mlcq_mtx);
- if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) &&
- mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
- mlcq->mlcq_state &= ~MLXCX_CQ_BLOCKED_MAC;
- tellmac = B_TRUE;
+ if (tellmac) {
+ mac_tx_ring_update(mlxp->mlx_mac_hdl,
+ mlcq->mlcq_mac_hdl);
+ tellmac = B_FALSE;
}
- }
- mlxcx_arm_cq(mlxp, mlcq);
- mutex_exit(&mlcq->mlcq_mtx);
-
- if (tellmac) {
- mac_tx_ring_update(mlxp->mlx_mac_hdl,
- mlcq->mlcq_mac_hdl);
- }
- if (mp != NULL) {
- mac_rx_ring(mlxp->mlx_mac_hdl, mlcq->mlcq_mac_hdl,
- mp, mlcq->mlcq_mac_gen);
+ if (mp != NULL) {
+ mac_rx_ring(mlxp->mlx_mac_hdl,
+ mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
+ }
+ } else {
+ mutex_exit(&mlcq->mlcq_mtx);
}
+update_eq:
/*
* Updating the consumer counter for an EQ requires a write
* to the UAR, which is possibly expensive.
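For reference, the batching threshold used by the consumer-index updates in mlxcx_process_cq() above works out as below; this is a standalone calculation, not driver code:

#include <stdio.h>

#define	MLXCX_CQ_HWM_GAP	16
#define	MLXCX_CQ_LWM_GAP	24

int
main(void)
{
	/*
	 * The CI doorbell is pushed out once more than (24 - 16) = 8
	 * buffers have been processed since the last update, plus a
	 * final update after the loop, rather than once per completion.
	 */
	unsigned int threshold = MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP;

	printf("CI update once more than %u buffers are pending\n",
	    threshold);
	return (0);
}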
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
index 76d0da30e7..f65280d41d 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
#ifndef _MLXCX_REG_H
@@ -2259,6 +2260,28 @@ typedef enum {
MLXCX_PROTO_50GBASE_KR2 = 1UL << 31,
} mlxcx_eth_proto_t;
+#define MLXCX_PROTO_100M MLXCX_PROTO_SGMII_100BASE
+
+#define MLXCX_PROTO_1G (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII)
+
+#define MLXCX_PROTO_10G (MLXCX_PROTO_10GBASE_CX4 | \
+ MLXCX_PROTO_10GBASE_KX4 | MLXCX_PROTO_10GBASE_KR | \
+ MLXCX_PROTO_10GBASE_CR | MLXCX_PROTO_10GBASE_SR | \
+ MLXCX_PROTO_10GBASE_ER_LR)
+
+#define MLXCX_PROTO_25G (MLXCX_PROTO_25GBASE_CR | \
+ MLXCX_PROTO_25GBASE_KR | MLXCX_PROTO_25GBASE_SR)
+
+#define MLXCX_PROTO_40G (MLXCX_PROTO_40GBASE_SR4 | \
+ MLXCX_PROTO_40GBASE_LR4_ER4 | MLXCX_PROTO_40GBASE_CR4 | \
+ MLXCX_PROTO_40GBASE_KR4)
+
+#define MLXCX_PROTO_50G (MLXCX_PROTO_50GBASE_CR2 | \
+ MLXCX_PROTO_50GBASE_KR2 | MLXCX_PROTO_50GBASE_SR2)
+
+#define MLXCX_PROTO_100G (MLXCX_PROTO_100GBASE_CR4 | \
+ MLXCX_PROTO_100GBASE_SR4 | MLXCX_PROTO_100GBASE_KR4)
+
typedef enum {
MLXCX_AUTONEG_DISABLE_CAP = 1 << 5,
MLXCX_AUTONEG_DISABLE = 1 << 6
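A small illustration of how the combined masks above simplify the speed checks; the bit values are invented for the example and are not the hardware encoding:

#include <stdio.h>

#define	PROTO_10GBASE_SR	(1u << 0)
#define	PROTO_25GBASE_CR	(1u << 1)
#define	PROTO_25GBASE_SR	(1u << 2)

#define	PROTO_10G	PROTO_10GBASE_SR
#define	PROTO_25G	(PROTO_25GBASE_CR | PROTO_25GBASE_SR)

int
main(void)
{
	/* One test per speed class replaces the long per-PHY OR chains. */
	unsigned int max_proto = PROTO_10GBASE_SR | PROTO_25GBASE_SR;

	printf("supports 10G: %d\n", (max_proto & PROTO_10G) != 0);
	printf("supports 25G: %d\n", (max_proto & PROTO_25G) != 0);
	return (0);
}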
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
index 8337545b57..da609ed28c 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -113,8 +114,9 @@ mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
mlwq->mlwq_state &= ~MLXCX_CQ_ALLOC;
}
-boolean_t
-mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
+static boolean_t
+mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
+ uint_t ent_shift)
{
ddi_device_acc_attr_t acc;
ddi_dma_attr_t attr;
@@ -123,7 +125,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
VERIFY0(mlcq->mlcq_state & MLXCX_EQ_ALLOC);
- mlcq->mlcq_entshift = mlxp->mlx_props.mldp_cq_size_shift;
+ mlcq->mlcq_entshift = ent_shift;
mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
@@ -165,7 +167,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
return (B_TRUE);
}
-void
+static void
mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
@@ -331,7 +333,7 @@ mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
static boolean_t
mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
- mlxcx_completion_queue_t **cqp)
+ mlxcx_completion_queue_t **cqp, uint_t ent_shift)
{
mlxcx_completion_queue_t *cq;
@@ -350,7 +352,7 @@ mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
mutex_enter(&cq->mlcq_mtx);
- if (!mlxcx_cq_alloc_dma(mlxp, cq)) {
+ if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
mutex_exit(&cq->mlcq_mtx);
return (B_FALSE);
}
@@ -413,6 +415,9 @@ mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
return (B_FALSE);
}
+ wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
+ wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
+
mutex_exit(&wq->mlwq_mtx);
mutex_enter(&cq->mlcq_mtx);
@@ -459,6 +464,9 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
return (B_FALSE);
}
+ wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
+ wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
+
mutex_exit(&wq->mlwq_mtx);
mutex_enter(&cq->mlcq_mtx);
@@ -471,6 +479,35 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
return (B_TRUE);
}
+/*
+ * Before we tear down the queues associated with the rx group,
+ * flag each cq as being torn down and wake up any tasks.
+ */
+static void
+mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
+{
+ mlxcx_work_queue_t *wq;
+ mlxcx_completion_queue_t *cq;
+ mlxcx_buf_shard_t *s;
+ uint_t i;
+
+ mutex_enter(&g->mlg_mtx);
+
+ for (i = 0; i < g->mlg_nwqs; ++i) {
+ wq = &g->mlg_wqs[i];
+ cq = wq->mlwq_cq;
+ if (cq != NULL) {
+ s = wq->mlwq_bufs;
+ mutex_enter(&s->mlbs_mtx);
+ atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
+ cv_broadcast(&s->mlbs_free_nonempty);
+ mutex_exit(&s->mlbs_mtx);
+ }
+ }
+
+ mutex_exit(&g->mlg_mtx);
+}
+
void
mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
@@ -551,6 +588,7 @@ mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
mutex_exit(&wq->mlwq_mtx);
}
+ taskq_destroy(g->mlg_refill_tq);
g->mlg_state &= ~MLXCX_GROUP_RUNNING;
}
@@ -662,8 +700,16 @@ mlxcx_teardown_groups(mlxcx_t *mlxp)
if (!(g->mlg_state & MLXCX_GROUP_INIT))
continue;
ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
+ mlxcx_quiesce_rx_cqs(mlxp, g);
+ }
+
+ for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
+ g = &mlxp->mlx_rx_groups[i];
+ if (!(g->mlg_state & MLXCX_GROUP_INIT))
+ continue;
mlxcx_teardown_rx_group(mlxp, g);
}
+
kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
mlxp->mlx_rx_groups = NULL;
@@ -674,6 +720,7 @@ mlxcx_teardown_groups(mlxcx_t *mlxp)
ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
mlxcx_teardown_tx_group(mlxp, g);
}
+
kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
mlxp->mlx_tx_groups = NULL;
}
@@ -687,6 +734,7 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
mlxcx_flow_table_t *ft;
mlxcx_flow_group_t *fg;
mlxcx_flow_entry_t *fe;
+ uint_t ent_shift;
uint_t i, j;
ASSERT3S(g->mlg_state, ==, 0);
@@ -730,10 +778,18 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
}
- if (!mlxcx_cq_setup(mlxp, eq, &cq)) {
+ /*
+ * A single completion is indicated for each rq entry as
+ * it is used. So, the number of cq entries never needs
+ * to be larger than the rq.
+ */
+ ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
+ mlxp->mlx_props.mldp_rq_size_shift);
+ if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
g->mlg_nwqs = i;
break;
}
+
cq->mlcq_stats = &g->mlg_port->mlp_stats;
rq = &g->mlg_wqs[i];
@@ -1182,6 +1238,7 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
mlxcx_flow_table_t *ft;
mlxcx_flow_group_t *fg;
mlxcx_flow_entry_t *fe;
+ char tq_name[TASKQ_NAMELEN];
mutex_enter(&g->mlg_mtx);
@@ -1194,6 +1251,23 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
g->mlg_state |= MLXCX_GROUP_RUNNING;
+ snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
+ ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
+ g - &mlxp->mlx_rx_groups[0]);
+
+ /*
+ * Create one refill taskq per group with one thread per work queue.
+ * The refill task may block waiting for resources, so by effectively
+ * having one thread per work queue we avoid work queues blocking each
+ * other.
+ */
+ if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
+ g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
+ mlxcx_warn(mlxp, "failed to create rq refill task queue");
+ mutex_exit(&g->mlg_mtx);
+ return (B_FALSE);
+ }
+
if (g == &mlxp->mlx_rx_groups[0]) {
ft = g->mlg_port->mlp_rx_flow;
mutex_enter(&ft->mlft_mtx);
@@ -1207,6 +1281,8 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
mutex_exit(&ft->mlft_mtx);
+ g->mlg_state &= ~MLXCX_GROUP_RUNNING;
+ taskq_destroy(g->mlg_refill_tq);
mutex_exit(&g->mlg_mtx);
return (B_FALSE);
}
@@ -1273,8 +1349,10 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
}
- if (!mlxcx_cq_setup(mlxp, eq, &cq))
+ if (!mlxcx_cq_setup(mlxp, eq, &cq,
+ mlxp->mlx_props.mldp_cq_size_shift))
return (B_FALSE);
+
cq->mlcq_stats = &g->mlg_port->mlp_stats;
sq = &g->mlg_wqs[i];
@@ -1409,6 +1487,11 @@ mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
ent0 = &mlwq->mlwq_send_ent[index];
start_pc = mlwq->mlwq_pc;
++mlwq->mlwq_pc;
+ /*
+ * This counter is manipulated in the interrupt handler, which
+ * does not hold the mlwq_mtx, hence the atomic.
+ */
+ atomic_inc_64(&mlwq->mlwq_wqebb_used);
bzero(ent0, sizeof (mlxcx_sendq_ent_t));
ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
@@ -1441,7 +1524,7 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
mlxcx_buffer_t *b0)
{
- uint_t index, first, ents = 0;
+ uint_t index, first, ents;
mlxcx_completion_queue_t *cq;
mlxcx_sendq_ent_t *ent0;
mlxcx_sendq_extra_ent_t *ent;
@@ -1449,8 +1532,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
uint_t ptri, nptr;
const ddi_dma_cookie_t *c;
size_t rem;
+ uint64_t wqebb_used;
mlxcx_buffer_t *b;
ddi_fm_error_t err;
+ boolean_t rv;
ASSERT(mutex_owned(&mlwq->mlwq_mtx));
ASSERT3P(b0->mlb_tx_head, ==, b0);
@@ -1460,16 +1545,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
ent0 = &mlwq->mlwq_send_ent[index];
b0->mlb_wqe_index = mlwq->mlwq_pc;
- ++mlwq->mlwq_pc;
- ++ents;
+ ents = 1;
first = index;
- mutex_enter(&cq->mlcq_bufbmtx);
- list_insert_tail(&cq->mlcq_buffers_b, b0);
- atomic_inc_64(&cq->mlcq_bufcnt);
- mutex_exit(&cq->mlcq_bufbmtx);
-
bzero(ent0, sizeof (mlxcx_sendq_ent_t));
ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
@@ -1502,6 +1581,16 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
}
+ /*
+ * mlwq_wqebb_used is only incremented whilst holding
+ * the mlwq_mtx mutex, but it is decremented (atomically) in
+ * the interrupt context *not* under mlwq_mtx mutex.
+ * So, now take a snapshot of the number of used wqes which will
+ * be a consistent maximum we can use whilst iterating through
+ * the buffers and DMA cookies.
+ */
+ wqebb_used = mlwq->mlwq_wqebb_used;
+
b = b0;
ptri = 0;
nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
@@ -1513,9 +1602,12 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
while (rem > 0 &&
(c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
if (ptri >= nptr) {
- index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
+ if ((ents + wqebb_used) >= mlwq->mlwq_nents)
+ return (B_FALSE);
+
+ index = (mlwq->mlwq_pc + ents) &
+ (mlwq->mlwq_nents - 1);
ent = &mlwq->mlwq_send_extra_ent[index];
- ++mlwq->mlwq_pc;
++ents;
seg = ent->mlsqe_data;
@@ -1548,6 +1640,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
}
}
+ b0->mlb_wqebbs = ents;
+ mlwq->mlwq_pc += ents;
+ atomic_add_64(&mlwq->mlwq_wqebb_used, ents);
+
for (; ptri < nptr; ++ptri, ++seg) {
seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
seg->mlds_byte_count = to_be32(0);
@@ -1566,10 +1662,24 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
if (err.fme_status != DDI_FM_OK) {
return (B_FALSE);
}
- if (!mlxcx_sq_ring_dbell(mlxp, mlwq, first)) {
- return (B_FALSE);
+
+ /*
+ * Hold the bufmtx whilst ringing the doorbell, to prevent
+ * the buffer from being moved to another list, so we can
+ * safely remove it should the ring fail.
+ */
+ mutex_enter(&cq->mlcq_bufbmtx);
+
+ list_insert_tail(&cq->mlcq_buffers_b, b0);
+ if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
+ atomic_inc_64(&cq->mlcq_bufcnt);
+ } else {
+ list_remove(&cq->mlcq_buffers_b, b0);
}
- return (B_TRUE);
+
+ mutex_exit(&cq->mlcq_bufbmtx);
+
+ return (rv);
}
boolean_t
@@ -1604,8 +1714,10 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
ent = &mlwq->mlwq_recv_ent[index];
buf->mlb_wqe_index = mlwq->mlwq_pc;
+ buf->mlb_wqebbs = 1;
++mlwq->mlwq_pc;
+ atomic_inc_64(&mlwq->mlwq_wqebb_used);
mutex_enter(&cq->mlcq_bufbmtx);
list_insert_tail(&cq->mlcq_buffers, buf);
@@ -1666,11 +1778,53 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
return (B_TRUE);
}
+static void
+mlxcx_rq_refill_task(void *arg)
+{
+ mlxcx_work_queue_t *wq = arg;
+ mlxcx_completion_queue_t *cq = wq->mlwq_cq;
+ mlxcx_t *mlxp = wq->mlwq_mlx;
+ mlxcx_buf_shard_t *s = wq->mlwq_bufs;
+ boolean_t refill;
+
+ do {
+ /*
+ * Wait until there are some free buffers.
+ */
+ mutex_enter(&s->mlbs_mtx);
+ while (list_is_empty(&s->mlbs_free) &&
+ (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0)
+ cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
+ mutex_exit(&s->mlbs_mtx);
+
+ mutex_enter(&cq->mlcq_mtx);
+ mutex_enter(&wq->mlwq_mtx);
+
+ if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
+ refill = B_FALSE;
+ wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
+ } else {
+ mlxcx_rq_refill(mlxp, wq);
+
+ if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
+ refill = B_TRUE;
+ } else {
+ refill = B_FALSE;
+ wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
+ }
+ }
+
+ mutex_exit(&wq->mlwq_mtx);
+ mutex_exit(&cq->mlcq_mtx);
+ } while (refill);
+}
+
void
mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
size_t target, current, want, done, n;
mlxcx_completion_queue_t *cq;
+ mlxcx_ring_group_t *g;
mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
uint_t i;
@@ -1697,10 +1851,24 @@ mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
if (n == 0) {
- mlxcx_warn(mlxp, "!exiting rq refill early, done %u "
- "but wanted %u", done, want);
+ /*
+ * We didn't get any buffers from the free queue.
+ * It might not be an issue; schedule a taskq
+ * to wait for free buffers if the completion
+ * queue is low.
+ */
+ if (current < MLXCX_RQ_REFILL_STEP &&
+ (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
+ mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
+ g = mlwq->mlwq_group;
+ taskq_dispatch_ent(g->mlg_refill_tq,
+ mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
+ &mlwq->mlwq_tqe);
+ }
+
return;
}
+
if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) {
for (i = 0; i < n; ++i)
mlxcx_buf_return(mlxp, b[i]);
@@ -1826,6 +1994,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
{
uint32_t chkflags = 0;
+ uint_t wqe_index;
ddi_fm_error_t err;
ASSERT(mutex_owned(&mlcq->mlcq_mtx));
@@ -1868,6 +2037,12 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
return (NULL);
}
+ /*
+ * mlxcx_buf_loan() will set mlb_wqe_index to zero.
+ * Remember it for later.
+ */
+ wqe_index = buf->mlb_wqe_index;
+
if (!mlxcx_buf_loan(mlxp, buf)) {
mlxcx_warn(mlxp, "!loan failed, dropping packet");
mlxcx_buf_return(mlxp, buf);
@@ -1894,7 +2069,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
* Don't check if a refill is needed on every single completion,
* since checking involves taking the RQ lock.
*/
- if ((buf->mlb_wqe_index & 0x7) == 0) {
+ if ((wqe_index & 0x7) == 0) {
mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
ASSERT(wq != NULL);
mutex_enter(&wq->mlwq_mtx);
@@ -1981,39 +2156,66 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
return (B_TRUE);
}
-static void
-mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
- mlxcx_buffer_t **bp)
+static mlxcx_buffer_t *
+mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
mlxcx_buffer_t *b;
mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
mutex_enter(&s->mlbs_mtx);
- while (list_is_empty(&s->mlbs_free))
- cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
- b = list_remove_head(&s->mlbs_free);
- ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
- ASSERT(b->mlb_foreign);
- b->mlb_state = MLXCX_BUFFER_ON_WQ;
- list_insert_tail(&s->mlbs_busy, b);
+ if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
+ ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
+ ASSERT(b->mlb_foreign);
+ b->mlb_state = MLXCX_BUFFER_ON_WQ;
+ list_insert_tail(&s->mlbs_busy, b);
+ }
mutex_exit(&s->mlbs_mtx);
- *bp = b;
+ return (b);
}
-boolean_t
+static mlxcx_buffer_t *
+mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
+{
+ ddi_fm_error_t err;
+ mlxcx_buffer_t *b;
+ uint_t attempts = 0;
+
+copyb:
+ if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
+ return (NULL);
+
+ ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
+ bcopy(rptr, b->mlb_dma.mxdb_va, sz);
+
+ MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
+
+ ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
+ DDI_FME_VERSION);
+ if (err.fme_status != DDI_FM_OK) {
+ ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
+ DDI_FME_VERSION);
+ mlxcx_buf_return(mlxp, b);
+ if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
+ return (NULL);
+ }
+ goto copyb;
+ }
+
+ return (b);
+}
+
+mlxcx_buffer_t *
mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
- mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
+ mblk_t *mpb, size_t off)
{
mlxcx_buffer_t *b, *b0 = NULL;
boolean_t first = B_TRUE;
- ddi_fm_error_t err;
mblk_t *mp;
uint8_t *rptr;
size_t sz;
size_t ncookies = 0;
boolean_t ret;
- uint_t attempts = 0;
for (mp = mpb; mp != NULL; mp = mp->b_cont) {
rptr = mp->b_rptr;
@@ -2024,31 +2226,24 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
rptr += off;
sz -= off;
- if (sz < mlxp->mlx_props.mldp_tx_bind_threshold)
- goto copyb;
-
- mlxcx_buf_take_foreign(mlxp, wq, &b);
- ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_FALSE);
+ if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
+ b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+ if (b == NULL)
+ goto failed;
+ } else {
+ b = mlxcx_buf_take_foreign(mlxp, wq);
+ if (b == NULL)
+ goto failed;
- if (!ret) {
- mlxcx_buf_return(mlxp, b);
+ ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
+ B_FALSE);
-copyb:
- mlxcx_buf_take(mlxp, wq, &b);
- ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
- bcopy(rptr, b->mlb_dma.mxdb_va, sz);
- MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
- ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
- DDI_FME_VERSION);
- if (err.fme_status != DDI_FM_OK) {
- ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
- DDI_FME_VERSION);
+ if (!ret) {
mlxcx_buf_return(mlxp, b);
- if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
- *bp = NULL;
- return (B_FALSE);
- }
- goto copyb;
+
+ b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+ if (b == NULL)
+ goto failed;
}
}
@@ -2082,54 +2277,44 @@ copyb:
ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS);
- *bp = b0;
- return (B_TRUE);
+ return (b0);
+
+failed:
+ if (b0 != NULL)
+ mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
+
+ return (NULL);
}
-void
-mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp)
+mlxcx_buffer_t *
+mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
mlxcx_buffer_t *b;
mlxcx_buf_shard_t *s = wq->mlwq_bufs;
mutex_enter(&s->mlbs_mtx);
- while (list_is_empty(&s->mlbs_free))
- cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
- b = list_remove_head(&s->mlbs_free);
- ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
- b->mlb_state = MLXCX_BUFFER_ON_WQ;
- list_insert_tail(&s->mlbs_busy, b);
+ if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
+ ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
+ b->mlb_state = MLXCX_BUFFER_ON_WQ;
+ list_insert_tail(&s->mlbs_busy, b);
+ }
mutex_exit(&s->mlbs_mtx);
- *bp = b;
+ return (b);
}
-#define MLXCX_BUF_TAKE_N_TIMEOUT_USEC 5000
-#define MLXCX_BUF_TAKE_N_MAX_RETRIES 3
-
size_t
mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
mlxcx_buffer_t **bp, size_t nbufs)
{
mlxcx_buffer_t *b;
- size_t done = 0, empty = 0;
- clock_t wtime = drv_usectohz(MLXCX_BUF_TAKE_N_TIMEOUT_USEC);
+ size_t done = 0;
mlxcx_buf_shard_t *s;
s = wq->mlwq_bufs;
mutex_enter(&s->mlbs_mtx);
- while (done < nbufs) {
- while (list_is_empty(&s->mlbs_free)) {
- (void) cv_reltimedwait(&s->mlbs_free_nonempty,
- &s->mlbs_mtx, wtime, TR_MILLISEC);
- if (list_is_empty(&s->mlbs_free) &&
- empty++ >= MLXCX_BUF_TAKE_N_MAX_RETRIES) {
- mutex_exit(&s->mlbs_mtx);
- return (done);
- }
- }
- b = list_remove_head(&s->mlbs_free);
+ while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
b->mlb_state = MLXCX_BUFFER_ON_WQ;
list_insert_tail(&s->mlbs_busy, b);
@@ -2187,13 +2372,26 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
ASSERT3P(b->mlb_mlx, ==, mlxp);
+
+ /*
+ * The mlbs_mtx held below is a heavily contended lock, so it is
+ * imperative we do as much of the buffer clean up outside the lock
+ * as is possible.
+ */
b->mlb_state = MLXCX_BUFFER_FREE;
b->mlb_wqe_index = 0;
b->mlb_tx_head = NULL;
b->mlb_tx_mp = NULL;
b->mlb_used = 0;
+ b->mlb_wqebbs = 0;
ASSERT(list_is_empty(&b->mlb_tx_chain));
+ if (b->mlb_foreign) {
+ if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
+ mlxcx_dma_unbind(mlxp, &b->mlb_dma);
+ }
+ }
+
mutex_enter(&s->mlbs_mtx);
switch (oldstate) {
case MLXCX_BUFFER_INIT:
@@ -2215,12 +2413,6 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
break;
}
- if (b->mlb_foreign) {
- if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
- mlxcx_dma_unbind(mlxp, &b->mlb_dma);
- }
- }
-
list_insert_tail(&s->mlbs_free, b);
cv_signal(&s->mlbs_free_nonempty);
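The locking pattern described in the mlxcx_buf_return() comment above, prepare outside the contended lock and keep the critical section to the list insert, sketched as self-contained userland code with illustrative types:

#include <pthread.h>
#include <stddef.h>

typedef struct buf {
	struct buf	*next;
	size_t		used;
	int		dma_bound;
} buf_t;

typedef struct shard {
	pthread_mutex_t	lock;		/* heavily contended */
	buf_t		*free_head;
} shard_t;

static void
buf_return(shard_t *s, buf_t *b)
{
	/* Expensive cleanup first, outside the contended lock. */
	b->used = 0;
	if (b->dma_bound)
		b->dma_bound = 0;	/* stands in for mlxcx_dma_unbind() */

	/* Short critical section: just push onto the free list. */
	(void) pthread_mutex_lock(&s->lock);
	b->next = s->free_head;
	s->free_head = b;
	(void) pthread_mutex_unlock(&s->lock);
}

int
main(void)
{
	shard_t s = { PTHREAD_MUTEX_INITIALIZER, NULL };
	buf_t b = { NULL, 128, 1 };

	buf_return(&s, &b);
	return (s.free_head == &b ? 0 : 1);
}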