author     Paul Winder <pwinder@racktopsystems.com>   2020-03-09 13:16:05 +0000
committer  Paul Winder <paul@winders.demon.co.uk>     2020-04-14 16:40:07 +0100
commit     22d052287ba7ed169757650e2eec25fedbae163a (patch)
tree       cc05c04281562815d8c52d8e2d7f3023d10f3a9f /usr/src
parent     63878f749f68d1c188363e0e7a36e7b7e855dff2 (diff)
12383 Slow down and lock up in mlxcx receive interrupt path
12438 mlxcx should pass receive messages to mac layer more frequently
12439 mlxcx send rings can overflow
12440 mlxcx should not block in the send path
12441 mlxcx default queue sizes are a bit on the small size
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Andy Stormont <astormont@racktopsystems.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/man/man7d/mlxcx.7d               |  39
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.c      |  75
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.conf   |  20
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.h      |  56
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_gld.c  |  82
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_intr.c | 353
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_reg.h  |  23
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 380
8 files changed, 706 insertions, 322 deletions
diff --git a/usr/src/man/man7d/mlxcx.7d b/usr/src/man/man7d/mlxcx.7d
index 5373b5bec5..d7b0cf8ad9 100644
--- a/usr/src/man/man7d/mlxcx.7d
+++ b/usr/src/man/man7d/mlxcx.7d
@@ -11,7 +11,7 @@
.\"
.\" Copyright 2020 the University of Queensland
.\"
-.Dd January 17, 2020
+.Dd April 9, 2020
.Dt MLXCX 7D
.Os
.Sh NAME
@@ -94,8 +94,11 @@ property determines the number of entries on Completion Queues for the device.
The number of entries is calculated as
.Li (1 << cq_size_shift) ,
so a value of 9 would mean 512 entries are created on each Event Queue.
-The default value is
-.Sy 10 .
+The default value is device dependent,
+.Sy 10
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 12
+for devices with higher supported speeds.
This should be kept very close to the value set for
.Sy rq_size_shift
and
@@ -116,8 +119,11 @@ The number of descriptors is calculated as
.Dv (1 << rq_size_shift) ,
so a value of 9 would mean 512 descriptors are created on each Receive Queue.
This sets the number of packets on RX rings advertised to MAC.
-The default value is
-.Sy 10 .
+The default value is device dependent,
+.Sy 10
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 12
+for devices with higher supported speeds.
.Ed
.It Sy sq_size_shift
.Bd -filled -compact
@@ -134,8 +140,11 @@ The number of descriptors is calculated as
.Dv (1 << sq_size_shift) ,
so a value of 9 would mean 512 descriptors are created on each Send Queue.
This sets the number of packets on RX rings advertised to MAC.
-The default value is
-.Sy 11 .
+The default value is device dependent,
+.Sy 11
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 13
+for devices with higher supported speeds.
Note that large packets often occupy more than one descriptor slot on the SQ,
so it is sometimes a good idea to increase this if using a large MTU.
.Ed
@@ -325,6 +334,22 @@ is seldom worth using them for small packets.
The default value is
.Sy 2048 .
.Ed
+.It Sy rx_limit_per_completion
+.Bd -filled -compact
+Minimum:
+.Sy 16 |
+Maximum:
+.Sy 4096
+.Ed
+.Bd -filled
+The
+.Sy rx_limit_per_completion
+property determines the maximum number of packets that
+will be processed on a given completion ring during a single interrupt.
+This is done to try and guarantee some amount of liveness in the system.
+The default value is
+.Sy 256 .
+.Ed
.El
.Sh FILES
.Bl -tag -width Pa
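
For illustration, the entry-count arithmetic the manual page describes above is a plain left shift. A minimal standalone sketch (not part of the driver) showing what the old and new completion queue defaults work out to:

	/* Hypothetical illustration of the (1 << *_size_shift) sizing. */
	#include <stdio.h>

	int
	main(void)
	{
		unsigned int shift_10g = 10;	/* default for devices up to 10Gb/s */
		unsigned int shift_25g = 12;	/* default for faster devices */

		printf("CQ entries, 10Gb/s default: %u\n", 1U << shift_10g);	/* 1024 */
		printf("CQ entries, 25Gb/s+ default: %u\n", 1U << shift_25g);	/* 4096 */
		return (0);
	}
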
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c
index 12a8d52b3f..c90fa0969b 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.c
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -453,23 +454,68 @@ uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
static void
-mlxcx_load_props(mlxcx_t *mlxp)
+mlxcx_load_prop_defaults(mlxcx_t *mlxp)
{
mlxcx_drv_props_t *p = &mlxp->mlx_props;
+ mlxcx_port_t *port = &mlxp->mlx_ports[0];
+
+ VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
+ VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
+
+ /*
+ * Currently we have different queue size defaults for two
+ * categories of queues. One set for devices which support a
+ * maximum speed of 10Gb/s, and another for those above that.
+ */
+ if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
+ MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) {
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
+ } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
+ MLXCX_PROTO_10G)) != 0) {
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
+ } else {
+ mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
+ "recognize. Proto: 0x%x", port->mlp_max_proto);
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
+ }
+}
+
+/*
+ * Properties which may have different defaults based on hardware
+ * characteristics.
+ */
+static void
+mlxcx_load_model_props(mlxcx_t *mlxp)
+{
+ mlxcx_drv_props_t *p = &mlxp->mlx_props;
+
+ mlxcx_load_prop_defaults(mlxp);
- p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
- DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
- MLXCX_EQ_SIZE_SHIFT_DFLT);
p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
- MLXCX_CQ_SIZE_SHIFT_DFLT);
+ p->mldp_cq_size_shift_default);
p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
- MLXCX_SQ_SIZE_SHIFT_DFLT);
+ p->mldp_sq_size_shift_default);
p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
- MLXCX_RQ_SIZE_SHIFT_DFLT);
+ p->mldp_rq_size_shift_default);
+}
+
+static void
+mlxcx_load_props(mlxcx_t *mlxp)
+{
+ mlxcx_drv_props_t *p = &mlxp->mlx_props;
+ p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
+ DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
+ MLXCX_EQ_SIZE_SHIFT_DFLT);
p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
MLXCX_CQEMOD_PERIOD_USEC_DFLT);
@@ -521,6 +567,19 @@ mlxcx_load_props(mlxcx_t *mlxp)
p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
"wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
+
+ p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
+ DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
+ MLXCX_RX_PER_CQ_DEFAULT);
+
+ if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
+ p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
+ mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
+ "out of range. Defaulting to: %d. Valid values are from "
+ "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
+ MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
+ p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
+ }
}
void
@@ -2595,6 +2654,8 @@ mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
}
mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
+ mlxcx_load_model_props(mlxp);
+
/*
* Set up, enable and arm the rest of the interrupt EQs which will
* service events from CQs.
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.conf b/usr/src/uts/common/io/mlxcx/mlxcx.conf
index 3569c4e5f5..321820a47b 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.conf
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.conf
@@ -12,6 +12,7 @@
#
# Copyright 2018, Joyent, Inc.
# Copyright 2020, The University of Queensland
+# Copyright 2020 RackTop Systems, Inc.
#
#
@@ -23,10 +24,15 @@
# Sizing of event and completion queues.
#
# The number of entries on each queue will be (1 << *_size_shift) -- so
-# a value of 9 would mean 512 entries.
+# a value of 10 would mean 1024 entries.
#
#eq_size_shift = 9;
+
+# The default for devices with a maximum supported speed up to 10Gb/s
#cq_size_shift = 10;
+#
+# The default for devices with a maximum supported speed above 10Gb/s
+#cq_size_shift = 12;
#
# Sizing of send and receive queues.
@@ -35,8 +41,13 @@
# advertise to MAC. It also determines how many packet buffers we will allocate
# when starting the interface.
#
+# The defaults for devices with a maximum supported speed up to 10Gb/s
#sq_size_shift = 11;
#rq_size_shift = 10;
+#
+# The defaults for devices with a maximum supported speed above 10Gb/s
+#sq_size_shift = 13;
+#rq_size_shift = 12;
#
# Number and configuration of TX groups and rings.
@@ -99,3 +110,10 @@
#eq_check_interval_sec = 30;
#cq_check_interval_sec = 300;
#wq_check_interval_sec = 300;
+
+#
+# To provide some level of moderation and aid latencies, after
+# "rx_limit_per_completion" packets are received in a single completion
+# event, the interrupt handler will pass the chain up the receive stack.
+#
+#rx_limit_per_completion = 256;
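
For illustration, a site running a 10Gb/s-only device that wanted the larger queue sizes and a lower per-interrupt receive limit could uncomment and adjust these properties in mlxcx.conf (example values only; .conf changes take effect the next time the driver attaches):

	cq_size_shift = 12;
	rq_size_shift = 12;
	sq_size_shift = 13;
	rx_limit_per_completion = 128;
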
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h
index 3b58989961..bf07691095 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.h
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -32,6 +33,7 @@
#include <sys/ddifm.h>
#include <sys/id_space.h>
#include <sys/list.h>
+#include <sys/taskq_impl.h>
#include <sys/stddef.h>
#include <sys/stream.h>
#include <sys/strsun.h>
@@ -89,18 +91,36 @@ extern "C" {
* Queues will be sized to (1 << *Q_SIZE_SHIFT) entries long.
*/
#define MLXCX_EQ_SIZE_SHIFT_DFLT 9
+
+/*
+ * The CQ, SQ and RQ sizes can effect throughput on higher speed interfaces.
+ * EQ less so, as it only takes a single EQ entry to indicate there are
+ * multiple completions on the CQ.
+ *
+ * Particularly on the Rx side, the RQ (and corresponding CQ) would run
+ * low on available entries. A symptom of this is the refill taskq running
+ * frequently. A larger RQ (and CQ) alleviates this, and as there is a
+ * close relationship between SQ and CQ size, the SQ is increased too.
+ */
#define MLXCX_CQ_SIZE_SHIFT_DFLT 10
+#define MLXCX_CQ_SIZE_SHIFT_25G 12
/*
* Default to making SQs bigger than RQs for 9k MTU, since most packets will
* spill over into more than one slot. RQ WQEs are always 1 slot.
*/
#define MLXCX_SQ_SIZE_SHIFT_DFLT 11
+#define MLXCX_SQ_SIZE_SHIFT_25G 13
+
#define MLXCX_RQ_SIZE_SHIFT_DFLT 10
+#define MLXCX_RQ_SIZE_SHIFT_25G 12
#define MLXCX_CQ_HWM_GAP 16
#define MLXCX_CQ_LWM_GAP 24
+#define MLXCX_WQ_HWM_GAP MLXCX_CQ_HWM_GAP
+#define MLXCX_WQ_LWM_GAP MLXCX_CQ_LWM_GAP
+
#define MLXCX_RQ_REFILL_STEP 64
/*
@@ -135,6 +155,14 @@ extern "C" {
#define MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT 300
#define MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT 30
+/*
+ * After this many packets, the packets received so far are passed to
+ * the mac layer.
+ */
+#define MLXCX_RX_PER_CQ_DEFAULT 256
+#define MLXCX_RX_PER_CQ_MIN 16
+#define MLXCX_RX_PER_CQ_MAX 4096
+
#define MLXCX_DOORBELL_TRIES_DFLT 3
extern uint_t mlxcx_doorbell_tries;
@@ -417,6 +445,11 @@ typedef struct mlxcx_buffer {
size_t mlb_used;
mblk_t *mlb_tx_mp;
+ /*
+ * The number of work queue basic blocks this buf uses.
+ */
+ uint_t mlb_wqebbs;
+
mlxcx_t *mlb_mlx;
mlxcx_buffer_state_t mlb_state;
uint_t mlb_wqe_index;
@@ -495,6 +528,8 @@ typedef enum {
MLXCX_WQ_DESTROYED = 1 << 3,
MLXCX_WQ_TEARDOWN = 1 << 4,
MLXCX_WQ_BUFFERS = 1 << 5,
+ MLXCX_WQ_REFILLING = 1 << 6,
+ MLXCX_WQ_BLOCKED_MAC = 1 << 7
} mlxcx_workq_state_t;
typedef enum {
@@ -540,12 +575,18 @@ struct mlxcx_work_queue {
};
uint64_t mlwq_pc; /* producer counter */
+ uint64_t mlwq_wqebb_used;
+ size_t mlwq_bufhwm;
+ size_t mlwq_buflwm;
+
mlxcx_dma_buffer_t mlwq_doorbell_dma;
mlxcx_workq_doorbell_t *mlwq_doorbell;
mlxcx_buf_shard_t *mlwq_bufs;
mlxcx_buf_shard_t *mlwq_foreign_bufs;
+ taskq_ent_t mlwq_tqe;
+
boolean_t mlwq_fm_repd_qstate;
};
@@ -773,6 +814,8 @@ struct mlxcx_ring_group {
mlxcx_flow_group_t *mlg_rx_vlan_promisc_fg;
list_t mlg_rx_vlans;
+ taskq_t *mlg_refill_tq;
+
/*
* Flow table for separating out by protocol before hashing
*/
@@ -856,8 +899,11 @@ typedef struct {
typedef struct {
uint_t mldp_eq_size_shift;
uint_t mldp_cq_size_shift;
+ uint_t mldp_cq_size_shift_default;
uint_t mldp_rq_size_shift;
+ uint_t mldp_rq_size_shift_default;
uint_t mldp_sq_size_shift;
+ uint_t mldp_sq_size_shift_default;
uint_t mldp_cqemod_period_usec;
uint_t mldp_cqemod_count;
uint_t mldp_intrmod_period_usec;
@@ -865,6 +911,7 @@ typedef struct {
uint_t mldp_rx_ngroups_small;
uint_t mldp_rx_nrings_per_large_group;
uint_t mldp_rx_nrings_per_small_group;
+ uint_t mldp_rx_per_cq;
uint_t mldp_tx_ngroups;
uint_t mldp_tx_nrings_per_group;
uint_t mldp_ftbl_root_size_shift;
@@ -1098,6 +1145,7 @@ extern boolean_t mlxcx_intr_setup(mlxcx_t *);
extern void mlxcx_intr_teardown(mlxcx_t *);
extern void mlxcx_arm_eq(mlxcx_t *, mlxcx_event_queue_t *);
extern void mlxcx_arm_cq(mlxcx_t *, mlxcx_completion_queue_t *);
+extern void mlxcx_update_cqci(mlxcx_t *, mlxcx_completion_queue_t *);
extern mblk_t *mlxcx_rx_poll(mlxcx_t *, mlxcx_completion_queue_t *, size_t);
@@ -1109,8 +1157,6 @@ extern boolean_t mlxcx_register_mac(mlxcx_t *);
/*
* From mlxcx_ring.c
*/
-extern boolean_t mlxcx_cq_alloc_dma(mlxcx_t *, mlxcx_completion_queue_t *);
-extern void mlxcx_cq_rele_dma(mlxcx_t *, mlxcx_completion_queue_t *);
extern boolean_t mlxcx_wq_alloc_dma(mlxcx_t *, mlxcx_work_queue_t *);
extern void mlxcx_wq_rele_dma(mlxcx_t *, mlxcx_work_queue_t *);
@@ -1118,7 +1164,7 @@ extern boolean_t mlxcx_buf_create(mlxcx_t *, mlxcx_buf_shard_t *,
mlxcx_buffer_t **);
extern boolean_t mlxcx_buf_create_foreign(mlxcx_t *, mlxcx_buf_shard_t *,
mlxcx_buffer_t **);
-extern void mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **);
+extern mlxcx_buffer_t *mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *);
extern size_t mlxcx_buf_take_n(mlxcx_t *, mlxcx_work_queue_t *,
mlxcx_buffer_t **, size_t);
extern boolean_t mlxcx_buf_loan(mlxcx_t *, mlxcx_buffer_t *);
@@ -1126,8 +1172,8 @@ extern void mlxcx_buf_return(mlxcx_t *, mlxcx_buffer_t *);
extern void mlxcx_buf_return_chain(mlxcx_t *, mlxcx_buffer_t *, boolean_t);
extern void mlxcx_buf_destroy(mlxcx_t *, mlxcx_buffer_t *);
-extern boolean_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *,
- mblk_t *, size_t, mlxcx_buffer_t **);
+extern mlxcx_buffer_t *mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *,
+ mblk_t *, size_t);
extern boolean_t mlxcx_rx_group_setup(mlxcx_t *, mlxcx_ring_group_t *);
extern boolean_t mlxcx_tx_group_setup(mlxcx_t *, mlxcx_ring_group_t *);
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
index 7b01702376..a1d50659c1 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
@@ -430,15 +430,10 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
}
}
- if (!mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b)) {
- /*
- * Something went really wrong, and we probably will never be
- * able to TX again (all our buffers are broken and DMA is
- * failing). Drop the packet on the floor -- FMA should be
- * reporting this error elsewhere.
- */
- freemsg(mp);
- return (NULL);
+ b = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take);
+ if (b == NULL) {
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ return (mp);
}
mutex_enter(&sq->mlwq_mtx);
@@ -467,18 +462,20 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
*/
if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm) {
atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
- mutex_exit(&sq->mlwq_mtx);
- mlxcx_buf_return_chain(mlxp, b, B_TRUE);
- return (mp);
+ goto blocked;
+ }
+
+ if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) {
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ goto blocked;
}
ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen,
chkflags, b);
if (!ok) {
atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
- mutex_exit(&sq->mlwq_mtx);
- mlxcx_buf_return_chain(mlxp, b, B_TRUE);
- return (mp);
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ goto blocked;
}
/*
@@ -493,6 +490,11 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
mutex_exit(&sq->mlwq_mtx);
return (NULL);
+
+blocked:
+ mutex_exit(&sq->mlwq_mtx);
+ mlxcx_buf_return_chain(mlxp, b, B_TRUE);
+ return (mp);
}
static int
@@ -862,9 +864,8 @@ mlxcx_mac_ring_intr_disable(mac_intr_handle_t intrh)
{
mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh;
- atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING);
mutex_enter(&cq->mlcq_mtx);
- VERIFY(cq->mlcq_state & MLXCX_CQ_POLLING);
+ atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING);
mutex_exit(&cq->mlcq_mtx);
return (0);
@@ -1061,56 +1062,43 @@ mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
case MAC_PROP_EN_100GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 |
- MLXCX_PROTO_100GBASE_KR4)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_100G) != 0);
break;
case MAC_PROP_ADV_50GFDX_CAP:
case MAC_PROP_EN_50GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 |
- MLXCX_PROTO_50GBASE_SR2)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_50G) != 0);
break;
case MAC_PROP_ADV_40GFDX_CAP:
case MAC_PROP_EN_40GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 |
- MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4))
- != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_40G) != 0);
break;
case MAC_PROP_ADV_25GFDX_CAP:
case MAC_PROP_EN_25GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR |
- MLXCX_PROTO_25GBASE_SR)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_25G) != 0);
break;
case MAC_PROP_ADV_10GFDX_CAP:
case MAC_PROP_EN_10GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 |
- MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR |
- MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_10G) != 0);
break;
case MAC_PROP_ADV_1000FDX_CAP:
case MAC_PROP_EN_1000FDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto & (MLXCX_PROTO_1000BASE_KX |
- MLXCX_PROTO_SGMII)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_1G) != 0);
break;
case MAC_PROP_ADV_100FDX_CAP:
case MAC_PROP_EN_100FDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto & MLXCX_PROTO_SGMII_100BASE) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_100M) != 0);
break;
default:
break;
@@ -1252,8 +1240,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 |
- MLXCX_PROTO_100GBASE_KR4)) != 0;
+ MLXCX_PROTO_100G) != 0;
break;
case MAC_PROP_ADV_50GFDX_CAP:
case MAC_PROP_EN_50GFDX_CAP:
@@ -1262,8 +1249,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 |
- MLXCX_PROTO_50GBASE_SR2)) != 0;
+ MLXCX_PROTO_50G) != 0;
break;
case MAC_PROP_ADV_40GFDX_CAP:
case MAC_PROP_EN_40GFDX_CAP:
@@ -1272,8 +1258,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 |
- MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4)) != 0;
+ MLXCX_PROTO_40G) != 0;
break;
case MAC_PROP_ADV_25GFDX_CAP:
case MAC_PROP_EN_25GFDX_CAP:
@@ -1282,8 +1267,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR |
- MLXCX_PROTO_25GBASE_SR)) != 0;
+ MLXCX_PROTO_25G) != 0;
break;
case MAC_PROP_ADV_10GFDX_CAP:
case MAC_PROP_EN_10GFDX_CAP:
@@ -1292,9 +1276,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 |
- MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR |
- MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0;
+ MLXCX_PROTO_10G) != 0;
break;
case MAC_PROP_ADV_1000FDX_CAP:
case MAC_PROP_EN_1000FDX_CAP:
@@ -1303,7 +1285,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII)) != 0;
+ MLXCX_PROTO_1G) != 0;
break;
case MAC_PROP_ADV_100FDX_CAP:
case MAC_PROP_EN_100FDX_CAP:
@@ -1312,7 +1294,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- MLXCX_PROTO_SGMII_100BASE) != 0;
+ MLXCX_PROTO_100M) != 0;
break;
default:
ret = ENOTSUP;
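
Taken together with the new mlwq_bufhwm, mlwq_buflwm and mlwq_wqebb_used fields, the mlxcx_gld.c changes above add send-side flow control against the work queue as well as the completion queue. A condensed sketch of the pattern (names match the driver, but the logic is simplified and not a drop-in excerpt):

	/* Transmit path, holding the SQ mutex. */
	if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) {
		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
		mutex_exit(&sq->mlwq_mtx);
		return (mp);		/* hand the mblk back so MAC retries later */
	}

	/* Completion interrupt, after entries have been reclaimed. */
	if ((sq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
	    sq->mlwq_wqebb_used < sq->mlwq_buflwm) {
		atomic_and_uint(&sq->mlwq_state, ~MLXCX_WQ_BLOCKED_MAC);
		mac_tx_ring_update(mlxp->mlx_mac_hdl, cq->mlcq_mac_hdl);
	}
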
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
index 0516f86d6b..4dc4291b08 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
@@ -11,6 +11,7 @@
/*
* Copyright (c) 2020, the University of Queensland
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -26,6 +27,11 @@
#include <mlxcx.h>
+/*
+ * CTASSERT(s) to cover bad values which would induce bugs.
+ */
+CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
+
void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
@@ -190,6 +196,31 @@ mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
}
void
+mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
+{
+ ddi_fm_error_t err;
+ uint_t try = 0;
+
+ mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
+
+retry:
+ MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
+ ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
+ DDI_FME_VERSION);
+ if (err.fme_status != DDI_FM_OK) {
+ if (try++ < mlxcx_doorbell_tries) {
+ ddi_fm_dma_err_clear(
+ mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
+ DDI_FME_VERSION);
+ goto retry;
+ } else {
+ ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
+ return;
+ }
+ }
+}
+
+void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
bits32_t dbval = new_bits32();
@@ -538,14 +569,15 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
!(mleq->mleq_state & MLXCX_EQ_CREATED) ||
(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
- mlxcx_warn(mlxp, "int0 on bad eq state");
+ mlxcx_warn(mlxp, "int %d on bad eq state",
+ mleq->mleq_intr_index);
mutex_exit(&mleq->mleq_mtx);
return (DDI_INTR_UNCLAIMED);
}
ent = mlxcx_eq_next(mleq);
if (ent == NULL) {
- mlxcx_warn(mlxp, "spurious int 0?");
+ mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index);
mutex_exit(&mleq->mleq_mtx);
return (DDI_INTR_UNCLAIMED);
}
@@ -574,8 +606,8 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
break;
default:
- mlxcx_warn(mlxp, "unhandled event 0x%x on int0",
- ent->mleqe_event_type);
+ mlxcx_warn(mlxp, "unhandled event 0x%x on int %d",
+ ent->mleqe_event_type, mleq->mleq_intr_index);
}
}
@@ -591,46 +623,56 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
return (DDI_INTR_CLAIMED);
}
-mblk_t *
-mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
+static boolean_t
+mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
+ size_t bytelim)
{
- mlxcx_buffer_t *buf;
- mblk_t *mp, *cmp, *nmp;
+ mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
mlxcx_completionq_ent_t *cent;
+ mblk_t *mp, *cmp, *nmp;
+ mlxcx_buffer_t *buf;
+ boolean_t found, added;
size_t bytes = 0;
- boolean_t found;
-
- ASSERT(mutex_owned(&mlcq->mlcq_mtx));
+ uint_t rx_frames = 0;
+ uint_t comp_cnt = 0;
+ int64_t wqebbs, bufcnt;
- ASSERT(mlcq->mlcq_wq != NULL);
- ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
+ *mpp = NULL;
if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
!(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
(mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
(mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
- return (NULL);
+ return (B_FALSE);
}
- ASSERT(mlcq->mlcq_state & MLXCX_CQ_POLLING);
-
nmp = cmp = mp = NULL;
- cent = mlxcx_cq_next(mlcq);
- for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) {
+ wqebbs = 0;
+ bufcnt = 0;
+ for (cent = mlxcx_cq_next(mlcq); cent != NULL;
+ cent = mlxcx_cq_next(mlcq)) {
/*
* Teardown and ring stop can atomic_or this flag
* into our state if they want us to stop early.
*/
if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
- break;
+ return (B_FALSE);
+ comp_cnt++;
if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
/* NOP */
+ atomic_dec_64(&wq->mlwq_wqebb_used);
goto nextcq;
}
+lookagain:
+ /*
+ * Generally the buffer we're looking for will be
+ * at the front of the list, so this loop won't
+ * need to look far.
+ */
buf = list_head(&mlcq->mlcq_buffers);
found = B_FALSE;
while (buf != NULL) {
@@ -641,36 +683,118 @@ mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
}
buf = list_next(&mlcq->mlcq_buffers, buf);
}
+
if (!found) {
+ /*
+ * If there's any buffers waiting on the
+ * buffers_b list, then merge those into
+ * the main list and have another look.
+ *
+ * The wq enqueue routines push new buffers
+ * into buffers_b so that they can avoid
+ * taking the mlcq_mtx and blocking us for
+ * every single packet.
+ */
+ added = B_FALSE;
+ mutex_enter(&mlcq->mlcq_bufbmtx);
+ if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
+ list_move_tail(&mlcq->mlcq_buffers,
+ &mlcq->mlcq_buffers_b);
+ added = B_TRUE;
+ }
+ mutex_exit(&mlcq->mlcq_bufbmtx);
+ if (added)
+ goto lookagain;
+
buf = list_head(&mlcq->mlcq_buffers);
mlxcx_warn(mlxp, "got completion on CQ %x but "
"no buffer matching wqe found: %x (first "
"buffer counter = %x)", mlcq->mlcq_num,
from_be16(cent->mlcqe_wqe_counter),
- buf == NULL ? UINT32_MAX : buf->mlb_wqe_index);
+ buf == NULL ? UINT32_MAX :
+ buf->mlb_wqe_index);
mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
goto nextcq;
}
+
+ /*
+ * The buf is likely to be freed below, count this now.
+ */
+ wqebbs += buf->mlb_wqebbs;
+
list_remove(&mlcq->mlcq_buffers, buf);
- atomic_dec_64(&mlcq->mlcq_bufcnt);
+ bufcnt++;
- nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
- if (nmp != NULL) {
+ switch (mlcq->mlcq_wq->mlwq_type) {
+ case MLXCX_WQ_TYPE_SENDQ:
+ mlxcx_tx_completion(mlxp, mlcq, cent, buf);
+ break;
+ case MLXCX_WQ_TYPE_RECVQ:
+ nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
bytes += from_be32(cent->mlcqe_byte_cnt);
- if (cmp != NULL) {
- cmp->b_next = nmp;
- cmp = nmp;
- } else {
- mp = cmp = nmp;
+ if (nmp != NULL) {
+ if (cmp != NULL) {
+ cmp->b_next = nmp;
+ cmp = nmp;
+ } else {
+ mp = cmp = nmp;
+ }
+
+ rx_frames++;
}
+ break;
}
-nextcq:
- mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
- if (bytelim != 0 && bytes > bytelim)
+ /*
+ * Update the consumer index with what has been processed,
+ * followed by driver counters. It is important to tell the
+ * hardware first, otherwise when we throw more packets at
+ * it, it may get an overflow error.
+ * We do this whenever we've processed enough to bridge the
+ * high->low water mark.
+ */
+ if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
+ mlxcx_update_cqci(mlxp, mlcq);
+ /*
+ * Both these variables are incremented using
+ * atomics as they are modified in other code paths
+ * (Eg during tx) which hold different locks.
+ */
+ atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
+ atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
+ wqebbs = 0;
+ bufcnt = 0;
+ comp_cnt = 0;
+ }
+nextcq:
+ if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
+ (bytelim != 0 && bytes > bytelim))
break;
}
+ if (comp_cnt > 0) {
+ mlxcx_update_cqci(mlxp, mlcq);
+ atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
+ atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
+ }
+
+ *mpp = mp;
+ return (B_TRUE);
+}
+
+
+mblk_t *
+mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
+{
+ mblk_t *mp = NULL;
+
+ ASSERT(mutex_owned(&mlcq->mlcq_mtx));
+
+ ASSERT(mlcq->mlcq_wq != NULL);
+ ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
+
+ (void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
+
return (mp);
}
@@ -680,11 +804,10 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
mlxcx_t *mlxp = (mlxcx_t *)arg;
mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
mlxcx_eventq_ent_t *ent;
- mlxcx_completionq_ent_t *cent;
mlxcx_completion_queue_t *mlcq, probe;
- mlxcx_buffer_t *buf;
- mblk_t *mp, *cmp, *nmp;
- boolean_t found, tellmac = B_FALSE, added;
+ mlxcx_work_queue_t *mlwq;
+ mblk_t *mp = NULL;
+ boolean_t tellmac = B_FALSE;
mutex_enter(&mleq->mleq_mtx);
@@ -729,10 +852,12 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
if (mlcq == NULL)
continue;
+ mlwq = mlcq->mlcq_wq;
+
/*
* The polling function might have the mutex and stop us from
- * getting the lock here, so we increment the event counter
- * atomically from outside.
+ * getting the lock in mlxcx_process_cq(), so we increment
+ * the event counter atomically from outside.
*
* This way at the end of polling when we go back to interrupts
* from this CQ, the event counter is still correct.
@@ -746,145 +871,57 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
/*
- * If we failed to take the mutex because the polling
- * function has it, just move on. We don't want to
- * block other CQs behind this one.
+ * If we failed to take the mutex because the
+ * polling function has it, just move on.
+ * We don't want to block other CQs behind
+ * this one.
*/
if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
- continue;
+ goto update_eq;
+
/* Otherwise we will wait. */
mutex_enter(&mlcq->mlcq_mtx);
}
- if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
- !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
- (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
- (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) ||
- (mlcq->mlcq_state & MLXCX_CQ_POLLING)) {
- mutex_exit(&mlcq->mlcq_mtx);
- continue;
- }
-
- nmp = cmp = mp = NULL;
- tellmac = B_FALSE;
-
- cent = mlxcx_cq_next(mlcq);
- for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) {
+ if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
+ mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
/*
- * Teardown and ring stop can atomic_or this flag
- * into our state if they want us to stop early.
+ * The ring is not in polling mode and we processed
+ * some completion queue entries.
*/
- if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
- break;
- if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
- break;
-
- if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
- cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
- /* NOP */
- goto nextcq;
+ if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
+ mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
+ atomic_and_uint(&mlcq->mlcq_state,
+ ~MLXCX_CQ_BLOCKED_MAC);
+ tellmac = B_TRUE;
}
-lookagain:
- /*
- * Generally the buffer we're looking for will be
- * at the front of the list, so this loop won't
- * need to look far.
- */
- buf = list_head(&mlcq->mlcq_buffers);
- found = B_FALSE;
- while (buf != NULL) {
- if ((buf->mlb_wqe_index & UINT16_MAX) ==
- from_be16(cent->mlcqe_wqe_counter)) {
- found = B_TRUE;
- break;
- }
- buf = list_next(&mlcq->mlcq_buffers, buf);
- }
- if (!found) {
- /*
- * If there's any buffers waiting on the
- * buffers_b list, then merge those into
- * the main list and have another look.
- *
- * The wq enqueue routines push new buffers
- * into buffers_b so that they can avoid
- * taking the mlcq_mtx and blocking us for
- * every single packet.
- */
- added = B_FALSE;
- mutex_enter(&mlcq->mlcq_bufbmtx);
- if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
- list_move_tail(&mlcq->mlcq_buffers,
- &mlcq->mlcq_buffers_b);
- added = B_TRUE;
- }
- mutex_exit(&mlcq->mlcq_bufbmtx);
- if (added)
- goto lookagain;
- }
- if (!found) {
- buf = list_head(&mlcq->mlcq_buffers);
- mlxcx_warn(mlxp, "got completion on CQ %x but "
- "no buffer matching wqe found: %x (first "
- "buffer counter = %x)", mlcq->mlcq_num,
- from_be16(cent->mlcqe_wqe_counter),
- buf == NULL ? UINT32_MAX :
- buf->mlb_wqe_index);
- mlxcx_fm_ereport(mlxp,
- DDI_FM_DEVICE_INVAL_STATE);
- goto nextcq;
+ if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
+ mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
+ atomic_and_uint(&mlwq->mlwq_state,
+ ~MLXCX_WQ_BLOCKED_MAC);
+ tellmac = B_TRUE;
}
- list_remove(&mlcq->mlcq_buffers, buf);
- atomic_dec_64(&mlcq->mlcq_bufcnt);
- switch (mlcq->mlcq_wq->mlwq_type) {
- case MLXCX_WQ_TYPE_SENDQ:
- mlxcx_tx_completion(mlxp, mlcq, cent, buf);
- break;
- case MLXCX_WQ_TYPE_RECVQ:
- nmp = mlxcx_rx_completion(mlxp, mlcq, cent,
- buf);
- if (nmp != NULL) {
- if (cmp != NULL) {
- cmp->b_next = nmp;
- cmp = nmp;
- } else {
- mp = cmp = nmp;
- }
- }
- break;
- }
+ mlxcx_arm_cq(mlxp, mlcq);
-nextcq:
- /*
- * Update the "doorbell" consumer counter for the queue
- * every time. Unlike a UAR write, this is relatively
- * cheap and doesn't require us to go out on the bus
- * straight away (since it's our memory).
- */
- mlcq->mlcq_doorbell->mlcqd_update_ci =
- to_be24(mlcq->mlcq_cc);
+ mutex_exit(&mlcq->mlcq_mtx);
- if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) &&
- mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
- mlcq->mlcq_state &= ~MLXCX_CQ_BLOCKED_MAC;
- tellmac = B_TRUE;
+ if (tellmac) {
+ mac_tx_ring_update(mlxp->mlx_mac_hdl,
+ mlcq->mlcq_mac_hdl);
+ tellmac = B_FALSE;
}
- }
- mlxcx_arm_cq(mlxp, mlcq);
- mutex_exit(&mlcq->mlcq_mtx);
-
- if (tellmac) {
- mac_tx_ring_update(mlxp->mlx_mac_hdl,
- mlcq->mlcq_mac_hdl);
- }
- if (mp != NULL) {
- mac_rx_ring(mlxp->mlx_mac_hdl, mlcq->mlcq_mac_hdl,
- mp, mlcq->mlcq_mac_gen);
+ if (mp != NULL) {
+ mac_rx_ring(mlxp->mlx_mac_hdl,
+ mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
+ }
+ } else {
+ mutex_exit(&mlcq->mlcq_mtx);
}
+update_eq:
/*
* Updating the consumer counter for an EQ requires a write
* to the UAR, which is possibly expensive.
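
The restructured receive path above batches its work: mlxcx_process_cq() collects completions into an mblk chain, periodically tells the hardware how far it has consumed, and stops after mldp_rx_per_cq frames (or the poll byte limit) so a busy ring cannot monopolise the interrupt. A condensed sketch of that loop (buffer matching and error handling omitted; not a drop-in excerpt):

	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
	    cent = mlxcx_cq_next(mlcq)) {
		/* ... match the entry to a buffer and append to the chain ... */

		/*
		 * Once enough entries have been consumed to span the
		 * high-to-low water mark gap, update the consumer index
		 * so the hardware never sees the queue as overflowing.
		 */
		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
			mlxcx_update_cqci(mlxp, mlcq);
			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
			bufcnt = 0;
		}

		/* Hand the chain up once the per-interrupt limit is reached. */
		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
		    (bytelim != 0 && bytes > bytelim))
			break;
	}
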
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
index 76d0da30e7..f65280d41d 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
#ifndef _MLXCX_REG_H
@@ -2259,6 +2260,28 @@ typedef enum {
MLXCX_PROTO_50GBASE_KR2 = 1UL << 31,
} mlxcx_eth_proto_t;
+#define MLXCX_PROTO_100M MLXCX_PROTO_SGMII_100BASE
+
+#define MLXCX_PROTO_1G (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII)
+
+#define MLXCX_PROTO_10G (MLXCX_PROTO_10GBASE_CX4 | \
+ MLXCX_PROTO_10GBASE_KX4 | MLXCX_PROTO_10GBASE_KR | \
+ MLXCX_PROTO_10GBASE_CR | MLXCX_PROTO_10GBASE_SR | \
+ MLXCX_PROTO_10GBASE_ER_LR)
+
+#define MLXCX_PROTO_25G (MLXCX_PROTO_25GBASE_CR | \
+ MLXCX_PROTO_25GBASE_KR | MLXCX_PROTO_25GBASE_SR)
+
+#define MLXCX_PROTO_40G (MLXCX_PROTO_40GBASE_SR4 | \
+ MLXCX_PROTO_40GBASE_LR4_ER4 | MLXCX_PROTO_40GBASE_CR4 | \
+ MLXCX_PROTO_40GBASE_KR4)
+
+#define MLXCX_PROTO_50G (MLXCX_PROTO_50GBASE_CR2 | \
+ MLXCX_PROTO_50GBASE_KR2 | MLXCX_PROTO_50GBASE_SR2)
+
+#define MLXCX_PROTO_100G (MLXCX_PROTO_100GBASE_CR4 | \
+ MLXCX_PROTO_100GBASE_SR4 | MLXCX_PROTO_100GBASE_KR4)
+
typedef enum {
MLXCX_AUTONEG_DISABLE_CAP = 1 << 5,
MLXCX_AUTONEG_DISABLE = 1 << 6
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
index 8337545b57..da609ed28c 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -113,8 +114,9 @@ mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
mlwq->mlwq_state &= ~MLXCX_CQ_ALLOC;
}
-boolean_t
-mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
+static boolean_t
+mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
+ uint_t ent_shift)
{
ddi_device_acc_attr_t acc;
ddi_dma_attr_t attr;
@@ -123,7 +125,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
VERIFY0(mlcq->mlcq_state & MLXCX_EQ_ALLOC);
- mlcq->mlcq_entshift = mlxp->mlx_props.mldp_cq_size_shift;
+ mlcq->mlcq_entshift = ent_shift;
mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
@@ -165,7 +167,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
return (B_TRUE);
}
-void
+static void
mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
@@ -331,7 +333,7 @@ mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
static boolean_t
mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
- mlxcx_completion_queue_t **cqp)
+ mlxcx_completion_queue_t **cqp, uint_t ent_shift)
{
mlxcx_completion_queue_t *cq;
@@ -350,7 +352,7 @@ mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
mutex_enter(&cq->mlcq_mtx);
- if (!mlxcx_cq_alloc_dma(mlxp, cq)) {
+ if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
mutex_exit(&cq->mlcq_mtx);
return (B_FALSE);
}
@@ -413,6 +415,9 @@ mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
return (B_FALSE);
}
+ wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
+ wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
+
mutex_exit(&wq->mlwq_mtx);
mutex_enter(&cq->mlcq_mtx);
@@ -459,6 +464,9 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
return (B_FALSE);
}
+ wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
+ wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
+
mutex_exit(&wq->mlwq_mtx);
mutex_enter(&cq->mlcq_mtx);
@@ -471,6 +479,35 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
return (B_TRUE);
}
+/*
+ * Before we tear down the queues associated with the rx group,
+ * flag each cq as being torn down and wake up any tasks.
+ */
+static void
+mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
+{
+ mlxcx_work_queue_t *wq;
+ mlxcx_completion_queue_t *cq;
+ mlxcx_buf_shard_t *s;
+ uint_t i;
+
+ mutex_enter(&g->mlg_mtx);
+
+ for (i = 0; i < g->mlg_nwqs; ++i) {
+ wq = &g->mlg_wqs[i];
+ cq = wq->mlwq_cq;
+ if (cq != NULL) {
+ s = wq->mlwq_bufs;
+ mutex_enter(&s->mlbs_mtx);
+ atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
+ cv_broadcast(&s->mlbs_free_nonempty);
+ mutex_exit(&s->mlbs_mtx);
+ }
+ }
+
+ mutex_exit(&g->mlg_mtx);
+}
+
void
mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
@@ -551,6 +588,7 @@ mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
mutex_exit(&wq->mlwq_mtx);
}
+ taskq_destroy(g->mlg_refill_tq);
g->mlg_state &= ~MLXCX_GROUP_RUNNING;
}
@@ -662,8 +700,16 @@ mlxcx_teardown_groups(mlxcx_t *mlxp)
if (!(g->mlg_state & MLXCX_GROUP_INIT))
continue;
ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
+ mlxcx_quiesce_rx_cqs(mlxp, g);
+ }
+
+ for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
+ g = &mlxp->mlx_rx_groups[i];
+ if (!(g->mlg_state & MLXCX_GROUP_INIT))
+ continue;
mlxcx_teardown_rx_group(mlxp, g);
}
+
kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
mlxp->mlx_rx_groups = NULL;
@@ -674,6 +720,7 @@ mlxcx_teardown_groups(mlxcx_t *mlxp)
ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
mlxcx_teardown_tx_group(mlxp, g);
}
+
kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
mlxp->mlx_tx_groups = NULL;
}
@@ -687,6 +734,7 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
mlxcx_flow_table_t *ft;
mlxcx_flow_group_t *fg;
mlxcx_flow_entry_t *fe;
+ uint_t ent_shift;
uint_t i, j;
ASSERT3S(g->mlg_state, ==, 0);
@@ -730,10 +778,18 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
}
- if (!mlxcx_cq_setup(mlxp, eq, &cq)) {
+ /*
+ * A single completion is indicated for each rq entry as
+ * it is used. So, the number of cq entries never needs
+ * to be larger than the rq.
+ */
+ ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
+ mlxp->mlx_props.mldp_rq_size_shift);
+ if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
g->mlg_nwqs = i;
break;
}
+
cq->mlcq_stats = &g->mlg_port->mlp_stats;
rq = &g->mlg_wqs[i];
@@ -1182,6 +1238,7 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
mlxcx_flow_table_t *ft;
mlxcx_flow_group_t *fg;
mlxcx_flow_entry_t *fe;
+ char tq_name[TASKQ_NAMELEN];
mutex_enter(&g->mlg_mtx);
@@ -1194,6 +1251,23 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
g->mlg_state |= MLXCX_GROUP_RUNNING;
+ snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
+ ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
+ g - &mlxp->mlx_rx_groups[0]);
+
+ /*
+ * Create one refill taskq per group with one thread per work queue.
+ * The refill task may block waiting for resources, so by effectively
+ * having one thread per work queue we avoid work queues blocking each
+ * other.
+ */
+ if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
+ g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
+ mlxcx_warn(mlxp, "failed to create rq refill task queue");
+ mutex_exit(&g->mlg_mtx);
+ return (B_FALSE);
+ }
+
if (g == &mlxp->mlx_rx_groups[0]) {
ft = g->mlg_port->mlp_rx_flow;
mutex_enter(&ft->mlft_mtx);
@@ -1207,6 +1281,8 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
mutex_exit(&ft->mlft_mtx);
+ g->mlg_state &= ~MLXCX_GROUP_RUNNING;
+ taskq_destroy(g->mlg_refill_tq);
mutex_exit(&g->mlg_mtx);
return (B_FALSE);
}
@@ -1273,8 +1349,10 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
}
- if (!mlxcx_cq_setup(mlxp, eq, &cq))
+ if (!mlxcx_cq_setup(mlxp, eq, &cq,
+ mlxp->mlx_props.mldp_cq_size_shift))
return (B_FALSE);
+
cq->mlcq_stats = &g->mlg_port->mlp_stats;
sq = &g->mlg_wqs[i];
@@ -1409,6 +1487,11 @@ mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
ent0 = &mlwq->mlwq_send_ent[index];
start_pc = mlwq->mlwq_pc;
++mlwq->mlwq_pc;
+ /*
+ * This counter is manipulated in the interrupt handler, which
+ * does not hold the mlwq_mtx, hence the atomic.
+ */
+ atomic_inc_64(&mlwq->mlwq_wqebb_used);
bzero(ent0, sizeof (mlxcx_sendq_ent_t));
ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
@@ -1441,7 +1524,7 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
mlxcx_buffer_t *b0)
{
- uint_t index, first, ents = 0;
+ uint_t index, first, ents;
mlxcx_completion_queue_t *cq;
mlxcx_sendq_ent_t *ent0;
mlxcx_sendq_extra_ent_t *ent;
@@ -1449,8 +1532,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
uint_t ptri, nptr;
const ddi_dma_cookie_t *c;
size_t rem;
+ uint64_t wqebb_used;
mlxcx_buffer_t *b;
ddi_fm_error_t err;
+ boolean_t rv;
ASSERT(mutex_owned(&mlwq->mlwq_mtx));
ASSERT3P(b0->mlb_tx_head, ==, b0);
@@ -1460,16 +1545,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
ent0 = &mlwq->mlwq_send_ent[index];
b0->mlb_wqe_index = mlwq->mlwq_pc;
- ++mlwq->mlwq_pc;
- ++ents;
+ ents = 1;
first = index;
- mutex_enter(&cq->mlcq_bufbmtx);
- list_insert_tail(&cq->mlcq_buffers_b, b0);
- atomic_inc_64(&cq->mlcq_bufcnt);
- mutex_exit(&cq->mlcq_bufbmtx);
-
bzero(ent0, sizeof (mlxcx_sendq_ent_t));
ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
@@ -1502,6 +1581,16 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
}
+ /*
+ * mlwq_wqebb_used is only incremented whilst holding
+ * the mlwq_mtx mutex, but it is decremented (atomically) in
+ * the interrupt context *not* under mlwq_mtx mutex.
+ * So, now take a snapshot of the number of used wqes which will
+	 * be a consistent maximum we can use whilst iterating through
+ * the buffers and DMA cookies.
+ */
+ wqebb_used = mlwq->mlwq_wqebb_used;
+
b = b0;
ptri = 0;
nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
@@ -1513,9 +1602,12 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
while (rem > 0 &&
(c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
if (ptri >= nptr) {
- index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
+ if ((ents + wqebb_used) >= mlwq->mlwq_nents)
+ return (B_FALSE);
+
+ index = (mlwq->mlwq_pc + ents) &
+ (mlwq->mlwq_nents - 1);
ent = &mlwq->mlwq_send_extra_ent[index];
- ++mlwq->mlwq_pc;
++ents;
seg = ent->mlsqe_data;
@@ -1548,6 +1640,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
}
}
+ b0->mlb_wqebbs = ents;
+ mlwq->mlwq_pc += ents;
+ atomic_add_64(&mlwq->mlwq_wqebb_used, ents);
+
for (; ptri < nptr; ++ptri, ++seg) {
seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
seg->mlds_byte_count = to_be32(0);
@@ -1566,10 +1662,24 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
if (err.fme_status != DDI_FM_OK) {
return (B_FALSE);
}
- if (!mlxcx_sq_ring_dbell(mlxp, mlwq, first)) {
- return (B_FALSE);
+
+ /*
+ * Hold the bufmtx whilst ringing the doorbell, to prevent
+ * the buffer from being moved to another list, so we can
+ * safely remove it should the ring fail.
+ */
+ mutex_enter(&cq->mlcq_bufbmtx);
+
+ list_insert_tail(&cq->mlcq_buffers_b, b0);
+ if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
+ atomic_inc_64(&cq->mlcq_bufcnt);
+ } else {
+ list_remove(&cq->mlcq_buffers_b, b0);
}
- return (B_TRUE);
+
+ mutex_exit(&cq->mlcq_bufbmtx);
+
+ return (rv);
}
boolean_t
@@ -1604,8 +1714,10 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
ent = &mlwq->mlwq_recv_ent[index];
buf->mlb_wqe_index = mlwq->mlwq_pc;
+ buf->mlb_wqebbs = 1;
++mlwq->mlwq_pc;
+ atomic_inc_64(&mlwq->mlwq_wqebb_used);
mutex_enter(&cq->mlcq_bufbmtx);
list_insert_tail(&cq->mlcq_buffers, buf);
@@ -1666,11 +1778,53 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
return (B_TRUE);
}
+static void
+mlxcx_rq_refill_task(void *arg)
+{
+ mlxcx_work_queue_t *wq = arg;
+ mlxcx_completion_queue_t *cq = wq->mlwq_cq;
+ mlxcx_t *mlxp = wq->mlwq_mlx;
+ mlxcx_buf_shard_t *s = wq->mlwq_bufs;
+ boolean_t refill;
+
+ do {
+ /*
+ * Wait until there are some free buffers.
+ */
+ mutex_enter(&s->mlbs_mtx);
+ while (list_is_empty(&s->mlbs_free) &&
+ (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0)
+ cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
+ mutex_exit(&s->mlbs_mtx);
+
+ mutex_enter(&cq->mlcq_mtx);
+ mutex_enter(&wq->mlwq_mtx);
+
+ if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
+ refill = B_FALSE;
+ wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
+ } else {
+ mlxcx_rq_refill(mlxp, wq);
+
+ if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
+ refill = B_TRUE;
+ } else {
+ refill = B_FALSE;
+ wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
+ }
+ }
+
+ mutex_exit(&wq->mlwq_mtx);
+ mutex_exit(&cq->mlcq_mtx);
+ } while (refill);
+}
+
void
mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
size_t target, current, want, done, n;
mlxcx_completion_queue_t *cq;
+ mlxcx_ring_group_t *g;
mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
uint_t i;
@@ -1697,10 +1851,24 @@ mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
if (n == 0) {
- mlxcx_warn(mlxp, "!exiting rq refill early, done %u "
- "but wanted %u", done, want);
+ /*
+ * We didn't get any buffers from the free queue.
+ * It might not be an issue, schedule a taskq
+ * to wait for free buffers if the completion
+ * queue is low.
+ */
+ if (current < MLXCX_RQ_REFILL_STEP &&
+ (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
+ mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
+ g = mlwq->mlwq_group;
+ taskq_dispatch_ent(g->mlg_refill_tq,
+ mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
+ &mlwq->mlwq_tqe);
+ }
+
return;
}
+
if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) {
for (i = 0; i < n; ++i)
mlxcx_buf_return(mlxp, b[i]);
@@ -1826,6 +1994,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
{
uint32_t chkflags = 0;
+ uint_t wqe_index;
ddi_fm_error_t err;
ASSERT(mutex_owned(&mlcq->mlcq_mtx));
@@ -1868,6 +2037,12 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
return (NULL);
}
+ /*
+ * mlxcx_buf_loan() will set mlb_wqe_index to zero.
+ * Remember it for later.
+ */
+ wqe_index = buf->mlb_wqe_index;
+
if (!mlxcx_buf_loan(mlxp, buf)) {
mlxcx_warn(mlxp, "!loan failed, dropping packet");
mlxcx_buf_return(mlxp, buf);
@@ -1894,7 +2069,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
* Don't check if a refill is needed on every single completion,
* since checking involves taking the RQ lock.
*/
- if ((buf->mlb_wqe_index & 0x7) == 0) {
+ if ((wqe_index & 0x7) == 0) {
mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
ASSERT(wq != NULL);
mutex_enter(&wq->mlwq_mtx);
@@ -1981,39 +2156,66 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
return (B_TRUE);
}
-static void
-mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
- mlxcx_buffer_t **bp)
+static mlxcx_buffer_t *
+mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
mlxcx_buffer_t *b;
mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
mutex_enter(&s->mlbs_mtx);
- while (list_is_empty(&s->mlbs_free))
- cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
- b = list_remove_head(&s->mlbs_free);
- ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
- ASSERT(b->mlb_foreign);
- b->mlb_state = MLXCX_BUFFER_ON_WQ;
- list_insert_tail(&s->mlbs_busy, b);
+ if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
+ ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
+ ASSERT(b->mlb_foreign);
+ b->mlb_state = MLXCX_BUFFER_ON_WQ;
+ list_insert_tail(&s->mlbs_busy, b);
+ }
mutex_exit(&s->mlbs_mtx);
- *bp = b;
+ return (b);
}
-boolean_t
+static mlxcx_buffer_t *
+mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
+{
+ ddi_fm_error_t err;
+ mlxcx_buffer_t *b;
+ uint_t attempts = 0;
+
+copyb:
+ if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
+ return (NULL);
+
+ ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
+ bcopy(rptr, b->mlb_dma.mxdb_va, sz);
+
+ MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
+
+ ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
+ DDI_FME_VERSION);
+ if (err.fme_status != DDI_FM_OK) {
+ ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
+ DDI_FME_VERSION);
+ mlxcx_buf_return(mlxp, b);
+ if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
+ return (NULL);
+ }
+ goto copyb;
+ }
+
+ return (b);
+}
+
+mlxcx_buffer_t *
mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
- mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
+ mblk_t *mpb, size_t off)
{
mlxcx_buffer_t *b, *b0 = NULL;
boolean_t first = B_TRUE;
- ddi_fm_error_t err;
mblk_t *mp;
uint8_t *rptr;
size_t sz;
size_t ncookies = 0;
boolean_t ret;
- uint_t attempts = 0;
for (mp = mpb; mp != NULL; mp = mp->b_cont) {
rptr = mp->b_rptr;
@@ -2024,31 +2226,24 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
rptr += off;
sz -= off;
- if (sz < mlxp->mlx_props.mldp_tx_bind_threshold)
- goto copyb;
-
- mlxcx_buf_take_foreign(mlxp, wq, &b);
- ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_FALSE);
+ if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
+ b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+ if (b == NULL)
+ goto failed;
+ } else {
+ b = mlxcx_buf_take_foreign(mlxp, wq);
+ if (b == NULL)
+ goto failed;
- if (!ret) {
- mlxcx_buf_return(mlxp, b);
+ ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
+ B_FALSE);
-copyb:
- mlxcx_buf_take(mlxp, wq, &b);
- ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
- bcopy(rptr, b->mlb_dma.mxdb_va, sz);
- MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
- ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
- DDI_FME_VERSION);
- if (err.fme_status != DDI_FM_OK) {
- ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
- DDI_FME_VERSION);
+ if (!ret) {
mlxcx_buf_return(mlxp, b);
- if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
- *bp = NULL;
- return (B_FALSE);
- }
- goto copyb;
+
+ b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+ if (b == NULL)
+ goto failed;
}
}
@@ -2082,54 +2277,44 @@ copyb:
ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS);
- *bp = b0;
- return (B_TRUE);
+ return (b0);
+
+failed:
+ if (b0 != NULL)
+ mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
+
+ return (NULL);
}
-void
-mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp)
+mlxcx_buffer_t *
+mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
mlxcx_buffer_t *b;
mlxcx_buf_shard_t *s = wq->mlwq_bufs;
mutex_enter(&s->mlbs_mtx);
- while (list_is_empty(&s->mlbs_free))
- cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
- b = list_remove_head(&s->mlbs_free);
- ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
- b->mlb_state = MLXCX_BUFFER_ON_WQ;
- list_insert_tail(&s->mlbs_busy, b);
+ if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
+ ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
+ b->mlb_state = MLXCX_BUFFER_ON_WQ;
+ list_insert_tail(&s->mlbs_busy, b);
+ }
mutex_exit(&s->mlbs_mtx);
- *bp = b;
+ return (b);
}
-#define MLXCX_BUF_TAKE_N_TIMEOUT_USEC 5000
-#define MLXCX_BUF_TAKE_N_MAX_RETRIES 3
-
size_t
mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
mlxcx_buffer_t **bp, size_t nbufs)
{
mlxcx_buffer_t *b;
- size_t done = 0, empty = 0;
- clock_t wtime = drv_usectohz(MLXCX_BUF_TAKE_N_TIMEOUT_USEC);
+ size_t done = 0;
mlxcx_buf_shard_t *s;
s = wq->mlwq_bufs;
mutex_enter(&s->mlbs_mtx);
- while (done < nbufs) {
- while (list_is_empty(&s->mlbs_free)) {
- (void) cv_reltimedwait(&s->mlbs_free_nonempty,
- &s->mlbs_mtx, wtime, TR_MILLISEC);
- if (list_is_empty(&s->mlbs_free) &&
- empty++ >= MLXCX_BUF_TAKE_N_MAX_RETRIES) {
- mutex_exit(&s->mlbs_mtx);
- return (done);
- }
- }
- b = list_remove_head(&s->mlbs_free);
+ while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
b->mlb_state = MLXCX_BUFFER_ON_WQ;
list_insert_tail(&s->mlbs_busy, b);
@@ -2187,13 +2372,26 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
ASSERT3P(b->mlb_mlx, ==, mlxp);
+
+ /*
+ * The mlbs_mtx held below is a heavily contended lock, so it is
+ * imperative we do as much of the buffer clean up outside the lock
+ * as is possible.
+ */
b->mlb_state = MLXCX_BUFFER_FREE;
b->mlb_wqe_index = 0;
b->mlb_tx_head = NULL;
b->mlb_tx_mp = NULL;
b->mlb_used = 0;
+ b->mlb_wqebbs = 0;
ASSERT(list_is_empty(&b->mlb_tx_chain));
+ if (b->mlb_foreign) {
+ if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
+ mlxcx_dma_unbind(mlxp, &b->mlb_dma);
+ }
+ }
+
mutex_enter(&s->mlbs_mtx);
switch (oldstate) {
case MLXCX_BUFFER_INIT:
@@ -2215,12 +2413,6 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
break;
}
- if (b->mlb_foreign) {
- if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
- mlxcx_dma_unbind(mlxp, &b->mlb_dma);
- }
- }
-
list_insert_tail(&s->mlbs_free, b);
cv_signal(&s->mlbs_free_nonempty);