author     Paul Winder <pwinder@racktopsystems.com>   2020-03-09 13:16:05 +0000
committer  Paul Winder <paul@winders.demon.co.uk>     2020-04-14 16:40:07 +0100
commit     22d052287ba7ed169757650e2eec25fedbae163a (patch)
tree       cc05c04281562815d8c52d8e2d7f3023d10f3a9f /usr/src
parent     63878f749f68d1c188363e0e7a36e7b7e855dff2 (diff)
12383 Slow down and lock up in mlxcx receive interrupt path
12438 mlxcx should pass receive messages to mac layer more frequently
12439 mlxcx send rings can overflow
12440 mlxcx should not block in the send path
12441 mlxcx default queue sizes are a bit on the small size
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Andy Stormont <astormont@racktopsystems.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/man/man7d/mlxcx.7d               |  39
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.c      |  75
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.conf   |  20
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.h      |  56
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_gld.c  |  82
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_intr.c | 353
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_reg.h  |  23
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_ring.c | 380
8 files changed, 706 insertions, 322 deletions
diff --git a/usr/src/man/man7d/mlxcx.7d b/usr/src/man/man7d/mlxcx.7d
index 5373b5bec5..d7b0cf8ad9 100644
--- a/usr/src/man/man7d/mlxcx.7d
+++ b/usr/src/man/man7d/mlxcx.7d
@@ -11,7 +11,7 @@
.\"
.\" Copyright 2020 the University of Queensland
.\"
-.Dd January 17, 2020
+.Dd April 9, 2020
.Dt MLXCX 7D
.Os
.Sh NAME
@@ -94,8 +94,11 @@ property determines the number of entries on Completion Queues for the device.
The number of entries is calculated as
.Li (1 << cq_size_shift) ,
so a value of 9 would mean 512 entries are created on each Event Queue.
-The default value is
-.Sy 10 .
+The default value is device dependent,
+.Sy 10
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 12
+for devices with higher supported speeds.
This should be kept very close to the value set for
.Sy rq_size_shift
and
@@ -116,8 +119,11 @@ The number of descriptors is calculated as
.Dv (1 << rq_size_shift) ,
so a value of 9 would mean 512 descriptors are created on each Receive Queue.
This sets the number of packets on RX rings advertised to MAC.
-The default value is
-.Sy 10 .
+The default value is device dependent,
+.Sy 10
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 12
+for devices with higher supported speeds.
.Ed
.It Sy sq_size_shift
.Bd -filled -compact
@@ -134,8 +140,11 @@ The number of descriptors is calculated as
.Dv (1 << sq_size_shift) ,
so a value of 9 would mean 512 descriptors are created on each Send Queue.
This sets the number of packets on RX rings advertised to MAC.
-The default value is
-.Sy 11 .
+The default value is device dependent,
+.Sy 11
+for devices with maximum supported speed of 10Gb/s or less and
+.Sy 13
+for devices with higher supported speeds.
Note that large packets often occupy more than one descriptor slot on the SQ,
so it is sometimes a good idea to increase this if using a large MTU.
.Ed
@@ -325,6 +334,22 @@ is seldom worth using them for small packets.
The default value is
.Sy 2048 .
.Ed
+.It Sy rx_limit_per_completion
+.Bd -filled -compact
+Minimum:
+.Sy 16 |
+Maximum:
+.Sy 4096
+.Ed
+.Bd -filled
+The
+.Sy rx_limit_per_completion
+property determines the maximum number of packets that
+will be processed on a given completion ring during a single interrupt.
+This is done to try and guarantee some amount of liveness in the system.
+The default value is
+.Sy 256 .
+.Ed
.El
.Sh FILES
.Bl -tag -width Pa
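
For illustration, the entry-count arithmetic the manual page describes above is a plain left shift. A minimal standalone sketch (not part of the driver) showing what the old and new completion queue defaults work out to:

	/* Hypothetical illustration of the (1 << *_size_shift) sizing. */
	#include <stdio.h>

	int
	main(void)
	{
		unsigned int shift_10g = 10;	/* default for devices up to 10Gb/s */
		unsigned int shift_25g = 12;	/* default for faster devices */

		printf("CQ entries, 10Gb/s default: %u\n", 1U << shift_10g);	/* 1024 */
		printf("CQ entries, 25Gb/s+ default: %u\n", 1U << shift_25g);	/* 4096 */
		return (0);
	}
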
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c
index 12a8d52b3f..c90fa0969b 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.c
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -453,23 +454,68 @@ uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
static void
-mlxcx_load_props(mlxcx_t *mlxp)
+mlxcx_load_prop_defaults(mlxcx_t *mlxp)
{
mlxcx_drv_props_t *p = &mlxp->mlx_props;
+ mlxcx_port_t *port = &mlxp->mlx_ports[0];
+
+ VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
+ VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
+
+ /*
+ * Currently we have different queue size defaults for two
+ * categories of queues. One set for devices which support a
+ * maximum speed of 10Gb/s, and another for those above that.
+ */
+ if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
+ MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) {
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
+ } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
+ MLXCX_PROTO_10G)) != 0) {
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
+ } else {
+ mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
+ "recognize. Proto: 0x%x", port->mlp_max_proto);
+ p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
+ p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
+ p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
+ }
+}
+
+/*
+ * Properties which may have different defaults based on hardware
+ * characteristics.
+ */
+static void
+mlxcx_load_model_props(mlxcx_t *mlxp)
+{
+ mlxcx_drv_props_t *p = &mlxp->mlx_props;
+
+ mlxcx_load_prop_defaults(mlxp);
- p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
- DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
- MLXCX_EQ_SIZE_SHIFT_DFLT);
p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
- MLXCX_CQ_SIZE_SHIFT_DFLT);
+ p->mldp_cq_size_shift_default);
p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
- MLXCX_SQ_SIZE_SHIFT_DFLT);
+ p->mldp_sq_size_shift_default);
p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
- MLXCX_RQ_SIZE_SHIFT_DFLT);
+ p->mldp_rq_size_shift_default);
+}
+
+static void
+mlxcx_load_props(mlxcx_t *mlxp)
+{
+ mlxcx_drv_props_t *p = &mlxp->mlx_props;
+ p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
+ DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
+ MLXCX_EQ_SIZE_SHIFT_DFLT);
p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
MLXCX_CQEMOD_PERIOD_USEC_DFLT);
@@ -521,6 +567,19 @@ mlxcx_load_props(mlxcx_t *mlxp)
p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
"wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
+
+ p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
+ DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
+ MLXCX_RX_PER_CQ_DEFAULT);
+
+ if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
+ p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
+ mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
+ "out of range. Defaulting to: %d. Valid values are from "
+ "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
+ MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
+ p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
+ }
}
void
@@ -2595,6 +2654,8 @@ mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
}
mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
+ mlxcx_load_model_props(mlxp);
+
/*
* Set up, enable and arm the rest of the interrupt EQs which will
* service events from CQs.
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.conf b/usr/src/uts/common/io/mlxcx/mlxcx.conf
index 3569c4e5f5..321820a47b 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.conf
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.conf
@@ -12,6 +12,7 @@
#
# Copyright 2018, Joyent, Inc.
# Copyright 2020, The University of Queensland
+# Copyright 2020 RackTop Systems, Inc.
#
#
@@ -23,10 +24,15 @@
# Sizing of event and completion queues.
#
# The number of entries on each queue will be (1 << *_size_shift) -- so
-# a value of 9 would mean 512 entries.
+# a value of 10 would mean 1024 entries.
#
#eq_size_shift = 9;
+
+# The default for devices with a maximum supported speed up to 10Gb/s
#cq_size_shift = 10;
+#
+# The default for devices with a maximum supported speed above 10Gb/s
+#cq_size_shift = 12;
#
# Sizing of send and receive queues.
@@ -35,8 +41,13 @@
# advertise to MAC. It also determines how many packet buffers we will allocate
# when starting the interface.
#
+# The defaults for devices with a maximum supported speed up to 10Gb/s
#sq_size_shift = 11;
#rq_size_shift = 10;
+#
+# The defaults for devices with a maximum supported speed above 10Gb/s
+#sq_size_shift = 13;
+#rq_size_shift = 12;
#
# Number and configuration of TX groups and rings.
@@ -99,3 +110,10 @@
#eq_check_interval_sec = 30;
#cq_check_interval_sec = 300;
#wq_check_interval_sec = 300;
+
+#
+# To provide some level of moderation and aid latencies, after
+# "rx_limit_per_completion" packets are received in a single completion
+# event, the interrupt handler will pass the chain up the receive stack.
+#
+#rx_limit_per_completion = 256;
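
For illustration, a site running a 10Gb/s-only device that wanted the larger queue sizes and a lower per-interrupt receive limit could uncomment and adjust these properties in mlxcx.conf (example values only; .conf changes take effect the next time the driver attaches):

	cq_size_shift = 12;
	rq_size_shift = 12;
	sq_size_shift = 13;
	rx_limit_per_completion = 128;
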
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h
index 3b58989961..bf07691095 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.h
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -32,6 +33,7 @@
#include <sys/ddifm.h>
#include <sys/id_space.h>
#include <sys/list.h>
+#include <sys/taskq_impl.h>
#include <sys/stddef.h>
#include <sys/stream.h>
#include <sys/strsun.h>
@@ -89,18 +91,36 @@ extern "C" {
* Queues will be sized to (1 << *Q_SIZE_SHIFT) entries long.
*/
#define MLXCX_EQ_SIZE_SHIFT_DFLT 9
+
+/*
+ * The CQ, SQ and RQ sizes can effect throughput on higher speed interfaces.
+ * EQ less so, as it only takes a single EQ entry to indicate there are
+ * multiple completions on the CQ.
+ *
+ * Particularly on the Rx side, the RQ (and corresponding CQ) would run
+ * low on available entries. A symptom of this is the refill taskq running
+ * frequently. A larger RQ (and CQ) alleviates this, and as there is a
+ * close relationship between SQ and CQ size, the SQ is increased too.
+ */
#define MLXCX_CQ_SIZE_SHIFT_DFLT 10
+#define MLXCX_CQ_SIZE_SHIFT_25G 12
/*
* Default to making SQs bigger than RQs for 9k MTU, since most packets will
* spill over into more than one slot. RQ WQEs are always 1 slot.
*/
#define MLXCX_SQ_SIZE_SHIFT_DFLT 11
+#define MLXCX_SQ_SIZE_SHIFT_25G 13
+
#define MLXCX_RQ_SIZE_SHIFT_DFLT 10
+#define MLXCX_RQ_SIZE_SHIFT_25G 12
#define MLXCX_CQ_HWM_GAP 16
#define MLXCX_CQ_LWM_GAP 24
+#define MLXCX_WQ_HWM_GAP MLXCX_CQ_HWM_GAP
+#define MLXCX_WQ_LWM_GAP MLXCX_CQ_LWM_GAP
+
#define MLXCX_RQ_REFILL_STEP 64
/*
@@ -135,6 +155,14 @@ extern "C" {
#define MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT 300
#define MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT 30
+/*
+ * After this many packets, the packets received so far are passed to
+ * the mac layer.
+ */
+#define MLXCX_RX_PER_CQ_DEFAULT 256
+#define MLXCX_RX_PER_CQ_MIN 16
+#define MLXCX_RX_PER_CQ_MAX 4096
+
#define MLXCX_DOORBELL_TRIES_DFLT 3
extern uint_t mlxcx_doorbell_tries;
@@ -417,6 +445,11 @@ typedef struct mlxcx_buffer {
size_t mlb_used;
mblk_t *mlb_tx_mp;
+ /*
+ * The number of work queue basic blocks this buf uses.
+ */
+ uint_t mlb_wqebbs;
+
mlxcx_t *mlb_mlx;
mlxcx_buffer_state_t mlb_state;
uint_t mlb_wqe_index;
@@ -495,6 +528,8 @@ typedef enum {
MLXCX_WQ_DESTROYED = 1 << 3,
MLXCX_WQ_TEARDOWN = 1 << 4,
MLXCX_WQ_BUFFERS = 1 << 5,
+ MLXCX_WQ_REFILLING = 1 << 6,
+ MLXCX_WQ_BLOCKED_MAC = 1 << 7
} mlxcx_workq_state_t;
typedef enum {
@@ -540,12 +575,18 @@ struct mlxcx_work_queue {
};
uint64_t mlwq_pc; /* producer counter */
+ uint64_t mlwq_wqebb_used;
+ size_t mlwq_bufhwm;
+ size_t mlwq_buflwm;
+
mlxcx_dma_buffer_t mlwq_doorbell_dma;
mlxcx_workq_doorbell_t *mlwq_doorbell;
mlxcx_buf_shard_t *mlwq_bufs;
mlxcx_buf_shard_t *mlwq_foreign_bufs;
+ taskq_ent_t mlwq_tqe;
+
boolean_t mlwq_fm_repd_qstate;
};
@@ -773,6 +814,8 @@ struct mlxcx_ring_group {
mlxcx_flow_group_t *mlg_rx_vlan_promisc_fg;
list_t mlg_rx_vlans;
+ taskq_t *mlg_refill_tq;
+
/*
* Flow table for separating out by protocol before hashing
*/
@@ -856,8 +899,11 @@ typedef struct {
typedef struct {
uint_t mldp_eq_size_shift;
uint_t mldp_cq_size_shift;
+ uint_t mldp_cq_size_shift_default;
uint_t mldp_rq_size_shift;
+ uint_t mldp_rq_size_shift_default;
uint_t mldp_sq_size_shift;
+ uint_t mldp_sq_size_shift_default;
uint_t mldp_cqemod_period_usec;
uint_t mldp_cqemod_count;
uint_t mldp_intrmod_period_usec;
@@ -865,6 +911,7 @@ typedef struct {
uint_t mldp_rx_ngroups_small;
uint_t mldp_rx_nrings_per_large_group;
uint_t mldp_rx_nrings_per_small_group;
+ uint_t mldp_rx_per_cq;
uint_t mldp_tx_ngroups;
uint_t mldp_tx_nrings_per_group;
uint_t mldp_ftbl_root_size_shift;
@@ -1098,6 +1145,7 @@ extern boolean_t mlxcx_intr_setup(mlxcx_t *);
extern void mlxcx_intr_teardown(mlxcx_t *);
extern void mlxcx_arm_eq(mlxcx_t *, mlxcx_event_queue_t *);
extern void mlxcx_arm_cq(mlxcx_t *, mlxcx_completion_queue_t *);
+extern void mlxcx_update_cqci(mlxcx_t *, mlxcx_completion_queue_t *);
extern mblk_t *mlxcx_rx_poll(mlxcx_t *, mlxcx_completion_queue_t *, size_t);
@@ -1109,8 +1157,6 @@ extern boolean_t mlxcx_register_mac(mlxcx_t *);
/*
* From mlxcx_ring.c
*/
-extern boolean_t mlxcx_cq_alloc_dma(mlxcx_t *, mlxcx_completion_queue_t *);
-extern void mlxcx_cq_rele_dma(mlxcx_t *, mlxcx_completion_queue_t *);
extern boolean_t mlxcx_wq_alloc_dma(mlxcx_t *, mlxcx_work_queue_t *);
extern void mlxcx_wq_rele_dma(mlxcx_t *, mlxcx_work_queue_t *);
@@ -1118,7 +1164,7 @@ extern boolean_t mlxcx_buf_create(mlxcx_t *, mlxcx_buf_shard_t *,
mlxcx_buffer_t **);
extern boolean_t mlxcx_buf_create_foreign(mlxcx_t *, mlxcx_buf_shard_t *,
mlxcx_buffer_t **);
-extern void mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **);
+extern mlxcx_buffer_t *mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *);
extern size_t mlxcx_buf_take_n(mlxcx_t *, mlxcx_work_queue_t *,
mlxcx_buffer_t **, size_t);
extern boolean_t mlxcx_buf_loan(mlxcx_t *, mlxcx_buffer_t *);
@@ -1126,8 +1172,8 @@ extern void mlxcx_buf_return(mlxcx_t *, mlxcx_buffer_t *);
extern void mlxcx_buf_return_chain(mlxcx_t *, mlxcx_buffer_t *, boolean_t);
extern void mlxcx_buf_destroy(mlxcx_t *, mlxcx_buffer_t *);
-extern boolean_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *,
- mblk_t *, size_t, mlxcx_buffer_t **);
+extern mlxcx_buffer_t *mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *,
+ mblk_t *, size_t);
extern boolean_t mlxcx_rx_group_setup(mlxcx_t *, mlxcx_ring_group_t *);
extern boolean_t mlxcx_tx_group_setup(mlxcx_t *, mlxcx_ring_group_t *);
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
index 7b01702376..a1d50659c1 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
@@ -430,15 +430,10 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
}
}
- if (!mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b)) {
- /*
- * Something went really wrong, and we probably will never be
- * able to TX again (all our buffers are broken and DMA is
- * failing). Drop the packet on the floor -- FMA should be
- * reporting this error elsewhere.
- */
- freemsg(mp);
- return (NULL);
+ b = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take);
+ if (b == NULL) {
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ return (mp);
}
mutex_enter(&sq->mlwq_mtx);
@@ -467,18 +462,20 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
*/
if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm) {
atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
- mutex_exit(&sq->mlwq_mtx);
- mlxcx_buf_return_chain(mlxp, b, B_TRUE);
- return (mp);
+ goto blocked;
+ }
+
+ if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) {
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ goto blocked;
}
ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen,
chkflags, b);
if (!ok) {
atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
- mutex_exit(&sq->mlwq_mtx);
- mlxcx_buf_return_chain(mlxp, b, B_TRUE);
- return (mp);
+ atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
+ goto blocked;
}
/*
@@ -493,6 +490,11 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
mutex_exit(&sq->mlwq_mtx);
return (NULL);
+
+blocked:
+ mutex_exit(&sq->mlwq_mtx);
+ mlxcx_buf_return_chain(mlxp, b, B_TRUE);
+ return (mp);
}
static int
@@ -862,9 +864,8 @@ mlxcx_mac_ring_intr_disable(mac_intr_handle_t intrh)
{
mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh;
- atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING);
mutex_enter(&cq->mlcq_mtx);
- VERIFY(cq->mlcq_state & MLXCX_CQ_POLLING);
+ atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING);
mutex_exit(&cq->mlcq_mtx);
return (0);
@@ -1061,56 +1062,43 @@ mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
case MAC_PROP_EN_100GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 |
- MLXCX_PROTO_100GBASE_KR4)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_100G) != 0);
break;
case MAC_PROP_ADV_50GFDX_CAP:
case MAC_PROP_EN_50GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 |
- MLXCX_PROTO_50GBASE_SR2)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_50G) != 0);
break;
case MAC_PROP_ADV_40GFDX_CAP:
case MAC_PROP_EN_40GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 |
- MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4))
- != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_40G) != 0);
break;
case MAC_PROP_ADV_25GFDX_CAP:
case MAC_PROP_EN_25GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR |
- MLXCX_PROTO_25GBASE_SR)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_25G) != 0);
break;
case MAC_PROP_ADV_10GFDX_CAP:
case MAC_PROP_EN_10GFDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto &
- (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 |
- MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR |
- MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_10G) != 0);
break;
case MAC_PROP_ADV_1000FDX_CAP:
case MAC_PROP_EN_1000FDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto & (MLXCX_PROTO_1000BASE_KX |
- MLXCX_PROTO_SGMII)) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_1G) != 0);
break;
case MAC_PROP_ADV_100FDX_CAP:
case MAC_PROP_EN_100FDX_CAP:
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
mac_prop_info_set_default_uint8(prh,
- (port->mlp_oper_proto & MLXCX_PROTO_SGMII_100BASE) != 0);
+ (port->mlp_oper_proto & MLXCX_PROTO_100M) != 0);
break;
default:
break;
@@ -1252,8 +1240,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 |
- MLXCX_PROTO_100GBASE_KR4)) != 0;
+ MLXCX_PROTO_100G) != 0;
break;
case MAC_PROP_ADV_50GFDX_CAP:
case MAC_PROP_EN_50GFDX_CAP:
@@ -1262,8 +1249,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 |
- MLXCX_PROTO_50GBASE_SR2)) != 0;
+ MLXCX_PROTO_50G) != 0;
break;
case MAC_PROP_ADV_40GFDX_CAP:
case MAC_PROP_EN_40GFDX_CAP:
@@ -1272,8 +1258,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 |
- MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4)) != 0;
+ MLXCX_PROTO_40G) != 0;
break;
case MAC_PROP_ADV_25GFDX_CAP:
case MAC_PROP_EN_25GFDX_CAP:
@@ -1282,8 +1267,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR |
- MLXCX_PROTO_25GBASE_SR)) != 0;
+ MLXCX_PROTO_25G) != 0;
break;
case MAC_PROP_ADV_10GFDX_CAP:
case MAC_PROP_EN_10GFDX_CAP:
@@ -1292,9 +1276,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 |
- MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR |
- MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0;
+ MLXCX_PROTO_10G) != 0;
break;
case MAC_PROP_ADV_1000FDX_CAP:
case MAC_PROP_EN_1000FDX_CAP:
@@ -1303,7 +1285,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII)) != 0;
+ MLXCX_PROTO_1G) != 0;
break;
case MAC_PROP_ADV_100FDX_CAP:
case MAC_PROP_EN_100FDX_CAP:
@@ -1312,7 +1294,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
}
*(uint8_t *)pr_val = (port->mlp_max_proto &
- MLXCX_PROTO_SGMII_100BASE) != 0;
+ MLXCX_PROTO_100M) != 0;
break;
default:
ret = ENOTSUP;
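
Taken together with the new mlwq_bufhwm, mlwq_buflwm and mlwq_wqebb_used fields, the mlxcx_gld.c changes above add send-side flow control against the work queue as well as the completion queue. A condensed sketch of the pattern (names match the driver, but the logic is simplified and not a drop-in excerpt):

	/* Transmit path, holding the SQ mutex. */
	if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) {
		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
		mutex_exit(&sq->mlwq_mtx);
		return (mp);		/* hand the mblk back so MAC retries later */
	}

	/* Completion interrupt, after entries have been reclaimed. */
	if ((sq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
	    sq->mlwq_wqebb_used < sq->mlwq_buflwm) {
		atomic_and_uint(&sq->mlwq_state, ~MLXCX_WQ_BLOCKED_MAC);
		mac_tx_ring_update(mlxp->mlx_mac_hdl, cq->mlcq_mac_hdl);
	}
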
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
index 0516f86d6b..4dc4291b08 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
@@ -11,6 +11,7 @@
/*
* Copyright (c) 2020, the University of Queensland
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -26,6 +27,11 @@
#include <mlxcx.h>
+/*
+ * CTASSERT(s) to cover bad values which would induce bugs.
+ */
+CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);
+
void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
@@ -190,6 +196,31 @@ mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
}
void
+mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
+{
+ ddi_fm_error_t err;
+ uint_t try = 0;
+
+ mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
+
+retry:
+ MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
+ ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err,
+ DDI_FME_VERSION);
+ if (err.fme_status != DDI_FM_OK) {
+ if (try++ < mlxcx_doorbell_tries) {
+ ddi_fm_dma_err_clear(
+ mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
+ DDI_FME_VERSION);
+ goto retry;
+ } else {
+ ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
+ return;
+ }
+ }
+}
+
+void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
bits32_t dbval = new_bits32();
@@ -538,14 +569,15 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
!(mleq->mleq_state & MLXCX_EQ_CREATED) ||
(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
- mlxcx_warn(mlxp, "int0 on bad eq state");
+ mlxcx_warn(mlxp, "int %d on bad eq state",
+ mleq->mleq_intr_index);
mutex_exit(&mleq->mleq_mtx);
return (DDI_INTR_UNCLAIMED);
}
ent = mlxcx_eq_next(mleq);
if (ent == NULL) {
- mlxcx_warn(mlxp, "spurious int 0?");
+ mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index);
mutex_exit(&mleq->mleq_mtx);
return (DDI_INTR_UNCLAIMED);
}
@@ -574,8 +606,8 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
break;
default:
- mlxcx_warn(mlxp, "unhandled event 0x%x on int0",
- ent->mleqe_event_type);
+ mlxcx_warn(mlxp, "unhandled event 0x%x on int %d",
+ ent->mleqe_event_type, mleq->mleq_intr_index);
}
}
@@ -591,46 +623,56 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2)
return (DDI_INTR_CLAIMED);
}
-mblk_t *
-mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
+static boolean_t
+mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
+ size_t bytelim)
{
- mlxcx_buffer_t *buf;
- mblk_t *mp, *cmp, *nmp;
+ mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
mlxcx_completionq_ent_t *cent;
+ mblk_t *mp, *cmp, *nmp;
+ mlxcx_buffer_t *buf;
+ boolean_t found, added;
size_t bytes = 0;
- boolean_t found;
-
- ASSERT(mutex_owned(&mlcq->mlcq_mtx));
+ uint_t rx_frames = 0;
+ uint_t comp_cnt = 0;
+ int64_t wqebbs, bufcnt;
- ASSERT(mlcq->mlcq_wq != NULL);
- ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
+ *mpp = NULL;
if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
!(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
(mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
(mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
- return (NULL);
+ return (B_FALSE);
}
- ASSERT(mlcq->mlcq_state & MLXCX_CQ_POLLING);
-
nmp = cmp = mp = NULL;
- cent = mlxcx_cq_next(mlcq);
- for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) {
+ wqebbs = 0;
+ bufcnt = 0;
+ for (cent = mlxcx_cq_next(mlcq); cent != NULL;
+ cent = mlxcx_cq_next(mlcq)) {
/*
* Teardown and ring stop can atomic_or this flag
* into our state if they want us to stop early.
*/
if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
- break;
+ return (B_FALSE);
+ comp_cnt++;
if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
/* NOP */
+ atomic_dec_64(&wq->mlwq_wqebb_used);
goto nextcq;
}
+lookagain:
+ /*
+ * Generally the buffer we're looking for will be
+ * at the front of the list, so this loop won't
+ * need to look far.
+ */
buf = list_head(&mlcq->mlcq_buffers);
found = B_FALSE;
while (buf != NULL) {
@@ -641,36 +683,118 @@ mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
}
buf = list_next(&mlcq->mlcq_buffers, buf);
}
+
if (!found) {
+ /*
+ * If there's any buffers waiting on the
+ * buffers_b list, then merge those into
+ * the main list and have another look.
+ *
+ * The wq enqueue routines push new buffers
+ * into buffers_b so that they can avoid
+ * taking the mlcq_mtx and blocking us for
+ * every single packet.
+ */
+ added = B_FALSE;
+ mutex_enter(&mlcq->mlcq_bufbmtx);
+ if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
+ list_move_tail(&mlcq->mlcq_buffers,
+ &mlcq->mlcq_buffers_b);
+ added = B_TRUE;
+ }
+ mutex_exit(&mlcq->mlcq_bufbmtx);
+ if (added)
+ goto lookagain;
+
buf = list_head(&mlcq->mlcq_buffers);
mlxcx_warn(mlxp, "got completion on CQ %x but "
"no buffer matching wqe found: %x (first "
"buffer counter = %x)", mlcq->mlcq_num,
from_be16(cent->mlcqe_wqe_counter),
- buf == NULL ? UINT32_MAX : buf->mlb_wqe_index);
+ buf == NULL ? UINT32_MAX :
+ buf->mlb_wqe_index);
mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
goto nextcq;
}
+
+ /*
+ * The buf is likely to be freed below, count this now.
+ */
+ wqebbs += buf->mlb_wqebbs;
+
list_remove(&mlcq->mlcq_buffers, buf);
- atomic_dec_64(&mlcq->mlcq_bufcnt);
+ bufcnt++;
- nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
- if (nmp != NULL) {
+ switch (mlcq->mlcq_wq->mlwq_type) {
+ case MLXCX_WQ_TYPE_SENDQ:
+ mlxcx_tx_completion(mlxp, mlcq, cent, buf);
+ break;
+ case MLXCX_WQ_TYPE_RECVQ:
+ nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
bytes += from_be32(cent->mlcqe_byte_cnt);
- if (cmp != NULL) {
- cmp->b_next = nmp;
- cmp = nmp;
- } else {
- mp = cmp = nmp;
+ if (nmp != NULL) {
+ if (cmp != NULL) {
+ cmp->b_next = nmp;
+ cmp = nmp;
+ } else {
+ mp = cmp = nmp;
+ }
+
+ rx_frames++;
}
+ break;
}
-nextcq:
- mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
- if (bytelim != 0 && bytes > bytelim)
+ /*
+ * Update the consumer index with what has been processed,
+ * followed by driver counters. It is important to tell the
+ * hardware first, otherwise when we throw more packets at
+ * it, it may get an overflow error.
+ * We do this whenever we've processed enough to bridge the
+ * high->low water mark.
+ */
+ if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
+ mlxcx_update_cqci(mlxp, mlcq);
+ /*
+ * Both these variables are incremented using
+ * atomics as they are modified in other code paths
+ * (Eg during tx) which hold different locks.
+ */
+ atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
+ atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
+ wqebbs = 0;
+ bufcnt = 0;
+ comp_cnt = 0;
+ }
+nextcq:
+ if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
+ (bytelim != 0 && bytes > bytelim))
break;
}
+ if (comp_cnt > 0) {
+ mlxcx_update_cqci(mlxp, mlcq);
+ atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
+ atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
+ }
+
+ *mpp = mp;
+ return (B_TRUE);
+}
+
+
+mblk_t *
+mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
+{
+ mblk_t *mp = NULL;
+
+ ASSERT(mutex_owned(&mlcq->mlcq_mtx));
+
+ ASSERT(mlcq->mlcq_wq != NULL);
+ ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);
+
+ (void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim);
+
return (mp);
}
@@ -680,11 +804,10 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
mlxcx_t *mlxp = (mlxcx_t *)arg;
mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
mlxcx_eventq_ent_t *ent;
- mlxcx_completionq_ent_t *cent;
mlxcx_completion_queue_t *mlcq, probe;
- mlxcx_buffer_t *buf;
- mblk_t *mp, *cmp, *nmp;
- boolean_t found, tellmac = B_FALSE, added;
+ mlxcx_work_queue_t *mlwq;
+ mblk_t *mp = NULL;
+ boolean_t tellmac = B_FALSE;
mutex_enter(&mleq->mleq_mtx);
@@ -729,10 +852,12 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
if (mlcq == NULL)
continue;
+ mlwq = mlcq->mlcq_wq;
+
/*
* The polling function might have the mutex and stop us from
- * getting the lock here, so we increment the event counter
- * atomically from outside.
+ * getting the lock in mlxcx_process_cq(), so we increment
+ * the event counter atomically from outside.
*
* This way at the end of polling when we go back to interrupts
* from this CQ, the event counter is still correct.
@@ -746,145 +871,57 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2)
if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
/*
- * If we failed to take the mutex because the polling
- * function has it, just move on. We don't want to
- * block other CQs behind this one.
+ * If we failed to take the mutex because the
+ * polling function has it, just move on.
+ * We don't want to block other CQs behind
+ * this one.
*/
if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
- continue;
+ goto update_eq;
+
/* Otherwise we will wait. */
mutex_enter(&mlcq->mlcq_mtx);
}
- if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
- !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
- (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
- (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) ||
- (mlcq->mlcq_state & MLXCX_CQ_POLLING)) {
- mutex_exit(&mlcq->mlcq_mtx);
- continue;
- }
-
- nmp = cmp = mp = NULL;
- tellmac = B_FALSE;
-
- cent = mlxcx_cq_next(mlcq);
- for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) {
+ if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
+ mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
/*
- * Teardown and ring stop can atomic_or this flag
- * into our state if they want us to stop early.
+ * The ring is not in polling mode and we processed
+ * some completion queue entries.
*/
- if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
- break;
- if (mlcq->mlcq_state & MLXCX_CQ_POLLING)
- break;
-
- if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
- cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
- /* NOP */
- goto nextcq;
+ if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
+ mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
+ atomic_and_uint(&mlcq->mlcq_state,
+ ~MLXCX_CQ_BLOCKED_MAC);
+ tellmac = B_TRUE;
}
-lookagain:
- /*
- * Generally the buffer we're looking for will be
- * at the front of the list, so this loop won't
- * need to look far.
- */
- buf = list_head(&mlcq->mlcq_buffers);
- found = B_FALSE;
- while (buf != NULL) {
- if ((buf->mlb_wqe_index & UINT16_MAX) ==
- from_be16(cent->mlcqe_wqe_counter)) {
- found = B_TRUE;
- break;
- }
- buf = list_next(&mlcq->mlcq_buffers, buf);
- }
- if (!found) {
- /*
- * If there's any buffers waiting on the
- * buffers_b list, then merge those into
- * the main list and have another look.
- *
- * The wq enqueue routines push new buffers
- * into buffers_b so that they can avoid
- * taking the mlcq_mtx and blocking us for
- * every single packet.
- */
- added = B_FALSE;
- mutex_enter(&mlcq->mlcq_bufbmtx);
- if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
- list_move_tail(&mlcq->mlcq_buffers,
- &mlcq->mlcq_buffers_b);
- added = B_TRUE;
- }
- mutex_exit(&mlcq->mlcq_bufbmtx);
- if (added)
- goto lookagain;
- }
- if (!found) {
- buf = list_head(&mlcq->mlcq_buffers);
- mlxcx_warn(mlxp, "got completion on CQ %x but "
- "no buffer matching wqe found: %x (first "
- "buffer counter = %x)", mlcq->mlcq_num,
- from_be16(cent->mlcqe_wqe_counter),
- buf == NULL ? UINT32_MAX :
- buf->mlb_wqe_index);
- mlxcx_fm_ereport(mlxp,
- DDI_FM_DEVICE_INVAL_STATE);
- goto nextcq;
+ if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
+ mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
+ atomic_and_uint(&mlwq->mlwq_state,
+ ~MLXCX_WQ_BLOCKED_MAC);
+ tellmac = B_TRUE;
}
- list_remove(&mlcq->mlcq_buffers, buf);
- atomic_dec_64(&mlcq->mlcq_bufcnt);
- switch (mlcq->mlcq_wq->mlwq_type) {
- case MLXCX_WQ_TYPE_SENDQ:
- mlxcx_tx_completion(mlxp, mlcq, cent, buf);
- break;
- case MLXCX_WQ_TYPE_RECVQ:
- nmp = mlxcx_rx_completion(mlxp, mlcq, cent,
- buf);
- if (nmp != NULL) {
- if (cmp != NULL) {
- cmp->b_next = nmp;
- cmp = nmp;
- } else {
- mp = cmp = nmp;
- }
- }
- break;
- }
+ mlxcx_arm_cq(mlxp, mlcq);
-nextcq:
- /*
- * Update the "doorbell" consumer counter for the queue
- * every time. Unlike a UAR write, this is relatively
- * cheap and doesn't require us to go out on the bus
- * straight away (since it's our memory).
- */
- mlcq->mlcq_doorbell->mlcqd_update_ci =
- to_be24(mlcq->mlcq_cc);
+ mutex_exit(&mlcq->mlcq_mtx);
- if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) &&
- mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
- mlcq->mlcq_state &= ~MLXCX_CQ_BLOCKED_MAC;
- tellmac = B_TRUE;
+ if (tellmac) {
+ mac_tx_ring_update(mlxp->mlx_mac_hdl,
+ mlcq->mlcq_mac_hdl);
+ tellmac = B_FALSE;
}
- }
- mlxcx_arm_cq(mlxp, mlcq);
- mutex_exit(&mlcq->mlcq_mtx);
-
- if (tellmac) {
- mac_tx_ring_update(mlxp->mlx_mac_hdl,
- mlcq->mlcq_mac_hdl);
- }
- if (mp != NULL) {
- mac_rx_ring(mlxp->mlx_mac_hdl, mlcq->mlcq_mac_hdl,
- mp, mlcq->mlcq_mac_gen);
+ if (mp != NULL) {
+ mac_rx_ring(mlxp->mlx_mac_hdl,
+ mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
+ }
+ } else {
+ mutex_exit(&mlcq->mlcq_mtx);
}
+update_eq:
/*
* Updating the consumer counter for an EQ requires a write
* to the UAR, which is possibly expensive.
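
The restructured receive path above batches its work: mlxcx_process_cq() collects completions into an mblk chain, periodically tells the hardware how far it has consumed, and stops after mldp_rx_per_cq frames (or the poll byte limit) so a busy ring cannot monopolise the interrupt. A condensed sketch of that loop (buffer matching and error handling omitted; not a drop-in excerpt):

	for (cent = mlxcx_cq_next(mlcq); cent != NULL;
	    cent = mlxcx_cq_next(mlcq)) {
		/* ... match the entry to a buffer and append to the chain ... */

		/*
		 * Once enough entries have been consumed to span the
		 * high-to-low water mark gap, update the consumer index
		 * so the hardware never sees the queue as overflowing.
		 */
		if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
			mlxcx_update_cqci(mlxp, mlcq);
			atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
			bufcnt = 0;
		}

		/* Hand the chain up once the per-interrupt limit is reached. */
		if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
		    (bytelim != 0 && bytes > bytelim))
			break;
	}
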
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
index 76d0da30e7..f65280d41d 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
#ifndef _MLXCX_REG_H
@@ -2259,6 +2260,28 @@ typedef enum {
MLXCX_PROTO_50GBASE_KR2 = 1UL << 31,
} mlxcx_eth_proto_t;
+#define MLXCX_PROTO_100M MLXCX_PROTO_SGMII_100BASE
+
+#define MLXCX_PROTO_1G (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII)
+
+#define MLXCX_PROTO_10G (MLXCX_PROTO_10GBASE_CX4 | \
+ MLXCX_PROTO_10GBASE_KX4 | MLXCX_PROTO_10GBASE_KR | \
+ MLXCX_PROTO_10GBASE_CR | MLXCX_PROTO_10GBASE_SR | \
+ MLXCX_PROTO_10GBASE_ER_LR)
+
+#define MLXCX_PROTO_25G (MLXCX_PROTO_25GBASE_CR | \
+ MLXCX_PROTO_25GBASE_KR | MLXCX_PROTO_25GBASE_SR)
+
+#define MLXCX_PROTO_40G (MLXCX_PROTO_40GBASE_SR4 | \
+ MLXCX_PROTO_40GBASE_LR4_ER4 | MLXCX_PROTO_40GBASE_CR4 | \
+ MLXCX_PROTO_40GBASE_KR4)
+
+#define MLXCX_PROTO_50G (MLXCX_PROTO_50GBASE_CR2 | \
+ MLXCX_PROTO_50GBASE_KR2 | MLXCX_PROTO_50GBASE_SR2)
+
+#define MLXCX_PROTO_100G (MLXCX_PROTO_100GBASE_CR4 | \
+ MLXCX_PROTO_100GBASE_SR4 | MLXCX_PROTO_100GBASE_KR4)
+
typedef enum {
MLXCX_AUTONEG_DISABLE_CAP = 1 << 5,
MLXCX_AUTONEG_DISABLE = 1 << 6
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
index 8337545b57..da609ed28c 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
@@ -12,6 +12,7 @@
/*
* Copyright 2020, The University of Queensland
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -113,8 +114,9 @@ mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
mlwq->mlwq_state &= ~MLXCX_CQ_ALLOC;
}
-boolean_t
-mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
+static boolean_t
+mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
+ uint_t ent_shift)
{
ddi_device_acc_attr_t acc;
ddi_dma_attr_t attr;
@@ -123,7 +125,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
VERIFY0(mlcq->mlcq_state & MLXCX_EQ_ALLOC);
- mlcq->mlcq_entshift = mlxp->mlx_props.mldp_cq_size_shift;
+ mlcq->mlcq_entshift = ent_shift;
mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift);
sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t);
ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
@@ -165,7 +167,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
return (B_TRUE);
}
-void
+static void
mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC);
@@ -331,7 +333,7 @@ mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
static boolean_t
mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
- mlxcx_completion_queue_t **cqp)
+ mlxcx_completion_queue_t **cqp, uint_t ent_shift)
{
mlxcx_completion_queue_t *cq;
@@ -350,7 +352,7 @@ mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq,
mutex_enter(&cq->mlcq_mtx);
- if (!mlxcx_cq_alloc_dma(mlxp, cq)) {
+ if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) {
mutex_exit(&cq->mlcq_mtx);
return (B_FALSE);
}
@@ -413,6 +415,9 @@ mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq,
return (B_FALSE);
}
+ wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
+ wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
+
mutex_exit(&wq->mlwq_mtx);
mutex_enter(&cq->mlcq_mtx);
@@ -459,6 +464,9 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
return (B_FALSE);
}
+ wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP;
+ wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP;
+
mutex_exit(&wq->mlwq_mtx);
mutex_enter(&cq->mlcq_mtx);
@@ -471,6 +479,35 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq,
return (B_TRUE);
}
+/*
+ * Before we tear down the queues associated with the rx group,
+ * flag each cq as being torn down and wake up any tasks.
+ */
+static void
+mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
+{
+ mlxcx_work_queue_t *wq;
+ mlxcx_completion_queue_t *cq;
+ mlxcx_buf_shard_t *s;
+ uint_t i;
+
+ mutex_enter(&g->mlg_mtx);
+
+ for (i = 0; i < g->mlg_nwqs; ++i) {
+ wq = &g->mlg_wqs[i];
+ cq = wq->mlwq_cq;
+ if (cq != NULL) {
+ s = wq->mlwq_bufs;
+ mutex_enter(&s->mlbs_mtx);
+ atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN);
+ cv_broadcast(&s->mlbs_free_nonempty);
+ mutex_exit(&s->mlbs_mtx);
+ }
+ }
+
+ mutex_exit(&g->mlg_mtx);
+}
+
void
mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
@@ -551,6 +588,7 @@ mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
mutex_exit(&wq->mlwq_mtx);
}
+ taskq_destroy(g->mlg_refill_tq);
g->mlg_state &= ~MLXCX_GROUP_RUNNING;
}
@@ -662,8 +700,16 @@ mlxcx_teardown_groups(mlxcx_t *mlxp)
if (!(g->mlg_state & MLXCX_GROUP_INIT))
continue;
ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX);
+ mlxcx_quiesce_rx_cqs(mlxp, g);
+ }
+
+ for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
+ g = &mlxp->mlx_rx_groups[i];
+ if (!(g->mlg_state & MLXCX_GROUP_INIT))
+ continue;
mlxcx_teardown_rx_group(mlxp, g);
}
+
kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size);
mlxp->mlx_rx_groups = NULL;
@@ -674,6 +720,7 @@ mlxcx_teardown_groups(mlxcx_t *mlxp)
ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX);
mlxcx_teardown_tx_group(mlxp, g);
}
+
kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size);
mlxp->mlx_tx_groups = NULL;
}
@@ -687,6 +734,7 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
mlxcx_flow_table_t *ft;
mlxcx_flow_group_t *fg;
mlxcx_flow_entry_t *fe;
+ uint_t ent_shift;
uint_t i, j;
ASSERT3S(g->mlg_state, ==, 0);
@@ -730,10 +778,18 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
}
- if (!mlxcx_cq_setup(mlxp, eq, &cq)) {
+ /*
+ * A single completion is indicated for each rq entry as
+ * it is used. So, the number of cq entries never needs
+ * to be larger than the rq.
+ */
+ ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift,
+ mlxp->mlx_props.mldp_rq_size_shift);
+ if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) {
g->mlg_nwqs = i;
break;
}
+
cq->mlcq_stats = &g->mlg_port->mlp_stats;
rq = &g->mlg_wqs[i];
@@ -1182,6 +1238,7 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
mlxcx_flow_table_t *ft;
mlxcx_flow_group_t *fg;
mlxcx_flow_entry_t *fe;
+ char tq_name[TASKQ_NAMELEN];
mutex_enter(&g->mlg_mtx);
@@ -1194,6 +1251,23 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
g->mlg_state |= MLXCX_GROUP_RUNNING;
+ snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld",
+ ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst,
+ g - &mlxp->mlx_rx_groups[0]);
+
+ /*
+ * Create one refill taskq per group with one thread per work queue.
+ * The refill task may block waiting for resources, so by effectively
+ * having one thread per work queue we avoid work queues blocking each
+ * other.
+ */
+ if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri,
+ g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) {
+ mlxcx_warn(mlxp, "failed to create rq refill task queue");
+ mutex_exit(&g->mlg_mtx);
+ return (B_FALSE);
+ }
+
if (g == &mlxp->mlx_rx_groups[0]) {
ft = g->mlg_port->mlp_rx_flow;
mutex_enter(&ft->mlft_mtx);
@@ -1207,6 +1281,8 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft;
if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
mutex_exit(&ft->mlft_mtx);
+ g->mlg_state &= ~MLXCX_GROUP_RUNNING;
+ taskq_destroy(g->mlg_refill_tq);
mutex_exit(&g->mlg_mtx);
return (B_FALSE);
}
@@ -1273,8 +1349,10 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
}
}
- if (!mlxcx_cq_setup(mlxp, eq, &cq))
+ if (!mlxcx_cq_setup(mlxp, eq, &cq,
+ mlxp->mlx_props.mldp_cq_size_shift))
return (B_FALSE);
+
cq->mlcq_stats = &g->mlg_port->mlp_stats;
sq = &g->mlg_wqs[i];
@@ -1409,6 +1487,11 @@ mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
ent0 = &mlwq->mlwq_send_ent[index];
start_pc = mlwq->mlwq_pc;
++mlwq->mlwq_pc;
+ /*
+ * This counter is manipulated in the interrupt handler, which
+ * does not hold the mlwq_mtx, hence the atomic.
+ */
+ atomic_inc_64(&mlwq->mlwq_wqebb_used);
bzero(ent0, sizeof (mlxcx_sendq_ent_t));
ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP;
@@ -1441,7 +1524,7 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags,
mlxcx_buffer_t *b0)
{
- uint_t index, first, ents = 0;
+ uint_t index, first, ents;
mlxcx_completion_queue_t *cq;
mlxcx_sendq_ent_t *ent0;
mlxcx_sendq_extra_ent_t *ent;
@@ -1449,8 +1532,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
uint_t ptri, nptr;
const ddi_dma_cookie_t *c;
size_t rem;
+ uint64_t wqebb_used;
mlxcx_buffer_t *b;
ddi_fm_error_t err;
+ boolean_t rv;
ASSERT(mutex_owned(&mlwq->mlwq_mtx));
ASSERT3P(b0->mlb_tx_head, ==, b0);
@@ -1460,16 +1545,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
ent0 = &mlwq->mlwq_send_ent[index];
b0->mlb_wqe_index = mlwq->mlwq_pc;
- ++mlwq->mlwq_pc;
- ++ents;
+ ents = 1;
first = index;
- mutex_enter(&cq->mlcq_bufbmtx);
- list_insert_tail(&cq->mlcq_buffers_b, b0);
- atomic_inc_64(&cq->mlcq_bufcnt);
- mutex_exit(&cq->mlcq_bufbmtx);
-
bzero(ent0, sizeof (mlxcx_sendq_ent_t));
ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND;
ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num);
@@ -1502,6 +1581,16 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM);
}
+ /*
+ * mlwq_wqebb_used is only incremented whilst holding
+ * the mlwq_mtx mutex, but it is decremented (atomically) in
+ * the interrupt context *not* under mlwq_mtx mutex.
+ * So, now take a snapshot of the number of used wqes which will
+	 * be a consistent maximum we can use whilst iterating through
+ * the buffers and DMA cookies.
+ */
+ wqebb_used = mlwq->mlwq_wqebb_used;
+
b = b0;
ptri = 0;
nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t);
@@ -1513,9 +1602,12 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
while (rem > 0 &&
(c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) {
if (ptri >= nptr) {
- index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
+ if ((ents + wqebb_used) >= mlwq->mlwq_nents)
+ return (B_FALSE);
+
+ index = (mlwq->mlwq_pc + ents) &
+ (mlwq->mlwq_nents - 1);
ent = &mlwq->mlwq_send_extra_ent[index];
- ++mlwq->mlwq_pc;
++ents;
seg = ent->mlsqe_data;
@@ -1548,6 +1640,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
}
}
+ b0->mlb_wqebbs = ents;
+ mlwq->mlwq_pc += ents;
+ atomic_add_64(&mlwq->mlwq_wqebb_used, ents);
+
for (; ptri < nptr; ++ptri, ++seg) {
seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY);
seg->mlds_byte_count = to_be32(0);
@@ -1566,10 +1662,24 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
if (err.fme_status != DDI_FM_OK) {
return (B_FALSE);
}
- if (!mlxcx_sq_ring_dbell(mlxp, mlwq, first)) {
- return (B_FALSE);
+
+ /*
+ * Hold the bufmtx whilst ringing the doorbell, to prevent
+ * the buffer from being moved to another list, so we can
+ * safely remove it should the ring fail.
+ */
+ mutex_enter(&cq->mlcq_bufbmtx);
+
+ list_insert_tail(&cq->mlcq_buffers_b, b0);
+ if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) {
+ atomic_inc_64(&cq->mlcq_bufcnt);
+ } else {
+ list_remove(&cq->mlcq_buffers_b, b0);
}
- return (B_TRUE);
+
+ mutex_exit(&cq->mlcq_bufbmtx);
+
+ return (rv);
}
boolean_t
@@ -1604,8 +1714,10 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1);
ent = &mlwq->mlwq_recv_ent[index];
buf->mlb_wqe_index = mlwq->mlwq_pc;
+ buf->mlb_wqebbs = 1;
++mlwq->mlwq_pc;
+ atomic_inc_64(&mlwq->mlwq_wqebb_used);
mutex_enter(&cq->mlcq_bufbmtx);
list_insert_tail(&cq->mlcq_buffers, buf);
@@ -1666,11 +1778,53 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
return (B_TRUE);
}
+static void
+mlxcx_rq_refill_task(void *arg)
+{
+ mlxcx_work_queue_t *wq = arg;
+ mlxcx_completion_queue_t *cq = wq->mlwq_cq;
+ mlxcx_t *mlxp = wq->mlwq_mlx;
+ mlxcx_buf_shard_t *s = wq->mlwq_bufs;
+ boolean_t refill;
+
+ do {
+ /*
+ * Wait until there are some free buffers.
+ */
+ mutex_enter(&s->mlbs_mtx);
+ while (list_is_empty(&s->mlbs_free) &&
+ (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0)
+ cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
+ mutex_exit(&s->mlbs_mtx);
+
+ mutex_enter(&cq->mlcq_mtx);
+ mutex_enter(&wq->mlwq_mtx);
+
+ if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) {
+ refill = B_FALSE;
+ wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
+ } else {
+ mlxcx_rq_refill(mlxp, wq);
+
+ if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) {
+ refill = B_TRUE;
+ } else {
+ refill = B_FALSE;
+ wq->mlwq_state &= ~MLXCX_WQ_REFILLING;
+ }
+ }
+
+ mutex_exit(&wq->mlwq_mtx);
+ mutex_exit(&cq->mlcq_mtx);
+ } while (refill);
+}
+
void
mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
{
size_t target, current, want, done, n;
mlxcx_completion_queue_t *cq;
+ mlxcx_ring_group_t *g;
mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP];
uint_t i;
@@ -1697,10 +1851,24 @@ mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq)
while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) {
n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP);
if (n == 0) {
- mlxcx_warn(mlxp, "!exiting rq refill early, done %u "
- "but wanted %u", done, want);
+ /*
+ * We didn't get any buffers from the free queue.
+ * It might not be an issue, schedule a taskq
+ * to wait for free buffers if the completion
+ * queue is low.
+ */
+ if (current < MLXCX_RQ_REFILL_STEP &&
+ (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) {
+ mlwq->mlwq_state |= MLXCX_WQ_REFILLING;
+ g = mlwq->mlwq_group;
+ taskq_dispatch_ent(g->mlg_refill_tq,
+ mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP,
+ &mlwq->mlwq_tqe);
+ }
+
return;
}
+
if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) {
for (i = 0; i < n; ++i)
mlxcx_buf_return(mlxp, b[i]);
@@ -1826,6 +1994,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf)
{
uint32_t chkflags = 0;
+ uint_t wqe_index;
ddi_fm_error_t err;
ASSERT(mutex_owned(&mlcq->mlcq_mtx));
@@ -1868,6 +2037,12 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
return (NULL);
}
+ /*
+ * mlxcx_buf_loan() will set mlb_wqe_index to zero.
+ * Remember it for later.
+ */
+ wqe_index = buf->mlb_wqe_index;
+
if (!mlxcx_buf_loan(mlxp, buf)) {
mlxcx_warn(mlxp, "!loan failed, dropping packet");
mlxcx_buf_return(mlxp, buf);
@@ -1894,7 +2069,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq,
* Don't check if a refill is needed on every single completion,
* since checking involves taking the RQ lock.
*/
- if ((buf->mlb_wqe_index & 0x7) == 0) {
+ if ((wqe_index & 0x7) == 0) {
mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
ASSERT(wq != NULL);
mutex_enter(&wq->mlwq_mtx);
@@ -1981,39 +2156,66 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard,
return (B_TRUE);
}
-static void
-mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
- mlxcx_buffer_t **bp)
+static mlxcx_buffer_t *
+mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
mlxcx_buffer_t *b;
mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs;
mutex_enter(&s->mlbs_mtx);
- while (list_is_empty(&s->mlbs_free))
- cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
- b = list_remove_head(&s->mlbs_free);
- ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
- ASSERT(b->mlb_foreign);
- b->mlb_state = MLXCX_BUFFER_ON_WQ;
- list_insert_tail(&s->mlbs_busy, b);
+ if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
+ ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
+ ASSERT(b->mlb_foreign);
+ b->mlb_state = MLXCX_BUFFER_ON_WQ;
+ list_insert_tail(&s->mlbs_busy, b);
+ }
mutex_exit(&s->mlbs_mtx);
- *bp = b;
+ return (b);
}
-boolean_t
+static mlxcx_buffer_t *
+mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz)
+{
+ ddi_fm_error_t err;
+ mlxcx_buffer_t *b;
+ uint_t attempts = 0;
+
+copyb:
+ if ((b = mlxcx_buf_take(mlxp, wq)) == NULL)
+ return (NULL);
+
+ ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
+ bcopy(rptr, b->mlb_dma.mxdb_va, sz);
+
+ MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
+
+ ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
+ DDI_FME_VERSION);
+ if (err.fme_status != DDI_FM_OK) {
+ ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
+ DDI_FME_VERSION);
+ mlxcx_buf_return(mlxp, b);
+ if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
+ return (NULL);
+ }
+ goto copyb;
+ }
+
+ return (b);
+}
+
+mlxcx_buffer_t *
mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
- mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
+ mblk_t *mpb, size_t off)
{
mlxcx_buffer_t *b, *b0 = NULL;
boolean_t first = B_TRUE;
- ddi_fm_error_t err;
mblk_t *mp;
uint8_t *rptr;
size_t sz;
size_t ncookies = 0;
boolean_t ret;
- uint_t attempts = 0;
for (mp = mpb; mp != NULL; mp = mp->b_cont) {
rptr = mp->b_rptr;
@@ -2024,31 +2226,24 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
rptr += off;
sz -= off;
- if (sz < mlxp->mlx_props.mldp_tx_bind_threshold)
- goto copyb;
-
- mlxcx_buf_take_foreign(mlxp, wq, &b);
- ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_FALSE);
+ if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
+ b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+ if (b == NULL)
+ goto failed;
+ } else {
+ b = mlxcx_buf_take_foreign(mlxp, wq);
+ if (b == NULL)
+ goto failed;
- if (!ret) {
- mlxcx_buf_return(mlxp, b);
+ ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
+ B_FALSE);
-copyb:
- mlxcx_buf_take(mlxp, wq, &b);
- ASSERT3U(b->mlb_dma.mxdb_len, >=, sz);
- bcopy(rptr, b->mlb_dma.mxdb_va, sz);
- MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV);
- ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err,
- DDI_FME_VERSION);
- if (err.fme_status != DDI_FM_OK) {
- ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle,
- DDI_FME_VERSION);
+ if (!ret) {
mlxcx_buf_return(mlxp, b);
- if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) {
- *bp = NULL;
- return (B_FALSE);
- }
- goto copyb;
+
+ b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+ if (b == NULL)
+ goto failed;
}
}
@@ -2082,54 +2277,44 @@ copyb:
ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS);
- *bp = b0;
- return (B_TRUE);
+ return (b0);
+
+failed:
+ if (b0 != NULL)
+ mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
+
+ return (NULL);
}
-void
-mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp)
+mlxcx_buffer_t *
+mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq)
{
mlxcx_buffer_t *b;
mlxcx_buf_shard_t *s = wq->mlwq_bufs;
mutex_enter(&s->mlbs_mtx);
- while (list_is_empty(&s->mlbs_free))
- cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
- b = list_remove_head(&s->mlbs_free);
- ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
- b->mlb_state = MLXCX_BUFFER_ON_WQ;
- list_insert_tail(&s->mlbs_busy, b);
+ if ((b = list_remove_head(&s->mlbs_free)) != NULL) {
+ ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
+ b->mlb_state = MLXCX_BUFFER_ON_WQ;
+ list_insert_tail(&s->mlbs_busy, b);
+ }
mutex_exit(&s->mlbs_mtx);
- *bp = b;
+ return (b);
}
-#define MLXCX_BUF_TAKE_N_TIMEOUT_USEC 5000
-#define MLXCX_BUF_TAKE_N_MAX_RETRIES 3
-
size_t
mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
mlxcx_buffer_t **bp, size_t nbufs)
{
mlxcx_buffer_t *b;
- size_t done = 0, empty = 0;
- clock_t wtime = drv_usectohz(MLXCX_BUF_TAKE_N_TIMEOUT_USEC);
+ size_t done = 0;
mlxcx_buf_shard_t *s;
s = wq->mlwq_bufs;
mutex_enter(&s->mlbs_mtx);
- while (done < nbufs) {
- while (list_is_empty(&s->mlbs_free)) {
- (void) cv_reltimedwait(&s->mlbs_free_nonempty,
- &s->mlbs_mtx, wtime, TR_MILLISEC);
- if (list_is_empty(&s->mlbs_free) &&
- empty++ >= MLXCX_BUF_TAKE_N_MAX_RETRIES) {
- mutex_exit(&s->mlbs_mtx);
- return (done);
- }
- }
- b = list_remove_head(&s->mlbs_free);
+ while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) {
ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE);
b->mlb_state = MLXCX_BUFFER_ON_WQ;
list_insert_tail(&s->mlbs_busy, b);
@@ -2187,13 +2372,26 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE);
ASSERT3P(b->mlb_mlx, ==, mlxp);
+
+ /*
+ * The mlbs_mtx held below is a heavily contended lock, so it is
+ * imperative we do as much of the buffer clean up outside the lock
+ * as is possible.
+ */
b->mlb_state = MLXCX_BUFFER_FREE;
b->mlb_wqe_index = 0;
b->mlb_tx_head = NULL;
b->mlb_tx_mp = NULL;
b->mlb_used = 0;
+ b->mlb_wqebbs = 0;
ASSERT(list_is_empty(&b->mlb_tx_chain));
+ if (b->mlb_foreign) {
+ if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
+ mlxcx_dma_unbind(mlxp, &b->mlb_dma);
+ }
+ }
+
mutex_enter(&s->mlbs_mtx);
switch (oldstate) {
case MLXCX_BUFFER_INIT:
@@ -2215,12 +2413,6 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b)
break;
}
- if (b->mlb_foreign) {
- if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) {
- mlxcx_dma_unbind(mlxp, &b->mlb_dma);
- }
- }
-
list_insert_tail(&s->mlbs_free, b);
cv_signal(&s->mlbs_free_nonempty);