Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/man/man7d/mlxcx.7d                |  39
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.c       |  75
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.conf    |  20
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.h       |  56
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_gld.c   |  82
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_intr.c  | 353
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_reg.h   |  23
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_ring.c  | 380
8 files changed, 706 insertions(+), 322 deletions(-)
diff --git a/usr/src/man/man7d/mlxcx.7d b/usr/src/man/man7d/mlxcx.7d index 5373b5bec5..d7b0cf8ad9 100644 --- a/usr/src/man/man7d/mlxcx.7d +++ b/usr/src/man/man7d/mlxcx.7d @@ -11,7 +11,7 @@ .\" .\" Copyright 2020 the University of Queensland .\" -.Dd January 17, 2020 +.Dd April 9, 2020 .Dt MLXCX 7D .Os .Sh NAME @@ -94,8 +94,11 @@ property determines the number of entries on Completion Queues for the device. The number of entries is calculated as .Li (1 << cq_size_shift) , so a value of 9 would mean 512 entries are created on each Event Queue. -The default value is -.Sy 10 . +The default value is device dependent, +.Sy 10 +for devices with maximum supported speed of 10Gb/s or less and +.Sy 12 +for devices with higher supported speeds. This should be kept very close to the value set for .Sy rq_size_shift and @@ -116,8 +119,11 @@ The number of descriptors is calculated as .Dv (1 << rq_size_shift) , so a value of 9 would mean 512 descriptors are created on each Receive Queue. This sets the number of packets on RX rings advertised to MAC. -The default value is -.Sy 10 . +The default value is device dependent, +.Sy 10 +for devices with maximum supported speed of 10Gb/s or less and +.Sy 12 +for devices with higher supported speeds. .Ed .It Sy sq_size_shift .Bd -filled -compact @@ -134,8 +140,11 @@ The number of descriptors is calculated as .Dv (1 << sq_size_shift) , so a value of 9 would mean 512 descriptors are created on each Send Queue. This sets the number of packets on RX rings advertised to MAC. -The default value is -.Sy 11 . +The default value is device dependent, +.Sy 11 +for devices with maximum supported speed of 10Gb/s or less and +.Sy 13 +for devices with higher supported speeds. Note that large packets often occupy more than one descriptor slot on the SQ, so it is sometimes a good idea to increase this if using a large MTU. .Ed @@ -325,6 +334,22 @@ is seldom worth using them for small packets. The default value is .Sy 2048 . .Ed +.It Sy rx_limit_per_completion +.Bd -filled -compact +Minimum: +.Sy 16 | +Maximum: +.Sy 4096 +.Ed +.Bd -filled +The +.Sy rx_limit_per_completion +property determines the maximum number of packets that +will be processed on a given completion ring during a single interrupt. +This is done to try and guarantee some amount of liveness in the system. +The default value is +.Sy 256 . +.Ed .El .Sh FILES .Bl -tag -width Pa diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c index 12a8d52b3f..c90fa0969b 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx.c @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -453,23 +454,68 @@ uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT; uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT; static void -mlxcx_load_props(mlxcx_t *mlxp) +mlxcx_load_prop_defaults(mlxcx_t *mlxp) { mlxcx_drv_props_t *p = &mlxp->mlx_props; + mlxcx_port_t *port = &mlxp->mlx_ports[0]; + + VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0); + VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0); + + /* + * Currently we have different queue size defaults for two + * categories of queues. One set for devices which support a + * maximum speed of 10Gb/s, and another for those above that. 
+ */ + if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G | + MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) { + p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G; + p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G; + p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G; + } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G | + MLXCX_PROTO_10G)) != 0) { + p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; + p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; + p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; + } else { + mlxcx_warn(mlxp, "Encountered a port with a speed we don't " + "recognize. Proto: 0x%x", port->mlp_max_proto); + p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; + p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; + p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; + } +} + +/* + * Properties which may have different defaults based on hardware + * characteristics. + */ +static void +mlxcx_load_model_props(mlxcx_t *mlxp) +{ + mlxcx_drv_props_t *p = &mlxp->mlx_props; + + mlxcx_load_prop_defaults(mlxp); - p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, - DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", - MLXCX_EQ_SIZE_SHIFT_DFLT); p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift", - MLXCX_CQ_SIZE_SHIFT_DFLT); + p->mldp_cq_size_shift_default); p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift", - MLXCX_SQ_SIZE_SHIFT_DFLT); + p->mldp_sq_size_shift_default); p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift", - MLXCX_RQ_SIZE_SHIFT_DFLT); + p->mldp_rq_size_shift_default); +} + +static void +mlxcx_load_props(mlxcx_t *mlxp) +{ + mlxcx_drv_props_t *p = &mlxp->mlx_props; + p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, + DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", + MLXCX_EQ_SIZE_SHIFT_DFLT); p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec", MLXCX_CQEMOD_PERIOD_USEC_DFLT); @@ -521,6 +567,19 @@ mlxcx_load_props(mlxcx_t *mlxp) p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT); + + p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, + DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion", + MLXCX_RX_PER_CQ_DEFAULT); + + if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN || + p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) { + mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is " + "out of range. Defaulting to: %d. Valid values are from " + "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT, + MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX); + p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT; + } } void @@ -2595,6 +2654,8 @@ mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } mlxp->mlx_attach |= MLXCX_ATTACH_PORTS; + mlxcx_load_model_props(mlxp); + /* * Set up, enable and arm the rest of the interrupt EQs which will * service events from CQs. diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.conf b/usr/src/uts/common/io/mlxcx/mlxcx.conf index 3569c4e5f5..321820a47b 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.conf +++ b/usr/src/uts/common/io/mlxcx/mlxcx.conf @@ -12,6 +12,7 @@ # # Copyright 2018, Joyent, Inc. 
# Copyright 2020, The University of Queensland +# Copyright 2020 RackTop Systems, Inc. # # @@ -23,10 +24,15 @@ # Sizing of event and completion queues. # # The number of entries on each queue will be (1 << *_size_shift) -- so -# a value of 9 would mean 512 entries. +# a value of 10 would mean 1024 entries. # #eq_size_shift = 9; + +# The default for devices with a maximum supported speed up to 10Gb/s #cq_size_shift = 10; +# +# The default for devices with a maximum supported speed above 10Gb/s +#cq_size_shift = 12; # # Sizing of send and receive queues. @@ -35,8 +41,13 @@ # advertise to MAC. It also determines how many packet buffers we will allocate # when starting the interface. # +# The defaults for devices with a maximum supported speed up to 10Gb/s #sq_size_shift = 11; #rq_size_shift = 10; +# +# The defaults for devices with a maximum supported speed above 10Gb/s +#sq_size_shift = 13; +#rq_size_shift = 12; # # Number and configuration of TX groups and rings. @@ -99,3 +110,10 @@ #eq_check_interval_sec = 30; #cq_check_interval_sec = 300; #wq_check_interval_sec = 300; + +# +# To provide some level of moderation and aid latencies, after +# "rx_limit_per_completion" packets are received in a single completion +# event, the interrupt handler will pass the chain up the receive stack. +# +#rx_limit_per_completion = 256; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 3b58989961..bf07691095 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -32,6 +33,7 @@ #include <sys/ddifm.h> #include <sys/id_space.h> #include <sys/list.h> +#include <sys/taskq_impl.h> #include <sys/stddef.h> #include <sys/stream.h> #include <sys/strsun.h> @@ -89,18 +91,36 @@ extern "C" { * Queues will be sized to (1 << *Q_SIZE_SHIFT) entries long. */ #define MLXCX_EQ_SIZE_SHIFT_DFLT 9 + +/* + * The CQ, SQ and RQ sizes can effect throughput on higher speed interfaces. + * EQ less so, as it only takes a single EQ entry to indicate there are + * multiple completions on the CQ. + * + * Particularly on the Rx side, the RQ (and corresponding CQ) would run + * low on available entries. A symptom of this is the refill taskq running + * frequently. A larger RQ (and CQ) alleviates this, and as there is a + * close relationship between SQ and CQ size, the SQ is increased too. + */ #define MLXCX_CQ_SIZE_SHIFT_DFLT 10 +#define MLXCX_CQ_SIZE_SHIFT_25G 12 /* * Default to making SQs bigger than RQs for 9k MTU, since most packets will * spill over into more than one slot. RQ WQEs are always 1 slot. */ #define MLXCX_SQ_SIZE_SHIFT_DFLT 11 +#define MLXCX_SQ_SIZE_SHIFT_25G 13 + #define MLXCX_RQ_SIZE_SHIFT_DFLT 10 +#define MLXCX_RQ_SIZE_SHIFT_25G 12 #define MLXCX_CQ_HWM_GAP 16 #define MLXCX_CQ_LWM_GAP 24 +#define MLXCX_WQ_HWM_GAP MLXCX_CQ_HWM_GAP +#define MLXCX_WQ_LWM_GAP MLXCX_CQ_LWM_GAP + #define MLXCX_RQ_REFILL_STEP 64 /* @@ -135,6 +155,14 @@ extern "C" { #define MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT 300 #define MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT 30 +/* + * After this many packets, the packets received so far are passed to + * the mac layer. 
+ */ +#define MLXCX_RX_PER_CQ_DEFAULT 256 +#define MLXCX_RX_PER_CQ_MIN 16 +#define MLXCX_RX_PER_CQ_MAX 4096 + #define MLXCX_DOORBELL_TRIES_DFLT 3 extern uint_t mlxcx_doorbell_tries; @@ -417,6 +445,11 @@ typedef struct mlxcx_buffer { size_t mlb_used; mblk_t *mlb_tx_mp; + /* + * The number of work queue basic blocks this buf uses. + */ + uint_t mlb_wqebbs; + mlxcx_t *mlb_mlx; mlxcx_buffer_state_t mlb_state; uint_t mlb_wqe_index; @@ -495,6 +528,8 @@ typedef enum { MLXCX_WQ_DESTROYED = 1 << 3, MLXCX_WQ_TEARDOWN = 1 << 4, MLXCX_WQ_BUFFERS = 1 << 5, + MLXCX_WQ_REFILLING = 1 << 6, + MLXCX_WQ_BLOCKED_MAC = 1 << 7 } mlxcx_workq_state_t; typedef enum { @@ -540,12 +575,18 @@ struct mlxcx_work_queue { }; uint64_t mlwq_pc; /* producer counter */ + uint64_t mlwq_wqebb_used; + size_t mlwq_bufhwm; + size_t mlwq_buflwm; + mlxcx_dma_buffer_t mlwq_doorbell_dma; mlxcx_workq_doorbell_t *mlwq_doorbell; mlxcx_buf_shard_t *mlwq_bufs; mlxcx_buf_shard_t *mlwq_foreign_bufs; + taskq_ent_t mlwq_tqe; + boolean_t mlwq_fm_repd_qstate; }; @@ -773,6 +814,8 @@ struct mlxcx_ring_group { mlxcx_flow_group_t *mlg_rx_vlan_promisc_fg; list_t mlg_rx_vlans; + taskq_t *mlg_refill_tq; + /* * Flow table for separating out by protocol before hashing */ @@ -856,8 +899,11 @@ typedef struct { typedef struct { uint_t mldp_eq_size_shift; uint_t mldp_cq_size_shift; + uint_t mldp_cq_size_shift_default; uint_t mldp_rq_size_shift; + uint_t mldp_rq_size_shift_default; uint_t mldp_sq_size_shift; + uint_t mldp_sq_size_shift_default; uint_t mldp_cqemod_period_usec; uint_t mldp_cqemod_count; uint_t mldp_intrmod_period_usec; @@ -865,6 +911,7 @@ typedef struct { uint_t mldp_rx_ngroups_small; uint_t mldp_rx_nrings_per_large_group; uint_t mldp_rx_nrings_per_small_group; + uint_t mldp_rx_per_cq; uint_t mldp_tx_ngroups; uint_t mldp_tx_nrings_per_group; uint_t mldp_ftbl_root_size_shift; @@ -1098,6 +1145,7 @@ extern boolean_t mlxcx_intr_setup(mlxcx_t *); extern void mlxcx_intr_teardown(mlxcx_t *); extern void mlxcx_arm_eq(mlxcx_t *, mlxcx_event_queue_t *); extern void mlxcx_arm_cq(mlxcx_t *, mlxcx_completion_queue_t *); +extern void mlxcx_update_cqci(mlxcx_t *, mlxcx_completion_queue_t *); extern mblk_t *mlxcx_rx_poll(mlxcx_t *, mlxcx_completion_queue_t *, size_t); @@ -1109,8 +1157,6 @@ extern boolean_t mlxcx_register_mac(mlxcx_t *); /* * From mlxcx_ring.c */ -extern boolean_t mlxcx_cq_alloc_dma(mlxcx_t *, mlxcx_completion_queue_t *); -extern void mlxcx_cq_rele_dma(mlxcx_t *, mlxcx_completion_queue_t *); extern boolean_t mlxcx_wq_alloc_dma(mlxcx_t *, mlxcx_work_queue_t *); extern void mlxcx_wq_rele_dma(mlxcx_t *, mlxcx_work_queue_t *); @@ -1118,7 +1164,7 @@ extern boolean_t mlxcx_buf_create(mlxcx_t *, mlxcx_buf_shard_t *, mlxcx_buffer_t **); extern boolean_t mlxcx_buf_create_foreign(mlxcx_t *, mlxcx_buf_shard_t *, mlxcx_buffer_t **); -extern void mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **); +extern mlxcx_buffer_t *mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *); extern size_t mlxcx_buf_take_n(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **, size_t); extern boolean_t mlxcx_buf_loan(mlxcx_t *, mlxcx_buffer_t *); @@ -1126,8 +1172,8 @@ extern void mlxcx_buf_return(mlxcx_t *, mlxcx_buffer_t *); extern void mlxcx_buf_return_chain(mlxcx_t *, mlxcx_buffer_t *, boolean_t); extern void mlxcx_buf_destroy(mlxcx_t *, mlxcx_buffer_t *); -extern boolean_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *, - mblk_t *, size_t, mlxcx_buffer_t **); +extern mlxcx_buffer_t *mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *, + mblk_t *, 
size_t); extern boolean_t mlxcx_rx_group_setup(mlxcx_t *, mlxcx_ring_group_t *); extern boolean_t mlxcx_tx_group_setup(mlxcx_t *, mlxcx_ring_group_t *); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index 7b01702376..a1d50659c1 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -430,15 +430,10 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) } } - if (!mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b)) { - /* - * Something went really wrong, and we probably will never be - * able to TX again (all our buffers are broken and DMA is - * failing). Drop the packet on the floor -- FMA should be - * reporting this error elsewhere. - */ - freemsg(mp); - return (NULL); + b = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take); + if (b == NULL) { + atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); + return (mp); } mutex_enter(&sq->mlwq_mtx); @@ -467,18 +462,20 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) */ if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm) { atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC); - mutex_exit(&sq->mlwq_mtx); - mlxcx_buf_return_chain(mlxp, b, B_TRUE); - return (mp); + goto blocked; + } + + if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) { + atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); + goto blocked; } ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen, chkflags, b); if (!ok) { atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC); - mutex_exit(&sq->mlwq_mtx); - mlxcx_buf_return_chain(mlxp, b, B_TRUE); - return (mp); + atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); + goto blocked; } /* @@ -493,6 +490,11 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) mutex_exit(&sq->mlwq_mtx); return (NULL); + +blocked: + mutex_exit(&sq->mlwq_mtx); + mlxcx_buf_return_chain(mlxp, b, B_TRUE); + return (mp); } static int @@ -862,9 +864,8 @@ mlxcx_mac_ring_intr_disable(mac_intr_handle_t intrh) { mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh; - atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING); mutex_enter(&cq->mlcq_mtx); - VERIFY(cq->mlcq_state & MLXCX_CQ_POLLING); + atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING); mutex_exit(&cq->mlcq_mtx); return (0); @@ -1061,56 +1062,43 @@ mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, case MAC_PROP_EN_100GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 | - MLXCX_PROTO_100GBASE_KR4)) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_100G) != 0); break; case MAC_PROP_ADV_50GFDX_CAP: case MAC_PROP_EN_50GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 | - MLXCX_PROTO_50GBASE_SR2)) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_50G) != 0); break; case MAC_PROP_ADV_40GFDX_CAP: case MAC_PROP_EN_40GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 | - MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4)) - != 0); + (port->mlp_oper_proto & MLXCX_PROTO_40G) != 0); break; case MAC_PROP_ADV_25GFDX_CAP: case MAC_PROP_EN_25GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR | - MLXCX_PROTO_25GBASE_SR)) != 0); + (port->mlp_oper_proto & 
MLXCX_PROTO_25G) != 0); break; case MAC_PROP_ADV_10GFDX_CAP: case MAC_PROP_EN_10GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 | - MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR | - MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_10G) != 0); break; case MAC_PROP_ADV_1000FDX_CAP: case MAC_PROP_EN_1000FDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & (MLXCX_PROTO_1000BASE_KX | - MLXCX_PROTO_SGMII)) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_1G) != 0); break; case MAC_PROP_ADV_100FDX_CAP: case MAC_PROP_EN_100FDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & MLXCX_PROTO_SGMII_100BASE) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_100M) != 0); break; default: break; @@ -1252,8 +1240,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 | - MLXCX_PROTO_100GBASE_KR4)) != 0; + MLXCX_PROTO_100G) != 0; break; case MAC_PROP_ADV_50GFDX_CAP: case MAC_PROP_EN_50GFDX_CAP: @@ -1262,8 +1249,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 | - MLXCX_PROTO_50GBASE_SR2)) != 0; + MLXCX_PROTO_50G) != 0; break; case MAC_PROP_ADV_40GFDX_CAP: case MAC_PROP_EN_40GFDX_CAP: @@ -1272,8 +1258,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 | - MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4)) != 0; + MLXCX_PROTO_40G) != 0; break; case MAC_PROP_ADV_25GFDX_CAP: case MAC_PROP_EN_25GFDX_CAP: @@ -1282,8 +1267,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR | - MLXCX_PROTO_25GBASE_SR)) != 0; + MLXCX_PROTO_25G) != 0; break; case MAC_PROP_ADV_10GFDX_CAP: case MAC_PROP_EN_10GFDX_CAP: @@ -1292,9 +1276,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 | - MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR | - MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0; + MLXCX_PROTO_10G) != 0; break; case MAC_PROP_ADV_1000FDX_CAP: case MAC_PROP_EN_1000FDX_CAP: @@ -1303,7 +1285,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII)) != 0; + MLXCX_PROTO_1G) != 0; break; case MAC_PROP_ADV_100FDX_CAP: case MAC_PROP_EN_100FDX_CAP: @@ -1312,7 +1294,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - MLXCX_PROTO_SGMII_100BASE) != 0; + MLXCX_PROTO_100M) != 0; break; default: ret = ENOTSUP; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c index 0516f86d6b..4dc4291b08 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c @@ -11,6 +11,7 @@ /* * Copyright (c) 2020, the University 
of Queensland + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -26,6 +27,11 @@ #include <mlxcx.h> +/* + * CTASSERT(s) to cover bad values which would induce bugs. + */ +CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP); + void mlxcx_intr_teardown(mlxcx_t *mlxp) { @@ -190,6 +196,31 @@ mlxcx_cq_next(mlxcx_completion_queue_t *mlcq) } void +mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) +{ + ddi_fm_error_t err; + uint_t try = 0; + + mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc); + +retry: + MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV); + ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err, + DDI_FME_VERSION); + if (err.fme_status != DDI_FM_OK) { + if (try++ < mlxcx_doorbell_tries) { + ddi_fm_dma_err_clear( + mlcq->mlcq_doorbell_dma.mxdb_dma_handle, + DDI_FME_VERSION); + goto retry; + } else { + ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); + return; + } + } +} + +void mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) { bits32_t dbval = new_bits32(); @@ -538,14 +569,15 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2) if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) || !(mleq->mleq_state & MLXCX_EQ_CREATED) || (mleq->mleq_state & MLXCX_EQ_DESTROYED)) { - mlxcx_warn(mlxp, "int0 on bad eq state"); + mlxcx_warn(mlxp, "int %d on bad eq state", + mleq->mleq_intr_index); mutex_exit(&mleq->mleq_mtx); return (DDI_INTR_UNCLAIMED); } ent = mlxcx_eq_next(mleq); if (ent == NULL) { - mlxcx_warn(mlxp, "spurious int 0?"); + mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index); mutex_exit(&mleq->mleq_mtx); return (DDI_INTR_UNCLAIMED); } @@ -574,8 +606,8 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2) mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod); break; default: - mlxcx_warn(mlxp, "unhandled event 0x%x on int0", - ent->mleqe_event_type); + mlxcx_warn(mlxp, "unhandled event 0x%x on int %d", + ent->mleqe_event_type, mleq->mleq_intr_index); } } @@ -591,46 +623,56 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2) return (DDI_INTR_CLAIMED); } -mblk_t * -mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim) +static boolean_t +mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, + size_t bytelim) { - mlxcx_buffer_t *buf; - mblk_t *mp, *cmp, *nmp; + mlxcx_work_queue_t *wq = mlcq->mlcq_wq; mlxcx_completionq_ent_t *cent; + mblk_t *mp, *cmp, *nmp; + mlxcx_buffer_t *buf; + boolean_t found, added; size_t bytes = 0; - boolean_t found; - - ASSERT(mutex_owned(&mlcq->mlcq_mtx)); + uint_t rx_frames = 0; + uint_t comp_cnt = 0; + int64_t wqebbs, bufcnt; - ASSERT(mlcq->mlcq_wq != NULL); - ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ); + *mpp = NULL; if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) || !(mlcq->mlcq_state & MLXCX_CQ_CREATED) || (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) || (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) { - return (NULL); + return (B_FALSE); } - ASSERT(mlcq->mlcq_state & MLXCX_CQ_POLLING); - nmp = cmp = mp = NULL; - cent = mlxcx_cq_next(mlcq); - for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) { + wqebbs = 0; + bufcnt = 0; + for (cent = mlxcx_cq_next(mlcq); cent != NULL; + cent = mlxcx_cq_next(mlcq)) { /* * Teardown and ring stop can atomic_or this flag * into our state if they want us to stop early. 
*/ if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) - break; + return (B_FALSE); + comp_cnt++; if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ && cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) { /* NOP */ + atomic_dec_64(&wq->mlwq_wqebb_used); goto nextcq; } +lookagain: + /* + * Generally the buffer we're looking for will be + * at the front of the list, so this loop won't + * need to look far. + */ buf = list_head(&mlcq->mlcq_buffers); found = B_FALSE; while (buf != NULL) { @@ -641,36 +683,118 @@ mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim) } buf = list_next(&mlcq->mlcq_buffers, buf); } + if (!found) { + /* + * If there's any buffers waiting on the + * buffers_b list, then merge those into + * the main list and have another look. + * + * The wq enqueue routines push new buffers + * into buffers_b so that they can avoid + * taking the mlcq_mtx and blocking us for + * every single packet. + */ + added = B_FALSE; + mutex_enter(&mlcq->mlcq_bufbmtx); + if (!list_is_empty(&mlcq->mlcq_buffers_b)) { + list_move_tail(&mlcq->mlcq_buffers, + &mlcq->mlcq_buffers_b); + added = B_TRUE; + } + mutex_exit(&mlcq->mlcq_bufbmtx); + if (added) + goto lookagain; + buf = list_head(&mlcq->mlcq_buffers); mlxcx_warn(mlxp, "got completion on CQ %x but " "no buffer matching wqe found: %x (first " "buffer counter = %x)", mlcq->mlcq_num, from_be16(cent->mlcqe_wqe_counter), - buf == NULL ? UINT32_MAX : buf->mlb_wqe_index); + buf == NULL ? UINT32_MAX : + buf->mlb_wqe_index); mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE); goto nextcq; } + + /* + * The buf is likely to be freed below, count this now. + */ + wqebbs += buf->mlb_wqebbs; + list_remove(&mlcq->mlcq_buffers, buf); - atomic_dec_64(&mlcq->mlcq_bufcnt); + bufcnt++; - nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf); - if (nmp != NULL) { + switch (mlcq->mlcq_wq->mlwq_type) { + case MLXCX_WQ_TYPE_SENDQ: + mlxcx_tx_completion(mlxp, mlcq, cent, buf); + break; + case MLXCX_WQ_TYPE_RECVQ: + nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf); bytes += from_be32(cent->mlcqe_byte_cnt); - if (cmp != NULL) { - cmp->b_next = nmp; - cmp = nmp; - } else { - mp = cmp = nmp; + if (nmp != NULL) { + if (cmp != NULL) { + cmp->b_next = nmp; + cmp = nmp; + } else { + mp = cmp = nmp; + } + + rx_frames++; } + break; } -nextcq: - mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc); - if (bytelim != 0 && bytes > bytelim) + /* + * Update the consumer index with what has been processed, + * followed by driver counters. It is important to tell the + * hardware first, otherwise when we throw more packets at + * it, it may get an overflow error. + * We do this whenever we've processed enough to bridge the + * high->low water mark. + */ + if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) { + mlxcx_update_cqci(mlxp, mlcq); + /* + * Both these variables are incremented using + * atomics as they are modified in other code paths + * (Eg during tx) which hold different locks. 
+ */ + atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt); + atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs); + wqebbs = 0; + bufcnt = 0; + comp_cnt = 0; + } +nextcq: + if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq || + (bytelim != 0 && bytes > bytelim)) break; } + if (comp_cnt > 0) { + mlxcx_update_cqci(mlxp, mlcq); + atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt); + atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs); + } + + *mpp = mp; + return (B_TRUE); +} + + +mblk_t * +mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim) +{ + mblk_t *mp = NULL; + + ASSERT(mutex_owned(&mlcq->mlcq_mtx)); + + ASSERT(mlcq->mlcq_wq != NULL); + ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ); + + (void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim); + return (mp); } @@ -680,11 +804,10 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2) mlxcx_t *mlxp = (mlxcx_t *)arg; mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2; mlxcx_eventq_ent_t *ent; - mlxcx_completionq_ent_t *cent; mlxcx_completion_queue_t *mlcq, probe; - mlxcx_buffer_t *buf; - mblk_t *mp, *cmp, *nmp; - boolean_t found, tellmac = B_FALSE, added; + mlxcx_work_queue_t *mlwq; + mblk_t *mp = NULL; + boolean_t tellmac = B_FALSE; mutex_enter(&mleq->mleq_mtx); @@ -729,10 +852,12 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2) if (mlcq == NULL) continue; + mlwq = mlcq->mlcq_wq; + /* * The polling function might have the mutex and stop us from - * getting the lock here, so we increment the event counter - * atomically from outside. + * getting the lock in mlxcx_process_cq(), so we increment + * the event counter atomically from outside. * * This way at the end of polling when we go back to interrupts * from this CQ, the event counter is still correct. @@ -746,145 +871,57 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2) if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) { /* - * If we failed to take the mutex because the polling - * function has it, just move on. We don't want to - * block other CQs behind this one. + * If we failed to take the mutex because the + * polling function has it, just move on. + * We don't want to block other CQs behind + * this one. */ if (mlcq->mlcq_state & MLXCX_CQ_POLLING) - continue; + goto update_eq; + /* Otherwise we will wait. */ mutex_enter(&mlcq->mlcq_mtx); } - if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) || - !(mlcq->mlcq_state & MLXCX_CQ_CREATED) || - (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) || - (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) || - (mlcq->mlcq_state & MLXCX_CQ_POLLING)) { - mutex_exit(&mlcq->mlcq_mtx); - continue; - } - - nmp = cmp = mp = NULL; - tellmac = B_FALSE; - - cent = mlxcx_cq_next(mlcq); - for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) { + if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 && + mlxcx_process_cq(mlxp, mlcq, &mp, 0)) { /* - * Teardown and ring stop can atomic_or this flag - * into our state if they want us to stop early. + * The ring is not in polling mode and we processed + * some completion queue entries. */ - if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) - break; - if (mlcq->mlcq_state & MLXCX_CQ_POLLING) - break; - - if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ && - cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) { - /* NOP */ - goto nextcq; + if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 && + mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) { + atomic_and_uint(&mlcq->mlcq_state, + ~MLXCX_CQ_BLOCKED_MAC); + tellmac = B_TRUE; } -lookagain: - /* - * Generally the buffer we're looking for will be - * at the front of the list, so this loop won't - * need to look far. 
- */ - buf = list_head(&mlcq->mlcq_buffers); - found = B_FALSE; - while (buf != NULL) { - if ((buf->mlb_wqe_index & UINT16_MAX) == - from_be16(cent->mlcqe_wqe_counter)) { - found = B_TRUE; - break; - } - buf = list_next(&mlcq->mlcq_buffers, buf); - } - if (!found) { - /* - * If there's any buffers waiting on the - * buffers_b list, then merge those into - * the main list and have another look. - * - * The wq enqueue routines push new buffers - * into buffers_b so that they can avoid - * taking the mlcq_mtx and blocking us for - * every single packet. - */ - added = B_FALSE; - mutex_enter(&mlcq->mlcq_bufbmtx); - if (!list_is_empty(&mlcq->mlcq_buffers_b)) { - list_move_tail(&mlcq->mlcq_buffers, - &mlcq->mlcq_buffers_b); - added = B_TRUE; - } - mutex_exit(&mlcq->mlcq_bufbmtx); - if (added) - goto lookagain; - } - if (!found) { - buf = list_head(&mlcq->mlcq_buffers); - mlxcx_warn(mlxp, "got completion on CQ %x but " - "no buffer matching wqe found: %x (first " - "buffer counter = %x)", mlcq->mlcq_num, - from_be16(cent->mlcqe_wqe_counter), - buf == NULL ? UINT32_MAX : - buf->mlb_wqe_index); - mlxcx_fm_ereport(mlxp, - DDI_FM_DEVICE_INVAL_STATE); - goto nextcq; + if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 && + mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) { + atomic_and_uint(&mlwq->mlwq_state, + ~MLXCX_WQ_BLOCKED_MAC); + tellmac = B_TRUE; } - list_remove(&mlcq->mlcq_buffers, buf); - atomic_dec_64(&mlcq->mlcq_bufcnt); - switch (mlcq->mlcq_wq->mlwq_type) { - case MLXCX_WQ_TYPE_SENDQ: - mlxcx_tx_completion(mlxp, mlcq, cent, buf); - break; - case MLXCX_WQ_TYPE_RECVQ: - nmp = mlxcx_rx_completion(mlxp, mlcq, cent, - buf); - if (nmp != NULL) { - if (cmp != NULL) { - cmp->b_next = nmp; - cmp = nmp; - } else { - mp = cmp = nmp; - } - } - break; - } + mlxcx_arm_cq(mlxp, mlcq); -nextcq: - /* - * Update the "doorbell" consumer counter for the queue - * every time. Unlike a UAR write, this is relatively - * cheap and doesn't require us to go out on the bus - * straight away (since it's our memory). - */ - mlcq->mlcq_doorbell->mlcqd_update_ci = - to_be24(mlcq->mlcq_cc); + mutex_exit(&mlcq->mlcq_mtx); - if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) && - mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) { - mlcq->mlcq_state &= ~MLXCX_CQ_BLOCKED_MAC; - tellmac = B_TRUE; + if (tellmac) { + mac_tx_ring_update(mlxp->mlx_mac_hdl, + mlcq->mlcq_mac_hdl); + tellmac = B_FALSE; } - } - mlxcx_arm_cq(mlxp, mlcq); - mutex_exit(&mlcq->mlcq_mtx); - - if (tellmac) { - mac_tx_ring_update(mlxp->mlx_mac_hdl, - mlcq->mlcq_mac_hdl); - } - if (mp != NULL) { - mac_rx_ring(mlxp->mlx_mac_hdl, mlcq->mlcq_mac_hdl, - mp, mlcq->mlcq_mac_gen); + if (mp != NULL) { + mac_rx_ring(mlxp->mlx_mac_hdl, + mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen); + } + } else { + mutex_exit(&mlcq->mlcq_mtx); } +update_eq: /* * Updating the consumer counter for an EQ requires a write * to the UAR, which is possibly expensive. diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index 76d0da30e7..f65280d41d 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. 
*/ #ifndef _MLXCX_REG_H @@ -2259,6 +2260,28 @@ typedef enum { MLXCX_PROTO_50GBASE_KR2 = 1UL << 31, } mlxcx_eth_proto_t; +#define MLXCX_PROTO_100M MLXCX_PROTO_SGMII_100BASE + +#define MLXCX_PROTO_1G (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII) + +#define MLXCX_PROTO_10G (MLXCX_PROTO_10GBASE_CX4 | \ + MLXCX_PROTO_10GBASE_KX4 | MLXCX_PROTO_10GBASE_KR | \ + MLXCX_PROTO_10GBASE_CR | MLXCX_PROTO_10GBASE_SR | \ + MLXCX_PROTO_10GBASE_ER_LR) + +#define MLXCX_PROTO_25G (MLXCX_PROTO_25GBASE_CR | \ + MLXCX_PROTO_25GBASE_KR | MLXCX_PROTO_25GBASE_SR) + +#define MLXCX_PROTO_40G (MLXCX_PROTO_40GBASE_SR4 | \ + MLXCX_PROTO_40GBASE_LR4_ER4 | MLXCX_PROTO_40GBASE_CR4 | \ + MLXCX_PROTO_40GBASE_KR4) + +#define MLXCX_PROTO_50G (MLXCX_PROTO_50GBASE_CR2 | \ + MLXCX_PROTO_50GBASE_KR2 | MLXCX_PROTO_50GBASE_SR2) + +#define MLXCX_PROTO_100G (MLXCX_PROTO_100GBASE_CR4 | \ + MLXCX_PROTO_100GBASE_SR4 | MLXCX_PROTO_100GBASE_KR4) + typedef enum { MLXCX_AUTONEG_DISABLE_CAP = 1 << 5, MLXCX_AUTONEG_DISABLE = 1 << 6 diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 8337545b57..da609ed28c 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -113,8 +114,9 @@ mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) mlwq->mlwq_state &= ~MLXCX_CQ_ALLOC; } -boolean_t -mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) +static boolean_t +mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, + uint_t ent_shift) { ddi_device_acc_attr_t acc; ddi_dma_attr_t attr; @@ -123,7 +125,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) VERIFY0(mlcq->mlcq_state & MLXCX_EQ_ALLOC); - mlcq->mlcq_entshift = mlxp->mlx_props.mldp_cq_size_shift; + mlcq->mlcq_entshift = ent_shift; mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift); sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t); ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); @@ -165,7 +167,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) return (B_TRUE); } -void +static void mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) { VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC); @@ -331,7 +333,7 @@ mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) static boolean_t mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq, - mlxcx_completion_queue_t **cqp) + mlxcx_completion_queue_t **cqp, uint_t ent_shift) { mlxcx_completion_queue_t *cq; @@ -350,7 +352,7 @@ mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq, mutex_enter(&cq->mlcq_mtx); - if (!mlxcx_cq_alloc_dma(mlxp, cq)) { + if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) { mutex_exit(&cq->mlcq_mtx); return (B_FALSE); } @@ -413,6 +415,9 @@ mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq, return (B_FALSE); } + wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP; + wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP; + mutex_exit(&wq->mlwq_mtx); mutex_enter(&cq->mlcq_mtx); @@ -459,6 +464,9 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq, return (B_FALSE); } + wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP; + wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP; + mutex_exit(&wq->mlwq_mtx); mutex_enter(&cq->mlcq_mtx); @@ -471,6 +479,35 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq, return (B_TRUE); } +/* + * Before we tear down the 
queues associated with the rx group, + * flag each cq as being torn down and wake up any tasks. + */ +static void +mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g) +{ + mlxcx_work_queue_t *wq; + mlxcx_completion_queue_t *cq; + mlxcx_buf_shard_t *s; + uint_t i; + + mutex_enter(&g->mlg_mtx); + + for (i = 0; i < g->mlg_nwqs; ++i) { + wq = &g->mlg_wqs[i]; + cq = wq->mlwq_cq; + if (cq != NULL) { + s = wq->mlwq_bufs; + mutex_enter(&s->mlbs_mtx); + atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN); + cv_broadcast(&s->mlbs_free_nonempty); + mutex_exit(&s->mlbs_mtx); + } + } + + mutex_exit(&g->mlg_mtx); +} + void mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) { @@ -551,6 +588,7 @@ mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) } mutex_exit(&wq->mlwq_mtx); } + taskq_destroy(g->mlg_refill_tq); g->mlg_state &= ~MLXCX_GROUP_RUNNING; } @@ -662,8 +700,16 @@ mlxcx_teardown_groups(mlxcx_t *mlxp) if (!(g->mlg_state & MLXCX_GROUP_INIT)) continue; ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX); + mlxcx_quiesce_rx_cqs(mlxp, g); + } + + for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { + g = &mlxp->mlx_rx_groups[i]; + if (!(g->mlg_state & MLXCX_GROUP_INIT)) + continue; mlxcx_teardown_rx_group(mlxp, g); } + kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size); mlxp->mlx_rx_groups = NULL; @@ -674,6 +720,7 @@ mlxcx_teardown_groups(mlxcx_t *mlxp) ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX); mlxcx_teardown_tx_group(mlxp, g); } + kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size); mlxp->mlx_tx_groups = NULL; } @@ -687,6 +734,7 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) mlxcx_flow_table_t *ft; mlxcx_flow_group_t *fg; mlxcx_flow_entry_t *fe; + uint_t ent_shift; uint_t i, j; ASSERT3S(g->mlg_state, ==, 0); @@ -730,10 +778,18 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) } } - if (!mlxcx_cq_setup(mlxp, eq, &cq)) { + /* + * A single completion is indicated for each rq entry as + * it is used. So, the number of cq entries never needs + * to be larger than the rq. + */ + ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift, + mlxp->mlx_props.mldp_rq_size_shift); + if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) { g->mlg_nwqs = i; break; } + cq->mlcq_stats = &g->mlg_port->mlp_stats; rq = &g->mlg_wqs[i]; @@ -1182,6 +1238,7 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g) mlxcx_flow_table_t *ft; mlxcx_flow_group_t *fg; mlxcx_flow_entry_t *fe; + char tq_name[TASKQ_NAMELEN]; mutex_enter(&g->mlg_mtx); @@ -1194,6 +1251,23 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g) g->mlg_state |= MLXCX_GROUP_RUNNING; + snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld", + ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst, + g - &mlxp->mlx_rx_groups[0]); + + /* + * Create one refill taskq per group with one thread per work queue. + * The refill task may block waiting for resources, so by effectively + * having one thread per work queue we avoid work queues blocking each + * other. 
+ */ + if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri, + g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { + mlxcx_warn(mlxp, "failed to create rq refill task queue"); + mutex_exit(&g->mlg_mtx); + return (B_FALSE); + } + if (g == &mlxp->mlx_rx_groups[0]) { ft = g->mlg_port->mlp_rx_flow; mutex_enter(&ft->mlft_mtx); @@ -1207,6 +1281,8 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g) fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft; if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { mutex_exit(&ft->mlft_mtx); + g->mlg_state &= ~MLXCX_GROUP_RUNNING; + taskq_destroy(g->mlg_refill_tq); mutex_exit(&g->mlg_mtx); return (B_FALSE); } @@ -1273,8 +1349,10 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) } } - if (!mlxcx_cq_setup(mlxp, eq, &cq)) + if (!mlxcx_cq_setup(mlxp, eq, &cq, + mlxp->mlx_props.mldp_cq_size_shift)) return (B_FALSE); + cq->mlcq_stats = &g->mlg_port->mlp_stats; sq = &g->mlg_wqs[i]; @@ -1409,6 +1487,11 @@ mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) ent0 = &mlwq->mlwq_send_ent[index]; start_pc = mlwq->mlwq_pc; ++mlwq->mlwq_pc; + /* + * This counter is manipulated in the interrupt handler, which + * does not hold the mlwq_mtx, hence the atomic. + */ + atomic_inc_64(&mlwq->mlwq_wqebb_used); bzero(ent0, sizeof (mlxcx_sendq_ent_t)); ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP; @@ -1441,7 +1524,7 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags, mlxcx_buffer_t *b0) { - uint_t index, first, ents = 0; + uint_t index, first, ents; mlxcx_completion_queue_t *cq; mlxcx_sendq_ent_t *ent0; mlxcx_sendq_extra_ent_t *ent; @@ -1449,8 +1532,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t ptri, nptr; const ddi_dma_cookie_t *c; size_t rem; + uint64_t wqebb_used; mlxcx_buffer_t *b; ddi_fm_error_t err; + boolean_t rv; ASSERT(mutex_owned(&mlwq->mlwq_mtx)); ASSERT3P(b0->mlb_tx_head, ==, b0); @@ -1460,16 +1545,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); ent0 = &mlwq->mlwq_send_ent[index]; b0->mlb_wqe_index = mlwq->mlwq_pc; - ++mlwq->mlwq_pc; - ++ents; + ents = 1; first = index; - mutex_enter(&cq->mlcq_bufbmtx); - list_insert_tail(&cq->mlcq_buffers_b, b0); - atomic_inc_64(&cq->mlcq_bufcnt); - mutex_exit(&cq->mlcq_bufbmtx); - bzero(ent0, sizeof (mlxcx_sendq_ent_t)); ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND; ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num); @@ -1502,6 +1581,16 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM); } + /* + * mlwq_wqebb_used is only incremented whilst holding + * the mlwq_mtx mutex, but it is decremented (atomically) in + * the interrupt context *not* under mlwq_mtx mutex. + * So, now take a snapshot of the number of used wqes which will + * be a conistent maximum we can use whilst iterating through + * the buffers and DMA cookies. 
+ */ + wqebb_used = mlwq->mlwq_wqebb_used; + b = b0; ptri = 0; nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); @@ -1513,9 +1602,12 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, while (rem > 0 && (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { if (ptri >= nptr) { - index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); + if ((ents + wqebb_used) >= mlwq->mlwq_nents) + return (B_FALSE); + + index = (mlwq->mlwq_pc + ents) & + (mlwq->mlwq_nents - 1); ent = &mlwq->mlwq_send_extra_ent[index]; - ++mlwq->mlwq_pc; ++ents; seg = ent->mlsqe_data; @@ -1548,6 +1640,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, } } + b0->mlb_wqebbs = ents; + mlwq->mlwq_pc += ents; + atomic_add_64(&mlwq->mlwq_wqebb_used, ents); + for (; ptri < nptr; ++ptri, ++seg) { seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); seg->mlds_byte_count = to_be32(0); @@ -1566,10 +1662,24 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, if (err.fme_status != DDI_FM_OK) { return (B_FALSE); } - if (!mlxcx_sq_ring_dbell(mlxp, mlwq, first)) { - return (B_FALSE); + + /* + * Hold the bufmtx whilst ringing the doorbell, to prevent + * the buffer from being moved to another list, so we can + * safely remove it should the ring fail. + */ + mutex_enter(&cq->mlcq_bufbmtx); + + list_insert_tail(&cq->mlcq_buffers_b, b0); + if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) { + atomic_inc_64(&cq->mlcq_bufcnt); + } else { + list_remove(&cq->mlcq_buffers_b, b0); } - return (B_TRUE); + + mutex_exit(&cq->mlcq_bufbmtx); + + return (rv); } boolean_t @@ -1604,8 +1714,10 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); ent = &mlwq->mlwq_recv_ent[index]; buf->mlb_wqe_index = mlwq->mlwq_pc; + buf->mlb_wqebbs = 1; ++mlwq->mlwq_pc; + atomic_inc_64(&mlwq->mlwq_wqebb_used); mutex_enter(&cq->mlcq_bufbmtx); list_insert_tail(&cq->mlcq_buffers, buf); @@ -1666,11 +1778,53 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, return (B_TRUE); } +static void +mlxcx_rq_refill_task(void *arg) +{ + mlxcx_work_queue_t *wq = arg; + mlxcx_completion_queue_t *cq = wq->mlwq_cq; + mlxcx_t *mlxp = wq->mlwq_mlx; + mlxcx_buf_shard_t *s = wq->mlwq_bufs; + boolean_t refill; + + do { + /* + * Wait until there are some free buffers. + */ + mutex_enter(&s->mlbs_mtx); + while (list_is_empty(&s->mlbs_free) && + (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) + cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); + mutex_exit(&s->mlbs_mtx); + + mutex_enter(&cq->mlcq_mtx); + mutex_enter(&wq->mlwq_mtx); + + if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) { + refill = B_FALSE; + wq->mlwq_state &= ~MLXCX_WQ_REFILLING; + } else { + mlxcx_rq_refill(mlxp, wq); + + if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) { + refill = B_TRUE; + } else { + refill = B_FALSE; + wq->mlwq_state &= ~MLXCX_WQ_REFILLING; + } + } + + mutex_exit(&wq->mlwq_mtx); + mutex_exit(&cq->mlcq_mtx); + } while (refill); +} + void mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) { size_t target, current, want, done, n; mlxcx_completion_queue_t *cq; + mlxcx_ring_group_t *g; mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP]; uint_t i; @@ -1697,10 +1851,24 @@ mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) { n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP); if (n == 0) { - mlxcx_warn(mlxp, "!exiting rq refill early, done %u " - "but wanted %u", done, want); + /* + * We didn't get any buffers from the free queue. 
+ * It might not be an issue, schedule a taskq + * to wait for free buffers if the completion + * queue is low. + */ + if (current < MLXCX_RQ_REFILL_STEP && + (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) { + mlwq->mlwq_state |= MLXCX_WQ_REFILLING; + g = mlwq->mlwq_group; + taskq_dispatch_ent(g->mlg_refill_tq, + mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP, + &mlwq->mlwq_tqe); + } + return; } + if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) { for (i = 0; i < n; ++i) mlxcx_buf_return(mlxp, b[i]); @@ -1826,6 +1994,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) { uint32_t chkflags = 0; + uint_t wqe_index; ddi_fm_error_t err; ASSERT(mutex_owned(&mlcq->mlcq_mtx)); @@ -1868,6 +2037,12 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, return (NULL); } + /* + * mlxcx_buf_loan() will set mlb_wqe_index to zero. + * Remember it for later. + */ + wqe_index = buf->mlb_wqe_index; + if (!mlxcx_buf_loan(mlxp, buf)) { mlxcx_warn(mlxp, "!loan failed, dropping packet"); mlxcx_buf_return(mlxp, buf); @@ -1894,7 +2069,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, * Don't check if a refill is needed on every single completion, * since checking involves taking the RQ lock. */ - if ((buf->mlb_wqe_index & 0x7) == 0) { + if ((wqe_index & 0x7) == 0) { mlxcx_work_queue_t *wq = mlcq->mlcq_wq; ASSERT(wq != NULL); mutex_enter(&wq->mlwq_mtx); @@ -1981,39 +2156,66 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, return (B_TRUE); } -static void -mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, - mlxcx_buffer_t **bp) +static mlxcx_buffer_t * +mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq) { mlxcx_buffer_t *b; mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs; mutex_enter(&s->mlbs_mtx); - while (list_is_empty(&s->mlbs_free)) - cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); - b = list_remove_head(&s->mlbs_free); - ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); - ASSERT(b->mlb_foreign); - b->mlb_state = MLXCX_BUFFER_ON_WQ; - list_insert_tail(&s->mlbs_busy, b); + if ((b = list_remove_head(&s->mlbs_free)) != NULL) { + ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); + ASSERT(b->mlb_foreign); + b->mlb_state = MLXCX_BUFFER_ON_WQ; + list_insert_tail(&s->mlbs_busy, b); + } mutex_exit(&s->mlbs_mtx); - *bp = b; + return (b); } -boolean_t +static mlxcx_buffer_t * +mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz) +{ + ddi_fm_error_t err; + mlxcx_buffer_t *b; + uint_t attempts = 0; + +copyb: + if ((b = mlxcx_buf_take(mlxp, wq)) == NULL) + return (NULL); + + ASSERT3U(b->mlb_dma.mxdb_len, >=, sz); + bcopy(rptr, b->mlb_dma.mxdb_va, sz); + + MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV); + + ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err, + DDI_FME_VERSION); + if (err.fme_status != DDI_FM_OK) { + ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle, + DDI_FME_VERSION); + mlxcx_buf_return(mlxp, b); + if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) { + return (NULL); + } + goto copyb; + } + + return (b); +} + +mlxcx_buffer_t * mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, - mblk_t *mpb, size_t off, mlxcx_buffer_t **bp) + mblk_t *mpb, size_t off) { mlxcx_buffer_t *b, *b0 = NULL; boolean_t first = B_TRUE; - ddi_fm_error_t err; mblk_t *mp; uint8_t *rptr; size_t sz; size_t ncookies = 0; boolean_t ret; - uint_t attempts = 0; for (mp = mpb; mp != NULL; mp = mp->b_cont) { rptr = mp->b_rptr; @@ -2024,31 +2226,24 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, 
mlxcx_work_queue_t *wq, rptr += off; sz -= off; - if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) - goto copyb; - - mlxcx_buf_take_foreign(mlxp, wq, &b); - ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_FALSE); + if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) { + b = mlxcx_copy_data(mlxp, wq, rptr, sz); + if (b == NULL) + goto failed; + } else { + b = mlxcx_buf_take_foreign(mlxp, wq); + if (b == NULL) + goto failed; - if (!ret) { - mlxcx_buf_return(mlxp, b); + ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, + B_FALSE); -copyb: - mlxcx_buf_take(mlxp, wq, &b); - ASSERT3U(b->mlb_dma.mxdb_len, >=, sz); - bcopy(rptr, b->mlb_dma.mxdb_va, sz); - MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV); - ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err, - DDI_FME_VERSION); - if (err.fme_status != DDI_FM_OK) { - ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle, - DDI_FME_VERSION); + if (!ret) { mlxcx_buf_return(mlxp, b); - if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) { - *bp = NULL; - return (B_FALSE); - } - goto copyb; + + b = mlxcx_copy_data(mlxp, wq, rptr, sz); + if (b == NULL) + goto failed; } } @@ -2082,54 +2277,44 @@ copyb: ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS); - *bp = b0; - return (B_TRUE); + return (b0); + +failed: + if (b0 != NULL) + mlxcx_buf_return_chain(mlxp, b0, B_TRUE); + + return (NULL); } -void -mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp) +mlxcx_buffer_t * +mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq) { mlxcx_buffer_t *b; mlxcx_buf_shard_t *s = wq->mlwq_bufs; mutex_enter(&s->mlbs_mtx); - while (list_is_empty(&s->mlbs_free)) - cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); - b = list_remove_head(&s->mlbs_free); - ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); - b->mlb_state = MLXCX_BUFFER_ON_WQ; - list_insert_tail(&s->mlbs_busy, b); + if ((b = list_remove_head(&s->mlbs_free)) != NULL) { + ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); + b->mlb_state = MLXCX_BUFFER_ON_WQ; + list_insert_tail(&s->mlbs_busy, b); + } mutex_exit(&s->mlbs_mtx); - *bp = b; + return (b); } -#define MLXCX_BUF_TAKE_N_TIMEOUT_USEC 5000 -#define MLXCX_BUF_TAKE_N_MAX_RETRIES 3 - size_t mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp, size_t nbufs) { mlxcx_buffer_t *b; - size_t done = 0, empty = 0; - clock_t wtime = drv_usectohz(MLXCX_BUF_TAKE_N_TIMEOUT_USEC); + size_t done = 0; mlxcx_buf_shard_t *s; s = wq->mlwq_bufs; mutex_enter(&s->mlbs_mtx); - while (done < nbufs) { - while (list_is_empty(&s->mlbs_free)) { - (void) cv_reltimedwait(&s->mlbs_free_nonempty, - &s->mlbs_mtx, wtime, TR_MILLISEC); - if (list_is_empty(&s->mlbs_free) && - empty++ >= MLXCX_BUF_TAKE_N_MAX_RETRIES) { - mutex_exit(&s->mlbs_mtx); - return (done); - } - } - b = list_remove_head(&s->mlbs_free); + while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) { ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); b->mlb_state = MLXCX_BUFFER_ON_WQ; list_insert_tail(&s->mlbs_busy, b); @@ -2187,13 +2372,26 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE); ASSERT3P(b->mlb_mlx, ==, mlxp); + + /* + * The mlbs_mtx held below is a heavily contended lock, so it is + * imperative we do as much of the buffer clean up outside the lock + * as is possible. 
+ */ b->mlb_state = MLXCX_BUFFER_FREE; b->mlb_wqe_index = 0; b->mlb_tx_head = NULL; b->mlb_tx_mp = NULL; b->mlb_used = 0; + b->mlb_wqebbs = 0; ASSERT(list_is_empty(&b->mlb_tx_chain)); + if (b->mlb_foreign) { + if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) { + mlxcx_dma_unbind(mlxp, &b->mlb_dma); + } + } + mutex_enter(&s->mlbs_mtx); switch (oldstate) { case MLXCX_BUFFER_INIT: @@ -2215,12 +2413,6 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) break; } - if (b->mlb_foreign) { - if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) { - mlxcx_dma_unbind(mlxp, &b->mlb_dma); - } - } - list_insert_tail(&s->mlbs_free, b); cv_signal(&s->mlbs_free_nonempty); |