summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobert Mustacchi <rm@joyent.com>2016-04-01 17:49:57 +0000
committerRobert Mustacchi <rm@joyent.com>2016-04-15 21:03:59 +0000
commit56046f267b29072fb6ea4da60520a4ed1a09c8aa (patch)
treed032b4aa2d0a7c0f23f8692072b3a66db39f6a2b
parentbc1b3e9d58811ef50db915fa1ad8c136754c277a (diff)
downloadillumos-joyent-56046f267b29072fb6ea4da60520a4ed1a09c8aa.tar.gz
OS-5236 Fortville should enable ITR logic
OS-5316 Fortville should leverage RX DMA binding OS-5317 i40e ring interrupt blanking needs to work OS-5318 Don't grab the tcb lock while holding the tx ring lock Reviewed by: Patrick Mooney <patrick.mooney@joyent.com> Reviewed by: Joshua M. Clulow <jmc@joyent.com>
-rw-r--r--usr/src/man/man7d/i40e.7d27
-rw-r--r--usr/src/uts/common/io/i40e/i40e_gld.c189
-rw-r--r--usr/src/uts/common/io/i40e/i40e_intr.c160
-rw-r--r--usr/src/uts/common/io/i40e/i40e_main.c57
-rw-r--r--usr/src/uts/common/io/i40e/i40e_stats.c13
-rw-r--r--usr/src/uts/common/io/i40e/i40e_sw.h80
-rw-r--r--usr/src/uts/common/io/i40e/i40e_transceiver.c140
7 files changed, 618 insertions, 48 deletions
diff --git a/usr/src/man/man7d/i40e.7d b/usr/src/man/man7d/i40e.7d
index 0a8019ac9a..082f8b0eeb 100644
--- a/usr/src/man/man7d/i40e.7d
+++ b/usr/src/man/man7d/i40e.7d
@@ -101,6 +101,11 @@ consumer such as
.Xr snoop 1M
or an LLDP daemon is started.
.Pp
+Some properties may be tuned at runtime with the
+.Xr dladm 1M
+utility. Properties that can be will have the name of the dladm property
+called out explicitly.
+.Pp
These properties are not considered stable at this time. They may change
and should not be relied on. They are considered
.Sy Volatile .
@@ -112,7 +117,9 @@ these values.
Minimum:
.Sy 1500 |
Maximum:
-.Sy 9710
+.Sy 9710 |
+Runtime Property:
+.Sy mtu
.Ed
.Bd -filled
The
@@ -232,6 +239,24 @@ for this. Turning it off will increase latency and decrease throughput
when receiving packets, but should be done if a hardware bug is
suspected.
.Ed
+.It Sy rx_dma_threshold
+.Bd -filled -compact
+Minimum:
+.Sy 0 |
+Maximum:
+.Sy INT32_MAX |
+Runtime Property:
+.Sy _rx_dma_threshold
+.Ed
+.Bd -filled
+The
+.Sy rx_dma_threshold
+indicates the size in bytes of a received frame, including all of its
+headers, at which the driver should not copy the frame but instead bind
+DMA memory. By setting this property to its minimum, all frames will be
+processed with DMA binding. By setting this property to its maximum, all
+frames will be processed by copying the frame.
+.Ed
.El
.Sh ARCHITECTURE
The
diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c
index 21a9fff3e1..6fec1fd634 100644
--- a/usr/src/uts/common/io/i40e/i40e_gld.c
+++ b/usr/src/uts/common/io/i40e/i40e_gld.c
@@ -20,6 +20,21 @@
#include "i40e_sw.h"
+#define I40E_PROP_RX_DMA_THRESH "_rx_dma_threshold"
+#define I40E_PROP_TX_DMA_THRESH "_tx_dma_threshold"
+#define I40E_PROP_RX_ITR "_rx_intr_throttle"
+#define I40E_PROP_TX_ITR "_tx_intr_throttle"
+#define I40E_PROP_OTHER_ITR "_other_intr_throttle"
+
+char *i40e_priv_props[] = {
+ I40E_PROP_RX_DMA_THRESH,
+ I40E_PROP_TX_DMA_THRESH,
+ I40E_PROP_RX_ITR,
+ I40E_PROP_TX_ITR,
+ I40E_PROP_OTHER_ITR,
+ NULL
+};
+
static int
i40e_group_remove_mac(void *arg, const uint8_t *mac_addr)
{
@@ -431,24 +446,35 @@ i40e_ring_start(mac_ring_driver_t rh, uint64_t gen_num)
return (0);
}
-/*
- * Because we only support a single ring at this time, we don't support toggling
- * interrupts and polling. When we do, we should simply toggle the interrupt
- * cause enable bit for this and potentially ignore it when looking at the
- * interrupt vector mapping.
- */
/* ARGSUSED */
static int
i40e_rx_ring_intr_enable(mac_intr_handle_t intrh)
{
- return (EINVAL);
+ i40e_trqpair_t *itrq = (i40e_trqpair_t *)intrh;
+ i40e_t *i40e = itrq->itrq_i40e;
+
+ mutex_enter(&i40e->i40e_general_lock);
+ ASSERT(i40e->i40e_intr_poll == B_TRUE);
+ i40e_intr_rx_queue_enable(i40e, itrq->itrq_index);
+ i40e->i40e_intr_poll = B_FALSE;
+ mutex_exit(&i40e->i40e_general_lock);
+
+ return (0);
}
/* ARGSUSED */
static int
i40e_rx_ring_intr_disable(mac_intr_handle_t intrh)
{
- return (EINVAL);
+ i40e_trqpair_t *itrq = (i40e_trqpair_t *)intrh;
+ i40e_t *i40e = itrq->itrq_i40e;
+
+ mutex_enter(&i40e->i40e_general_lock);
+ i40e_intr_rx_queue_disable(i40e, itrq->itrq_index);
+ i40e->i40e_intr_poll = B_TRUE;
+ mutex_exit(&i40e->i40e_general_lock);
+
+ return (0);
}
static void
@@ -599,6 +625,145 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
}
static int
+i40e_m_setprop_private(i40e_t *i40e, const char *pr_name, uint_t pr_valsize,
+    const void *pr_val)
+{
+	int ret;
+	long val;
+	char *eptr;
+
+	ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+
+	if ((ret = ddi_strtol(pr_val, &eptr, 10, &val)) != 0 ||
+	    *eptr != '\0') {
+		return (ret);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) {
+		if (val < I40E_MIN_RX_DMA_THRESH ||
+		    val > I40E_MAX_RX_DMA_THRESH) {
+			return (EINVAL);
+		}
+		i40e->i40e_rx_dma_min = (uint32_t)val;
+		return (0);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) {
+		if (val < I40E_MIN_TX_DMA_THRESH ||
+		    val > I40E_MAX_TX_DMA_THRESH) {
+			return (EINVAL);
+		}
+		i40e->i40e_tx_dma_min = (uint32_t)val;
+		return (0);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) {
+		if (val < I40E_MIN_ITR ||
+		    val > I40E_MAX_ITR) {
+			return (EINVAL);
+		}
+		i40e->i40e_rx_itr = (uint32_t)val;
+		i40e_intr_set_itr(i40e, I40E_ITR_INDEX_RX, i40e->i40e_rx_itr);
+		return (0);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) {
+		if (val < I40E_MIN_ITR ||
+		    val > I40E_MAX_ITR) {
+			return (EINVAL);
+		}
+		i40e->i40e_tx_itr = (uint32_t)val;
+		i40e_intr_set_itr(i40e, I40E_ITR_INDEX_TX, i40e->i40e_tx_itr);
+		return (0);
+	}
+
+	if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) {
+		if (val < I40E_MIN_ITR ||
+		    val > I40E_MAX_ITR) {
+			return (EINVAL);
+		}
+		i40e->i40e_other_itr = (uint32_t)val;
+		i40e_intr_set_itr(i40e, I40E_ITR_INDEX_OTHER,
+		    i40e->i40e_other_itr);
+		return (0);
+	}
+
+	return (ENOTSUP);
+}
+
+static int
+i40e_m_getprop_private(i40e_t *i40e, const char *pr_name, uint_t pr_valsize,
+ void *pr_val)
+{
+ uint32_t val;
+
+ ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+
+ if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) {
+ val = i40e->i40e_rx_dma_min;
+ } else if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) {
+ val = i40e->i40e_tx_dma_min;
+ } else if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) {
+ val = i40e->i40e_rx_itr;
+ } else if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) {
+ val = i40e->i40e_tx_itr;
+ } else if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) {
+ val = i40e->i40e_other_itr;
+ } else {
+ return (ENOTSUP);
+ }
+
+ if (snprintf(pr_val, pr_valsize, "%d", val) >= pr_valsize)
+ return (ERANGE);
+ return (0);
+}
+
+/*
+ * Annoyingly for private properties MAC seems to ignore default values that
+ * aren't strings. That means that we have to translate all of these into
+ * uint32_t's and instead we size the buffer to be large enough to hold a
+ * uint32_t.
+ */
+static void
+i40e_m_propinfo_private(i40e_t *i40e, const char *pr_name,
+ mac_prop_info_handle_t prh)
+{
+ char buf[64];
+ uint32_t def;
+
+ if (strcmp(pr_name, I40E_PROP_RX_DMA_THRESH) == 0) {
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+ def = I40E_DEF_RX_DMA_THRESH;
+ mac_prop_info_set_range_uint32(prh,
+ I40E_MIN_RX_DMA_THRESH,
+ I40E_MAX_RX_DMA_THRESH);
+ } else if (strcmp(pr_name, I40E_PROP_TX_DMA_THRESH) == 0) {
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+ def = I40E_DEF_TX_DMA_THRESH;
+ mac_prop_info_set_range_uint32(prh,
+ I40E_MIN_TX_DMA_THRESH,
+ I40E_MAX_TX_DMA_THRESH);
+ } else if (strcmp(pr_name, I40E_PROP_RX_ITR) == 0) {
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+ def = I40E_DEF_RX_ITR;
+ mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR);
+ } else if (strcmp(pr_name, I40E_PROP_TX_ITR) == 0) {
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+ def = I40E_DEF_TX_ITR;
+ mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR);
+ } else if (strcmp(pr_name, I40E_PROP_OTHER_ITR) == 0) {
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+ def = I40E_DEF_OTHER_ITR;
+ mac_prop_info_set_range_uint32(prh, I40E_MIN_ITR, I40E_MAX_ITR);
+ } else {
+ return;
+ }
+
+ (void) snprintf(buf, sizeof (buf), "%d", def);
+ mac_prop_info_set_default_str(prh, buf);
+}
+
+static int
i40e_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
uint_t pr_valsize, const void *pr_val)
{
@@ -662,6 +827,8 @@ i40e_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
break;
case MAC_PROP_PRIVATE:
+ ret = i40e_m_setprop_private(i40e, pr_name, pr_valsize, pr_val);
+ break;
default:
ret = ENOTSUP;
break;
@@ -775,6 +942,8 @@ i40e_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
*u8 = (i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0;
break;
case MAC_PROP_PRIVATE:
+ ret = i40e_m_getprop_private(i40e, pr_name, pr_valsize, pr_val);
+ break;
default:
ret = ENOTSUP;
break;
@@ -862,6 +1031,8 @@ i40e_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
(i40e->i40e_phy.link_speed & I40E_LINK_SPEED_40GB) != 0);
break;
case MAC_PROP_PRIVATE:
+ i40e_m_propinfo_private(i40e, pr_name, prh);
+ break;
default:
break;
}
@@ -909,7 +1080,7 @@ i40e_register_mac(i40e_t *i40e)
mac->m_min_sdu = 0;
mac->m_max_sdu = i40e->i40e_sdu;
mac->m_margin = VLAN_TAGSZ;
- mac->m_priv_props = NULL;
+ mac->m_priv_props = i40e_priv_props;
mac->m_v12n = MAC_VIRT_LEVEL1;
status = mac_register(mac, &i40e->i40e_mac_hdl);
diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c
index 1d9c3d78b1..9ff64cb74b 100644
--- a/usr/src/uts/common/io/i40e/i40e_intr.c
+++ b/usr/src/uts/common/io/i40e/i40e_intr.c
@@ -119,6 +119,56 @@
* Finally, we still have to set up the interrupt linked list, but the list is
* instead rooted at the register I40E_PFINT_LNKLST0, rather than being tied to
* one of the other MSI-X registers.
+ *
+ * --------------------
+ * Interrupt Moderation
+ * --------------------
+ *
+ * The XL710 hardware has three different interrupt moderation registers per
+ * interrupt. Unsurprisingly, we use these for:
+ *
+ * o RX interrupts
+ * o TX interrupts
+ * o 'Other interrupts' (link status change, admin queue, etc.)
+ *
+ * By default, we throttle 'other interrupts' the most, then TX interrupts, and
+ * then RX interrupts. The default values for these were based on trying to
+ * reason about both the importance and frequency of events. Generally speaking
+ * 'other interrupts' are not very frequent and they're not important for the
+ * I/O data path in and of itself (though they may indicate issues with the I/O
+ * data path).
+ *
+ * On the flip side, when we're not polling, RX interrupts are very important.
+ * The longer we wait for them, the more latency that we inject into the system.
+ * However, if we allow interrupts to occur too frequently, we risk a few
+ * problems:
+ *
+ * 1) Abusing system resources. Without proper interrupt blanking and polling,
+ * we can see upwards of 200k-300k interrupts per second on the system.
+ *
+ * 2) Not enough data coalescing to enable polling. In other words, the more
+ * data that we allow to build up, the more likely we'll be able to enable
+ * polling mode and allowing us to better handle bulk data.
+ *
+ * In-between the 'other interrupts' and the TX interrupts we have the
+ * reclamation of TX buffers. This operation is not quite as important as we
+ * generally size the ring large enough that we should be able to reclaim a
+ * substantial amount of the descriptors that we have used per interrupt. So
+ * while it's important that this interrupt occur, we don't necessarily need it
+ * firing as frequently as RX; it doesn't, on its own, induce additional latency
+ * into the system.
+ *
+ * Based on all this we currently assign static ITR values for the system. While
+ * we could move to a dynamic system (the hardware supports that), we'd want to
+ * make sure that we're seeing problems from this that we believe would be
+ * generally helped by the added complexity.
+ *
+ * Based on this, the default values that we have allow for the following
+ * interrupt thresholds:
+ *
+ * o 20k interrupts/s for RX
+ * o 5k interrupts/s for TX
+ * o 2k interrupts/s for 'Other Interrupts'
*/
#include "i40e_sw.h"
@@ -130,6 +180,30 @@
#define I40E_INTR_NOTX_TX_QUEUE 1
#define I40E_INTR_NOTX_TX_MASK (1 << I40E_PFINT_ICR0_QUEUE_1_SHIFT)
+void
+i40e_intr_set_itr(i40e_t *i40e, i40e_itr_index_t itr, uint_t val)
+{
+ int i;
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+ VERIFY3U(val, <=, I40E_MAX_ITR);
+ VERIFY3U(itr, <, I40E_ITR_INDEX_NONE);
+
+ /*
+ * No matter the interrupt mode, the ITR for other interrupts is always
+ * on interrupt zero and the same is true if we're not using MSI-X.
+ */
+ if (itr == I40E_ITR_INDEX_OTHER ||
+ i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) {
+ I40E_WRITE_REG(hw, I40E_PFINT_ITR0(itr), val);
+ return;
+ }
+
+ for (i = 1; i < i40e->i40e_intr_count; i++) {
+ I40E_WRITE_REG(hw, I40E_PFINT_ITRN(itr, i - 1), val);
+ }
+}
+
/*
* Re-enable the adminq. Note that the adminq doesn't have a traditional queue
* associated with it from an interrupt perspective and just lives on ICR0.
@@ -144,7 +218,7 @@ i40e_intr_adminq_enable(i40e_t *i40e)
reg = I40E_PFINT_DYN_CTL0_INTENA_MASK |
I40E_PFINT_DYN_CTL0_CLEARPBA_MASK |
- (I40E_ITR_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT);
+ (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT);
I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg);
i40e_flush(hw);
}
@@ -155,7 +229,7 @@ i40e_intr_adminq_disable(i40e_t *i40e)
i40e_hw_t *hw = &i40e->i40e_hw_space;
uint32_t reg;
- reg = I40E_ITR_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT;
+ reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT;
I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg);
}
@@ -167,7 +241,7 @@ i40e_intr_io_enable(i40e_t *i40e, int vector)
reg = I40E_PFINT_DYN_CTLN_INTENA_MASK |
I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
- (I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT);
+ (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT);
I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg);
}
@@ -177,7 +251,7 @@ i40e_intr_io_disable(i40e_t *i40e, int vector)
uint32_t reg;
i40e_hw_t *hw = &i40e->i40e_hw_space;
- reg = I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
+ reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg);
}
@@ -326,7 +400,7 @@ i40e_intr_init_queue_msix(i40e_t *i40e)
I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(0), reg);
reg = (1 << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
- (I40E_ITR_NONE << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
+ (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
(0 << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
(I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) |
I40E_QINT_RQCTL_CAUSE_ENA_MASK;
@@ -334,7 +408,7 @@ i40e_intr_init_queue_msix(i40e_t *i40e)
I40E_WRITE_REG(hw, I40E_QINT_RQCTL(0), reg);
reg = (1 << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
- (I40E_ITR_NONE << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
+ (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
(I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
(I40E_QUEUE_TYPE_RX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) |
I40E_QINT_TQCTL_CAUSE_ENA_MASK;
@@ -363,7 +437,7 @@ i40e_intr_init_queue_shared(i40e_t *i40e)
I40E_WRITE_REG(hw, I40E_PFINT_LNKLST0, reg);
reg = (I40E_INTR_NOTX_INTR << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) |
- (I40E_ITR_NONE << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
+ (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) |
(I40E_INTR_NOTX_RX_QUEUE << I40E_QINT_RQCTL_MSIX0_INDX_SHIFT) |
(I40E_INTR_NOTX_QUEUE << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
(I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT);
@@ -371,7 +445,7 @@ i40e_intr_init_queue_shared(i40e_t *i40e)
I40E_WRITE_REG(hw, I40E_QINT_RQCTL(I40E_INTR_NOTX_QUEUE), reg);
reg = (I40E_INTR_NOTX_INTR << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) |
- (I40E_ITR_NONE << I40E_QINT_TQCTL_ITR_INDX_SHIFT) |
+ (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) |
(I40E_INTR_NOTX_TX_QUEUE << I40E_QINT_TQCTL_MSIX0_INDX_SHIFT) |
(I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) |
(I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT);
@@ -380,6 +454,47 @@ i40e_intr_init_queue_shared(i40e_t *i40e)
}
/*
+ * Enable the specified queue as a valid source of interrupts. Note, this should
+ * only be used as part of the GLDv3's interrupt blanking routines. The debug
+ * build assertions are specific to that.
+ */
+void
+i40e_intr_rx_queue_enable(i40e_t *i40e, uint_t queue)
+{
+ uint32_t reg;
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+ ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+ ASSERT(queue < i40e->i40e_num_trqpairs);
+
+ reg = I40E_READ_REG(hw, I40E_QINT_RQCTL(queue));
+ ASSERT0(reg & I40E_QINT_RQCTL_CAUSE_ENA_MASK);
+ reg |= I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+ I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg);
+}
+
+/*
+ * Disable the specified queue as a valid source of interrupts. Note, this
+ * should only be used as part of the GLDv3's interrupt blanking routines. The
+ * debug build assertions are specific to that.
+ */
+void
+i40e_intr_rx_queue_disable(i40e_t *i40e, uint_t queue)
+{
+ uint32_t reg;
+ i40e_hw_t *hw = &i40e->i40e_hw_space;
+
+ ASSERT(MUTEX_HELD(&i40e->i40e_general_lock));
+ ASSERT(queue < i40e->i40e_num_trqpairs);
+
+ reg = I40E_READ_REG(hw, I40E_QINT_RQCTL(queue));
+ ASSERT3U(reg & I40E_QINT_RQCTL_CAUSE_ENA_MASK, ==,
+ I40E_QINT_RQCTL_CAUSE_ENA_MASK);
+ reg &= ~I40E_QINT_RQCTL_CAUSE_ENA_MASK;
+ I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg);
+}
+
+/*
* Start up the various chip's interrupt handling. We not only configure the
* adminq here, but we also go through and configure all of the actual queues,
* the interrupt linked lists, and others.
@@ -395,11 +510,18 @@ i40e_intr_chip_init(i40e_t *i40e)
*/
i40e_intr_io_disable_all(i40e);
- /* First, the adminq. */
I40E_WRITE_REG(hw, I40E_PFINT_ICR0_ENA, 0);
I40E_READ_REG(hw, I40E_PFINT_ICR0);
/*
+ * Always enable all of the other-class interrupts to be on their own
+ * ITR. This only needs to be set on interrupt zero, which has its own
+ * special setting.
+ */
+ reg = I40E_ITR_INDEX_OTHER << I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT;
+ I40E_WRITE_REG(hw, I40E_PFINT_STAT_CTL0, reg);
+
+ /*
* Enable interrupt types we expect to receive. At the moment, this
* is limited to the adminq; however, we'll want to review 11.2.2.9.22
* for more types here as we add support for detecting them, handling
@@ -425,8 +547,15 @@ i40e_intr_chip_init(i40e_t *i40e)
} else {
i40e_intr_init_queue_shared(i40e);
}
-}
+ /*
+ * Finally set all of the default ITRs for the interrupts. Note that the
+ * queues will have been set up above.
+ */
+ i40e_intr_set_itr(i40e, I40E_ITR_INDEX_RX, i40e->i40e_rx_itr);
+ i40e_intr_set_itr(i40e, I40E_ITR_INDEX_TX, i40e->i40e_tx_itr);
+ i40e_intr_set_itr(i40e, I40E_ITR_INDEX_OTHER, i40e->i40e_other_itr);
+}
static void
i40e_intr_adminq_work(i40e_t *i40e)
@@ -548,7 +677,16 @@ i40e_intr_msix(void *arg1, void *arg2)
VERIFY(vector_idx == 1);
- i40e_intr_rx_work(i40e, 0);
+ /*
+ * Note that we explicitly do not check this value under the lock even
+ * though assignments to it are done so. In this case, the cost of
+ * getting this wrong is at worst a bit of additional contention and
+ * even more rarely, a duplicated packet. However, the cost on the other
+ * hand is a lot more. This is something that as we more generally
+ * implement ring support we should revisit.
+ */
+ if (i40e->i40e_intr_poll != B_TRUE)
+ i40e_intr_rx_work(i40e, 0);
i40e_intr_tx_work(i40e, 0);
i40e_intr_io_enable(i40e, 1);
diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c
index 69303a0370..83b3af7e26 100644
--- a/usr/src/uts/common/io/i40e/i40e_main.c
+++ b/usr/src/uts/common/io/i40e/i40e_main.c
@@ -1097,11 +1097,14 @@ i40e_free_trqpairs(i40e_t *i40e)
i40e->i40e_trqpairs = NULL;
}
+ cv_destroy(&i40e->i40e_rx_pending_cv);
+ mutex_destroy(&i40e->i40e_rx_pending_lock);
mutex_destroy(&i40e->i40e_general_lock);
}
/*
- * Allocate receive & transmit rings.
+ * Allocate transmit and receive rings, as well as other data structures that we
+ * need.
*/
static boolean_t
i40e_alloc_trqpairs(i40e_t *i40e)
@@ -1114,6 +1117,8 @@ i40e_alloc_trqpairs(i40e_t *i40e)
* all relevant locks.
*/
mutex_init(&i40e->i40e_general_lock, NULL, MUTEX_DRIVER, mutexpri);
+ mutex_init(&i40e->i40e_rx_pending_lock, NULL, MUTEX_DRIVER, mutexpri);
+ cv_init(&i40e->i40e_rx_pending_cv, NULL, CV_DRIVER, NULL);
i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) *
i40e->i40e_num_trqpairs, KM_SLEEP);
@@ -1527,6 +1532,23 @@ i40e_init_properties(i40e_t *i40e)
i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable",
B_FALSE, B_TRUE, B_TRUE);
+ i40e->i40e_rx_dma_min = i40e_get_prop(i40e, "rx_dma_threshold",
+ I40E_MIN_RX_DMA_THRESH, I40E_MAX_RX_DMA_THRESH,
+ I40E_DEF_RX_DMA_THRESH);
+
+ i40e->i40e_tx_dma_min = i40e_get_prop(i40e, "tx_dma_threshold",
+ I40E_MIN_TX_DMA_THRESH, I40E_MAX_TX_DMA_THRESH,
+ I40E_DEF_TX_DMA_THRESH);
+
+ i40e->i40e_tx_itr = i40e_get_prop(i40e, "tx_intr_throttle",
+ I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_TX_ITR);
+
+ i40e->i40e_rx_itr = i40e_get_prop(i40e, "rx_intr_throttle",
+ I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_RX_ITR);
+
+ i40e->i40e_other_itr = i40e_get_prop(i40e, "other_intr_throttle",
+ I40E_MIN_ITR, I40E_MAX_ITR, I40E_DEF_OTHER_ITR);
+
if (!i40e->i40e_mr_enable) {
i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX;
i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX;
@@ -2567,6 +2589,27 @@ done:
return (rc);
}
+/*
+ * We may have loaned up descriptors to the stack. As such, if we still have
+ * them outstanding, then we will not continue with detach.
+ */
+static boolean_t
+i40e_drain_rx(i40e_t *i40e)
+{
+ mutex_enter(&i40e->i40e_rx_pending_lock);
+ while (i40e->i40e_rx_pending > 0) {
+ if (cv_reltimedwait(&i40e->i40e_rx_pending_cv,
+ &i40e->i40e_rx_pending_lock,
+ drv_usectohz(I40E_DRAIN_RX_WAIT), TR_CLOCK_TICK) == -1) {
+ mutex_exit(&i40e->i40e_rx_pending_lock);
+ return (B_FALSE);
+ }
+ }
+ mutex_exit(&i40e->i40e_rx_pending_lock);
+
+ return (B_TRUE);
+}
+
static int
i40e_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
{
@@ -2712,11 +2755,12 @@ i40e_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
return (DDI_FAILURE);
}
- /*
- * When we add support for DMA binding, we'll need to make sure that we
- * take care of draining any outstanding packets that are still up in
- * the kernel.
- */
+ if (i40e_drain_rx(i40e) == B_FALSE) {
+ i40e_log(i40e, "timed out draining DMA resources, %d buffers "
+ "remain", i40e->i40e_rx_pending);
+ return (DDI_FAILURE);
+ }
+
mutex_enter(&i40e_glock);
list_remove(&i40e_glist, i40e);
mutex_exit(&i40e_glock);
@@ -2726,7 +2770,6 @@ i40e_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
return (DDI_SUCCESS);
}
-
static struct cb_ops i40e_cb_ops = {
nulldev, /* cb_open */
nulldev, /* cb_close */
diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c
index 90b85a1453..c7dd403fc8 100644
--- a/usr/src/uts/common/io/i40e/i40e_stats.c
+++ b/usr/src/uts/common/io/i40e/i40e_stats.c
@@ -1212,6 +1212,13 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq)
kstat_named_init(&tsp->itxs_packets, "tx_packets",
KSTAT_DATA_UINT64);
tsp->itxs_packets.value.ui64 = 0;
+ kstat_named_init(&tsp->itxs_descriptors, "tx_descriptors",
+ KSTAT_DATA_UINT64);
+ tsp->itxs_descriptors.value.ui64 = 0;
+ kstat_named_init(&tsp->itxs_recycled, "tx_recycled",
+ KSTAT_DATA_UINT64);
+ tsp->itxs_recycled.value.ui64 = 0;
+
kstat_named_init(&tsp->itxs_hck_meoifail, "tx_hck_meoifail",
KSTAT_DATA_UINT64);
tsp->itxs_hck_meoifail.value.ui64 = 0;
@@ -1256,6 +1263,12 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq)
kstat_named_init(&rsp->irxs_rx_intr_limit, "rx_intr_limit",
KSTAT_DATA_UINT64);
rsp->irxs_rx_intr_limit.value.ui64 = 0;
+ kstat_named_init(&rsp->irxs_rx_bind_norcb, "rx_bind_norcb",
+ KSTAT_DATA_UINT64);
+ rsp->irxs_rx_bind_norcb.value.ui64 = 0;
+ kstat_named_init(&rsp->irxs_rx_bind_nomp, "rx_bind_nomp",
+ KSTAT_DATA_UINT64);
+ rsp->irxs_rx_bind_nomp.value.ui64 = 0;
kstat_named_init(&rsp->irxs_rx_copy_nomem, "rx_copy_nomem",
KSTAT_DATA_UINT64);
rsp->irxs_rx_copy_nomem.value.ui64 = 0;
diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h
index 26e851176b..077599b237 100644
--- a/usr/src/uts/common/io/i40e/i40e_sw.h
+++ b/usr/src/uts/common/io/i40e/i40e_sw.h
@@ -68,6 +68,7 @@ extern "C" {
#include <sys/fm/io/ddi.h>
#include <sys/list.h>
#include <sys/debug.h>
+#include <sys/sdt.h>
#include "i40e_type.h"
#include "i40e_osdep.h"
#include "i40e_prototype.h"
@@ -124,16 +125,39 @@ extern "C" {
#define I40E_DEF_MTU ETHERMTU
/*
- * Table 1-5 of the PRM notes that LSO supports up to 256 KB.
+ * Interrupt throttling related values. Interrupt throttling values are defined
+ * in two microsecond increments. Note that a value of zero basically says do no
+ * ITR activity. A helpful way to think about these is that setting the ITR to a
+ * value will allow a certain number of interrupts per second.
+ *
+ * Our default values for RX allow 20k interrupts per second while our default
+ * values for TX allow for 5k interrupts per second. For other class interrupts,
+ * we limit ourselves to a rate of 2k/s.
*/
-#define I40E_LSO_MAXLEN (256 * 1024)
+#define I40E_MIN_ITR 0x0000
+#define I40E_MAX_ITR 0x0FF0
+#define I40E_DEF_RX_ITR 0x0019
+#define I40E_DEF_TX_ITR 0x0064
+#define I40E_DEF_OTHER_ITR 0x00FA
+
+/*
+ * Indexes into the three ITR registers that we have.
+ */
+typedef enum i40e_itr_index {
+ I40E_ITR_INDEX_RX = 0x0,
+ I40E_ITR_INDEX_TX = 0x1,
+ I40E_ITR_INDEX_OTHER = 0x2,
+ I40E_ITR_INDEX_NONE = 0x3
+} i40e_itr_index_t;
-#define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */
/*
- * Interrupt rates and ITR logic.
+ * Table 1-5 of the PRM notes that LSO supports up to 256 KB.
*/
-#define I40E_ITR_NONE 0x3
+#define I40E_LSO_MAXLEN (256 * 1024)
+
+#define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */
+#define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */
/*
* All the other queue types for are defined by the common code. However, this
@@ -165,6 +189,20 @@ extern "C" {
#define I40E_DEF_TX_BLOCK_THRESH I40E_MIN_TX_BLOCK_THRESH
/*
+ * Sizing for DMA thresholds. These are used to indicate whether or not we
+ * should perform a bcopy or a DMA binding of a given message block. The range
+ * allows for setting things such that we'll always do a bcopy (a high value) or
+ * always perform a DMA binding (a low value).
+ */
+#define I40E_MIN_RX_DMA_THRESH 0
+#define I40E_DEF_RX_DMA_THRESH 256
+#define I40E_MAX_RX_DMA_THRESH INT32_MAX
+
+#define I40E_MIN_TX_DMA_THRESH 0
+#define I40E_DEF_TX_DMA_THRESH 256
+#define I40E_MAX_TX_DMA_THRESH INT32_MAX
+
+/*
* Resource sizing counts. There are various aspects of hardware where we may
* have some variable number of elements that we need to handle. Such as the
* hardware capabilities and switch capacities. We cannot know a priori how many
@@ -350,6 +388,7 @@ typedef struct i40e_tx_desc i40e_tx_desc_t;
typedef union i40e_32byte_rx_desc i40e_rx_desc_t;
typedef struct i40e_tx_control_block {
+ struct i40e_tx_control_block *tcb_next;
mblk_t *tcb_mp;
i40e_tx_type_t tcb_type;
ddi_dma_handle_t tcb_dma_handle;
@@ -372,11 +411,10 @@ typedef struct i40e_rx_data {
/*
* RX control block list definitions
*/
+ kmutex_t rxd_free_lock; /* Lock to protect free data */
i40e_rx_control_block_t *rxd_rcb_area; /* Array of control blocks */
i40e_rx_control_block_t **rxd_work_list; /* Work list of rcbs */
i40e_rx_control_block_t **rxd_free_list; /* Free list of rcbs */
- uint32_t rxd_rcb_head; /* Index of next free rcb */
- uint32_t rxd_rcb_tail; /* Index to put recycled rcb */
uint32_t rxd_rcb_free; /* Number of free rcbs */
/*
@@ -427,6 +465,8 @@ typedef struct i40e_rxq_stat {
kstat_named_t irxs_rx_desc_error; /* Error bit set on desc */
kstat_named_t irxs_rx_copy_nomem; /* allocb failure for copy */
kstat_named_t irxs_rx_intr_limit; /* Hit i40e_rx_limit_per_intr */
+ kstat_named_t irxs_rx_bind_norcb; /* No replacement rcb free */
+ kstat_named_t irxs_rx_bind_nomp; /* No mblk_t in bind rcb */
/*
* The following set of statistics covers rx checksum related activity.
@@ -449,9 +489,10 @@ typedef struct i40e_rxq_stat {
* Collection of TX Statistics on a given queue
*/
typedef struct i40e_txq_stat {
- kstat_named_t itxs_bytes; /* Bytes out on queue */
- kstat_named_t itxs_packets; /* Packets out on queue */
-
+ kstat_named_t itxs_bytes; /* Bytes out on queue */
+ kstat_named_t itxs_packets; /* Packets out on queue */
+ kstat_named_t itxs_descriptors; /* Descriptors issued */
+ kstat_named_t itxs_recycled; /* Descriptors reclaimed */
/*
* Various failure conditions.
*/
@@ -748,21 +789,33 @@ typedef struct i40e {
i40e_trqpair_t *i40e_trqpairs;
boolean_t i40e_mr_enable;
int i40e_num_trqpairs;
+ uint_t i40e_other_itr;
+
int i40e_num_rx_groups;
int i40e_num_rx_descs;
- int i40e_num_tx_descs;
mac_group_handle_t i40e_rx_group_handle;
uint32_t i40e_rx_ring_size;
uint32_t i40e_rx_buf_size;
boolean_t i40e_rx_hcksum_enable;
+ uint32_t i40e_rx_dma_min;
uint32_t i40e_rx_limit_per_intr;
+ uint_t i40e_rx_itr;
+
+ int i40e_num_tx_descs;
uint32_t i40e_tx_ring_size;
uint32_t i40e_tx_buf_size;
uint32_t i40e_tx_block_thresh;
boolean_t i40e_tx_hcksum_enable;
+ uint32_t i40e_tx_dma_min;
+ uint_t i40e_tx_itr;
/*
* Interrupt state
+ *
+ * Note that the use of a single boolean_t for i40e_intr_poll isn't
+ * really the best design. When we have more than a single ring on the
+ * device working, we'll transition to using something more
+ * sophisticated.
*/
uint_t i40e_intr_pri;
uint_t i40e_intr_force;
@@ -774,6 +827,7 @@ typedef struct i40e {
size_t i40e_intr_size;
ddi_intr_handle_t *i40e_intr_handles;
ddi_cb_handle_t i40e_callback_handle;
+ boolean_t i40e_intr_poll;
/*
* DMA attributes. See i40e_buf.c for why we have copies of them in the
@@ -790,6 +844,7 @@ typedef struct i40e {
* detach as we have active DMA memory outstanding.
*/
kmutex_t i40e_rx_pending_lock;
+ kcondvar_t i40e_rx_pending_cv;
uint32_t i40e_rx_pending;
/*
@@ -867,6 +922,9 @@ extern uint_t i40e_intr_legacy(void *, void *);
extern void i40e_intr_io_enable_all(i40e_t *);
extern void i40e_intr_io_disable_all(i40e_t *);
extern void i40e_intr_io_clear_cause(i40e_t *);
+extern void i40e_intr_rx_queue_disable(i40e_t *, uint_t);
+extern void i40e_intr_rx_queue_enable(i40e_t *, uint_t);
+extern void i40e_intr_set_itr(i40e_t *, i40e_itr_index_t, uint_t);
/*
* Receive-side functions
diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c
index 3c05a7cec3..49739554de 100644
--- a/usr/src/uts/common/io/i40e/i40e_transceiver.c
+++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c
@@ -661,8 +661,6 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
rxd->rxd_ring_size = i40e->i40e_rx_ring_size;
rxd->rxd_free_list_size = i40e->i40e_rx_ring_size;
- rxd->rxd_rcb_head = 0;
- rxd->rxd_rcb_tail = 0;
rxd->rxd_rcb_free = rxd->rxd_free_list_size;
rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
@@ -1069,6 +1067,36 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
}
}
+static void
+i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb)
+{
+ mutex_enter(&rxd->rxd_free_lock);
+ ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size);
+ ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL);
+ rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb;
+ rxd->rxd_rcb_free++;
+ mutex_exit(&rxd->rxd_free_lock);
+}
+
+static i40e_rx_control_block_t *
+i40e_rcb_alloc(i40e_rx_data_t *rxd)
+{
+ i40e_rx_control_block_t *rcb;
+
+ mutex_enter(&rxd->rxd_free_lock);
+ if (rxd->rxd_rcb_free == 0) {
+ mutex_exit(&rxd->rxd_free_lock);
+ return (NULL);
+ }
+ rxd->rxd_rcb_free--;
+ rcb = rxd->rxd_free_list[rxd->rxd_rcb_free];
+ VERIFY(rcb != NULL);
+ rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL;
+ mutex_exit(&rxd->rxd_free_lock);
+
+ return (rcb);
+}
+
/*
* This is the callback that we get from the OS when freemsg(9F) has been called
* on a loaned descriptor. In addition, if we take the last reference count
@@ -1087,11 +1115,27 @@ i40e_rx_recycle(caddr_t arg)
i40e = rxd->rxd_i40e;
/*
- * At the moment this only exists for tearing down, because we don't
- * support rx DMA binding. When we do, this will need to also put things
- * back onto the free list.
+ * It's possible for this to be called with a reference count of zero.
+ * That will happen when we're doing the freemsg after taking the last
+ * reference because we're tearing down everything and this rcb is not
+ * outstanding.
+ */
+ if (rcb->rcb_ref == 0)
+ return;
+
+ /*
+ * Don't worry about failure of desballoc here. It'll only become fatal
+ * if we're trying to use it and we can't in i40e_rx_bind().
*/
+ rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
+ rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
+ i40e_rcb_free(rxd, rcb);
+ /*
+	 * It's possible that the rcb was being used while we were shutting
+	 * down the device. In that case, we drop the device's final
+	 * reference to it here.
+ */
ref = atomic_dec_32_nv(&rcb->rcb_ref);
if (ref == 0) {
freemsg(rcb->rcb_mp);
@@ -1106,13 +1150,68 @@ i40e_rx_recycle(caddr_t arg)
* If this was the last block and it's been indicated that we've
* passed the shutdown point, we should clean up.
*/
- if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0)
+ if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) {
i40e_free_rx_data(rxd);
+ cv_broadcast(&i40e->i40e_rx_pending_cv);
+ }
mutex_exit(&i40e->i40e_rx_pending_lock);
}
}
+static mblk_t *
+i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index,
+ uint32_t plen)
+{
+ mblk_t *mp;
+ i40e_t *i40e = rxd->rxd_i40e;
+ i40e_rx_control_block_t *rcb, *rep_rcb;
+
+ ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock));
+
+ if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) {
+ itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++;
+ return (NULL);
+ }
+
+ rcb = rxd->rxd_work_list[index];
+
+ /*
+	 * Check to make sure we have an mblk_t. If we don't, this is our last
+	 * chance to try to get one.
+ */
+ if (rcb->rcb_mp == NULL) {
+ rcb->rcb_mp =
+ desballoc((unsigned char *)rcb->rcb_dma.dmab_address,
+ rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn);
+ if (rcb->rcb_mp == NULL) {
+ itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++;
+ i40e_rcb_free(rxd, rcb);
+ return (NULL);
+ }
+ }
+
+ I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL);
+
+ if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) {
+ ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED);
+ atomic_or_32(&i40e->i40e_state, I40E_ERROR);
+ i40e_rcb_free(rxd, rcb);
+ return (NULL);
+ }
+
+ /*
+ * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT.
+ */
+ mp = rcb->rcb_mp;
+ atomic_inc_32(&rcb->rcb_ref);
+ mp->b_wptr = mp->b_rptr + plen;
+ mp->b_next = mp->b_cont = NULL;
+
+ rxd->rxd_work_list[index] = rep_rcb;
+ return (mp);
+}
+
/*
* We're going to allocate a new message block for this frame and attempt to
* receive it. See the big theory statement for more information on when we copy
@@ -1372,7 +1471,12 @@ i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes)
break;
rx_bytes += plen;
- mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
+ mp = NULL;
+ if (plen >= i40e->i40e_rx_dma_min)
+ mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
+ if (mp == NULL)
+ mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
+
if (mp != NULL) {
if (i40e->i40e_rx_hcksum_enable)
i40e_rx_hcksum(itrq, mp, stword, error, ptype);
@@ -1829,6 +1933,7 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb)
tcb->tcb_type = I40E_TX_NONE;
freemsg(tcb->tcb_mp);
tcb->tcb_mp = NULL;
+ tcb->tcb_next = NULL;
}
/*
@@ -1884,6 +1989,7 @@ void
i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
{
uint32_t wbhead, toclean, count;
+ i40e_tx_control_block_t *tcbhead;
i40e_t *i40e = itrq->itrq_i40e;
mutex_enter(&itrq->itrq_tx_lock);
@@ -1920,6 +2026,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
wbhead = *itrq->itrq_desc_wbhead;
toclean = itrq->itrq_desc_head;
count = 0;
+ tcbhead = NULL;
while (toclean != wbhead) {
i40e_tx_control_block_t *tcb;
@@ -1927,8 +2034,8 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
tcb = itrq->itrq_tcb_work_list[toclean];
itrq->itrq_tcb_work_list[toclean] = NULL;
ASSERT(tcb != NULL);
- i40e_tcb_reset(tcb);
- i40e_tcb_free(itrq, tcb);
+ tcb->tcb_next = tcbhead;
+ tcbhead = tcb;
/*
* We zero this out for sanity purposes.
@@ -1940,6 +2047,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
itrq->itrq_desc_head = wbhead;
itrq->itrq_desc_free += count;
+ itrq->itrq_txstat.itxs_recycled.value.ui64 += count;
ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size);
if (itrq->itrq_tx_blocked == B_TRUE &&
@@ -1951,6 +2059,19 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
}
mutex_exit(&itrq->itrq_tx_lock);
+
+ /*
+	 * Now that we've dropped the tx ring lock, reset and free the tcbs.
+ */
+ while (tcbhead != NULL) {
+ i40e_tx_control_block_t *tcb = tcbhead;
+
+ tcbhead = tcb->tcb_next;
+ i40e_tcb_reset(tcb);
+ i40e_tcb_free(itrq, tcb);
+ }
+
+ DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
}
/*
@@ -2116,6 +2237,7 @@ i40e_ring_tx(void *arg, mblk_t *mp)
txs->itxs_bytes.value.ui64 += mpsize;
txs->itxs_packets.value.ui64++;
+ txs->itxs_descriptors.value.ui64++;
mutex_exit(&itrq->itrq_tx_lock);