author    Rob Johnston <rob.johnston@joyent.com>  2018-04-10 01:46:32 +0000
committer Rob Johnston <rob.johnston@joyent.com>  2018-05-31 18:33:17 +0000
commit    9e30beee2f0c127bf41868db46257124206e28d6
tree      d4a7f7c8f8743bbfdb1a76d71727c2f01e8c5489
parent    d3cb756c75a7f0c43387251324eaa198e59f60a0
OS-5225 Want Fortville TSO support
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Patrick Mooney <patrick.mooney@joyent.com>
-rw-r--r--  usr/src/man/man7d/i40e.7d                      |  20
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_gld.c          |  12
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_main.c         |   8
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_stats.c        |   5
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_sw.h           |  42
-rw-r--r--  usr/src/uts/common/io/i40e/i40e_transceiver.c  | 657
6 files changed, 571 insertions, 173 deletions
diff --git a/usr/src/man/man7d/i40e.7d b/usr/src/man/man7d/i40e.7d
index 2d8a2da45b..f025fba01a 100644
--- a/usr/src/man/man7d/i40e.7d
+++ b/usr/src/man/man7d/i40e.7d
@@ -9,9 +9,9 @@
.\" http://www.illumos.org/license/CDDL.
.\"
.\"
-.\" Copyright (c) 2017 Joyent, Inc.
+.\" Copyright (c) 2018 Joyent, Inc.
.\"
-.Dd September 8, 2017
+.Dd May 23, 2018
.Dt I40E 7D
.Os
.Sh NAME
@@ -273,6 +273,22 @@ binding.
By setting this property to its maximum, all frames will be processed by copying
the frame.
.Ed
+.It Sy tx_lso_enable
+.Bd -filled -compact
+Minimum:
+.Sy 0 |
+Maximum:
+.Sy 1
+.Ed
+.Bd -filled
+The
+.Sy tx_lso_enable
+property controls whether or not the device enables support for Large Segment
+Offload (LSO) when transmitting packets.
+The default is to always enable support for this.
+Turning it off will decrease throughput when transmitting packets, but should
+be done if a hardware bug is suspected.
+.Ed
.El
.Sh ARCHITECTURE
The
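A hedged usage note, not part of the change itself: following the pattern of the other tunables documented in this manual page, disabling LSO while chasing a suspected hardware bug would presumably amount to adding the line below to the driver's i40e.conf configuration file and then reloading the driver or rebooting:

    tx_lso_enable = 0;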
diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c
index d34057d64f..e2a5ef1541 100644
--- a/usr/src/uts/common/io/i40e/i40e_gld.c
+++ b/usr/src/uts/common/io/i40e/i40e_gld.c
@@ -732,6 +732,18 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
break;
}
+ case MAC_CAPAB_LSO: {
+ mac_capab_lso_t *cap_lso = cap_data;
+
+ if (i40e->i40e_tx_lso_enable == B_TRUE) {
+ cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
+ cap_lso->lso_basic_tcp_ipv4.lso_max = I40E_LSO_MAXLEN;
+ } else {
+ return (B_FALSE);
+ }
+ break;
+ }
+
case MAC_CAPAB_RINGS:
cap_rings = cap_data;
cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
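A hedged illustration (not part of this commit) of how a caller of the mc_getcapab entry point above could check for basic TCP/IPv4 LSO support and retrieve the largest LSO packet the driver will accept; it assumes the MAC framework definitions already pulled in by the driver's headers, and i40e_m_getcapab is of course static in practice:

static uint32_t
example_query_lso_max(void *arg)
{
	mac_capab_lso_t lso;

	bzero(&lso, sizeof (lso));
	if (i40e_m_getcapab(arg, MAC_CAPAB_LSO, &lso) == B_TRUE &&
	    (lso.lso_flags & LSO_TX_BASIC_TCP_IPV4) != 0)
		return (lso.lso_basic_tcp_ipv4.lso_max);

	return (0);
}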
diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c
index 263f99dfdb..c15acbb265 100644
--- a/usr/src/uts/common/io/i40e/i40e_main.c
+++ b/usr/src/uts/common/io/i40e/i40e_main.c
@@ -11,7 +11,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright (c) 2018, Joyent, Inc.
* Copyright 2017 Tegile Systems, Inc. All rights reserved.
*/
@@ -359,7 +359,6 @@
* While bugs have been filed to cover this future work, the following gives an
* overview of expected work:
*
- * o TSO support
* o Multiple group support
* o DMA binding and breaking up the locking in ring recycling.
* o Enhanced detection of device errors
@@ -371,7 +370,7 @@
#include "i40e_sw.h"
-static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.1";
+static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.2";
/*
* The i40e_glock primarily protects the lists below and the i40e_device_t
@@ -1559,6 +1558,9 @@ i40e_init_properties(i40e_t *i40e)
i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable",
B_FALSE, B_TRUE, B_TRUE);
+ i40e->i40e_tx_lso_enable = i40e_get_prop(i40e, "tx_lso_enable",
+ B_FALSE, B_TRUE, B_TRUE);
+
i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable",
B_FALSE, B_TRUE, B_TRUE);
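A minimal sketch of what a helper like i40e_get_prop presumably does with the (minimum, maximum, default) arguments shown above; this is an assumption for illustration, not the commit's actual implementation, built on the standard DDI property lookup:

#include <sys/ddi.h>
#include <sys/sunddi.h>

/*
 * Hypothetical helper: look up "prop" from the instance's i40e.conf file,
 * fall back to "def" when the property is not set, and clamp the result
 * to the [min, max] range.
 */
static int
example_get_prop(dev_info_t *dip, char *prop, int min, int max, int def)
{
	int val;

	/* DDI_PROP_DONTPASS: do not search parent nodes for the property. */
	val = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    prop, def);

	if (val < min)
		val = min;
	if (val > max)
		val = max;

	return (val);
}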
diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c
index 7a4f0faedd..810ccedd8f 100644
--- a/usr/src/uts/common/io/i40e/i40e_stats.c
+++ b/usr/src/uts/common/io/i40e/i40e_stats.c
@@ -11,7 +11,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include "i40e_sw.h"
@@ -1249,6 +1249,9 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq)
kstat_named_init(&tsp->itxs_hck_badl4, "tx_hck_badl4",
KSTAT_DATA_UINT64);
tsp->itxs_hck_badl4.value.ui64 = 0;
+ kstat_named_init(&tsp->itxs_lso_nohck, "tx_lso_nohck",
+ KSTAT_DATA_UINT64);
+ tsp->itxs_lso_nohck.value.ui64 = 0;
kstat_named_init(&tsp->itxs_err_notcb, "tx_err_notcb",
KSTAT_DATA_UINT64);
tsp->itxs_err_notcb.value.ui64 = 0;
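A hedged observability note (not part of the change): once the driver is running, the new per-queue counter initialized above should be readable with kstat(1M); the exact queue kstat naming is not shown in this diff, so filtering on the statistic name is the simplest check:

    kstat -p -m i40e | grep tx_lso_nohck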
diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h
index 78aced0144..5411be3d83 100644
--- a/usr/src/uts/common/io/i40e/i40e_sw.h
+++ b/usr/src/uts/common/io/i40e/i40e_sw.h
@@ -11,7 +11,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright (c) 2018, Joyent, Inc.
* Copyright 2017 Tegile Systems, Inc. All rights reserved.
*/
@@ -152,9 +152,10 @@ typedef enum i40e_itr_index {
} i40e_itr_index_t;
/*
- * Table 1-5 of the PRM notes that LSO supports up to 256 KB.
+ * The hardware claims to support LSO up to 256 KB, but the 16-bit length field
+ * in the IP header limits non-jumbo frames to 64 KB, so we cap it there.
*/
-#define I40E_LSO_MAXLEN (256 * 1024)
+#define I40E_LSO_MAXLEN (64 * 1024)
#define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */
#define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */
@@ -173,13 +174,22 @@ typedef enum i40e_itr_index {
#define I40E_BUF_IPHDR_ALIGNMENT 2
/*
- * The XL710 controller has a limit of eight buffers being allowed to be used
- * for the transmission of a single frame. This is defined in 8.4.1 - Transmit
+ * The XL710 controller has a total of eight buffers available for the
+ * transmission of any single frame. This is defined in 8.4.1 - Transmit
* Packet in System Memory.
*/
#define I40E_TX_MAX_COOKIE 8
/*
+ * An LSO frame can be as large as 64KB, so we allow a DMA bind to span more
+ * cookies than a non-LSO frame. The key here is to select a value such
+ * that once the HW has chunked up the LSO frame into MSS-sized segments,
+ * no single segment spans more than 8 cookies (see the comment for
+ * I40E_TX_MAX_COOKIE).
+ */
+#define I40E_TX_LSO_MAX_COOKIE 32
+
+/*
* Sizing to determine the amount of available descriptors at which we'll
* consider ourselves blocked. Also, when we have these available, we'll then
* consider ourselves available to transmit to MAC again. Strictly speaking, the
@@ -203,6 +213,12 @@ typedef enum i40e_itr_index {
#define I40E_MAX_TX_DMA_THRESH INT32_MAX
/*
+ * The max size of each individual tx buffer is 16KB - 1.
+ * See table 8-17
+ */
+#define I40E_MAX_TX_BUFSZ 0x0000000000003FFFull
+
+/*
* Resource sizing counts. There are various aspects of hardware where we may
* have some variable number of elements that we need to handle. Such as the
* hardware capabilities and switch capacities. We cannot know a priori how many
@@ -405,18 +421,29 @@ typedef struct i40e_rx_control_block {
typedef enum {
I40E_TX_NONE,
I40E_TX_COPY,
- I40E_TX_DMA
+ I40E_TX_DMA,
+ I40E_TX_DESC,
} i40e_tx_type_t;
typedef struct i40e_tx_desc i40e_tx_desc_t;
+typedef struct i40e_tx_context_desc i40e_tx_context_desc_t;
typedef union i40e_32byte_rx_desc i40e_rx_desc_t;
+struct i40e_dma_bind_info {
+ caddr_t dbi_paddr;
+ size_t dbi_len;
+};
+
typedef struct i40e_tx_control_block {
struct i40e_tx_control_block *tcb_next;
mblk_t *tcb_mp;
i40e_tx_type_t tcb_type;
ddi_dma_handle_t tcb_dma_handle;
+ ddi_dma_handle_t tcb_lso_dma_handle;
i40e_dma_buffer_t tcb_dma;
+ struct i40e_dma_bind_info *tcb_bind_info;
+ uint_t tcb_bind_ncookies;
+ boolean_t tcb_used_lso;
} i40e_tx_control_block_t;
/*
@@ -526,6 +553,7 @@ typedef struct i40e_txq_stat {
kstat_named_t itxs_hck_nol4info; /* Missing l4 info */
kstat_named_t itxs_hck_badl3; /* Not IPv4/IPv6 */
kstat_named_t itxs_hck_badl4; /* Bad L4 Payload */
+ kstat_named_t itxs_lso_nohck; /* Missing offloads for LSO */
kstat_named_t itxs_err_notcb; /* No tcb's available */
kstat_named_t itxs_err_nodescs; /* No tcb's available */
@@ -832,6 +860,7 @@ typedef struct i40e {
uint32_t i40e_tx_buf_size;
uint32_t i40e_tx_block_thresh;
boolean_t i40e_tx_hcksum_enable;
+ boolean_t i40e_tx_lso_enable;
uint32_t i40e_tx_dma_min;
uint_t i40e_tx_itr;
@@ -855,6 +884,7 @@ typedef struct i40e {
*/
ddi_dma_attr_t i40e_static_dma_attr;
ddi_dma_attr_t i40e_txbind_dma_attr;
+ ddi_dma_attr_t i40e_txbind_lso_dma_attr;
ddi_device_acc_attr_t i40e_desc_acc_attr;
ddi_device_acc_attr_t i40e_buf_acc_attr;
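A compile-time sanity sketch, illustrative only and not part of this commit (it assumes CTASSERT from <sys/debug.h>), tying the three new limits together: 32 cookies of at most 16 KB each comfortably cover the 64 KB LSO maximum.

#include <sys/debug.h>

/* 32 cookies x 16 KB per cookie (I40E_MAX_TX_BUFSZ + 1) covers 64 KB of LSO. */
CTASSERT(I40E_LSO_MAXLEN <=
    I40E_TX_LSO_MAX_COOKIE * (I40E_MAX_TX_BUFSZ + 1));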
diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c
index 75132e27f0..cb150545ea 100644
--- a/usr/src/uts/common/io/i40e/i40e_transceiver.c
+++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c
@@ -11,7 +11,7 @@
/*
* Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include "i40e_sw.h"
@@ -60,19 +60,19 @@
* This size is then rounded up to the nearest 1k chunk, which represents the
* actual amount of memory that we'll allocate for a single frame.
*
- * Note, that for rx, we do something that might be unexpected. We always add
+ * Note, that for RX, we do something that might be unexpected. We always add
* an extra two bytes to the frame size that we allocate. We then offset the DMA
* address that we receive a packet into by two bytes. This ensures that the IP
* header will always be 4 byte aligned because the MAC header is either 14 or
* 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's
* and MAC's lives easier.
*
- * Both the rx and tx descriptor rings (which are what we use to communicate
+ * Both the RX and TX descriptor rings (which are what we use to communicate
* with hardware) are allocated as a single region of DMA memory which is the
* size of the descriptor (4 bytes and 2 bytes respectively) times the total
- * number of descriptors for an rx and tx ring.
+ * number of descriptors for an RX and TX ring.
*
- * While the rx and tx descriptors are allocated using DMA-based memory, the
+ * While the RX and TX descriptors are allocated using DMA-based memory, the
* control blocks for each of them are allocated using normal kernel memory.
* They aren't special from a DMA perspective. We'll go over the design of both
* receiving and transmitting separately, as they have slightly different
@@ -113,16 +113,16 @@
*
* To try and ensure that the device always has blocks that it can receive data
* into, we maintain two lists of control blocks, a working list and a free
- * list. Each list is sized equal to the number of descriptors in the rx ring.
- * During the GLDv3 mc_start routine, we allocate a number of rx control blocks
+ * list. Each list is sized equal to the number of descriptors in the RX ring.
+ * During the GLDv3 mc_start routine, we allocate a number of RX control blocks
* equal to twice the number of descriptors in the ring and we assign them
* equally to the free list and to the working list. Each control block also has
* DMA memory allocated and associated with which it will be used to receive the
* actual packet data. All of a received frame's data will end up in a single
* DMA buffer.
*
- * During operation, we always maintain the invariant that each rx descriptor
- * has an associated rx control block which lives in the working list. If we
+ * During operation, we always maintain the invariant that each RX descriptor
+ * has an associated RX control block which lives in the working list. If we
* feel that we should loan up DMA memory to MAC in the form of a message block,
* we can only do so if we can maintain this invariant. To do that, we swap in
* one of the buffers from the free list. If none are available, then we resort
@@ -130,14 +130,14 @@
* size.
*
* Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is
- * called on the block, at which point we restore the rx control block to the
+ * called on the block, at which point we restore the RX control block to the
* free list and are able to reuse the DMA memory again. While the scheme may
* seem odd, it importantly keeps us out of trying to do any DMA allocations in
* the normal path of operation, even though we may still have to allocate
* message blocks and copy.
*
- * The following state machine describes the life time of a rx control block. In
- * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx
+ * The following state machine describes the lifetime of an RX control block. In
+ * the diagram we abbreviate the RX ring descriptor entry as rxd and the rx
* control block entry as rcb.
*
* | |
@@ -160,11 +160,11 @@
* +--------------------<-----| rcb loaned to MAC |
* +-------------------+
*
- * Finally, note that every rx control block has a reference count on it. One
+ * Finally, note that every RX control block has a reference count on it. One
* reference is added as long as the driver has had the GLDv3 mc_start endpoint
* called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and
* no other DLPI consumers remain, then we'll decrement the reference count by
- * one. Whenever we loan up the rx control block and associated buffer to MAC,
+ * one. Whenever we loan up the RX control block and associated buffer to MAC,
* then we bump the reference count again. Even though the device is stopped,
* there may still be loaned frames in upper levels that we'll want to account
* for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure
@@ -192,10 +192,10 @@
* state tracking. Effectively, we cache the HEAD register and then update it
* ourselves based on our work.
*
- * When we iterate over the rx descriptors and thus the received frames, we are
+ * When we iterate over the RX descriptors and thus the received frames, we are
* either in an interrupt context or we've been asked by MAC to poll on the
* ring. If we've been asked to poll on the ring, we have a maximum number of
- * bytes of mblk_t's to return. If processing an rx descriptor would cause us to
+ * bytes of mblk_t's to return. If processing an RX descriptor would cause us to
* exceed that count, then we do not process it. When in interrupt context, we
* don't have a strict byte count. However, to ensure liveness, we limit the
* amount of data based on a configuration value
@@ -249,31 +249,44 @@
* differently due to the fact that all data is originated by the operating
* system and not by the device.
*
- * Like rx, there is both a descriptor ring that we use to communicate to the
- * driver and which points to the memory used to transmit a frame. Similarly,
- * there is a corresponding transmit control block. Each transmit control block
- * has a region of DMA memory allocated to it; however, the way we use it
- * varies.
+ * Like RX, there is both a descriptor ring that we use to communicate to the
+ * driver and which points to the memory used to transmit a frame. Similarly,
+ * there is a corresponding transmit control block; however, the correspondence
+ * between descriptors and control blocks is more complex and not necessarily
+ * 1-to-1.
*
* The driver is asked to process a single frame at a time. That message block
* may be made up of multiple fragments linked together by the mblk_t`b_cont
* member. The device has a hard limit of up to 8 buffers being allowed for use
* for a single logical frame. For each fragment, we'll try and use an entry
- * from the tx descriptor ring and then we'll allocate a corresponding tx
- * control block. Depending on the size of the fragment, we may copy it around
- * or we might instead try to do DMA binding of the fragment.
- *
- * If we exceed the number of blocks that fit, we'll try to pull up the block
- * and then we'll do a DMA bind and send it out.
- *
- * If we don't have enough space in the ring or tx control blocks available,
+ * from the TX descriptor ring and then we'll allocate a corresponding TX
+ * control block.
+ *
+ * We alter our DMA strategy based on a threshold tied to the frame size.
+ * This threshold is configurable via the tx_dma_threshold property. If the
+ * frame size is above the threshold, we do DMA binding of the fragments,
+ * building a control block and data descriptor for each piece. If it's below
+ * or at the threshold then we just use a single control block and data
+ * descriptor and simply bcopy all of the fragments into the pre-allocated DMA
+ * buffer in the control block. For the LSO TX case we always do DMA binding of
+ * the fragments, with one control block and one TX data descriptor allocated
+ * per fragment.
+ *
+ * Furthermore, if the frame requires HW offloads such as LSO, tunneling or
+ * filtering, then the TX data descriptors must be preceded by a single TX
+ * context descriptor. Because there is no DMA transfer associated with the
+ * context descriptor, we allocate a control block with a special type which
+ * indicates to the TX ring recycle code that there are no associated DMA
+ * resources to unbind when the control block is free'd.
+ *
+ * If we don't have enough space in the ring or TX control blocks available,
* then we'll return the unprocessed message block to MAC. This will induce flow
* control and once we recycle enough entries, we'll once again enable sending
* on the ring.
*
* We size the working list as equal to the number of descriptors in the ring.
* We size the free list as equal to 1.5 times the number of descriptors in the
- * ring. We'll allocate a number of tx control block entries equal to the number
+ * ring. We'll allocate a number of TX control block entries equal to the number
* of entries in the free list. By default, all entries are placed in the free
* list. As we come along and try to send something, we'll allocate entries from
* the free list and add them to the working list, where they'll stay until the
@@ -325,7 +338,7 @@
* +------------------+ +------------------+
* | tcb on free list |---*------------------>| tcb on work list |
* +------------------+ . +------------------+
- * ^ . tcb allocated |
+ * ^ . N tcbs allocated[1] |
* | to send frame v
* | or fragment on |
* | wire, mblk from |
@@ -335,20 +348,27 @@
* .
* . Hardware indicates
* entry transmitted.
- * tcb recycled, mblk
+ * tcbs recycled, mblk
* from MAC freed.
*
+ * [1] We allocate N tcbs to transmit a single frame. In the non-DMA-bind
+ * case, N can be one tcb for the context descriptor plus one for the data
+ * descriptor. In the DMA-bind case, N can be one tcb for the context
+ * descriptor plus one per b_cont in the mblk; there the mblk is associated
+ * with the first data tcb and freed as part of freeing that tcb.
+ *
* ------------
* Blocking MAC
* ------------
*
- * Wen performing transmit, we can run out of descriptors and ring entries. When
- * such a case happens, we return the mblk_t to MAC to indicate that we've been
- * blocked. At that point in time, MAC becomes blocked and will not transmit
- * anything out that specific ring until we notify MAC. To indicate that we're
- * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE.
+ * When performing transmit, we can run out of descriptors and ring entries.
+ * When such a case happens, we return the mblk_t to MAC to indicate that we've
+ * been blocked. At that point in time, MAC becomes blocked and will not
+ * transmit anything out that specific ring until we notify MAC. To indicate
+ * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member
+ * to B_TRUE.
*
- * When we recycle tx descriptors then we'll end up signaling MAC by calling
+ * When we recycle TX descriptors then we'll end up signaling MAC by calling
* mac_tx_ring_update() if we were blocked, letting it know that it's safe to
* start sending frames out to us again.
*/
@@ -367,13 +387,15 @@
/*
* This structure is used to maintain information and flags related to
- * transmitting a frame. The first member is the set of flags we need to or into
- * the command word (generally checksumming related). The second member controls
- * the word offsets which is required for IP and L4 checksumming.
+ * transmitting a frame. These fields are ultimately used to construct the
+ * TX data descriptor(s) and, if necessary, the TX context descriptor.
*/
typedef struct i40e_tx_context {
- enum i40e_tx_desc_cmd_bits itc_cmdflags;
- uint32_t itc_offsets;
+ enum i40e_tx_desc_cmd_bits itc_data_cmdflags;
+ uint32_t itc_data_offsets;
+ enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags;
+ uint32_t itc_ctx_tsolen;
+ uint32_t itc_ctx_mss;
} i40e_tx_context_t;
/*
@@ -395,14 +417,18 @@ i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT;
* i40e_static_dma_attr, is designed to be used for both the descriptor rings
* and the static buffers that we associate with control blocks. For this
* reason, we force an SGL length of one. While technically the driver supports
- * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our
+ * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our
* management here. In addition, when the Intel common code wants to allocate
* memory via the i40e_allocate_virt_mem osdep function, we have it leverage
* the static dma attr.
*
- * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're
- * binding a bunch of mblk_t fragments to go out the door. Note that the main
- * difference here is that we're allowed a larger SGL length -- eight.
+ * The latter two sets of attributes are what we use when we're binding a
+ * bunch of mblk_t fragments to go out the door. Note that the main difference
+ * here is that we're allowed a larger SGL length. For non-LSO TX, we
+ * restrict the SGL length to match the number of TX buffers available to the
+ * PF (8). For the LSO case we can go much larger, with the caveat that each
+ * MSS-sized chunk (segment) must not span more than 8 data descriptors and
+ * hence must not span more than 8 cookies.
*
* Note, we default to setting ourselves to be DMA capable here. However,
* because we could have multiple instances which have different FMA error
@@ -429,7 +455,7 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
DMA_ATTR_V0, /* version number */
0x0000000000000000ull, /* low address */
0xFFFFFFFFFFFFFFFFull, /* high address */
- 0x00000000FFFFFFFFull, /* dma counter max */
+ I40E_MAX_TX_BUFSZ, /* dma counter max */
I40E_DMA_ALIGNMENT, /* alignment */
0x00000FFF, /* burst sizes */
0x00000001, /* minimum transfer size */
@@ -440,6 +466,21 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = {
DDI_DMA_FLAGERR /* DMA flags */
};
+static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = {
+ DMA_ATTR_V0, /* version number */
+ 0x0000000000000000ull, /* low address */
+ 0xFFFFFFFFFFFFFFFFull, /* high address */
+ I40E_MAX_TX_BUFSZ, /* dma counter max */
+ I40E_DMA_ALIGNMENT, /* alignment */
+ 0x00000FFF, /* burst sizes */
+ 0x00000001, /* minimum transfer size */
+ 0x00000000FFFFFFFFull, /* maximum transfer size */
+ 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */
+ I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */
+ 0x00000001, /* granularity */
+ DDI_DMA_FLAGERR /* DMA flags */
+};
+
/*
* Next, we have the attributes for these structures. The descriptor rings are
* all strictly little endian, while the data buffers are just arrays of bytes
@@ -668,7 +709,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
rxd->rxd_ring_size, KM_NOSLEEP);
if (rxd->rxd_work_list == NULL) {
- i40e_error(i40e, "failed to allocate rx work list for a ring "
+ i40e_error(i40e, "failed to allocate RX work list for a ring "
"of %d entries for ring %d", rxd->rxd_ring_size,
itrq->itrq_index);
goto cleanup;
@@ -677,7 +718,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq)
rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) *
rxd->rxd_free_list_size, KM_NOSLEEP);
if (rxd->rxd_free_list == NULL) {
- i40e_error(i40e, "failed to allocate a %d entry rx free list "
+ i40e_error(i40e, "failed to allocate a %d entry RX free list "
"for ring %d", rxd->rxd_free_list_size, itrq->itrq_index);
goto cleanup;
}
@@ -765,7 +806,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
i40e_t *i40e = rxd->rxd_i40e;
/*
- * First allocate the rx descriptor ring.
+ * First allocate the RX descriptor ring.
*/
dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size;
VERIFY(dmasz > 0);
@@ -773,7 +814,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
&i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE,
B_TRUE, dmasz) == B_FALSE) {
i40e_error(i40e, "failed to allocate DMA resources "
- "for rx descriptor ring");
+ "for RX descriptor ring");
return (B_FALSE);
}
rxd->rxd_desc_ring =
@@ -799,7 +840,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd)
if (i40e_alloc_dma_buffer(i40e, dmap,
&i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
B_TRUE, B_FALSE, dmasz) == B_FALSE) {
- i40e_error(i40e, "failed to allocate rx dma buffer");
+ i40e_error(i40e, "failed to allocate RX dma buffer");
return (B_FALSE);
}
@@ -841,6 +882,10 @@ i40e_free_tx_dma(i40e_trqpair_t *itrq)
ddi_dma_free_handle(&tcb->tcb_dma_handle);
tcb->tcb_dma_handle = NULL;
}
+ if (tcb->tcb_lso_dma_handle != NULL) {
+ ddi_dma_free_handle(&tcb->tcb_lso_dma_handle);
+ tcb->tcb_lso_dma_handle = NULL;
+ }
}
fsz = sizeof (i40e_tx_control_block_t) *
@@ -881,7 +926,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
(i40e->i40e_tx_ring_size >> 1);
/*
- * Allocate an additional tx descriptor for the writeback head.
+ * Allocate an additional TX descriptor for the writeback head.
*/
dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size;
dmasz += sizeof (i40e_tx_desc_t);
@@ -890,7 +935,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area,
&i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr,
B_FALSE, B_TRUE, dmasz) == B_FALSE) {
- i40e_error(i40e, "failed to allocate DMA resources for tx "
+ i40e_error(i40e, "failed to allocate DMA resources for TX "
"descriptor ring");
return (B_FALSE);
}
@@ -905,7 +950,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size *
sizeof (i40e_tx_control_block_t *), KM_NOSLEEP);
if (itrq->itrq_tcb_work_list == NULL) {
- i40e_error(i40e, "failed to allocate a %d entry tx work list "
+ i40e_error(i40e, "failed to allocate a %d entry TX work list "
"for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index);
goto cleanup;
}
@@ -913,14 +958,14 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size *
sizeof (i40e_tx_control_block_t *), KM_SLEEP);
if (itrq->itrq_tcb_free_list == NULL) {
- i40e_error(i40e, "failed to allocate a %d entry tx free list "
+ i40e_error(i40e, "failed to allocate a %d entry TX free list "
"for ring %d", itrq->itrq_tx_free_list_size,
itrq->itrq_index);
goto cleanup;
}
/*
- * We allocate enough tx control blocks to cover the free list.
+ * We allocate enough TX control blocks to cover the free list.
*/
itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) *
itrq->itrq_tx_free_list_size, KM_NOSLEEP);
@@ -948,18 +993,29 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq)
&i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL,
&tcb->tcb_dma_handle);
if (ret != DDI_SUCCESS) {
- i40e_error(i40e, "failed to allocate DMA handle for tx "
+ i40e_error(i40e, "failed to allocate DMA handle for TX "
"data binding on ring %d: %d", itrq->itrq_index,
ret);
tcb->tcb_dma_handle = NULL;
goto cleanup;
}
+ ret = ddi_dma_alloc_handle(i40e->i40e_dip,
+ &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL,
+ &tcb->tcb_lso_dma_handle);
+ if (ret != DDI_SUCCESS) {
+ i40e_error(i40e, "failed to allocate DMA handle for TX "
+ "LSO data binding on ring %d: %d", itrq->itrq_index,
+ ret);
+ tcb->tcb_lso_dma_handle = NULL;
+ goto cleanup;
+ }
+
if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma,
&i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr,
B_TRUE, B_FALSE, dmasz) == B_FALSE) {
i40e_error(i40e, "failed to allocate %ld bytes of "
- "DMA for tx data binding on ring %d", dmasz,
+ "DMA for TX data binding on ring %d", dmasz,
itrq->itrq_index);
goto cleanup;
}
@@ -989,10 +1045,10 @@ i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init)
i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata;
/*
- * Clean up our rx data. We have to free DMA resources first and
+ * Clean up our RX data. We have to free DMA resources first and
* then if we have no more pending RCB's, then we'll go ahead
* and clean things up. Note, we can't set the stopped flag on
- * the rx data until after we've done the first pass of the
+ * the RX data until after we've done the first pass of the
* pending resources. Otherwise we might race with
* i40e_rx_recycle on determining who should free the
* i40e_rx_data_t above.
@@ -1055,6 +1111,8 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
sizeof (ddi_dma_attr_t));
bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr,
sizeof (ddi_dma_attr_t));
+ bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr,
+ sizeof (ddi_dma_attr_t));
bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr,
sizeof (ddi_device_acc_attr_t));
bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr,
@@ -1063,9 +1121,13 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma)
if (fma == B_TRUE) {
i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
+ i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |=
+ DDI_DMA_FLAGERR;
} else {
i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
+ i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &=
+ ~DDI_DMA_FLAGERR;
}
}
@@ -1102,7 +1164,7 @@ i40e_rcb_alloc(i40e_rx_data_t *rxd)
/*
* This is the callback that we get from the OS when freemsg(9F) has been called
* on a loaned descriptor. In addition, if we take the last reference count
- * here, then we have to tear down all of the rx data.
+ * here, then we have to tear down all of the RX data.
*/
void
i40e_rx_recycle(caddr_t arg)
@@ -1768,16 +1830,19 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
* to properly program the hardware for checksum offload as well as the
* generally required flags.
*
- * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or
- * into the descriptor based on the checksum flags for this mblk_t and the
+ * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
+ * 'or' into the descriptor based on the checksum flags for this mblk_t and the
* actual information we care about.
+ *
+ * If the mblk requires LSO then we'll also gather the information that will be
+ * used to construct the Transmit Context Descriptor.
*/
static int
i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
i40e_tx_context_t *tctx)
{
int ret;
- uint32_t flags, start;
+ uint32_t chkflags, start, mss, lsoflags;
mac_ether_offload_info_t meo;
i40e_txq_stat_t *txs = &itrq->itrq_txstat;
@@ -1786,8 +1851,10 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
if (i40e->i40e_tx_hcksum_enable != B_TRUE)
return (0);
- mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
- if (flags == 0)
+ mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
+ mac_lso_get(mp, &mss, &lsoflags);
+
+ if (chkflags == 0 && lsoflags == 0)
return (0);
if ((ret = mac_ether_offload_info(mp, &meo)) != 0) {
@@ -1800,7 +1867,7 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
* have sufficient information and then set the proper fields in the
* command structure.
*/
- if (flags & HCK_IPV4_HDRCKSUM) {
+ if (chkflags & HCK_IPV4_HDRCKSUM) {
if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
txs->itxs_hck_nol2info.value.ui64++;
return (-1);
@@ -1813,10 +1880,10 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
txs->itxs_hck_badl3.value.ui64++;
return (-1);
}
- tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
- tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
+ tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
+ tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) <<
I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
- tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
+ tctx->itc_data_offsets |= (meo.meoi_l3hlen >> 2) <<
I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
}
@@ -1826,13 +1893,13 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
* onto seeing if we have enough information for the L4 checksum
* offload.
*/
- if (flags & HCK_PARTIALCKSUM) {
+ if (chkflags & HCK_PARTIALCKSUM) {
if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) {
txs->itxs_hck_nol4info.value.ui64++;
return (-1);
}
- if (!(flags & HCK_IPV4_HDRCKSUM)) {
+ if (!(chkflags & HCK_IPV4_HDRCKSUM)) {
if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
txs->itxs_hck_nol2info.value.ui64++;
return (-1);
@@ -1843,40 +1910,60 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
}
if (meo.meoi_l3proto == ETHERTYPE_IP) {
- tctx->itc_cmdflags |=
+ tctx->itc_data_cmdflags |=
I40E_TX_DESC_CMD_IIPT_IPV4;
} else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
- tctx->itc_cmdflags |=
+ tctx->itc_data_cmdflags |=
I40E_TX_DESC_CMD_IIPT_IPV6;
} else {
txs->itxs_hck_badl3.value.ui64++;
return (-1);
}
- tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) <<
+ tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) <<
I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
- tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) <<
+ tctx->itc_data_offsets |= (meo.meoi_l3hlen >> 2) <<
I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
}
switch (meo.meoi_l4proto) {
case IPPROTO_TCP:
- tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
+ tctx->itc_data_cmdflags |=
+ I40E_TX_DESC_CMD_L4T_EOFT_TCP;
break;
case IPPROTO_UDP:
- tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
+ tctx->itc_data_cmdflags |=
+ I40E_TX_DESC_CMD_L4T_EOFT_UDP;
break;
case IPPROTO_SCTP:
- tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
+ tctx->itc_data_cmdflags |=
+ I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
break;
default:
txs->itxs_hck_badl4.value.ui64++;
return (-1);
}
- tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) <<
+ tctx->itc_data_offsets |= (meo.meoi_l4hlen >> 2) <<
I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
}
+ if (lsoflags & HW_LSO) {
+ /*
+ * LSO requires that checksum offloads are enabled. If for
+ * some reason they're not we bail out with an error.
+ */
+ if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 ||
+ (chkflags & HCK_PARTIALCKSUM) == 0) {
+ txs->itxs_lso_nohck.value.ui64++;
+ return (-1);
+ }
+
+ tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO;
+ tctx->itc_ctx_mss = mss;
+ tctx->itc_ctx_tsolen = msgsize(mp) -
+ (meo.meoi_l2hlen + meo.meoi_l3hlen + meo.meoi_l4hlen);
+ }
+
return (0);
}
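To make the new context fields concrete, a worked example with illustrative numbers (not taken from the commit): for a TCP payload of 64,240 bytes behind a 14-byte Ethernet header, a 20-byte IPv4 header, and a 20-byte TCP header, msgsize(mp) is 64,294, so itc_ctx_tsolen = 64,294 - (14 + 20 + 20) = 64,240; with itc_ctx_mss = 1460 the hardware will cut that payload into 64,240 / 1460 = 44 MSS-sized segments on the wire.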
@@ -1925,7 +2012,20 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb)
tcb->tcb_dma.dmab_len = 0;
break;
case I40E_TX_DMA:
- (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
+ if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0)
+ (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle);
+ else if (tcb->tcb_bind_ncookies > 0)
+ (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle);
+ if (tcb->tcb_bind_info != NULL) {
+ kmem_free(tcb->tcb_bind_info,
+ tcb->tcb_bind_ncookies *
+ sizeof (struct i40e_dma_bind_info));
+ }
+ tcb->tcb_bind_info = NULL;
+ tcb->tcb_bind_ncookies = 0;
+ tcb->tcb_used_lso = B_FALSE;
+ break;
+ case I40E_TX_DESC:
break;
case I40E_TX_NONE:
/* Cast to pacify lint */
@@ -1935,8 +2035,10 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb)
}
tcb->tcb_type = I40E_TX_NONE;
- freemsg(tcb->tcb_mp);
- tcb->tcb_mp = NULL;
+ if (tcb->tcb_mp != NULL) {
+ freemsg(tcb->tcb_mp);
+ tcb->tcb_mp = NULL;
+ }
tcb->tcb_next = NULL;
}
@@ -1995,6 +2097,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
uint32_t wbhead, toclean, count;
i40e_tx_control_block_t *tcbhead;
i40e_t *i40e = itrq->itrq_i40e;
+ uint_t desc_per_tcb, i;
mutex_enter(&itrq->itrq_tx_lock);
@@ -2042,11 +2145,27 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
tcbhead = tcb;
/*
- * We zero this out for sanity purposes.
+ * In the DMA bind case, there may not necessarily be a 1:1
+ * mapping between tcb's and descriptors. If the tcb type
+ * indicates a DMA binding then check the number of DMA
+ * cookies to determine how many entries to clean in the
+ * descriptor ring.
*/
- bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t));
- toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size);
- count++;
+ if (tcb->tcb_type == I40E_TX_DMA)
+ desc_per_tcb = tcb->tcb_bind_ncookies;
+ else
+ desc_per_tcb = 1;
+
+ for (i = 0; i < desc_per_tcb; i++) {
+ /*
+ * We zero this out for sanity purposes.
+ */
+ bzero(&itrq->itrq_desc_ring[toclean],
+ sizeof (i40e_tx_desc_t));
+ toclean = i40e_next_desc(toclean, 1,
+ itrq->itrq_tx_ring_size);
+ count++;
+ }
}
itrq->itrq_desc_head = wbhead;
@@ -2078,6 +2197,94 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq)
DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count);
}
+static i40e_tx_control_block_t *
+i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp,
+ boolean_t use_lso)
+{
+ ddi_dma_handle_t dma_handle;
+ ddi_dma_cookie_t dma_cookie;
+ uint_t i = 0, ncookies = 0, dmaflags;
+ i40e_tx_control_block_t *tcb;
+ i40e_txq_stat_t *txs = &itrq->itrq_txstat;
+
+ if ((tcb = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ return (NULL);
+ }
+ tcb->tcb_type = I40E_TX_DMA;
+
+ if (use_lso == B_TRUE)
+ dma_handle = tcb->tcb_lso_dma_handle;
+ else
+ dma_handle = tcb->tcb_dma_handle;
+
+ dmaflags = DDI_DMA_RDWR | DDI_DMA_STREAMING;
+ if (ddi_dma_addr_bind_handle(dma_handle, NULL,
+ (caddr_t)mp->b_rptr, MBLKL(mp), dmaflags, DDI_DMA_DONTWAIT, NULL,
+ &dma_cookie, &ncookies) != DDI_DMA_MAPPED) {
+ goto bffail;
+ }
+ tcb->tcb_bind_ncookies = ncookies;
+ tcb->tcb_used_lso = use_lso;
+
+ tcb->tcb_bind_info =
+ kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info),
+ KM_NOSLEEP);
+ if (tcb->tcb_bind_info == NULL)
+ goto bffail;
+
+ while (i < ncookies) {
+ if (i > 0)
+ ddi_dma_nextcookie(dma_handle, &dma_cookie);
+
+ tcb->tcb_bind_info[i].dbi_paddr =
+ (caddr_t)dma_cookie.dmac_laddress;
+ tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size;
+ }
+
+ return (tcb);
+
+bffail:
+ i40e_tcb_reset(tcb);
+ i40e_tcb_free(itrq, tcb);
+ return (NULL);
+}
+
+static void
+i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx,
+ struct i40e_dma_bind_info *dbi, boolean_t last_desc)
+{
+ i40e_tx_desc_t *txdesc;
+ int cmd;
+
+ ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock));
+ itrq->itrq_desc_free--;
+ txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
+ itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
+ itrq->itrq_tx_ring_size);
+
+ cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags;
+
+ /*
+ * The last data descriptor needs the EOP bit set, so that the HW knows
+ * that we're ready to send. Additionally, we set the RS (Report
+ * Status) bit, so that we are notified when the transmit engine has
+ * completed DMA'ing all of the data descriptors and data buffers
+ * associated with this frame.
+ */
+ if (last_desc == B_TRUE) {
+ cmd |= I40E_TX_DESC_CMD_EOP;
+ cmd |= I40E_TX_DESC_CMD_RS;
+ }
+
+ txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)dbi->dbi_paddr);
+ txdesc->cmd_type_offset_bsz =
+ LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA |
+ ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
+ ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
+ ((uint64_t)dbi->dbi_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
+}
+
/*
* We've been asked to send a message block on the wire. We'll only have a
* single chain. There will not be any b_next pointers; however, there may be
@@ -2098,10 +2305,15 @@ i40e_ring_tx(void *arg, mblk_t *mp)
{
const mblk_t *nmp;
size_t mpsize;
- i40e_tx_control_block_t *tcb;
+ i40e_tx_control_block_t *tcb_ctx = NULL, *tcb_data = NULL,
+ **tcb_dma = NULL;
i40e_tx_desc_t *txdesc;
+ i40e_tx_context_desc_t *ctxdesc;
i40e_tx_context_t tctx;
int cmd, type;
+ uint_t i, needed_desc = 0, nbufs = 0;
+ boolean_t do_ctx_desc = B_FALSE, do_dma_bind = B_FALSE,
+ use_lso = B_FALSE;
i40e_trqpair_t *itrq = arg;
i40e_t *i40e = itrq->itrq_i40e;
@@ -2121,7 +2333,7 @@ i40e_ring_tx(void *arg, mblk_t *mp)
/*
* Figure out the relevant context about this frame that we might need
- * for enabling checksum, lso, etc. This also fills in information that
+ * for enabling checksum, LSO, etc. This also fills in information that
* we might set around the packet type, etc.
*/
if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) {
@@ -2129,97 +2341,204 @@ i40e_ring_tx(void *arg, mblk_t *mp)
itrq->itrq_txstat.itxs_err_context.value.ui64++;
return (NULL);
}
+ if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
+ use_lso = B_TRUE;
+ do_ctx_desc = B_TRUE;
+ }
/*
* For the primordial driver we can punt on doing any recycling right
* now; however, longer term we need to probably do some more pro-active
- * recycling to cut back on stalls in the tx path.
+ * recycling to cut back on stalls in the TX path.
*/
/*
- * Do a quick size check to make sure it fits into what we think it
- * should for this device. Note that longer term this will be false,
- * particularly when we have the world of TSO.
+ * Iterate through the mblks to calculate both the total size of the
+ * frame and the number of fragments. This is used to determine
+ * whether we're doing DMA binding and, if so, how many TX control
+ * blocks we'll need.
*/
mpsize = 0;
for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
- mpsize += MBLKL(nmp);
+ size_t blksz = MBLKL(nmp);
+ if (blksz > 0) {
+ mpsize += blksz;
+ nbufs++;
+ }
}
- /*
- * First we allocate our tx control block and prepare the packet for
- * transmit before we do a final check for descriptors. We do it this
- * way to minimize the time under the tx lock.
- */
- tcb = i40e_tcb_alloc(itrq);
- if (tcb == NULL) {
- txs->itxs_err_notcb.value.ui64++;
- goto txfail;
+ if (do_ctx_desc) {
+ /*
+ * If we're doing tunneling or LSO, then we'll need a TX
+ * context descriptor in addition to one or more TX data
+ * descriptors. Since there's no data DMA block or handle
+ * associated with the context descriptor, we create a special
+ * control block that behaves effectively like a NOP.
+ */
+ if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ goto txfail;
+ }
+ tcb_ctx->tcb_type = I40E_TX_DESC;
+ needed_desc++;
}
/*
- * For transmitting a block, we're currently going to use just a
- * single control block and bcopy all of the fragments into it. We
- * should be more intelligent about doing DMA binding or otherwise, but
- * for getting off the ground this will have to do.
+ * For the non-LSO TX case, we alter our DMA strategy based on a
+ * threshold tied to the frame size. This threshold is configurable
+ * via the tx_dma_threshold property.
+ *
+ * If the frame size is above the threshold, we do DMA binding of the
+ * fragments, building a control block and data descriptor for each
+ * piece.
+ *
+ * If it's below or at the threshold then we just use a single control
+ * block and data descriptor and simply bcopy all of the fragments into
+ * the pre-allocated DMA buffer in the control block.
+ *
+ * For the LSO TX case we always do DMA binding.
*/
- ASSERT(tcb->tcb_dma.dmab_len == 0);
- ASSERT(tcb->tcb_dma.dmab_size >= mpsize);
- for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
- size_t clen = MBLKL(nmp);
- void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len;
+ if (use_lso == B_TRUE || mpsize > i40e->i40e_tx_dma_min) {
+ do_dma_bind = B_TRUE;
+ tcb_dma =
+ kmem_zalloc(nbufs * sizeof (i40e_tx_control_block_t *),
+ KM_NOSLEEP);
+ if (tcb_dma == NULL) {
+ i40e_error(i40e, "failed to allocate tcb_dma list");
+ goto txfail;
+ }
+ /*
+ * For each b_cont: bind the control block's DMA handle to the
+ * b_rptr, and record the cookies so that we can later iterate
+ * through them and build TX data descriptors.
+ */
+ for (nmp = mp, i = 0; nmp != NULL; nmp = nmp->b_cont) {
+ if (MBLKL(nmp) == 0)
+ continue;
+ tcb_dma[i] = i40e_tx_bind_fragment(itrq, nmp, use_lso);
+ if (tcb_dma[i] == NULL) {
+ i40e_error(i40e, "dma bind failed!");
+ goto txfail;
+ }
+ if (i == 0)
+ tcb_dma[i]->tcb_mp = mp;
+ needed_desc += tcb_dma[i++]->tcb_bind_ncookies;
+ }
+ } else {
+ /*
+ * Just use a single control block and bcopy all of the
+ * fragments into its pre-allocated DMA buffer.
+ */
+ if ((tcb_data = i40e_tcb_alloc(itrq)) == NULL) {
+ txs->itxs_err_notcb.value.ui64++;
+ goto txfail;
+ }
+ tcb_data->tcb_type = I40E_TX_COPY;
- bcopy(nmp->b_rptr, coff, clen);
- tcb->tcb_dma.dmab_len += clen;
- }
- ASSERT(tcb->tcb_dma.dmab_len == mpsize);
+ ASSERT(tcb_data->tcb_dma.dmab_len == 0);
+ ASSERT(tcb_data->tcb_dma.dmab_size >= mpsize);
- /*
- * While there's really no need to keep the mp here, but let's just do
- * it to help with our own debugging for now.
- */
- tcb->tcb_mp = mp;
- tcb->tcb_type = I40E_TX_COPY;
- I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV);
+ for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
+ size_t clen = MBLKL(nmp);
+ void *coff = tcb_data->tcb_dma.dmab_address +
+ tcb_data->tcb_dma.dmab_len;
+
+ bcopy(nmp->b_rptr, coff, clen);
+ tcb_data->tcb_dma.dmab_len += clen;
+ }
+ ASSERT(tcb_data->tcb_dma.dmab_len == mpsize);
+ I40E_DMA_SYNC(&tcb_data->tcb_dma, DDI_DMA_SYNC_FORDEV);
+
+ tcb_data->tcb_mp = mp;
+ needed_desc++;
+ }
mutex_enter(&itrq->itrq_tx_lock);
- if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) {
+ if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh ||
+ itrq->itrq_desc_free < needed_desc) {
txs->itxs_err_nodescs.value.ui64++;
mutex_exit(&itrq->itrq_tx_lock);
goto txfail;
}
- /*
- * Build up the descriptor and send it out. Thankfully at the moment
- * we only need a single desc, because we're not doing anything fancy
- * yet.
- */
- ASSERT(itrq->itrq_desc_free > 0);
- itrq->itrq_desc_free--;
- txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
- itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
- itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
- itrq->itrq_tx_ring_size);
+ if (do_ctx_desc) {
+ /*
+ * If we're enabling any offloads for this frame, then we'll
+ * need to build up a transmit context descriptor first. The
+ * context descriptor needs to be placed in the TX ring before
+ * the data descriptor(s). See section 8.4.2, table 8-16.
+ */
+ uint_t tail = itrq->itrq_desc_tail;
+ itrq->itrq_desc_free--;
+ ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail];
+ itrq->itrq_tcb_work_list[tail] = tcb_ctx;
+ itrq->itrq_desc_tail = i40e_next_desc(tail, 1,
+ itrq->itrq_tx_ring_size);
+
+ /* QW0 */
+ type = I40E_TX_DESC_DTYPE_CONTEXT;
+ ctxdesc->tunneling_params = 0;
+ ctxdesc->l2tag2 = 0;
+
+ /* QW1 */
+ ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type);
+ if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) {
+ ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t)
+ ((uint64_t)tctx.itc_ctx_cmdflags <<
+ I40E_TXD_CTX_QW1_CMD_SHIFT) |
+ ((uint64_t)tctx.itc_ctx_tsolen <<
+ I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
+ ((uint64_t)tctx.itc_ctx_mss <<
+ I40E_TXD_CTX_QW1_MSS_SHIFT));
+ }
+ }
- /*
- * Note, we always set EOP and RS which indicates that this is the last
- * data frame and that we should ask for it to be transmitted. We also
- * must always set ICRC, because that is an internal bit that must be
- * set to one for data descriptors. The remaining bits in the command
- * descriptor depend on checksumming and are determined based on the
- * information set up in i40e_tx_context().
- */
- type = I40E_TX_DESC_DTYPE_DATA;
- cmd = I40E_TX_DESC_CMD_EOP |
- I40E_TX_DESC_CMD_RS |
- I40E_TX_DESC_CMD_ICRC |
- tctx.itc_cmdflags;
- txdesc->buffer_addr =
- CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
- txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
- ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
- ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
- ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
+ if (do_dma_bind == B_TRUE) {
+ /*
+ * Next build up a transmit data descriptor for each buffer.
+ */
+ boolean_t last_desc = B_FALSE;
+ for (i = 0; i < nbufs; i++) {
+ itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] =
+ tcb_dma[i];
+
+ for (uint_t c = 0; c < tcb_dma[i]->tcb_bind_ncookies;
+ c++) {
+ if (i == (nbufs - 1) &&
+ c == (tcb_dma[i]->tcb_bind_ncookies - 1)) {
+ last_desc = B_TRUE;
+ }
+ i40e_tx_set_data_desc(itrq, &tctx,
+ &tcb_dma[i]->tcb_bind_info[c], last_desc);
+ }
+ }
+ kmem_free(tcb_dma, nbufs * sizeof (i40e_tx_control_block_t *));
+ tcb_dma = NULL;
+ } else {
+ /*
+ * Build up the single transmit data descriptor needed for the
+ * non-DMA-bind case.
+ */
+ itrq->itrq_desc_free--;
+ txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail];
+ itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb_data;
+ itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1,
+ itrq->itrq_tx_ring_size);
+
+ type = I40E_TX_DESC_DTYPE_DATA;
+ cmd = I40E_TX_DESC_CMD_EOP |
+ I40E_TX_DESC_CMD_RS |
+ I40E_TX_DESC_CMD_ICRC |
+ tctx.itc_data_cmdflags;
+ txdesc->buffer_addr =
+ CPU_TO_LE64((uintptr_t)tcb_data->tcb_dma.dmab_dma_address);
+ txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
+ ((uint64_t)tctx.itc_data_offsets <<
+ I40E_TXD_QW1_OFFSET_SHIFT) |
+ ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
+ ((uint64_t)tcb_data->tcb_dma.dmab_len <<
+ I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
+ }
/*
* Now, finally, sync the DMA data and alert hardware.
@@ -2228,6 +2547,7 @@ i40e_ring_tx(void *arg, mblk_t *mp)
I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
itrq->itrq_desc_tail);
+
if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
DDI_FM_OK) {
/*
@@ -2241,7 +2561,7 @@ i40e_ring_tx(void *arg, mblk_t *mp)
txs->itxs_bytes.value.ui64 += mpsize;
txs->itxs_packets.value.ui64++;
- txs->itxs_descriptors.value.ui64++;
+ txs->itxs_descriptors.value.ui64 += needed_desc;
mutex_exit(&itrq->itrq_tx_lock);
@@ -2254,10 +2574,25 @@ txfail:
* Make sure to reset their message blocks, since we'll return them
* back to MAC.
*/
- if (tcb != NULL) {
- tcb->tcb_mp = NULL;
- i40e_tcb_reset(tcb);
- i40e_tcb_free(itrq, tcb);
+ if (tcb_ctx != NULL) {
+ tcb_ctx->tcb_mp = NULL;
+ i40e_tcb_reset(tcb_ctx);
+ i40e_tcb_free(itrq, tcb_ctx);
+ }
+ if (tcb_data != NULL) {
+ tcb_data->tcb_mp = NULL;
+ i40e_tcb_reset(tcb_data);
+ i40e_tcb_free(itrq, tcb_data);
+ }
+ if (tcb_dma != NULL) {
+ for (i = 0; i < nbufs; i++) {
+ if (tcb_dma[i] == NULL)
+ break;
+ tcb_dma[i]->tcb_mp = NULL;
+ i40e_tcb_reset(tcb_dma[i]);
+ i40e_tcb_free(itrq, tcb_dma[i]);
+ }
+ kmem_free(tcb_dma, nbufs * sizeof (i40e_tx_control_block_t *));
}
mutex_enter(&itrq->itrq_tx_lock);