author     Jason King <jason.king@joyent.com>  2020-05-07 01:24:17 -0500
committer  Jason King <jason.king@joyent.com>  2021-02-26 17:17:31 -0600
commit     e39a7b5b5afda54edb90e52709591832137db39f (patch)
tree       ad0b24369468c4289f4392d51684e33d8b9ccc64
parent     3a51e5c06fdc66b1c3883ded28372f6ae8e9cedb (diff)
download   illumos-joyent-e39a7b5b5afda54edb90e52709591832137db39f.tar.gz
12849 ixgbe can do unnecessary allocations during tx
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Richard Lowe <richlowe@richlowe.net>
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_tx.c    714
-rw-r--r--  usr/src/uts/common/mapfiles/ddi.mapfile     3
2 files changed, 296 insertions, 421 deletions
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
index b6926fc3b7..eb91f4af09 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c
@@ -27,19 +27,21 @@
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012 Nexenta Systems, Inc. All rights reserved.
* Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
*/
#include "ixgbe_sw.h"
-static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
- uint32_t, boolean_t);
-static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
- uint32_t);
+static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t **,
+ link_list_t *, const void *, size_t);
+static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t **,
+ link_list_t *, uint8_t *, size_t);
+static uint_t ixgbe_tcb_done(tx_control_block_t *);
static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
ixgbe_tx_context_t *, size_t);
static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
-static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
+static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *,
+ link_list_t *);
static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
@@ -65,42 +67,71 @@ static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
* they will be processed by using bcopy; otherwise, they will
* be processed by using DMA binding.
*
- * To process the mblk, a tx control block is got from the
- * free list. One tx control block contains one tx buffer, which
- * is used to copy mblk fragments' data; and one tx DMA handle,
- * which is used to bind a mblk fragment with DMA resource.
+ * To process the mblk, for each fragment, we pass a pointer to the location
+ * of the current transmit control block (tcb) (initialized to NULL) to either
+ * ixgbe_tx_copy() or ixgbe_tx_bind() (based on the size of the mblk fragment).
+ * ixgbe_tx_copy() and ixgbe_tx_bind() will either continue to use the current
+ * tcb if possible, or close out the current tcb, allocate a new tcb, and update
+ * the passed location (tx_control_block_t **) to reflect the new current tcb.
*
- * Several small mblk fragments can be copied into one tx control
- * block's buffer, and then the buffer will be transmitted with
- * one tx descriptor.
+ * Since bound mblk fragments require their own tcb, the close, allocate new,
+ * and update steps occur on every call to ixgbe_tx_bind(), but since
+ * consecutive small mblk fragments can be combined into a single tcb, the
+ * close, allocate new, and update steps may not occur on every call to
+ * ixgbe_tx_copy(). If the current tcb is already being used to copy data when
+ * we call ixgbe_tx_copy(), and there is enough room in the current tcb for
+ * the current mblk fragment, we append the data from the mblk fragment. If
+ * we call ixgbe_tx_copy() and the current tcb isn't being used to copy (i.e.
+ * the previous iteration of the loop called ixgbe_tx_bind()), or doesn't
+ * have enough space for the mblk fragment, we close out the current tcb,
+ * grab a new tcb from the free list, and update the current tcb to the
+ * newly obtained tcb.
*
- * A large fragment only binds with one tx control block's DMA
- * handle, and it can span several tx descriptors for transmitting.
+ * When LSO (large segment offload) is enabled, we first copy the packet
+ * headers (ethernet, IP, and TCP/UDP) into their own descriptor before
+ * processing the remainder of the packet. The remaining bytes of the packet
+ * are then copied or mapped based on the fragment size as described above.
+ *
+ * Through the entire processing of a packet, we keep track of the number of
+ * DMA descriptors being used (either bound or pre-bound buffers used for
+ * copying) by this packet. Each tcb requires at least one DMA descriptor, but
+ * may require more than one. When a tcb is closed by ixgbe_tx_bind() or
+ * ixgbe_tx_copy(), it does so by calling ixgbe_tcb_done() which returns the
+ * number of DMA descriptors that are closed (ready for the HW). Since the
+ * hardware limits the number of descriptors that can be used to transmit a
+ * single packet, if the total number of DMA descriptors required to transmit
+ * this packet exceeds this limit, we perform a msgpullup() and try again.
+ * Since our DMA attributes limit the number of DMA cookies allowed to
+ * map a single span of memory to a value (MAX_COOKIE) less than the
+ * maximum number of descriptors allowed for a packet (IXGBE_TX_DESC_LIMIT),
+ * as long as sufficient tcbs are available, we should always be able to
+ * process a packet that's contained in a single mblk_t (no additional
+ * fragments).
+ *
+ * Once all of the tcbs have been set up, ixgbe_tx_fill_ring() is called to
+ * set up the tx ring to transmit the tcbs and then tell the HW to start
+ * transmitting. When transmission is complete, an interrupt is triggered
+ * which calls the appropriate recycle routine to place the tcbs that were
+ * used in transmission back in the free list. We may also try to
+ * recycle any available tcbs when the size of the tcb free list gets low
+ * or if the watchdog timer triggers.
*
- * So to transmit a packet (mblk), several tx control blocks can
- * be used. After the processing, those tx control blocks will
- * be put to the work list.
*/
mblk_t *
-ixgbe_ring_tx(void *arg, mblk_t *mp)
+ixgbe_ring_tx(void *arg, mblk_t *orig_mp)
{
ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
ixgbe_t *ixgbe = tx_ring->ixgbe;
- tx_type_t current_flag, next_flag;
- uint32_t current_len, next_len;
+ mblk_t *mp = orig_mp;
+ mblk_t *pull_mp = NULL;
+ tx_control_block_t *tcb;
+ size_t mbsize, offset, len;
uint32_t desc_total;
- size_t mbsize;
+ uint32_t copy_thresh;
int desc_num;
- boolean_t copy_done, eop;
- mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
- tx_control_block_t *tcb;
- ixgbe_tx_context_t tx_context, *ctx;
+ ixgbe_tx_context_t tx_context, *ctx = NULL;
link_list_t pending_list;
- uint32_t len, hdr_frag_len, hdr_len;
- uint32_t copy_thresh;
- mblk_t *hdr_new_mp = NULL;
- mblk_t *hdr_pre_mp = NULL;
- mblk_t *hdr_nmp = NULL;
+ boolean_t limit_retry = B_FALSE;
ASSERT(mp->b_next == NULL);
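To make the new scheme concrete, consider a hypothetical non-LSO packet made of three fragments of 60, 200, and 1800 bytes with a copy threshold of 512 (an illustrative value; the real one comes from ixgbe->tx_copy_thresh). The 60- and 200-byte fragments are both at or below the threshold, so ixgbe_tx_copy() accumulates them into a single copy tcb. The 1800-byte fragment exceeds the threshold, so ixgbe_tx_bind() closes out that copy tcb (one descriptor), allocates a fresh tcb, and DMA-binds the fragment, which later accounts for one descriptor per DMA cookie when ixgbe_tcb_done() closes it.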
@@ -115,11 +146,7 @@ ixgbe_ring_tx(void *arg, mblk_t *mp)
copy_thresh = ixgbe->tx_copy_thresh;
- /* Get the mblk size */
- mbsize = 0;
- for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
- mbsize += MBLKL(nmp);
- }
+ mbsize = msgsize(mp);
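The byte count the removed loop computed is exactly what msgsize(9F) returns: the sum of MBLKL() over the b_cont chain. A minimal sketch of that equivalence (the function name is hypothetical and only for illustration):

    /* Sketch: what the removed loop computed; msgsize(9F) does the same. */
    static size_t
    example_msgsize(mblk_t *mp)
    {
        size_t sz = 0;

        for (; mp != NULL; mp = mp->b_cont)
            sz += MBLKL(mp);    /* b_wptr - b_rptr of each block */
        return (sz);
    }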
if (ixgbe->tx_hcksum_enable) {
/*
@@ -145,11 +172,16 @@ ixgbe_ring_tx(void *arg, mblk_t *mp)
IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
return (NULL);
}
- } else {
- ctx = NULL;
}
/*
+ * If we use too many descriptors (see comments below), we may do
+ * pull_mp = msgpullup(orig_mp, -1), and jump back to here. As such,
+ * any time we error return past here, we should check and free
+ * pull_mp if != NULL.
+ */
+retry:
+ /*
* Check and recycle tx descriptors.
* The recycle threshold here should be selected carefully
*/
@@ -165,7 +197,9 @@ ixgbe_ring_tx(void *arg, mblk_t *mp)
if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
tx_ring->reschedule = B_TRUE;
tx_ring->stat_overload++;
- return (mp);
+ if (pull_mp != NULL)
+ freemsg(pull_mp);
+ return (orig_mp);
}
/*
@@ -175,313 +209,149 @@ ixgbe_ring_tx(void *arg, mblk_t *mp)
* It is used to reduce the lock contention of the tx_lock.
*/
LINK_LIST_INIT(&pending_list);
+
+ tcb = NULL;
desc_num = 0;
desc_total = 0;
+ offset = 0;
/*
- * The software should guarantee LSO packet header(MAC+IP+TCP)
- * to be within one descriptor. Here we reallocate and refill the
- * the header if it's physical memory non-contiguous.
+ * For LSO, we always copy the packet header (Ethernet + IP + TCP/UDP)
+ * into a single descriptor separate from the remaining data.
*/
if ((ctx != NULL) && ctx->lso_flag) {
- /* find the last fragment of the header */
- len = MBLKL(mp);
- ASSERT(len > 0);
- hdr_nmp = mp;
+ size_t hdr_len;
+
hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
- while (len < hdr_len) {
- hdr_pre_mp = hdr_nmp;
- hdr_nmp = hdr_nmp->b_cont;
- len += MBLKL(hdr_nmp);
- }
- /*
- * If the header and the payload are in different mblks,
- * we simply force the header to be copied into pre-allocated
- * page-aligned buffer.
- */
- if (len == hdr_len)
- goto adjust_threshold;
- hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
/*
- * There are two cases we need to reallocate a mblk for the
- * last header fragment:
- * 1. the header is in multiple mblks and the last fragment
- * share the same mblk with the payload
- * 2. the header is in a single mblk shared with the payload
- * and the header is physical memory non-contiguous
+ * copy the first hdr_len bytes of mp (i.e. the Ethernet, IP,
+ * and TCP/UDP headers) into tcb.
*/
- if ((hdr_nmp != mp) ||
- (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
- < hdr_len)) {
- tx_ring->stat_lso_header_fail++;
+ for (len = hdr_len; mp != NULL && len > 0; mp = mp->b_cont) {
+ size_t mlen = MBLKL(mp);
+ size_t amt = MIN(mlen, len);
+ int ret;
+
+ ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list,
+ mp->b_rptr, amt);
+ /*
+ * Since we're trying to copy all of the headers into
+ * a single buffer in a single tcb, if ixgbe_tx_copy()
+ * returns anything but 0, it means either no tcbs
+ * are available (< 0), or while copying, we spilled
+ * over and couldn't fit all the headers into a
+ * single tcb.
+ */
+ if (ret != 0) {
+ if (ret > 0)
+ tx_ring->stat_lso_header_fail++;
+ goto tx_failure;
+ }
+
+ len -= amt;
+
/*
- * reallocate the mblk for the last header fragment,
- * expect to bcopy into pre-allocated page-aligned
- * buffer
+ * If we copy less than the full amount of this
+ * mblk_t, we have some amount to copy below.
*/
- hdr_new_mp = allocb(hdr_frag_len, 0);
- if (!hdr_new_mp)
- return (mp);
- bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
- hdr_frag_len);
- /* link the new header fragment with the other parts */
- hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
- hdr_new_mp->b_cont = hdr_nmp;
- if (hdr_pre_mp)
- hdr_pre_mp->b_cont = hdr_new_mp;
- else
- mp = hdr_new_mp;
- hdr_nmp->b_rptr += hdr_frag_len;
+ if (amt < mlen) {
+ offset = amt;
+ break;
+ }
}
-adjust_threshold:
+
+ ASSERT0(len);
+
/*
- * adjust the bcopy threshhold to guarantee
- * the header to use bcopy way
+ * Finish off the header tcb, and start anew for the
+ * rest of the packet.
*/
- if (copy_thresh < hdr_len)
- copy_thresh = hdr_len;
+ desc_total += ixgbe_tcb_done(tcb);
+ tcb = NULL;
}
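As a concrete illustration of the offset handoff (the numbers are hypothetical): with a 14-byte Ethernet header, a 20-byte IP header, and a 20-byte TCP header, hdr_len is 54. If the packet's first mblk_t holds 1500 bytes, the header loop copies 54 bytes from it into the header tcb, sets offset to 54, and breaks with mp still pointing at that same mblk_t; the payload loop below then starts at b_rptr + 54 and handles the remaining 1446 bytes of that fragment before moving on to b_cont.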
- current_mp = mp;
- current_len = MBLKL(current_mp);
- /*
- * Decide which method to use for the first fragment
- */
- current_flag = (current_len <= copy_thresh) ?
- USE_COPY : USE_DMA;
/*
- * If the mblk includes several contiguous small fragments,
- * they may be copied into one buffer. This flag is used to
- * indicate whether there are pending fragments that need to
- * be copied to the current tx buffer.
+ * Process each remaining segment in the packet -- either binding
+ * the dblk_t or copying the contents of the dblk_t to an already
+ * bound buffer. When we copy, we will accumulate consecutive small
+ * (less than copy_thresh bytes) segments into a single tcb buffer
+ * until no more can fit (or we encounter a segment larger than
+ * copy_thresh and bind the dblk_t).
*
- * If this flag is B_TRUE, it indicates that a new tx control
- * block is needed to process the next fragment using either
- * copy or DMA binding.
+ * Both ixgbe_tx_bind() and ixgbe_tx_copy() will allocate new
+ * transmit control blocks (tcb)s as needed (and append them onto
+ * 'pending_list'). Both functions also replace 'tcb' with the new
+ * tcb when they allocate a new tcb.
*
- * Otherwise, it indicates that the next fragment will be
- * copied to the current tx buffer that is maintained by the
- * current tx control block. No new tx control block is needed.
+ * We stop trying to process the packet once the number of descriptors
+ * used equals IXGBE_TX_DESC_LIMIT. Even if we're copying into the
+ * IXGBE_TX_DESC_LIMIT-th descriptor, we won't have room to add a
+ * context descriptor (since we're already at the limit), so there's
+ * no point in continuing. We'll pull up the mblk_t (see below)
+ * and try again.
*/
- copy_done = B_TRUE;
- while (current_mp) {
- next_mp = current_mp->b_cont;
- eop = (next_mp == NULL); /* Last fragment of the packet? */
- next_len = eop ? 0: MBLKL(next_mp);
-
- /*
- * When the current fragment is an empty fragment, if
- * the next fragment will still be copied to the current
- * tx buffer, we cannot skip this fragment here. Because
- * the copy processing is pending for completion. We have
- * to process this empty fragment in the tx_copy routine.
- *
- * If the copy processing is completed or a DMA binding
- * processing is just completed, we can just skip this
- * empty fragment.
- */
- if ((current_len == 0) && (copy_done)) {
- current_mp = next_mp;
- current_len = next_len;
- current_flag = (current_len <= copy_thresh) ?
- USE_COPY : USE_DMA;
- continue;
- }
-
- if (copy_done) {
- /*
- * Get a new tx control block from the free list
- */
- tcb = ixgbe_get_free_list(tx_ring);
-
- if (tcb == NULL) {
- tx_ring->stat_fail_no_tcb++;
- goto tx_failure;
- }
-
- /*
- * Push the tx control block to the pending list
- * to avoid using lock too early
- */
- LIST_PUSH_TAIL(&pending_list, &tcb->link);
- }
-
- if (current_flag == USE_COPY) {
- /*
- * Check whether to use bcopy or DMA binding to process
- * the next fragment, and if using bcopy, whether we
- * need to continue copying the next fragment into the
- * current tx buffer.
- */
- ASSERT((tcb->tx_buf.len + current_len) <=
- tcb->tx_buf.size);
+ while (mp != NULL && desc_total < IXGBE_TX_DESC_LIMIT) {
+ uint8_t *rptr = mp->b_rptr + offset;
+ int ret;
- if (eop) {
- /*
- * This is the last fragment of the packet, so
- * the copy processing will be completed with
- * this fragment.
- */
- next_flag = USE_NONE;
- copy_done = B_TRUE;
- } else if ((tcb->tx_buf.len + current_len + next_len) >
- tcb->tx_buf.size) {
- /*
- * If the next fragment is too large to be
- * copied to the current tx buffer, we need
- * to complete the current copy processing.
- */
- next_flag = (next_len > copy_thresh) ?
- USE_DMA: USE_COPY;
- copy_done = B_TRUE;
- } else if (next_len > copy_thresh) {
- /*
- * The next fragment needs to be processed with
- * DMA binding. So the copy prcessing will be
- * completed with the current fragment.
- */
- next_flag = USE_DMA;
- copy_done = B_TRUE;
- } else {
- /*
- * Continue to copy the next fragment to the
- * current tx buffer.
- */
- next_flag = USE_COPY;
- copy_done = B_FALSE;
- }
+ len = MBLKL(mp) - offset;
+ offset = 0;
- desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
- current_len, copy_done);
+ if (len > copy_thresh) {
+ ret = ixgbe_tx_bind(tx_ring, &tcb, &pending_list, rptr,
+ len);
} else {
- /*
- * Check whether to use bcopy or DMA binding to process
- * the next fragment.
- */
- next_flag = (next_len > copy_thresh) ?
- USE_DMA: USE_COPY;
- ASSERT(copy_done == B_TRUE);
-
- desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
- current_len);
+ ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list, rptr,
+ len);
}
- if (desc_num > 0)
- desc_total += desc_num;
- else if (desc_num < 0)
+ if (ret < 0)
goto tx_failure;
- current_mp = next_mp;
- current_len = next_len;
- current_flag = next_flag;
+ desc_total += ret;
+ mp = mp->b_cont;
}
- /*
- * Attach the mblk to the last tx control block
- */
- ASSERT(tcb);
- ASSERT(tcb->mp == NULL);
- tcb->mp = mp;
+ /* Finish off the last tcb */
+ desc_total += ixgbe_tcb_done(tcb);
/*
* 82598/82599 chipset has a limitation that no more than 32 tx
- * descriptors can be transmited out at one time.
+ * descriptors can be transmitted out at one time. As noted above,
+ * we need to include space for a context descriptor in case it's
+ * necessary, so we do this even if desc_total == IXGBE_TX_DESC_LIMIT
+ * as well as when it exceeds the limit.
*
- * Here is a workaround for it: pull up the mblk then send it
- * out with bind way. By doing so, no more than MAX_COOKIE (18)
- * descriptors is needed.
+ * If we exceed this limit, we take the hit, do a msgpullup(), and
+ * then try again. Our DMA attributes guarantee we should never use
+ * more than MAX_COOKIE (18) descriptors to map a single mblk_t, so we
+ * should only need to retry once.
*/
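To see why a single retry is expected to suffice (rough numbers taken from the comments above and from the original code): after msgpullup(), the payload lives in one mblk_t, so binding it consumes at most MAX_COOKIE (18) descriptors; an LSO packet additionally uses one descriptor for the copied headers, and ixgbe_tx_fill_ring() may add one more context descriptor. That worst case of roughly 20 descriptors is comfortably below the 32-descriptor hardware limit noted above.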
- if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
- tx_ring->stat_break_tbd_limit++;
+ if (desc_total >= IXGBE_TX_DESC_LIMIT) {
+ /* We shouldn't hit this path twice */
+ VERIFY0(limit_retry);
- /*
- * Discard the mblk and free the used resources
- */
- tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
- while (tcb) {
- tcb->mp = NULL;
- ixgbe_free_tcb(tcb);
- tcb = (tx_control_block_t *)
- LIST_GET_NEXT(&pending_list, &tcb->link);
- }
+ tx_ring->stat_break_tbd_limit++;
- /*
- * Return the tx control blocks in the pending list to
- * the free list.
- */
+ /* Release all the tcbs we used previously */
ixgbe_put_free_list(tx_ring, &pending_list);
-
- /*
- * pull up the mblk and send it out with bind way
- */
- if ((pull_mp = msgpullup(mp, -1)) == NULL) {
- tx_ring->reschedule = B_TRUE;
-
- /*
- * If new mblk has been allocted for the last header
- * fragment of a LSO packet, we should restore the
- * modified mp.
- */
- if (hdr_new_mp) {
- hdr_new_mp->b_cont = NULL;
- freeb(hdr_new_mp);
- hdr_nmp->b_rptr -= hdr_frag_len;
- if (hdr_pre_mp)
- hdr_pre_mp->b_cont = hdr_nmp;
- else
- mp = hdr_nmp;
- }
- return (mp);
- }
-
- LINK_LIST_INIT(&pending_list);
desc_total = 0;
+ offset = 0;
- /*
- * if the packet is a LSO packet, we simply
- * transmit the header in one descriptor using the copy way
- */
- if ((ctx != NULL) && ctx->lso_flag) {
- hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
- ctx->l4_hdr_len;
-
- tcb = ixgbe_get_free_list(tx_ring);
- if (tcb == NULL) {
- tx_ring->stat_fail_no_tcb++;
- goto tx_failure;
- }
- desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
- hdr_len, B_TRUE);
- LIST_PUSH_TAIL(&pending_list, &tcb->link);
- desc_total += desc_num;
-
- pull_mp->b_rptr += hdr_len;
- }
-
- tcb = ixgbe_get_free_list(tx_ring);
- if (tcb == NULL) {
- tx_ring->stat_fail_no_tcb++;
- goto tx_failure;
- }
- if ((ctx != NULL) && ctx->lso_flag) {
- desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
- mbsize - hdr_len);
- } else {
- desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
- mbsize);
- }
- if (desc_num < 0) {
- goto tx_failure;
+ pull_mp = msgpullup(orig_mp, -1);
+ if (pull_mp == NULL) {
+ tx_ring->reschedule = B_TRUE;
+ return (orig_mp);
}
- LIST_PUSH_TAIL(&pending_list, &tcb->link);
- desc_total += desc_num;
- tcb->mp = pull_mp;
+ mp = pull_mp;
+ limit_retry = B_TRUE;
+ goto retry;
}
/*
- * Before fill the tx descriptor ring with the data, we need to
+ * Before filling the tx descriptor ring with the data, we need to
* ensure there are adequate free descriptors for transmit
* (including one context descriptor).
* Do not use up all the tx descriptors.
@@ -506,22 +376,31 @@ adjust_threshold:
goto tx_failure;
}
+ /*
+ * Attach the mblk_t we've set up to the last control block.
+ * This is only done once we know there are enough free descriptors
+ * to transmit so that the cleanup in tx_failure doesn't try to
+ * call freemsg() on mp (since we will want to return it).
+ */
+ tcb->mp = (pull_mp != NULL) ? pull_mp : orig_mp;
+
desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
mbsize);
ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
tx_ring->stat_obytes += mbsize;
- tx_ring->stat_opackets ++;
+ tx_ring->stat_opackets++;
mutex_exit(&tx_ring->tx_lock);
/*
- * now that the transmission succeeds, need to free the original
- * mp if we used the pulling up mblk for transmission.
+ * Now that tx is done, if we pulled up the original message, we
+ * can free the original message since it is no longer being
+ * used.
*/
- if (pull_mp) {
- freemsg(mp);
+ if (pull_mp != NULL) {
+ freemsg(orig_mp);
}
return (NULL);
@@ -535,31 +414,10 @@ tx_failure:
}
/*
- * If new mblk has been allocted for the last header
- * fragment of a LSO packet, we should restore the
- * modified mp.
- */
- if (hdr_new_mp) {
- hdr_new_mp->b_cont = NULL;
- freeb(hdr_new_mp);
- hdr_nmp->b_rptr -= hdr_frag_len;
- if (hdr_pre_mp)
- hdr_pre_mp->b_cont = hdr_nmp;
- else
- mp = hdr_nmp;
- }
- /*
- * Discard the mblk and free the used resources
+ * tcb->mp should not be set until we know we can transmit (see above),
+ * so it should always be NULL if we get here.
*/
- tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
- while (tcb) {
- tcb->mp = NULL;
-
- ixgbe_free_tcb(tcb);
-
- tcb = (tx_control_block_t *)
- LIST_GET_NEXT(&pending_list, &tcb->link);
- }
+ VERIFY3P(tcb->mp, ==, NULL);
/*
* Return the tx control blocks in the pending list to the free list.
@@ -569,22 +427,44 @@ tx_failure:
/* Transmit failed, do not drop the mblk, rechedule the transmit */
tx_ring->reschedule = B_TRUE;
- return (mp);
+ return (orig_mp);
}
/*
* ixgbe_tx_copy
*
- * Copy the mblk fragment to the pre-allocated tx buffer
+ * Copy the mblk fragment to the pre-allocated tx buffer. Return -1 on error,
+ * otherwise return the number of descriptors we've completed in this call.
*/
static int
-ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
- uint32_t len, boolean_t copy_done)
+ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
+ link_list_t *pending_list, const void *buf, size_t len)
{
+ tx_control_block_t *tcb = *tcbp;
dma_buffer_t *tx_buf;
- uint32_t desc_num;
- _NOTE(ARGUNUSED(tx_ring));
+ uint32_t desc_num = 0;
+
+ /*
+ * We need a new tcb -- either the current one (tcb) is NULL because
+ * we just started, tcb is being used for DMA, or tcb isn't large enough
+ * to hold the contents we need to copy.
+ */
+ if (tcb == NULL || tcb->tx_type == USE_DMA ||
+ tcb->tx_buf.len + len > tcb->tx_buf.size) {
+ tx_control_block_t *newtcb;
+
+ newtcb = ixgbe_get_free_list(tx_ring, pending_list);
+ if (newtcb == NULL)
+ return (-1);
+ newtcb->tx_type = USE_COPY;
+
+ if (tcb != NULL)
+ desc_num += ixgbe_tcb_done(tcb);
+ *tcbp = tcb = newtcb;
+ }
+
+ ASSERT3S(tcb->tx_type, ==, USE_COPY);
tx_buf = &tcb->tx_buf;
/*
@@ -598,62 +478,43 @@ ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
* fragment.
*/
if (len > 0) {
- bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
+ bcopy(buf, tx_buf->address + tx_buf->len, len);
tx_buf->len += len;
tcb->frag_num++;
}
- desc_num = 0;
-
- /*
- * If it is the last fragment copied to the current tx buffer,
- * in other words, if there's no remaining fragment or the remaining
- * fragment requires a new tx control block to process, we need to
- * complete the current copy processing by syncing up the current
- * DMA buffer and saving the descriptor data.
- */
- if (copy_done) {
- /*
- * Sync the DMA buffer of the packet data
- */
- DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
-
- tcb->tx_type = USE_COPY;
-
- /*
- * Save the address and length to the private data structure
- * of the tx control block, which will be used to fill the
- * tx descriptor ring after all the fragments are processed.
- */
- ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
- desc_num++;
- }
-
return (desc_num);
}
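The test at the top of ixgbe_tx_copy() that decides whether a new tcb is needed can be read as a small predicate: the caller's current tcb can absorb another copied fragment only if it exists, is a copy-type tcb, and still has room in its pre-bound buffer. A sketch of that condition as a standalone helper (the helper name is hypothetical; the fields are the driver's own):

    /* Hypothetical helper, for illustration only -- not part of the driver. */
    static boolean_t
    example_tcb_can_absorb(const tx_control_block_t *tcb, size_t len)
    {
        if (tcb == NULL || tcb->tx_type != USE_COPY)
            return (B_FALSE);
        return (tcb->tx_buf.len + len <= tcb->tx_buf.size ? B_TRUE : B_FALSE);
    }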
/*
* ixgbe_tx_bind
*
- * Bind the mblk fragment with DMA
+ * Bind the mblk fragment with DMA. Returns -1 on error, otherwise it
+ * returns the number of descriptors completed in this call. This count
+ * can include descriptors that weren't filled in by the current call to
+ * ixgbe_tx_bind() but were being used (but not yet completed) in previous
+ * calls to ixgbe_tx_bind() or ixgbe_tx_copy().
*/
static int
-ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
- uint32_t len)
+ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
+ link_list_t *pending_list, uint8_t *buf, size_t len)
{
- int status, i;
- ddi_dma_cookie_t dma_cookie;
- uint_t ncookies;
- int desc_num;
+ tx_control_block_t *tcb = NULL;
+ uint_t desc_num = 0;
+ int status;
+
+ tcb = ixgbe_get_free_list(tx_ring, pending_list);
+ if (tcb == NULL)
+ return (-1);
/*
* Use DMA binding to process the mblk fragment
*/
status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
- (caddr_t)mp->b_rptr, len,
+ (caddr_t)buf, len,
DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
- 0, &dma_cookie, &ncookies);
+ 0, NULL, NULL);
if (status != DDI_DMA_MAPPED) {
tx_ring->stat_fail_dma_bind++;
@@ -662,25 +523,51 @@ ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
tcb->frag_num++;
tcb->tx_type = USE_DMA;
+
/*
- * Each fragment can span several cookies. One cookie will have
- * one tx descriptor to transmit.
+ * If there was an old tcb, we're about to replace it. Finish
+ * setting up the old tcb so we can replace it with the new one.
*/
- desc_num = 0;
- for (i = ncookies; i > 0; i--) {
- /*
- * Save the address and length to the private data structure
- * of the tx control block, which will be used to fill the
- * tx descriptor ring after all the fragments are processed.
- */
- ixgbe_save_desc(tcb,
- dma_cookie.dmac_laddress,
- dma_cookie.dmac_size);
+ if (*tcbp != NULL)
+ desc_num += ixgbe_tcb_done(*tcbp);
- desc_num++;
+ *tcbp = tcb;
+ return (desc_num);
+}
+
+/*
+ * Once we're done populating a tcb (either by binding or copying into
+ * a buffer in the tcb), get it ready for tx and return the number of
+ * descriptors used.
+ */
+static uint_t
+ixgbe_tcb_done(tx_control_block_t *tcb)
+{
+ uint_t desc_num = 0;
+
+ if (tcb->tx_type == USE_DMA) {
+ const ddi_dma_cookie_t *c;
- if (i > 1)
- ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
+ for (c = ddi_dma_cookie_iter(tcb->tx_dma_handle, NULL);
+ c != NULL;
+ c = ddi_dma_cookie_iter(tcb->tx_dma_handle, c)) {
+ /*
+ * Save the address and length to the private data
+ * structure of the tx control block, which will be
+ * used to fill the tx descriptor ring after all the
+ * fragments are processed.
+ */
+ ixgbe_save_desc(tcb, c->dmac_laddress, c->dmac_size);
+ desc_num++;
+ }
+ } else if (tcb->tx_type == USE_COPY) {
+ dma_buffer_t *tx_buf = &tcb->tx_buf;
+
+ DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
+ ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
+ desc_num++;
+ } else {
+ panic("invalid tcb type");
}
return (desc_num);
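The cookie handling above reflects the newer DDI pattern this change adopts: ddi_dma_addr_bind_handle(9F) is called with NULL for the cookie and cookie-count out-parameters, and the cookies are walked afterwards with ddi_dma_cookie_iter(9F), where NULL starts the walk, the previous cookie continues it, and a NULL return ends it. A minimal sketch of that pattern (the function name is hypothetical; it assumes a handle already created with ddi_dma_alloc_handle(9F)):

    /* Illustrative only; assumes 'hdl' was allocated with ddi_dma_alloc_handle(). */
    static int
    example_count_cookies(ddi_dma_handle_t hdl, caddr_t buf, size_t len)
    {
        const ddi_dma_cookie_t *c;
        int ncookies = 0;

        if (ddi_dma_addr_bind_handle(hdl, NULL, buf, len,
            DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, NULL,
            NULL, NULL) != DDI_DMA_MAPPED)
            return (-1);

        for (c = ddi_dma_cookie_iter(hdl, NULL); c != NULL;
            c = ddi_dma_cookie_iter(hdl, c))
            ncookies++;

        (void) ddi_dma_unbind_handle(hdl);
        return (ncookies);
    }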
@@ -1364,21 +1251,6 @@ ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
mutex_exit(&tx_ring->recycle_lock);
/*
- * Free the resources used by the tx control blocks
- * in the pending list
- */
- tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
- while (tcb != NULL) {
- /*
- * Release the resources occupied by the tx control block
- */
- ixgbe_free_tcb(tcb);
-
- tcb = (tx_control_block_t *)
- LIST_GET_NEXT(&pending_list, &tcb->link);
- }
-
- /*
* Add the tx control blocks in the pending list to the free list.
*/
ixgbe_put_free_list(tx_ring, &pending_list);
@@ -1511,21 +1383,6 @@ ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
mutex_exit(&tx_ring->recycle_lock);
/*
- * Free the resources used by the tx control blocks
- * in the pending list
- */
- tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
- while (tcb) {
- /*
- * Release the resources occupied by the tx control block
- */
- ixgbe_free_tcb(tcb);
-
- tcb = (tx_control_block_t *)
- LIST_GET_NEXT(&pending_list, &tcb->link);
- }
-
- /*
* Add the tx control blocks in the pending list to the free list.
*/
ixgbe_put_free_list(tx_ring, &pending_list);
@@ -1543,6 +1400,9 @@ ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
void
ixgbe_free_tcb(tx_control_block_t *tcb)
{
+ if (tcb == NULL)
+ return;
+
switch (tcb->tx_type) {
case USE_COPY:
/*
@@ -1576,14 +1436,15 @@ ixgbe_free_tcb(tx_control_block_t *tcb)
}
/*
- * ixgbe_get_free_list - Get a free tx control block from the free list
+ * ixgbe_get_free_list - Get a free tx control block from the free list.
+ * Returns the tx control block and appends it to list.
*
* The atomic operation on the number of the available tx control block
* in the free list is used to keep this routine mutual exclusive with
* the routine ixgbe_put_check_list.
*/
static tx_control_block_t *
-ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
+ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *list)
{
tx_control_block_t *tcb;
@@ -1591,8 +1452,10 @@ ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
* Check and update the number of the free tx control block
* in the free list.
*/
- if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
+ if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0) {
+ tx_ring->stat_fail_no_tcb++;
return (NULL);
+ }
mutex_enter(&tx_ring->tcb_head_lock);
@@ -1604,6 +1467,7 @@ ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
mutex_exit(&tx_ring->tcb_head_lock);
+ LIST_PUSH_TAIL(list, &tcb->link);
return (tcb);
}
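The lock-free reservation referred to in the comment is the usual decrement-if-available pattern built on atomic_cas_32(9F); ixgbe_atomic_reserve() itself is defined elsewhere in the driver. The sketch below (hypothetical name and counter) shows the general shape of such a helper, not the driver's exact implementation:

    /* Illustrative sketch of the reservation pattern; not the driver's code. */
    static int
    example_atomic_reserve(volatile uint32_t *countp, uint32_t n)
    {
        uint32_t oldval, newval;

        do {
            oldval = *countp;
            if (oldval < n)
                return (-1);    /* not enough free items to reserve */
            newval = oldval - n;
        } while (atomic_cas_32(countp, oldval, newval) != oldval);

        return (newval);
    }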
@@ -1623,6 +1487,16 @@ ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
int tcb_num;
tx_control_block_t *tcb;
+ for (tcb = (tx_control_block_t *)LIST_GET_HEAD(pending_list);
+ tcb != NULL;
+ tcb = (tx_control_block_t *)LIST_GET_NEXT(pending_list, tcb)) {
+ /*
+ * Despite the name, ixgbe_free_tcb() just releases the
+ * resources in tcb, but does not free tcb itself.
+ */
+ ixgbe_free_tcb(tcb);
+ }
+
mutex_enter(&tx_ring->tcb_tail_lock);
index = tx_ring->tcb_tail;
diff --git a/usr/src/uts/common/mapfiles/ddi.mapfile b/usr/src/uts/common/mapfiles/ddi.mapfile
index abbc802cd8..89d1073019 100644
--- a/usr/src/uts/common/mapfiles/ddi.mapfile
+++ b/usr/src/uts/common/mapfiles/ddi.mapfile
@@ -10,7 +10,7 @@
#
#
-# Copyright 2019 Joyent, Inc.
+# Copyright 2020 Joyent, Inc.
# Copyright 2020 RackTop Systems, Inc.
#
@@ -77,6 +77,7 @@ SYMBOL_SCOPE {
ddi_dev_regsize { FLAGS = EXTERN };
ddi_dma_addr_bind_handle { FLAGS = EXTERN };
ddi_dma_alloc_handle { FLAGS = EXTERN };
+ ddi_dma_cookie_iter { FLAGS = EXTERN };
ddi_dma_free_handle { FLAGS = EXTERN };
ddi_dma_mem_alloc { FLAGS = EXTERN };
ddi_dma_mem_free { FLAGS = EXTERN };
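A note on why the mapfile changes at all: the mapfiles under usr/src/uts/common/mapfiles list the DDI interfaces a driver built against them is expected to import, and that list is used at link time to catch references outside the expected set. Since ixgbe now calls ddi_dma_cookie_iter(), the symbol needs to be listed here as well.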