Diffstat (limited to 'usr/src/uts/common/io/ixgbe/ixgbe_tx.c')
-rw-r--r-- | usr/src/uts/common/io/ixgbe/ixgbe_tx.c | 1320 |
1 file changed, 1320 insertions, 0 deletions
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c new file mode 100644 index 0000000000..ad6cac1e8d --- /dev/null +++ b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c @@ -0,0 +1,1320 @@ +/* + * CDDL HEADER START + * + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at: + * http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When using or redistributing this file, you may do so under the + * License only. No other modification of this header is permitted. + * + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms of the CDDL. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "ixgbe_sw.h" + +static boolean_t ixgbe_tx(ixgbe_tx_ring_t *, mblk_t *); +static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *, + uint32_t, boolean_t, boolean_t); +static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *, + uint32_t); +static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *, + hcksum_context_t *); +static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t); +static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *); + +static void ixgbe_get_hcksum_context(mblk_t *, hcksum_context_t *); +static boolean_t ixgbe_check_hcksum_context(ixgbe_tx_ring_t *, + hcksum_context_t *); +static void ixgbe_fill_hcksum_context(struct ixgbe_adv_tx_context_desc *, + hcksum_context_t *); + +#ifndef IXGBE_DEBUG +#pragma inline(ixgbe_save_desc) +#pragma inline(ixgbe_get_hcksum_context) +#pragma inline(ixgbe_check_hcksum_context) +#pragma inline(ixgbe_fill_hcksum_context) +#endif + +/* + * ixgbe_m_tx + * + * The GLDv3 interface to call driver's tx routine to transmit + * the mblks. + */ +mblk_t * +ixgbe_m_tx(void *arg, mblk_t *mp) +{ + ixgbe_t *ixgbe = (ixgbe_t *)arg; + mblk_t *next; + ixgbe_tx_ring_t *tx_ring; + + /* + * If the adapter is suspended, or it is not started, or the link + * is not up, the mblks are simply dropped. + */ + if (((ixgbe->ixgbe_state & IXGBE_SUSPENDED) != 0) || + ((ixgbe->ixgbe_state & IXGBE_STARTED) == 0) || + (ixgbe->link_state != LINK_STATE_UP)) { + /* Free the mblk chain */ + while (mp != NULL) { + next = mp->b_next; + mp->b_next = NULL; + + freemsg(mp); + mp = next; + } + + return (NULL); + } + + /* + * Decide which tx ring is used to transmit the packets. + * This needs to be updated later to fit the new interface + * of the multiple rings support. + */ + tx_ring = &ixgbe->tx_rings[0]; + + while (mp != NULL) { + next = mp->b_next; + mp->b_next = NULL; + + if (!ixgbe_tx(tx_ring, mp)) { + mp->b_next = next; + break; + } + + mp = next; + } + + return (mp); +} + +/* + * ixgbe_tx - Main transmit processing + * + * Called from ixgbe_m_tx with an mblk ready to transmit. this + * routine sets up the transmit descriptors and sends data to + * the wire. 
+ * + * One mblk can consist of several fragments, each fragment + * will be processed with different methods based on the size. + * For the fragments with size less than the bcopy threshold, + * they will be processed by using bcopy; otherwise, they will + * be processed by using DMA binding. + * + * To process the mblk, a tx control block is got from the + * free list. One tx control block contains one tx buffer, which + * is used to copy mblk fragments' data; and one tx DMA handle, + * which is used to bind a mblk fragment with DMA resource. + * + * Several small mblk fragments can be copied into one tx control + * block's buffer, and then the buffer will be transmitted with + * one tx descriptor. + * + * A large fragment only binds with one tx control block's DMA + * handle, and it can span several tx descriptors for transmitting. + * + * So to transmit a packet (mblk), several tx control blocks can + * be used. After the processing, those tx control blocks will + * be put to the work list. + */ +static boolean_t +ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) +{ + ixgbe_t *ixgbe = tx_ring->ixgbe; + tx_type_t current_flag, next_flag; + uint32_t current_len, next_len; + uint32_t desc_total; + size_t mbsize; + int desc_num; + boolean_t copy_done, eop; + mblk_t *current_mp, *next_mp, *nmp; + tx_control_block_t *tcb; + hcksum_context_t hcksum_context, *hcksum; + link_list_t pending_list; + + /* Get the mblk size */ + mbsize = 0; + for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { + mbsize += MBLK_LEN(nmp); + } + + /* + * If the mblk size exceeds the max frame size, + * discard this mblk, and return B_TRUE + */ + if (mbsize > (ixgbe->max_frame_size - ETHERFCSL)) { + freemsg(mp); + IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize"); + return (B_TRUE); + } + + /* + * Check and recycle tx descriptors. + * The recycle threshold here should be selected carefully + */ + if (tx_ring->tbd_free < tx_ring->recycle_thresh) + tx_ring->tx_recycle(tx_ring); + + /* + * After the recycling, if the tbd_free is less than the + * overload_threshold, assert overload, return B_FALSE; + * and we need to re-schedule the tx again. + */ + if (tx_ring->tbd_free < tx_ring->overload_thresh) { + tx_ring->reschedule = B_TRUE; + IXGBE_DEBUG_STAT(tx_ring->stat_overload); + return (B_FALSE); + } + + /* + * The pending_list is a linked list that is used to save + * the tx control blocks that have packet data processed + * but have not put the data to the tx descriptor ring. + * It is used to reduce the lock contention of the tx_lock. + */ + LINK_LIST_INIT(&pending_list); + desc_num = 0; + desc_total = 0; + + current_mp = mp; + current_len = MBLK_LEN(current_mp); + /* + * Decide which method to use for the first fragment + */ + current_flag = (current_len <= tx_ring->copy_thresh) ? + USE_COPY : USE_DMA; + /* + * If the mblk includes several contiguous small fragments, + * they may be copied into one buffer. This flag is used to + * indicate whether there are pending fragments that need to + * be copied to the current tx buffer. + * + * If this flag is B_TRUE, it indicates that a new tx control + * block is needed to process the next fragment using either + * copy or DMA binding. + * + * Otherwise, it indicates that the next fragment will be + * copied to the current tx buffer that is maintained by the + * current tx control block. No new tx control block is needed. + */ + copy_done = B_TRUE; + while (current_mp) { + next_mp = current_mp->b_cont; + eop = (next_mp == NULL); /* Last fragment of the packet? 
*/ + next_len = eop ? 0: MBLK_LEN(next_mp); + + /* + * When the current fragment is an empty fragment, if + * the next fragment will still be copied to the current + * tx buffer, we cannot skip this fragment here. Because + * the copy processing is pending for completion. We have + * to process this empty fragment in the tx_copy routine. + * + * If the copy processing is completed or a DMA binding + * processing is just completed, we can just skip this + * empty fragment. + */ + if ((current_len == 0) && (copy_done)) { + current_mp = next_mp; + current_len = next_len; + current_flag = (current_len <= tx_ring->copy_thresh) ? + USE_COPY : USE_DMA; + continue; + } + + if (copy_done) { + /* + * Get a new tx control block from the free list + */ + tcb = ixgbe_get_free_list(tx_ring); + + if (tcb == NULL) { + IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb); + goto tx_failure; + } + + /* + * Push the tx control block to the pending list + * to avoid using lock too early + */ + LIST_PUSH_TAIL(&pending_list, &tcb->link); + } + + if (current_flag == USE_COPY) { + /* + * Check whether to use bcopy or DMA binding to process + * the next fragment, and if using bcopy, whether we + * need to continue copying the next fragment into the + * current tx buffer. + */ + ASSERT((tcb->tx_buf.len + current_len) <= + tcb->tx_buf.size); + + if (eop) { + /* + * This is the last fragment of the packet, so + * the copy processing will be completed with + * this fragment. + */ + next_flag = USE_NONE; + copy_done = B_TRUE; + } else if ((tcb->tx_buf.len + current_len + next_len) > + tcb->tx_buf.size) { + /* + * If the next fragment is too large to be + * copied to the current tx buffer, we need + * to complete the current copy processing. + */ + next_flag = (next_len > tx_ring->copy_thresh) ? + USE_DMA: USE_COPY; + copy_done = B_TRUE; + } else if (next_len > tx_ring->copy_thresh) { + /* + * The next fragment needs to be processed with + * DMA binding. So the copy prcessing will be + * completed with the current fragment. + */ + next_flag = USE_DMA; + copy_done = B_TRUE; + } else { + /* + * Continue to copy the next fragment to the + * current tx buffer. + */ + next_flag = USE_COPY; + copy_done = B_FALSE; + } + + desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp, + current_len, copy_done, eop); + } else { + /* + * Check whether to use bcopy or DMA binding to process + * the next fragment. + */ + next_flag = (next_len > tx_ring->copy_thresh) ? + USE_DMA: USE_COPY; + ASSERT(copy_done == B_TRUE); + + desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp, + current_len); + } + + if (desc_num > 0) + desc_total += desc_num; + else if (desc_num < 0) + goto tx_failure; + + current_mp = next_mp; + current_len = next_len; + current_flag = next_flag; + } + + /* + * Attach the mblk to the last tx control block + */ + ASSERT(tcb); + ASSERT(tcb->mp == NULL); + tcb->mp = mp; + + if (ixgbe->tx_hcksum_enable) { + /* + * Retrieve checksum context information from the mblk that will + * be used to decide whether/how to fill the context descriptor. + */ + hcksum = &hcksum_context; + ixgbe_get_hcksum_context(mp, hcksum); + } else { + hcksum = NULL; + } + + /* + * Before fill the tx descriptor ring with the data, we need to + * ensure there are adequate free descriptors for transmit + * (including one context descriptor). + */ + if (tx_ring->tbd_free < (desc_total + 1)) { + tx_ring->tx_recycle(tx_ring); + } + + mutex_enter(&tx_ring->tx_lock); + + /* + * If the number of free tx descriptors is not enough for transmit + * then return failure. 
+ * + * Note: we must put this check under the mutex protection to + * ensure the correctness when multiple threads access it in + * parallel. + */ + if (tx_ring->tbd_free < (desc_total + 1)) { + IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd); + mutex_exit(&tx_ring->tx_lock); + goto tx_failure; + } + + desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, hcksum); + + ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1))); + + mutex_exit(&tx_ring->tx_lock); + + return (B_TRUE); + +tx_failure: + /* + * Discard the mblk and free the used resources + */ + tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list); + while (tcb) { + tcb->mp = NULL; + + ixgbe_free_tcb(tcb); + + tcb = (tx_control_block_t *) + LIST_GET_NEXT(&pending_list, &tcb->link); + } + + /* + * Return the tx control blocks in the pending list to the free list. + */ + ixgbe_put_free_list(tx_ring, &pending_list); + + /* Transmit failed, do not drop the mblk, rechedule the transmit */ + tx_ring->reschedule = B_TRUE; + + return (B_FALSE); +} + +/* + * ixgbe_tx_copy + * + * Copy the mblk fragment to the pre-allocated tx buffer + */ +static int +ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp, + uint32_t len, boolean_t copy_done, boolean_t eop) +{ + dma_buffer_t *tx_buf; + uint32_t desc_num; + _NOTE(ARGUNUSED(tx_ring)); + + tx_buf = &tcb->tx_buf; + + /* + * Copy the packet data of the mblk fragment into the + * pre-allocated tx buffer, which is maintained by the + * tx control block. + * + * Several mblk fragments can be copied into one tx buffer. + * The destination address of the current copied fragment in + * the tx buffer is next to the end of the previous copied + * fragment. + */ + if (len > 0) { + bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len); + + tx_buf->len += len; + tcb->frag_num++; + } + + desc_num = 0; + + /* + * If it is the last fragment copied to the current tx buffer, + * in other words, if there's no remaining fragment or the remaining + * fragment requires a new tx control block to process, we need to + * complete the current copy processing by syncing up the current + * DMA buffer and saving the descriptor data. + */ + if (copy_done) { + /* + * For the packet smaller than 64 bytes, we need to + * pad it to 60 bytes. The NIC hardware will add 4 + * bytes of CRC. + */ + if (eop && (tx_buf->len < ETHERMIN)) { + bzero(tx_buf->address + tx_buf->len, + ETHERMIN - tx_buf->len); + tx_buf->len = ETHERMIN; + } + + /* + * Sync the DMA buffer of the packet data + */ + DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV); + + tcb->tx_type = USE_COPY; + + /* + * Save the address and length to the private data structure + * of the tx control block, which will be used to fill the + * tx descriptor ring after all the fragments are processed. 
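The copy path described above accumulates several small fragments into one pre-allocated, pre-mapped buffer, pads runt frames to the 60-byte minimum, and then spends a single descriptor on the whole buffer. A standalone sketch of that idea, with hypothetical names and a plain byte buffer standing in for the driver's dma_buffer_t:

    #include <string.h>
    #include <stdint.h>
    #include <stddef.h>

    #define SKETCH_ETHERMIN 60      /* minimum frame length before the 4-byte CRC */

    struct sketch_staging_buf {     /* stand-in for the driver's dma_buffer_t */
            uint8_t *address;       /* pre-allocated, pre-mapped buffer */
            size_t  size;           /* buffer capacity */
            size_t  len;            /* bytes copied in so far */
    };

    /* Append one fragment directly after the previously copied data. */
    static int
    sketch_copy_frag(struct sketch_staging_buf *buf, const uint8_t *frag,
        size_t frag_len)
    {
            if (buf->len + frag_len > buf->size)
                    return (-1);    /* caller must close out this buffer first */
            memcpy(buf->address + buf->len, frag, frag_len);
            buf->len += frag_len;
            return (0);
    }

    /* On the last fragment of a packet, zero-pad undersized frames. */
    static void
    sketch_finish_packet(struct sketch_staging_buf *buf)
    {
            if (buf->len < SKETCH_ETHERMIN) {
                    memset(buf->address + buf->len, 0,
                        SKETCH_ETHERMIN - buf->len);
                    buf->len = SKETCH_ETHERMIN;
            }
            /* the real driver would now DMA_SYNC() and save one descriptor */
    }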
*/ + ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len); + desc_num++; + } + + return (desc_num); +} + +/* + * ixgbe_tx_bind + * + * Bind the mblk fragment with DMA + */ +static int +ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp, + uint32_t len) +{ + int status, i; + ddi_dma_cookie_t dma_cookie; + uint_t ncookies; + int desc_num; + + /* + * Use DMA binding to process the mblk fragment + */ + status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL, + (caddr_t)mp->b_rptr, len, + DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, + 0, &dma_cookie, &ncookies); + + if (status != DDI_DMA_MAPPED) { + IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind); + return (-1); + } + + tcb->frag_num++; + tcb->tx_type = USE_DMA; + /* + * Each fragment can span several cookies. One cookie will have + * one tx descriptor to transmit. + */ + desc_num = 0; + for (i = ncookies; i > 0; i--) { + /* + * Save the address and length to the private data structure + * of the tx control block, which will be used to fill the + * tx descriptor ring after all the fragments are processed. + */ + ixgbe_save_desc(tcb, + dma_cookie.dmac_laddress, + dma_cookie.dmac_size); + + desc_num++; + + if (i > 1) + ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie); + } + + return (desc_num); +} + +/* + * ixgbe_get_hcksum_context + * + * Get the hcksum context information from the mblk + */ +static void +ixgbe_get_hcksum_context(mblk_t *mp, hcksum_context_t *hcksum) +{ + uint32_t start; + uint32_t flags; + uint32_t len; + uint32_t size; + uint32_t offset; + unsigned char *pos; + ushort_t etype; + uint32_t mac_hdr_len; + uint32_t l4_proto; + + ASSERT(mp != NULL); + + hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &flags); + + hcksum->hcksum_flags = flags; + + if (flags == 0) + return; + + etype = 0; + mac_hdr_len = 0; + l4_proto = 0; + + /* + * First, get the position of the ether_type/ether_tpid. + * Here we don't assume the ether (VLAN) header is fully included + * in one mblk fragment, so we go through the fragments to parse + * the ether type. + */ + size = len = MBLK_LEN(mp); + offset = offsetof(struct ether_header, ether_type); + while (size <= offset) { + mp = mp->b_cont; + ASSERT(mp != NULL); + len = MBLK_LEN(mp); + size += len; + } + pos = mp->b_rptr + offset + len - size; + + etype = ntohs(*(ushort_t *)(uintptr_t)pos); + if (etype == ETHERTYPE_VLAN) { + /* + * Get the position of the ether_type in the VLAN header + */ + offset = offsetof(struct ether_vlan_header, ether_type); + while (size <= offset) { + mp = mp->b_cont; + ASSERT(mp != NULL); + len = MBLK_LEN(mp); + size += len; + } + pos = mp->b_rptr + offset + len - size; + + etype = ntohs(*(ushort_t *)(uintptr_t)pos); + mac_hdr_len = sizeof (struct ether_vlan_header); + } else { + mac_hdr_len = sizeof (struct ether_header); + } + + /* + * Here we don't assume the IP(V6) header is fully included in + * one mblk fragment, so we go through the fragments to parse + * the protocol type.
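The fragment walk here depends on one identity: once the running byte count size covers offset, the wanted byte sits at b_rptr + offset + len - size within the current fragment. A minimal userland sketch of the same walk, with a hypothetical chained-buffer type standing in for mblk_t:

    #include <stdint.h>
    #include <stddef.h>

    struct sketch_frag {                /* stand-in for one mblk fragment */
            const uint8_t      *rptr;   /* start of data (b_rptr) */
            size_t             len;     /* bytes of data in this fragment */
            struct sketch_frag *next;   /* next fragment (b_cont) */
    };

    /*
     * Return a pointer to the byte at 'offset' from the start of the packet.
     * Walk the chain until the bytes seen so far ('size') cover the offset;
     * the byte then lives at rptr + offset + len - size, because size - len
     * bytes of the packet precede the current fragment. Returns NULL if the
     * chain is shorter than offset + 1 bytes.
     */
    static const uint8_t *
    sketch_byte_at(const struct sketch_frag *f, size_t offset)
    {
            size_t size = f->len;       /* total bytes covered so far */
            size_t len = f->len;        /* length of the current fragment */

            while (size <= offset) {
                    f = f->next;
                    if (f == NULL)
                            return (NULL);
                    len = f->len;
                    size += len;
            }
            return (f->rptr + offset + len - size);
    }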
+ */ + switch (etype) { + case ETHERTYPE_IP: + offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len; + while (size <= offset) { + mp = mp->b_cont; + ASSERT(mp != NULL); + len = MBLK_LEN(mp); + size += len; + } + pos = mp->b_rptr + offset + len - size; + + l4_proto = *(uint8_t *)pos; + break; + case ETHERTYPE_IPV6: + offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len; + while (size <= offset) { + mp = mp->b_cont; + ASSERT(mp != NULL); + len = MBLK_LEN(mp); + size += len; + } + pos = mp->b_rptr + offset + len - size; + + l4_proto = *(uint8_t *)pos; + break; + default: + /* Unrecoverable error */ + IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum"); + return; + } + + hcksum->mac_hdr_len = mac_hdr_len; + hcksum->ip_hdr_len = start; + hcksum->l4_proto = l4_proto; +} + +/* + * ixgbe_check_hcksum_context + * + * Check if a new context descriptor is needed + */ +static boolean_t +ixgbe_check_hcksum_context(ixgbe_tx_ring_t *tx_ring, hcksum_context_t *hcksum) +{ + hcksum_context_t *last; + + if (hcksum == NULL) + return (B_FALSE); + + /* + * Compare the checksum data retrieved from the mblk and the + * stored checksum data of the last context descriptor. The data + * need to be checked are: + * hcksum_flags + * l4_proto + * mac_hdr_len + * ip_hdr_len + * Either one of the above data is changed, a new context descriptor + * will be needed. + */ + last = &tx_ring->hcksum_context; + + if (hcksum->hcksum_flags != 0) { + if ((hcksum->hcksum_flags != last->hcksum_flags) || + (hcksum->l4_proto != last->l4_proto) || + (hcksum->mac_hdr_len != last->mac_hdr_len) || + (hcksum->ip_hdr_len != last->ip_hdr_len)) { + + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* + * ixgbe_fill_hcksum_context + * + * Fill the context descriptor with hardware checksum informations + */ +static void +ixgbe_fill_hcksum_context(struct ixgbe_adv_tx_context_desc *ctx_tbd, + hcksum_context_t *hcksum) +{ + /* + * Fill the context descriptor with the checksum + * context information we've got + */ + ctx_tbd->vlan_macip_lens = hcksum->ip_hdr_len; + ctx_tbd->vlan_macip_lens |= hcksum->mac_hdr_len << + IXGBE_ADVTXD_MACLEN_SHIFT; + + ctx_tbd->type_tucmd_mlhl = + IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT; + + if (hcksum->hcksum_flags & HCK_IPV4_HDRCKSUM) + ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; + + if (hcksum->hcksum_flags & HCK_PARTIALCKSUM) { + switch (hcksum->l4_proto) { + case IPPROTO_TCP: + ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; + break; + case IPPROTO_UDP: + /* + * We don't have to explicitly set: + * ctx_tbd->type_tucmd_mlhl |= + * IXGBE_ADVTXD_TUCMD_L4T_UDP; + * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b + */ + break; + default: + /* Unrecoverable error */ + IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum"); + break; + } + } + + ctx_tbd->seqnum_seed = 0; + ctx_tbd->mss_l4len_idx = 0; +} + +/* + * ixgbe_tx_fill_ring + * + * Fill the tx descriptor ring with the data + */ +static int +ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list, + hcksum_context_t *hcksum) +{ + struct ixgbe_hw *hw = &tx_ring->ixgbe->hw; + boolean_t load_context; + uint32_t index, tcb_index, desc_num; + union ixgbe_adv_tx_desc *tbd, *first_tbd; + tx_control_block_t *tcb, *first_tcb; + uint32_t hcksum_flags; + int i; + + ASSERT(mutex_owned(&tx_ring->tx_lock)); + + tbd = NULL; + first_tbd = NULL; + first_tcb = NULL; + desc_num = 0; + hcksum_flags = 0; + load_context = B_FALSE; + + /* + * Get the index of the first tx descriptor that will be filled, + * and the index of the first work list item 
that will be attached + * with the first used tx control block in the pending list. + * Note: the two indexes are the same. + */ + index = tx_ring->tbd_tail; + tcb_index = tx_ring->tbd_tail; + + if (hcksum != NULL) { + hcksum_flags = hcksum->hcksum_flags; + + /* + * Check if a new context descriptor is needed for this packet + */ + load_context = ixgbe_check_hcksum_context(tx_ring, hcksum); + if (load_context) { + first_tcb = (tx_control_block_t *) + LIST_GET_HEAD(pending_list); + tbd = &tx_ring->tbd_ring[index]; + + /* + * Fill the context descriptor with the + * hardware checksum offload informations. + */ + ixgbe_fill_hcksum_context( + (struct ixgbe_adv_tx_context_desc *)tbd, hcksum); + + index = NEXT_INDEX(index, 1, tx_ring->ring_size); + desc_num++; + + /* + * Store the checksum context data if + * a new context descriptor is added + */ + tx_ring->hcksum_context = *hcksum; + } + } + + first_tbd = &tx_ring->tbd_ring[index]; + + /* + * Fill tx data descriptors with the data saved in the pending list. + * The tx control blocks in the pending list are added to the work list + * at the same time. + * + * The work list is strictly 1:1 corresponding to the descriptor ring. + * One item of the work list corresponds to one tx descriptor. Because + * one tx control block can span multiple tx descriptors, the tx + * control block will be added to the first work list item that + * corresponds to the first tx descriptor generated from that tx + * control block. + */ + tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list); + while (tcb != NULL) { + + for (i = 0; i < tcb->desc_num; i++) { + tbd = &tx_ring->tbd_ring[index]; + + tbd->read.buffer_addr = tcb->desc[i].address; + tbd->read.cmd_type_len = tcb->desc[i].length; + + tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_RS | + IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_DATA; + + tbd->read.olinfo_status = 0; + + index = NEXT_INDEX(index, 1, tx_ring->ring_size); + desc_num++; + } + + if (first_tcb != NULL) { + /* + * Count the checksum context descriptor for + * the first tx control block. + */ + first_tcb->desc_num++; + first_tcb = NULL; + } + + /* + * Add the tx control block to the work list + */ + ASSERT(tx_ring->work_list[tcb_index] == NULL); + tx_ring->work_list[tcb_index] = tcb; + + tcb_index = index; + tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list); + } + + /* + * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only + * valid in the first descriptor of the packet. + */ + ASSERT(first_tbd != NULL); + first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS; + + /* Set hardware checksum bits */ + if (hcksum_flags != 0) { + if (hcksum_flags & HCK_IPV4_HDRCKSUM) + first_tbd->read.olinfo_status |= + IXGBE_TXD_POPTS_IXSM << 8; + if (hcksum_flags & HCK_PARTIALCKSUM) + first_tbd->read.olinfo_status |= + IXGBE_TXD_POPTS_TXSM << 8; + } + + /* + * The last descriptor of packet needs End Of Packet (EOP), + * and Report Status (RS) bits set + */ + ASSERT(tbd != NULL); + tbd->read.cmd_type_len |= + IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS; + + /* + * Sync the DMA buffer of the tx descriptor ring + */ + DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV); + + if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) { + ddi_fm_service_impact(tx_ring->ixgbe->dip, + DDI_SERVICE_DEGRADED); + } + + /* + * Update the number of the free tx descriptors. 
+ * The mutual exclusion between the transmission and the recycling + * (for the tx descriptor ring and the work list) is implemented + * with the atomic operation on the number of the free tx descriptors. + * + * Note: we should always decrement the counter tbd_free before + * advancing the hardware TDT pointer to avoid the race condition - + * before the counter tbd_free is decremented, the transmit of the + * tx descriptors has done and the counter tbd_free is increased by + * the tx recycling. + */ + i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num); + ASSERT(i >= 0); + + tx_ring->tbd_tail = index; + + /* + * Advance the hardware TDT pointer of the tx descriptor ring + */ + IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index); + + if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) != + DDI_FM_OK) { + ddi_fm_service_impact(tx_ring->ixgbe->dip, + DDI_SERVICE_DEGRADED); + } + + return (desc_num); +} + +/* + * ixgbe_save_desc + * + * Save the address/length pair to the private array + * of the tx control block. The address/length pairs + * will be filled into the tx descriptor ring later. + */ +static void +ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length) +{ + sw_desc_t *desc; + + desc = &tcb->desc[tcb->desc_num]; + desc->address = address; + desc->length = length; + + tcb->desc_num++; +} + +/* + * ixgbe_tx_recycle_legacy + * + * Recycle the tx descriptors and tx control blocks. + * + * The work list is traversed to check if the corresponding + * tx descriptors have been transmitted. If so, the resources + * bound to the tx control blocks will be freed, and those + * tx control blocks will be returned to the free list. + */ +uint32_t +ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring) +{ + uint32_t index, last_index; + int desc_num; + boolean_t desc_done; + tx_control_block_t *tcb; + link_list_t pending_list; + + /* + * The mutex_tryenter() is used to avoid unnecessary + * lock contention. + */ + if (mutex_tryenter(&tx_ring->recycle_lock) == 0) + return (0); + + ASSERT(tx_ring->tbd_free <= tx_ring->ring_size); + + if (tx_ring->tbd_free == tx_ring->ring_size) { + tx_ring->recycle_fail = 0; + tx_ring->stall_watchdog = 0; + mutex_exit(&tx_ring->recycle_lock); + return (0); + } + + /* + * Sync the DMA buffer of the tx descriptor ring + */ + DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL); + + if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) { + ddi_fm_service_impact(tx_ring->ixgbe->dip, + DDI_SERVICE_DEGRADED); + } + + LINK_LIST_INIT(&pending_list); + desc_num = 0; + index = tx_ring->tbd_head; /* Index of next tbd/tcb to recycle */ + + tcb = tx_ring->work_list[index]; + ASSERT(tcb != NULL); + + desc_done = B_TRUE; + while (desc_done && (tcb != NULL)) { + + /* + * Get the last tx descriptor of the tx control block. + * If the last tx descriptor is done, it is done with + * all the tx descriptors of the tx control block. + * Then the tx control block and all the corresponding + * tx descriptors can be recycled. + */ + last_index = NEXT_INDEX(index, tcb->desc_num - 1, + tx_ring->ring_size); + + /* + * Check if the Descriptor Done bit is set + */ + desc_done = tx_ring->tbd_ring[last_index].wb.status & + IXGBE_TXD_STAT_DD; + if (desc_done) { + /* + * Strip off the tx control block from the work list, + * and add it to the pending list. 
+ */ + tx_ring->work_list[index] = NULL; + LIST_PUSH_TAIL(&pending_list, &tcb->link); + + /* + * Count the total number of the tx descriptors recycled + */ + desc_num += tcb->desc_num; + + /* + * Advance the index of the tx descriptor ring + */ + index = NEXT_INDEX(last_index, 1, tx_ring->ring_size); + + tcb = tx_ring->work_list[index]; + } + } + + /* + * If no tx descriptors are recycled, no need to do more processing + */ + if (desc_num == 0) { + tx_ring->recycle_fail++; + mutex_exit(&tx_ring->recycle_lock); + return (0); + } + + tx_ring->recycle_fail = 0; + tx_ring->stall_watchdog = 0; + + /* + * Update the head index of the tx descriptor ring + */ + tx_ring->tbd_head = index; + + /* + * Update the number of the free tx descriptors with atomic operations + */ + atomic_add_32(&tx_ring->tbd_free, desc_num); + + mutex_exit(&tx_ring->recycle_lock); + + /* + * Free the resources used by the tx control blocks + * in the pending list + */ + tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list); + while (tcb != NULL) { + /* + * Release the resources occupied by the tx control block + */ + ixgbe_free_tcb(tcb); + + tcb = (tx_control_block_t *) + LIST_GET_NEXT(&pending_list, &tcb->link); + } + + /* + * Add the tx control blocks in the pending list to the free list. + */ + ixgbe_put_free_list(tx_ring, &pending_list); + + return (desc_num); +} + +/* + * ixgbe_tx_recycle_head_wb + * + * Check the head write-back, and recycle all the transmitted + * tx descriptors and tx control blocks. + */ +uint32_t +ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring) +{ + uint32_t index; + uint32_t head_wb; + int desc_num; + tx_control_block_t *tcb; + link_list_t pending_list; + + /* + * The mutex_tryenter() is used to avoid unnecessary + * lock contention. + */ + if (mutex_tryenter(&tx_ring->recycle_lock) == 0) + return (0); + + ASSERT(tx_ring->tbd_free <= tx_ring->ring_size); + + if (tx_ring->tbd_free == tx_ring->ring_size) { + tx_ring->recycle_fail = 0; + tx_ring->stall_watchdog = 0; + mutex_exit(&tx_ring->recycle_lock); + return (0); + } + + /* + * Sync the DMA buffer of the tx descriptor ring + * + * Note: For head write-back mode, the tx descriptors will not + * be written back, but the head write-back value is stored at + * the last extra tbd at the end of the DMA area, we still need + * to sync the head write-back value for kernel. + * + * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL); + */ + (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle, + sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size, + sizeof (uint32_t), + DDI_DMA_SYNC_FORKERNEL); + + if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) { + ddi_fm_service_impact(tx_ring->ixgbe->dip, + DDI_SERVICE_DEGRADED); + } + + LINK_LIST_INIT(&pending_list); + desc_num = 0; + index = tx_ring->tbd_head; /* Next index to clean */ + + /* + * Get the value of head write-back + */ + head_wb = *tx_ring->tbd_head_wb; + while (index != head_wb) { + tcb = tx_ring->work_list[index]; + ASSERT(tcb != NULL); + + if (OFFSET(index, head_wb, tx_ring->ring_size) < + tcb->desc_num) { + /* + * The current tx control block is not + * completely transmitted, stop recycling + */ + break; + } + + /* + * Strip off the tx control block from the work list, + * and add it to the pending list. 
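The NEXT_INDEX() and OFFSET() macros used in these recycle routines come from ixgbe_sw.h and are not part of this diff; judging from how they are used, they are modular advance and modular distance over the circular descriptor ring. A sketch of those assumed semantics:

    #include <stdint.h>

    /* Assumed behavior, not copied from ixgbe_sw.h. */
    static inline uint32_t
    sketch_next_index(uint32_t index, uint32_t n, uint32_t ring_size)
    {
            /* advance 'index' by 'n' slots, wrapping at the end of the ring */
            return ((index + n) % ring_size);
    }

    static inline uint32_t
    sketch_offset(uint32_t begin, uint32_t end, uint32_t ring_size)
    {
            /* number of slots from 'begin' forward to 'end' around the ring */
            return ((end >= begin) ? (end - begin) :
                (ring_size - begin + end));
    }

Under that reading, the check above says the hardware head has advanced past fewer slots than this tx control block occupies, so the block is not yet fully transmitted and recycling stops.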
*/ + tx_ring->work_list[index] = NULL; + LIST_PUSH_TAIL(&pending_list, &tcb->link); + + /* + * Advance the index of the tx descriptor ring + */ + index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size); + + /* + * Count the total number of the tx descriptors recycled + */ + desc_num += tcb->desc_num; + } + + /* + * If no tx descriptors are recycled, no need to do more processing + */ + if (desc_num == 0) { + tx_ring->recycle_fail++; + mutex_exit(&tx_ring->recycle_lock); + return (0); + } + + tx_ring->recycle_fail = 0; + tx_ring->stall_watchdog = 0; + + /* + * Update the head index of the tx descriptor ring + */ + tx_ring->tbd_head = index; + + /* + * Update the number of the free tx descriptors with atomic operations + */ + atomic_add_32(&tx_ring->tbd_free, desc_num); + + mutex_exit(&tx_ring->recycle_lock); + + /* + * Free the resources used by the tx control blocks + * in the pending list + */ + tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list); + while (tcb) { + /* + * Release the resources occupied by the tx control block + */ + ixgbe_free_tcb(tcb); + + tcb = (tx_control_block_t *) + LIST_GET_NEXT(&pending_list, &tcb->link); + } + + /* + * Add the tx control blocks in the pending list to the free list. + */ + ixgbe_put_free_list(tx_ring, &pending_list); + + return (desc_num); +} + +/* + * ixgbe_free_tcb - free up the tx control block + * + * Free the resources of the tx control block, including + * unbinding the previously bound DMA handle, and resetting other + * control fields. + */ +void +ixgbe_free_tcb(tx_control_block_t *tcb) +{ + switch (tcb->tx_type) { + case USE_COPY: + /* + * Reset the buffer length that is used for copy + */ + tcb->tx_buf.len = 0; + break; + case USE_DMA: + /* + * Release the DMA resource that is used for + * DMA binding. + */ + (void) ddi_dma_unbind_handle(tcb->tx_dma_handle); + break; + default: + break; + } + + /* + * Free the mblk + */ + if (tcb->mp != NULL) { + freemsg(tcb->mp); + tcb->mp = NULL; + } + + tcb->tx_type = USE_NONE; + tcb->frag_num = 0; + tcb->desc_num = 0; +} + +/* + * ixgbe_get_free_list - Get a free tx control block from the free list + * + * The atomic operation on the number of available tx control blocks + * in the free list is used to keep this routine mutually exclusive with + * the routine ixgbe_put_free_list. + */ +static tx_control_block_t * +ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring) +{ + tx_control_block_t *tcb; + + /* + * Check and update the number of free tx control blocks + * in the free list. + */ + if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0) + return (NULL); + + mutex_enter(&tx_ring->tcb_head_lock); + + tcb = tx_ring->free_list[tx_ring->tcb_head]; + ASSERT(tcb != NULL); + tx_ring->free_list[tx_ring->tcb_head] = NULL; + tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1, + tx_ring->free_list_size); + + mutex_exit(&tx_ring->tcb_head_lock); + + return (tcb); +} + +/* + * ixgbe_put_free_list + * + * Put a list of used tx control blocks back to the free list + * + * A mutex is used here to ensure serialization. The mutual exclusion + * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with + * the atomic operation on the counter tcb_free.
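ixgbe_atomic_reserve() is also defined outside this file; from its callers (tbd_free in ixgbe_tx_fill_ring, tcb_free here) it behaves as a "take n only if at least n are available" operation that fails without touching the counter. A standalone sketch of such a primitive using C11 atomics, offered as an assumption about the behavior rather than the driver's actual implementation:

    #include <stdatomic.h>
    #include <stdint.h>

    /*
     * Atomically subtract 'n' from '*count' only if the result stays
     * non-negative. Returns the new value on success, or -1 (leaving the
     * counter unchanged) when fewer than 'n' are available.
     */
    static int32_t
    sketch_atomic_reserve(_Atomic uint32_t *count, uint32_t n)
    {
            uint32_t oldval = atomic_load(count);

            do {
                    if (oldval < n)
                            return (-1);
            } while (!atomic_compare_exchange_weak(count, &oldval,
                oldval - n));

            return ((int32_t)(oldval - n));
    }

Failing without modifying the counter is what lets ixgbe_get_free_list back off and return NULL without ever taking a lock against ixgbe_put_free_list.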
*/ +void +ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list) +{ + uint32_t index; + int tcb_num; + tx_control_block_t *tcb; + + mutex_enter(&tx_ring->tcb_tail_lock); + + index = tx_ring->tcb_tail; + + tcb_num = 0; + tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list); + while (tcb != NULL) { + ASSERT(tx_ring->free_list[index] == NULL); + tx_ring->free_list[index] = tcb; + + tcb_num++; + + index = NEXT_INDEX(index, 1, tx_ring->free_list_size); + + tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list); + } + + tx_ring->tcb_tail = index; + + /* + * Update the number of free tx control blocks + * in the free list. This operation must be placed + * under the protection of the lock. + */ + atomic_add_32(&tx_ring->tcb_free, tcb_num); + + mutex_exit(&tx_ring->tcb_tail_lock); +}
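Stepping back to ixgbe_tx near the top of this file, the per-fragment choice between bcopy and DMA binding, and the decision about when a copy run must be closed out, reduce to a few size comparisons. A compact sketch of that decision, with hypothetical names and the driver's tunables passed in as parameters:

    #include <stddef.h>
    #include <stdbool.h>

    /*
     * A copy run must be closed out when the current fragment is the last
     * one, when the next fragment would overflow the staging buffer, or
     * when the next fragment is big enough to be DMA-bound on its own.
     */
    static bool
    sketch_copy_run_done(size_t staged_len, size_t cur_len, size_t next_len,
        bool last_frag, size_t buf_size, size_t copy_thresh)
    {
            if (last_frag)
                    return (true);
            if (staged_len + cur_len + next_len > buf_size)
                    return (true);
            if (next_len > copy_thresh)
                    return (true);
            return (false);
    }

    /* Small fragments are copied; anything larger is bound for DMA. */
    static bool
    sketch_use_copy(size_t frag_len, size_t copy_thresh)
    {
            return (frag_len <= copy_thresh);
    }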