/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. */ #include "i40e_sw.h" /* * --------------------------------------------------------- * Buffer and Memory Management, Receiving, and Transmitting * --------------------------------------------------------- * * Each physical function (PF), which is what we think of as an instance of the * device driver, has a series of associated transmit and receive queue pairs. * Effectively, what we think of in MAC as rings. Each of these has their own * ring of descriptors which is used as part of doing DMA activity. * * The transmit ring's descriptors are 16-byte entries which are used to send * packets, program filters, etc. The receive ring's descriptors are either * 16 or 32 bytes each. At the moment, we opt to use the larger descriptor * format so that we're in a better position if we ever want to leverage that * information later on. * * However, these rings are just for descriptors, they don't talk or deal with * how we actually store the memory that we need for DMA or the associated * information that we need for keeping track of message blocks. To correspond * to the hardware descriptor ring which is how we communicate with hardware, we * introduce a control block which keeps track of our required metadata like DMA * mappings. * * There are two main considerations that dictate how much memory and buffers * we end up allocating. Those are: * * o The size of the ring (controlled through the driver.conf file) * * o The maximum size frame we can receive. * * The size of the rings currently defaults to 1024 descriptors and is stored in * the i40e_t`i40e_rx_ring_size and i40e_t`i40e_tx_ring_size. * * While the size of the rings is controlled by the driver.conf, the maximum * size frame is informed primarily through the use of dladm and the setting of * the MTU property on the device. From the MTU, we then go and do some * machinations. The first thing we do is add in space for the * Ethernet header, potentially a VLAN header, and the FCS check. This value is * what's stored as i40e_t`i40e_frame_max and is derived any time * i40e_t`i40e_sdu changes. * * This size is then rounded up to the nearest 1k chunk, which represents the * actual amount of memory that we'll allocate for a single frame. * * Note that for rx, we do something that might be unexpected. We always add * an extra two bytes to the frame size that we allocate. We then offset the DMA * address that we receive a packet into by two bytes. This ensures that the IP * header will always be 4 byte aligned because the MAC header is either 14 or * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's * and MAC's lives easier. * * Both the rx and tx descriptor rings (which are what we use to communicate * with hardware) are allocated as a single region of DMA memory which is the * size of the descriptor (32 bytes and 16 bytes respectively) times the total * number of descriptors for an rx and tx ring.
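 *
 * To make the sizing above concrete, here is a minimal sketch of the rx
 * buffer arithmetic; it is illustrative only and the variable names are
 * made up (the driver keeps the real values in i40e_t members). The 18
 * covers an Ethernet header with an 802.1Q tag and the 4 covers the FCS:
 *
 *	uint32_t frame_max = sdu + 18 + 4;
 *	uint32_t bufsz = (frame_max + 1023) & ~1023U;
 *
 * With the default 1500 byte MTU that works out to a 2 KiB buffer for each
 * rx frame. The extra two alignment bytes mentioned above are then carved
 * out of that buffer by offsetting the DMA address, as i40e_alloc_rx_dma()
 * does further down in this file.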
* * While the rx and tx descriptors are allocated using DMA-based memory, the * control blocks for each of them are allocated using normal kernel memory. * They aren't special from a DMA perspective. We'll go over the design of both * receiving and transmitting separately, as they have slightly different * control blocks and different ways that we manage the relationship between * control blocks and descriptors. * * --------------------------------- * RX Descriptors and Control Blocks * --------------------------------- * * For every descriptor in the ring that the driver has, we need some associated * memory, which means that we need to have the receive specific control block. * We have a couple different, but related goals: * * o Once we've completed the mc_start GLDv3 endpoint (i40e_m_start), we do * not want to do any additional memory allocations or DMA allocations if * we don't have to. * * o We'd like to try and do as much zero-copy as possible, while taking into * account the cost of mapping in DMA resources. * * o We'd like to have every receive descriptor available. * * Now, these rules are a bit in tension with one another. The act of mapping in * is an exercise of trying to find the break-even point between page table * updates and bcopy. We currently start by using the same metrics that ixgbe * used; however, it should be known that this value has effectively been * cargo-culted across to yet another driver, sorry. * * If we receive a packet which is larger than our copy threshold, we'll create * a message block out of the DMA memory via desballoc(9F) and send that up to * MAC that way. This will cause us to be notified when the message block is * then freed because it has been consumed, dropped, or otherwise. Otherwise, if * it's less than the threshold, we'll try to use allocb and bcopy it into the * block, thus allowing us to immediately reuse the DMA resource. Note, on debug * builds, we allow someone to whack the variable i40e_debug_rx_mode to override * the behavior and always do a bcopy or a DMA bind. * * To try and ensure that the device always has blocks that it can receive data * into, we maintain two lists of control blocks, a working list and a free * list. Each list is sized equal to the number of descriptors in the rx ring. * During the GLDv3 mc_start routine, we allocate a number of rx control blocks * equal to twice the number of descriptors in the ring and we assign them * equally to the free list and to the working list. Each control block also has * DMA memory allocated and associated with which it will be used to receive the * actual packet data. All of a received frame's data will end up in a single * DMA buffer. * * During operation, we always maintain the invariant that each rx descriptor * has an associated rx control block which lives in the working list. If we * feel that we should loan up DMA memory to MAC in the form of a message block, * we can only do so if we can maintain this invariant. To do that, we swap in * one of the buffers from the free list. If none are available, then we resort * to using allocb(9F) and bcopy(9F) on the packet instead, regardless of the * size. * * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is * called on the block, at which point we restore the rx control block to the * free list and are able to reuse the DMA memory again. 
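 *
 * Condensed down, the per-frame decision described above mirrors the logic
 * that i40e_ring_rx() uses later in this file (plen is the length of the
 * received frame):
 *
 *	mp = NULL;
 *	if (plen >= i40e->i40e_rx_dma_min)
 *		mp = i40e_rx_bind(itrq, rxd, cur_head, plen);
 *	if (mp == NULL)
 *		mp = i40e_rx_copy(itrq, rxd, cur_head, plen);
 *
 * i40e_rx_bind() fails over to the copy path whenever it cannot preserve
 * the invariant above, for example because the free list is empty or
 * desballoc(9F) fails.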
While the scheme may * seem odd, it importantly keeps us out of trying to do any DMA allocations in * the normal path of operation, even though we may still have to allocate * message blocks and copy. * * The following state machine describes the life time of a rx control block. In * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx * control block entry as rcb. * * | | * * ... 1/2 of all initial rcb's ... * * | | * v v * +------------------+ +------------------+ * | rcb on free list |---*---------->| rcb on work list | * +------------------+ . +------------------+ * ^ . moved to | * | replace rcb * . . Frame received, * | loaned to | entry on free list * | MAC + co. | available. rcb's * | | memory made into mblk_t * * . freemsg(9F) | and sent up to MAC. * | called on | * | loaned rcb | * | and it is v * | recycled. +-------------------+ * +--------------------<-----| rcb loaned to MAC | * +-------------------+ * * Finally, note that every rx control block has a reference count on it. One * reference is added as long as the driver has had the GLDv3 mc_start endpoint * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and * no other DLPI consumers remain, then we'll decrement the reference count by * one. Whenever we loan up the rx control block and associated buffer to MAC, * then we bump the reference count again. Even though the device is stopped, * there may still be loaned frames in upper levels that we'll want to account * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure * that it is cleaned up. * * -------------------- * Managing the RX Ring * -------------------- * * The receive ring descriptors are arranged in a circular buffer with a head * and tail pointer. There are both the conventional head and tail pointers * which are used to partition the ring into two portions, a portion that we, * the operating system, manage and a portion that is managed by hardware. When * hardware owns a descriptor in the ring, it means that it is waiting for data * to be filled in. However, when a portion of the ring is owned by the driver, * then that means that the descriptor has been consumed and we need to go take * a look at it. * * The initial head is configured to be zero by writing it as such in the * receive queue context in the FPM (function private memory from the host). The * initial tail is written to be the last descriptor. This is written to via the * PCIe register I40E_QRX_TAIL(). Technically, hardware owns everything between * the HEAD and TAIL, inclusive. Note that while we initially program the HEAD, * the only values we ever consult ourselves are the TAIL register and our own * state tracking. Effectively, we cache the HEAD register and then update it * ourselves based on our work. * * When we iterate over the rx descriptors and thus the received frames, we are * either in an interrupt context or we've been asked by MAC to poll on the * ring. If we've been asked to poll on the ring, we have a maximum number of * bytes of mblk_t's to return. If processing an rx descriptor would cause us to * exceed that count, then we do not process it. When in interrupt context, we * don't have a strict byte count. However, to ensure liveness, we limit the * amount of data based on a configuration value * (i40e_t`i40e_rx_limit_per_intr). The number that we've started with for this * is based on similar numbers that are used for ixgbe. 
After some additional * time in the field, we'll have a sense as to whether or not it should be * changed. * * When processing, we start at our own HEAD pointer * (i40e_rx_data_t`rxd_desc_next), which indicates the descriptor to start * processing. Every RX descriptor has what's described as the DD bit. This bit * (the LSB of the second 8-byte word), indicates whether or not the descriptor * is done. When we give descriptors to the hardware, this value is always * zero. When the hardware has finished a descriptor, it will always be one. * * The first thing that we check is whether the DD bit indicates that the * current HEAD is ready. If it isn't, then we're done. That's the primary * invariant of processing a frame. If it's done, then there are a few other * things that we want to look at. In the same status word as the DD bit, there * are two other important bits: * * o End of Packet (EOP) * o Error bits * * The end of packet indicates that we have reached the last descriptor. Now, * you might ask when would there be more than one descriptor. The reason for * that might be due to large receive offload (lro) or header splitting * functionality, which presently isn't supported in the driver. The error bits * in the frame are only valid when EOP is set. * * If error bits are set on the frame, then we still consume it; however, we * will not generate an mblk_t to send up to MAC. If there are no error bits * set, then we'll consume the descriptor either using bcopy or DMA binding. See * the earlier section 'RX DESCRIPTORS AND CONTROL BLOCKS' for more information * on how that selection is made. * * Regardless of whether we construct an mblk_t or encounter an error, we end up * resetting the descriptor. This re-arms the descriptor for hardware and in the * process, we may end up assigning it a new receive control bock. After we do * this, we always update our HEAD pointer, no matter what. * * Finally, once we've consumed as much as we will in a given window, we go and * update the TAIL register to indicate all the frames we've consumed. We only * do a single bulk write for the ring. * * --------------------------------- * TX Descriptors and Control Blocks * --------------------------------- * * While the transmit path is similar in spirit to the receive path, it works * differently due to the fact that all data is originated by the operating * system and not by the device. * * Like rx, there is both a descriptor ring that we use to communicate to the * driver and which points to the memory used to transmit a frame. Similarly, * there is a corresponding transmit control block. Each transmit control block * has a region of DMA memory allocated to it; however, the way we use it * varies. * * The driver is asked to process a single frame at a time. That message block * may be made up of multiple fragments linked together by the mblk_t`b_cont * member. The device has a hard limit of up to 8 buffers being allowed for use * for a single logical frame. For each fragment, we'll try and use an entry * from the tx descriptor ring and then we'll allocate a corresponding tx * control block. Depending on the size of the fragment, we may copy it around * or we might instead try to do DMA binding of the fragment. * * If we exceed the number of blocks that fit, we'll try to pull up the block * and then we'll do a DMA bind and send it out. * * If we don't have enough space in the ring or tx control blocks available, * then we'll return the unprocessed message block to MAC. 
This will induce flow * control and once we recycle enough entries, we'll once again enable sending * on the ring. * * We size the working list as equal to the number of descriptors in the ring. * We size the free list as equal to 1.5 times the number of descriptors in the * ring. We'll allocate a number of tx control block entries equal to the number * of entries in the free list. By default, all entries are placed in the free * list. As we come along and try to send something, we'll allocate entries from * the free list and add them to the working list, where they'll stay until the * hardware indicates that all of the data has been written back to us. The * reason that we start with 1.5x is to help facilitate having more than one TX * buffer associated with the DMA activity. * * -------------------- * Managing the TX Ring * -------------------- * * The transmit descriptor ring is driven by us. We maintain our own notion of a * HEAD and TAIL register and we update the hardware with updates to the TAIL * register. When the hardware is done writing out data, it updates us by * writing back to a specific address, not by updating the individual * descriptors. That address is a 4-byte region after the main transmit * descriptor ring. This is why the descriptor ring has an extra descriptor's * worth allocated to it. * * We maintain our notion of the HEAD in the i40e_trqpair_t`itrq_desc_head and * the TAIL in the i40e_trqpair_t`itrq_desc_tail. When we write out frames, * we'll update the tail there and in the I40E_QTX_TAIL() register. At various * points in time, through both interrupts, and our own internal checks, we'll * sync the write-back head portion of the DMA space. Based on the index it * reports back, we'll free everything between our current HEAD and the * indicated index and update HEAD to the new index. * * When a frame comes in, we try to use a number of transmit control blocks and * we'll transition them from the free list to the work list. They'll get moved * to the entry on the work list that corresponds with the transmit descriptor * they correspond to. Once we are indicated that the corresponding descriptor * has been freed, we'll return it to the list. * * The transmit control block free list is managed by keeping track of the * number of entries in it, i40e_trqpair_t`itrq_tcb_free. We use it as a way to * index into the free list and add things to it. In effect, we always push and * pop from the tail and protect it with a single lock, * i40e_trqpair_t`itrq_tcb_lock. This scheme is somewhat simplistic and may not * stand up to further performance testing; however, it does allow us to get off * the ground with the device driver. * * The following image describes where a given transmit control block lives in * its lifetime: * * | * * ... Initial placement for all tcb's * | * v * +------------------+ +------------------+ * | tcb on free list |---*------------------>| tcb on work list | * +------------------+ . +------------------+ * ^ . tcb allocated | * | to send frame v * | or fragment on | * | wire, mblk from | * | MAC associated. | * | | * +------*-------------------------------<----+ * . * . Hardware indicates * entry transmitted. * tcb recycled, mblk * from MAC freed. * * ------------ * Blocking MAC * ------------ * * When performing transmit, we can run out of descriptors and ring entries. * When such a case happens, we return the mblk_t to MAC to indicate that we've * been blocked. 
At that point in time, MAC becomes blocked and will not * transmit anything out that specific ring until we notify MAC. To indicate * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member * to B_TRUE. * * When we recycle tx descriptors then we'll end up signaling MAC by calling * mac_tx_ring_update() if we were blocked, letting it know that it's safe to * start sending frames out to us again. */ /* * We set our DMA alignment requests based on the smallest supported page size * of the corresponding platform. */ #if defined(__sparc) #define I40E_DMA_ALIGNMENT 0x2000ull #elif defined(__x86) #define I40E_DMA_ALIGNMENT 0x1000ull #else #error "unknown architecture for i40e" #endif /* * This structure is used to maintain information and flags related to * transmitting a frame. These fields are ultimately used to construct the * tx data descriptor(s) and, if necessary, the tx context descriptor. */ typedef struct i40e_tx_context { enum i40e_tx_desc_cmd_bits itc_data_cmdflags; uint32_t itc_data_offsets; boolean_t itc_ctx_tunneled; uint32_t itc_ctx_tunnel_fld; enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags; uint32_t itc_ctx_tsolen; uint32_t itc_ctx_mss; } i40e_tx_context_t; /* * Toggles on debug builds which can be used to override our RX behaviour based * on thresholds. */ #ifdef DEBUG typedef enum { I40E_DEBUG_RX_DEFAULT = 0, I40E_DEBUG_RX_BCOPY = 1, I40E_DEBUG_RX_DMABIND = 2 } i40e_debug_rx_t; i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; #endif /* DEBUG */ /* * Notes on the following pair of DMA attributes. The first attribute, * i40e_static_dma_attr, is designed to be used for both the descriptor rings * and the static buffers that we associate with control blocks. For this * reason, we force an SGL length of one. While technically the driver supports * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our * management here. In addition, when the Intel common code wants to allocate * memory via the i40e_allocate_virt_mem osdep function, we have it leverage * the static dma attr. * * The latter two sets of attributes, are what we use when we're binding a * bunch of mblk_t fragments to go out the door. Note that the main difference * here is that we're allowed a larger SGL length. For non-LSO tx, we * restrict the SGL length to match the number of tx buffers available to the * PF (8). For the LSO case we can go much larger, with the caveat that each * MSS-sized chunk (segment) must not span more than 8 data descriptors and * hence must not span more than 8 cookies. * * Note, we default to setting ourselves to be DMA capable here. However, * because we could have multiple instances which have different FMA error * checking capabilities, or end up on different buses, we make these static * and const and copy them into the i40e_t for the given device with the actual * values that reflect the actual capabilities. 
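 *
 * As a small illustration of that pattern (i40e_init_dma_attrs() near the
 * bottom of this file is the real implementation), the per-instance copy
 * amounts to:
 *
 *	bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr,
 *	    sizeof (ddi_dma_attr_t));
 *	if (fma)
 *		i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
 *	else
 *		i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR;
 *
 * with the same treatment applied to the tx bind and LSO bind attributes.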
*/ static const ddi_dma_attr_t i40e_g_static_dma_attr = { DMA_ATTR_V0, /* version number */ 0x0000000000000000ull, /* low address */ 0xFFFFFFFFFFFFFFFFull, /* high address */ 0x00000000FFFFFFFFull, /* dma counter max */ I40E_DMA_ALIGNMENT, /* alignment */ 0x00000FFF, /* burst sizes */ 0x00000001, /* minimum transfer size */ 0x00000000FFFFFFFFull, /* maximum transfer size */ 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ 1, /* scatter/gather list length */ 0x00000001, /* granularity */ DDI_DMA_FLAGERR /* DMA flags */ }; static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DMA_ATTR_V0, /* version number */ 0x0000000000000000ull, /* low address */ 0xFFFFFFFFFFFFFFFFull, /* high address */ I40E_MAX_TX_BUFSZ, /* dma counter max */ I40E_DMA_ALIGNMENT, /* alignment */ 0x00000FFF, /* burst sizes */ 0x00000001, /* minimum transfer size */ 0x00000000FFFFFFFFull, /* maximum transfer size */ 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ I40E_TX_MAX_COOKIE, /* scatter/gather list length */ 0x00000001, /* granularity */ DDI_DMA_FLAGERR /* DMA flags */ }; static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = { DMA_ATTR_V0, /* version number */ 0x0000000000000000ull, /* low address */ 0xFFFFFFFFFFFFFFFFull, /* high address */ I40E_MAX_TX_BUFSZ, /* dma counter max */ I40E_DMA_ALIGNMENT, /* alignment */ 0x00000FFF, /* burst sizes */ 0x00000001, /* minimum transfer size */ 0x00000000FFFFFFFFull, /* maximum transfer size */ 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */ 0x00000001, /* granularity */ DDI_DMA_FLAGERR /* DMA flags */ }; /* * Next, we have the attributes for these structures. The descriptor rings are * all strictly little endian, while the data buffers are just arrays of bytes * representing frames. Because of this, we purposefully simplify the driver * programming life by programming the descriptor ring as little endian, while * for the buffer data we keep it as unstructured. * * Note, that to keep the Intel common code operating in a reasonable way, when * we allocate DMA memory for it, we do not use byte swapping and thus use the * standard i40e_buf_acc_attr. */ static const ddi_device_acc_attr_t i40e_g_desc_acc_attr = { DDI_DEVICE_ATTR_V0, DDI_STRUCTURE_LE_ACC, DDI_STRICTORDER_ACC }; static const ddi_device_acc_attr_t i40e_g_buf_acc_attr = { DDI_DEVICE_ATTR_V0, DDI_NEVERSWAP_ACC, DDI_STRICTORDER_ACC }; /* * The next two functions are designed to be type-safe versions of macros that * are used to increment and decrement a descriptor index in the loop. Note, * these are marked inline to try and keep the data path hot and they were * effectively inlined in their previous life as macros. */ static inline int i40e_next_desc(int base, int count, int size) { int out; ASSERT(base >= 0); ASSERT(count > 0); ASSERT(size > 0); if (base + count < size) { out = base + count; } else { out = base + count - size; } ASSERT(out >= 0 && out < size); return (out); } static inline int i40e_prev_desc(int base, int count, int size) { int out; ASSERT(base >= 0); ASSERT(count > 0); ASSERT(size > 0); if (base >= count) { out = base - count; } else { out = base - count + size; } ASSERT(out >= 0 && out < size); return (out); } /* * Free DMA memory that is represented by a i40e_dma_buffer_t. 
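 * Note that teardown happens in the reverse order of setup: the handle is
 * unbound first, then the backing memory is freed, and finally the DMA
 * handle itself is released. Every step is guarded by a NULL check so that
 * a partially constructed buffer, e.g. one left behind by a failed
 * i40e_alloc_dma_buffer(), can be passed here safely.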
*/ static void i40e_free_dma_buffer(i40e_dma_buffer_t *dmap) { if (dmap->dmab_dma_address != NULL) { VERIFY(dmap->dmab_dma_handle != NULL); (void) ddi_dma_unbind_handle(dmap->dmab_dma_handle); dmap->dmab_dma_address = NULL; dmap->dmab_size = 0; } if (dmap->dmab_acc_handle != NULL) { ddi_dma_mem_free(&dmap->dmab_acc_handle); dmap->dmab_acc_handle = NULL; dmap->dmab_address = NULL; } if (dmap->dmab_dma_handle != NULL) { ddi_dma_free_handle(&dmap->dmab_dma_handle); dmap->dmab_dma_handle = NULL; } /* * These should only be set if we have valid handles allocated and * therefore should always be NULLed out due to the above code. This * is here to catch us acting sloppy. */ ASSERT(dmap->dmab_dma_address == NULL); ASSERT(dmap->dmab_address == NULL); ASSERT(dmap->dmab_size == 0); dmap->dmab_len = 0; } /* * Allocate size bytes of DMA memory based on the passed in attributes. This * fills in the information in dmap and is designed for all of our single cookie * allocations. */ static boolean_t i40e_alloc_dma_buffer(i40e_t *i40e, i40e_dma_buffer_t *dmap, ddi_dma_attr_t *attrsp, ddi_device_acc_attr_t *accp, boolean_t stream, boolean_t zero, size_t size) { int ret; uint_t flags; size_t len; ddi_dma_cookie_t cookie; uint_t ncookies; if (stream == B_TRUE) flags = DDI_DMA_STREAMING; else flags = DDI_DMA_CONSISTENT; /* * Step one: Allocate the DMA handle */ ret = ddi_dma_alloc_handle(i40e->i40e_dip, attrsp, DDI_DMA_DONTWAIT, NULL, &dmap->dmab_dma_handle); if (ret != DDI_SUCCESS) { i40e_error(i40e, "failed to allocate dma handle for I/O " "buffers: %d", ret); dmap->dmab_dma_handle = NULL; return (B_FALSE); } /* * Step two: Allocate the DMA memory */ ret = ddi_dma_mem_alloc(dmap->dmab_dma_handle, size, accp, flags, DDI_DMA_DONTWAIT, NULL, &dmap->dmab_address, &len, &dmap->dmab_acc_handle); if (ret != DDI_SUCCESS) { i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O " "buffers", size); dmap->dmab_address = NULL; dmap->dmab_acc_handle = NULL; i40e_free_dma_buffer(dmap); return (B_FALSE); } /* * Step three: Optionally zero */ if (zero == B_TRUE) bzero(dmap->dmab_address, len); /* * Step four: Bind the memory */ ret = ddi_dma_addr_bind_handle(dmap->dmab_dma_handle, NULL, dmap->dmab_address, len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT, NULL, &cookie, &ncookies); if (ret != DDI_DMA_MAPPED) { i40e_error(i40e, "failed to allocate %ld bytes of DMA for I/O " "buffers: %d", size, ret); i40e_free_dma_buffer(dmap); return (B_FALSE); } VERIFY(ncookies == 1); dmap->dmab_dma_address = cookie.dmac_laddress; dmap->dmab_size = len; dmap->dmab_len = 0; return (B_TRUE); } /* * This function is called once the last pending rcb has been freed by the upper * levels of the system. 
*/ static void i40e_free_rx_data(i40e_rx_data_t *rxd) { VERIFY(rxd->rxd_rcb_pending == 0); if (rxd->rxd_rcb_area != NULL) { kmem_free(rxd->rxd_rcb_area, sizeof (i40e_rx_control_block_t) * (rxd->rxd_free_list_size + rxd->rxd_ring_size)); rxd->rxd_rcb_area = NULL; } if (rxd->rxd_free_list != NULL) { kmem_free(rxd->rxd_free_list, sizeof (i40e_rx_control_block_t *) * rxd->rxd_free_list_size); rxd->rxd_free_list = NULL; } if (rxd->rxd_work_list != NULL) { kmem_free(rxd->rxd_work_list, sizeof (i40e_rx_control_block_t *) * rxd->rxd_ring_size); rxd->rxd_work_list = NULL; } kmem_free(rxd, sizeof (i40e_rx_data_t)); } static boolean_t i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) { i40e_rx_data_t *rxd; rxd = kmem_zalloc(sizeof (i40e_rx_data_t), KM_NOSLEEP); if (rxd == NULL) return (B_FALSE); itrq->itrq_rxdata = rxd; rxd->rxd_i40e = i40e; rxd->rxd_ring_size = i40e->i40e_rx_ring_size; rxd->rxd_free_list_size = i40e->i40e_rx_ring_size; rxd->rxd_rcb_free = rxd->rxd_free_list_size; rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_ring_size, KM_NOSLEEP); if (rxd->rxd_work_list == NULL) { i40e_error(i40e, "failed to allocate rx work list for a ring " "of %d entries for ring %d", rxd->rxd_ring_size, itrq->itrq_index); goto cleanup; } rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_free_list_size, KM_NOSLEEP); if (rxd->rxd_free_list == NULL) { i40e_error(i40e, "failed to allocate a %d entry rx free list " "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); goto cleanup; } rxd->rxd_rcb_area = kmem_zalloc(sizeof (i40e_rx_control_block_t) * (rxd->rxd_free_list_size + rxd->rxd_ring_size), KM_NOSLEEP); if (rxd->rxd_rcb_area == NULL) { i40e_error(i40e, "failed to allocate a %d entry rcb area for " "ring %d", rxd->rxd_ring_size + rxd->rxd_free_list_size, itrq->itrq_index); goto cleanup; } return (B_TRUE); cleanup: i40e_free_rx_data(rxd); itrq->itrq_rxdata = NULL; return (B_FALSE); } /* * Free all of the memory that we've allocated for DMA. Note that we may have * buffers that we've loaned up to the OS which are still outstanding. We'll * always free up the descriptor ring, because we no longer need that. For each * rcb, we'll iterate over it and if we send the reference count to zero, then * we'll free the message block and DMA related resources. However, if we don't * take the last one, then we'll go ahead and keep track that we'll have pending * data and clean it up when we get there. */ static void i40e_free_rx_dma(i40e_rx_data_t *rxd, boolean_t failed_init) { uint32_t i, count, ref; i40e_rx_control_block_t *rcb; i40e_t *i40e = rxd->rxd_i40e; i40e_free_dma_buffer(&rxd->rxd_desc_area); rxd->rxd_desc_ring = NULL; rxd->rxd_desc_next = 0; mutex_enter(&i40e->i40e_rx_pending_lock); rcb = rxd->rxd_rcb_area; count = rxd->rxd_ring_size + rxd->rxd_free_list_size; for (i = 0; i < count; i++, rcb++) { VERIFY(rcb != NULL); /* * If we're cleaning up from a failed creation attempt, then an * entry may never have been assembled which would mean that * it's reference count is zero. If we find that, we leave it * be, because nothing else should be modifying it at this * point. We're not at the point that any more references can be * added, just removed. 
*/ if (failed_init == B_TRUE && rcb->rcb_ref == 0) continue; ref = atomic_dec_32_nv(&rcb->rcb_ref); if (ref == 0) { freemsg(rcb->rcb_mp); rcb->rcb_mp = NULL; i40e_free_dma_buffer(&rcb->rcb_dma); } else { atomic_inc_32(&rxd->rxd_rcb_pending); atomic_inc_32(&i40e->i40e_rx_pending); } } mutex_exit(&i40e->i40e_rx_pending_lock); } /* * Initialize the DMA memory for the descriptor ring and for each frame in the * control block list. */ static boolean_t i40e_alloc_rx_dma(i40e_rx_data_t *rxd) { int i, count; size_t dmasz; i40e_rx_control_block_t *rcb; i40e_t *i40e = rxd->rxd_i40e; /* * First allocate the rx descriptor ring. */ dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; VERIFY(dmasz > 0); if (i40e_alloc_dma_buffer(i40e, &rxd->rxd_desc_area, &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate DMA resources " "for rx descriptor ring"); return (B_FALSE); } rxd->rxd_desc_ring = (i40e_rx_desc_t *)(uintptr_t)rxd->rxd_desc_area.dmab_address; rxd->rxd_desc_next = 0; count = rxd->rxd_ring_size + rxd->rxd_free_list_size; rcb = rxd->rxd_rcb_area; dmasz = i40e->i40e_rx_buf_size; VERIFY(dmasz > 0); for (i = 0; i < count; i++, rcb++) { i40e_dma_buffer_t *dmap; VERIFY(rcb != NULL); if (i < rxd->rxd_ring_size) { rxd->rxd_work_list[i] = rcb; } else { rxd->rxd_free_list[i - rxd->rxd_ring_size] = rcb; } dmap = &rcb->rcb_dma; if (i40e_alloc_dma_buffer(i40e, dmap, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate rx dma buffer"); return (B_FALSE); } /* * Initialize the control block and offset the DMA address. See * the note in the big theory statement that explains how this * helps IP deal with alignment. Note, we don't worry about * whether or not we successfully get an mblk_t from desballoc, * it's a common case that we have to handle later on in the * system. 
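 * Concretely, with the two byte offset the IP header of a received frame
 * begins at offset 14 + 2 = 16 for an untagged frame or 18 + 2 = 20 for an
 * 802.1Q tagged frame, both of which are 4 byte aligned.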
*/ dmap->dmab_size -= I40E_BUF_IPHDR_ALIGNMENT; dmap->dmab_address += I40E_BUF_IPHDR_ALIGNMENT; dmap->dmab_dma_address += I40E_BUF_IPHDR_ALIGNMENT; rcb->rcb_ref = 1; rcb->rcb_rxd = rxd; rcb->rcb_free_rtn.free_func = i40e_rx_recycle; rcb->rcb_free_rtn.free_arg = (caddr_t)rcb; rcb->rcb_mp = desballoc((unsigned char *)dmap->dmab_address, dmap->dmab_size, 0, &rcb->rcb_free_rtn); } return (B_TRUE); } static void i40e_free_tx_dma(i40e_trqpair_t *itrq) { size_t fsz; if (itrq->itrq_tcb_area != NULL) { uint32_t i; i40e_tx_control_block_t *tcb = itrq->itrq_tcb_area; for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) { i40e_free_dma_buffer(&tcb->tcb_dma); if (tcb->tcb_dma_handle != NULL) { ddi_dma_free_handle(&tcb->tcb_dma_handle); tcb->tcb_dma_handle = NULL; } if (tcb->tcb_lso_dma_handle != NULL) { ddi_dma_free_handle(&tcb->tcb_lso_dma_handle); tcb->tcb_lso_dma_handle = NULL; } } fsz = sizeof (i40e_tx_control_block_t) * itrq->itrq_tx_free_list_size; kmem_free(itrq->itrq_tcb_area, fsz); itrq->itrq_tcb_area = NULL; } if (itrq->itrq_tcb_free_list != NULL) { fsz = sizeof (i40e_tx_control_block_t *) * itrq->itrq_tx_free_list_size; kmem_free(itrq->itrq_tcb_free_list, fsz); itrq->itrq_tcb_free_list = NULL; } if (itrq->itrq_tcb_work_list != NULL) { fsz = sizeof (i40e_tx_control_block_t *) * itrq->itrq_tx_ring_size; kmem_free(itrq->itrq_tcb_work_list, fsz); itrq->itrq_tcb_work_list = NULL; } i40e_free_dma_buffer(&itrq->itrq_desc_area); itrq->itrq_desc_ring = NULL; } static boolean_t i40e_alloc_tx_dma(i40e_trqpair_t *itrq) { int i, ret; size_t dmasz; i40e_tx_control_block_t *tcb; i40e_t *i40e = itrq->itrq_i40e; itrq->itrq_tx_ring_size = i40e->i40e_tx_ring_size; itrq->itrq_tx_free_list_size = i40e->i40e_tx_ring_size + (i40e->i40e_tx_ring_size >> 1); /* * Allocate an additional tx descriptor for the writeback head. */ dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; dmasz += sizeof (i40e_tx_desc_t); VERIFY(dmasz > 0); if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area, &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate DMA resources for tx " "descriptor ring"); return (B_FALSE); } itrq->itrq_desc_ring = (i40e_tx_desc_t *)(uintptr_t)itrq->itrq_desc_area.dmab_address; itrq->itrq_desc_wbhead = (uint32_t *)(itrq->itrq_desc_ring + itrq->itrq_tx_ring_size); itrq->itrq_desc_head = 0; itrq->itrq_desc_tail = 0; itrq->itrq_desc_free = itrq->itrq_tx_ring_size; itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size * sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); if (itrq->itrq_tcb_work_list == NULL) { i40e_error(i40e, "failed to allocate a %d entry tx work list " "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index); goto cleanup; } itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size * sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); if (itrq->itrq_tcb_free_list == NULL) { i40e_error(i40e, "failed to allocate a %d entry tx free list " "for ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); goto cleanup; } /* * We allocate enough tx control blocks to cover the free list. */ itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) * itrq->itrq_tx_free_list_size, KM_NOSLEEP); if (itrq->itrq_tcb_area == NULL) { i40e_error(i40e, "failed to allocate a %d entry tcb area for " "ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); goto cleanup; } /* * For each tcb, allocate DMA memory.
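 * Each tcb picks up three DMA related resources in the loop below: a
 * pre-allocated staging buffer (tcb_dma) used when we bcopy small frames, a
 * handle (tcb_dma_handle) used to bind non-LSO mblk fragments, and a handle
 * (tcb_lso_dma_handle) used to bind LSO payloads. Only the staging buffer
 * has memory behind it up front; the two handles are bound to caller
 * supplied mblk data at transmit time.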
*/ dmasz = i40e->i40e_tx_buf_size; VERIFY(dmasz > 0); tcb = itrq->itrq_tcb_area; for (i = 0; i < itrq->itrq_tx_free_list_size; i++, tcb++) { VERIFY(tcb != NULL); /* * Allocate both a DMA buffer which we'll use for when we copy * packets for transmission and allocate a DMA handle which * we'll use when we bind data. */ ret = ddi_dma_alloc_handle(i40e->i40e_dip, &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL, &tcb->tcb_dma_handle); if (ret != DDI_SUCCESS) { i40e_error(i40e, "failed to allocate DMA handle for tx " "data binding on ring %d: %d", itrq->itrq_index, ret); tcb->tcb_dma_handle = NULL; goto cleanup; } ret = ddi_dma_alloc_handle(i40e->i40e_dip, &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL, &tcb->tcb_lso_dma_handle); if (ret != DDI_SUCCESS) { i40e_error(i40e, "failed to allocate DMA handle for tx " "LSO data binding on ring %d: %d", itrq->itrq_index, ret); tcb->tcb_lso_dma_handle = NULL; goto cleanup; } if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate %ld bytes of " "DMA for tx data binding on ring %d", dmasz, itrq->itrq_index); goto cleanup; } itrq->itrq_tcb_free_list[i] = tcb; } itrq->itrq_tcb_free = itrq->itrq_tx_free_list_size; return (B_TRUE); cleanup: i40e_free_tx_dma(itrq); return (B_FALSE); } /* * Free all memory associated with all of the rings on this i40e instance. Note, * this is done as part of the GLDv3 stop routine. */ void i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init) { int i; for (i = 0; i < i40e->i40e_num_trqpairs; i++) { i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata; /* * Clean up our rx data. We have to free DMA resources first and * then if we have no more pending RCB's, then we'll go ahead * and clean things up. Note, we can't set the stopped flag on * the rx data until after we've done the first pass of the * pending resources. Otherwise we might race with * i40e_rx_recycle on determining who should free the * i40e_rx_data_t above. */ i40e_free_rx_dma(rxd, failed_init); mutex_enter(&i40e->i40e_rx_pending_lock); rxd->rxd_shutdown = B_TRUE; if (rxd->rxd_rcb_pending == 0) { i40e_free_rx_data(rxd); i40e->i40e_trqpairs[i].itrq_rxdata = NULL; } mutex_exit(&i40e->i40e_rx_pending_lock); i40e_free_tx_dma(&i40e->i40e_trqpairs[i]); } } /* * Allocate all of the resources associated with all of the rings on this i40e * instance. Note this is done as part of the GLDv3 start routine and thus we * should not use blocking allocations. This takes care of both DMA and non-DMA * related resources. */ boolean_t i40e_alloc_ring_mem(i40e_t *i40e) { int i; for (i = 0; i < i40e->i40e_num_trqpairs; i++) { if (i40e_alloc_rx_data(i40e, &i40e->i40e_trqpairs[i]) == B_FALSE) goto unwind; if (i40e_alloc_rx_dma(i40e->i40e_trqpairs[i].itrq_rxdata) == B_FALSE) goto unwind; if (i40e_alloc_tx_dma(&i40e->i40e_trqpairs[i]) == B_FALSE) goto unwind; } return (B_TRUE); unwind: i40e_free_ring_mem(i40e, B_TRUE); return (B_FALSE); } /* * Because every instance of i40e may have different support for FMA * capabilities, we copy the DMA attributes into the i40e_t and set them that * way and use them for determining attributes. 
*/ void i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) { bcopy(&i40e_g_static_dma_attr, &i40e->i40e_static_dma_attr, sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr, sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, sizeof (ddi_device_acc_attr_t)); bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, sizeof (ddi_device_acc_attr_t)); if (fma == B_TRUE) { i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; } else { i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; } } static void i40e_rcb_free(i40e_rx_data_t *rxd, i40e_rx_control_block_t *rcb) { mutex_enter(&rxd->rxd_free_lock); ASSERT(rxd->rxd_rcb_free < rxd->rxd_free_list_size); ASSERT(rxd->rxd_free_list[rxd->rxd_rcb_free] == NULL); rxd->rxd_free_list[rxd->rxd_rcb_free] = rcb; rxd->rxd_rcb_free++; mutex_exit(&rxd->rxd_free_lock); } static i40e_rx_control_block_t * i40e_rcb_alloc(i40e_rx_data_t *rxd) { i40e_rx_control_block_t *rcb; mutex_enter(&rxd->rxd_free_lock); if (rxd->rxd_rcb_free == 0) { mutex_exit(&rxd->rxd_free_lock); return (NULL); } rxd->rxd_rcb_free--; rcb = rxd->rxd_free_list[rxd->rxd_rcb_free]; VERIFY(rcb != NULL); rxd->rxd_free_list[rxd->rxd_rcb_free] = NULL; mutex_exit(&rxd->rxd_free_lock); return (rcb); } /* * This is the callback that we get from the OS when freemsg(9F) has been called * on a loaned descriptor. In addition, if we take the last reference count * here, then we have to tear down all of the rx data. */ void i40e_rx_recycle(caddr_t arg) { uint32_t ref; i40e_rx_control_block_t *rcb; i40e_rx_data_t *rxd; i40e_t *i40e; /* LINTED: E_BAD_PTR_CAST_ALIGN */ rcb = (i40e_rx_control_block_t *)arg; rxd = rcb->rcb_rxd; i40e = rxd->rxd_i40e; /* * It's possible for this to be called with a reference count of zero. * That will happen when we're doing the freemsg after taking the last * reference because we're tearing down everything and this rcb is not * outstanding. */ if (rcb->rcb_ref == 0) return; /* * Don't worry about failure of desballoc here. It'll only become fatal * if we're trying to use it and we can't in i40e_rx_bind(). */ rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address, rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); i40e_rcb_free(rxd, rcb); /* * It's possible that the rcb was being used while we are shutting down * the device. In that case, we'll take the final reference from the * device here. */ ref = atomic_dec_32_nv(&rcb->rcb_ref); if (ref == 0) { freemsg(rcb->rcb_mp); rcb->rcb_mp = NULL; i40e_free_dma_buffer(&rcb->rcb_dma); mutex_enter(&i40e->i40e_rx_pending_lock); atomic_dec_32(&rxd->rxd_rcb_pending); atomic_dec_32(&i40e->i40e_rx_pending); /* * If this was the last block and it's been indicated that we've * passed the shutdown point, we should clean up. 
*/ if (rxd->rxd_shutdown == B_TRUE && rxd->rxd_rcb_pending == 0) { i40e_free_rx_data(rxd); cv_broadcast(&i40e->i40e_rx_pending_cv); } mutex_exit(&i40e->i40e_rx_pending_lock); } } static mblk_t * i40e_rx_bind(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, uint32_t plen) { mblk_t *mp; i40e_t *i40e = rxd->rxd_i40e; i40e_rx_control_block_t *rcb, *rep_rcb; ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); if ((rep_rcb = i40e_rcb_alloc(rxd)) == NULL) { itrq->itrq_rxstat.irxs_rx_bind_norcb.value.ui64++; return (NULL); } rcb = rxd->rxd_work_list[index]; /* * Check to make sure we have a mblk_t. If we don't, this is our last * chance to try and get one. */ if (rcb->rcb_mp == NULL) { rcb->rcb_mp = desballoc((unsigned char *)rcb->rcb_dma.dmab_address, rcb->rcb_dma.dmab_size, 0, &rcb->rcb_free_rtn); if (rcb->rcb_mp == NULL) { itrq->itrq_rxstat.irxs_rx_bind_nomp.value.ui64++; i40e_rcb_free(rxd, rcb); return (NULL); } } I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); atomic_or_32(&i40e->i40e_state, I40E_ERROR); i40e_rcb_free(rxd, rcb); return (NULL); } /* * Note, we've already accounted for the I40E_BUF_IPHDR_ALIGNMENT. */ mp = rcb->rcb_mp; atomic_inc_32(&rcb->rcb_ref); mp->b_wptr = mp->b_rptr + plen; mp->b_next = mp->b_cont = NULL; rxd->rxd_work_list[index] = rep_rcb; return (mp); } /* * We're going to allocate a new message block for this frame and attempt to * receive it. See the big theory statement for more information on when we copy * versus bind. */ static mblk_t * i40e_rx_copy(i40e_trqpair_t *itrq, i40e_rx_data_t *rxd, uint32_t index, uint32_t plen) { i40e_t *i40e = rxd->rxd_i40e; i40e_rx_control_block_t *rcb; mblk_t *mp; ASSERT(index < rxd->rxd_ring_size); rcb = rxd->rxd_work_list[index]; I40E_DMA_SYNC(&rcb->rcb_dma, DDI_DMA_SYNC_FORKERNEL); if (i40e_check_dma_handle(rcb->rcb_dma.dmab_dma_handle) != DDI_FM_OK) { ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); atomic_or_32(&i40e->i40e_state, I40E_ERROR); return (NULL); } mp = allocb(plen + I40E_BUF_IPHDR_ALIGNMENT, 0); if (mp == NULL) { itrq->itrq_rxstat.irxs_rx_copy_nomem.value.ui64++; return (NULL); } mp->b_rptr += I40E_BUF_IPHDR_ALIGNMENT; bcopy(rcb->rcb_dma.dmab_address, mp->b_rptr, plen); mp->b_wptr = mp->b_rptr + plen; return (mp); } /* * Determine if this pinfo is valid for L4 outer checksum offload for a * non-tunneled packet. This is the case for an IP packe */ static inline int i40e_rx_ptype_nontunnel_ol4(struct i40e_rx_ptype_decoded *pinfo) { return (pinfo->outer_ip == I40E_RX_PTYPE_OUTER_IP && pinfo->tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE && (pinfo->inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP || pinfo->inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP || pinfo->inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)); } static inline int i40e_rx_ptype_tunnel_il4(struct i40e_rx_ptype_decoded *pinfo) { return (pinfo->outer_ip == I40E_RX_PTYPE_OUTER_IP && (pinfo->tunnel_type == I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC || pinfo->tunnel_type == I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN) && pinfo->tunnel_end_frag == I40E_RX_PTYPE_NOT_FRAG && pinfo->tunnel_end_prot != I40E_RX_PTYPE_TUNNEL_END_NONE && (pinfo->inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP || pinfo->inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP || pinfo->inner_prot == I40E_RX_PTYPE_INNER_PROT_SCTP)); } /* * Determine if the device has enabled any checksum flags for us. 
The level of * checksum computed will depend on the type of packet that we have, which is * contained in ptype. For example, the checksum logic it performs will vary * depending on whether or not the packet is considered tunneled, whether it * recognizes the L4 type, etc. Section 8.3.4.3 summarizes which checksums are * valid. * * While there are additional checksums that we could recognize here, we'll need * to get some additional GLDv3 enhancements to be able to properly describe * them. */ static void i40e_rx_hcksum(i40e_trqpair_t *itrq, mblk_t *mp, uint64_t status, uint32_t err, uint32_t ptype) { uint32_t cksum; struct i40e_rx_ptype_decoded pinfo; ASSERT(ptype <= 255); pinfo = decode_rx_desc_ptype(ptype); cksum = 0; /* * If the ptype isn't something that we know in the driver, then we * shouldn't even consider moving forward. */ if (pinfo.known == 0) { itrq->itrq_rxstat.irxs_hck_unknown.value.ui64++; return; } /* * If hardware didn't set the L3L4P bit on the frame, then there is no * checksum offload to consider. */ if ((status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) == 0) { itrq->itrq_rxstat.irxs_hck_nol3l4p.value.ui64++; return; } /* * The device tells us that IPv6 checksums on frames with a Destination * Options header or a Routing header shouldn't be trusted. Discard all * checksums in this case. */ if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6 && (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))) { itrq->itrq_rxstat.irxs_hck_v6skip.value.ui64++; return; } /* * The hardware denotes three kinds of possible errors. Two are used * for inner and outer IP checksum errors (IPE and EIPE) and the third * is for L4 checksum errors (L4E). If there is only one IP header, then * the only thing that we care about is IPE. However, if this is a * tunnel packet, then we care about EIPE. Note, none of this controls * whether or not we have an inner IPv4 checksum. */ if (pinfo.outer_ip == I40E_RX_PTYPE_OUTER_IP && pinfo.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) { if (pinfo.tunnel_type == I40E_RX_PTYPE_TUNNEL_NONE) { if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) { itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++; } else { itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; cksum |= HCK_IPV4_HDRCKSUM_OK; } } else { if ((err & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)) != 0) { itrq->itrq_rxstat.irxs_hck_eiperr.value.ui64++; } else { itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; cksum |= HCK_IPV4_HDRCKSUM_OK; } } } /* * If we have a fragmented packet in any form, we're done. */ if (pinfo.outer_frag != I40E_RX_PTYPE_NOT_FRAG) goto done; /* * If we have a tunneled packet and the inner IP header is IPv4, check * IPE to see if the inner IPv4 header checksum is valid. */ if (pinfo.tunnel_type != I40E_RX_PTYPE_TUNNEL_NONE && pinfo.tunnel_end_prot == I40E_RX_PTYPE_TUNNEL_END_IPV4) { if ((err & (1 << I40E_RX_DESC_ERROR_IPE_SHIFT)) != 0) { itrq->itrq_rxstat.irxs_hck_iperr.value.ui64++; } else { itrq->itrq_rxstat.irxs_hck_v4hdrok.value.ui64++; cksum |= HCK_INNER_IPV4_HDRCKSUM_OK; } } /* * Determine if we have a valid outer L4 checksum. The only supported L4 * checksums are TCP, SCTP, and UDP. If this is a UDP tunneled packet, * then there is no support for the outer L4 unless we are on the X722 * MAC. However, we do not support that at this time.
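 *
 * To summarize the two checks below: a good non-tunneled L4 checksum is
 * reported to MAC as HCK_FULLCKSUM_OK, while a good inner L4 checksum on a
 * recognized tunnel is reported as HCK_INNER_FULLCKSUM_OK.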
*/ if (i40e_rx_ptype_nontunnel_ol4(&pinfo)) { if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) { itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++; } else { itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++; cksum |= HCK_FULLCKSUM_OK; } } if (i40e_rx_ptype_tunnel_il4(&pinfo)) { if ((err & (1 << I40E_RX_DESC_ERROR_L4E_SHIFT)) != 0) { itrq->itrq_rxstat.irxs_hck_l4err.value.ui64++; } else { itrq->itrq_rxstat.irxs_hck_l4hdrok.value.ui64++; cksum |= HCK_INNER_FULLCKSUM_OK; } } done: if (cksum != 0) { itrq->itrq_rxstat.irxs_hck_set.value.ui64++; mac_hcksum_set(mp, 0, 0, 0, 0, cksum); } else { itrq->itrq_rxstat.irxs_hck_miss.value.ui64++; } } mblk_t * i40e_ring_rx(i40e_trqpair_t *itrq, int poll_bytes) { i40e_t *i40e; i40e_hw_t *hw; i40e_rx_data_t *rxd; uint32_t cur_head; i40e_rx_desc_t *cur_desc; i40e_rx_control_block_t *rcb; uint64_t rx_bytes, rx_frames; uint64_t stword; mblk_t *mp, *mp_head, **mp_tail; ASSERT(MUTEX_HELD(&itrq->itrq_rx_lock)); rxd = itrq->itrq_rxdata; i40e = itrq->itrq_i40e; hw = &i40e->i40e_hw_space; if (!(i40e->i40e_state & I40E_STARTED) || (i40e->i40e_state & I40E_OVERTEMP) || (i40e->i40e_state & I40E_SUSPENDED) || (i40e->i40e_state & I40E_ERROR)) return (NULL); /* * Before we do anything else, we have to make sure that all of the DMA * buffers are synced up and then check to make sure that they're * actually good from an FM perspective. */ I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORKERNEL); if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != DDI_FM_OK) { ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); atomic_or_32(&i40e->i40e_state, I40E_ERROR); return (NULL); } /* * Prepare our stats. We do a limited amount of processing in both * polling and interrupt context. The limit in interrupt context is * based on frames, in polling context based on bytes. */ rx_bytes = rx_frames = 0; mp_head = NULL; mp_tail = &mp_head; /* * At this point, the descriptor ring is available to check. We'll try * and process until we either run out of poll_bytes or descriptors. */ cur_head = rxd->rxd_desc_next; cur_desc = &rxd->rxd_desc_ring[cur_head]; stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); /* * Note, the primary invariant of this loop should be that cur_head, * cur_desc, and stword always point to the currently processed * descriptor. When we leave the loop, it should point to a descriptor * that HAS NOT been processed. Meaning, that if we haven't consumed the * frame, the descriptor should not be advanced. */ while ((stword & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) != 0) { uint32_t error, eop, plen, ptype; /* * The DD, PLEN, and EOP bits are the only ones that are valid * in every frame. The error information is only valid when EOP * is set in the same frame. * * At this time, because we don't do any LRO or header * splitting. We expect that every frame should have EOP set in * it. When later functionality comes in, we'll want to * re-evaluate this. */ eop = stword & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT); VERIFY(eop != 0); error = (stword & I40E_RXD_QW1_ERROR_MASK) >> I40E_RXD_QW1_ERROR_SHIFT; if (error & I40E_RX_ERR_BITS) { itrq->itrq_rxstat.irxs_rx_desc_error.value.ui64++; goto discard; } plen = (stword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT; ptype = (stword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT; /* * This packet contains valid data. We should check to see if * we're actually going to consume it based on its length (to * ensure that we don't overshoot our quota). 
We determine * whether to bcopy or bind the DMA resources based on the size * of the frame. However, if on debug, we allow it to be * overridden for testing purposes. * * We should be smarter about this and do DMA binding for * larger frames, but for now, it's really more important that * we actually just get something simple working. */ /* * Ensure we don't exceed our polling quota by reading this * frame. Note we only bump bytes now, we bump frames later. */ if ((poll_bytes != I40E_POLL_NULL) && (rx_bytes + plen) > poll_bytes) break; rx_bytes += plen; mp = NULL; if (plen >= i40e->i40e_rx_dma_min) mp = i40e_rx_bind(itrq, rxd, cur_head, plen); if (mp == NULL) mp = i40e_rx_copy(itrq, rxd, cur_head, plen); if (mp != NULL) { if (i40e->i40e_rx_hcksum_enable) i40e_rx_hcksum(itrq, mp, stword, error, ptype); *mp_tail = mp; mp_tail = &mp->b_next; } /* * Now we need to prepare this frame for use again. See the * discussion in the big theory statements. * * However, right now we're doing the simple version of this. * Normally what we'd do would depend on whether or not we were * doing DMA binding or bcopying. But because we're always doing * bcopying, we can just always use the current index as a key * for what to do and reassign the buffer based on the ring. */ discard: rcb = rxd->rxd_work_list[cur_head]; cur_desc->read.pkt_addr = CPU_TO_LE64((uintptr_t)rcb->rcb_dma.dmab_dma_address); cur_desc->read.hdr_addr = 0; /* * Finally, update our loop invariants. */ cur_head = i40e_next_desc(cur_head, 1, rxd->rxd_ring_size); cur_desc = &rxd->rxd_desc_ring[cur_head]; stword = LE64_TO_CPU(cur_desc->wb.qword1.status_error_len); /* * To help provide liveness, we limit the amount of data that * we'll end up counting. Note that in these cases, an interrupt * is not dissimilar from a polling request. */ rx_frames++; if (rx_frames > i40e->i40e_rx_limit_per_intr) { itrq->itrq_rxstat.irxs_rx_intr_limit.value.ui64++; break; } } /* * As we've modified the ring, we need to make sure that we sync the * descriptor ring for the device. Next, we update the hardware and * update our notion of where the head for us to read from hardware is * next. */ I40E_DMA_SYNC(&rxd->rxd_desc_area, DDI_DMA_SYNC_FORDEV); if (i40e_check_dma_handle(rxd->rxd_desc_area.dmab_dma_handle) != DDI_FM_OK) { ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); atomic_or_32(&i40e->i40e_state, I40E_ERROR); } if (rx_frames != 0) { uint32_t tail; ddi_acc_handle_t rh = i40e->i40e_osdep_space.ios_reg_handle; rxd->rxd_desc_next = cur_head; tail = i40e_prev_desc(cur_head, 1, rxd->rxd_ring_size); I40E_WRITE_REG(hw, I40E_QRX_TAIL(itrq->itrq_index), tail); if (i40e_check_acc_handle(rh) != DDI_FM_OK) { ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); atomic_or_32(&i40e->i40e_state, I40E_ERROR); } itrq->itrq_rxstat.irxs_bytes.value.ui64 += rx_bytes; itrq->itrq_rxstat.irxs_packets.value.ui64 += rx_frames; } #ifdef DEBUG if (rx_frames == 0) { ASSERT(rx_bytes == 0); } #endif return (mp_head); } /* * This function is called by the GLDv3 when it wants to poll on a ring. The * only primary difference from when we call this during an interrupt is that we * have a limit on the number of bytes that we should consume. 
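 * For reference, the interrupt path is expected to call i40e_ring_rx() with
 * I40E_POLL_NULL as its byte budget, which disables the byte based cutoff
 * and leaves only the per-interrupt frame limit described in the big theory
 * statement.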
*/ mblk_t * i40e_ring_rx_poll(void *arg, int poll_bytes) { i40e_trqpair_t *itrq = arg; mblk_t *mp; ASSERT(poll_bytes > 0); if (poll_bytes == 0) return (NULL); mutex_enter(&itrq->itrq_rx_lock); mp = i40e_ring_rx(itrq, poll_bytes); mutex_exit(&itrq->itrq_rx_lock); return (mp); } /* * This is a structure I wish someone would fill out for me for dorking with the * checksums. When we get some more experience with this, we should go ahead and * consider adding this to MAC. */ typedef enum mac_ether_offload_flags { MEOI_L2INFO_SET = 0x001, MEOI_VLAN_TAGGED = 0x002, MEOI_L3INFO_SET = 0x004, MEOI_L4INFO_SET = 0x010, MEOI_TUN_INFO_SET = 0x020, MEOI_TUN_L2INFO_SET = 0x040, MEOI_TUN_VLAN_TAGGED = 0x080, MEOI_TUN_L3INFO_SET = 0x100, MEOI_TUN_L4INFO_SET = 0x200, } mac_ether_offload_flags_t; #define MEOI_L2_L3_L4 (MEOI_L2INFO_SET | MEOI_L3INFO_SET | MEOI_L4INFO_SET) typedef struct mac_ether_offload_info { mac_ether_offload_flags_t meoi_flags; uint8_t meoi_l2hlen; /* How long is the Ethernet header? */ uint16_t meoi_l3proto; /* What's the Ethertype */ uint8_t meoi_l3hlen; /* How long is the header? */ uint8_t meoi_l4proto; /* What is the payload type? */ uint8_t meoi_l4hlen; /* How long is the L4 header */ /* * The following members are used when tunneling (e.g. vxlan) */ uint8_t meoi_tun_protlen; /* Length of the tunnel protocol */ uint8_t meoi_tun_l2hlen; /* How long is the Ethernet header? */ uint16_t meoi_tun_l3proto; /* What's the Ethertype */ uint8_t meoi_tun_l3hlen; /* How long is the header? */ uint8_t meoi_tun_l4proto; /* What is the payload type? */ uint8_t meoi_tun_l4hlen; /* How long is the L4 header */ } mac_ether_offload_info_t; /* * This is something that we'd like to make a general MAC function. Before we do * that, we should add support for TSO. * * We should really keep track of our offset and not walk everything every * time. I can't imagine that this will be kind to us at high packet rates; * however, for the moment, let's leave that. * * This walks a message block chain without pulling up to fill in the context * information. Note that the data we care about could be hidden across more * than one mblk_t. */ static int i40e_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out) { size_t mpsize; uint8_t *bp; mpsize = msgsize(mp); /* Check for overflow */ if (off + sizeof (uint8_t) > mpsize) return (-1); mpsize = MBLKL(mp); while (off >= mpsize) { mp = mp->b_cont; off -= mpsize; mpsize = MBLKL(mp); } bp = mp->b_rptr + off; *out = *bp; return (0); } static int i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out) { size_t mpsize; uint8_t *bp; mpsize = msgsize(mp); /* Check for overflow */ if (off + sizeof (uint16_t) > mpsize) return (-1); mpsize = MBLKL(mp); while (off >= mpsize) { mp = mp->b_cont; off -= mpsize; mpsize = MBLKL(mp); } /* * Data is in network order. Note the second byte of data might be in * the next mp. */ bp = mp->b_rptr + off; *out = *bp << 8; if (off + 1 == mpsize) { mp = mp->b_cont; bp = mp->b_rptr; } else { bp++; } *out |= *bp; return (0); } static int i40e_meoi_zero_uint16(mblk_t *mp, off_t off) { size_t mpsize; uint8_t *bp; mpsize = msgsize(mp); /* Check for overflow */ if (off + sizeof (uint16_t) > mpsize) return (-1); mpsize = MBLKL(mp); while (off >= mpsize) { mp = mp->b_cont; off -= mpsize; mpsize = MBLKL(mp); } /* * Data is in network order. Note the second byte of data might be in * the next mp.
	 */
	bp = mp->b_rptr + off;
	*bp = 0;
	if (off + 1 == mpsize) {
		mp = mp->b_cont;
		bp = mp->b_rptr;
	} else {
		bp++;
	}
	*bp = 0;

	return (0);
}

static int
mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi,
    uint32_t ttype, size_t starting_off)
{
	size_t off;
	uint16_t ether;
	uint8_t ipproto, iplen, l4len, maclen;

	bzero(meoi, sizeof (mac_ether_offload_info_t));

	if (ttype != TTYPE_NONE && ttype != TTYPE_VXLAN)
		return (-1);

	off = offsetof(struct ether_header, ether_type) + starting_off;
	if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
		return (-1);

	if (ether == ETHERTYPE_VLAN) {
		off = offsetof(struct ether_vlan_header, ether_type) +
		    starting_off;
		if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
			return (-1);
		meoi->meoi_flags |= MEOI_VLAN_TAGGED;
		maclen = sizeof (struct ether_vlan_header);
	} else {
		maclen = sizeof (struct ether_header);
	}
	meoi->meoi_flags |= MEOI_L2INFO_SET;
	meoi->meoi_l2hlen = maclen;
	meoi->meoi_l3proto = ether;

	switch (ether) {
	case ETHERTYPE_IP:
		/*
		 * For IPv4 we need to get the length of the header, as it can
		 * be variable.
		 */
		off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen +
		    starting_off;
		if (i40e_meoi_get_uint8(mp, off, &iplen) != 0)
			return (-1);
		iplen &= 0x0f;
		if (iplen < 5 || iplen > 0x0f)
			return (-1);
		iplen *= 4;
		off = offsetof(ipha_t, ipha_protocol) + maclen + starting_off;
		if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
			return (-1);
		break;
	case ETHERTYPE_IPV6:
		iplen = 40;
		off = offsetof(ip6_t, ip6_nxt) + maclen + starting_off;
		if (i40e_meoi_get_uint8(mp, off, &ipproto) == -1)
			return (-1);
		break;
	default:
		return (0);
	}
	meoi->meoi_l3hlen = iplen;
	meoi->meoi_l4proto = ipproto;
	meoi->meoi_flags |= MEOI_L3INFO_SET;

	switch (ipproto) {
	case IPPROTO_TCP:
		off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen +
		    starting_off;
		if (i40e_meoi_get_uint8(mp, off, &l4len) == -1)
			return (-1);
		l4len = (l4len & 0xf0) >> 4;
		if (l4len < 5 || l4len > 0xf)
			return (-1);
		l4len *= 4;
		break;
	case IPPROTO_UDP:
		l4len = sizeof (struct udphdr);
		break;
	case IPPROTO_SCTP:
		l4len = sizeof (sctp_hdr_t);
		break;
	default:
		return (0);
	}
	meoi->meoi_l4hlen = l4len;
	meoi->meoi_flags |= MEOI_L4INFO_SET;

	if (ttype == TTYPE_VXLAN) {
		/*
		 * Recursively call ourselves to obtain the tunneled L2/L3/L4
		 * data, using the proper starting offset to the tunneled
		 * packet.
		 */
		int ret;
		mac_ether_offload_info_t meo;

		/* We currently only support tunneling within UDP */
		if (ipproto != IPPROTO_UDP)
			return (-1);

		ASSERT(starting_off == 0);
		off = maclen + iplen + l4len + VXLAN_HDR_LEN;
		ret = mac_ether_offload_info(mp, &meo, TTYPE_NONE, off);
		if (ret != 0)
			return (ret);

		meoi->meoi_tun_protlen = VXLAN_HDR_LEN;
		meoi->meoi_flags |= MEOI_TUN_INFO_SET;

		if (meo.meoi_flags & MEOI_L2INFO_SET) {
			meoi->meoi_flags |= MEOI_TUN_L2INFO_SET;
			meoi->meoi_tun_l2hlen = meo.meoi_l2hlen;
		}
		if (meo.meoi_flags & MEOI_VLAN_TAGGED) {
			meoi->meoi_flags |= MEOI_TUN_VLAN_TAGGED;
		}
		if (meo.meoi_flags & MEOI_L3INFO_SET) {
			meoi->meoi_flags |= MEOI_TUN_L3INFO_SET;
			meoi->meoi_tun_l3proto = meo.meoi_l3proto;
			meoi->meoi_tun_l3hlen = meo.meoi_l3hlen;
		}
		if (meo.meoi_flags & MEOI_L4INFO_SET) {
			meoi->meoi_flags |= MEOI_TUN_L4INFO_SET;
			meoi->meoi_tun_l4proto = meo.meoi_l4proto;
			meoi->meoi_tun_l4hlen = meo.meoi_l4hlen;
		}
	}

	return (0);
}

/*
 * Determine if we have sufficient checksum flags to perform TSO. This varies
 * based on the tunnel type. If we have normal TSO traffic, we need both the
 * IPv4 header checksum and the L4 checksum. For VXLAN encoded traffic, we need
 * the outer IPv4 checksum and inner checksums.
 *
 * At this time the networking stack only supports TSO on IPv4 and the X710
 * hardware can't support VXLAN aware TSO on IPv6 due to the fact that it can't
 * perform the UDP checksum.
 */
static inline boolean_t
i40e_tx_tso_have_cksums(uint32_t chkflags, uint32_t ttype)
{
	if (ttype == TTYPE_NONE) {
		if ((chkflags & HCK_IPV4_HDRCKSUM) == 0)
			return (B_FALSE);
		if ((chkflags & HCK_PARTIALCKSUM) == 0)
			return (B_FALSE);
	} else if (ttype == TTYPE_VXLAN) {
		if ((chkflags & HCK_IPV4_HDRCKSUM) == 0)
			return (B_FALSE);

		/*
		 * We can't perform LSO if we need an outer checksum, so that's
		 * an error.
		 */
		if ((chkflags & HCK_PARTIALCKSUM) != 0)
			return (B_FALSE);

		/*
		 * When the networking stack supports TSO over IPv6, this check
		 * will need to be conditional on protocol.
		 */
		if ((chkflags & HCK_INNER_IPV4_HDRCKSUM_NEEDED) == 0)
			return (B_FALSE);
		if ((chkflags & HCK_INNER_PSEUDO_NEEDED) == 0)
			return (B_FALSE);
	} else {
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Fix up the message block for TSO to match what hardware expects. The
 * hardware requires that the length and checksum for all IP headers be zero.
 * It requires that the outer UDP checksum be zero and that the length field be
 * zero. The networking stack will have taken care of making sure that the
 * inner (or single) TCP header is OK. What we have to do is make sure that:
 *
 * 1. Outer IP length is zero
 * 2. Outer UDP length (if it exists) is zero
 * 3. Inner IP length (if it exists) is zero
 */
static boolean_t
i40e_tx_tso_fix_mp(mblk_t *mp, uint32_t ttype, mac_ether_offload_info_t *infop)
{
	off_t off = infop->meoi_l2hlen;

	if (infop->meoi_l3proto == ETHERTYPE_IP) {
		i40e_meoi_zero_uint16(mp, off + offsetof(ipha_t, ipha_length));
	} else if (infop->meoi_l3proto == ETHERTYPE_IPV6) {
		i40e_meoi_zero_uint16(mp, off + offsetof(ip6_t, ip6_plen));
	} else {
		return (B_FALSE);
	}

	if (ttype == TTYPE_NONE) {
		return (B_TRUE);
	} else if (ttype != TTYPE_VXLAN) {
		return (B_FALSE);
	}

	off += infop->meoi_l3hlen;
	if (infop->meoi_l4proto != IPPROTO_UDP) {
		return (B_FALSE);
	}
#if 0
	i40e_meoi_zero_uint16(mp, off + offsetof(struct udphdr, uh_ulen));
#endif

	off += infop->meoi_l4hlen + infop->meoi_tun_protlen +
	    infop->meoi_tun_l2hlen;
	if (infop->meoi_tun_l3proto == ETHERTYPE_IP) {
		i40e_meoi_zero_uint16(mp, off + offsetof(ipha_t, ipha_length));
	} else if (infop->meoi_tun_l3proto == ETHERTYPE_IPV6) {
		i40e_meoi_zero_uint16(mp, off + offsetof(ip6_t, ip6_plen));
	} else {
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Attempt to put together the information we'll need to feed into a descriptor
 * to properly program the hardware for checksum offload as well as the
 * generally required flags.
 *
 * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to
 * 'or' into the descriptor based on the checksum flags for this mblk_t and the
 * actual information we care about.
 *
 * If we're using LSO or need to perform tunneling-based checksums, then we'll
 * fill in information that will be used for the Transmit Context Descriptor.
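 *
 * As a rough, illustrative example (assuming an untagged frame with minimal
 * headers; not an exhaustive description of the logic below): a plain
 * TCP/IPv4 packet with HCK_IPV4_HDRCKSUM and HCK_PARTIALCKSUM set and no LSO
 * would leave this function with itc_data_cmdflags containing
 * I40E_TX_DESC_CMD_IIPT_IPV4_CSUM and I40E_TX_DESC_CMD_L4T_EOFT_TCP, and with
 * itc_data_offsets encoding MACLEN = 14 >> 1, IPLEN = 20 >> 2 and
 * L4LEN = 20 >> 2 (MACLEN is in 2-byte words, the others in 4-byte words).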
 */
static int
i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
    i40e_tx_context_t *tctx)
{
	int ret;
	uint32_t chkflags, start, mss, lsoflags, ttype;
	mac_ether_offload_info_t meo;
	i40e_txq_stat_t *txs = &itrq->itrq_txstat;
	boolean_t tunneled;

	bzero(tctx, sizeof (i40e_tx_context_t));

	if (i40e->i40e_tx_hcksum_enable != B_TRUE)
		return (0);

	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
	mac_lso_get(mp, &mss, &lsoflags);
	mac_tunnel_type_get(mp, &ttype);

	if (chkflags == 0 && lsoflags == 0)
		return (0);

	/*
	 * Have we been asked to perform an inner checksum? This implies a
	 * tunneled packet.
	 */
	tunneled = (chkflags & (HCK_INNER_IPV4_HDRCKSUM_NEEDED |
	    HCK_INNER_PSEUDO_NEEDED)) != 0;
	if (tunneled && ttype != TTYPE_VXLAN) {
		txs->itxs_hck_badttype.value.ui64++;
		return (-1);
	}
	tctx->itc_ctx_tunneled = tunneled;

	if ((ret = mac_ether_offload_info(mp, &meo, ttype, 0)) != 0) {
		txs->itxs_hck_meoifail.value.ui64++;
		return (ret);
	}

	/*
	 * Tunneling:
	 * See Table 8-21
	 *	EIPT = 11b	calc. outer IP checksum
	 *	IIPT = 11b	calc. inner IP checksum
	 *	L4TUNT = 01b	UDP/GRE tunneling
	 *	EIPLEN		set to outer IP header len.
	 *	L4TUNLEN	set to L4 tunnel len. see table 8-21 and the
	 *			"IP -> GRE / UDP -> MAC (with/without VLAN)
	 *			-> IP -> L4" entry. This is the UDP & VXLAN
	 *			header len. up to the inner IP header.
	 *
	 * See Figure 8-9
	 * Packet Headers and descriptor length fields:
	 * Outer MAC | External IP | UDP | Inner MAC | Inner IP | L4 | data
	 * <-MACLEN-> <---EIPLEN--> <----L4TUNLEN---> <-IPLEN--> <-L4LEN->
	 */
	if (tunneled) {
		/*
		 * Set up to calculate the 3 possible requested HW checksum(s)
		 * for the tunnel:
		 *	Inner IPv4 checksum if IIPT = 11b
		 *	Outer IPv4 checksum if EIPT = 11b
		 *	L4 checksum if L4LEN is meaningful
		 */
		uint8_t eipt;
		uint_t l4tunlen;

		/*
		 * The MAC ether offload logic should have verified that we
		 * have the right information for calculating the checksums
		 * here. Make sure that this is the case. We'll check that we
		 * have what we need for the checksum types as appropriate. We
		 * always require having the inner L2/L3 information. We only
		 * require inner L4 info if we've been asked to do something
		 * in particular.
		 */
		if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
			txs->itxs_hck_nol2info.value.ui64++;
			return (-1);
		}
		if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
			txs->itxs_hck_nol3info.value.ui64++;
			return (-1);
		}
		if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0 ||
		    meo.meoi_l4proto != IPPROTO_UDP) {
			txs->itxs_hck_badl4.value.ui64++;
			return (-1);
		}
		if ((meo.meoi_flags & MEOI_TUN_L2INFO_SET) == 0) {
			txs->itxs_hck_notunl2info.value.ui64++;
			return (-1);
		}
		if ((meo.meoi_flags & MEOI_TUN_L3INFO_SET) == 0) {
			txs->itxs_hck_notunl3info.value.ui64++;
			return (-1);
		}
		if ((chkflags & HCK_PARTIALCKSUM) != 0) {
			txs->itxs_hck_outer.value.ui64++;
			return (-1);
		}

		/*
		 * First fill in the descriptors for the tunneling extensions.
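		 *
		 * As a worked example (assuming an untagged inner Ethernet
		 * header and no IP options): the outer UDP header (8 bytes),
		 * the VXLAN header (VXLAN_HDR_LEN, 8 bytes), and the inner
		 * Ethernet header (14 bytes) give an l4tunlen of 30 bytes,
		 * which is programmed below as 15 2-byte words via
		 * I40E_TXD_TNL_SET_L4TUNLEN(l4tunlen >> 1). Likewise EIPLEN
		 * takes the outer IP header length in 4-byte units, so a
		 * 20-byte outer IPv4 header becomes 5.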
		 */
		l4tunlen = meo.meoi_l4hlen + meo.meoi_tun_l2hlen +
		    meo.meoi_tun_protlen;

		/* outer IP */
		if (chkflags & HCK_IPV4_HDRCKSUM) {
			if (meo.meoi_l3proto == ETHERTYPE_IP) {
				eipt = I40E_TX_DESC_TNL_EIPT_IPV4_CSUM;
			} else {
				txs->itxs_hck_badl3.value.ui64++;
				return (-1);
			}
		} else {
			if (meo.meoi_l3proto == ETHERTYPE_IP) {
				eipt = I40E_TX_DESC_TNL_EIPT_IPV4;
			} else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
				eipt = I40E_TX_DESC_TNL_EIPT_IPV6;
			} else {
				txs->itxs_hck_badl3.value.ui64++;
				return (-1);
			}
		}

		/* XXX make comment for L4TUNT */
		tctx->itc_ctx_tunnel_fld =
		    I40E_TXD_TNL_SET_EIPT(eipt) |
		    I40E_TXD_TNL_SET_EIPLEN(meo.meoi_l3hlen >> 2) |
		    I40E_TXD_TNL_SET_L4TUNT(1) |
		    I40E_TXD_TNL_SET_L4TUNLEN(l4tunlen >> 1) |
		    I40E_TXD_TNL_SET_DECTTL(0);

		/* The MAC len is for the outer, regardless of tunneling */
		tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) <<
		    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;

		/* inner IP */
		if (chkflags & HCK_INNER_IPV4_HDRCKSUM_NEEDED) {
			/* When tunneled, IIPT applies to the inner IP (L3) */
			if (meo.meoi_tun_l3proto != ETHERTYPE_IP) {
				txs->itxs_hck_badl3.value.ui64++;
				return (-1);
			}
			tctx->itc_data_cmdflags |=
			    I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
		} else {
			if (meo.meoi_l3proto == ETHERTYPE_IP) {
				tctx->itc_data_cmdflags |=
				    I40E_TX_DESC_CMD_IIPT_IPV4;
			} else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
				tctx->itc_data_cmdflags |=
				    I40E_TX_DESC_CMD_IIPT_IPV6;
			} else {
				txs->itxs_hck_badl3.value.ui64++;
				return (-1);
			}
		}

		/* set the inner IP header length */
		tctx->itc_data_offsets |= (meo.meoi_tun_l3hlen >> 2) <<
		    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;

		if (chkflags & HCK_INNER_PSEUDO_NEEDED) {
			if ((meo.meoi_flags & MEOI_TUN_L4INFO_SET) == 0) {
				txs->itxs_hck_notunl4info.value.ui64++;
				return (-1);
			}

			/* L4T */
			switch (meo.meoi_tun_l4proto) {
			case IPPROTO_TCP:
				tctx->itc_data_cmdflags |=
				    I40E_TX_DESC_CMD_L4T_EOFT_TCP;
				break;
			case IPPROTO_UDP:
				tctx->itc_data_cmdflags |=
				    I40E_TX_DESC_CMD_L4T_EOFT_UDP;
				break;
			case IPPROTO_SCTP:
				tctx->itc_data_cmdflags |=
				    I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
				break;
			default:
				txs->itxs_hck_badl4.value.ui64++;
				return (-1);
			}

			/* setting L4LEN initiates inner L4 HW checksum */
			tctx->itc_data_offsets |= (meo.meoi_tun_l4hlen >> 2) <<
			    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
		}
	} else { /* Not tunneled */
		/*
		 * Have we been asked to checksum an IPv4 header? If so, verify
		 * that we have sufficient information and then set the proper
		 * fields in the command structure.
		 */
		if (chkflags & HCK_IPV4_HDRCKSUM) {
			if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
				txs->itxs_hck_nol2info.value.ui64++;
				return (-1);
			}
			if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) {
				txs->itxs_hck_nol3info.value.ui64++;
				return (-1);
			}
			if (meo.meoi_l3proto != ETHERTYPE_IP) {
				txs->itxs_hck_badl3.value.ui64++;
				return (-1);
			}
			tctx->itc_data_cmdflags |=
			    I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
			tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) <<
			    I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
			tctx->itc_data_offsets |= (meo.meoi_l3hlen >> 2) <<
			    I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
		}

		/*
		 * Have we been asked to provide an L4 header? If so, first set
		 * up the IP information in the descriptor if we haven't
		 * already, before moving on to seeing if we have enough
		 * information for the L4 checksum offload.
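		 *
		 * For example (an illustrative note, not an additional
		 * requirement): an IPv6/TCP packet will typically only have
		 * HCK_PARTIALCKSUM set. In that case the block below still
		 * programs IIPT_IPV6 along with MACLEN and IPLEN so the
		 * hardware can locate the TCP header, even though no IP
		 * header checksum is being offloaded.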
*/ if (chkflags & HCK_PARTIALCKSUM) { if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) { txs->itxs_hck_nol4info.value.ui64++; return (-1); } if (!(chkflags & HCK_IPV4_HDRCKSUM)) { if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); } if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) { txs->itxs_hck_nol3info.value.ui64++; return (-1); } if (meo.meoi_l3proto == ETHERTYPE_IP) { tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4; } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) { tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV6; } else { txs->itxs_hck_badl3.value.ui64++; return (-1); } tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; tctx->itc_data_offsets |= (meo.meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } switch (meo.meoi_l4proto) { case IPPROTO_TCP: tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; break; case IPPROTO_UDP: tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; break; case IPPROTO_SCTP: tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; break; default: txs->itxs_hck_badl4.value.ui64++; return (-1); } tctx->itc_data_offsets |= (meo.meoi_l4hlen >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } } if (lsoflags & HW_LSO) { /* * LSO requires that checksum offloads are enabled. If for * some reason they're not we bail out with an error. */ if (!i40e_tx_tso_have_cksums(chkflags, ttype)) { txs->itxs_hck_badtso.value.ui64++; return (-1); } if (!i40e_tx_tso_fix_mp(mp, ttype, &meo)) { txs->itxs_hck_badtso.value.ui64++; return (-1); } tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO; tctx->itc_ctx_mss = mss; tctx->itc_ctx_tsolen = msgsize(mp) - (meo.meoi_l2hlen + meo.meoi_l3hlen + meo.meoi_l4hlen); if (tunneled) { tctx->itc_ctx_tsolen -= meo.meoi_tun_protlen + meo.meoi_tun_l2hlen + meo.meoi_tun_l3hlen + meo.meoi_tun_l4hlen; } } return (0); } static void i40e_tcb_free(i40e_trqpair_t *itrq, i40e_tx_control_block_t *tcb) { ASSERT(tcb != NULL); mutex_enter(&itrq->itrq_tcb_lock); ASSERT(itrq->itrq_tcb_free < itrq->itrq_tx_free_list_size); itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = tcb; itrq->itrq_tcb_free++; mutex_exit(&itrq->itrq_tcb_lock); } static i40e_tx_control_block_t * i40e_tcb_alloc(i40e_trqpair_t *itrq) { i40e_tx_control_block_t *ret; mutex_enter(&itrq->itrq_tcb_lock); if (itrq->itrq_tcb_free == 0) { mutex_exit(&itrq->itrq_tcb_lock); return (NULL); } itrq->itrq_tcb_free--; ret = itrq->itrq_tcb_free_list[itrq->itrq_tcb_free]; itrq->itrq_tcb_free_list[itrq->itrq_tcb_free] = NULL; mutex_exit(&itrq->itrq_tcb_lock); ASSERT(ret != NULL); return (ret); } /* * This should be used to free any DMA resources, associated mblk_t's, etc. It's * used as part of recycling the message blocks when we have either an interrupt * or other activity that indicates that we need to take a look. 
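 *
 * The expected life cycle, sketched roughly (see i40e_tcb_alloc() and
 * i40e_tcb_free() above, which pop and push the LIFO free list under
 * itrq_tcb_lock):
 *
 *	tcb = i40e_tcb_alloc(itrq);	- take a tcb off the free list
 *	...				- transmit using the tcb
 *	i40e_tcb_reset(tcb);		- unbind DMA, drop any mblk, etc.
 *	i40e_tcb_free(itrq, tcb);	- return it to the free list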
*/ static void i40e_tcb_reset(i40e_tx_control_block_t *tcb) { int i; switch (tcb->tcb_type) { case I40E_TX_COPY: tcb->tcb_dma.dmab_len = 0; break; case I40E_TX_DMA: if (tcb->tcb_used_lso == B_TRUE) (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle); else (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); if (tcb->tcb_bind_info != NULL) { for (i = 0; i < tcb->tcb_bind_ncookies; i++) { kmem_free(tcb->tcb_bind_info[i], sizeof (struct i40e_dma_bind_info)); } kmem_free(tcb->tcb_bind_info, tcb->tcb_bind_ncookies * sizeof (struct i40e_dma_bind_info *)); } tcb->tcb_bind_info = NULL; tcb->tcb_bind_ncookies = 0; tcb->tcb_used_lso = B_FALSE; break; case I40E_TX_DESC: break; case I40E_TX_NONE: /* Cast to pacify lint */ panic("trying to free tcb %p with bad type none", (void *)tcb); default: panic("unknown i40e tcb type: %d", tcb->tcb_type); } tcb->tcb_type = I40E_TX_NONE; if (tcb->tcb_mp != NULL) { freemsg(tcb->tcb_mp); tcb->tcb_mp = NULL; } tcb->tcb_next = NULL; } /* * This is called as part of shutting down to clean up all outstanding * descriptors. Similar to recycle, except we don't re-arm anything and instead * just return control blocks to the free list. */ void i40e_tx_cleanup_ring(i40e_trqpair_t *itrq) { uint32_t index; ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); /* * Because we should have shut down the chip at this point, it should be * safe to just clean up all the entries between our head and tail. */ #ifdef DEBUG index = I40E_READ_REG(&itrq->itrq_i40e->i40e_hw_space, I40E_QTX_ENA(itrq->itrq_index)); VERIFY0(index & (I40E_QTX_ENA_QENA_REQ_MASK | I40E_QTX_ENA_QENA_STAT_MASK)); #endif index = itrq->itrq_desc_head; while (itrq->itrq_desc_free < itrq->itrq_tx_ring_size) { i40e_tx_control_block_t *tcb; tcb = itrq->itrq_tcb_work_list[index]; VERIFY(tcb != NULL); itrq->itrq_tcb_work_list[index] = NULL; i40e_tcb_reset(tcb); i40e_tcb_free(itrq, tcb); bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t)); index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size); itrq->itrq_desc_free++; } ASSERT(index == itrq->itrq_desc_tail); itrq->itrq_desc_head = index; } /* * We're here either by hook or by crook. We need to see if there are transmit * descriptors available for us to go and clean up and return to the hardware. * We may also be blocked, and if so, we should make sure that we let it know * we're good to go. */ void i40e_tx_recycle_ring(i40e_trqpair_t *itrq) { uint32_t wbhead, toclean, count; i40e_tx_control_block_t *tcbhead; i40e_t *i40e = itrq->itrq_i40e; int desc_per_tcb, i; mutex_enter(&itrq->itrq_tx_lock); ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); if (itrq->itrq_desc_free == itrq->itrq_tx_ring_size) { if (itrq->itrq_tx_blocked == B_TRUE) { itrq->itrq_tx_blocked = B_FALSE; mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring); itrq->itrq_txstat.itxs_num_unblocked.value.ui64++; } mutex_exit(&itrq->itrq_tx_lock); return; } /* * Now we need to try and see if there's anything available. The driver * will write to the head location and it guarantees that it does not * use relaxed ordering. 
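	 *
	 * As a worked example (illustrative only): with a 1024-entry ring,
	 * itrq_desc_head == 1020 and a write-back head of 4, the loop below
	 * walks entries 1020, 1021, 1022, 1023, 0, 1, 2 and 3 and reclaims 8
	 * descriptors. Note that a DMA-bound tcb advances the walk by
	 * tcb_bind_ncookies entries rather than one, since it consumed one
	 * data descriptor per cookie.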
*/ VERIFY0(ddi_dma_sync(itrq->itrq_desc_area.dmab_dma_handle, (uintptr_t)itrq->itrq_desc_wbhead, sizeof (uint32_t), DDI_DMA_SYNC_FORKERNEL)); if (i40e_check_dma_handle(itrq->itrq_desc_area.dmab_dma_handle) != DDI_FM_OK) { mutex_exit(&itrq->itrq_tx_lock); ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); atomic_or_32(&i40e->i40e_state, I40E_ERROR); return; } wbhead = *itrq->itrq_desc_wbhead; toclean = itrq->itrq_desc_head; count = 0; tcbhead = NULL; while (toclean != wbhead) { i40e_tx_control_block_t *tcb; tcb = itrq->itrq_tcb_work_list[toclean]; itrq->itrq_tcb_work_list[toclean] = NULL; ASSERT(tcb != NULL); tcb->tcb_next = tcbhead; tcbhead = tcb; /* * In the DMA bind case, there may not necessarily be a 1:1 * mapping between tcb's and descriptors. If the tcb type * indicates a DMA binding then check the number of DMA * cookies to determine how many entries to clean in the * descriptor ring. */ if (tcb->tcb_type == I40E_TX_DMA) desc_per_tcb = tcb->tcb_bind_ncookies; else desc_per_tcb = 1; for (i = 0; i < desc_per_tcb; i++) { /* * We zero this out for sanity purposes. */ bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t)); toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size); count++; } } itrq->itrq_desc_head = wbhead; itrq->itrq_desc_free += count; itrq->itrq_txstat.itxs_recycled.value.ui64 += count; ASSERT(itrq->itrq_desc_free <= itrq->itrq_tx_ring_size); if (itrq->itrq_tx_blocked == B_TRUE && itrq->itrq_desc_free > i40e->i40e_tx_block_thresh) { itrq->itrq_tx_blocked = B_FALSE; mac_tx_ring_update(i40e->i40e_mac_hdl, itrq->itrq_mactxring); itrq->itrq_txstat.itxs_num_unblocked.value.ui64++; } mutex_exit(&itrq->itrq_tx_lock); /* * Now clean up the tcb. */ while (tcbhead != NULL) { i40e_tx_control_block_t *tcb = tcbhead; tcbhead = tcb->tcb_next; i40e_tcb_reset(tcb); i40e_tcb_free(itrq, tcb); } DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count); } static i40e_tx_control_block_t * i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp, boolean_t use_lso) { ddi_dma_handle_t dma_handle; ddi_dma_cookie_t dma_cookie; uint_t ncookies = 0, dmaflags; i40e_tx_control_block_t *tcb; i40e_txq_stat_t *txs = &itrq->itrq_txstat; int i = 0; struct i40e_dma_bind_info *dbi; if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { txs->itxs_err_notcb.value.ui64++; return (NULL); } tcb->tcb_type = I40E_TX_DMA; if (use_lso == B_TRUE) dma_handle = tcb->tcb_lso_dma_handle; else dma_handle = tcb->tcb_dma_handle; dmaflags = DDI_DMA_RDWR | DDI_DMA_STREAMING; if (ddi_dma_addr_bind_handle(dma_handle, NULL, (caddr_t)mp->b_rptr, MBLKL(mp), dmaflags, DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) { goto bffail; } tcb->tcb_bind_info = kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info *), KM_NOSLEEP); if (tcb->tcb_bind_info == NULL) goto bffail; while (i < ncookies) { if (i > 0) ddi_dma_nextcookie(dma_handle, &dma_cookie); dbi = kmem_zalloc(sizeof (struct i40e_dma_bind_info), KM_NOSLEEP); if (dbi == NULL) goto bffail; dbi->dbi_paddr = (caddr_t)dma_cookie.dmac_laddress; dbi->dbi_len = dma_cookie.dmac_size; tcb->tcb_bind_info[i++] = dbi; } tcb->tcb_bind_ncookies = ncookies; tcb->tcb_used_lso = use_lso; return (tcb); bffail: i40e_tcb_reset(tcb); i40e_tcb_free(itrq, tcb); if (ncookies != 0) (void) ddi_dma_unbind_handle(dma_handle); if (tcb->tcb_bind_info != NULL) kmem_free(tcb->tcb_bind_info, tcb->tcb_bind_ncookies * sizeof (struct i40e_dma_bind_info *)); tcb->tcb_bind_info = NULL; tcb->tcb_bind_ncookies = 0; return (NULL); } static void 
i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx, struct i40e_dma_bind_info *dbi, boolean_t last_desc) { i40e_tx_desc_t *txdesc; int type, cmd; ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); itrq->itrq_desc_free--; txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, itrq->itrq_tx_ring_size); type = I40E_TX_DESC_DTYPE_DATA; cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags; /* * The last data descriptor needs the EOP and RS bits set, so that the * HW knows that we're ready to send. */ if (last_desc == B_TRUE) { cmd |= I40E_TX_DESC_CMD_EOP; cmd |= I40E_TX_DESC_CMD_RS; } txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)dbi->dbi_paddr); txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type | ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | ((uint64_t)dbi->dbi_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); } /* * We've been asked to send a message block on the wire. We'll only have a * single chain. There will not be any b_next pointers; however, there may be * multiple b_cont blocks. * * We may do one of three things with any given mblk_t chain: * * 1) Drop it * 2) Transmit it * 3) Return it * * If we return it to MAC, then MAC will flow control on our behalf. In other * words, it won't send us anything until we tell it that it's okay to send us * something. */ mblk_t * i40e_ring_tx(void *arg, mblk_t *mp) { const mblk_t *nmp; size_t mpsize, blksz; i40e_tx_control_block_t *tcb_ctx = NULL, *tcb_data = NULL, **tcb_dma = NULL; i40e_tx_desc_t *txdesc; i40e_tx_context_desc_t *ctxdesc; i40e_tx_context_t tctx; int cmd, type, i, c; uint_t needed_desc = 0, tail, nbufs = 0; boolean_t do_ctx_desc = B_FALSE, do_dma_bind = B_FALSE, last_desc, use_lso = B_FALSE; i40e_trqpair_t *itrq = arg; i40e_t *i40e = itrq->itrq_i40e; i40e_hw_t *hw = &i40e->i40e_hw_space; i40e_txq_stat_t *txs = &itrq->itrq_txstat; ASSERT(mp->b_next == NULL); if (!(i40e->i40e_state & I40E_STARTED) || (i40e->i40e_state & I40E_OVERTEMP) || (i40e->i40e_state & I40E_SUSPENDED) || (i40e->i40e_state & I40E_ERROR) || (i40e->i40e_link_state != LINK_STATE_UP)) { freemsg(mp); return (NULL); } /* * Figure out the relevant context about this frame that we might need * for enabling checksum, lso, etc. This also fills in information that * we might set around the packet type, etc. */ if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) { freemsg(mp); itrq->itrq_txstat.itxs_err_context.value.ui64++; return (NULL); } if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) use_lso = B_TRUE; if ((tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) || tctx.itc_ctx_tunneled) do_ctx_desc = B_TRUE; /* * For the primordial driver we can punt on doing any recycling right * now; however, longer term we need to probably do some more pro-active * recycling to cut back on stalls in the tx path. */ /* * Iterate through the mblks to calculate both the total size and the * number of message blocks. This is used to determine whether we're * doing DMA binding and, if so, how many control blocks we'll need. */ mpsize = 0; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { blksz = MBLKL(nmp); if (blksz > 0) { mpsize += blksz; nbufs++; } } if (do_ctx_desc == B_TRUE) { /* * If we're doing tunneling or LSO, then we'll need a tx * context descriptor in addition to one or more tx data * descriptors. Since there's no data DMA block associated * with the context descriptor we create a special control * block that behaves effectively like a NOP. 
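		 *
		 * To make the descriptor accounting concrete (a sketch, not
		 * additional logic): an LSO frame with two fragments that
		 * bind 3 and 2 cookies needs 1 + 3 + 2 == 6 descriptors,
		 * while the bcopy path below always needs a single data
		 * descriptor plus this optional context descriptor.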
*/ if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) { txs->itxs_err_notcb.value.ui64++; goto txfail; } tcb_ctx->tcb_type = I40E_TX_DESC; needed_desc++; } /* * For the non-LSO tx case, we alter our DMA strategy based on a * threshold tied to the frame size. This threshold is configurable * via tx_dma_threshold property. * * If the frame size is above the threshold, we do DMA binding of the * fragments, building a control block and data descriptor for each * piece. * * If it's below or at the threshold then we just use a single control * block and data descriptor and simply bcopy all of the fragments into * the pre-allocated DMA buffer in the control block. * * For the LSO tx case we always to DMA binding. */ if (use_lso == B_TRUE || mpsize > i40e->i40e_tx_dma_min) { do_dma_bind = B_TRUE; tcb_dma = kmem_zalloc(nbufs * sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); if (tcb_dma == NULL) { i40e_error(i40e, "failed to allocate tcb_dma list"); goto txfail; } /* * For each b_cont: bind the control block's DMA handle to the * b_rptr, and record the cookies so that we can iterate * through them and build tx data descriptors. */ for (nmp = mp, i = 0; nmp != NULL; nmp = nmp->b_cont) { if (MBLKL(nmp) == 0) continue; tcb_dma[i] = i40e_tx_bind_fragment(itrq, nmp, use_lso); if (tcb_dma[i] == NULL) { i40e_error(i40e, "dma bind failed!"); goto txfail; } if (i == 0) tcb_dma[i]->tcb_mp = mp; needed_desc += tcb_dma[i++]->tcb_bind_ncookies; } } else { /* * Just use a single control block and bcopy all of the * fragments into its pre-allocated DMA buffer. */ if ((tcb_data = i40e_tcb_alloc(itrq)) == NULL) { txs->itxs_err_notcb.value.ui64++; goto txfail; } tcb_data->tcb_type = I40E_TX_COPY; ASSERT(tcb_data->tcb_dma.dmab_len == 0); ASSERT(tcb_data->tcb_dma.dmab_size >= mpsize); for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { size_t clen = MBLKL(nmp); void *coff = tcb_data->tcb_dma.dmab_address + tcb_data->tcb_dma.dmab_len; bcopy(nmp->b_rptr, coff, clen); tcb_data->tcb_dma.dmab_len += clen; } ASSERT(tcb_data->tcb_dma.dmab_len == mpsize); I40E_DMA_SYNC(&tcb_data->tcb_dma, DDI_DMA_SYNC_FORDEV); tcb_data->tcb_mp = mp; needed_desc++; } mutex_enter(&itrq->itrq_tx_lock); if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) { txs->itxs_err_nodescs.value.ui64++; mutex_exit(&itrq->itrq_tx_lock); goto txfail; } ASSERT(itrq->itrq_desc_free >= needed_desc); if (do_ctx_desc == B_TRUE) { /* * If we're enabling any offloads for this frame, then we'll * need to build up a transmit context descriptor, first. The * context descriptor needs to be placed in the tx ring before * the data descriptor(s). See section 8.4.2, table 8-16 */ itrq->itrq_desc_free--; tail = itrq->itrq_desc_tail; ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail]; itrq->itrq_tcb_work_list[tail] = tcb_ctx; itrq->itrq_desc_tail = i40e_next_desc(tail, 1, itrq->itrq_tx_ring_size); type = I40E_TX_DESC_DTYPE_CONTEXT; /* QW0 */ if (tctx.itc_ctx_tunneled) { ctxdesc->tunneling_params = tctx.itc_ctx_tunnel_fld; } else { ctxdesc->tunneling_params = 0; } ctxdesc->l2tag2 = 0; /* QW1 */ ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type); if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t) ((uint64_t)tctx.itc_ctx_cmdflags << I40E_TXD_CTX_QW1_CMD_SHIFT) | ((uint64_t)tctx.itc_ctx_tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | ((uint64_t)tctx.itc_ctx_mss << I40E_TXD_CTX_QW1_MSS_SHIFT)); } } if (do_dma_bind == B_TRUE) { /* * Next build up a transmit data descriptor for each buffer. 
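		 *
		 * One data descriptor is written per DMA cookie; only the
		 * descriptor for the last cookie of the last buffer has EOP
		 * and RS set (see i40e_tx_set_data_desc()). For example,
		 * buffers binding 2, 1 and 3 cookies produce six data
		 * descriptors, with EOP|RS only on the sixth.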
*/ last_desc = B_FALSE; for (i = 0; i < nbufs; i++) { itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb_dma[i]; for (c = 0; c < tcb_dma[i]->tcb_bind_ncookies; c++) { if (i == (nbufs - 1) && c == (tcb_dma[i]->tcb_bind_ncookies - 1)) { last_desc = B_TRUE; } i40e_tx_set_data_desc(itrq, &tctx, tcb_dma[i]->tcb_bind_info[c], last_desc); } } kmem_free(tcb_dma, nbufs * sizeof (i40e_tx_control_block_t *)); } else { /* * Build up the single transmit data descriptor needed for the * non-DMA-bind case. */ itrq->itrq_desc_free--; txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb_data; itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, itrq->itrq_tx_ring_size); type = I40E_TX_DESC_DTYPE_DATA; cmd = I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS | I40E_TX_DESC_CMD_ICRC | tctx.itc_data_cmdflags; txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)tcb_data->tcb_dma.dmab_dma_address); txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type | ((uint64_t)tctx.itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | ((uint64_t)tcb_data->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); } I40E_DMA_SYNC(&itrq->itrq_desc_area, DDI_DMA_SYNC_FORDEV); I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index), itrq->itrq_desc_tail); if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != DDI_FM_OK) { /* * Note, we can't really go through and clean this up very well, * because the memory has been given to the device, so just * indicate it's been transmitted. */ ddi_fm_service_impact(i40e->i40e_dip, DDI_SERVICE_DEGRADED); atomic_or_32(&i40e->i40e_state, I40E_ERROR); } txs->itxs_bytes.value.ui64 += mpsize; txs->itxs_packets.value.ui64++; txs->itxs_descriptors.value.ui64 += needed_desc; mutex_exit(&itrq->itrq_tx_lock); return (NULL); txfail: /* * We ran out of resources. Return it to MAC and indicate that we'll * need to signal MAC. If there are allocated tcb's, return them now. * Make sure to reset their message block's, since we'll return them * back to MAC. */ if (tcb_ctx != NULL) { tcb_ctx->tcb_mp = NULL; i40e_tcb_reset(tcb_ctx); i40e_tcb_free(itrq, tcb_ctx); } if (tcb_data != NULL) { tcb_data->tcb_mp = NULL; i40e_tcb_reset(tcb_data); i40e_tcb_free(itrq, tcb_data); } if (tcb_dma != NULL) kmem_free(tcb_dma, nbufs * sizeof (i40e_tx_control_block_t *)); mutex_enter(&itrq->itrq_tx_lock); itrq->itrq_tx_blocked = B_TRUE; mutex_exit(&itrq->itrq_tx_lock); return (mp); }
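
/*
 * A rough sketch of the flow control handshake implied by the txfail path
 * above (illustrative only, no new interfaces): once i40e_ring_tx() returns
 * the mblk_t and sets itrq_tx_blocked, MAC will not send on this ring again
 * until we call mac_tx_ring_update(). That call is made from
 * i40e_tx_recycle_ring() once enough descriptors have been reclaimed:
 *
 *	i40e_ring_tx()				i40e_tx_recycle_ring()
 *	  itrq_tx_blocked = B_TRUE;		  if (itrq_tx_blocked &&
 *	  return (mp);				      itrq_desc_free >
 *						      i40e_tx_block_thresh) {
 *						    itrq_tx_blocked = B_FALSE;
 *						    mac_tx_ring_update(...);
 *						  }
 */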