-rw-r--r-- | usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile | 3
-rw-r--r-- | usr/src/cmd/mdb/intel/modules/i40e/i40e.c | 175
-rw-r--r-- | usr/src/man/man7d/i40e.7d | 20
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_gld.c | 103
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_intr.c | 190
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_main.c | 574
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_stats.c | 77
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_sw.h | 115
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_transceiver.c | 1160
9 files changed, 1932 insertions, 485 deletions
diff --git a/usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile b/usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile index 66f97451b6..f1632172f5 100644 --- a/usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile +++ b/usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile @@ -10,7 +10,7 @@ # # -# Copyright 2017 Joyent, Inc. +# Copyright 2018 Joyent, Inc. # MODULE = i40e.so @@ -23,6 +23,7 @@ include ../../../../../Makefile.cmd.64 include ../../../Makefile.amd64 include ../../../../Makefile.module +CPPFLAGS += -I$(SRC)/cmd/mdb/common CPPFLAGS += -I$(SRC)/uts/common/io/i40e CPPFLAGS += -I$(SRC)/uts/common/io/i40e/core CPPFLAGS += -I$(SRC)/uts/common diff --git a/usr/src/cmd/mdb/intel/modules/i40e/i40e.c b/usr/src/cmd/mdb/intel/modules/i40e/i40e.c index 6d1f900b43..3f42d24d1f 100644 --- a/usr/src/cmd/mdb/intel/modules/i40e/i40e.c +++ b/usr/src/cmd/mdb/intel/modules/i40e/i40e.c @@ -10,9 +10,10 @@ */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ +#include <mdb/mdb_ctf.h> #include <sys/mdb_modapi.h> #include "i40e_sw.h" @@ -97,9 +98,181 @@ i40e_switch_rsrcs_dcmd(uintptr_t addr, uint_t flags, int argc, return (DCMD_OK); } +typedef struct mdb_i40e_trqpair { + uint32_t itrq_tx_ring_size; + uint32_t itrq_desc_free; + uint32_t *itrq_desc_wbhead; + uint32_t itrq_desc_head; + uint32_t itrq_desc_tail; + i40e_tx_desc_t *itrq_desc_ring; + i40e_tx_control_block_t **itrq_tcb_work_list; +} mdb_i40e_trqpair_t; + +static void +i40e_tx_ring_help() +{ + mdb_printf( + "\t -a dump all ring entries\n" + "\t or\n" + "\t combine -b [start index] with -e [end index] to specify a \n" + "\t range of ring entries to print\n"); +} + +static int +i40e_tx_ring_dcmd(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + mdb_i40e_trqpair_t trq; + i40e_tx_desc_t *descring; + i40e_tx_control_block_t **wklist; + uint32_t wbhead; + size_t ringsz, wklistsz; + boolean_t opt_a = B_FALSE; + char *opt_b = NULL, *opt_e = NULL; + uint64_t begin = UINT64_MAX, end = UINT64_MAX; + + if (!(flags & DCMD_ADDRSPEC)) { + mdb_warn("::i40e_tx_ring does not operate globally\n"); + return (DCMD_USAGE); + } + + if (mdb_getopts(argc, argv, + 'a', MDB_OPT_SETBITS, B_TRUE, &opt_a, + 'b', MDB_OPT_STR, &opt_b, + 'e', MDB_OPT_STR, &opt_e, NULL) != argc) + return (DCMD_USAGE); + + /* + * Verify that a legal combination of -a/-b/-e were used. + */ + if (opt_a && (opt_b != NULL || opt_e != NULL)) { + mdb_warn("-a and -b/-e are mutually exclusive\n"); + return (DCMD_USAGE); + } + if (argc > 0 && ! opt_a && (opt_b == NULL || opt_e == NULL)) { + mdb_warn("-b/-e must both be specified\n"); + return (DCMD_USAGE); + } + + if (mdb_ctf_vread(&trq, "i40e_trqpair_t", "mdb_i40e_trqpair_t", addr, + 0) == -1) { + mdb_warn("failed to read i40e_trqpair_t at %p", addr); + return (DCMD_ERR); + } + + if (opt_b != NULL) + begin = mdb_strtoull(opt_b); + if (opt_e != NULL) + end = mdb_strtoull(opt_e); + if (opt_a) { + begin = 0; + end = trq.itrq_tx_ring_size - 1; + } + + /* + * Verify that the requested range of ring entries makes sense. 
+ */ + if (argc > 0 && (end < begin || begin >= trq.itrq_tx_ring_size || + end >= trq.itrq_tx_ring_size)) { + mdb_warn("invalid range specified\n"); + return (DCMD_USAGE); + } + + if (mdb_vread(&wbhead, sizeof (uint32_t), + (uintptr_t)trq.itrq_desc_wbhead) != sizeof (uint32_t)) { + mdb_warn("failed to read trq.itrq_desc_wbhead"); + return (DCMD_ERR); + } + mdb_printf("%-20s%d\n", "Ring Size:", trq.itrq_tx_ring_size); + mdb_printf("%-20s%d\n", "Free Descriptors:", trq.itrq_desc_free); + mdb_printf("%-20s%d\n", "Writeback Head:", wbhead); + mdb_printf("%-20s%d\n", "Head:", trq.itrq_desc_head); + mdb_printf("%-20s%d\n", "Tail:", trq.itrq_desc_tail); + + /* + * No arguments were specified, so we're done. + */ + if (argc == 0) + return (DCMD_OK); + + /* + * Allocate memory and read in the entire TX descriptor ring and + * TCB work list. + */ + ringsz = sizeof (i40e_tx_desc_t) * trq.itrq_tx_ring_size; + descring = mdb_alloc(ringsz, UM_SLEEP); + if (mdb_vread(descring, ringsz, (uintptr_t)trq.itrq_desc_ring) != + ringsz) { + mdb_warn("Failed to read in TX decriptor ring\n"); + mdb_free(descring, ringsz); + return (DCMD_ERR); + } + wklistsz = sizeof (i40e_tx_control_block_t *) * trq.itrq_tx_ring_size; + wklist = mdb_alloc(wklistsz, UM_SLEEP); + if (mdb_vread(wklist, wklistsz, (uintptr_t)trq.itrq_tcb_work_list) != + wklistsz) { + mdb_warn("Failed to read in TX TCB work list\n"); + mdb_free(descring, ringsz); + mdb_free(wklist, wklistsz); + return (DCMD_ERR); + } + + mdb_printf("\n%-10s %-10s %-16s %-16s %-10s\n", "Index", "Desc Type", + "Desc Ptr", "TCB Ptr", "Other"); + for (uint64_t i = begin; i <= end; i++) { + const char *dtype; + char dother[17]; + i40e_tx_desc_t *dptr; + i40e_tx_control_block_t *tcbptr; + uint64_t ctob; + + dptr = &descring[i]; + tcbptr = wklist[i]; + ctob = LE_64(dptr->cmd_type_offset_bsz); + if (ctob == 0) { + dtype = "FREE"; + } else { + switch (ctob & I40E_TXD_QW1_DTYPE_MASK) { + case (I40E_TX_DESC_DTYPE_CONTEXT): + dtype = "CONTEXT"; + break; + case (I40E_TX_DESC_DTYPE_DATA): + dtype = "DATA"; + break; + case (I40E_TX_DESC_DTYPE_FILTER_PROG): + dtype = "FILTER"; + break; + default: + dtype = "UNKNOWN"; + } + } + dother[0] = '\0'; + if (i == wbhead) + (void) strcat(dother, "WBHEAD"); + + if (i == trq.itrq_desc_head) + (void) strcat(dother, + strlen(dother) > 0 ? " HEAD" : "HEAD"); + + if (i == trq.itrq_desc_tail) + (void) strcat(dother, + strlen(dother) > 0 ? " TAIL" : "TAIL"); + + mdb_printf("%-10d %-10s %-16p %-16p %-10s\n", i, dtype, dptr, + tcbptr, dother); + } + + mdb_free(descring, ringsz); + mdb_free(wklist, wklistsz); + return (DCMD_OK); +} + static const mdb_dcmd_t i40e_dcmds[] = { { "i40e_switch_rsrcs", NULL, "print switch resources", i40e_switch_rsrcs_dcmd, NULL }, + { "i40e_tx_ring", "[-a] -b [start index] -e [end index]\n", + "dump TX descriptor ring state", i40e_tx_ring_dcmd, + i40e_tx_ring_help }, { NULL } }; diff --git a/usr/src/man/man7d/i40e.7d b/usr/src/man/man7d/i40e.7d index 2d8a2da45b..f025fba01a 100644 --- a/usr/src/man/man7d/i40e.7d +++ b/usr/src/man/man7d/i40e.7d @@ -9,9 +9,9 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright (c) 2017 Joyent, Inc. +.\" Copyright (c) 2018 Joyent, Inc. .\" -.Dd September 8, 2017 +.Dd May 23, 2018 .Dt I40E 7D .Os .Sh NAME @@ -273,6 +273,22 @@ binding. By setting this property to its maximum, all frames will be processed by copying the frame. 
.Ed +.It Sy tx_lso_enable +.Bd -filled -compact +Minimum: +.Sy 0 | +Maximum: +.Sy 1 +.Ed +.Bd -filled +The +.Sy tx_lso_enable +property controls whether or not the device enables support for Large Segment +Offloand (LSO) when transmitting packets. +The default is to always enable support for this. +Turning it off will decrease throughput when transmitting packets, but should +be done if a hardware bug is suspected. +.Ed .El .Sh ARCHITECTURE The diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c index d34057d64f..ccf814be0b 100644 --- a/usr/src/uts/common/io/i40e/i40e_gld.c +++ b/usr/src/uts/common/io/i40e/i40e_gld.c @@ -39,7 +39,8 @@ char *i40e_priv_props[] = { static int i40e_group_remove_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; struct i40e_aqc_remove_macvlan_element_data filt; struct i40e_hw *hw = &i40e->i40e_hw_space; int ret, i, last; @@ -107,10 +108,11 @@ done: static int i40e_group_add_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; - struct i40e_hw *hw = &i40e->i40e_hw_space; - int i, ret; - i40e_uaddr_t *iua; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; + struct i40e_hw *hw = &i40e->i40e_hw_space; + int i, ret; + i40e_uaddr_t *iua; struct i40e_aqc_add_macvlan_element_data filt; if (I40E_IS_MULTICAST(mac_addr)) @@ -136,16 +138,12 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) } } - /* - * Note, the general use of the i40e_vsi_id will have to be refactored - * when we have proper group support. - */ bzero(&filt, sizeof (filt)); bcopy(mac_addr, filt.mac_addr, ETHERADDRL); filt.flags = I40E_AQC_MACVLAN_ADD_PERFECT_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, rxg->irg_vsi_seid, &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to unicast filter: %d", @@ -157,7 +155,7 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) iua = &i40e->i40e_uaddrs[i40e->i40e_resources.ifr_nmacfilt_used]; bcopy(mac_addr, iua->iua_mac, ETHERADDRL); - iua->iua_vsi = i40e->i40e_vsi_id; + iua->iua_vsi = rxg->irg_vsi_seid; i40e->i40e_resources.ifr_nmacfilt_used++; ASSERT(i40e->i40e_resources.ifr_nmacfilt_used <= i40e->i40e_resources.ifr_nmacfilt); @@ -227,7 +225,7 @@ i40e_m_promisc(void *arg, boolean_t on) } - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " @@ -246,7 +244,7 @@ i40e_m_promisc(void *arg, boolean_t on) goto done; } - ret = i40e_aq_set_vsi_multicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_multicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s multicast promiscuity on " @@ -257,8 +255,8 @@ i40e_m_promisc(void *arg, boolean_t on) * Try our best to put us back into a state that MAC expects us * to be in. 
*/ - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, - !on, NULL, B_FALSE); + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, + I40E_DEF_VSI_SEID(i40e), !on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " "the default VSI after toggling multicast failed: " @@ -294,11 +292,11 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 0 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_TRUE, NULL); + I40E_DEF_VSI_SEID(i40e), B_TRUE, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to enable multicast " "promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -312,7 +310,7 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_ADD_HASH_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to multicast filter: %d", @@ -353,8 +351,8 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_DEL_HASH_MATCH | I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; - if (i40e_aq_remove_macvlan(hw, i40e->i40e_vsi_id, - &filt, 1, NULL) != I40E_SUCCESS) { + if (i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, + 1, NULL) != I40E_SUCCESS) { i40e_error(i40e, "failed to remove mac address " "%2x:%2x:%2x:%2x:%2x:%2x from multicast " "filter: %d", @@ -381,11 +379,11 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 1 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_FALSE, NULL); + I40E_DEF_VSI_SEID(i40e), B_FALSE, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to disable " "multicast promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -490,7 +488,7 @@ i40e_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index, * we're not actually grouping things tx-wise at this time. */ ASSERT(group_index == -1); - ASSERT(ring_index < i40e->i40e_num_trqpairs); + ASSERT(ring_index < i40e->i40e_num_trqpairs_per_vsi); itrq->itrq_mactxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -516,15 +514,16 @@ i40e_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index, { i40e_t *i40e = arg; mac_intr_t *mintr = &infop->mri_intr; - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[ring_index]; + uint_t trqpair_index; + i40e_trqpair_t *itrq; - /* - * We assert the group number and ring index to help sanity check - * ourselves and mark that we'll need to rework this when we have - * multiple groups. - */ - ASSERT3S(group_index, ==, 0); - ASSERT3S(ring_index, <, i40e->i40e_num_trqpairs); + /* This assumes static groups. 
*/ + ASSERT3S(group_index, >=, 0); + ASSERT3S(ring_index, >=, 0); + trqpair_index = (group_index * i40e->i40e_num_trqpairs_per_vsi) + + ring_index; + ASSERT3U(trqpair_index, <, i40e->i40e_num_trqpairs); + itrq = &i40e->i40e_trqpairs[trqpair_index]; itrq->itrq_macrxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -552,24 +551,22 @@ i40e_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { i40e_t *i40e = arg; + i40e_rx_group_t *rxg; if (rtype != MAC_RING_TYPE_RX) return; - /* - * Note, this is a simplified view of a group, given that we only have a - * single group and a single ring at the moment. We'll want to expand - * upon this as we leverage more hardware functionality. - */ - i40e->i40e_rx_group_handle = gh; - infop->mgi_driver = (mac_group_driver_t)i40e; + rxg = &i40e->i40e_rx_groups[index]; + rxg->irg_grp_hdl = gh; + + infop->mgi_driver = (mac_group_driver_t)rxg; infop->mgi_start = NULL; infop->mgi_stop = NULL; infop->mgi_addmac = i40e_group_add_mac; infop->mgi_remmac = i40e_group_remove_mac; - ASSERT(i40e->i40e_num_rx_groups == I40E_GROUP_MAX); - infop->mgi_count = i40e->i40e_num_trqpairs; + ASSERT(i40e->i40e_num_rx_groups <= I40E_GROUP_MAX); + infop->mgi_count = i40e->i40e_num_trqpairs_per_vsi; } static int @@ -732,20 +729,32 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (i40e->i40e_tx_lso_enable == B_TRUE) { + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + cap_lso->lso_basic_tcp_ipv4.lso_max = I40E_LSO_MAXLEN; + } else { + return (B_FALSE); + } + break; + } + case MAC_CAPAB_RINGS: cap_rings = cap_data; cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; switch (cap_rings->mr_type) { case MAC_RING_TYPE_TX: /* - * Note, saying we have no rings, but some number of - * groups indicates to MAC that it should create - * psuedo-groups with one for each TX ring. This may not - * be the long term behavior we want, but it'll work for - * now. + * Note, saying we have no groups, but some + * number of rings indicates to MAC that it + * should create psuedo-groups with one for + * each TX ring. This may not be the long term + * behavior we want, but it'll work for now. */ cap_rings->mr_gnum = 0; - cap_rings->mr_rnum = i40e->i40e_num_trqpairs; + cap_rings->mr_rnum = i40e->i40e_num_trqpairs_per_vsi; cap_rings->mr_rget = i40e_fill_tx_ring; cap_rings->mr_gget = NULL; cap_rings->mr_gaddring = NULL; @@ -754,7 +763,7 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) case MAC_RING_TYPE_RX: cap_rings->mr_rnum = i40e->i40e_num_trqpairs; cap_rings->mr_rget = i40e_fill_rx_ring; - cap_rings->mr_gnum = I40E_GROUP_MAX; + cap_rings->mr_gnum = i40e->i40e_num_rx_groups; cap_rings->mr_gget = i40e_fill_rx_group; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c index 51d1bbac92..170bef7ec6 100644 --- a/usr/src/uts/common/io/i40e/i40e_intr.c +++ b/usr/src/uts/common/io/i40e/i40e_intr.c @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -229,12 +229,20 @@ i40e_intr_adminq_disable(i40e_t *i40e) I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg); } +/* + * The next two functions enable/disable the reception of interrupts + * on the given vector. 
Only vectors 1..N are programmed by these + * functions; vector 0 is special and handled by a different register. + * We must subtract one from the vector because i40e implicitly adds + * one to the vector value. See section 10.2.2.10.13 for more details. + */ static void i40e_intr_io_enable(i40e_t *i40e, int vector) { uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_PFINT_DYN_CTLN_INTENA_MASK | I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); @@ -247,6 +255,7 @@ i40e_intr_io_disable(i40e_t *i40e, int vector) uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg); } @@ -375,49 +384,109 @@ i40e_intr_chip_fini(i40e_t *i40e) } /* - * Enable all of the queues and set the corresponding LNKLSTN registers. Note - * that we always enable queues as interrupt sources, even though we don't - * enable the MSI-X interrupt vectors. + * Set the head of the interrupt linked list. The PFINT_LNKLSTN[N] + * register actually refers to the 'N + 1' interrupt vector. E.g., + * PFINT_LNKLSTN[0] refers to interrupt vector 1. + */ +static void +i40e_set_lnklstn(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = (queue << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + + I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(vector), reg); + DEBUGOUT2("PFINT_LNKLSTN[%u] = 0x%x", vector, reg); +} + +/* + * Set the QINT_RQCTL[queue] register. The next queue is always the Tx + * queue associated with this Rx queue. Unlike PFINT_LNKLSTN, the + * vector should be the actual vector this queue is on -- i.e., it + * should be equal to itrq_rx_intrvec. + */ +static void +i40e_set_rqctl(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_rx_intrvec); + + reg = (vector << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (queue << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_RQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg); + DEBUGOUT2("QINT_RQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Like i40e_set_rqctl(), but for QINT_TQCTL[queue]. The next queue is + * either the Rx queue of another TRQP, or EOL. + */ +static void +i40e_set_tqctl(i40e_t *i40e, uint_t vector, uint_t queue, uint_t next_queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_tx_intrvec); + + reg = (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (next_queue << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_TQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(queue), reg); + DEBUGOUT2("QINT_TQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Program the interrupt linked list. Each vector has a linked list of + * queues which act as event sources for that vector. When one of + * those sources has an event the associated interrupt vector is + * fired. This mapping must match the mapping found in + * i40e_map_intrs_to_vectors(). 
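/*
 * Illustrative sketch (editor's addition, not part of the diff): the
 * vector/queue arithmetic described above, in standalone user-space
 * form.  The vector and queue counts (4 and 8) are examples only.
 * Vector 0 is reserved for the admin queue, so queue q is serviced by
 * vector (q % (nvectors - 1)) + 1 -- the same relationship that
 * i40e_map_intrs_to_vectors() establishes.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int nvectors = 4;		/* example i40e_intr_count */
	unsigned int nqueues = 8;		/* example i40e_num_trqpairs */
	unsigned int io_vectors = nvectors - 1;	/* vector 0 is admin-only */

	/* Walk each vector's linked list of queues, as programmed above. */
	for (unsigned int vec = 0; vec < io_vectors; vec++) {
		printf("vector %u services queues:", vec + 1);
		for (unsigned int q = vec; q < nqueues; q += io_vectors)
			printf(" %u", q);
		printf("\n");
	}

	/* The inverse mapping, used when programming each queue pair. */
	for (unsigned int q = 0; q < nqueues; q++)
		printf("queue %u -> vector %u\n", q, (q % io_vectors) + 1);

	return (0);
}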
+ * + * See section 7.5.3 for more information about the configuration of + * the interrupt linked list. */ static void i40e_intr_init_queue_msix(i40e_t *i40e) { - i40e_hw_t *hw = &i40e->i40e_hw_space; - uint32_t reg; - int i; + uint_t intr_count; /* - * Map queues to MSI-X interrupts. Queue i is mapped to vector i + 1. - * Note that we skip the ITR logic for the moment, just to make our - * lives as explicit and simple as possible. + * The 0th vector is for 'Other Interrupts' only (subject to + * change in the future). */ - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + intr_count = i40e->i40e_intr_count - 1; - reg = (i << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << - I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); - I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(i), reg); + for (uint_t vec = 0; vec < intr_count; vec++) { + boolean_t head = B_TRUE; - reg = - (itrq->itrq_rx_intrvec << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (i << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_RQCTL_CAUSE_ENA_MASK; + for (uint_t qidx = vec; qidx < i40e->i40e_num_trqpairs; + qidx += intr_count) { + uint_t next_qidx = qidx + intr_count; - I40E_WRITE_REG(hw, I40E_QINT_RQCTL(i), reg); + next_qidx = (next_qidx > i40e->i40e_num_trqpairs) ? + I40E_QUEUE_TYPE_EOL : next_qidx; - reg = - (itrq->itrq_tx_intrvec << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_TQCTL_CAUSE_ENA_MASK; + if (head) { + i40e_set_lnklstn(i40e, vec, qidx); + head = B_FALSE; + } - I40E_WRITE_REG(hw, I40E_QINT_TQCTL(i), reg); + i40e_set_rqctl(i40e, vec + 1, qidx); + i40e_set_tqctl(i40e, vec + 1, qidx, next_qidx); + } } - } /* @@ -604,31 +673,26 @@ i40e_intr_adminq_work(i40e_t *i40e) } static void -i40e_intr_rx_work(i40e_t *i40e, int queue) +i40e_intr_rx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { mblk_t *mp = NULL; - i40e_trqpair_t *itrq; - - ASSERT(queue < i40e->i40e_num_trqpairs); - itrq = &i40e->i40e_trqpairs[queue]; mutex_enter(&itrq->itrq_rx_lock); if (!itrq->itrq_intr_poll) mp = i40e_ring_rx(itrq, I40E_POLL_NULL); mutex_exit(&itrq->itrq_rx_lock); - if (mp != NULL) { - mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, - itrq->itrq_rxgen); - } + if (mp == NULL) + return; + + mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, + itrq->itrq_rxgen); } +/* ARGSUSED */ static void -i40e_intr_tx_work(i40e_t *i40e, int queue) +i40e_intr_tx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { - i40e_trqpair_t *itrq; - - itrq = &i40e->i40e_trqpairs[queue]; i40e_tx_recycle_ring(itrq); } @@ -665,11 +729,17 @@ i40e_intr_other_work(i40e_t *i40e) i40e_intr_adminq_enable(i40e); } +/* + * Handle an MSI-X interrupt. See section 7.5.1.3 for an overview of + * the MSI-X interrupt sequence. 
+ */ uint_t i40e_intr_msix(void *arg1, void *arg2) { i40e_t *i40e = (i40e_t *)arg1; - int vector_idx = (int)(uintptr_t)arg2; + uint_t vector_idx = (uint_t)(uintptr_t)arg2; + + ASSERT3U(vector_idx, <, i40e->i40e_intr_count); /* * When using MSI-X interrupts, vector 0 is always reserved for the @@ -681,10 +751,29 @@ i40e_intr_msix(void *arg1, void *arg2) return (DDI_INTR_CLAIMED); } - i40e_intr_rx_work(i40e, vector_idx - 1); - i40e_intr_tx_work(i40e, vector_idx - 1); - i40e_intr_io_enable(i40e, vector_idx); + ASSERT3U(vector_idx, >, 0); + /* + * We determine the queue indexes via simple arithmetic (as + * opposed to keeping explicit state like a bitmap). While + * conveinent, it does mean that i40e_map_intrs_to_vectors(), + * i40e_intr_init_queue_msix(), and this function must be + * modified as a unit. + * + * We subtract 1 from the vector to offset the addition we + * performed during i40e_map_intrs_to_vectors(). + */ + for (uint_t i = vector_idx - 1; i < i40e->i40e_num_trqpairs; + i += (i40e->i40e_intr_count - 1)) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + + ASSERT3U(i, <, i40e->i40e_num_trqpairs); + ASSERT3P(itrq, !=, NULL); + i40e_intr_rx_work(i40e, itrq); + i40e_intr_tx_work(i40e, itrq); + } + + i40e_intr_io_enable(i40e, vector_idx); return (DDI_INTR_CLAIMED); } @@ -693,6 +782,7 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) { i40e_hw_t *hw = &i40e->i40e_hw_space; uint32_t reg; + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[0]; int ret = DDI_INTR_CLAIMED; if (shared == B_TRUE) { @@ -722,10 +812,10 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) i40e_intr_adminq_work(i40e); if (reg & I40E_INTR_NOTX_RX_MASK) - i40e_intr_rx_work(i40e, 0); + i40e_intr_rx_work(i40e, itrq); if (reg & I40E_INTR_NOTX_TX_MASK) - i40e_intr_tx_work(i40e, 0); + i40e_intr_tx_work(i40e, itrq); done: i40e_intr_adminq_enable(i40e); diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c index 54aef43424..99c64abe8c 100644 --- a/usr/src/uts/common/io/i40e/i40e_main.c +++ b/usr/src/uts/common/io/i40e/i40e_main.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -188,14 +188,15 @@ * VSI Management * -------------- * - * At this time, we currently only support a single MAC group, and thus a single - * VSI. This VSI is considered the default VSI and should be the only one that - * exists after a reset. Currently it is stored as the member - * i40e_t`i40e_vsi_id. While this works for the moment and for an initial - * driver, it's not sufficient for the longer-term path of the driver. Instead, - * we'll want to actually have a unique i40e_vsi_t structure which is used - * everywhere. Note that this means that every place that uses the - * i40e_t`i40e_vsi_id will need to be refactored. + * The PFs share 384 VSIs. The firmware creates one VSI per PF by default. + * During chip start we retrieve the SEID of this VSI and assign it as the + * default VSI for our VEB (one VEB per PF). We then add additional VSIs to + * the VEB up to the determined number of rx groups: i40e_t`i40e_num_rx_groups. + * We currently cap this number to I40E_GROUP_MAX to a) make sure all PFs can + * allocate the same number of VSIs, and b) to keep the interrupt multiplexing + * under control. In the future, when we improve the interrupt allocation, we + * may want to revisit this cap to make better use of the available VSIs. 
The + * VSI allocation and configuration can be found in i40e_chip_start(). * * ---------------- * Structure Layout @@ -240,7 +241,7 @@ * | i40e_hw_t --+---> Intel common code structure * | mac_handle_t --+---> GLDv3 handle to MAC * | ddi_periodic_t --+---> Link activity timer - * | int (vsi_id) --+---> VSI ID, main identifier + * | i40e_vsi_t * --+---> Array of VSIs * | i40e_func_rsrc_t --+---> Available hardware resources * | i40e_switch_rsrc_t * --+---> Switch resource snapshot * | i40e_sdu --+---> Current MTU @@ -249,11 +250,10 @@ * | i40e_maddr_t * --+---> Array of assigned multicast MACs * | i40e_mcast_promisccount --+---> Active multicast state * | i40e_promisc_on --+---> Current promiscuous mode state - * | int --+---> Number of transmit/receive pairs + * | uint_t --+---> Number of transmit/receive pairs + * | i40e_rx_group_t * --+---> Array of Rx groups * | kstat_t * --+---> PF kstats - * | kstat_t * --+---> VSI kstats * | i40e_pf_stats_t --+---> PF kstat backing data - * | i40e_vsi_stats_t --+---> VSI kstat backing data * | i40e_trqpair_t * --+---------+ * +---------------------------+ | * | @@ -359,8 +359,6 @@ * While bugs have been filed to cover this future work, the following gives an * overview of expected work: * - * o TSO support - * o Multiple group support * o DMA binding and breaking up the locking in ring recycling. * o Enhanced detection of device errors * o Participation in IRM @@ -371,7 +369,7 @@ #include "i40e_sw.h" -static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.1"; +static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.3"; /* * The i40e_glock primarily protects the lists below and the i40e_device_t @@ -761,15 +759,16 @@ i40e_fm_ereport(i40e_t *i40e, char *detail) } /* - * Here we're trying to get the ID of the default VSI. In general, when we come - * through and look at this shortly after attach, we expect there to only be a - * single element present, which is the default VSI. Importantly, each PF seems - * to not see any other devices, in part because of the simple switch mode that - * we're using. If for some reason, we see more artifact, we'll need to revisit - * what we're doing here. + * Here we're trying to set the SEID of the default VSI. In general, + * when we come through and look at this shortly after attach, we + * expect there to only be a single element present, which is the + * default VSI. Importantly, each PF seems to not see any other + * devices, in part because of the simple switch mode that we're + * using. If for some reason, we see more artifacts, we'll need to + * revisit what we're doing here. */ -static int -i40e_get_vsi_id(i40e_t *i40e) +static boolean_t +i40e_set_def_vsi_seid(i40e_t *i40e) { i40e_hw_t *hw = &i40e->i40e_hw_space; struct i40e_aqc_get_switch_config_resp *sw_config; @@ -784,17 +783,43 @@ i40e_get_vsi_id(i40e_t *i40e) if (rc != I40E_SUCCESS) { i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", rc, hw->aq.asq_last_status); - return (-1); + return (B_FALSE); } if (LE_16(sw_config->header.num_reported) != 1) { i40e_error(i40e, "encountered multiple (%d) switching units " "during attach, not proceeding", LE_16(sw_config->header.num_reported)); + return (B_FALSE); + } + + I40E_DEF_VSI_SEID(i40e) = sw_config->element[0].seid; + return (B_TRUE); +} + +/* + * Get the SEID of the uplink MAC. 
+ */ +static int +i40e_get_mac_seid(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + struct i40e_aqc_get_switch_config_resp *sw_config; + uint8_t aq_buf[I40E_AQ_LARGE_BUF]; + uint16_t next = 0; + int rc; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf; + rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next, + NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", + rc, hw->aq.asq_last_status); return (-1); } - return (sw_config->element[0].seid); + return (LE_16(sw_config->element[0].uplink_seid)); } /* @@ -1098,11 +1123,16 @@ i40e_disable_interrupts(i40e_t *i40e) static void i40e_free_trqpairs(i40e_t *i40e) { - int i; i40e_trqpair_t *itrq; + if (i40e->i40e_rx_groups != NULL) { + kmem_free(i40e->i40e_rx_groups, + sizeof (i40e_rx_group_t) * i40e->i40e_num_rx_groups); + i40e->i40e_rx_groups = NULL; + } + if (i40e->i40e_trqpairs != NULL) { - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { itrq = &i40e->i40e_trqpairs[i]; mutex_destroy(&itrq->itrq_rx_lock); mutex_destroy(&itrq->itrq_tx_lock); @@ -1133,7 +1163,6 @@ i40e_free_trqpairs(i40e_t *i40e) static boolean_t i40e_alloc_trqpairs(i40e_t *i40e) { - int i; void *mutexpri = DDI_INTR_PRI(i40e->i40e_intr_pri); /* @@ -1146,7 +1175,7 @@ i40e_alloc_trqpairs(i40e_t *i40e) i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) * i40e->i40e_num_trqpairs, KM_SLEEP); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; itrq->itrq_i40e = i40e; @@ -1156,6 +1185,16 @@ i40e_alloc_trqpairs(i40e_t *i40e) itrq->itrq_index = i; } + i40e->i40e_rx_groups = kmem_zalloc(sizeof (i40e_rx_group_t) * + i40e->i40e_num_rx_groups, KM_SLEEP); + + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_rx_group_t *rxg = &i40e->i40e_rx_groups[i]; + + rxg->irg_index = i; + rxg->irg_i40e = i40e; + } + return (B_TRUE); } @@ -1164,16 +1203,19 @@ i40e_alloc_trqpairs(i40e_t *i40e) /* * Unless a .conf file already overrode i40e_t structure values, they will * be 0, and need to be set in conjunction with the now-available HW report. - * - * However, at the moment, we cap all of these resources as we only support a - * single receive ring and a single group. */ /* ARGSUSED */ static void i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw) { - if (i40e->i40e_num_trqpairs == 0) { - i40e->i40e_num_trqpairs = I40E_TRQPAIR_MAX; + if (i40e->i40e_num_trqpairs_per_vsi == 0) { + if (i40e_is_x722(i40e)) { + i40e->i40e_num_trqpairs_per_vsi = + I40E_722_MAX_TC_QUEUES; + } else { + i40e->i40e_num_trqpairs_per_vsi = + I40E_710_MAX_TC_QUEUES; + } } if (i40e->i40e_num_rx_groups == 0) { @@ -1309,12 +1351,11 @@ i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw) } /* - * We need to obtain the Virtual Station ID (VSI) before we can - * perform other operations on the device. + * We need to obtain the Default Virtual Station SEID (VSI) + * before we can perform other operations on the device. 
*/ - i40e->i40e_vsi_id = i40e_get_vsi_id(i40e); - if (i40e->i40e_vsi_id == -1) { - i40e_error(i40e, "failed to obtain VSI ID"); + if (!i40e_set_def_vsi_seid(i40e)) { + i40e_error(i40e, "failed to obtain Default VSI SEID"); return (B_FALSE); } @@ -1559,6 +1600,9 @@ i40e_init_properties(i40e_t *i40e) i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_tx_lso_enable = i40e_get_prop(i40e, "tx_lso_enable", + B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); @@ -1728,15 +1772,56 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) } i40e->i40e_intr_type = 0; + i40e->i40e_num_rx_groups = I40E_GROUP_MAX; + /* + * We need to determine the number of queue pairs per traffic + * class. We only have one traffic class (TC0), so we'll base + * this off the number of interrupts provided. Furthermore, + * since we only use one traffic class, the number of queues + * per traffic class and per VSI are the same. + */ if ((intr_types & DDI_INTR_TYPE_MSIX) && - i40e->i40e_intr_force <= I40E_INTR_MSIX) { - if (i40e_alloc_intr_handles(i40e, devinfo, - DDI_INTR_TYPE_MSIX)) { - i40e->i40e_num_trqpairs = - MIN(i40e->i40e_intr_count - 1, max_trqpairs); - return (B_TRUE); - } + (i40e->i40e_intr_force <= I40E_INTR_MSIX) && + (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX))) { + uint32_t n; + + /* + * While we want the number of queue pairs to match + * the number of interrupts, we must keep stay in + * bounds of the maximum number of queues per traffic + * class. We subtract one from i40e_intr_count to + * account for interrupt zero; which is currently + * restricted to admin queue commands and other + * interrupt causes. + */ + n = MIN(i40e->i40e_intr_count - 1, max_trqpairs); + ASSERT3U(n, >, 0); + + /* + * Round up to the nearest power of two to ensure that + * the QBASE aligns with the TC size which must be + * programmed as a power of two. See the queue mapping + * description in section 7.4.9.5.5.1. + * + * If i40e_intr_count - 1 is not a power of two then + * some queue pairs on the same VSI will have to share + * an interrupt. + * + * We may want to revisit this logic in a future where + * we have more interrupts and more VSIs. Otherwise, + * each VSI will use as many interrupts as possible. + * Using more QPs per VSI means better RSS for each + * group, but at the same time may require more + * sharing of interrupts across VSIs. This may be a + * good candidate for a .conf tunable. + */ + n = 0x1 << ddi_fls(n); + i40e->i40e_num_trqpairs_per_vsi = n; + ASSERT3U(i40e->i40e_num_rx_groups, >, 0); + i40e->i40e_num_trqpairs = i40e->i40e_num_trqpairs_per_vsi * + i40e->i40e_num_rx_groups; + return (B_TRUE); } /* @@ -1745,6 +1830,7 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) * single MSI interrupt. */ i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX; + i40e->i40e_num_trqpairs_per_vsi = i40e->i40e_num_trqpairs; i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX; if ((intr_types & DDI_INTR_TYPE_MSI) && @@ -1767,24 +1853,20 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) static boolean_t i40e_map_intrs_to_vectors(i40e_t *i40e) { - int i; - if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) { return (B_TRUE); } /* - * Each queue pair is mapped to a single interrupt, so transmit - * and receive interrupts for a given queue share the same vector. 
- * The number of queue pairs is one less than the number of interrupt - * vectors and is assigned the vector one higher than its index. - * Vector zero is reserved for the admin queue. + * Each queue pair is mapped to a single interrupt, so + * transmit and receive interrupts for a given queue share the + * same vector. Vector zero is reserved for the admin queue. */ - ASSERT(i40e->i40e_intr_count == i40e->i40e_num_trqpairs + 1); + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { + uint_t vector = i % (i40e->i40e_intr_count - 1); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e->i40e_trqpairs[i].itrq_rx_intrvec = i + 1; - i40e->i40e_trqpairs[i].itrq_tx_intrvec = i + 1; + i40e->i40e_trqpairs[i].itrq_rx_intrvec = vector + 1; + i40e->i40e_trqpairs[i].itrq_tx_intrvec = vector + 1; } return (B_TRUE); @@ -1923,89 +2005,251 @@ i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw) } /* - * Configure the hardware for the Virtual Station Interface (VSI). Currently - * we only support one, but in the future we could instantiate more than one - * per attach-point. + * Set the properties which have common values across all the VSIs. + * Consult the "Add VSI" command section (7.4.9.5.5.1) for a + * complete description of these properties. */ -static boolean_t -i40e_config_vsi(i40e_t *i40e, i40e_hw_t *hw) +static void +i40e_set_shared_vsi_props(i40e_t *i40e, + struct i40e_aqc_vsi_properties_data *info, uint_t vsi_idx) { - struct i40e_vsi_context context; - int err, tc_queues; - - bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; - context.pf_num = hw->pf_id; - err = i40e_aq_get_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "get VSI params failed with %d", err); - return (B_FALSE); - } + uint_t tc_queues; + uint16_t vsi_qp_base; - i40e->i40e_vsi_num = context.vsi_number; + /* + * It's important that we use bitwise-OR here; callers to this + * function might enable other sections before calling this + * function. + */ + info->valid_sections |= LE_16(I40E_AQ_VSI_PROP_QUEUE_MAP_VALID | + I40E_AQ_VSI_PROP_VLAN_VALID); /* - * Set the queue and traffic class bits. Keep it simple for now. + * Calculate the starting QP index for this VSI. This base is + * relative to the PF queue space; so a value of 0 for PF#1 + * represents the absolute index PFLAN_QALLOC_FIRSTQ for PF#1. */ - context.info.valid_sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; - context.info.mapping_flags = I40E_AQ_VSI_QUE_MAP_CONTIG; - context.info.queue_mapping[0] = I40E_ASSIGN_ALL_QUEUES; + vsi_qp_base = vsi_idx * i40e->i40e_num_trqpairs_per_vsi; + info->mapping_flags = LE_16(I40E_AQ_VSI_QUE_MAP_CONTIG); + info->queue_mapping[0] = + LE_16((vsi_qp_base << I40E_AQ_VSI_QUEUE_SHIFT) & + I40E_AQ_VSI_QUEUE_MASK); /* - * tc_queues determines the size of the traffic class, where the size is - * 2^^tc_queues to a maximum of 64 for the X710 and 128 for the X722. + * tc_queues determines the size of the traffic class, where + * the size is 2^^tc_queues to a maximum of 64 for the X710 + * and 128 for the X722. * * Some examples: - * i40e_num_trqpairs == 1 => tc_queues = 0, 2^^0 = 1. - * i40e_num_trqpairs == 7 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 8 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 9 => tc_queues = 4, 2^^4 = 16. - * i40e_num_trqpairs == 17 => tc_queues = 5, 2^^5 = 32. - * i40e_num_trqpairs == 64 => tc_queues = 6, 2^^6 = 64. + * i40e_num_trqpairs_per_vsi == 1 => tc_queues = 0, 2^^0 = 1. 
+ * i40e_num_trqpairs_per_vsi == 7 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 8 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 9 => tc_queues = 4, 2^^4 = 16. + * i40e_num_trqpairs_per_vsi == 17 => tc_queues = 5, 2^^5 = 32. + * i40e_num_trqpairs_per_vsi == 64 => tc_queues = 6, 2^^6 = 64. */ - tc_queues = ddi_fls(i40e->i40e_num_trqpairs - 1); + tc_queues = ddi_fls(i40e->i40e_num_trqpairs_per_vsi - 1); - context.info.tc_mapping[0] = ((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & - I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | - ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & - I40E_AQ_VSI_TC_QUE_NUMBER_MASK); + /* + * The TC queue mapping is in relation to the VSI queue space. + * Since we are only using one traffic class (TC0) we always + * start at queue offset 0. + */ + info->tc_mapping[0] = + LE_16(((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & + I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | + ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & + I40E_AQ_VSI_TC_QUE_NUMBER_MASK)); - context.info.valid_sections |= I40E_AQ_VSI_PROP_VLAN_VALID; - context.info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | + /* + * I40E_AQ_VSI_PVLAN_MODE_ALL ("VLAN driver insertion mode") + * + * Allow tagged and untagged packets to be sent to this + * VSI from the host. + * + * I40E_AQ_VSI_PVLAN_EMOD_NOTHING ("VLAN and UP expose mode") + * + * Leave the tag on the frame and place no VLAN + * information in the descriptor. We want this mode + * because our MAC layer will take care of the VLAN tag, + * if there is one. + */ + info->port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | I40E_AQ_VSI_PVLAN_EMOD_NOTHING; +} - context.flags = LE16_TO_CPU(I40E_AQ_VSI_TYPE_PF); +/* + * Delete the VSI at this index, if one exists. We assume there is no + * action we can take if this command fails but to log the failure. + */ +static void +i40e_delete_vsi(i40e_t *i40e, uint_t idx) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint16_t seid = i40e->i40e_vsis[idx].iv_seid; - i40e->i40e_vsi_stat_id = LE16_TO_CPU(context.info.stat_counter_idx); - if (i40e_stat_vsi_init(i40e) == B_FALSE) - return (B_FALSE); + if (seid != 0) { + int rc; - err = i40e_aq_update_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "Update VSI params failed with %d", err); + rc = i40e_aq_delete_element(hw, seid, NULL); + + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VSI %d: %d", + rc, hw->aq.asq_last_status); + } + + i40e->i40e_vsis[idx].iv_seid = 0; + } +} + +/* + * Add a new VSI. + */ +static boolean_t +i40e_add_vsi(i40e_t *i40e, i40e_hw_t *hw, uint_t idx) +{ + struct i40e_vsi_context ctx; + i40e_rx_group_t *rxg; + int rc; + + /* + * The default VSI is created by the controller. This function + * creates new, non-defualt VSIs only. 
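/*
 * Illustrative sketch (editor's addition, not part of the diff): the
 * queue-map arithmetic used by i40e_set_shared_vsi_props() above, in
 * standalone user-space form.  ddi_fls() is modeled here as the
 * 1-indexed position of the highest set bit, returning 0 for an input
 * of 0; the per-VSI queue counts are examples only.
 */
#include <stdio.h>

static unsigned int
bit_fls(unsigned int x)
{
	unsigned int pos = 0;

	while (x != 0) {
		pos++;
		x >>= 1;
	}
	return (pos);
}

int
main(void)
{
	unsigned int qps[] = { 1, 7, 8, 9, 17, 64 };
	unsigned int n = sizeof (qps) / sizeof (qps[0]);

	/* Reproduce the tc_queues examples from the comment above. */
	for (unsigned int i = 0; i < n; i++) {
		unsigned int tc_queues = bit_fls(qps[i] - 1);

		printf("qps/VSI %2u -> tc_queues %u (TC size %u)\n",
		    qps[i], tc_queues, 1u << tc_queues);
	}

	/* The QP base is relative to the PF queue space. */
	for (unsigned int vsi_idx = 0; vsi_idx < 4; vsi_idx++)
		printf("VSI %u -> QP base %u (assuming 8 qps per VSI)\n",
		    vsi_idx, vsi_idx * 8);

	return (0);
}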
+ */ + ASSERT3U(idx, !=, 0); + + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.uplink_seid = i40e->i40e_veb_seid; + ctx.pf_num = hw->pf_id; + ctx.flags = I40E_AQ_VSI_TYPE_PF; + ctx.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + i40e_set_shared_vsi_props(i40e, &ctx.info, idx); + + rc = i40e_aq_add_vsi(hw, &ctx, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_vsi() failed %d: %d", rc, + hw->aq.asq_last_status); return (B_FALSE); } + rxg = &i40e->i40e_rx_groups[idx]; + rxg->irg_vsi_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_number = ctx.vsi_number; + i40e->i40e_vsis[idx].iv_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + + if (i40e_stat_vsi_init(i40e, idx) == B_FALSE) + return (B_FALSE); return (B_TRUE); } /* - * Configure the RSS key. For the X710 controller family, this is set on a - * per-PF basis via registers. For the X722, this is done on a per-VSI basis - * through the admin queue. + * Configure the hardware for the Default Virtual Station Interface (VSI). */ static boolean_t -i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +i40e_config_def_vsi(i40e_t *i40e, i40e_hw_t *hw) { - uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + struct i40e_vsi_context ctx; + i40e_rx_group_t *def_rxg; + int err; + struct i40e_aqc_remove_macvlan_element_data filt; - (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.seid = I40E_DEF_VSI_SEID(i40e); + ctx.pf_num = hw->pf_id; + err = i40e_aq_get_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "get VSI params failed with %d", err); + return (B_FALSE); + } - if (i40e_is_x722(i40e)) { + ctx.info.valid_sections = 0; + i40e->i40e_vsis[0].iv_number = ctx.vsi_number; + i40e->i40e_vsis[0].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + if (i40e_stat_vsi_init(i40e, 0) == B_FALSE) + return (B_FALSE); + + i40e_set_shared_vsi_props(i40e, &ctx.info, I40E_DEF_VSI_IDX); + + err = i40e_aq_update_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "Update VSI params failed with %d", err); + return (B_FALSE); + } + + def_rxg = &i40e->i40e_rx_groups[0]; + def_rxg->irg_vsi_seid = I40E_DEF_VSI_SEID(i40e); + + /* + * The controller places an implicit L2 filter for the primary + * MAC pointing to the default VSI. We remove this filter to + * prevent duplicate delivery of packets destined for the + * primary MAC address as DLS will create the same filter on a + * non-default VSI for the primary MAC client. + */ + bzero(&filt, sizeof (filt)); + bcopy(hw->mac.port_addr, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH; + filt.vlan_tag = 0; + + + ASSERT3U(i40e->i40e_resources.ifr_nmacfilt_used, <=, 1); + + err = i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, + NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "Failed to remove primary MAC from default VSI" + ": %d (%d)", err, hw->aq.asq_last_status); + return (B_FALSE); + } + + /* + * As mentioned above, the controller created an implicit L2 + * filter for the primary MAC. We want to remove both the + * filter and decrement the filter count. However, not all + * controllers count this implicit filter against the total + * MAC filter count. So here we are making sure it is either + * one or zero. If it is one, then we know it is for the + * implicit filter and we should decrement since we just + * removed the filter above. 
If it is zero then we know the + * controller that does not count the implicit filter, and it + * was enough to just remove it; we leave the count alone. + * But if it is neither, then we have never seen a controller + * like this before and we should fail to attach. + * + * It is unfortunate that this code must exist but the + * behavior of this implicit L2 filter and its corresponding + * count were dicovered through empirical testing. The + * programming manuals hint at this filter but do not + * explicitly call out the exact behavior. + */ + if (i40e->i40e_resources.ifr_nmacfilt_used == 1) { + i40e->i40e_resources.ifr_nmacfilt_used--; + } else { + if (i40e->i40e_resources.ifr_nmacfilt_used != 0) { + i40e_error(i40e, "Unexpected MAC filter count: %u" + " (expected 0)", + i40e->i40e_resources.ifr_nmacfilt_used); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static boolean_t +i40e_config_rss_key_x722(i40e_t *i40e, i40e_hw_t *hw) +{ + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; struct i40e_aqc_get_set_rss_key_data key; - const char *u8seed = (char *)seed; + const char *u8seed; enum i40e_status_code status; + uint16_t vsi_number = i40e->i40e_vsis[i].iv_number; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + u8seed = (char *)seed; CTASSERT(sizeof (key) >= (sizeof (key.standard_rss_key) + sizeof (key.extended_hash_key))); @@ -2015,14 +2259,35 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) bcopy(&u8seed[sizeof (key.standard_rss_key)], key.extended_hash_key, sizeof (key.extended_hash_key)); - status = i40e_aq_set_rss_key(hw, i40e->i40e_vsi_num, &key); + ASSERT3U(vsi_number, !=, 0); + status = i40e_aq_set_rss_key(hw, vsi_number, &key); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set rss key: %d", status); + i40e_error(i40e, "failed to set RSS key for VSI %u: %d", + vsi_number, status); return (B_FALSE); } + } + + return (B_TRUE); +} + +/* + * Configure the RSS key. For the X710 controller family, this is set on a + * per-PF basis via registers. For the X722, this is done on a per-VSI basis + * through the admin queue. + */ +static boolean_t +i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +{ + if (i40e_is_x722(i40e)) { + if (!i40e_config_rss_key_x722(i40e, hw)) + return (B_FALSE); } else { - uint_t i; - for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + for (uint_t i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) i40e_write_rx_ctl(hw, I40E_PFQF_HKEY(i), seed[i]); } @@ -2034,11 +2299,12 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) * family, with the X722 using a known 7-bit width. On the X710 controller, this * is programmed through its control registers where as on the X722 this is * configured through the admin queue. Also of note, the X722 allows the LUT to - * be set on a per-PF or VSI basis. At this time, as we only have a single VSI, - * we use the PF setting as it is the primary VSI. + * be set on a per-PF or VSI basis. At this time we use the PF setting. If we + * decide to use the per-VSI LUT in the future, then we will need to modify the + * i40e_add_vsi() function to set the RSS LUT bits in the queueing section. * * We populate the LUT in a round robin fashion with the rx queue indices from 0 - * to i40e_num_trqpairs - 1. + * to i40e_num_trqpairs_per_vsi - 1. 
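/*
 * Illustrative sketch (editor's addition, not part of the diff): the
 * round-robin LUT fill described above, assuming a 512-entry table,
 * 8 rx queues per VSI, and a 7-bit entry width purely for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define	EXAMPLE_HLUT_SIZE	512
#define	EXAMPLE_QPS_PER_VSI	8

int
main(void)
{
	uint8_t hlut[EXAMPLE_HLUT_SIZE];
	uint8_t lut_mask = (1 << 7) - 1;	/* example 7-bit entry width */

	/* Each entry is an rx queue index, assigned round robin. */
	for (unsigned int i = 0; i < EXAMPLE_HLUT_SIZE; i++)
		hlut[i] = (i % EXAMPLE_QPS_PER_VSI) & lut_mask;

	/* The pattern simply repeats 0..7 across the table. */
	for (unsigned int i = 0; i < 16; i++)
		printf("hlut[%u] = %u\n", i, hlut[i]);

	return (0);
}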
*/ static boolean_t i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) @@ -2068,15 +2334,20 @@ i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) lut_mask = (1 << hw->func_caps.rss_table_entry_width) - 1; } - for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) - ((uint8_t *)hlut)[i] = (i % i40e->i40e_num_trqpairs) & lut_mask; + for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) { + ((uint8_t *)hlut)[i] = + (i % i40e->i40e_num_trqpairs_per_vsi) & lut_mask; + } if (i40e_is_x722(i40e)) { enum i40e_status_code status; - status = i40e_aq_set_rss_lut(hw, i40e->i40e_vsi_num, B_TRUE, - (uint8_t *)hlut, I40E_HLUT_TABLE_SIZE); + + status = i40e_aq_set_rss_lut(hw, 0, B_TRUE, (uint8_t *)hlut, + I40E_HLUT_TABLE_SIZE); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set RSS LUT: %d", status); + i40e_error(i40e, "failed to set RSS LUT %d: %d", + status, hw->aq.asq_last_status); goto out; } } else { @@ -2188,8 +2459,34 @@ i40e_chip_start(i40e_t *i40e) i40e_intr_chip_init(i40e); - if (!i40e_config_vsi(i40e, hw)) + rc = i40e_get_mac_seid(i40e); + if (rc == -1) { + i40e_error(i40e, "failed to obtain MAC Uplink SEID"); return (B_FALSE); + } + i40e->i40e_mac_seid = (uint16_t)rc; + + /* + * Create a VEB in order to support multiple VSIs. Each VSI + * functions as a MAC group. This call sets the PF's MAC as + * the uplink port and the PF's default VSI as the default + * downlink port. + */ + rc = i40e_aq_add_veb(hw, i40e->i40e_mac_seid, I40E_DEF_VSI_SEID(i40e), + 0x1, B_TRUE, &i40e->i40e_veb_seid, B_FALSE, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_veb() failed %d: %d", rc, + hw->aq.asq_last_status); + return (B_FALSE); + } + + if (!i40e_config_def_vsi(i40e, hw)) + return (B_FALSE); + + for (uint_t i = 1; i < i40e->i40e_num_rx_groups; i++) { + if (!i40e_add_vsi(i40e, hw, i)) + return (B_FALSE); + } if (!i40e_config_rss(i40e, hw)) return (B_FALSE); @@ -2549,7 +2846,7 @@ i40e_setup_tx_hmc(i40e_trqpair_t *itrq) * assigned to traffic class zero, because we don't actually use them. */ bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; + context.seid = I40E_DEF_VSI_SEID(i40e); context.pf_num = hw->pf_id; err = i40e_aq_get_vsi_params(hw, &context, NULL); if (err != I40E_SUCCESS) { @@ -2653,7 +2950,8 @@ i40e_setup_tx_rings(i40e_t *i40e) void i40e_stop(i40e_t *i40e, boolean_t free_allocations) { - int i; + uint_t i; + i40e_hw_t *hw = &i40e->i40e_hw_space; ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); @@ -2689,6 +2987,27 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) delay(50 * drv_usectohz(1000)); + /* + * We don't delete the default VSI because it replaces the VEB + * after VEB deletion (see the "Delete Element" section). + * Furthermore, since the default VSI is provided by the + * firmware, we never attempt to delete it. 
+ */ + for (i = 1; i < i40e->i40e_num_rx_groups; i++) { + i40e_delete_vsi(i40e, i); + } + + if (i40e->i40e_veb_seid != 0) { + int rc = i40e_aq_delete_element(hw, i40e->i40e_veb_seid, NULL); + + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VEB %d: %d", rc, + hw->aq.asq_last_status); + } + + i40e->i40e_veb_seid = 0; + } + i40e_intr_chip_fini(i40e); for (i = 0; i < i40e->i40e_num_trqpairs; i++) { @@ -2718,7 +3037,9 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) mutex_exit(&i40e->i40e_trqpairs[i].itrq_tx_lock); } - i40e_stat_vsi_fini(i40e); + for (i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_stat_vsi_fini(i40e, i); + } i40e->i40e_link_speed = 0; i40e->i40e_link_duplex = 0; @@ -2783,7 +3104,8 @@ i40e_start(i40e_t *i40e, boolean_t alloc) * Enable broadcast traffic; however, do not enable multicast traffic. * That's handle exclusively through MAC's mc_multicst routines. */ - err = i40e_aq_set_vsi_broadcast(hw, i40e->i40e_vsi_id, B_TRUE, NULL); + err = i40e_aq_set_vsi_broadcast(hw, I40E_DEF_VSI_SEID(i40e), B_TRUE, + NULL); if (err != I40E_SUCCESS) { i40e_error(i40e, "failed to set default VSI: %d", err); rc = B_FALSE; diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c index 7a4f0faedd..e40c9f2c53 100644 --- a/usr/src/uts/common/io/i40e/i40e_stats.c +++ b/usr/src/uts/common/io/i40e/i40e_stats.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include "i40e_sw.h" @@ -69,12 +69,7 @@ * --------------------- * * The hardware keeps statistics at each physical function/MAC (PF) and it keeps - * statistics on each virtual station interface (VSI). Currently we only use one - * VSI per PF (see the i40e_main.c theory statement). The hardware has a limited - * number of statistics units available. While every PF is guaranteed to have a - * statistics unit, it is possible that we will run out for a given VSI. We'll - * have to figure out an appropriate strategy here when we end up supporting - * multiple VSIs. + * statistics on each virtual station interface (VSI). * * The hardware keeps these statistics as 32-bit and 48-bit counters. We are * required to read them and then compute the differences between them. The @@ -100,10 +95,10 @@ * data. * * The pf kstats data is stored in the i40e_t`i40e_pf_kstat. It is backed by the - * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstat is in - * i40e_t`i40e_vsi_kstat and the data is backed in the i40e_t`i40e_vsi_stat. All - * of this data is protected by the i40e_stat_lock, which should be taken last, - * when acquiring locks. + * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstats are in + * i40e_t`i40e_vsis[idx].iv_kstats and the data is backed in the + * i40e_t`i40e_vsis[idx].iv_stats. All of this data is protected by the + * i40e_stat_lock, which should be taken last, when acquiring locks. 
*/ static void @@ -169,15 +164,15 @@ i40e_stat_get_uint32(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat, } static void -i40e_stat_vsi_update(i40e_t *i40e, boolean_t init) +i40e_stat_vsi_update(i40e_t *i40e, uint_t idx, boolean_t init) { i40e_vsi_stats_t *ivs; i40e_vsi_kstats_t *ivk; - int id = i40e->i40e_vsi_stat_id; + uint16_t id = i40e->i40e_vsis[idx].iv_stats_id; - ASSERT(i40e->i40e_vsi_kstat != NULL); - ivs = &i40e->i40e_vsi_stat; - ivk = i40e->i40e_vsi_kstat->ks_data; + ASSERT3P(i40e->i40e_vsis[idx].iv_kstats, !=, NULL); + ivs = &i40e->i40e_vsis[idx].iv_stats; + ivk = i40e->i40e_vsis[idx].iv_kstats->ks_data; mutex_enter(&i40e->i40e_stat_lock); @@ -231,39 +226,41 @@ i40e_stat_vsi_kstat_update(kstat_t *ksp, int rw) return (EACCES); i40e = ksp->ks_private; - i40e_stat_vsi_update(i40e, B_FALSE); + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) + i40e_stat_vsi_update(i40e, i, B_FALSE); + return (0); } void -i40e_stat_vsi_fini(i40e_t *i40e) +i40e_stat_vsi_fini(i40e_t *i40e, uint_t idx) { - if (i40e->i40e_vsi_kstat != NULL) { - kstat_delete(i40e->i40e_vsi_kstat); - i40e->i40e_vsi_kstat = NULL; + if (i40e->i40e_vsis[idx].iv_kstats != NULL) { + kstat_delete(i40e->i40e_vsis[idx].iv_kstats); + i40e->i40e_vsis[idx].iv_kstats = NULL; } } boolean_t -i40e_stat_vsi_init(i40e_t *i40e) +i40e_stat_vsi_init(i40e_t *i40e, uint_t idx) { kstat_t *ksp; i40e_vsi_kstats_t *ivk; char buf[64]; + uint16_t vsi_id = i40e->i40e_vsis[idx].iv_seid; - (void) snprintf(buf, sizeof (buf), "vsi_%d", i40e->i40e_vsi_id); + (void) snprintf(buf, sizeof (buf), "vsi_%u", vsi_id); ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip), buf, "net", KSTAT_TYPE_NAMED, sizeof (i40e_vsi_kstats_t) / sizeof (kstat_named_t), 0); if (ksp == NULL) { - i40e_error(i40e, "Failed to create kstats for VSI %d", - i40e->i40e_vsi_id); + i40e_error(i40e, "Failed to create kstats for VSI %u", vsi_id); return (B_FALSE); } - i40e->i40e_vsi_kstat = ksp; + i40e->i40e_vsis[idx].iv_kstats = ksp; ivk = ksp->ks_data; ksp->ks_update = i40e_stat_vsi_kstat_update; ksp->ks_private = i40e; @@ -291,9 +288,9 @@ i40e_stat_vsi_init(i40e_t *i40e) kstat_named_init(&ivk->ivk_tx_errors, "tx_errors", KSTAT_DATA_UINT64); - bzero(&i40e->i40e_vsi_stat, sizeof (i40e_vsi_stats_t)); - i40e_stat_vsi_update(i40e, B_TRUE); - kstat_install(i40e->i40e_vsi_kstat); + bzero(&i40e->i40e_vsis[idx].iv_stats, sizeof (i40e_vsi_stats_t)); + i40e_stat_vsi_update(i40e, idx, B_TRUE); + kstat_install(i40e->i40e_vsis[idx].iv_kstats); return (B_TRUE); } @@ -670,7 +667,12 @@ i40e_stat_pf_init(i40e_t *i40e) void i40e_stats_fini(i40e_t *i40e) { - ASSERT(i40e->i40e_vsi_kstat == NULL); +#ifdef DEBUG + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + ASSERT3P(i40e->i40e_vsis[i].iv_kstats, ==, NULL); + } +#endif + if (i40e->i40e_pf_kstat != NULL) { kstat_delete(i40e->i40e_pf_kstat); i40e->i40e_pf_kstat = NULL; @@ -1230,6 +1232,12 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq) kstat_named_init(&tsp->itxs_recycled, "tx_recycled", KSTAT_DATA_UINT64); tsp->itxs_recycled.value.ui64 = 0; + kstat_named_init(&tsp->itxs_force_copy, "tx_force_copy", + KSTAT_DATA_UINT64); + tsp->itxs_force_copy.value.ui64 = 0; + kstat_named_init(&tsp->itxs_tso_force_copy, "tx_tso_force_copy", + KSTAT_DATA_UINT64); + tsp->itxs_tso_force_copy.value.ui64 = 0; kstat_named_init(&tsp->itxs_hck_meoifail, "tx_hck_meoifail", KSTAT_DATA_UINT64); @@ -1249,6 +1257,15 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq) kstat_named_init(&tsp->itxs_hck_badl4, "tx_hck_badl4", KSTAT_DATA_UINT64); 
tsp->itxs_hck_badl4.value.ui64 = 0; + kstat_named_init(&tsp->itxs_lso_nohck, "tx_lso_nohck", + KSTAT_DATA_UINT64); + tsp->itxs_lso_nohck.value.ui64 = 0; + kstat_named_init(&tsp->itxs_bind_fails, "tx_bind_fails", + KSTAT_DATA_UINT64); + tsp->itxs_bind_fails.value.ui64 = 0; + kstat_named_init(&tsp->itxs_tx_short, "tx_short", + KSTAT_DATA_UINT64); + tsp->itxs_tx_short.value.ui64 = 0; kstat_named_init(&tsp->itxs_err_notcb, "tx_err_notcb", KSTAT_DATA_UINT64); tsp->itxs_err_notcb.value.ui64 = 0; diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h index 78aced0144..e7b64c2160 100644 --- a/usr/src/uts/common/io/i40e/i40e_sw.h +++ b/usr/src/uts/common/io/i40e/i40e_sw.h @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -152,9 +152,10 @@ typedef enum i40e_itr_index { } i40e_itr_index_t; /* - * Table 1-5 of the PRM notes that LSO supports up to 256 KB. + * The hardware claims to support LSO up to 256 KB, but due to the limitations + * imposed by the IP header for non-jumbo frames, we cap it at 64 KB. */ -#define I40E_LSO_MAXLEN (256 * 1024) +#define I40E_LSO_MAXLEN (64 * 1024) #define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */ #define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */ @@ -173,13 +174,22 @@ typedef enum i40e_itr_index { #define I40E_BUF_IPHDR_ALIGNMENT 2 /* - * The XL710 controller has a limit of eight buffers being allowed to be used - * for the transmission of a single frame. This is defined in 8.4.1 - Transmit + * The XL710 controller has a total of eight buffers available for the + * transmission of any single frame. This is defined in 8.4.1 - Transmit * Packet in System Memory. */ #define I40E_TX_MAX_COOKIE 8 /* + * An LSO frame can be as large as 64KB, so we allow a DMA bind to span more + * cookies than a non-LSO frame. The key here to is to select a value such + * that once the HW has chunked up the LSO frame into MSS-sized segments that no + * single segment spans more than 8 cookies (see comments for + * I40E_TX_MAX_COOKIE) + */ +#define I40E_TX_LSO_MAX_COOKIE 32 + +/* * Sizing to determine the amount of available descriptors at which we'll * consider ourselves blocked. Also, when we have these available, we'll then * consider ourselves available to transmit to MAC again. Strictly speaking, the @@ -203,6 +213,12 @@ typedef enum i40e_itr_index { #define I40E_MAX_TX_DMA_THRESH INT32_MAX /* + * The max size of each individual tx buffer is 16KB - 1. + * See table 8-17 + */ +#define I40E_MAX_TX_BUFSZ 0x0000000000003FFFull + +/* * Resource sizing counts. There are various aspects of hardware where we may * have some variable number of elements that we need to handle. Such as the * hardware capabilities and switch capacities. We cannot know a priori how many @@ -240,21 +256,6 @@ typedef enum i40e_itr_index { #define I40E_HMC_TX_TPH_DISABLE 0 /* - * Whenever we establish and create a VSI, we need to assign some number of - * queues that it's allowed to access from the PF. Because we only have a single - * VSI per PF at this time, we assign it all the queues. - * - * Many of the devices support what's called Data-center Bridging. Which is a - * feature that we don't have much use of at this time. However, we still need - * to fill in this information. We follow the guidance of the note in Table 7-80 - * which talks about bytes 62-77. 
It says that if we don't want to assign - * anything to traffic classes, we should set the field to zero. Effectively - * this means that everything in the system is assigned to traffic class zero. - */ -#define I40E_ASSIGN_ALL_QUEUES 0 -#define I40E_TRAFFIC_CLASS_NO_QUEUES 0 - -/* * This defines the error mask that we care about from rx descriptors. Currently * we're only concerned with the general errors and oversize errors. */ @@ -268,12 +269,12 @@ typedef enum i40e_itr_index { #define I40E_DDI_PROP_LEN 64 /* - * We currently consolidate some overrides that we use in the code here. These - * will be gone in the fullness of time, but as we're bringing up the device, - * this is what we use. + * Place an artificial limit on the max number of groups. The X710 + * series supports up to 384 VSIs to be partitioned across PFs as the + * driver sees fit. But until we support more interrupts this seems + * like a good place to start. */ -#define I40E_GROUP_MAX 1 -#define I40E_TRQPAIR_MAX 1 +#define I40E_GROUP_MAX 32 #define I40E_GROUP_NOMSIX 1 #define I40E_TRQPAIR_NOMSIX 1 @@ -405,18 +406,29 @@ typedef struct i40e_rx_control_block { typedef enum { I40E_TX_NONE, I40E_TX_COPY, - I40E_TX_DMA + I40E_TX_DMA, + I40E_TX_DESC, } i40e_tx_type_t; typedef struct i40e_tx_desc i40e_tx_desc_t; +typedef struct i40e_tx_context_desc i40e_tx_context_desc_t; typedef union i40e_32byte_rx_desc i40e_rx_desc_t; +struct i40e_dma_bind_info { + caddr_t dbi_paddr; + size_t dbi_len; +}; + typedef struct i40e_tx_control_block { struct i40e_tx_control_block *tcb_next; mblk_t *tcb_mp; i40e_tx_type_t tcb_type; ddi_dma_handle_t tcb_dma_handle; + ddi_dma_handle_t tcb_lso_dma_handle; i40e_dma_buffer_t tcb_dma; + struct i40e_dma_bind_info *tcb_bind_info; + uint_t tcb_bind_ncookies; + boolean_t tcb_used_lso; } i40e_tx_control_block_t; /* @@ -517,6 +529,8 @@ typedef struct i40e_txq_stat { kstat_named_t itxs_packets; /* Packets out on queue */ kstat_named_t itxs_descriptors; /* Descriptors issued */ kstat_named_t itxs_recycled; /* Descriptors reclaimed */ + kstat_named_t itxs_force_copy; /* non-TSO force copy */ + kstat_named_t itxs_tso_force_copy; /* TSO force copy */ /* * Various failure conditions. */ @@ -526,6 +540,9 @@ typedef struct i40e_txq_stat { kstat_named_t itxs_hck_nol4info; /* Missing l4 info */ kstat_named_t itxs_hck_badl3; /* Not IPv4/IPv6 */ kstat_named_t itxs_hck_badl4; /* Bad L4 Paylaod */ + kstat_named_t itxs_lso_nohck; /* Missing offloads for LSO */ + kstat_named_t itxs_bind_fails; /* DMA bind failures */ + kstat_named_t itxs_tx_short; /* Tx chain too short */ kstat_named_t itxs_err_notcb; /* No tcb's available */ kstat_named_t itxs_err_nodescs; /* No tcb's available */ @@ -761,6 +778,25 @@ typedef struct i40e_func_rsrc { uint_t ifr_nmcastfilt_used; } i40e_func_rsrc_t; +typedef struct i40e_vsi { + uint16_t iv_seid; + uint16_t iv_number; + kstat_t *iv_kstats; + i40e_vsi_stats_t iv_stats; + uint16_t iv_stats_id; +} i40e_vsi_t; + +/* + * While irg_index and irg_grp_hdl aren't used anywhere, they are + * still useful for debugging. + */ +typedef struct i40e_rx_group { + uint32_t irg_index; /* index in i40e_rx_groups[] */ + uint16_t irg_vsi_seid; /* SEID of VSI for this group */ + mac_group_handle_t irg_grp_hdl; /* handle to mac_group_t */ + struct i40e *irg_i40e; /* ref to i40e_t */ +} i40e_rx_group_t; + /* * Main i40e per-instance state. 
*/ @@ -789,11 +825,18 @@ typedef struct i40e { struct i40e_aq_get_phy_abilities_resp i40e_phy; void *i40e_aqbuf; +#define I40E_DEF_VSI_IDX 0 +#define I40E_DEF_VSI(i40e) ((i40e)->i40e_vsis[I40E_DEF_VSI_IDX]) +#define I40E_DEF_VSI_SEID(i40e) (I40E_DEF_VSI(i40e).iv_seid) + /* * Device state, switch information, and resources. */ - int i40e_vsi_id; - uint16_t i40e_vsi_num; + i40e_vsi_t i40e_vsis[I40E_GROUP_MAX]; + uint16_t i40e_mac_seid; /* SEID of physical MAC */ + uint16_t i40e_veb_seid; /* switch atop MAC (SEID) */ + uint16_t i40e_vsi_avail; /* VSIs avail to this PF */ + uint16_t i40e_vsi_used; /* VSIs used by this PF */ struct i40e_device *i40e_device; i40e_func_rsrc_t i40e_resources; uint16_t i40e_switch_rsrc_alloc; @@ -814,12 +857,13 @@ typedef struct i40e { */ i40e_trqpair_t *i40e_trqpairs; boolean_t i40e_mr_enable; - int i40e_num_trqpairs; + uint_t i40e_num_trqpairs; /* total TRQPs (per PF) */ + uint_t i40e_num_trqpairs_per_vsi; /* TRQPs per VSI */ uint_t i40e_other_itr; - int i40e_num_rx_groups; + i40e_rx_group_t *i40e_rx_groups; + uint_t i40e_num_rx_groups; int i40e_num_rx_descs; - mac_group_handle_t i40e_rx_group_handle; uint32_t i40e_rx_ring_size; uint32_t i40e_rx_buf_size; boolean_t i40e_rx_hcksum_enable; @@ -832,6 +876,7 @@ typedef struct i40e { uint32_t i40e_tx_buf_size; uint32_t i40e_tx_block_thresh; boolean_t i40e_tx_hcksum_enable; + boolean_t i40e_tx_lso_enable; uint32_t i40e_tx_dma_min; uint_t i40e_tx_itr; @@ -855,6 +900,7 @@ typedef struct i40e { */ ddi_dma_attr_t i40e_static_dma_attr; ddi_dma_attr_t i40e_txbind_dma_attr; + ddi_dma_attr_t i40e_txbind_lso_dma_attr; ddi_device_acc_attr_t i40e_desc_acc_attr; ddi_device_acc_attr_t i40e_buf_acc_attr; @@ -872,10 +918,7 @@ typedef struct i40e { */ kmutex_t i40e_stat_lock; kstat_t *i40e_pf_kstat; - kstat_t *i40e_vsi_kstat; i40e_pf_stats_t i40e_pf_stat; - i40e_vsi_stats_t i40e_vsi_stat; - uint16_t i40e_vsi_stat_id; /* * Misc. stats and counters that should maybe one day be kstats. @@ -975,8 +1018,8 @@ extern void i40e_tx_cleanup_ring(i40e_trqpair_t *); */ extern boolean_t i40e_stats_init(i40e_t *); extern void i40e_stats_fini(i40e_t *); -extern boolean_t i40e_stat_vsi_init(i40e_t *); -extern void i40e_stat_vsi_fini(i40e_t *); +extern boolean_t i40e_stat_vsi_init(i40e_t *, uint_t); +extern void i40e_stat_vsi_fini(i40e_t *, uint_t); extern boolean_t i40e_stats_trqpair_init(i40e_trqpair_t *); extern void i40e_stats_trqpair_fini(i40e_trqpair_t *); extern int i40e_m_stat(void *, uint_t, uint64_t *); diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c index 57620f03fa..caafa3e102 100644 --- a/usr/src/uts/common/io/i40e/i40e_transceiver.c +++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include "i40e_sw.h" @@ -60,19 +60,19 @@ * This size is then rounded up to the nearest 1k chunk, which represents the * actual amount of memory that we'll allocate for a single frame. * - * Note, that for rx, we do something that might be unexpected. We always add + * Note, that for RX, we do something that might be unexpected. We always add * an extra two bytes to the frame size that we allocate. We then offset the DMA * address that we receive a packet into by two bytes. 
This ensures that the IP * header will always be 4 byte aligned because the MAC header is either 14 or * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's * and MAC's lives easier. * - * Both the rx and tx descriptor rings (which are what we use to communicate + * Both the RX and TX descriptor rings (which are what we use to communicate * with hardware) are allocated as a single region of DMA memory which is the * size of the descriptor (4 bytes and 2 bytes respectively) times the total - * number of descriptors for an rx and tx ring. + * number of descriptors for an RX and TX ring. * - * While the rx and tx descriptors are allocated using DMA-based memory, the + * While the RX and TX descriptors are allocated using DMA-based memory, the * control blocks for each of them are allocated using normal kernel memory. * They aren't special from a DMA perspective. We'll go over the design of both * receiving and transmitting separately, as they have slightly different @@ -113,16 +113,16 @@ * * To try and ensure that the device always has blocks that it can receive data * into, we maintain two lists of control blocks, a working list and a free - * list. Each list is sized equal to the number of descriptors in the rx ring. - * During the GLDv3 mc_start routine, we allocate a number of rx control blocks + * list. Each list is sized equal to the number of descriptors in the RX ring. + * During the GLDv3 mc_start routine, we allocate a number of RX control blocks * equal to twice the number of descriptors in the ring and we assign them * equally to the free list and to the working list. Each control block also has * DMA memory allocated and associated with which it will be used to receive the * actual packet data. All of a received frame's data will end up in a single * DMA buffer. * - * During operation, we always maintain the invariant that each rx descriptor - * has an associated rx control block which lives in the working list. If we + * During operation, we always maintain the invariant that each RX descriptor + * has an associated RX control block which lives in the working list. If we * feel that we should loan up DMA memory to MAC in the form of a message block, * we can only do so if we can maintain this invariant. To do that, we swap in * one of the buffers from the free list. If none are available, then we resort @@ -130,14 +130,14 @@ * size. * * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is - * called on the block, at which point we restore the rx control block to the + * called on the block, at which point we restore the RX control block to the * free list and are able to reuse the DMA memory again. While the scheme may * seem odd, it importantly keeps us out of trying to do any DMA allocations in * the normal path of operation, even though we may still have to allocate * message blocks and copy. * - * The following state machine describes the life time of a rx control block. In - * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx + * The following state machine describes the life time of a RX control block. In + * the diagram we abbrviate the RX ring descriptor entry as rxd and the rx * control block entry as rcb. * * | | @@ -160,11 +160,11 @@ * +--------------------<-----| rcb loaned to MAC | * +-------------------+ * - * Finally, note that every rx control block has a reference count on it. One + * Finally, note that every RX control block has a reference count on it. 
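[Editor's aside] The working-list invariant and the loan bookkeeping in the state machine above reduce to a small swap-or-copy decision: a buffer is only handed up if a spare control block can take its place on the descriptor. The following is a simplified stand-alone model with invented type and field names, not the driver's receive path, just to make the invariant concrete.

#include <stddef.h>

typedef struct rcb {
	struct rcb	*rcb_next;	/* free-list linkage */
	void		*rcb_buf;	/* DMA buffer backing this rcb */
	unsigned	rcb_ref;	/* dropped by the free callback */
} rcb_t;

typedef struct rx_ring {
	rcb_t	**rr_work;	/* one rcb per descriptor (the invariant) */
	rcb_t	*rr_free;	/* spares available for loaning */
} rx_ring_t;

/*
 * Loan the filled rcb up only if a spare can be swapped into the work
 * list; otherwise return NULL so the caller copies the frame and the
 * descriptor keeps its current rcb.
 */
static rcb_t *
rx_loan(rx_ring_t *rr, unsigned idx)
{
	rcb_t *spare = rr->rr_free;
	rcb_t *loaned;

	if (spare == NULL)
		return (NULL);

	rr->rr_free = spare->rcb_next;
	spare->rcb_next = NULL;

	loaned = rr->rr_work[idx];
	rr->rr_work[idx] = spare;	/* invariant preserved */
	loaned->rcb_ref++;		/* released by the free callback */
	return (loaned);		/* wrapped for MAC, e.g. desballoc(9F) */
}

int
main(void)
{
	rcb_t spare = { 0 }, filled = { 0 };
	rcb_t *work[1] = { &filled };
	rx_ring_t rr = { work, &spare };

	return (rx_loan(&rr, 0) == &filled ? 0 : 1);
}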
One * reference is added as long as the driver has had the GLDv3 mc_start endpoint * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and * no other DLPI consumers remain, then we'll decrement the reference count by - * one. Whenever we loan up the rx control block and associated buffer to MAC, + * one. Whenever we loan up the RX control block and associated buffer to MAC, * then we bump the reference count again. Even though the device is stopped, * there may still be loaned frames in upper levels that we'll want to account * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure @@ -192,10 +192,10 @@ * state tracking. Effectively, we cache the HEAD register and then update it * ourselves based on our work. * - * When we iterate over the rx descriptors and thus the received frames, we are + * When we iterate over the RX descriptors and thus the received frames, we are * either in an interrupt context or we've been asked by MAC to poll on the * ring. If we've been asked to poll on the ring, we have a maximum number of - * bytes of mblk_t's to return. If processing an rx descriptor would cause us to + * bytes of mblk_t's to return. If processing an RX descriptor would cause us to * exceed that count, then we do not process it. When in interrupt context, we * don't have a strict byte count. However, to ensure liveness, we limit the * amount of data based on a configuration value @@ -249,31 +249,54 @@ * differently due to the fact that all data is originated by the operating * system and not by the device. * - * Like rx, there is both a descriptor ring that we use to communicate to the - * driver and which points to the memory used to transmit a frame. Similarly, - * there is a corresponding transmit control block. Each transmit control block - * has a region of DMA memory allocated to it; however, the way we use it - * varies. + * Like RX, there is both a descriptor ring that we use to communicate to the + * driver and which points to the memory used to transmit a frame. Similarly, + * there is a corresponding transmit control block, however, the correspondence + * between descriptors and control blocks is more complex and not necessarily + * 1-to-1. * * The driver is asked to process a single frame at a time. That message block * may be made up of multiple fragments linked together by the mblk_t`b_cont * member. The device has a hard limit of up to 8 buffers being allowed for use - * for a single logical frame. For each fragment, we'll try and use an entry - * from the tx descriptor ring and then we'll allocate a corresponding tx - * control block. Depending on the size of the fragment, we may copy it around - * or we might instead try to do DMA binding of the fragment. - * - * If we exceed the number of blocks that fit, we'll try to pull up the block - * and then we'll do a DMA bind and send it out. - * - * If we don't have enough space in the ring or tx control blocks available, + * for a single non-LSO packet or LSO segment. The number of TX ring entires + * (and thus TX control blocks) used depends on the fragment sizes and DMA + * layout, as explained below. + * + * We alter our DMA strategy based on a threshold tied to the fragment size. + * This threshold is configurable via the tx_dma_threshold property. If the + * fragment is above the threshold, we DMA bind it -- consuming one TCB and + * potentially several data descriptors. 
The exact number of descriptors (equal + * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset + * into page, b_wptr offset into page, and the physical layout of the dblk's + * memory (contiguous or not). Essentially, we are at the mercy of the DMA + * engine and the dblk's memory allocation. Knowing the exact number of + * descriptors up front is a task best not taken on by the driver itself. + * Instead, we attempt to DMA bind the fragment and verify the descriptor + * layout meets hardware constraints. If the proposed DMA bind does not satisfy + * the hardware constaints, then we discard it and instead copy the entire + * fragment into the pre-allocated TCB buffer (or buffers if the fragment is + * larger than the TCB buffer). + * + * If the fragment is below or at the threshold, we copy it to the pre-allocated + * buffer of a TCB. We compress consecutive copy fragments into a single TCB to + * conserve resources. We are guaranteed that the TCB buffer is made up of only + * 1 DMA cookie; and therefore consumes only one descriptor on the controller. + * + * Furthermore, if the frame requires HW offloads such as LSO, tunneling or + * filtering, then the TX data descriptors must be preceeded by a single TX + * context descriptor. Because there is no DMA transfer associated with the + * context descriptor, we allocate a control block with a special type which + * indicates to the TX ring recycle code that there are no associated DMA + * resources to unbind when the control block is free'd. + * + * If we don't have enough space in the ring or TX control blocks available, * then we'll return the unprocessed message block to MAC. This will induce flow * control and once we recycle enough entries, we'll once again enable sending * on the ring. * * We size the working list as equal to the number of descriptors in the ring. * We size the free list as equal to 1.5 times the number of descriptors in the - * ring. We'll allocate a number of tx control block entries equal to the number + * ring. We'll allocate a number of TX control block entries equal to the number * of entries in the free list. By default, all entries are placed in the free * list. As we come along and try to send something, we'll allocate entries from * the free list and add them to the working list, where they'll stay until the @@ -325,7 +348,7 @@ * +------------------+ +------------------+ * | tcb on free list |---*------------------>| tcb on work list | * +------------------+ . +------------------+ - * ^ . tcb allocated | + * ^ . N tcbs allocated[1] | * | to send frame v * | or fragment on | * | wire, mblk from | @@ -335,20 +358,27 @@ * . * . Hardware indicates * entry transmitted. - * tcb recycled, mblk + * tcbs recycled, mblk * from MAC freed. * + * [1] We allocate N tcbs to transmit a single frame where N can be 1 context + * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA + * bind case, N can be 1 context descriptor plus 1 data descriptor per + * b_cont in the mblk. In this case, the mblk is associated with the first + * data descriptor and freed as part of freeing that data descriptor. + * * ------------ * Blocking MAC * ------------ * - * Wen performing transmit, we can run out of descriptors and ring entries. When - * such a case happens, we return the mblk_t to MAC to indicate that we've been - * blocked. At that point in time, MAC becomes blocked and will not transmit - * anything out that specific ring until we notify MAC. 
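[Editor's aside] A rough upper bound on the cookie count for the bind case described above is easy to write down, even though only the DMA bind itself can report the real number. A throwaway illustration, assuming 4K pages and the worst case where no two pages are physically adjacent:

#include <stdio.h>
#include <stddef.h>

#define	PAGESZ	4096UL		/* assumed page size */

/*
 * Worst-case cookies for a fragment of 'len' bytes starting 'off'
 * bytes into a page: one per page touched.
 */
static size_t
max_cookies(size_t off, size_t len)
{
	return (((off % PAGESZ) + len + PAGESZ - 1) / PAGESZ);
}

int
main(void)
{
	/* A 9000-byte jumbo fragment starting mid-page touches 3 pages. */
	printf("%zu\n", max_cookies(100, 9000));
	return (0);
}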
To indicate that we're - * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE. + * When performing transmit, we can run out of descriptors and ring entries. + * When such a case happens, we return the mblk_t to MAC to indicate that we've + * been blocked. At that point in time, MAC becomes blocked and will not + * transmit anything out that specific ring until we notify MAC. To indicate + * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member + * to B_TRUE. * - * When we recycle tx descriptors then we'll end up signaling MAC by calling + * When we recycle TX descriptors then we'll end up signaling MAC by calling * mac_tx_ring_update() if we were blocked, letting it know that it's safe to * start sending frames out to us again. */ @@ -367,13 +397,15 @@ /* * This structure is used to maintain information and flags related to - * transmitting a frame. The first member is the set of flags we need to or into - * the command word (generally checksumming related). The second member controls - * the word offsets which is required for IP and L4 checksumming. + * transmitting a frame. These fields are ultimately used to construct the + * TX data descriptor(s) and, if necessary, the TX context descriptor. */ typedef struct i40e_tx_context { - enum i40e_tx_desc_cmd_bits itc_cmdflags; - uint32_t itc_offsets; + enum i40e_tx_desc_cmd_bits itc_data_cmdflags; + uint32_t itc_data_offsets; + enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags; + uint32_t itc_ctx_tsolen; + uint32_t itc_ctx_mss; } i40e_tx_context_t; /* @@ -395,14 +427,18 @@ i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; * i40e_static_dma_attr, is designed to be used for both the descriptor rings * and the static buffers that we associate with control blocks. For this * reason, we force an SGL length of one. While technically the driver supports - * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our + * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our * management here. In addition, when the Intel common code wants to allocate * memory via the i40e_allocate_virt_mem osdep function, we have it leverage * the static dma attr. * - * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're - * binding a bunch of mblk_t fragments to go out the door. Note that the main - * difference here is that we're allowed a larger SGL length -- eight. + * The latter two sets of attributes, are what we use when we're binding a + * bunch of mblk_t fragments to go out the door. Note that the main difference + * here is that we're allowed a larger SGL length. For non-LSO TX, we + * restrict the SGL length to match the number of TX buffers available to the + * PF (8). For the LSO case we can go much larger, with the caveat that each + * MSS-sized chunk (segment) must not span more than 8 data descriptors and + * hence must not span more than 8 cookies. * * Note, we default to setting ourselves to be DMA capable here. 
However, * because we could have multiple instances which have different FMA error @@ -429,7 +465,7 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DMA_ATTR_V0, /* version number */ 0x0000000000000000ull, /* low address */ 0xFFFFFFFFFFFFFFFFull, /* high address */ - 0x00000000FFFFFFFFull, /* dma counter max */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ I40E_DMA_ALIGNMENT, /* alignment */ 0x00000FFF, /* burst sizes */ 0x00000001, /* minimum transfer size */ @@ -440,6 +476,21 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DDI_DMA_FLAGERR /* DMA flags */ }; +static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = { + DMA_ATTR_V0, /* version number */ + 0x0000000000000000ull, /* low address */ + 0xFFFFFFFFFFFFFFFFull, /* high address */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ + I40E_DMA_ALIGNMENT, /* alignment */ + 0x00000FFF, /* burst sizes */ + 0x00000001, /* minimum transfer size */ + 0x00000000FFFFFFFFull, /* maximum transfer size */ + 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ + I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */ + 0x00000001, /* granularity */ + DDI_DMA_FLAGERR /* DMA flags */ +}; + /* * Next, we have the attributes for these structures. The descriptor rings are * all strictly little endian, while the data buffers are just arrays of bytes @@ -668,7 +719,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_ring_size, KM_NOSLEEP); if (rxd->rxd_work_list == NULL) { - i40e_error(i40e, "failed to allocate rx work list for a ring " + i40e_error(i40e, "failed to allocate RX work list for a ring " "of %d entries for ring %d", rxd->rxd_ring_size, itrq->itrq_index); goto cleanup; @@ -677,7 +728,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_free_list_size, KM_NOSLEEP); if (rxd->rxd_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry rx free list " + i40e_error(i40e, "failed to allocate a %d entry RX free list " "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); goto cleanup; } @@ -765,7 +816,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) i40e_t *i40e = rxd->rxd_i40e; /* - * First allocate the rx descriptor ring. + * First allocate the RX descriptor ring. */ dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; VERIFY(dmasz > 0); @@ -773,7 +824,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate DMA resources " - "for rx descriptor ring"); + "for RX descriptor ring"); return (B_FALSE); } rxd->rxd_desc_ring = @@ -799,7 +850,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) if (i40e_alloc_dma_buffer(i40e, dmap, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate rx dma buffer"); + i40e_error(i40e, "failed to allocate RX dma buffer"); return (B_FALSE); } @@ -841,6 +892,10 @@ i40e_free_tx_dma(i40e_trqpair_t *itrq) ddi_dma_free_handle(&tcb->tcb_dma_handle); tcb->tcb_dma_handle = NULL; } + if (tcb->tcb_lso_dma_handle != NULL) { + ddi_dma_free_handle(&tcb->tcb_lso_dma_handle); + tcb->tcb_lso_dma_handle = NULL; + } } fsz = sizeof (i40e_tx_control_block_t) * @@ -881,7 +936,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) (i40e->i40e_tx_ring_size >> 1); /* - * Allocate an additional tx descriptor for the writeback head. 
+ * Allocate an additional TX descriptor for the writeback head. */ dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; dmasz += sizeof (i40e_tx_desc_t); @@ -890,7 +945,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area, &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate DMA resources for tx " + i40e_error(i40e, "failed to allocate DMA resources for TX " "descriptor ring"); return (B_FALSE); } @@ -905,7 +960,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size * sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); if (itrq->itrq_tcb_work_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx work list " + i40e_error(i40e, "failed to allocate a %d entry TX work list " "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index); goto cleanup; } @@ -913,14 +968,14 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size * sizeof (i40e_tx_control_block_t *), KM_SLEEP); if (itrq->itrq_tcb_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx free list " + i40e_error(i40e, "failed to allocate a %d entry TX free list " "for ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); goto cleanup; } /* - * We allocate enough tx control blocks to cover the free list. + * We allocate enough TX control blocks to cover the free list. */ itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) * itrq->itrq_tx_free_list_size, KM_NOSLEEP); @@ -948,18 +1003,29 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL, &tcb->tcb_dma_handle); if (ret != DDI_SUCCESS) { - i40e_error(i40e, "failed to allocate DMA handle for tx " + i40e_error(i40e, "failed to allocate DMA handle for TX " "data binding on ring %d: %d", itrq->itrq_index, ret); tcb->tcb_dma_handle = NULL; goto cleanup; } + ret = ddi_dma_alloc_handle(i40e->i40e_dip, + &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL, + &tcb->tcb_lso_dma_handle); + if (ret != DDI_SUCCESS) { + i40e_error(i40e, "failed to allocate DMA handle for TX " + "LSO data binding on ring %d: %d", itrq->itrq_index, + ret); + tcb->tcb_lso_dma_handle = NULL; + goto cleanup; + } + if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate %ld bytes of " - "DMA for tx data binding on ring %d", dmasz, + "DMA for TX data binding on ring %d", dmasz, itrq->itrq_index); goto cleanup; } @@ -989,10 +1055,17 @@ i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init) i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata; /* - * Clean up our rx data. We have to free DMA resources first and + * In some cases i40e_alloc_rx_data() may have failed + * and in that case there is no rxd to free. + */ + if (rxd == NULL) + continue; + + /* + * Clean up our RX data. We have to free DMA resources first and * then if we have no more pending RCB's, then we'll go ahead * and clean things up. Note, we can't set the stopped flag on - * the rx data until after we've done the first pass of the + * the RX data until after we've done the first pass of the * pending resources. Otherwise we might race with * i40e_rx_recycle on determining who should free the * i40e_rx_data_t above. 
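[Editor's aside] The extra descriptor allocated above holds the hardware's write-back head, and the recycle path later in this file turns that value into a count of completed descriptors with simple modular distance. A stand-alone sketch of that arithmetic, with illustrative names only:

#include <stdint.h>
#include <assert.h>

/*
 * Descriptors the hardware has finished with, given the driver's
 * cached head and the DMA'd write-back head.
 */
static uint32_t
tx_done(uint32_t head, uint32_t wbhead, uint32_t ring_size)
{
	if (wbhead >= head)
		return (wbhead - head);
	return (ring_size - head + wbhead);
}

int
main(void)
{
	assert(tx_done(10, 14, 1024) == 4);	/* no wrap */
	assert(tx_done(1020, 2, 1024) == 6);	/* wrapped past the end */
	return (0);
}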
@@ -1055,6 +1128,8 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, sizeof (ddi_dma_attr_t)); + bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr, + sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, sizeof (ddi_device_acc_attr_t)); bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, @@ -1063,9 +1138,13 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) if (fma == B_TRUE) { i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |= + DDI_DMA_FLAGERR; } else { i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &= + ~DDI_DMA_FLAGERR; } } @@ -1102,7 +1181,7 @@ i40e_rcb_alloc(i40e_rx_data_t *rxd) /* * This is the callback that we get from the OS when freemsg(9F) has been called * on a loaned descriptor. In addition, if we take the last reference count - * here, then we have to tear down all of the rx data. + * here, then we have to tear down all of the RX data. */ void i40e_rx_recycle(caddr_t arg) @@ -1768,17 +1847,18 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) * to properly program the hardware for checksum offload as well as the * generally required flags. * - * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or - * into the descriptor based on the checksum flags for this mblk_t and the + * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to + * 'or' into the descriptor based on the checksum flags for this mblk_t and the * actual information we care about. + * + * If the mblk requires LSO then we'll also gather the information that will be + * used to construct the Transmit Context Descriptor. */ static int i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, - i40e_tx_context_t *tctx) + mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx) { - int ret; - uint32_t flags, start; - mac_ether_offload_info_t meo; + uint32_t chkflags, start, mss, lsoflags; i40e_txq_stat_t *txs = &itrq->itrq_txstat; bzero(tctx, sizeof (i40e_tx_context_t)); @@ -1786,37 +1866,34 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, if (i40e->i40e_tx_hcksum_enable != B_TRUE) return (0); - mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags); - if (flags == 0) - return (0); + mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags); + mac_lso_get(mp, &mss, &lsoflags); - if ((ret = mac_ether_offload_info(mp, &meo)) != 0) { - txs->itxs_hck_meoifail.value.ui64++; - return (ret); - } + if (chkflags == 0 && lsoflags == 0) + return (0); /* * Have we been asked to checksum an IPv4 header. If so, verify that we * have sufficient information and then set the proper fields in the * command structure. 
*/ - if (flags & HCK_IPV4_HDRCKSUM) { - if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { + if (chkflags & HCK_IPV4_HDRCKSUM) { + if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); } - if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) { + if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { txs->itxs_hck_nol3info.value.ui64++; return (-1); } - if (meo.meoi_l3proto != ETHERTYPE_IP) { + if (meo->meoi_l3proto != ETHERTYPE_IP) { txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; + tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } @@ -1826,57 +1903,77 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, * onto seeing if we have enough information for the L4 checksum * offload. */ - if (flags & HCK_PARTIALCKSUM) { - if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) { + if (chkflags & HCK_PARTIALCKSUM) { + if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) { txs->itxs_hck_nol4info.value.ui64++; return (-1); } - if (!(flags & HCK_IPV4_HDRCKSUM)) { - if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { + if (!(chkflags & HCK_IPV4_HDRCKSUM)) { + if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); } - if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) { + if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { txs->itxs_hck_nol3info.value.ui64++; return (-1); } - if (meo.meoi_l3proto == ETHERTYPE_IP) { - tctx->itc_cmdflags |= + if (meo->meoi_l3proto == ETHERTYPE_IP) { + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4; - } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) { - tctx->itc_cmdflags |= + } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) { + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV6; } else { txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } - switch (meo.meoi_l4proto) { + switch (meo->meoi_l4proto) { case IPPROTO_TCP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_TCP; break; case IPPROTO_UDP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_UDP; break; case IPPROTO_SCTP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_SCTP; break; default: txs->itxs_hck_badl4.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } + if (lsoflags & HW_LSO) { + /* + * LSO requires that checksum offloads are enabled. If for + * some reason they're not we bail out with an error. 
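[Editor's aside] The TSO length computed just below is plain header arithmetic over the whole LSO payload. A hypothetical worked example, with frame sizes and MSS invented for illustration:

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	/* Hypothetical LSO send: plain Ethernet/IPv4/TCP, no options. */
	size_t l2 = 14, l3 = 20, l4 = 20;
	size_t payload = 32768;
	size_t msgsz = l2 + l3 + l4 + payload;
	size_t mss = 1460;

	size_t tsolen = msgsz - (l2 + l3 + l4);	/* payload bytes only */
	assert(tsolen == payload);
	/* The controller will cut this into ceil(tsolen / mss) segments. */
	assert((tsolen + mss - 1) / mss == 23);
	return (0);
}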
+ */ + if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 || + (chkflags & HCK_PARTIALCKSUM) == 0) { + txs->itxs_lso_nohck.value.ui64++; + return (-1); + } + + tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO; + tctx->itc_ctx_mss = mss; + tctx->itc_ctx_tsolen = msgsize(mp) - + (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen); + } + return (0); } @@ -1925,7 +2022,20 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) tcb->tcb_dma.dmab_len = 0; break; case I40E_TX_DMA: - (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle); + else if (tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_bind_info != NULL) { + kmem_free(tcb->tcb_bind_info, + tcb->tcb_bind_ncookies * + sizeof (struct i40e_dma_bind_info)); + } + tcb->tcb_bind_info = NULL; + tcb->tcb_bind_ncookies = 0; + tcb->tcb_used_lso = B_FALSE; + break; + case I40E_TX_DESC: break; case I40E_TX_NONE: /* Cast to pacify lint */ @@ -1935,8 +2045,10 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) } tcb->tcb_type = I40E_TX_NONE; - freemsg(tcb->tcb_mp); - tcb->tcb_mp = NULL; + if (tcb->tcb_mp != NULL) { + freemsg(tcb->tcb_mp); + tcb->tcb_mp = NULL; + } tcb->tcb_next = NULL; } @@ -1969,10 +2081,11 @@ i40e_tx_cleanup_ring(i40e_trqpair_t *itrq) i40e_tx_control_block_t *tcb; tcb = itrq->itrq_tcb_work_list[index]; - VERIFY(tcb != NULL); - itrq->itrq_tcb_work_list[index] = NULL; - i40e_tcb_reset(tcb); - i40e_tcb_free(itrq, tcb); + if (tcb != NULL) { + itrq->itrq_tcb_work_list[index] = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + } bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t)); index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size); @@ -1995,6 +2108,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) uint32_t wbhead, toclean, count; i40e_tx_control_block_t *tcbhead; i40e_t *i40e = itrq->itrq_i40e; + uint_t desc_per_tcb, i; mutex_enter(&itrq->itrq_tx_lock); @@ -2042,11 +2156,27 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) tcbhead = tcb; /* - * We zero this out for sanity purposes. + * In the DMA bind case, there may not necessarily be a 1:1 + * mapping between tcb's and descriptors. If the tcb type + * indicates a DMA binding then check the number of DMA + * cookies to determine how many entries to clean in the + * descriptor ring. */ - bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t)); - toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size); - count++; + if (tcb->tcb_type == I40E_TX_DMA) + desc_per_tcb = tcb->tcb_bind_ncookies; + else + desc_per_tcb = 1; + + for (i = 0; i < desc_per_tcb; i++) { + /* + * We zero this out for sanity purposes. 
+ */ + bzero(&itrq->itrq_desc_ring[toclean], + sizeof (i40e_tx_desc_t)); + toclean = i40e_next_desc(toclean, 1, + itrq->itrq_tx_ring_size); + count++; + } } itrq->itrq_desc_head = wbhead; @@ -2078,10 +2208,610 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count); } +static void +i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp, + const size_t off, const size_t len) +{ + const void *soff = mp->b_rptr + off; + void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + + ASSERT3U(len, >, 0); + ASSERT3P(soff, >=, mp->b_rptr); + ASSERT3P(soff, <=, mp->b_wptr); + ASSERT3U(len, <=, MBLKL(mp)); + ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr); + ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len); + bcopy(soff, doff, len); + tcb->tcb_type = I40E_TX_COPY; + tcb->tcb_dma.dmab_len += len; + I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); +} + +static i40e_tx_control_block_t * +i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp, + size_t off, boolean_t use_lso) +{ + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie; + uint_t i = 0, ncookies = 0, dmaflags; + i40e_tx_control_block_t *tcb; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + return (NULL); + } + tcb->tcb_type = I40E_TX_DMA; + + if (use_lso == B_TRUE) + dma_handle = tcb->tcb_lso_dma_handle; + else + dma_handle = tcb->tcb_dma_handle; + + dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING; + if (ddi_dma_addr_bind_handle(dma_handle, NULL, + (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags, + DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) { + txs->itxs_bind_fails.value.ui64++; + goto bffail; + } + + tcb->tcb_bind_ncookies = ncookies; + tcb->tcb_used_lso = use_lso; + + tcb->tcb_bind_info = + kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info), + KM_NOSLEEP); + if (tcb->tcb_bind_info == NULL) + goto bffail; + + while (i < ncookies) { + if (i > 0) + ddi_dma_nextcookie(dma_handle, &dma_cookie); + + tcb->tcb_bind_info[i].dbi_paddr = + (caddr_t)dma_cookie.dmac_laddress; + tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size; + } + + return (tcb); + +bffail: + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + return (NULL); +} + +static void +i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx, + caddr_t buff, size_t len, boolean_t last_desc) +{ + i40e_tx_desc_t *txdesc; + int cmd; + + ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); + itrq->itrq_desc_free--; + txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; + itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, + itrq->itrq_tx_ring_size); + + cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags; + + /* + * The last data descriptor needs the EOP bit set, so that the HW knows + * that we're ready to send. Additionally, we set the RS (Report + * Status) bit, so that we are notified when the transmit engine has + * completed DMA'ing all of the data descriptors and data buffers + * associated with this frame. + */ + if (last_desc == B_TRUE) { + cmd |= I40E_TX_DESC_CMD_EOP; + cmd |= I40E_TX_DESC_CMD_RS; + } + + /* + * Per the X710 manual, section 8.4.2.1.1, the buffer size + * must be a value from 1 to 16K minus 1, inclusive. 
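[Editor's aside] The recycle loop above frees one ring entry per copy TCB and one per cookie for a DMA-bound TCB; the chain builders below do the mirror-image accounting when deciding whether a frame fits. A simplified model of that bookkeeping, with invented types:

#include <stdio.h>
#include <stddef.h>

typedef enum { TCB_COPY, TCB_DMA } tcb_type_t;

typedef struct tcb {
	struct tcb	*tcb_next;
	tcb_type_t	tcb_type;
	unsigned	tcb_ncookies;	/* meaningful for TCB_DMA only */
} tcb_t;

/* Ring descriptors a TCB chain consumes: 1 per copy, 1 per DMA cookie. */
static unsigned
tcb_chain_descs(const tcb_t *t)
{
	unsigned n = 0;

	for (; t != NULL; t = t->tcb_next)
		n += (t->tcb_type == TCB_DMA) ? t->tcb_ncookies : 1;
	return (n);
}

int
main(void)
{
	tcb_t dma = { NULL, TCB_DMA, 3 };
	tcb_t copy = { &dma, TCB_COPY, 0 };

	printf("%u\n", tcb_chain_descs(&copy));	/* 1 + 3 = 4 */
	return (0);
}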
+ */ + ASSERT3U(len, >=, 1); + ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ); + + txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff); + txdesc->cmd_type_offset_bsz = + LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA | + ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | + ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); +} + +/* + * Place 'tcb' on the tail of the list represented by 'head'/'tail'. + */ +static inline void +tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail, + i40e_tx_control_block_t *tcb) +{ + if (*head == NULL) { + *head = tcb; + *tail = *head; + } else { + ASSERT3P(*tail, !=, NULL); + ASSERT3P((*tail)->tcb_next, ==, NULL); + (*tail)->tcb_next = tcb; + *tail = tcb; + } +} + +/* + * This function takes a single packet, possibly consisting of + * multiple mblks, and creates a TCB chain to send to the controller. + * This TCB chain may span up to a maximum of 8 descriptors. A copy + * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or + * more, depending on several factors. For each fragment (invidual + * mblk making up the packet), we determine if its size dictates a + * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a + * count of descriptors used; when that count reaches the max we force + * all remaining fragments into a single TCB buffer. We have a + * guarantee that the TCB buffer is always larger than the MTU -- so + * there is always enough room. Consecutive fragments below the DMA + * threshold are copied into a single TCB. In the event of an error + * this function returns NULL but leaves 'mp' alone. + */ +static i40e_tx_control_block_t * +i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc) +{ + const mblk_t *nmp = mp; + uint_t needed_desc = 0; + boolean_t force_copy = B_FALSE; + i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; + i40e_t *i40e = itrq->itrq_i40e; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + /* TCB buffer is always larger than MTU. */ + ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size); + + while (nmp != NULL) { + const size_t nmp_len = MBLKL(nmp); + + /* Ignore zero-length mblks. */ + if (nmp_len == 0) { + nmp = nmp->b_cont; + continue; + } + + if (nmp_len < i40e->i40e_tx_dma_min || force_copy) { + /* Compress consecutive copies into one TCB. */ + if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) { + i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); + nmp = nmp->b_cont; + continue; + } + + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto fail; + } + + /* + * TCB DMA buffer is guaranteed to be one + * cookie by i40e_alloc_dma_buffer(). + */ + i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); + needed_desc++; + tcb_list_append(&tcbhead, &tcbtail, tcb); + } else { + uint_t total_desc; + + tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE); + if (tcb == NULL) { + i40e_error(i40e, "dma bind failed!"); + goto fail; + } + + /* + * If the new total exceeds the max or we've + * reached the limit and there's data left, + * then give up binding and copy the rest into + * the pre-allocated TCB buffer. 
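[Editor's aside] To make the descriptor budget concrete, here is a stand-alone toy walk over a made-up fragment list applying the same rules: copy below the threshold (with consecutive copies sharing one TCB and one descriptor), bind otherwise, and force-copy once the budget would be blown. The fragment sizes, cookie counts, and threshold value are all invented.

#include <stdio.h>
#include <stddef.h>

#define	MAX_DESC	8	/* mirrors I40E_TX_MAX_COOKIE */
#define	DMA_THRESH	256	/* assumed tx_dma_threshold value */

int
main(void)
{
	size_t len[] = { 64, 1448, 1448 };	/* hypothetical fragments */
	unsigned cookies[] = { 1, 2, 2 };	/* pretend bind results */
	unsigned nfrags = sizeof (len) / sizeof (len[0]);
	unsigned used = 0;
	int prev_copy = 0, force_copy = 0;

	for (unsigned i = 0; i < nfrags; i++) {
		if (len[i] < DMA_THRESH || force_copy) {
			if (!prev_copy)		/* consecutive copies share */
				used++;		/* one TCB and descriptor */
			prev_copy = 1;
		} else if (used + cookies[i] > MAX_DESC ||
		    (used + cookies[i] == MAX_DESC && i + 1 < nfrags)) {
			force_copy = 1;		/* copy this and the rest */
			if (!prev_copy)
				used++;
			prev_copy = 1;
		} else {
			used += cookies[i];	/* accept the bind */
			prev_copy = 0;
		}
	}
	printf("descriptors used: %u\n", used);	/* prints 5 */
	return (0);
}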
+ */ + total_desc = needed_desc + tcb->tcb_bind_ncookies; + if ((total_desc > I40E_TX_MAX_COOKIE) || + (total_desc == I40E_TX_MAX_COOKIE && + nmp->b_cont != NULL)) { + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + + if (tcbtail != NULL && + tcbtail->tcb_type == I40E_TX_COPY) { + tcb = tcbtail; + } else { + tcb = NULL; + } + + force_copy = B_TRUE; + txs->itxs_force_copy.value.ui64++; + continue; + } + + needed_desc += tcb->tcb_bind_ncookies; + tcb_list_append(&tcbhead, &tcbtail, tcb); + } + + nmp = nmp->b_cont; + } + + ASSERT3P(nmp, ==, NULL); + ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE); + ASSERT3P(tcbhead, !=, NULL); + *ndesc += needed_desc; + return (tcbhead); + +fail: + tcb = tcbhead; + while (tcb != NULL) { + i40e_tx_control_block_t *next = tcb->tcb_next; + + ASSERT(tcb->tcb_type == I40E_TX_DMA || + tcb->tcb_type == I40E_TX_COPY); + + tcb->tcb_mp = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + tcb = next; + } + + return (NULL); +} + +/* + * Section 8.4.1 of the 700-series programming guide states that a + * segment may span up to 8 data descriptors; including both header + * and payload data. However, empirical evidence shows that the + * controller freezes the Tx queue when presented with a segment of 8 + * descriptors. Or, at least, when the first segment contains 8 + * descriptors. One explanation is that the controller counts the + * context descriptor against the first segment, even though the + * programming guide makes no mention of such a constraint. In any + * case, we limit TSO segments to 7 descriptors to prevent Tx queue + * freezes. We still allow non-TSO segments to utilize all 8 + * descriptors as they have not demonstrated the faulty behavior. + */ +uint_t i40e_lso_num_descs = 7; + +#define I40E_TCB_LEFT(tcb) \ + ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len) + +/* + * This function is similar in spirit to i40e_non_lso_chain(), but + * much more complicated in reality. Like the previous function, it + * takes a packet (an LSO packet) as input and returns a chain of + * TCBs. The complication comes with the fact that we are no longer + * trying to fit the entire packet into 8 descriptors, but rather we + * must fit each MSS-size segment of the LSO packet into 8 descriptors. + * Except it's really 7 descriptors, see i40e_lso_num_descs. + * + * Your first inclination might be to verify that a given segment + * spans no more than 7 mblks; but it's actually much more subtle than + * that. First, let's describe what the hardware expects, and then we + * can expound on the software side of things. + * + * For an LSO packet the hardware expects the following: + * + * o Each MSS-sized segment must span no more than 7 descriptors. + * + * o The header size does not count towards the segment size. + * + * o If header and payload share the first descriptor, then the + * controller will count the descriptor twice. + * + * The most important thing to keep in mind is that the hardware does + * not view the segments in terms of mblks, like we do. The hardware + * only sees descriptors. It will iterate each descriptor in turn, + * keeping a tally of bytes seen and descriptors visited. If the byte + * count hasn't reached MSS by the time the descriptor count reaches + * 7, then the controller freezes the queue and we are stuck. + * Furthermore, the hardware picks up its tally where it left off. So + * if it reached MSS in the middle of a descriptor, it will start + * tallying the next segment in the middle of that descriptor. 
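[Editor's aside] Running the controller's tally by hand is the easiest way to internalize the constraint. The sketch below walks an invented descriptor layout the way the comment describes the hardware doing it -- counting bytes and descriptors, carrying the remainder over mid-descriptor -- and flags a layout that would hang the queue. It deliberately ignores the header-sharing double count for brevity.

#include <stdio.h>

#define	MSS		1460U
#define	SEG_LIMIT	7U	/* mirrors i40e_lso_num_descs */

int
main(void)
{
	/* Invented data-descriptor (cookie) sizes for one LSO payload. */
	unsigned desc[] = { 180, 180, 180, 180, 180, 180, 180, 180 };
	unsigned n = sizeof (desc) / sizeof (desc[0]);
	unsigned segsz = 0, segdesc = 0;

	for (unsigned i = 0; i < n; i++) {
		if (++segdesc > SEG_LIMIT) {
			printf("descriptor %u would hang the queue\n", i);
			return (1);
		}
		segsz += desc[i];
		/* Carry the tally over mid-descriptor, like the HW does. */
		while (segsz >= MSS) {
			segsz -= MSS;
			segdesc = (segsz == 0) ? 0 : 1;
		}
	}
	printf("layout fits the segment limit\n");
	return (0);
}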
The + * hardware's view is entirely removed from the mblk chain or even the + * descriptor layout. Consider these facts: + * + * o The MSS will vary dpeneding on MTU and other factors. + * + * o The dblk allocation will sit at various offsets within a + * memory page. + * + * o The page size itself could vary in the future (i.e. not + * always 4K). + * + * o Just because a dblk is virtually contiguous doesn't mean + * it's physically contiguous. The number of cookies + * (descriptors) required by a DMA bind of a single dblk is at + * the mercy of the page size and physical layout. + * + * o The descriptors will most often NOT start/end on a MSS + * boundary. Thus the hardware will often start counting the + * MSS mid descriptor and finish mid descriptor. + * + * The upshot of all this is that the driver must learn to think like + * the controller; and verify that none of the constraints are broken. + * It does this by tallying up the segment just like the hardware + * would. This is handled by the two variables 'segsz' and 'segdesc'. + * After each attempt to bind a dblk, we check the constaints. If + * violated, we undo the DMA and force a copy until MSS is met. We + * have a guarantee that the TCB buffer is larger than MTU; thus + * ensuring we can always meet the MSS with a single copy buffer. We + * also copy consecutive non-DMA fragments into the same TCB buffer. + */ +static i40e_tx_control_block_t * +i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp, + const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx, + uint_t *ndesc) +{ + size_t mp_len = MBLKL(mp); + /* + * The cpoff (copy offset) variable tracks the offset inside + * the current mp. There are cases where the entire mp is not + * fully copied in one go: such as the header copy followed by + * a non-DMA mblk, or a TCB buffer that only has enough space + * to copy part of the current mp. + */ + size_t cpoff = 0; + /* + * The segsz and segdesc variables track the controller's view + * of the segment. The needed_desc variable tracks the total + * number of data descriptors used by the driver. + */ + size_t segsz = 0; + uint_t segdesc = 0; + uint_t needed_desc = 0; + size_t hdrcopied = 0; + const size_t hdrlen = + meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen; + const size_t mss = tctx->itc_ctx_mss; + boolean_t force_copy = B_FALSE; + i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; + i40e_t *i40e = itrq->itrq_i40e; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + /* + * We always copy the header in order to avoid more + * complicated code dealing with various edge cases. + */ + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto fail; + } + + needed_desc++; + tcb_list_append(&tcbhead, &tcbtail, tcb); + + while (hdrcopied < hdrlen) { + const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len); + i40e_tx_copy_fragment(tcb, mp, 0, tocopy); + hdrcopied += tocopy; + cpoff += tocopy; + if (tocopy == mp_len) { + /* + * This is a bit of defensive programming. We + * should never have a chain too short to + * satisfy the headers -- but just in case. + */ + if ((mp = mp->b_cont) == NULL) { + txs->itxs_tx_short.value.ui64++; + goto fail; + } + + while ((mp_len = MBLKL(mp)) == 0) { + if ((mp = mp->b_cont) == NULL) { + txs->itxs_tx_short.value.ui64++; + goto fail; + } + } + cpoff = 0; + } + } + ASSERT3U(hdrcopied, ==, hdrlen); + + /* + * A single descriptor containing both header and data is + * counted twice by the controller. 
+ */ + if (mp_len < i40e->i40e_tx_dma_min) { + segdesc = 2; + } else { + segdesc = 1; + } + + while (mp != NULL) { + mp_len = MBLKL(mp); +force_copy: + /* Ignore zero-length mblks. */ + if (mp_len == 0) { + mp = mp->b_cont; + cpoff = 0; + continue; + } + + /* + * We copy into the preallocated TCB buffer when the + * current fragment is less than the DMA threshold OR + * when the DMA bind can't meet the controller's + * segment descriptor limit. + */ + if (mp_len < i40e->i40e_tx_dma_min || force_copy) { + size_t tocopy; + + /* + * Our objective here is to compress + * consecutive copies into one TCB (until it + * is full). If there is no current TCB, or if + * it is a DMA TCB, then allocate a new one. + */ + if (tcb == NULL || + (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) { + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto fail; + } + + /* + * The TCB DMA buffer is guaranteed to + * be one cookie by i40e_alloc_dma_buffer(). + */ + needed_desc++; + segdesc++; + ASSERT3U(segdesc, <=, i40e_lso_num_descs); + tcb_list_append(&tcbhead, &tcbtail, tcb); + } else if (segdesc == 0) { + /* + * We are copying into an existing TCB + * but we just crossed the MSS + * boundary. Make sure to increment + * segdesc to track the descriptor + * count as the hardware would. + */ + segdesc++; + } + + tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff); + i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy); + cpoff += tocopy; + segsz += tocopy; + + /* We have consumed the current mp. */ + if (cpoff == mp_len) { + mp = mp->b_cont; + cpoff = 0; + } + + /* We have consumed the current TCB buffer. */ + if (I40E_TCB_LEFT(tcb) == 0) { + tcb = NULL; + } + + /* + * We have met MSS with this copy; restart the + * counters. + */ + if (segsz >= mss) { + segsz = segsz % mss; + segdesc = segsz == 0 ? 0 : 1; + force_copy = B_FALSE; + } + + /* + * We are at the controller's descriptor + * limit; we must copy into the current TCB + * until MSS is reached. The TCB buffer is + * always bigger than the MTU so we know it is + * big enough to meet the MSS. + */ + if (segdesc == i40e_lso_num_descs) { + force_copy = B_TRUE; + } + } else { + uint_t tsegdesc = segdesc; + size_t tsegsz = segsz; + + ASSERT(force_copy == B_FALSE); + ASSERT3U(tsegdesc, <, i40e_lso_num_descs); + + tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE); + if (tcb == NULL) { + i40e_error(i40e, "dma bind failed!"); + goto fail; + } + + for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) { + struct i40e_dma_bind_info dbi = + tcb->tcb_bind_info[i]; + + tsegsz += dbi.dbi_len; + tsegdesc++; + ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); + + /* + * We've met the MSS with this portion + * of the DMA. + */ + if (tsegsz >= mss) { + tsegsz = tsegsz % mss; + tsegdesc = tsegsz == 0 ? 0 : 1; + } + + /* + * We've reached max descriptors but + * have not met the MSS. Undo the bind + * and instead copy. + */ + if (tsegdesc == i40e_lso_num_descs) { + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + + if (tcbtail != NULL && + I40E_TCB_LEFT(tcb) > 0 && + tcbtail->tcb_type == I40E_TX_COPY) { + tcb = tcbtail; + } else { + tcb = NULL; + } + + /* + * Remember, we are still on + * the same mp. + */ + force_copy = B_TRUE; + txs->itxs_tso_force_copy.value.ui64++; + goto force_copy; + } + } + + ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); + ASSERT3U(tsegsz, <, mss); + + /* + * We've made if through the loop without + * breaking the segment descriptor contract + * with the controller -- replace the segment + * tracking values with the temporary ones. 
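[Editor's aside] The boundary-crossing reset used by both the copy and bind paths above is just modular arithmetic on the byte tally. With invented numbers:

#include <assert.h>

int
main(void)
{
	unsigned mss = 1460, segsz = 1400, segdesc = 3;

	segsz += 200;			/* copy 200 more bytes: 1600 */
	if (segsz >= mss) {
		segsz %= mss;		/* 140 bytes into the next segment */
		segdesc = (segsz == 0) ? 0 : 1;
	}
	assert(segsz == 140);
	assert(segdesc == 1);
	return (0);
}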
+ */ + segdesc = tsegdesc; + segsz = tsegsz; + needed_desc += tcb->tcb_bind_ncookies; + cpoff = 0; + tcb_list_append(&tcbhead, &tcbtail, tcb); + mp = mp->b_cont; + } + } + + ASSERT3P(mp, ==, NULL); + ASSERT3P(tcbhead, !=, NULL); + *ndesc += needed_desc; + return (tcbhead); + +fail: + tcb = tcbhead; + while (tcb != NULL) { + i40e_tx_control_block_t *next = tcb->tcb_next; + + ASSERT(tcb->tcb_type == I40E_TX_DMA || + tcb->tcb_type == I40E_TX_COPY); + + tcb->tcb_mp = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + tcb = next; + } + + return (NULL); +} + /* * We've been asked to send a message block on the wire. We'll only have a * single chain. There will not be any b_next pointers; however, there may be - * multiple b_cont blocks. + * multiple b_cont blocks. The number of b_cont blocks may exceed the + * controller's Tx descriptor limit. * * We may do one of three things with any given mblk_t chain: * @@ -2096,12 +2826,14 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) mblk_t * i40e_ring_tx(void *arg, mblk_t *mp) { - const mblk_t *nmp; - size_t mpsize; - i40e_tx_control_block_t *tcb; - i40e_tx_desc_t *txdesc; + size_t msglen; + i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL; + i40e_tx_context_desc_t *ctxdesc; + mac_ether_offload_info_t meo; i40e_tx_context_t tctx; - int cmd, type; + int type; + uint_t needed_desc = 0; + boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE; i40e_trqpair_t *itrq = arg; i40e_t *i40e = itrq->itrq_i40e; @@ -2119,107 +2851,137 @@ i40e_ring_tx(void *arg, mblk_t *mp) return (NULL); } + if (mac_ether_offload_info(mp, &meo) != 0) { + freemsg(mp); + itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++; + return (NULL); + } + /* * Figure out the relevant context about this frame that we might need - * for enabling checksum, lso, etc. This also fills in information that + * for enabling checksum, LSO, etc. This also fills in information that * we might set around the packet type, etc. */ - if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) { + if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) { freemsg(mp); itrq->itrq_txstat.itxs_err_context.value.ui64++; return (NULL); } + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + use_lso = B_TRUE; + do_ctx_desc = B_TRUE; + } /* * For the primordial driver we can punt on doing any recycling right * now; however, longer term we need to probably do some more pro-active - * recycling to cut back on stalls in the tx path. + * recycling to cut back on stalls in the TX path. */ - /* - * Do a quick size check to make sure it fits into what we think it - * should for this device. Note that longer term this will be false, - * particularly when we have the world of TSO. - */ - mpsize = 0; - for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - mpsize += MBLKL(nmp); + msglen = msgsize(mp); + + if (do_ctx_desc) { + /* + * If we're doing tunneling or LSO, then we'll need a TX + * context descriptor in addition to one or more TX data + * descriptors. Since there's no data DMA block or handle + * associated with the context descriptor, we create a special + * control block that behaves effectively like a NOP. + */ + if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto txfail; + } + tcb_ctx->tcb_type = I40E_TX_DESC; + needed_desc++; } - /* - * First we allocate our tx control block and prepare the packet for - * transmit before we do a final check for descriptors. We do it this - * way to minimize the time under the tx lock. 
- */ - tcb = i40e_tcb_alloc(itrq); - if (tcb == NULL) { - txs->itxs_err_notcb.value.ui64++; - goto txfail; + if (!use_lso) { + tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc); + } else { + tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc); } - /* - * For transmitting a block, we're currently going to use just a - * single control block and bcopy all of the fragments into it. We - * should be more intelligent about doing DMA binding or otherwise, but - * for getting off the ground this will have to do. - */ - ASSERT(tcb->tcb_dma.dmab_len == 0); - ASSERT(tcb->tcb_dma.dmab_size >= mpsize); - for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - size_t clen = MBLKL(nmp); - void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + if (tcbhead == NULL) + goto txfail; - bcopy(nmp->b_rptr, coff, clen); - tcb->tcb_dma.dmab_len += clen; - } - ASSERT(tcb->tcb_dma.dmab_len == mpsize); + tcbhead->tcb_mp = mp; /* - * While there's really no need to keep the mp here, but let's just do - * it to help with our own debugging for now. + * The second condition ensures that 'itrq_desc_tail' never + * equals 'itrq_desc_head'. This enforces the rule found in + * the second bullet point of section 8.4.3.1.5 of the XL710 + * PG, which declares the TAIL pointer in I40E_QTX_TAIL should + * never overlap with the head. This means that we only ever + * have 'itrq_tx_ring_size - 1' total available descriptors. */ - tcb->tcb_mp = mp; - tcb->tcb_type = I40E_TX_COPY; - I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); - mutex_enter(&itrq->itrq_tx_lock); - if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) { + if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh || + (itrq->itrq_desc_free - 1) < needed_desc) { txs->itxs_err_nodescs.value.ui64++; mutex_exit(&itrq->itrq_tx_lock); goto txfail; } - /* - * Build up the descriptor and send it out. Thankfully at the moment - * we only need a single desc, because we're not doing anything fancy - * yet. - */ - ASSERT(itrq->itrq_desc_free > 0); - itrq->itrq_desc_free--; - txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; - itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; - itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, - itrq->itrq_tx_ring_size); + if (do_ctx_desc) { + /* + * If we're enabling any offloads for this frame, then we'll + * need to build up a transmit context descriptor, first. The + * context descriptor needs to be placed in the TX ring before + * the data descriptor(s). See section 8.4.2, table 8-16 + */ + uint_t tail = itrq->itrq_desc_tail; + itrq->itrq_desc_free--; + ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail]; + itrq->itrq_tcb_work_list[tail] = tcb_ctx; + itrq->itrq_desc_tail = i40e_next_desc(tail, 1, + itrq->itrq_tx_ring_size); + + /* QW0 */ + type = I40E_TX_DESC_DTYPE_CONTEXT; + ctxdesc->tunneling_params = 0; + ctxdesc->l2tag2 = 0; + + /* QW1 */ + ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type); + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t) + ((uint64_t)tctx.itc_ctx_cmdflags << + I40E_TXD_CTX_QW1_CMD_SHIFT) | + ((uint64_t)tctx.itc_ctx_tsolen << + I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | + ((uint64_t)tctx.itc_ctx_mss << + I40E_TXD_CTX_QW1_MSS_SHIFT)); + } + } - /* - * Note, we always set EOP and RS which indicates that this is the last - * data frame and that we should ask for it to be transmitted. We also - * must always set ICRC, because that is an internal bit that must be - * set to one for data descriptors. 
The remaining bits in the command
-	 * descriptor depend on checksumming and are determined based on the
-	 * information set up in i40e_tx_context().
-	 */
-	type = I40E_TX_DESC_DTYPE_DATA;
-	cmd = I40E_TX_DESC_CMD_EOP |
-	    I40E_TX_DESC_CMD_RS |
-	    I40E_TX_DESC_CMD_ICRC |
-	    tctx.itc_cmdflags;
-	txdesc->buffer_addr =
-	    CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
-	txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
-	    ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
-	    ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
-	    ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
+	tcb = tcbhead;
+	while (tcb != NULL) {
+
+		itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
+		if (tcb->tcb_type == I40E_TX_COPY) {
+			boolean_t last_desc = (tcb->tcb_next == NULL);
+
+			i40e_tx_set_data_desc(itrq, &tctx,
+			    (caddr_t)tcb->tcb_dma.dmab_dma_address,
+			    tcb->tcb_dma.dmab_len, last_desc);
+		} else {
+			boolean_t last_desc = B_FALSE;
+			ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
+
+			for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
+				last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
+				    (tcb->tcb_next == NULL);
+
+				i40e_tx_set_data_desc(itrq, &tctx,
+				    tcb->tcb_bind_info[c].dbi_paddr,
+				    tcb->tcb_bind_info[c].dbi_len,
+				    last_desc);
+			}
+		}
+
+		tcb = tcb->tcb_next;
+	}
 
 	/*
 	 * Now, finally, sync the DMA data and alert hardware.
@@ -2228,6 +2990,7 @@ i40e_ring_tx(void *arg, mblk_t *mp)
 	I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
 	    itrq->itrq_desc_tail);
+
 	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
 	    DDI_FM_OK) {
 		/*
@@ -2239,9 +3002,9 @@ i40e_ring_tx(void *arg, mblk_t *mp)
 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
 	}
 
-	txs->itxs_bytes.value.ui64 += mpsize;
+	txs->itxs_bytes.value.ui64 += msglen;
 	txs->itxs_packets.value.ui64++;
-	txs->itxs_descriptors.value.ui64++;
+	txs->itxs_descriptors.value.ui64 += needed_desc;
 
 	mutex_exit(&itrq->itrq_tx_lock);
 
@@ -2254,10 +3017,23 @@ txfail:
 	 * Make sure to reset their message blocks, since we'll return them
 	 * back to MAC.
 	 */
-	if (tcb != NULL) {
+	if (tcb_ctx != NULL) {
+		tcb_ctx->tcb_mp = NULL;
+		i40e_tcb_reset(tcb_ctx);
+		i40e_tcb_free(itrq, tcb_ctx);
+	}
+
+	tcb = tcbhead;
+	while (tcb != NULL) {
+		i40e_tx_control_block_t *next = tcb->tcb_next;
+
+		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
+		    tcb->tcb_type == I40E_TX_COPY);
+
 		tcb->tcb_mp = NULL;
 		i40e_tcb_reset(tcb);
 		i40e_tcb_free(itrq, tcb);
+		tcb = next;
 	}
 
 	mutex_enter(&itrq->itrq_tx_lock);
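
The LSO path in the hunks above keeps two running counters per MSS-sized segment: segsz, the bytes accumulated toward the current segment, and segdesc, the data descriptors that segment will consume. Whenever segsz crosses the MSS the counters restart (any spill-over counts as one descriptor toward the next segment), and if segdesc would reach i40e_lso_num_descs before the MSS is met, the driver undoes the bind and copies into a TCB buffer instead. The following standalone sketch mimics that accounting under the simplifying assumption that every fragment maps to exactly one DMA cookie; the fragment lengths, the limit of 8, and the main() harness are illustrative stand-ins, not driver code.

	#include <stdio.h>

	#define	LSO_NUM_DESCS	8	/* assumed per-segment descriptor limit */

	int
	main(void)
	{
		/* Hypothetical cookie lengths for one LSO payload. */
		const size_t frags[] = { 1448, 4096, 4096, 2048, 512 };
		const size_t mss = 1448;
		size_t segsz = 0;
		unsigned int segdesc = 0;
		int force_copy = 0;

		for (size_t i = 0; i < sizeof (frags) / sizeof (frags[0]); i++) {
			/* One cookie contributes one descriptor to the segment. */
			segsz += frags[i];
			segdesc++;

			/* Crossing the MSS restarts the per-segment counters. */
			if (segsz >= mss) {
				segsz = segsz % mss;
				segdesc = (segsz == 0) ? 0 : 1;
			}

			/*
			 * Reaching the limit before the MSS is what forces the
			 * driver to copy into a TCB instead of binding.
			 */
			if (segdesc == LSO_NUM_DESCS)
				force_copy = 1;

			(void) printf("frag %zu: segsz=%zu segdesc=%u copy=%d\n",
			    i, segsz, segdesc, force_copy);
		}
		return (0);
	}

With these stand-in lengths the descriptor count never reaches the limit, so no forced copy occurs; a run of many small cookies inside a single MSS window is what would trigger it.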
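
The ring-admission test in i40e_ring_tx() encodes the XL710 rule cited in the comment above: TAIL may never advance onto HEAD, so at most itrq_tx_ring_size - 1 descriptors can ever be outstanding, and a frame is queued only when its descriptor count fits under that reserve as well as above the recycle threshold. A minimal sketch of that test, using hypothetical local names (tx_ring_has_room, desc_free, block_thresh) rather than the driver's structures:

	#include <stdbool.h>
	#include <stdint.h>

	/*
	 * True when 'needed_desc' descriptors can be posted without the tail
	 * catching the head and without dropping below the blocking threshold.
	 */
	bool
	tx_ring_has_room(uint32_t desc_free, uint32_t block_thresh,
	    uint32_t needed_desc)
	{
		/* Too few free descriptors: stall and wait for recycling. */
		if (desc_free < block_thresh)
			return (false);

		/* Keep one descriptor in reserve so tail never lands on head. */
		if (desc_free - 1 < needed_desc)
			return (false);

		return (true);
	}

This mirrors the two-part check taken under itrq_tx_lock before any descriptors are consumed; when it fails, the driver unwinds its control blocks in the txfail path rather than partially filling the ring.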