-rw-r--r-- | usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile | 3
-rw-r--r-- | usr/src/cmd/mdb/intel/modules/i40e/i40e.c | 175
-rw-r--r-- | usr/src/man/man7d/i40e.7d | 20
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_gld.c | 103
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_intr.c | 190
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_main.c | 574
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_stats.c | 77
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_sw.h | 115
-rw-r--r-- | usr/src/uts/common/io/i40e/i40e_transceiver.c | 1160
9 files changed, 1932 insertions, 485 deletions
diff --git a/usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile b/usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile index 66f97451b6..f1632172f5 100644 --- a/usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile +++ b/usr/src/cmd/mdb/intel/modules/i40e/amd64/Makefile @@ -10,7 +10,7 @@ # # -# Copyright 2017 Joyent, Inc. +# Copyright 2018 Joyent, Inc. # MODULE = i40e.so @@ -23,6 +23,7 @@ include ../../../../../Makefile.cmd.64 include ../../../Makefile.amd64 include ../../../../Makefile.module +CPPFLAGS += -I$(SRC)/cmd/mdb/common CPPFLAGS += -I$(SRC)/uts/common/io/i40e CPPFLAGS += -I$(SRC)/uts/common/io/i40e/core CPPFLAGS += -I$(SRC)/uts/common diff --git a/usr/src/cmd/mdb/intel/modules/i40e/i40e.c b/usr/src/cmd/mdb/intel/modules/i40e/i40e.c index 6d1f900b43..3f42d24d1f 100644 --- a/usr/src/cmd/mdb/intel/modules/i40e/i40e.c +++ b/usr/src/cmd/mdb/intel/modules/i40e/i40e.c @@ -10,9 +10,10 @@ */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ +#include <mdb/mdb_ctf.h> #include <sys/mdb_modapi.h> #include "i40e_sw.h" @@ -97,9 +98,181 @@ i40e_switch_rsrcs_dcmd(uintptr_t addr, uint_t flags, int argc, return (DCMD_OK); } +typedef struct mdb_i40e_trqpair { + uint32_t itrq_tx_ring_size; + uint32_t itrq_desc_free; + uint32_t *itrq_desc_wbhead; + uint32_t itrq_desc_head; + uint32_t itrq_desc_tail; + i40e_tx_desc_t *itrq_desc_ring; + i40e_tx_control_block_t **itrq_tcb_work_list; +} mdb_i40e_trqpair_t; + +static void +i40e_tx_ring_help() +{ + mdb_printf( + "\t -a dump all ring entries\n" + "\t or\n" + "\t combine -b [start index] with -e [end index] to specify a \n" + "\t range of ring entries to print\n"); +} + +static int +i40e_tx_ring_dcmd(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + mdb_i40e_trqpair_t trq; + i40e_tx_desc_t *descring; + i40e_tx_control_block_t **wklist; + uint32_t wbhead; + size_t ringsz, wklistsz; + boolean_t opt_a = B_FALSE; + char *opt_b = NULL, *opt_e = NULL; + uint64_t begin = UINT64_MAX, end = UINT64_MAX; + + if (!(flags & DCMD_ADDRSPEC)) { + mdb_warn("::i40e_tx_ring does not operate globally\n"); + return (DCMD_USAGE); + } + + if (mdb_getopts(argc, argv, + 'a', MDB_OPT_SETBITS, B_TRUE, &opt_a, + 'b', MDB_OPT_STR, &opt_b, + 'e', MDB_OPT_STR, &opt_e, NULL) != argc) + return (DCMD_USAGE); + + /* + * Verify that a legal combination of -a/-b/-e were used. + */ + if (opt_a && (opt_b != NULL || opt_e != NULL)) { + mdb_warn("-a and -b/-e are mutually exclusive\n"); + return (DCMD_USAGE); + } + if (argc > 0 && ! opt_a && (opt_b == NULL || opt_e == NULL)) { + mdb_warn("-b/-e must both be specified\n"); + return (DCMD_USAGE); + } + + if (mdb_ctf_vread(&trq, "i40e_trqpair_t", "mdb_i40e_trqpair_t", addr, + 0) == -1) { + mdb_warn("failed to read i40e_trqpair_t at %p", addr); + return (DCMD_ERR); + } + + if (opt_b != NULL) + begin = mdb_strtoull(opt_b); + if (opt_e != NULL) + end = mdb_strtoull(opt_e); + if (opt_a) { + begin = 0; + end = trq.itrq_tx_ring_size - 1; + } + + /* + * Verify that the requested range of ring entries makes sense. 
+ */ + if (argc > 0 && (end < begin || begin >= trq.itrq_tx_ring_size || + end >= trq.itrq_tx_ring_size)) { + mdb_warn("invalid range specified\n"); + return (DCMD_USAGE); + } + + if (mdb_vread(&wbhead, sizeof (uint32_t), + (uintptr_t)trq.itrq_desc_wbhead) != sizeof (uint32_t)) { + mdb_warn("failed to read trq.itrq_desc_wbhead"); + return (DCMD_ERR); + } + mdb_printf("%-20s%d\n", "Ring Size:", trq.itrq_tx_ring_size); + mdb_printf("%-20s%d\n", "Free Descriptors:", trq.itrq_desc_free); + mdb_printf("%-20s%d\n", "Writeback Head:", wbhead); + mdb_printf("%-20s%d\n", "Head:", trq.itrq_desc_head); + mdb_printf("%-20s%d\n", "Tail:", trq.itrq_desc_tail); + + /* + * No arguments were specified, so we're done. + */ + if (argc == 0) + return (DCMD_OK); + + /* + * Allocate memory and read in the entire TX descriptor ring and + * TCB work list. + */ + ringsz = sizeof (i40e_tx_desc_t) * trq.itrq_tx_ring_size; + descring = mdb_alloc(ringsz, UM_SLEEP); + if (mdb_vread(descring, ringsz, (uintptr_t)trq.itrq_desc_ring) != + ringsz) { + mdb_warn("Failed to read in TX decriptor ring\n"); + mdb_free(descring, ringsz); + return (DCMD_ERR); + } + wklistsz = sizeof (i40e_tx_control_block_t *) * trq.itrq_tx_ring_size; + wklist = mdb_alloc(wklistsz, UM_SLEEP); + if (mdb_vread(wklist, wklistsz, (uintptr_t)trq.itrq_tcb_work_list) != + wklistsz) { + mdb_warn("Failed to read in TX TCB work list\n"); + mdb_free(descring, ringsz); + mdb_free(wklist, wklistsz); + return (DCMD_ERR); + } + + mdb_printf("\n%-10s %-10s %-16s %-16s %-10s\n", "Index", "Desc Type", + "Desc Ptr", "TCB Ptr", "Other"); + for (uint64_t i = begin; i <= end; i++) { + const char *dtype; + char dother[17]; + i40e_tx_desc_t *dptr; + i40e_tx_control_block_t *tcbptr; + uint64_t ctob; + + dptr = &descring[i]; + tcbptr = wklist[i]; + ctob = LE_64(dptr->cmd_type_offset_bsz); + if (ctob == 0) { + dtype = "FREE"; + } else { + switch (ctob & I40E_TXD_QW1_DTYPE_MASK) { + case (I40E_TX_DESC_DTYPE_CONTEXT): + dtype = "CONTEXT"; + break; + case (I40E_TX_DESC_DTYPE_DATA): + dtype = "DATA"; + break; + case (I40E_TX_DESC_DTYPE_FILTER_PROG): + dtype = "FILTER"; + break; + default: + dtype = "UNKNOWN"; + } + } + dother[0] = '\0'; + if (i == wbhead) + (void) strcat(dother, "WBHEAD"); + + if (i == trq.itrq_desc_head) + (void) strcat(dother, + strlen(dother) > 0 ? " HEAD" : "HEAD"); + + if (i == trq.itrq_desc_tail) + (void) strcat(dother, + strlen(dother) > 0 ? " TAIL" : "TAIL"); + + mdb_printf("%-10d %-10s %-16p %-16p %-10s\n", i, dtype, dptr, + tcbptr, dother); + } + + mdb_free(descring, ringsz); + mdb_free(wklist, wklistsz); + return (DCMD_OK); +} + static const mdb_dcmd_t i40e_dcmds[] = { { "i40e_switch_rsrcs", NULL, "print switch resources", i40e_switch_rsrcs_dcmd, NULL }, + { "i40e_tx_ring", "[-a] -b [start index] -e [end index]\n", + "dump TX descriptor ring state", i40e_tx_ring_dcmd, + i40e_tx_ring_help }, { NULL } }; diff --git a/usr/src/man/man7d/i40e.7d b/usr/src/man/man7d/i40e.7d index 2d8a2da45b..f025fba01a 100644 --- a/usr/src/man/man7d/i40e.7d +++ b/usr/src/man/man7d/i40e.7d @@ -9,9 +9,9 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright (c) 2017 Joyent, Inc. +.\" Copyright (c) 2018 Joyent, Inc. .\" -.Dd September 8, 2017 +.Dd May 23, 2018 .Dt I40E 7D .Os .Sh NAME @@ -273,6 +273,22 @@ binding. By setting this property to its maximum, all frames will be processed by copying the frame. 
.Ed +.It Sy tx_lso_enable +.Bd -filled -compact +Minimum: +.Sy 0 | +Maximum: +.Sy 1 +.Ed +.Bd -filled +The +.Sy tx_lso_enable +property controls whether or not the device enables support for Large Segment +Offloand (LSO) when transmitting packets. +The default is to always enable support for this. +Turning it off will decrease throughput when transmitting packets, but should +be done if a hardware bug is suspected. +.Ed .El .Sh ARCHITECTURE The diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c index d34057d64f..ccf814be0b 100644 --- a/usr/src/uts/common/io/i40e/i40e_gld.c +++ b/usr/src/uts/common/io/i40e/i40e_gld.c @@ -39,7 +39,8 @@ char *i40e_priv_props[] = { static int i40e_group_remove_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; struct i40e_aqc_remove_macvlan_element_data filt; struct i40e_hw *hw = &i40e->i40e_hw_space; int ret, i, last; @@ -107,10 +108,11 @@ done: static int i40e_group_add_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; - struct i40e_hw *hw = &i40e->i40e_hw_space; - int i, ret; - i40e_uaddr_t *iua; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; + struct i40e_hw *hw = &i40e->i40e_hw_space; + int i, ret; + i40e_uaddr_t *iua; struct i40e_aqc_add_macvlan_element_data filt; if (I40E_IS_MULTICAST(mac_addr)) @@ -136,16 +138,12 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) } } - /* - * Note, the general use of the i40e_vsi_id will have to be refactored - * when we have proper group support. - */ bzero(&filt, sizeof (filt)); bcopy(mac_addr, filt.mac_addr, ETHERADDRL); filt.flags = I40E_AQC_MACVLAN_ADD_PERFECT_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, rxg->irg_vsi_seid, &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to unicast filter: %d", @@ -157,7 +155,7 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) iua = &i40e->i40e_uaddrs[i40e->i40e_resources.ifr_nmacfilt_used]; bcopy(mac_addr, iua->iua_mac, ETHERADDRL); - iua->iua_vsi = i40e->i40e_vsi_id; + iua->iua_vsi = rxg->irg_vsi_seid; i40e->i40e_resources.ifr_nmacfilt_used++; ASSERT(i40e->i40e_resources.ifr_nmacfilt_used <= i40e->i40e_resources.ifr_nmacfilt); @@ -227,7 +225,7 @@ i40e_m_promisc(void *arg, boolean_t on) } - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " @@ -246,7 +244,7 @@ i40e_m_promisc(void *arg, boolean_t on) goto done; } - ret = i40e_aq_set_vsi_multicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_multicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s multicast promiscuity on " @@ -257,8 +255,8 @@ i40e_m_promisc(void *arg, boolean_t on) * Try our best to put us back into a state that MAC expects us * to be in. 
*/ - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, - !on, NULL, B_FALSE); + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, + I40E_DEF_VSI_SEID(i40e), !on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " "the default VSI after toggling multicast failed: " @@ -294,11 +292,11 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 0 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_TRUE, NULL); + I40E_DEF_VSI_SEID(i40e), B_TRUE, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to enable multicast " "promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -312,7 +310,7 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_ADD_HASH_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to multicast filter: %d", @@ -353,8 +351,8 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_DEL_HASH_MATCH | I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; - if (i40e_aq_remove_macvlan(hw, i40e->i40e_vsi_id, - &filt, 1, NULL) != I40E_SUCCESS) { + if (i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, + 1, NULL) != I40E_SUCCESS) { i40e_error(i40e, "failed to remove mac address " "%2x:%2x:%2x:%2x:%2x:%2x from multicast " "filter: %d", @@ -381,11 +379,11 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 1 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_FALSE, NULL); + I40E_DEF_VSI_SEID(i40e), B_FALSE, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to disable " "multicast promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -490,7 +488,7 @@ i40e_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index, * we're not actually grouping things tx-wise at this time. */ ASSERT(group_index == -1); - ASSERT(ring_index < i40e->i40e_num_trqpairs); + ASSERT(ring_index < i40e->i40e_num_trqpairs_per_vsi); itrq->itrq_mactxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -516,15 +514,16 @@ i40e_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index, { i40e_t *i40e = arg; mac_intr_t *mintr = &infop->mri_intr; - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[ring_index]; + uint_t trqpair_index; + i40e_trqpair_t *itrq; - /* - * We assert the group number and ring index to help sanity check - * ourselves and mark that we'll need to rework this when we have - * multiple groups. - */ - ASSERT3S(group_index, ==, 0); - ASSERT3S(ring_index, <, i40e->i40e_num_trqpairs); + /* This assumes static groups. 
*/ + ASSERT3S(group_index, >=, 0); + ASSERT3S(ring_index, >=, 0); + trqpair_index = (group_index * i40e->i40e_num_trqpairs_per_vsi) + + ring_index; + ASSERT3U(trqpair_index, <, i40e->i40e_num_trqpairs); + itrq = &i40e->i40e_trqpairs[trqpair_index]; itrq->itrq_macrxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -552,24 +551,22 @@ i40e_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { i40e_t *i40e = arg; + i40e_rx_group_t *rxg; if (rtype != MAC_RING_TYPE_RX) return; - /* - * Note, this is a simplified view of a group, given that we only have a - * single group and a single ring at the moment. We'll want to expand - * upon this as we leverage more hardware functionality. - */ - i40e->i40e_rx_group_handle = gh; - infop->mgi_driver = (mac_group_driver_t)i40e; + rxg = &i40e->i40e_rx_groups[index]; + rxg->irg_grp_hdl = gh; + + infop->mgi_driver = (mac_group_driver_t)rxg; infop->mgi_start = NULL; infop->mgi_stop = NULL; infop->mgi_addmac = i40e_group_add_mac; infop->mgi_remmac = i40e_group_remove_mac; - ASSERT(i40e->i40e_num_rx_groups == I40E_GROUP_MAX); - infop->mgi_count = i40e->i40e_num_trqpairs; + ASSERT(i40e->i40e_num_rx_groups <= I40E_GROUP_MAX); + infop->mgi_count = i40e->i40e_num_trqpairs_per_vsi; } static int @@ -732,20 +729,32 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (i40e->i40e_tx_lso_enable == B_TRUE) { + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + cap_lso->lso_basic_tcp_ipv4.lso_max = I40E_LSO_MAXLEN; + } else { + return (B_FALSE); + } + break; + } + case MAC_CAPAB_RINGS: cap_rings = cap_data; cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; switch (cap_rings->mr_type) { case MAC_RING_TYPE_TX: /* - * Note, saying we have no rings, but some number of - * groups indicates to MAC that it should create - * psuedo-groups with one for each TX ring. This may not - * be the long term behavior we want, but it'll work for - * now. + * Note, saying we have no groups, but some + * number of rings indicates to MAC that it + * should create psuedo-groups with one for + * each TX ring. This may not be the long term + * behavior we want, but it'll work for now. */ cap_rings->mr_gnum = 0; - cap_rings->mr_rnum = i40e->i40e_num_trqpairs; + cap_rings->mr_rnum = i40e->i40e_num_trqpairs_per_vsi; cap_rings->mr_rget = i40e_fill_tx_ring; cap_rings->mr_gget = NULL; cap_rings->mr_gaddring = NULL; @@ -754,7 +763,7 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) case MAC_RING_TYPE_RX: cap_rings->mr_rnum = i40e->i40e_num_trqpairs; cap_rings->mr_rget = i40e_fill_rx_ring; - cap_rings->mr_gnum = I40E_GROUP_MAX; + cap_rings->mr_gnum = i40e->i40e_num_rx_groups; cap_rings->mr_gget = i40e_fill_rx_group; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c index 51d1bbac92..170bef7ec6 100644 --- a/usr/src/uts/common/io/i40e/i40e_intr.c +++ b/usr/src/uts/common/io/i40e/i40e_intr.c @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -229,12 +229,20 @@ i40e_intr_adminq_disable(i40e_t *i40e) I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg); } +/* + * The next two functions enable/disable the reception of interrupts + * on the given vector. 
Only vectors 1..N are programmed by these + * functions; vector 0 is special and handled by a different register. + * We must subtract one from the vector because i40e implicitly adds + * one to the vector value. See section 10.2.2.10.13 for more details. + */ static void i40e_intr_io_enable(i40e_t *i40e, int vector) { uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_PFINT_DYN_CTLN_INTENA_MASK | I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); @@ -247,6 +255,7 @@ i40e_intr_io_disable(i40e_t *i40e, int vector) uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg); } @@ -375,49 +384,109 @@ i40e_intr_chip_fini(i40e_t *i40e) } /* - * Enable all of the queues and set the corresponding LNKLSTN registers. Note - * that we always enable queues as interrupt sources, even though we don't - * enable the MSI-X interrupt vectors. + * Set the head of the interrupt linked list. The PFINT_LNKLSTN[N] + * register actually refers to the 'N + 1' interrupt vector. E.g., + * PFINT_LNKLSTN[0] refers to interrupt vector 1. + */ +static void +i40e_set_lnklstn(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = (queue << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + + I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(vector), reg); + DEBUGOUT2("PFINT_LNKLSTN[%u] = 0x%x", vector, reg); +} + +/* + * Set the QINT_RQCTL[queue] register. The next queue is always the Tx + * queue associated with this Rx queue. Unlike PFINT_LNKLSTN, the + * vector should be the actual vector this queue is on -- i.e., it + * should be equal to itrq_rx_intrvec. + */ +static void +i40e_set_rqctl(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_rx_intrvec); + + reg = (vector << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (queue << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_RQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg); + DEBUGOUT2("QINT_RQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Like i40e_set_rqctl(), but for QINT_TQCTL[queue]. The next queue is + * either the Rx queue of another TRQP, or EOL. + */ +static void +i40e_set_tqctl(i40e_t *i40e, uint_t vector, uint_t queue, uint_t next_queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_tx_intrvec); + + reg = (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (next_queue << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_TQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(queue), reg); + DEBUGOUT2("QINT_TQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Program the interrupt linked list. Each vector has a linked list of + * queues which act as event sources for that vector. When one of + * those sources has an event the associated interrupt vector is + * fired. This mapping must match the mapping found in + * i40e_map_intrs_to_vectors(). 
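/*
 * Illustrative sketch (editor's addition, not part of the diff): the
 * vector/queue arithmetic described above, in standalone user-space
 * form.  The vector and queue counts (4 and 8) are examples only.
 * Vector 0 is reserved for the admin queue, so queue q is serviced by
 * vector (q % (nvectors - 1)) + 1 -- the same relationship that
 * i40e_map_intrs_to_vectors() establishes.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int nvectors = 4;		/* example i40e_intr_count */
	unsigned int nqueues = 8;		/* example i40e_num_trqpairs */
	unsigned int io_vectors = nvectors - 1;	/* vector 0 is admin-only */

	/* Walk each vector's linked list of queues, as programmed above. */
	for (unsigned int vec = 0; vec < io_vectors; vec++) {
		printf("vector %u services queues:", vec + 1);
		for (unsigned int q = vec; q < nqueues; q += io_vectors)
			printf(" %u", q);
		printf("\n");
	}

	/* The inverse mapping, used when programming each queue pair. */
	for (unsigned int q = 0; q < nqueues; q++)
		printf("queue %u -> vector %u\n", q, (q % io_vectors) + 1);

	return (0);
}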
+ * + * See section 7.5.3 for more information about the configuration of + * the interrupt linked list. */ static void i40e_intr_init_queue_msix(i40e_t *i40e) { - i40e_hw_t *hw = &i40e->i40e_hw_space; - uint32_t reg; - int i; + uint_t intr_count; /* - * Map queues to MSI-X interrupts. Queue i is mapped to vector i + 1. - * Note that we skip the ITR logic for the moment, just to make our - * lives as explicit and simple as possible. + * The 0th vector is for 'Other Interrupts' only (subject to + * change in the future). */ - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + intr_count = i40e->i40e_intr_count - 1; - reg = (i << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << - I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); - I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(i), reg); + for (uint_t vec = 0; vec < intr_count; vec++) { + boolean_t head = B_TRUE; - reg = - (itrq->itrq_rx_intrvec << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (i << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_RQCTL_CAUSE_ENA_MASK; + for (uint_t qidx = vec; qidx < i40e->i40e_num_trqpairs; + qidx += intr_count) { + uint_t next_qidx = qidx + intr_count; - I40E_WRITE_REG(hw, I40E_QINT_RQCTL(i), reg); + next_qidx = (next_qidx > i40e->i40e_num_trqpairs) ? + I40E_QUEUE_TYPE_EOL : next_qidx; - reg = - (itrq->itrq_tx_intrvec << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_TQCTL_CAUSE_ENA_MASK; + if (head) { + i40e_set_lnklstn(i40e, vec, qidx); + head = B_FALSE; + } - I40E_WRITE_REG(hw, I40E_QINT_TQCTL(i), reg); + i40e_set_rqctl(i40e, vec + 1, qidx); + i40e_set_tqctl(i40e, vec + 1, qidx, next_qidx); + } } - } /* @@ -604,31 +673,26 @@ i40e_intr_adminq_work(i40e_t *i40e) } static void -i40e_intr_rx_work(i40e_t *i40e, int queue) +i40e_intr_rx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { mblk_t *mp = NULL; - i40e_trqpair_t *itrq; - - ASSERT(queue < i40e->i40e_num_trqpairs); - itrq = &i40e->i40e_trqpairs[queue]; mutex_enter(&itrq->itrq_rx_lock); if (!itrq->itrq_intr_poll) mp = i40e_ring_rx(itrq, I40E_POLL_NULL); mutex_exit(&itrq->itrq_rx_lock); - if (mp != NULL) { - mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, - itrq->itrq_rxgen); - } + if (mp == NULL) + return; + + mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, + itrq->itrq_rxgen); } +/* ARGSUSED */ static void -i40e_intr_tx_work(i40e_t *i40e, int queue) +i40e_intr_tx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { - i40e_trqpair_t *itrq; - - itrq = &i40e->i40e_trqpairs[queue]; i40e_tx_recycle_ring(itrq); } @@ -665,11 +729,17 @@ i40e_intr_other_work(i40e_t *i40e) i40e_intr_adminq_enable(i40e); } +/* + * Handle an MSI-X interrupt. See section 7.5.1.3 for an overview of + * the MSI-X interrupt sequence. 
+ */ uint_t i40e_intr_msix(void *arg1, void *arg2) { i40e_t *i40e = (i40e_t *)arg1; - int vector_idx = (int)(uintptr_t)arg2; + uint_t vector_idx = (uint_t)(uintptr_t)arg2; + + ASSERT3U(vector_idx, <, i40e->i40e_intr_count); /* * When using MSI-X interrupts, vector 0 is always reserved for the @@ -681,10 +751,29 @@ i40e_intr_msix(void *arg1, void *arg2) return (DDI_INTR_CLAIMED); } - i40e_intr_rx_work(i40e, vector_idx - 1); - i40e_intr_tx_work(i40e, vector_idx - 1); - i40e_intr_io_enable(i40e, vector_idx); + ASSERT3U(vector_idx, >, 0); + /* + * We determine the queue indexes via simple arithmetic (as + * opposed to keeping explicit state like a bitmap). While + * conveinent, it does mean that i40e_map_intrs_to_vectors(), + * i40e_intr_init_queue_msix(), and this function must be + * modified as a unit. + * + * We subtract 1 from the vector to offset the addition we + * performed during i40e_map_intrs_to_vectors(). + */ + for (uint_t i = vector_idx - 1; i < i40e->i40e_num_trqpairs; + i += (i40e->i40e_intr_count - 1)) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + + ASSERT3U(i, <, i40e->i40e_num_trqpairs); + ASSERT3P(itrq, !=, NULL); + i40e_intr_rx_work(i40e, itrq); + i40e_intr_tx_work(i40e, itrq); + } + + i40e_intr_io_enable(i40e, vector_idx); return (DDI_INTR_CLAIMED); } @@ -693,6 +782,7 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) { i40e_hw_t *hw = &i40e->i40e_hw_space; uint32_t reg; + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[0]; int ret = DDI_INTR_CLAIMED; if (shared == B_TRUE) { @@ -722,10 +812,10 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) i40e_intr_adminq_work(i40e); if (reg & I40E_INTR_NOTX_RX_MASK) - i40e_intr_rx_work(i40e, 0); + i40e_intr_rx_work(i40e, itrq); if (reg & I40E_INTR_NOTX_TX_MASK) - i40e_intr_tx_work(i40e, 0); + i40e_intr_tx_work(i40e, itrq); done: i40e_intr_adminq_enable(i40e); diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c index 54aef43424..99c64abe8c 100644 --- a/usr/src/uts/common/io/i40e/i40e_main.c +++ b/usr/src/uts/common/io/i40e/i40e_main.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -188,14 +188,15 @@ * VSI Management * -------------- * - * At this time, we currently only support a single MAC group, and thus a single - * VSI. This VSI is considered the default VSI and should be the only one that - * exists after a reset. Currently it is stored as the member - * i40e_t`i40e_vsi_id. While this works for the moment and for an initial - * driver, it's not sufficient for the longer-term path of the driver. Instead, - * we'll want to actually have a unique i40e_vsi_t structure which is used - * everywhere. Note that this means that every place that uses the - * i40e_t`i40e_vsi_id will need to be refactored. + * The PFs share 384 VSIs. The firmware creates one VSI per PF by default. + * During chip start we retrieve the SEID of this VSI and assign it as the + * default VSI for our VEB (one VEB per PF). We then add additional VSIs to + * the VEB up to the determined number of rx groups: i40e_t`i40e_num_rx_groups. + * We currently cap this number to I40E_GROUP_MAX to a) make sure all PFs can + * allocate the same number of VSIs, and b) to keep the interrupt multiplexing + * under control. In the future, when we improve the interrupt allocation, we + * may want to revisit this cap to make better use of the available VSIs. 
The + * VSI allocation and configuration can be found in i40e_chip_start(). * * ---------------- * Structure Layout @@ -240,7 +241,7 @@ * | i40e_hw_t --+---> Intel common code structure * | mac_handle_t --+---> GLDv3 handle to MAC * | ddi_periodic_t --+---> Link activity timer - * | int (vsi_id) --+---> VSI ID, main identifier + * | i40e_vsi_t * --+---> Array of VSIs * | i40e_func_rsrc_t --+---> Available hardware resources * | i40e_switch_rsrc_t * --+---> Switch resource snapshot * | i40e_sdu --+---> Current MTU @@ -249,11 +250,10 @@ * | i40e_maddr_t * --+---> Array of assigned multicast MACs * | i40e_mcast_promisccount --+---> Active multicast state * | i40e_promisc_on --+---> Current promiscuous mode state - * | int --+---> Number of transmit/receive pairs + * | uint_t --+---> Number of transmit/receive pairs + * | i40e_rx_group_t * --+---> Array of Rx groups * | kstat_t * --+---> PF kstats - * | kstat_t * --+---> VSI kstats * | i40e_pf_stats_t --+---> PF kstat backing data - * | i40e_vsi_stats_t --+---> VSI kstat backing data * | i40e_trqpair_t * --+---------+ * +---------------------------+ | * | @@ -359,8 +359,6 @@ * While bugs have been filed to cover this future work, the following gives an * overview of expected work: * - * o TSO support - * o Multiple group support * o DMA binding and breaking up the locking in ring recycling. * o Enhanced detection of device errors * o Participation in IRM @@ -371,7 +369,7 @@ #include "i40e_sw.h" -static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.1"; +static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.3"; /* * The i40e_glock primarily protects the lists below and the i40e_device_t @@ -761,15 +759,16 @@ i40e_fm_ereport(i40e_t *i40e, char *detail) } /* - * Here we're trying to get the ID of the default VSI. In general, when we come - * through and look at this shortly after attach, we expect there to only be a - * single element present, which is the default VSI. Importantly, each PF seems - * to not see any other devices, in part because of the simple switch mode that - * we're using. If for some reason, we see more artifact, we'll need to revisit - * what we're doing here. + * Here we're trying to set the SEID of the default VSI. In general, + * when we come through and look at this shortly after attach, we + * expect there to only be a single element present, which is the + * default VSI. Importantly, each PF seems to not see any other + * devices, in part because of the simple switch mode that we're + * using. If for some reason, we see more artifacts, we'll need to + * revisit what we're doing here. */ -static int -i40e_get_vsi_id(i40e_t *i40e) +static boolean_t +i40e_set_def_vsi_seid(i40e_t *i40e) { i40e_hw_t *hw = &i40e->i40e_hw_space; struct i40e_aqc_get_switch_config_resp *sw_config; @@ -784,17 +783,43 @@ i40e_get_vsi_id(i40e_t *i40e) if (rc != I40E_SUCCESS) { i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", rc, hw->aq.asq_last_status); - return (-1); + return (B_FALSE); } if (LE_16(sw_config->header.num_reported) != 1) { i40e_error(i40e, "encountered multiple (%d) switching units " "during attach, not proceeding", LE_16(sw_config->header.num_reported)); + return (B_FALSE); + } + + I40E_DEF_VSI_SEID(i40e) = sw_config->element[0].seid; + return (B_TRUE); +} + +/* + * Get the SEID of the uplink MAC. 
+ */ +static int +i40e_get_mac_seid(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + struct i40e_aqc_get_switch_config_resp *sw_config; + uint8_t aq_buf[I40E_AQ_LARGE_BUF]; + uint16_t next = 0; + int rc; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf; + rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next, + NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", + rc, hw->aq.asq_last_status); return (-1); } - return (sw_config->element[0].seid); + return (LE_16(sw_config->element[0].uplink_seid)); } /* @@ -1098,11 +1123,16 @@ i40e_disable_interrupts(i40e_t *i40e) static void i40e_free_trqpairs(i40e_t *i40e) { - int i; i40e_trqpair_t *itrq; + if (i40e->i40e_rx_groups != NULL) { + kmem_free(i40e->i40e_rx_groups, + sizeof (i40e_rx_group_t) * i40e->i40e_num_rx_groups); + i40e->i40e_rx_groups = NULL; + } + if (i40e->i40e_trqpairs != NULL) { - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { itrq = &i40e->i40e_trqpairs[i]; mutex_destroy(&itrq->itrq_rx_lock); mutex_destroy(&itrq->itrq_tx_lock); @@ -1133,7 +1163,6 @@ i40e_free_trqpairs(i40e_t *i40e) static boolean_t i40e_alloc_trqpairs(i40e_t *i40e) { - int i; void *mutexpri = DDI_INTR_PRI(i40e->i40e_intr_pri); /* @@ -1146,7 +1175,7 @@ i40e_alloc_trqpairs(i40e_t *i40e) i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) * i40e->i40e_num_trqpairs, KM_SLEEP); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; itrq->itrq_i40e = i40e; @@ -1156,6 +1185,16 @@ i40e_alloc_trqpairs(i40e_t *i40e) itrq->itrq_index = i; } + i40e->i40e_rx_groups = kmem_zalloc(sizeof (i40e_rx_group_t) * + i40e->i40e_num_rx_groups, KM_SLEEP); + + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_rx_group_t *rxg = &i40e->i40e_rx_groups[i]; + + rxg->irg_index = i; + rxg->irg_i40e = i40e; + } + return (B_TRUE); } @@ -1164,16 +1203,19 @@ i40e_alloc_trqpairs(i40e_t *i40e) /* * Unless a .conf file already overrode i40e_t structure values, they will * be 0, and need to be set in conjunction with the now-available HW report. - * - * However, at the moment, we cap all of these resources as we only support a - * single receive ring and a single group. */ /* ARGSUSED */ static void i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw) { - if (i40e->i40e_num_trqpairs == 0) { - i40e->i40e_num_trqpairs = I40E_TRQPAIR_MAX; + if (i40e->i40e_num_trqpairs_per_vsi == 0) { + if (i40e_is_x722(i40e)) { + i40e->i40e_num_trqpairs_per_vsi = + I40E_722_MAX_TC_QUEUES; + } else { + i40e->i40e_num_trqpairs_per_vsi = + I40E_710_MAX_TC_QUEUES; + } } if (i40e->i40e_num_rx_groups == 0) { @@ -1309,12 +1351,11 @@ i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw) } /* - * We need to obtain the Virtual Station ID (VSI) before we can - * perform other operations on the device. + * We need to obtain the Default Virtual Station SEID (VSI) + * before we can perform other operations on the device. 
*/ - i40e->i40e_vsi_id = i40e_get_vsi_id(i40e); - if (i40e->i40e_vsi_id == -1) { - i40e_error(i40e, "failed to obtain VSI ID"); + if (!i40e_set_def_vsi_seid(i40e)) { + i40e_error(i40e, "failed to obtain Default VSI SEID"); return (B_FALSE); } @@ -1559,6 +1600,9 @@ i40e_init_properties(i40e_t *i40e) i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_tx_lso_enable = i40e_get_prop(i40e, "tx_lso_enable", + B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); @@ -1728,15 +1772,56 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) } i40e->i40e_intr_type = 0; + i40e->i40e_num_rx_groups = I40E_GROUP_MAX; + /* + * We need to determine the number of queue pairs per traffic + * class. We only have one traffic class (TC0), so we'll base + * this off the number of interrupts provided. Furthermore, + * since we only use one traffic class, the number of queues + * per traffic class and per VSI are the same. + */ if ((intr_types & DDI_INTR_TYPE_MSIX) && - i40e->i40e_intr_force <= I40E_INTR_MSIX) { - if (i40e_alloc_intr_handles(i40e, devinfo, - DDI_INTR_TYPE_MSIX)) { - i40e->i40e_num_trqpairs = - MIN(i40e->i40e_intr_count - 1, max_trqpairs); - return (B_TRUE); - } + (i40e->i40e_intr_force <= I40E_INTR_MSIX) && + (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX))) { + uint32_t n; + + /* + * While we want the number of queue pairs to match + * the number of interrupts, we must keep stay in + * bounds of the maximum number of queues per traffic + * class. We subtract one from i40e_intr_count to + * account for interrupt zero; which is currently + * restricted to admin queue commands and other + * interrupt causes. + */ + n = MIN(i40e->i40e_intr_count - 1, max_trqpairs); + ASSERT3U(n, >, 0); + + /* + * Round up to the nearest power of two to ensure that + * the QBASE aligns with the TC size which must be + * programmed as a power of two. See the queue mapping + * description in section 7.4.9.5.5.1. + * + * If i40e_intr_count - 1 is not a power of two then + * some queue pairs on the same VSI will have to share + * an interrupt. + * + * We may want to revisit this logic in a future where + * we have more interrupts and more VSIs. Otherwise, + * each VSI will use as many interrupts as possible. + * Using more QPs per VSI means better RSS for each + * group, but at the same time may require more + * sharing of interrupts across VSIs. This may be a + * good candidate for a .conf tunable. + */ + n = 0x1 << ddi_fls(n); + i40e->i40e_num_trqpairs_per_vsi = n; + ASSERT3U(i40e->i40e_num_rx_groups, >, 0); + i40e->i40e_num_trqpairs = i40e->i40e_num_trqpairs_per_vsi * + i40e->i40e_num_rx_groups; + return (B_TRUE); } /* @@ -1745,6 +1830,7 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) * single MSI interrupt. */ i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX; + i40e->i40e_num_trqpairs_per_vsi = i40e->i40e_num_trqpairs; i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX; if ((intr_types & DDI_INTR_TYPE_MSI) && @@ -1767,24 +1853,20 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) static boolean_t i40e_map_intrs_to_vectors(i40e_t *i40e) { - int i; - if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) { return (B_TRUE); } /* - * Each queue pair is mapped to a single interrupt, so transmit - * and receive interrupts for a given queue share the same vector. 
- * The number of queue pairs is one less than the number of interrupt - * vectors and is assigned the vector one higher than its index. - * Vector zero is reserved for the admin queue. + * Each queue pair is mapped to a single interrupt, so + * transmit and receive interrupts for a given queue share the + * same vector. Vector zero is reserved for the admin queue. */ - ASSERT(i40e->i40e_intr_count == i40e->i40e_num_trqpairs + 1); + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { + uint_t vector = i % (i40e->i40e_intr_count - 1); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e->i40e_trqpairs[i].itrq_rx_intrvec = i + 1; - i40e->i40e_trqpairs[i].itrq_tx_intrvec = i + 1; + i40e->i40e_trqpairs[i].itrq_rx_intrvec = vector + 1; + i40e->i40e_trqpairs[i].itrq_tx_intrvec = vector + 1; } return (B_TRUE); @@ -1923,89 +2005,251 @@ i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw) } /* - * Configure the hardware for the Virtual Station Interface (VSI). Currently - * we only support one, but in the future we could instantiate more than one - * per attach-point. + * Set the properties which have common values across all the VSIs. + * Consult the "Add VSI" command section (7.4.9.5.5.1) for a + * complete description of these properties. */ -static boolean_t -i40e_config_vsi(i40e_t *i40e, i40e_hw_t *hw) +static void +i40e_set_shared_vsi_props(i40e_t *i40e, + struct i40e_aqc_vsi_properties_data *info, uint_t vsi_idx) { - struct i40e_vsi_context context; - int err, tc_queues; - - bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; - context.pf_num = hw->pf_id; - err = i40e_aq_get_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "get VSI params failed with %d", err); - return (B_FALSE); - } + uint_t tc_queues; + uint16_t vsi_qp_base; - i40e->i40e_vsi_num = context.vsi_number; + /* + * It's important that we use bitwise-OR here; callers to this + * function might enable other sections before calling this + * function. + */ + info->valid_sections |= LE_16(I40E_AQ_VSI_PROP_QUEUE_MAP_VALID | + I40E_AQ_VSI_PROP_VLAN_VALID); /* - * Set the queue and traffic class bits. Keep it simple for now. + * Calculate the starting QP index for this VSI. This base is + * relative to the PF queue space; so a value of 0 for PF#1 + * represents the absolute index PFLAN_QALLOC_FIRSTQ for PF#1. */ - context.info.valid_sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; - context.info.mapping_flags = I40E_AQ_VSI_QUE_MAP_CONTIG; - context.info.queue_mapping[0] = I40E_ASSIGN_ALL_QUEUES; + vsi_qp_base = vsi_idx * i40e->i40e_num_trqpairs_per_vsi; + info->mapping_flags = LE_16(I40E_AQ_VSI_QUE_MAP_CONTIG); + info->queue_mapping[0] = + LE_16((vsi_qp_base << I40E_AQ_VSI_QUEUE_SHIFT) & + I40E_AQ_VSI_QUEUE_MASK); /* - * tc_queues determines the size of the traffic class, where the size is - * 2^^tc_queues to a maximum of 64 for the X710 and 128 for the X722. + * tc_queues determines the size of the traffic class, where + * the size is 2^^tc_queues to a maximum of 64 for the X710 + * and 128 for the X722. * * Some examples: - * i40e_num_trqpairs == 1 => tc_queues = 0, 2^^0 = 1. - * i40e_num_trqpairs == 7 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 8 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 9 => tc_queues = 4, 2^^4 = 16. - * i40e_num_trqpairs == 17 => tc_queues = 5, 2^^5 = 32. - * i40e_num_trqpairs == 64 => tc_queues = 6, 2^^6 = 64. + * i40e_num_trqpairs_per_vsi == 1 => tc_queues = 0, 2^^0 = 1. 
+ * i40e_num_trqpairs_per_vsi == 7 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 8 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 9 => tc_queues = 4, 2^^4 = 16. + * i40e_num_trqpairs_per_vsi == 17 => tc_queues = 5, 2^^5 = 32. + * i40e_num_trqpairs_per_vsi == 64 => tc_queues = 6, 2^^6 = 64. */ - tc_queues = ddi_fls(i40e->i40e_num_trqpairs - 1); + tc_queues = ddi_fls(i40e->i40e_num_trqpairs_per_vsi - 1); - context.info.tc_mapping[0] = ((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & - I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | - ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & - I40E_AQ_VSI_TC_QUE_NUMBER_MASK); + /* + * The TC queue mapping is in relation to the VSI queue space. + * Since we are only using one traffic class (TC0) we always + * start at queue offset 0. + */ + info->tc_mapping[0] = + LE_16(((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & + I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | + ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & + I40E_AQ_VSI_TC_QUE_NUMBER_MASK)); - context.info.valid_sections |= I40E_AQ_VSI_PROP_VLAN_VALID; - context.info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | + /* + * I40E_AQ_VSI_PVLAN_MODE_ALL ("VLAN driver insertion mode") + * + * Allow tagged and untagged packets to be sent to this + * VSI from the host. + * + * I40E_AQ_VSI_PVLAN_EMOD_NOTHING ("VLAN and UP expose mode") + * + * Leave the tag on the frame and place no VLAN + * information in the descriptor. We want this mode + * because our MAC layer will take care of the VLAN tag, + * if there is one. + */ + info->port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | I40E_AQ_VSI_PVLAN_EMOD_NOTHING; +} - context.flags = LE16_TO_CPU(I40E_AQ_VSI_TYPE_PF); +/* + * Delete the VSI at this index, if one exists. We assume there is no + * action we can take if this command fails but to log the failure. + */ +static void +i40e_delete_vsi(i40e_t *i40e, uint_t idx) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint16_t seid = i40e->i40e_vsis[idx].iv_seid; - i40e->i40e_vsi_stat_id = LE16_TO_CPU(context.info.stat_counter_idx); - if (i40e_stat_vsi_init(i40e) == B_FALSE) - return (B_FALSE); + if (seid != 0) { + int rc; - err = i40e_aq_update_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "Update VSI params failed with %d", err); + rc = i40e_aq_delete_element(hw, seid, NULL); + + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VSI %d: %d", + rc, hw->aq.asq_last_status); + } + + i40e->i40e_vsis[idx].iv_seid = 0; + } +} + +/* + * Add a new VSI. + */ +static boolean_t +i40e_add_vsi(i40e_t *i40e, i40e_hw_t *hw, uint_t idx) +{ + struct i40e_vsi_context ctx; + i40e_rx_group_t *rxg; + int rc; + + /* + * The default VSI is created by the controller. This function + * creates new, non-defualt VSIs only. 
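/*
 * Illustrative sketch (editor's addition, not part of the diff): the
 * queue-map arithmetic used by i40e_set_shared_vsi_props() above, in
 * standalone user-space form.  ddi_fls() is modeled here as the
 * 1-indexed position of the highest set bit, returning 0 for an input
 * of 0; the per-VSI queue counts are examples only.
 */
#include <stdio.h>

static unsigned int
bit_fls(unsigned int x)
{
	unsigned int pos = 0;

	while (x != 0) {
		pos++;
		x >>= 1;
	}
	return (pos);
}

int
main(void)
{
	unsigned int qps[] = { 1, 7, 8, 9, 17, 64 };
	unsigned int n = sizeof (qps) / sizeof (qps[0]);

	/* Reproduce the tc_queues examples from the comment above. */
	for (unsigned int i = 0; i < n; i++) {
		unsigned int tc_queues = bit_fls(qps[i] - 1);

		printf("qps/VSI %2u -> tc_queues %u (TC size %u)\n",
		    qps[i], tc_queues, 1u << tc_queues);
	}

	/* The QP base is relative to the PF queue space. */
	for (unsigned int vsi_idx = 0; vsi_idx < 4; vsi_idx++)
		printf("VSI %u -> QP base %u (assuming 8 qps per VSI)\n",
		    vsi_idx, vsi_idx * 8);

	return (0);
}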
+ */ + ASSERT3U(idx, !=, 0); + + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.uplink_seid = i40e->i40e_veb_seid; + ctx.pf_num = hw->pf_id; + ctx.flags = I40E_AQ_VSI_TYPE_PF; + ctx.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + i40e_set_shared_vsi_props(i40e, &ctx.info, idx); + + rc = i40e_aq_add_vsi(hw, &ctx, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_vsi() failed %d: %d", rc, + hw->aq.asq_last_status); return (B_FALSE); } + rxg = &i40e->i40e_rx_groups[idx]; + rxg->irg_vsi_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_number = ctx.vsi_number; + i40e->i40e_vsis[idx].iv_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + + if (i40e_stat_vsi_init(i40e, idx) == B_FALSE) + return (B_FALSE); return (B_TRUE); } /* - * Configure the RSS key. For the X710 controller family, this is set on a - * per-PF basis via registers. For the X722, this is done on a per-VSI basis - * through the admin queue. + * Configure the hardware for the Default Virtual Station Interface (VSI). */ static boolean_t -i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +i40e_config_def_vsi(i40e_t *i40e, i40e_hw_t *hw) { - uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + struct i40e_vsi_context ctx; + i40e_rx_group_t *def_rxg; + int err; + struct i40e_aqc_remove_macvlan_element_data filt; - (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.seid = I40E_DEF_VSI_SEID(i40e); + ctx.pf_num = hw->pf_id; + err = i40e_aq_get_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "get VSI params failed with %d", err); + return (B_FALSE); + } - if (i40e_is_x722(i40e)) { + ctx.info.valid_sections = 0; + i40e->i40e_vsis[0].iv_number = ctx.vsi_number; + i40e->i40e_vsis[0].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + if (i40e_stat_vsi_init(i40e, 0) == B_FALSE) + return (B_FALSE); + + i40e_set_shared_vsi_props(i40e, &ctx.info, I40E_DEF_VSI_IDX); + + err = i40e_aq_update_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "Update VSI params failed with %d", err); + return (B_FALSE); + } + + def_rxg = &i40e->i40e_rx_groups[0]; + def_rxg->irg_vsi_seid = I40E_DEF_VSI_SEID(i40e); + + /* + * The controller places an implicit L2 filter for the primary + * MAC pointing to the default VSI. We remove this filter to + * prevent duplicate delivery of packets destined for the + * primary MAC address as DLS will create the same filter on a + * non-default VSI for the primary MAC client. + */ + bzero(&filt, sizeof (filt)); + bcopy(hw->mac.port_addr, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH; + filt.vlan_tag = 0; + + + ASSERT3U(i40e->i40e_resources.ifr_nmacfilt_used, <=, 1); + + err = i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, + NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "Failed to remove primary MAC from default VSI" + ": %d (%d)", err, hw->aq.asq_last_status); + return (B_FALSE); + } + + /* + * As mentioned above, the controller created an implicit L2 + * filter for the primary MAC. We want to remove both the + * filter and decrement the filter count. However, not all + * controllers count this implicit filter against the total + * MAC filter count. So here we are making sure it is either + * one or zero. If it is one, then we know it is for the + * implicit filter and we should decrement since we just + * removed the filter above. 
If it is zero then we know the + * controller that does not count the implicit filter, and it + * was enough to just remove it; we leave the count alone. + * But if it is neither, then we have never seen a controller + * like this before and we should fail to attach. + * + * It is unfortunate that this code must exist but the + * behavior of this implicit L2 filter and its corresponding + * count were dicovered through empirical testing. The + * programming manuals hint at this filter but do not + * explicitly call out the exact behavior. + */ + if (i40e->i40e_resources.ifr_nmacfilt_used == 1) { + i40e->i40e_resources.ifr_nmacfilt_used--; + } else { + if (i40e->i40e_resources.ifr_nmacfilt_used != 0) { + i40e_error(i40e, "Unexpected MAC filter count: %u" + " (expected 0)", + i40e->i40e_resources.ifr_nmacfilt_used); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static boolean_t +i40e_config_rss_key_x722(i40e_t *i40e, i40e_hw_t *hw) +{ + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; struct i40e_aqc_get_set_rss_key_data key; - const char *u8seed = (char *)seed; + const char *u8seed; enum i40e_status_code status; + uint16_t vsi_number = i40e->i40e_vsis[i].iv_number; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + u8seed = (char *)seed; CTASSERT(sizeof (key) >= (sizeof (key.standard_rss_key) + sizeof (key.extended_hash_key))); @@ -2015,14 +2259,35 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) bcopy(&u8seed[sizeof (key.standard_rss_key)], key.extended_hash_key, sizeof (key.extended_hash_key)); - status = i40e_aq_set_rss_key(hw, i40e->i40e_vsi_num, &key); + ASSERT3U(vsi_number, !=, 0); + status = i40e_aq_set_rss_key(hw, vsi_number, &key); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set rss key: %d", status); + i40e_error(i40e, "failed to set RSS key for VSI %u: %d", + vsi_number, status); return (B_FALSE); } + } + + return (B_TRUE); +} + +/* + * Configure the RSS key. For the X710 controller family, this is set on a + * per-PF basis via registers. For the X722, this is done on a per-VSI basis + * through the admin queue. + */ +static boolean_t +i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +{ + if (i40e_is_x722(i40e)) { + if (!i40e_config_rss_key_x722(i40e, hw)) + return (B_FALSE); } else { - uint_t i; - for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + for (uint_t i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) i40e_write_rx_ctl(hw, I40E_PFQF_HKEY(i), seed[i]); } @@ -2034,11 +2299,12 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) * family, with the X722 using a known 7-bit width. On the X710 controller, this * is programmed through its control registers where as on the X722 this is * configured through the admin queue. Also of note, the X722 allows the LUT to - * be set on a per-PF or VSI basis. At this time, as we only have a single VSI, - * we use the PF setting as it is the primary VSI. + * be set on a per-PF or VSI basis. At this time we use the PF setting. If we + * decide to use the per-VSI LUT in the future, then we will need to modify the + * i40e_add_vsi() function to set the RSS LUT bits in the queueing section. * * We populate the LUT in a round robin fashion with the rx queue indices from 0 - * to i40e_num_trqpairs - 1. + * to i40e_num_trqpairs_per_vsi - 1. 
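/*
 * Illustrative sketch (editor's addition, not part of the diff): the
 * round-robin LUT fill described above, assuming a 512-entry table,
 * 8 rx queues per VSI, and a 7-bit entry width purely for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define	EXAMPLE_HLUT_SIZE	512
#define	EXAMPLE_QPS_PER_VSI	8

int
main(void)
{
	uint8_t hlut[EXAMPLE_HLUT_SIZE];
	uint8_t lut_mask = (1 << 7) - 1;	/* example 7-bit entry width */

	/* Each entry is an rx queue index, assigned round robin. */
	for (unsigned int i = 0; i < EXAMPLE_HLUT_SIZE; i++)
		hlut[i] = (i % EXAMPLE_QPS_PER_VSI) & lut_mask;

	/* The pattern simply repeats 0..7 across the table. */
	for (unsigned int i = 0; i < 16; i++)
		printf("hlut[%u] = %u\n", i, hlut[i]);

	return (0);
}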
*/ static boolean_t i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) @@ -2068,15 +2334,20 @@ i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) lut_mask = (1 << hw->func_caps.rss_table_entry_width) - 1; } - for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) - ((uint8_t *)hlut)[i] = (i % i40e->i40e_num_trqpairs) & lut_mask; + for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) { + ((uint8_t *)hlut)[i] = + (i % i40e->i40e_num_trqpairs_per_vsi) & lut_mask; + } if (i40e_is_x722(i40e)) { enum i40e_status_code status; - status = i40e_aq_set_rss_lut(hw, i40e->i40e_vsi_num, B_TRUE, - (uint8_t *)hlut, I40E_HLUT_TABLE_SIZE); + + status = i40e_aq_set_rss_lut(hw, 0, B_TRUE, (uint8_t *)hlut, + I40E_HLUT_TABLE_SIZE); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set RSS LUT: %d", status); + i40e_error(i40e, "failed to set RSS LUT %d: %d", + status, hw->aq.asq_last_status); goto out; } } else { @@ -2188,8 +2459,34 @@ i40e_chip_start(i40e_t *i40e) i40e_intr_chip_init(i40e); - if (!i40e_config_vsi(i40e, hw)) + rc = i40e_get_mac_seid(i40e); + if (rc == -1) { + i40e_error(i40e, "failed to obtain MAC Uplink SEID"); return (B_FALSE); + } + i40e->i40e_mac_seid = (uint16_t)rc; + + /* + * Create a VEB in order to support multiple VSIs. Each VSI + * functions as a MAC group. This call sets the PF's MAC as + * the uplink port and the PF's default VSI as the default + * downlink port. + */ + rc = i40e_aq_add_veb(hw, i40e->i40e_mac_seid, I40E_DEF_VSI_SEID(i40e), + 0x1, B_TRUE, &i40e->i40e_veb_seid, B_FALSE, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_veb() failed %d: %d", rc, + hw->aq.asq_last_status); + return (B_FALSE); + } + + if (!i40e_config_def_vsi(i40e, hw)) + return (B_FALSE); + + for (uint_t i = 1; i < i40e->i40e_num_rx_groups; i++) { + if (!i40e_add_vsi(i40e, hw, i)) + return (B_FALSE); + } if (!i40e_config_rss(i40e, hw)) return (B_FALSE); @@ -2549,7 +2846,7 @@ i40e_setup_tx_hmc(i40e_trqpair_t *itrq) * assigned to traffic class zero, because we don't actually use them. */ bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; + context.seid = I40E_DEF_VSI_SEID(i40e); context.pf_num = hw->pf_id; err = i40e_aq_get_vsi_params(hw, &context, NULL); if (err != I40E_SUCCESS) { @@ -2653,7 +2950,8 @@ i40e_setup_tx_rings(i40e_t *i40e) void i40e_stop(i40e_t *i40e, boolean_t free_allocations) { - int i; + uint_t i; + i40e_hw_t *hw = &i40e->i40e_hw_space; ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); @@ -2689,6 +2987,27 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) delay(50 * drv_usectohz(1000)); + /* + * We don't delete the default VSI because it replaces the VEB + * after VEB deletion (see the "Delete Element" section). + * Furthermore, since the default VSI is provided by the + * firmware, we never attempt to delete it. 
+ */ + for (i = 1; i < i40e->i40e_num_rx_groups; i++) { + i40e_delete_vsi(i40e, i); + } + + if (i40e->i40e_veb_seid != 0) { + int rc = i40e_aq_delete_element(hw, i40e->i40e_veb_seid, NULL); + + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VEB %d: %d", rc, + hw->aq.asq_last_status); + } + + i40e->i40e_veb_seid = 0; + } + i40e_intr_chip_fini(i40e); for (i = 0; i < i40e->i40e_num_trqpairs; i++) { @@ -2718,7 +3037,9 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) mutex_exit(&i40e->i40e_trqpairs[i].itrq_tx_lock); } - i40e_stat_vsi_fini(i40e); + for (i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_stat_vsi_fini(i40e, i); + } i40e->i40e_link_speed = 0; i40e->i40e_link_duplex = 0; @@ -2783,7 +3104,8 @@ i40e_start(i40e_t *i40e, boolean_t alloc) * Enable broadcast traffic; however, do not enable multicast traffic. * That's handle exclusively through MAC's mc_multicst routines. */ - err = i40e_aq_set_vsi_broadcast(hw, i40e->i40e_vsi_id, B_TRUE, NULL); + err = i40e_aq_set_vsi_broadcast(hw, I40E_DEF_VSI_SEID(i40e), B_TRUE, + NULL); if (err != I40E_SUCCESS) { i40e_error(i40e, "failed to set default VSI: %d", err); rc = B_FALSE; diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c index 7a4f0faedd..e40c9f2c53 100644 --- a/usr/src/uts/common/io/i40e/i40e_stats.c +++ b/usr/src/uts/common/io/i40e/i40e_stats.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include "i40e_sw.h" @@ -69,12 +69,7 @@ * --------------------- * * The hardware keeps statistics at each physical function/MAC (PF) and it keeps - * statistics on each virtual station interface (VSI). Currently we only use one - * VSI per PF (see the i40e_main.c theory statement). The hardware has a limited - * number of statistics units available. While every PF is guaranteed to have a - * statistics unit, it is possible that we will run out for a given VSI. We'll - * have to figure out an appropriate strategy here when we end up supporting - * multiple VSIs. + * statistics on each virtual station interface (VSI). * * The hardware keeps these statistics as 32-bit and 48-bit counters. We are * required to read them and then compute the differences between them. The @@ -100,10 +95,10 @@ * data. * * The pf kstats data is stored in the i40e_t`i40e_pf_kstat. It is backed by the - * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstat is in - * i40e_t`i40e_vsi_kstat and the data is backed in the i40e_t`i40e_vsi_stat. All - * of this data is protected by the i40e_stat_lock, which should be taken last, - * when acquiring locks. + * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstats are in + * i40e_t`i40e_vsis[idx].iv_kstats and the data is backed in the + * i40e_t`i40e_vsis[idx].iv_stats. All of this data is protected by the + * i40e_stat_lock, which should be taken last, when acquiring locks. 
*/ static void @@ -169,15 +164,15 @@ i40e_stat_get_uint32(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat, } static void -i40e_stat_vsi_update(i40e_t *i40e, boolean_t init) +i40e_stat_vsi_update(i40e_t *i40e, uint_t idx, boolean_t init) { i40e_vsi_stats_t *ivs; i40e_vsi_kstats_t *ivk; - int id = i40e->i40e_vsi_stat_id; + uint16_t id = i40e->i40e_vsis[idx].iv_stats_id; - ASSERT(i40e->i40e_vsi_kstat != NULL); - ivs = &i40e->i40e_vsi_stat; - ivk = i40e->i40e_vsi_kstat->ks_data; + ASSERT3P(i40e->i40e_vsis[idx].iv_kstats, !=, NULL); + ivs = &i40e->i40e_vsis[idx].iv_stats; + ivk = i40e->i40e_vsis[idx].iv_kstats->ks_data; mutex_enter(&i40e->i40e_stat_lock); @@ -231,39 +226,41 @@ i40e_stat_vsi_kstat_update(kstat_t *ksp, int rw) return (EACCES); i40e = ksp->ks_private; - i40e_stat_vsi_update(i40e, B_FALSE); + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) + i40e_stat_vsi_update(i40e, i, B_FALSE); + return (0); } void -i40e_stat_vsi_fini(i40e_t *i40e) +i40e_stat_vsi_fini(i40e_t *i40e, uint_t idx) { - if (i40e->i40e_vsi_kstat != NULL) { - kstat_delete(i40e->i40e_vsi_kstat); - i40e->i40e_vsi_kstat = NULL; + if (i40e->i40e_vsis[idx].iv_kstats != NULL) { + kstat_delete(i40e->i40e_vsis[idx].iv_kstats); + i40e->i40e_vsis[idx].iv_kstats = NULL; } } boolean_t -i40e_stat_vsi_init(i40e_t *i40e) +i40e_stat_vsi_init(i40e_t *i40e, uint_t idx) { kstat_t *ksp; i40e_vsi_kstats_t *ivk; char buf[64]; + uint16_t vsi_id = i40e->i40e_vsis[idx].iv_seid; - (void) snprintf(buf, sizeof (buf), "vsi_%d", i40e->i40e_vsi_id); + (void) snprintf(buf, sizeof (buf), "vsi_%u", vsi_id); ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip), buf, "net", KSTAT_TYPE_NAMED, sizeof (i40e_vsi_kstats_t) / sizeof (kstat_named_t), 0); if (ksp == NULL) { - i40e_error(i40e, "Failed to create kstats for VSI %d", - i40e->i40e_vsi_id); + i40e_error(i40e, "Failed to create kstats for VSI %u", vsi_id); return (B_FALSE); } - i40e->i40e_vsi_kstat = ksp; + i40e->i40e_vsis[idx].iv_kstats = ksp; ivk = ksp->ks_data; ksp->ks_update = i40e_stat_vsi_kstat_update; ksp->ks_private = i40e; @@ -291,9 +288,9 @@ i40e_stat_vsi_init(i40e_t *i40e) kstat_named_init(&ivk->ivk_tx_errors, "tx_errors", KSTAT_DATA_UINT64); - bzero(&i40e->i40e_vsi_stat, sizeof (i40e_vsi_stats_t)); - i40e_stat_vsi_update(i40e, B_TRUE); - kstat_install(i40e->i40e_vsi_kstat); + bzero(&i40e->i40e_vsis[idx].iv_stats, sizeof (i40e_vsi_stats_t)); + i40e_stat_vsi_update(i40e, idx, B_TRUE); + kstat_install(i40e->i40e_vsis[idx].iv_kstats); return (B_TRUE); } @@ -670,7 +667,12 @@ i40e_stat_pf_init(i40e_t *i40e) void i40e_stats_fini(i40e_t *i40e) { - ASSERT(i40e->i40e_vsi_kstat == NULL); +#ifdef DEBUG + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + ASSERT3P(i40e->i40e_vsis[i].iv_kstats, ==, NULL); + } +#endif + if (i40e->i40e_pf_kstat != NULL) { kstat_delete(i40e->i40e_pf_kstat); i40e->i40e_pf_kstat = NULL; @@ -1230,6 +1232,12 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq) kstat_named_init(&tsp->itxs_recycled, "tx_recycled", KSTAT_DATA_UINT64); tsp->itxs_recycled.value.ui64 = 0; + kstat_named_init(&tsp->itxs_force_copy, "tx_force_copy", + KSTAT_DATA_UINT64); + tsp->itxs_force_copy.value.ui64 = 0; + kstat_named_init(&tsp->itxs_tso_force_copy, "tx_tso_force_copy", + KSTAT_DATA_UINT64); + tsp->itxs_tso_force_copy.value.ui64 = 0; kstat_named_init(&tsp->itxs_hck_meoifail, "tx_hck_meoifail", KSTAT_DATA_UINT64); @@ -1249,6 +1257,15 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq) kstat_named_init(&tsp->itxs_hck_badl4, "tx_hck_badl4", KSTAT_DATA_UINT64); 
tsp->itxs_hck_badl4.value.ui64 = 0; + kstat_named_init(&tsp->itxs_lso_nohck, "tx_lso_nohck", + KSTAT_DATA_UINT64); + tsp->itxs_lso_nohck.value.ui64 = 0; + kstat_named_init(&tsp->itxs_bind_fails, "tx_bind_fails", + KSTAT_DATA_UINT64); + tsp->itxs_bind_fails.value.ui64 = 0; + kstat_named_init(&tsp->itxs_tx_short, "tx_short", + KSTAT_DATA_UINT64); + tsp->itxs_tx_short.value.ui64 = 0; kstat_named_init(&tsp->itxs_err_notcb, "tx_err_notcb", KSTAT_DATA_UINT64); tsp->itxs_err_notcb.value.ui64 = 0; diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h index 78aced0144..e7b64c2160 100644 --- a/usr/src/uts/common/io/i40e/i40e_sw.h +++ b/usr/src/uts/common/io/i40e/i40e_sw.h @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -152,9 +152,10 @@ typedef enum i40e_itr_index { } i40e_itr_index_t; /* - * Table 1-5 of the PRM notes that LSO supports up to 256 KB. + * The hardware claims to support LSO up to 256 KB, but due to the limitations + * imposed by the IP header for non-jumbo frames, we cap it at 64 KB. */ -#define I40E_LSO_MAXLEN (256 * 1024) +#define I40E_LSO_MAXLEN (64 * 1024) #define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */ #define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */ @@ -173,13 +174,22 @@ typedef enum i40e_itr_index { #define I40E_BUF_IPHDR_ALIGNMENT 2 /* - * The XL710 controller has a limit of eight buffers being allowed to be used - * for the transmission of a single frame. This is defined in 8.4.1 - Transmit + * The XL710 controller has a total of eight buffers available for the + * transmission of any single frame. This is defined in 8.4.1 - Transmit * Packet in System Memory. */ #define I40E_TX_MAX_COOKIE 8 /* + * An LSO frame can be as large as 64KB, so we allow a DMA bind to span more + * cookies than a non-LSO frame. The key here to is to select a value such + * that once the HW has chunked up the LSO frame into MSS-sized segments that no + * single segment spans more than 8 cookies (see comments for + * I40E_TX_MAX_COOKIE) + */ +#define I40E_TX_LSO_MAX_COOKIE 32 + +/* * Sizing to determine the amount of available descriptors at which we'll * consider ourselves blocked. Also, when we have these available, we'll then * consider ourselves available to transmit to MAC again. Strictly speaking, the @@ -203,6 +213,12 @@ typedef enum i40e_itr_index { #define I40E_MAX_TX_DMA_THRESH INT32_MAX /* + * The max size of each individual tx buffer is 16KB - 1. + * See table 8-17 + */ +#define I40E_MAX_TX_BUFSZ 0x0000000000003FFFull + +/* * Resource sizing counts. There are various aspects of hardware where we may * have some variable number of elements that we need to handle. Such as the * hardware capabilities and switch capacities. We cannot know a priori how many @@ -240,21 +256,6 @@ typedef enum i40e_itr_index { #define I40E_HMC_TX_TPH_DISABLE 0 /* - * Whenever we establish and create a VSI, we need to assign some number of - * queues that it's allowed to access from the PF. Because we only have a single - * VSI per PF at this time, we assign it all the queues. - * - * Many of the devices support what's called Data-center Bridging. Which is a - * feature that we don't have much use of at this time. However, we still need - * to fill in this information. We follow the guidance of the note in Table 7-80 - * which talks about bytes 62-77. 
It says that if we don't want to assign - * anything to traffic classes, we should set the field to zero. Effectively - * this means that everything in the system is assigned to traffic class zero. - */ -#define I40E_ASSIGN_ALL_QUEUES 0 -#define I40E_TRAFFIC_CLASS_NO_QUEUES 0 - -/* * This defines the error mask that we care about from rx descriptors. Currently * we're only concerned with the general errors and oversize errors. */ @@ -268,12 +269,12 @@ typedef enum i40e_itr_index { #define I40E_DDI_PROP_LEN 64 /* - * We currently consolidate some overrides that we use in the code here. These - * will be gone in the fullness of time, but as we're bringing up the device, - * this is what we use. + * Place an artificial limit on the max number of groups. The X710 + * series supports up to 384 VSIs to be partitioned across PFs as the + * driver sees fit. But until we support more interrupts this seems + * like a good place to start. */ -#define I40E_GROUP_MAX 1 -#define I40E_TRQPAIR_MAX 1 +#define I40E_GROUP_MAX 32 #define I40E_GROUP_NOMSIX 1 #define I40E_TRQPAIR_NOMSIX 1 @@ -405,18 +406,29 @@ typedef struct i40e_rx_control_block { typedef enum { I40E_TX_NONE, I40E_TX_COPY, - I40E_TX_DMA + I40E_TX_DMA, + I40E_TX_DESC, } i40e_tx_type_t; typedef struct i40e_tx_desc i40e_tx_desc_t; +typedef struct i40e_tx_context_desc i40e_tx_context_desc_t; typedef union i40e_32byte_rx_desc i40e_rx_desc_t; +struct i40e_dma_bind_info { + caddr_t dbi_paddr; + size_t dbi_len; +}; + typedef struct i40e_tx_control_block { struct i40e_tx_control_block *tcb_next; mblk_t *tcb_mp; i40e_tx_type_t tcb_type; ddi_dma_handle_t tcb_dma_handle; + ddi_dma_handle_t tcb_lso_dma_handle; i40e_dma_buffer_t tcb_dma; + struct i40e_dma_bind_info *tcb_bind_info; + uint_t tcb_bind_ncookies; + boolean_t tcb_used_lso; } i40e_tx_control_block_t; /* @@ -517,6 +529,8 @@ typedef struct i40e_txq_stat { kstat_named_t itxs_packets; /* Packets out on queue */ kstat_named_t itxs_descriptors; /* Descriptors issued */ kstat_named_t itxs_recycled; /* Descriptors reclaimed */ + kstat_named_t itxs_force_copy; /* non-TSO force copy */ + kstat_named_t itxs_tso_force_copy; /* TSO force copy */ /* * Various failure conditions. */ @@ -526,6 +540,9 @@ typedef struct i40e_txq_stat { kstat_named_t itxs_hck_nol4info; /* Missing l4 info */ kstat_named_t itxs_hck_badl3; /* Not IPv4/IPv6 */ kstat_named_t itxs_hck_badl4; /* Bad L4 Paylaod */ + kstat_named_t itxs_lso_nohck; /* Missing offloads for LSO */ + kstat_named_t itxs_bind_fails; /* DMA bind failures */ + kstat_named_t itxs_tx_short; /* Tx chain too short */ kstat_named_t itxs_err_notcb; /* No tcb's available */ kstat_named_t itxs_err_nodescs; /* No tcb's available */ @@ -761,6 +778,25 @@ typedef struct i40e_func_rsrc { uint_t ifr_nmcastfilt_used; } i40e_func_rsrc_t; +typedef struct i40e_vsi { + uint16_t iv_seid; + uint16_t iv_number; + kstat_t *iv_kstats; + i40e_vsi_stats_t iv_stats; + uint16_t iv_stats_id; +} i40e_vsi_t; + +/* + * While irg_index and irg_grp_hdl aren't used anywhere, they are + * still useful for debugging. + */ +typedef struct i40e_rx_group { + uint32_t irg_index; /* index in i40e_rx_groups[] */ + uint16_t irg_vsi_seid; /* SEID of VSI for this group */ + mac_group_handle_t irg_grp_hdl; /* handle to mac_group_t */ + struct i40e *irg_i40e; /* ref to i40e_t */ +} i40e_rx_group_t; + /* * Main i40e per-instance state. 
*/ @@ -789,11 +825,18 @@ typedef struct i40e { struct i40e_aq_get_phy_abilities_resp i40e_phy; void *i40e_aqbuf; +#define I40E_DEF_VSI_IDX 0 +#define I40E_DEF_VSI(i40e) ((i40e)->i40e_vsis[I40E_DEF_VSI_IDX]) +#define I40E_DEF_VSI_SEID(i40e) (I40E_DEF_VSI(i40e).iv_seid) + /* * Device state, switch information, and resources. */ - int i40e_vsi_id; - uint16_t i40e_vsi_num; + i40e_vsi_t i40e_vsis[I40E_GROUP_MAX]; + uint16_t i40e_mac_seid; /* SEID of physical MAC */ + uint16_t i40e_veb_seid; /* switch atop MAC (SEID) */ + uint16_t i40e_vsi_avail; /* VSIs avail to this PF */ + uint16_t i40e_vsi_used; /* VSIs used by this PF */ struct i40e_device *i40e_device; i40e_func_rsrc_t i40e_resources; uint16_t i40e_switch_rsrc_alloc; @@ -814,12 +857,13 @@ typedef struct i40e { */ i40e_trqpair_t *i40e_trqpairs; boolean_t i40e_mr_enable; - int i40e_num_trqpairs; + uint_t i40e_num_trqpairs; /* total TRQPs (per PF) */ + uint_t i40e_num_trqpairs_per_vsi; /* TRQPs per VSI */ uint_t i40e_other_itr; - int i40e_num_rx_groups; + i40e_rx_group_t *i40e_rx_groups; + uint_t i40e_num_rx_groups; int i40e_num_rx_descs; - mac_group_handle_t i40e_rx_group_handle; uint32_t i40e_rx_ring_size; uint32_t i40e_rx_buf_size; boolean_t i40e_rx_hcksum_enable; @@ -832,6 +876,7 @@ typedef struct i40e { uint32_t i40e_tx_buf_size; uint32_t i40e_tx_block_thresh; boolean_t i40e_tx_hcksum_enable; + boolean_t i40e_tx_lso_enable; uint32_t i40e_tx_dma_min; uint_t i40e_tx_itr; @@ -855,6 +900,7 @@ typedef struct i40e { */ ddi_dma_attr_t i40e_static_dma_attr; ddi_dma_attr_t i40e_txbind_dma_attr; + ddi_dma_attr_t i40e_txbind_lso_dma_attr; ddi_device_acc_attr_t i40e_desc_acc_attr; ddi_device_acc_attr_t i40e_buf_acc_attr; @@ -872,10 +918,7 @@ typedef struct i40e { */ kmutex_t i40e_stat_lock; kstat_t *i40e_pf_kstat; - kstat_t *i40e_vsi_kstat; i40e_pf_stats_t i40e_pf_stat; - i40e_vsi_stats_t i40e_vsi_stat; - uint16_t i40e_vsi_stat_id; /* * Misc. stats and counters that should maybe one day be kstats. @@ -975,8 +1018,8 @@ extern void i40e_tx_cleanup_ring(i40e_trqpair_t *); */ extern boolean_t i40e_stats_init(i40e_t *); extern void i40e_stats_fini(i40e_t *); -extern boolean_t i40e_stat_vsi_init(i40e_t *); -extern void i40e_stat_vsi_fini(i40e_t *); +extern boolean_t i40e_stat_vsi_init(i40e_t *, uint_t); +extern void i40e_stat_vsi_fini(i40e_t *, uint_t); extern boolean_t i40e_stats_trqpair_init(i40e_trqpair_t *); extern void i40e_stats_trqpair_fini(i40e_trqpair_t *); extern int i40e_m_stat(void *, uint_t, uint64_t *); diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c index 57620f03fa..caafa3e102 100644 --- a/usr/src/uts/common/io/i40e/i40e_transceiver.c +++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include "i40e_sw.h" @@ -60,19 +60,19 @@ * This size is then rounded up to the nearest 1k chunk, which represents the * actual amount of memory that we'll allocate for a single frame. * - * Note, that for rx, we do something that might be unexpected. We always add + * Note, that for RX, we do something that might be unexpected. We always add * an extra two bytes to the frame size that we allocate. We then offset the DMA * address that we receive a packet into by two bytes. 
This ensures that the IP * header will always be 4 byte aligned because the MAC header is either 14 or * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's * and MAC's lives easier. * - * Both the rx and tx descriptor rings (which are what we use to communicate + * Both the RX and TX descriptor rings (which are what we use to communicate * with hardware) are allocated as a single region of DMA memory which is the * size of the descriptor (4 bytes and 2 bytes respectively) times the total - * number of descriptors for an rx and tx ring. + * number of descriptors for an RX and TX ring. * - * While the rx and tx descriptors are allocated using DMA-based memory, the + * While the RX and TX descriptors are allocated using DMA-based memory, the * control blocks for each of them are allocated using normal kernel memory. * They aren't special from a DMA perspective. We'll go over the design of both * receiving and transmitting separately, as they have slightly different @@ -113,16 +113,16 @@ * * To try and ensure that the device always has blocks that it can receive data * into, we maintain two lists of control blocks, a working list and a free - * list. Each list is sized equal to the number of descriptors in the rx ring. - * During the GLDv3 mc_start routine, we allocate a number of rx control blocks + * list. Each list is sized equal to the number of descriptors in the RX ring. + * During the GLDv3 mc_start routine, we allocate a number of RX control blocks * equal to twice the number of descriptors in the ring and we assign them * equally to the free list and to the working list. Each control block also has * DMA memory allocated and associated with which it will be used to receive the * actual packet data. All of a received frame's data will end up in a single * DMA buffer. * - * During operation, we always maintain the invariant that each rx descriptor - * has an associated rx control block which lives in the working list. If we + * During operation, we always maintain the invariant that each RX descriptor + * has an associated RX control block which lives in the working list. If we * feel that we should loan up DMA memory to MAC in the form of a message block, * we can only do so if we can maintain this invariant. To do that, we swap in * one of the buffers from the free list. If none are available, then we resort @@ -130,14 +130,14 @@ * size. * * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is - * called on the block, at which point we restore the rx control block to the + * called on the block, at which point we restore the RX control block to the * free list and are able to reuse the DMA memory again. While the scheme may * seem odd, it importantly keeps us out of trying to do any DMA allocations in * the normal path of operation, even though we may still have to allocate * message blocks and copy. * - * The following state machine describes the life time of a rx control block. In - * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx + * The following state machine describes the life time of a RX control block. In + * the diagram we abbrviate the RX ring descriptor entry as rxd and the rx * control block entry as rcb. * * | | @@ -160,11 +160,11 @@ * +--------------------<-----| rcb loaned to MAC | * +-------------------+ * - * Finally, note that every rx control block has a reference count on it. One + * Finally, note that every RX control block has a reference count on it. 
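[Editor's aside] The working-list invariant and the loan bookkeeping in the state machine above reduce to a small swap-or-copy decision: a buffer is only handed up if a spare control block can take its place on the descriptor. The following is a simplified stand-alone model with invented type and field names, not the driver's receive path, just to make the invariant concrete.

#include <stddef.h>

typedef struct rcb {
	struct rcb	*rcb_next;	/* free-list linkage */
	void		*rcb_buf;	/* DMA buffer backing this rcb */
	unsigned	rcb_ref;	/* dropped by the free callback */
} rcb_t;

typedef struct rx_ring {
	rcb_t	**rr_work;	/* one rcb per descriptor (the invariant) */
	rcb_t	*rr_free;	/* spares available for loaning */
} rx_ring_t;

/*
 * Loan the filled rcb up only if a spare can be swapped into the work
 * list; otherwise return NULL so the caller copies the frame and the
 * descriptor keeps its current rcb.
 */
static rcb_t *
rx_loan(rx_ring_t *rr, unsigned idx)
{
	rcb_t *spare = rr->rr_free;
	rcb_t *loaned;

	if (spare == NULL)
		return (NULL);

	rr->rr_free = spare->rcb_next;
	spare->rcb_next = NULL;

	loaned = rr->rr_work[idx];
	rr->rr_work[idx] = spare;	/* invariant preserved */
	loaned->rcb_ref++;		/* released by the free callback */
	return (loaned);		/* wrapped for MAC, e.g. desballoc(9F) */
}

int
main(void)
{
	rcb_t spare = { 0 }, filled = { 0 };
	rcb_t *work[1] = { &filled };
	rx_ring_t rr = { work, &spare };

	return (rx_loan(&rr, 0) == &filled ? 0 : 1);
}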
One * reference is added as long as the driver has had the GLDv3 mc_start endpoint * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and * no other DLPI consumers remain, then we'll decrement the reference count by - * one. Whenever we loan up the rx control block and associated buffer to MAC, + * one. Whenever we loan up the RX control block and associated buffer to MAC, * then we bump the reference count again. Even though the device is stopped, * there may still be loaned frames in upper levels that we'll want to account * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure @@ -192,10 +192,10 @@ * state tracking. Effectively, we cache the HEAD register and then update it * ourselves based on our work. * - * When we iterate over the rx descriptors and thus the received frames, we are + * When we iterate over the RX descriptors and thus the received frames, we are * either in an interrupt context or we've been asked by MAC to poll on the * ring. If we've been asked to poll on the ring, we have a maximum number of - * bytes of mblk_t's to return. If processing an rx descriptor would cause us to + * bytes of mblk_t's to return. If processing an RX descriptor would cause us to * exceed that count, then we do not process it. When in interrupt context, we * don't have a strict byte count. However, to ensure liveness, we limit the * amount of data based on a configuration value @@ -249,31 +249,54 @@ * differently due to the fact that all data is originated by the operating * system and not by the device. * - * Like rx, there is both a descriptor ring that we use to communicate to the - * driver and which points to the memory used to transmit a frame. Similarly, - * there is a corresponding transmit control block. Each transmit control block - * has a region of DMA memory allocated to it; however, the way we use it - * varies. + * Like RX, there is both a descriptor ring that we use to communicate to the + * driver and which points to the memory used to transmit a frame. Similarly, + * there is a corresponding transmit control block, however, the correspondence + * between descriptors and control blocks is more complex and not necessarily + * 1-to-1. * * The driver is asked to process a single frame at a time. That message block * may be made up of multiple fragments linked together by the mblk_t`b_cont * member. The device has a hard limit of up to 8 buffers being allowed for use - * for a single logical frame. For each fragment, we'll try and use an entry - * from the tx descriptor ring and then we'll allocate a corresponding tx - * control block. Depending on the size of the fragment, we may copy it around - * or we might instead try to do DMA binding of the fragment. - * - * If we exceed the number of blocks that fit, we'll try to pull up the block - * and then we'll do a DMA bind and send it out. - * - * If we don't have enough space in the ring or tx control blocks available, + * for a single non-LSO packet or LSO segment. The number of TX ring entires + * (and thus TX control blocks) used depends on the fragment sizes and DMA + * layout, as explained below. + * + * We alter our DMA strategy based on a threshold tied to the fragment size. + * This threshold is configurable via the tx_dma_threshold property. If the + * fragment is above the threshold, we DMA bind it -- consuming one TCB and + * potentially several data descriptors. 
The exact number of descriptors (equal + * to the number of DMA cookies) depends on page size, MTU size, b_rptr offset + * into page, b_wptr offset into page, and the physical layout of the dblk's + * memory (contiguous or not). Essentially, we are at the mercy of the DMA + * engine and the dblk's memory allocation. Knowing the exact number of + * descriptors up front is a task best not taken on by the driver itself. + * Instead, we attempt to DMA bind the fragment and verify the descriptor + * layout meets hardware constraints. If the proposed DMA bind does not satisfy + * the hardware constaints, then we discard it and instead copy the entire + * fragment into the pre-allocated TCB buffer (or buffers if the fragment is + * larger than the TCB buffer). + * + * If the fragment is below or at the threshold, we copy it to the pre-allocated + * buffer of a TCB. We compress consecutive copy fragments into a single TCB to + * conserve resources. We are guaranteed that the TCB buffer is made up of only + * 1 DMA cookie; and therefore consumes only one descriptor on the controller. + * + * Furthermore, if the frame requires HW offloads such as LSO, tunneling or + * filtering, then the TX data descriptors must be preceeded by a single TX + * context descriptor. Because there is no DMA transfer associated with the + * context descriptor, we allocate a control block with a special type which + * indicates to the TX ring recycle code that there are no associated DMA + * resources to unbind when the control block is free'd. + * + * If we don't have enough space in the ring or TX control blocks available, * then we'll return the unprocessed message block to MAC. This will induce flow * control and once we recycle enough entries, we'll once again enable sending * on the ring. * * We size the working list as equal to the number of descriptors in the ring. * We size the free list as equal to 1.5 times the number of descriptors in the - * ring. We'll allocate a number of tx control block entries equal to the number + * ring. We'll allocate a number of TX control block entries equal to the number * of entries in the free list. By default, all entries are placed in the free * list. As we come along and try to send something, we'll allocate entries from * the free list and add them to the working list, where they'll stay until the @@ -325,7 +348,7 @@ * +------------------+ +------------------+ * | tcb on free list |---*------------------>| tcb on work list | * +------------------+ . +------------------+ - * ^ . tcb allocated | + * ^ . N tcbs allocated[1] | * | to send frame v * | or fragment on | * | wire, mblk from | @@ -335,20 +358,27 @@ * . * . Hardware indicates * entry transmitted. - * tcb recycled, mblk + * tcbs recycled, mblk * from MAC freed. * + * [1] We allocate N tcbs to transmit a single frame where N can be 1 context + * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA + * bind case, N can be 1 context descriptor plus 1 data descriptor per + * b_cont in the mblk. In this case, the mblk is associated with the first + * data descriptor and freed as part of freeing that data descriptor. + * * ------------ * Blocking MAC * ------------ * - * Wen performing transmit, we can run out of descriptors and ring entries. When - * such a case happens, we return the mblk_t to MAC to indicate that we've been - * blocked. At that point in time, MAC becomes blocked and will not transmit - * anything out that specific ring until we notify MAC. 
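[Editor's aside] A rough upper bound on the cookie count for the bind case described above is easy to write down, even though only the DMA bind itself can report the real number. A throwaway illustration, assuming 4K pages and the worst case where no two pages are physically adjacent:

#include <stdio.h>
#include <stddef.h>

#define	PAGESZ	4096UL		/* assumed page size */

/*
 * Worst-case cookies for a fragment of 'len' bytes starting 'off'
 * bytes into a page: one per page touched.
 */
static size_t
max_cookies(size_t off, size_t len)
{
	return (((off % PAGESZ) + len + PAGESZ - 1) / PAGESZ);
}

int
main(void)
{
	/* A 9000-byte jumbo fragment starting mid-page touches 3 pages. */
	printf("%zu\n", max_cookies(100, 9000));
	return (0);
}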
To indicate that we're - * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE. + * When performing transmit, we can run out of descriptors and ring entries. + * When such a case happens, we return the mblk_t to MAC to indicate that we've + * been blocked. At that point in time, MAC becomes blocked and will not + * transmit anything out that specific ring until we notify MAC. To indicate + * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member + * to B_TRUE. * - * When we recycle tx descriptors then we'll end up signaling MAC by calling + * When we recycle TX descriptors then we'll end up signaling MAC by calling * mac_tx_ring_update() if we were blocked, letting it know that it's safe to * start sending frames out to us again. */ @@ -367,13 +397,15 @@ /* * This structure is used to maintain information and flags related to - * transmitting a frame. The first member is the set of flags we need to or into - * the command word (generally checksumming related). The second member controls - * the word offsets which is required for IP and L4 checksumming. + * transmitting a frame. These fields are ultimately used to construct the + * TX data descriptor(s) and, if necessary, the TX context descriptor. */ typedef struct i40e_tx_context { - enum i40e_tx_desc_cmd_bits itc_cmdflags; - uint32_t itc_offsets; + enum i40e_tx_desc_cmd_bits itc_data_cmdflags; + uint32_t itc_data_offsets; + enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags; + uint32_t itc_ctx_tsolen; + uint32_t itc_ctx_mss; } i40e_tx_context_t; /* @@ -395,14 +427,18 @@ i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; * i40e_static_dma_attr, is designed to be used for both the descriptor rings * and the static buffers that we associate with control blocks. For this * reason, we force an SGL length of one. While technically the driver supports - * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our + * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our * management here. In addition, when the Intel common code wants to allocate * memory via the i40e_allocate_virt_mem osdep function, we have it leverage * the static dma attr. * - * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're - * binding a bunch of mblk_t fragments to go out the door. Note that the main - * difference here is that we're allowed a larger SGL length -- eight. + * The latter two sets of attributes, are what we use when we're binding a + * bunch of mblk_t fragments to go out the door. Note that the main difference + * here is that we're allowed a larger SGL length. For non-LSO TX, we + * restrict the SGL length to match the number of TX buffers available to the + * PF (8). For the LSO case we can go much larger, with the caveat that each + * MSS-sized chunk (segment) must not span more than 8 data descriptors and + * hence must not span more than 8 cookies. * * Note, we default to setting ourselves to be DMA capable here. 
However, * because we could have multiple instances which have different FMA error @@ -429,7 +465,7 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DMA_ATTR_V0, /* version number */ 0x0000000000000000ull, /* low address */ 0xFFFFFFFFFFFFFFFFull, /* high address */ - 0x00000000FFFFFFFFull, /* dma counter max */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ I40E_DMA_ALIGNMENT, /* alignment */ 0x00000FFF, /* burst sizes */ 0x00000001, /* minimum transfer size */ @@ -440,6 +476,21 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DDI_DMA_FLAGERR /* DMA flags */ }; +static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = { + DMA_ATTR_V0, /* version number */ + 0x0000000000000000ull, /* low address */ + 0xFFFFFFFFFFFFFFFFull, /* high address */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ + I40E_DMA_ALIGNMENT, /* alignment */ + 0x00000FFF, /* burst sizes */ + 0x00000001, /* minimum transfer size */ + 0x00000000FFFFFFFFull, /* maximum transfer size */ + 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ + I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */ + 0x00000001, /* granularity */ + DDI_DMA_FLAGERR /* DMA flags */ +}; + /* * Next, we have the attributes for these structures. The descriptor rings are * all strictly little endian, while the data buffers are just arrays of bytes @@ -668,7 +719,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_ring_size, KM_NOSLEEP); if (rxd->rxd_work_list == NULL) { - i40e_error(i40e, "failed to allocate rx work list for a ring " + i40e_error(i40e, "failed to allocate RX work list for a ring " "of %d entries for ring %d", rxd->rxd_ring_size, itrq->itrq_index); goto cleanup; @@ -677,7 +728,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_free_list_size, KM_NOSLEEP); if (rxd->rxd_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry rx free list " + i40e_error(i40e, "failed to allocate a %d entry RX free list " "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); goto cleanup; } @@ -765,7 +816,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) i40e_t *i40e = rxd->rxd_i40e; /* - * First allocate the rx descriptor ring. + * First allocate the RX descriptor ring. */ dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; VERIFY(dmasz > 0); @@ -773,7 +824,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate DMA resources " - "for rx descriptor ring"); + "for RX descriptor ring"); return (B_FALSE); } rxd->rxd_desc_ring = @@ -799,7 +850,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) if (i40e_alloc_dma_buffer(i40e, dmap, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate rx dma buffer"); + i40e_error(i40e, "failed to allocate RX dma buffer"); return (B_FALSE); } @@ -841,6 +892,10 @@ i40e_free_tx_dma(i40e_trqpair_t *itrq) ddi_dma_free_handle(&tcb->tcb_dma_handle); tcb->tcb_dma_handle = NULL; } + if (tcb->tcb_lso_dma_handle != NULL) { + ddi_dma_free_handle(&tcb->tcb_lso_dma_handle); + tcb->tcb_lso_dma_handle = NULL; + } } fsz = sizeof (i40e_tx_control_block_t) * @@ -881,7 +936,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) (i40e->i40e_tx_ring_size >> 1); /* - * Allocate an additional tx descriptor for the writeback head. 
+ * Allocate an additional TX descriptor for the writeback head. */ dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; dmasz += sizeof (i40e_tx_desc_t); @@ -890,7 +945,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area, &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate DMA resources for tx " + i40e_error(i40e, "failed to allocate DMA resources for TX " "descriptor ring"); return (B_FALSE); } @@ -905,7 +960,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size * sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); if (itrq->itrq_tcb_work_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx work list " + i40e_error(i40e, "failed to allocate a %d entry TX work list " "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index); goto cleanup; } @@ -913,14 +968,14 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size * sizeof (i40e_tx_control_block_t *), KM_SLEEP); if (itrq->itrq_tcb_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx free list " + i40e_error(i40e, "failed to allocate a %d entry TX free list " "for ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); goto cleanup; } /* - * We allocate enough tx control blocks to cover the free list. + * We allocate enough TX control blocks to cover the free list. */ itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) * itrq->itrq_tx_free_list_size, KM_NOSLEEP); @@ -948,18 +1003,29 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL, &tcb->tcb_dma_handle); if (ret != DDI_SUCCESS) { - i40e_error(i40e, "failed to allocate DMA handle for tx " + i40e_error(i40e, "failed to allocate DMA handle for TX " "data binding on ring %d: %d", itrq->itrq_index, ret); tcb->tcb_dma_handle = NULL; goto cleanup; } + ret = ddi_dma_alloc_handle(i40e->i40e_dip, + &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL, + &tcb->tcb_lso_dma_handle); + if (ret != DDI_SUCCESS) { + i40e_error(i40e, "failed to allocate DMA handle for TX " + "LSO data binding on ring %d: %d", itrq->itrq_index, + ret); + tcb->tcb_lso_dma_handle = NULL; + goto cleanup; + } + if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate %ld bytes of " - "DMA for tx data binding on ring %d", dmasz, + "DMA for TX data binding on ring %d", dmasz, itrq->itrq_index); goto cleanup; } @@ -989,10 +1055,17 @@ i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init) i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata; /* - * Clean up our rx data. We have to free DMA resources first and + * In some cases i40e_alloc_rx_data() may have failed + * and in that case there is no rxd to free. + */ + if (rxd == NULL) + continue; + + /* + * Clean up our RX data. We have to free DMA resources first and * then if we have no more pending RCB's, then we'll go ahead * and clean things up. Note, we can't set the stopped flag on - * the rx data until after we've done the first pass of the + * the RX data until after we've done the first pass of the * pending resources. Otherwise we might race with * i40e_rx_recycle on determining who should free the * i40e_rx_data_t above. 
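[Editor's aside] The extra descriptor allocated above holds the hardware's write-back head, and the recycle path later in this file turns that value into a count of completed descriptors with simple modular distance. A stand-alone sketch of that arithmetic, with illustrative names only:

#include <stdint.h>
#include <assert.h>

/*
 * Descriptors the hardware has finished with, given the driver's
 * cached head and the DMA'd write-back head.
 */
static uint32_t
tx_done(uint32_t head, uint32_t wbhead, uint32_t ring_size)
{
	if (wbhead >= head)
		return (wbhead - head);
	return (ring_size - head + wbhead);
}

int
main(void)
{
	assert(tx_done(10, 14, 1024) == 4);	/* no wrap */
	assert(tx_done(1020, 2, 1024) == 6);	/* wrapped past the end */
	return (0);
}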
@@ -1055,6 +1128,8 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, sizeof (ddi_dma_attr_t)); + bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr, + sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, sizeof (ddi_device_acc_attr_t)); bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, @@ -1063,9 +1138,13 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) if (fma == B_TRUE) { i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |= + DDI_DMA_FLAGERR; } else { i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &= + ~DDI_DMA_FLAGERR; } } @@ -1102,7 +1181,7 @@ i40e_rcb_alloc(i40e_rx_data_t *rxd) /* * This is the callback that we get from the OS when freemsg(9F) has been called * on a loaned descriptor. In addition, if we take the last reference count - * here, then we have to tear down all of the rx data. + * here, then we have to tear down all of the RX data. */ void i40e_rx_recycle(caddr_t arg) @@ -1768,17 +1847,18 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) * to properly program the hardware for checksum offload as well as the * generally required flags. * - * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or - * into the descriptor based on the checksum flags for this mblk_t and the + * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to + * 'or' into the descriptor based on the checksum flags for this mblk_t and the * actual information we care about. + * + * If the mblk requires LSO then we'll also gather the information that will be + * used to construct the Transmit Context Descriptor. */ static int i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, - i40e_tx_context_t *tctx) + mac_ether_offload_info_t *meo, i40e_tx_context_t *tctx) { - int ret; - uint32_t flags, start; - mac_ether_offload_info_t meo; + uint32_t chkflags, start, mss, lsoflags; i40e_txq_stat_t *txs = &itrq->itrq_txstat; bzero(tctx, sizeof (i40e_tx_context_t)); @@ -1786,37 +1866,34 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, if (i40e->i40e_tx_hcksum_enable != B_TRUE) return (0); - mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags); - if (flags == 0) - return (0); + mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags); + mac_lso_get(mp, &mss, &lsoflags); - if ((ret = mac_ether_offload_info(mp, &meo)) != 0) { - txs->itxs_hck_meoifail.value.ui64++; - return (ret); - } + if (chkflags == 0 && lsoflags == 0) + return (0); /* * Have we been asked to checksum an IPv4 header. If so, verify that we * have sufficient information and then set the proper fields in the * command structure. 
*/ - if (flags & HCK_IPV4_HDRCKSUM) { - if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { + if (chkflags & HCK_IPV4_HDRCKSUM) { + if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); } - if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) { + if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { txs->itxs_hck_nol3info.value.ui64++; return (-1); } - if (meo.meoi_l3proto != ETHERTYPE_IP) { + if (meo->meoi_l3proto != ETHERTYPE_IP) { txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; + tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } @@ -1826,57 +1903,77 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, * onto seeing if we have enough information for the L4 checksum * offload. */ - if (flags & HCK_PARTIALCKSUM) { - if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) { + if (chkflags & HCK_PARTIALCKSUM) { + if ((meo->meoi_flags & MEOI_L4INFO_SET) == 0) { txs->itxs_hck_nol4info.value.ui64++; return (-1); } - if (!(flags & HCK_IPV4_HDRCKSUM)) { - if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { + if (!(chkflags & HCK_IPV4_HDRCKSUM)) { + if ((meo->meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); } - if ((meo.meoi_flags & MEOI_L3INFO_SET) == 0) { + if ((meo->meoi_flags & MEOI_L3INFO_SET) == 0) { txs->itxs_hck_nol3info.value.ui64++; return (-1); } - if (meo.meoi_l3proto == ETHERTYPE_IP) { - tctx->itc_cmdflags |= + if (meo->meoi_l3proto == ETHERTYPE_IP) { + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4; - } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) { - tctx->itc_cmdflags |= + } else if (meo->meoi_l3proto == ETHERTYPE_IPV6) { + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV6; } else { txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_offsets |= (meo->meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } - switch (meo.meoi_l4proto) { + switch (meo->meoi_l4proto) { case IPPROTO_TCP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_TCP; break; case IPPROTO_UDP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_UDP; break; case IPPROTO_SCTP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_SCTP; break; default: txs->itxs_hck_badl4.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) << + tctx->itc_data_offsets |= (meo->meoi_l4hlen >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } + if (lsoflags & HW_LSO) { + /* + * LSO requires that checksum offloads are enabled. If for + * some reason they're not we bail out with an error. 
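[Editor's aside] The TSO length computed just below is plain header arithmetic over the whole LSO payload. A hypothetical worked example, with frame sizes and MSS invented for illustration:

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	/* Hypothetical LSO send: plain Ethernet/IPv4/TCP, no options. */
	size_t l2 = 14, l3 = 20, l4 = 20;
	size_t payload = 32768;
	size_t msgsz = l2 + l3 + l4 + payload;
	size_t mss = 1460;

	size_t tsolen = msgsz - (l2 + l3 + l4);	/* payload bytes only */
	assert(tsolen == payload);
	/* The controller will cut this into ceil(tsolen / mss) segments. */
	assert((tsolen + mss - 1) / mss == 23);
	return (0);
}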
+ */ + if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 || + (chkflags & HCK_PARTIALCKSUM) == 0) { + txs->itxs_lso_nohck.value.ui64++; + return (-1); + } + + tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO; + tctx->itc_ctx_mss = mss; + tctx->itc_ctx_tsolen = msgsize(mp) - + (meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen); + } + return (0); } @@ -1925,7 +2022,20 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) tcb->tcb_dma.dmab_len = 0; break; case I40E_TX_DMA: - (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle); + else if (tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_bind_info != NULL) { + kmem_free(tcb->tcb_bind_info, + tcb->tcb_bind_ncookies * + sizeof (struct i40e_dma_bind_info)); + } + tcb->tcb_bind_info = NULL; + tcb->tcb_bind_ncookies = 0; + tcb->tcb_used_lso = B_FALSE; + break; + case I40E_TX_DESC: break; case I40E_TX_NONE: /* Cast to pacify lint */ @@ -1935,8 +2045,10 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) } tcb->tcb_type = I40E_TX_NONE; - freemsg(tcb->tcb_mp); - tcb->tcb_mp = NULL; + if (tcb->tcb_mp != NULL) { + freemsg(tcb->tcb_mp); + tcb->tcb_mp = NULL; + } tcb->tcb_next = NULL; } @@ -1969,10 +2081,11 @@ i40e_tx_cleanup_ring(i40e_trqpair_t *itrq) i40e_tx_control_block_t *tcb; tcb = itrq->itrq_tcb_work_list[index]; - VERIFY(tcb != NULL); - itrq->itrq_tcb_work_list[index] = NULL; - i40e_tcb_reset(tcb); - i40e_tcb_free(itrq, tcb); + if (tcb != NULL) { + itrq->itrq_tcb_work_list[index] = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + } bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t)); index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size); @@ -1995,6 +2108,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) uint32_t wbhead, toclean, count; i40e_tx_control_block_t *tcbhead; i40e_t *i40e = itrq->itrq_i40e; + uint_t desc_per_tcb, i; mutex_enter(&itrq->itrq_tx_lock); @@ -2042,11 +2156,27 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) tcbhead = tcb; /* - * We zero this out for sanity purposes. + * In the DMA bind case, there may not necessarily be a 1:1 + * mapping between tcb's and descriptors. If the tcb type + * indicates a DMA binding then check the number of DMA + * cookies to determine how many entries to clean in the + * descriptor ring. */ - bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t)); - toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size); - count++; + if (tcb->tcb_type == I40E_TX_DMA) + desc_per_tcb = tcb->tcb_bind_ncookies; + else + desc_per_tcb = 1; + + for (i = 0; i < desc_per_tcb; i++) { + /* + * We zero this out for sanity purposes. 
+ */ + bzero(&itrq->itrq_desc_ring[toclean], + sizeof (i40e_tx_desc_t)); + toclean = i40e_next_desc(toclean, 1, + itrq->itrq_tx_ring_size); + count++; + } } itrq->itrq_desc_head = wbhead; @@ -2078,10 +2208,610 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count); } +static void +i40e_tx_copy_fragment(i40e_tx_control_block_t *tcb, const mblk_t *mp, + const size_t off, const size_t len) +{ + const void *soff = mp->b_rptr + off; + void *doff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + + ASSERT3U(len, >, 0); + ASSERT3P(soff, >=, mp->b_rptr); + ASSERT3P(soff, <=, mp->b_wptr); + ASSERT3U(len, <=, MBLKL(mp)); + ASSERT3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr); + ASSERT3U(tcb->tcb_dma.dmab_size - tcb->tcb_dma.dmab_len, >=, len); + bcopy(soff, doff, len); + tcb->tcb_type = I40E_TX_COPY; + tcb->tcb_dma.dmab_len += len; + I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); +} + +static i40e_tx_control_block_t * +i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp, + size_t off, boolean_t use_lso) +{ + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie; + uint_t i = 0, ncookies = 0, dmaflags; + i40e_tx_control_block_t *tcb; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + return (NULL); + } + tcb->tcb_type = I40E_TX_DMA; + + if (use_lso == B_TRUE) + dma_handle = tcb->tcb_lso_dma_handle; + else + dma_handle = tcb->tcb_dma_handle; + + dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING; + if (ddi_dma_addr_bind_handle(dma_handle, NULL, + (caddr_t)(mp->b_rptr + off), MBLKL(mp) - off, dmaflags, + DDI_DMA_DONTWAIT, NULL, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) { + txs->itxs_bind_fails.value.ui64++; + goto bffail; + } + + tcb->tcb_bind_ncookies = ncookies; + tcb->tcb_used_lso = use_lso; + + tcb->tcb_bind_info = + kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info), + KM_NOSLEEP); + if (tcb->tcb_bind_info == NULL) + goto bffail; + + while (i < ncookies) { + if (i > 0) + ddi_dma_nextcookie(dma_handle, &dma_cookie); + + tcb->tcb_bind_info[i].dbi_paddr = + (caddr_t)dma_cookie.dmac_laddress; + tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size; + } + + return (tcb); + +bffail: + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + return (NULL); +} + +static void +i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx, + caddr_t buff, size_t len, boolean_t last_desc) +{ + i40e_tx_desc_t *txdesc; + int cmd; + + ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); + itrq->itrq_desc_free--; + txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; + itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, + itrq->itrq_tx_ring_size); + + cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags; + + /* + * The last data descriptor needs the EOP bit set, so that the HW knows + * that we're ready to send. Additionally, we set the RS (Report + * Status) bit, so that we are notified when the transmit engine has + * completed DMA'ing all of the data descriptors and data buffers + * associated with this frame. + */ + if (last_desc == B_TRUE) { + cmd |= I40E_TX_DESC_CMD_EOP; + cmd |= I40E_TX_DESC_CMD_RS; + } + + /* + * Per the X710 manual, section 8.4.2.1.1, the buffer size + * must be a value from 1 to 16K minus 1, inclusive. 
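[Editor's aside] The recycle loop above frees one ring entry per copy TCB and one per cookie for a DMA-bound TCB; the chain builders below do the mirror-image accounting when deciding whether a frame fits. A simplified model of that bookkeeping, with invented types:

#include <stdio.h>
#include <stddef.h>

typedef enum { TCB_COPY, TCB_DMA } tcb_type_t;

typedef struct tcb {
	struct tcb	*tcb_next;
	tcb_type_t	tcb_type;
	unsigned	tcb_ncookies;	/* meaningful for TCB_DMA only */
} tcb_t;

/* Ring descriptors a TCB chain consumes: 1 per copy, 1 per DMA cookie. */
static unsigned
tcb_chain_descs(const tcb_t *t)
{
	unsigned n = 0;

	for (; t != NULL; t = t->tcb_next)
		n += (t->tcb_type == TCB_DMA) ? t->tcb_ncookies : 1;
	return (n);
}

int
main(void)
{
	tcb_t dma = { NULL, TCB_DMA, 3 };
	tcb_t copy = { &dma, TCB_COPY, 0 };

	printf("%u\n", tcb_chain_descs(&copy));	/* 1 + 3 = 4 */
	return (0);
}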
+ */ + ASSERT3U(len, >=, 1); + ASSERT3U(len, <=, I40E_MAX_TX_BUFSZ); + + txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)buff); + txdesc->cmd_type_offset_bsz = + LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA | + ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | + ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((uint64_t)len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); +} + +/* + * Place 'tcb' on the tail of the list represented by 'head'/'tail'. + */ +static inline void +tcb_list_append(i40e_tx_control_block_t **head, i40e_tx_control_block_t **tail, + i40e_tx_control_block_t *tcb) +{ + if (*head == NULL) { + *head = tcb; + *tail = *head; + } else { + ASSERT3P(*tail, !=, NULL); + ASSERT3P((*tail)->tcb_next, ==, NULL); + (*tail)->tcb_next = tcb; + *tail = tcb; + } +} + +/* + * This function takes a single packet, possibly consisting of + * multiple mblks, and creates a TCB chain to send to the controller. + * This TCB chain may span up to a maximum of 8 descriptors. A copy + * TCB consumes one descriptor; whereas a DMA TCB may consume 1 or + * more, depending on several factors. For each fragment (invidual + * mblk making up the packet), we determine if its size dictates a + * copy to the TCB buffer or a DMA bind of the dblk buffer. We keep a + * count of descriptors used; when that count reaches the max we force + * all remaining fragments into a single TCB buffer. We have a + * guarantee that the TCB buffer is always larger than the MTU -- so + * there is always enough room. Consecutive fragments below the DMA + * threshold are copied into a single TCB. In the event of an error + * this function returns NULL but leaves 'mp' alone. + */ +static i40e_tx_control_block_t * +i40e_non_lso_chain(i40e_trqpair_t *itrq, mblk_t *mp, uint_t *ndesc) +{ + const mblk_t *nmp = mp; + uint_t needed_desc = 0; + boolean_t force_copy = B_FALSE; + i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; + i40e_t *i40e = itrq->itrq_i40e; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + /* TCB buffer is always larger than MTU. */ + ASSERT3U(msgsize(mp), <, i40e->i40e_tx_buf_size); + + while (nmp != NULL) { + const size_t nmp_len = MBLKL(nmp); + + /* Ignore zero-length mblks. */ + if (nmp_len == 0) { + nmp = nmp->b_cont; + continue; + } + + if (nmp_len < i40e->i40e_tx_dma_min || force_copy) { + /* Compress consecutive copies into one TCB. */ + if (tcb != NULL && tcb->tcb_type == I40E_TX_COPY) { + i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); + nmp = nmp->b_cont; + continue; + } + + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto fail; + } + + /* + * TCB DMA buffer is guaranteed to be one + * cookie by i40e_alloc_dma_buffer(). + */ + i40e_tx_copy_fragment(tcb, nmp, 0, nmp_len); + needed_desc++; + tcb_list_append(&tcbhead, &tcbtail, tcb); + } else { + uint_t total_desc; + + tcb = i40e_tx_bind_fragment(itrq, nmp, 0, B_FALSE); + if (tcb == NULL) { + i40e_error(i40e, "dma bind failed!"); + goto fail; + } + + /* + * If the new total exceeds the max or we've + * reached the limit and there's data left, + * then give up binding and copy the rest into + * the pre-allocated TCB buffer. 
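[Editor's aside] To make the descriptor budget concrete, here is a stand-alone toy walk over a made-up fragment list applying the same rules: copy below the threshold (with consecutive copies sharing one TCB and one descriptor), bind otherwise, and force-copy once the budget would be blown. The fragment sizes, cookie counts, and threshold value are all invented.

#include <stdio.h>
#include <stddef.h>

#define	MAX_DESC	8	/* mirrors I40E_TX_MAX_COOKIE */
#define	DMA_THRESH	256	/* assumed tx_dma_threshold value */

int
main(void)
{
	size_t len[] = { 64, 1448, 1448 };	/* hypothetical fragments */
	unsigned cookies[] = { 1, 2, 2 };	/* pretend bind results */
	unsigned nfrags = sizeof (len) / sizeof (len[0]);
	unsigned used = 0;
	int prev_copy = 0, force_copy = 0;

	for (unsigned i = 0; i < nfrags; i++) {
		if (len[i] < DMA_THRESH || force_copy) {
			if (!prev_copy)		/* consecutive copies share */
				used++;		/* one TCB and descriptor */
			prev_copy = 1;
		} else if (used + cookies[i] > MAX_DESC ||
		    (used + cookies[i] == MAX_DESC && i + 1 < nfrags)) {
			force_copy = 1;		/* copy this and the rest */
			if (!prev_copy)
				used++;
			prev_copy = 1;
		} else {
			used += cookies[i];	/* accept the bind */
			prev_copy = 0;
		}
	}
	printf("descriptors used: %u\n", used);	/* prints 5 */
	return (0);
}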
+ */ + total_desc = needed_desc + tcb->tcb_bind_ncookies; + if ((total_desc > I40E_TX_MAX_COOKIE) || + (total_desc == I40E_TX_MAX_COOKIE && + nmp->b_cont != NULL)) { + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + + if (tcbtail != NULL && + tcbtail->tcb_type == I40E_TX_COPY) { + tcb = tcbtail; + } else { + tcb = NULL; + } + + force_copy = B_TRUE; + txs->itxs_force_copy.value.ui64++; + continue; + } + + needed_desc += tcb->tcb_bind_ncookies; + tcb_list_append(&tcbhead, &tcbtail, tcb); + } + + nmp = nmp->b_cont; + } + + ASSERT3P(nmp, ==, NULL); + ASSERT3U(needed_desc, <=, I40E_TX_MAX_COOKIE); + ASSERT3P(tcbhead, !=, NULL); + *ndesc += needed_desc; + return (tcbhead); + +fail: + tcb = tcbhead; + while (tcb != NULL) { + i40e_tx_control_block_t *next = tcb->tcb_next; + + ASSERT(tcb->tcb_type == I40E_TX_DMA || + tcb->tcb_type == I40E_TX_COPY); + + tcb->tcb_mp = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + tcb = next; + } + + return (NULL); +} + +/* + * Section 8.4.1 of the 700-series programming guide states that a + * segment may span up to 8 data descriptors; including both header + * and payload data. However, empirical evidence shows that the + * controller freezes the Tx queue when presented with a segment of 8 + * descriptors. Or, at least, when the first segment contains 8 + * descriptors. One explanation is that the controller counts the + * context descriptor against the first segment, even though the + * programming guide makes no mention of such a constraint. In any + * case, we limit TSO segments to 7 descriptors to prevent Tx queue + * freezes. We still allow non-TSO segments to utilize all 8 + * descriptors as they have not demonstrated the faulty behavior. + */ +uint_t i40e_lso_num_descs = 7; + +#define I40E_TCB_LEFT(tcb) \ + ((tcb)->tcb_dma.dmab_size - (tcb)->tcb_dma.dmab_len) + +/* + * This function is similar in spirit to i40e_non_lso_chain(), but + * much more complicated in reality. Like the previous function, it + * takes a packet (an LSO packet) as input and returns a chain of + * TCBs. The complication comes with the fact that we are no longer + * trying to fit the entire packet into 8 descriptors, but rather we + * must fit each MSS-size segment of the LSO packet into 8 descriptors. + * Except it's really 7 descriptors, see i40e_lso_num_descs. + * + * Your first inclination might be to verify that a given segment + * spans no more than 7 mblks; but it's actually much more subtle than + * that. First, let's describe what the hardware expects, and then we + * can expound on the software side of things. + * + * For an LSO packet the hardware expects the following: + * + * o Each MSS-sized segment must span no more than 7 descriptors. + * + * o The header size does not count towards the segment size. + * + * o If header and payload share the first descriptor, then the + * controller will count the descriptor twice. + * + * The most important thing to keep in mind is that the hardware does + * not view the segments in terms of mblks, like we do. The hardware + * only sees descriptors. It will iterate each descriptor in turn, + * keeping a tally of bytes seen and descriptors visited. If the byte + * count hasn't reached MSS by the time the descriptor count reaches + * 7, then the controller freezes the queue and we are stuck. + * Furthermore, the hardware picks up its tally where it left off. So + * if it reached MSS in the middle of a descriptor, it will start + * tallying the next segment in the middle of that descriptor. 
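[Editor's aside] Running the controller's tally by hand is the easiest way to internalize the constraint. The sketch below walks an invented descriptor layout the way the comment describes the hardware doing it -- counting bytes and descriptors, carrying the remainder over mid-descriptor -- and flags a layout that would hang the queue. It deliberately ignores the header-sharing double count for brevity.

#include <stdio.h>

#define	MSS		1460U
#define	SEG_LIMIT	7U	/* mirrors i40e_lso_num_descs */

int
main(void)
{
	/* Invented data-descriptor (cookie) sizes for one LSO payload. */
	unsigned desc[] = { 180, 180, 180, 180, 180, 180, 180, 180 };
	unsigned n = sizeof (desc) / sizeof (desc[0]);
	unsigned segsz = 0, segdesc = 0;

	for (unsigned i = 0; i < n; i++) {
		if (++segdesc > SEG_LIMIT) {
			printf("descriptor %u would hang the queue\n", i);
			return (1);
		}
		segsz += desc[i];
		/* Carry the tally over mid-descriptor, like the HW does. */
		while (segsz >= MSS) {
			segsz -= MSS;
			segdesc = (segsz == 0) ? 0 : 1;
		}
	}
	printf("layout fits the segment limit\n");
	return (0);
}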
The + * hardware's view is entirely removed from the mblk chain or even the + * descriptor layout. Consider these facts: + * + * o The MSS will vary dpeneding on MTU and other factors. + * + * o The dblk allocation will sit at various offsets within a + * memory page. + * + * o The page size itself could vary in the future (i.e. not + * always 4K). + * + * o Just because a dblk is virtually contiguous doesn't mean + * it's physically contiguous. The number of cookies + * (descriptors) required by a DMA bind of a single dblk is at + * the mercy of the page size and physical layout. + * + * o The descriptors will most often NOT start/end on a MSS + * boundary. Thus the hardware will often start counting the + * MSS mid descriptor and finish mid descriptor. + * + * The upshot of all this is that the driver must learn to think like + * the controller; and verify that none of the constraints are broken. + * It does this by tallying up the segment just like the hardware + * would. This is handled by the two variables 'segsz' and 'segdesc'. + * After each attempt to bind a dblk, we check the constaints. If + * violated, we undo the DMA and force a copy until MSS is met. We + * have a guarantee that the TCB buffer is larger than MTU; thus + * ensuring we can always meet the MSS with a single copy buffer. We + * also copy consecutive non-DMA fragments into the same TCB buffer. + */ +static i40e_tx_control_block_t * +i40e_lso_chain(i40e_trqpair_t *itrq, const mblk_t *mp, + const mac_ether_offload_info_t *meo, const i40e_tx_context_t *tctx, + uint_t *ndesc) +{ + size_t mp_len = MBLKL(mp); + /* + * The cpoff (copy offset) variable tracks the offset inside + * the current mp. There are cases where the entire mp is not + * fully copied in one go: such as the header copy followed by + * a non-DMA mblk, or a TCB buffer that only has enough space + * to copy part of the current mp. + */ + size_t cpoff = 0; + /* + * The segsz and segdesc variables track the controller's view + * of the segment. The needed_desc variable tracks the total + * number of data descriptors used by the driver. + */ + size_t segsz = 0; + uint_t segdesc = 0; + uint_t needed_desc = 0; + size_t hdrcopied = 0; + const size_t hdrlen = + meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen; + const size_t mss = tctx->itc_ctx_mss; + boolean_t force_copy = B_FALSE; + i40e_tx_control_block_t *tcb = NULL, *tcbhead = NULL, *tcbtail = NULL; + i40e_t *i40e = itrq->itrq_i40e; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + /* + * We always copy the header in order to avoid more + * complicated code dealing with various edge cases. + */ + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto fail; + } + + needed_desc++; + tcb_list_append(&tcbhead, &tcbtail, tcb); + + while (hdrcopied < hdrlen) { + const size_t tocopy = MIN(hdrlen - hdrcopied, mp_len); + i40e_tx_copy_fragment(tcb, mp, 0, tocopy); + hdrcopied += tocopy; + cpoff += tocopy; + if (tocopy == mp_len) { + /* + * This is a bit of defensive programming. We + * should never have a chain too short to + * satisfy the headers -- but just in case. + */ + if ((mp = mp->b_cont) == NULL) { + txs->itxs_tx_short.value.ui64++; + goto fail; + } + + while ((mp_len = MBLKL(mp)) == 0) { + if ((mp = mp->b_cont) == NULL) { + txs->itxs_tx_short.value.ui64++; + goto fail; + } + } + cpoff = 0; + } + } + ASSERT3U(hdrcopied, ==, hdrlen); + + /* + * A single descriptor containing both header and data is + * counted twice by the controller. 
+ */ + if (mp_len < i40e->i40e_tx_dma_min) { + segdesc = 2; + } else { + segdesc = 1; + } + + while (mp != NULL) { + mp_len = MBLKL(mp); +force_copy: + /* Ignore zero-length mblks. */ + if (mp_len == 0) { + mp = mp->b_cont; + cpoff = 0; + continue; + } + + /* + * We copy into the preallocated TCB buffer when the + * current fragment is less than the DMA threshold OR + * when the DMA bind can't meet the controller's + * segment descriptor limit. + */ + if (mp_len < i40e->i40e_tx_dma_min || force_copy) { + size_t tocopy; + + /* + * Our objective here is to compress + * consecutive copies into one TCB (until it + * is full). If there is no current TCB, or if + * it is a DMA TCB, then allocate a new one. + */ + if (tcb == NULL || + (tcb != NULL && tcb->tcb_type != I40E_TX_COPY)) { + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto fail; + } + + /* + * The TCB DMA buffer is guaranteed to + * be one cookie by i40e_alloc_dma_buffer(). + */ + needed_desc++; + segdesc++; + ASSERT3U(segdesc, <=, i40e_lso_num_descs); + tcb_list_append(&tcbhead, &tcbtail, tcb); + } else if (segdesc == 0) { + /* + * We are copying into an existing TCB + * but we just crossed the MSS + * boundary. Make sure to increment + * segdesc to track the descriptor + * count as the hardware would. + */ + segdesc++; + } + + tocopy = MIN(I40E_TCB_LEFT(tcb), mp_len - cpoff); + i40e_tx_copy_fragment(tcb, mp, cpoff, tocopy); + cpoff += tocopy; + segsz += tocopy; + + /* We have consumed the current mp. */ + if (cpoff == mp_len) { + mp = mp->b_cont; + cpoff = 0; + } + + /* We have consumed the current TCB buffer. */ + if (I40E_TCB_LEFT(tcb) == 0) { + tcb = NULL; + } + + /* + * We have met MSS with this copy; restart the + * counters. + */ + if (segsz >= mss) { + segsz = segsz % mss; + segdesc = segsz == 0 ? 0 : 1; + force_copy = B_FALSE; + } + + /* + * We are at the controller's descriptor + * limit; we must copy into the current TCB + * until MSS is reached. The TCB buffer is + * always bigger than the MTU so we know it is + * big enough to meet the MSS. + */ + if (segdesc == i40e_lso_num_descs) { + force_copy = B_TRUE; + } + } else { + uint_t tsegdesc = segdesc; + size_t tsegsz = segsz; + + ASSERT(force_copy == B_FALSE); + ASSERT3U(tsegdesc, <, i40e_lso_num_descs); + + tcb = i40e_tx_bind_fragment(itrq, mp, cpoff, B_TRUE); + if (tcb == NULL) { + i40e_error(i40e, "dma bind failed!"); + goto fail; + } + + for (uint_t i = 0; i < tcb->tcb_bind_ncookies; i++) { + struct i40e_dma_bind_info dbi = + tcb->tcb_bind_info[i]; + + tsegsz += dbi.dbi_len; + tsegdesc++; + ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); + + /* + * We've met the MSS with this portion + * of the DMA. + */ + if (tsegsz >= mss) { + tsegsz = tsegsz % mss; + tsegdesc = tsegsz == 0 ? 0 : 1; + } + + /* + * We've reached max descriptors but + * have not met the MSS. Undo the bind + * and instead copy. + */ + if (tsegdesc == i40e_lso_num_descs) { + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + + if (tcbtail != NULL && + I40E_TCB_LEFT(tcb) > 0 && + tcbtail->tcb_type == I40E_TX_COPY) { + tcb = tcbtail; + } else { + tcb = NULL; + } + + /* + * Remember, we are still on + * the same mp. + */ + force_copy = B_TRUE; + txs->itxs_tso_force_copy.value.ui64++; + goto force_copy; + } + } + + ASSERT3U(tsegdesc, <=, i40e_lso_num_descs); + ASSERT3U(tsegsz, <, mss); + + /* + * We've made if through the loop without + * breaking the segment descriptor contract + * with the controller -- replace the segment + * tracking values with the temporary ones. 
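[Editor's aside] The boundary-crossing reset used by both the copy and bind paths above is just modular arithmetic on the byte tally. With invented numbers:

#include <assert.h>

int
main(void)
{
	unsigned mss = 1460, segsz = 1400, segdesc = 3;

	segsz += 200;			/* copy 200 more bytes: 1600 */
	if (segsz >= mss) {
		segsz %= mss;		/* 140 bytes into the next segment */
		segdesc = (segsz == 0) ? 0 : 1;
	}
	assert(segsz == 140);
	assert(segdesc == 1);
	return (0);
}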
+ */ + segdesc = tsegdesc; + segsz = tsegsz; + needed_desc += tcb->tcb_bind_ncookies; + cpoff = 0; + tcb_list_append(&tcbhead, &tcbtail, tcb); + mp = mp->b_cont; + } + } + + ASSERT3P(mp, ==, NULL); + ASSERT3P(tcbhead, !=, NULL); + *ndesc += needed_desc; + return (tcbhead); + +fail: + tcb = tcbhead; + while (tcb != NULL) { + i40e_tx_control_block_t *next = tcb->tcb_next; + + ASSERT(tcb->tcb_type == I40E_TX_DMA || + tcb->tcb_type == I40E_TX_COPY); + + tcb->tcb_mp = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + tcb = next; + } + + return (NULL); +} + /* * We've been asked to send a message block on the wire. We'll only have a * single chain. There will not be any b_next pointers; however, there may be - * multiple b_cont blocks. + * multiple b_cont blocks. The number of b_cont blocks may exceed the + * controller's Tx descriptor limit. * * We may do one of three things with any given mblk_t chain: * @@ -2096,12 +2826,14 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) mblk_t * i40e_ring_tx(void *arg, mblk_t *mp) { - const mblk_t *nmp; - size_t mpsize; - i40e_tx_control_block_t *tcb; - i40e_tx_desc_t *txdesc; + size_t msglen; + i40e_tx_control_block_t *tcb_ctx = NULL, *tcb = NULL, *tcbhead = NULL; + i40e_tx_context_desc_t *ctxdesc; + mac_ether_offload_info_t meo; i40e_tx_context_t tctx; - int cmd, type; + int type; + uint_t needed_desc = 0; + boolean_t do_ctx_desc = B_FALSE, use_lso = B_FALSE; i40e_trqpair_t *itrq = arg; i40e_t *i40e = itrq->itrq_i40e; @@ -2119,107 +2851,137 @@ i40e_ring_tx(void *arg, mblk_t *mp) return (NULL); } + if (mac_ether_offload_info(mp, &meo) != 0) { + freemsg(mp); + itrq->itrq_txstat.itxs_hck_meoifail.value.ui64++; + return (NULL); + } + /* * Figure out the relevant context about this frame that we might need - * for enabling checksum, lso, etc. This also fills in information that + * for enabling checksum, LSO, etc. This also fills in information that * we might set around the packet type, etc. */ - if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) { + if (i40e_tx_context(i40e, itrq, mp, &meo, &tctx) < 0) { freemsg(mp); itrq->itrq_txstat.itxs_err_context.value.ui64++; return (NULL); } + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + use_lso = B_TRUE; + do_ctx_desc = B_TRUE; + } /* * For the primordial driver we can punt on doing any recycling right * now; however, longer term we need to probably do some more pro-active - * recycling to cut back on stalls in the tx path. + * recycling to cut back on stalls in the TX path. */ - /* - * Do a quick size check to make sure it fits into what we think it - * should for this device. Note that longer term this will be false, - * particularly when we have the world of TSO. - */ - mpsize = 0; - for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - mpsize += MBLKL(nmp); + msglen = msgsize(mp); + + if (do_ctx_desc) { + /* + * If we're doing tunneling or LSO, then we'll need a TX + * context descriptor in addition to one or more TX data + * descriptors. Since there's no data DMA block or handle + * associated with the context descriptor, we create a special + * control block that behaves effectively like a NOP. + */ + if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto txfail; + } + tcb_ctx->tcb_type = I40E_TX_DESC; + needed_desc++; } - /* - * First we allocate our tx control block and prepare the packet for - * transmit before we do a final check for descriptors. We do it this - * way to minimize the time under the tx lock. 
- */ - tcb = i40e_tcb_alloc(itrq); - if (tcb == NULL) { - txs->itxs_err_notcb.value.ui64++; - goto txfail; + if (!use_lso) { + tcbhead = i40e_non_lso_chain(itrq, mp, &needed_desc); + } else { + tcbhead = i40e_lso_chain(itrq, mp, &meo, &tctx, &needed_desc); } - /* - * For transmitting a block, we're currently going to use just a - * single control block and bcopy all of the fragments into it. We - * should be more intelligent about doing DMA binding or otherwise, but - * for getting off the ground this will have to do. - */ - ASSERT(tcb->tcb_dma.dmab_len == 0); - ASSERT(tcb->tcb_dma.dmab_size >= mpsize); - for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - size_t clen = MBLKL(nmp); - void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + if (tcbhead == NULL) + goto txfail; - bcopy(nmp->b_rptr, coff, clen); - tcb->tcb_dma.dmab_len += clen; - } - ASSERT(tcb->tcb_dma.dmab_len == mpsize); + tcbhead->tcb_mp = mp; /* - * While there's really no need to keep the mp here, but let's just do - * it to help with our own debugging for now. + * The second condition ensures that 'itrq_desc_tail' never + * equals 'itrq_desc_head'. This enforces the rule found in + * the second bullet point of section 8.4.3.1.5 of the XL710 + * PG, which declares the TAIL pointer in I40E_QTX_TAIL should + * never overlap with the head. This means that we only ever + * have 'itrq_tx_ring_size - 1' total available descriptors. */ - tcb->tcb_mp = mp; - tcb->tcb_type = I40E_TX_COPY; - I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); - mutex_enter(&itrq->itrq_tx_lock); - if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) { + if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh || + (itrq->itrq_desc_free - 1) < needed_desc) { txs->itxs_err_nodescs.value.ui64++; mutex_exit(&itrq->itrq_tx_lock); goto txfail; } - /* - * Build up the descriptor and send it out. Thankfully at the moment - * we only need a single desc, because we're not doing anything fancy - * yet. - */ - ASSERT(itrq->itrq_desc_free > 0); - itrq->itrq_desc_free--; - txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; - itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; - itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, - itrq->itrq_tx_ring_size); + if (do_ctx_desc) { + /* + * If we're enabling any offloads for this frame, then we'll + * need to build up a transmit context descriptor, first. The + * context descriptor needs to be placed in the TX ring before + * the data descriptor(s). See section 8.4.2, table 8-16 + */ + uint_t tail = itrq->itrq_desc_tail; + itrq->itrq_desc_free--; + ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail]; + itrq->itrq_tcb_work_list[tail] = tcb_ctx; + itrq->itrq_desc_tail = i40e_next_desc(tail, 1, + itrq->itrq_tx_ring_size); + + /* QW0 */ + type = I40E_TX_DESC_DTYPE_CONTEXT; + ctxdesc->tunneling_params = 0; + ctxdesc->l2tag2 = 0; + + /* QW1 */ + ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type); + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t) + ((uint64_t)tctx.itc_ctx_cmdflags << + I40E_TXD_CTX_QW1_CMD_SHIFT) | + ((uint64_t)tctx.itc_ctx_tsolen << + I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | + ((uint64_t)tctx.itc_ctx_mss << + I40E_TXD_CTX_QW1_MSS_SHIFT)); + } + } - /* - * Note, we always set EOP and RS which indicates that this is the last - * data frame and that we should ask for it to be transmitted. We also - * must always set ICRC, because that is an internal bit that must be - * set to one for data descriptors. 
The remaining bits in the command
-	 * descriptor depend on checksumming and are determined based on the
-	 * information set up in i40e_tx_context().
-	 */
-	type = I40E_TX_DESC_DTYPE_DATA;
-	cmd = I40E_TX_DESC_CMD_EOP |
-	    I40E_TX_DESC_CMD_RS |
-	    I40E_TX_DESC_CMD_ICRC |
-	    tctx.itc_cmdflags;
-	txdesc->buffer_addr =
-	    CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address);
-	txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type |
-	    ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) |
-	    ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) |
-	    ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)));
+	tcb = tcbhead;
+	while (tcb != NULL) {
+
+		itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb;
+		if (tcb->tcb_type == I40E_TX_COPY) {
+			boolean_t last_desc = (tcb->tcb_next == NULL);
+
+			i40e_tx_set_data_desc(itrq, &tctx,
+			    (caddr_t)tcb->tcb_dma.dmab_dma_address,
+			    tcb->tcb_dma.dmab_len, last_desc);
+		} else {
+			boolean_t last_desc = B_FALSE;
+			ASSERT3S(tcb->tcb_type, ==, I40E_TX_DMA);
+
+			for (uint_t c = 0; c < tcb->tcb_bind_ncookies; c++) {
+				last_desc = (c == tcb->tcb_bind_ncookies - 1) &&
+				    (tcb->tcb_next == NULL);
+
+				i40e_tx_set_data_desc(itrq, &tctx,
+				    tcb->tcb_bind_info[c].dbi_paddr,
+				    tcb->tcb_bind_info[c].dbi_len,
+				    last_desc);
+			}
+		}
+
+		tcb = tcb->tcb_next;
+	}
 
 	/*
 	 * Now, finally, sync the DMA data and alert hardware.
@@ -2228,6 +2990,7 @@ i40e_ring_tx(void *arg, mblk_t *mp)
 	I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index),
 	    itrq->itrq_desc_tail);
+
 	if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) !=
 	    DDI_FM_OK) {
 		/*
@@ -2239,9 +3002,9 @@ i40e_ring_tx(void *arg, mblk_t *mp)
 		atomic_or_32(&i40e->i40e_state, I40E_ERROR);
 	}
 
-	txs->itxs_bytes.value.ui64 += mpsize;
+	txs->itxs_bytes.value.ui64 += msglen;
 	txs->itxs_packets.value.ui64++;
-	txs->itxs_descriptors.value.ui64++;
+	txs->itxs_descriptors.value.ui64 += needed_desc;
 
 	mutex_exit(&itrq->itrq_tx_lock);
 
@@ -2254,10 +3017,23 @@ txfail:
 	 * Make sure to reset their message blocks, since we'll return them
 	 * back to MAC.
 	 */
-	if (tcb != NULL) {
+	if (tcb_ctx != NULL) {
+		tcb_ctx->tcb_mp = NULL;
+		i40e_tcb_reset(tcb_ctx);
+		i40e_tcb_free(itrq, tcb_ctx);
+	}
+
+	tcb = tcbhead;
+	while (tcb != NULL) {
+		i40e_tx_control_block_t *next = tcb->tcb_next;
+
+		ASSERT(tcb->tcb_type == I40E_TX_DMA ||
+		    tcb->tcb_type == I40E_TX_COPY);
+
 		tcb->tcb_mp = NULL;
 		i40e_tcb_reset(tcb);
 		i40e_tcb_free(itrq, tcb);
+		tcb = next;
 	}
 
 	mutex_enter(&itrq->itrq_tx_lock);
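
The LSO path in the hunks above keeps two running counters per MSS-sized segment: segsz, the bytes accumulated toward the current segment, and segdesc, the data descriptors that segment will consume. Whenever segsz crosses the MSS the counters restart (any spill-over counts as one descriptor toward the next segment), and if segdesc would reach i40e_lso_num_descs before the MSS is met, the driver undoes the bind and copies into a TCB buffer instead. The following standalone sketch mimics that accounting under the simplifying assumption that every fragment maps to exactly one DMA cookie; the fragment lengths, the limit of 8, and the main() harness are illustrative stand-ins, not driver code.

	#include <stdio.h>

	#define	LSO_NUM_DESCS	8	/* assumed per-segment descriptor limit */

	int
	main(void)
	{
		/* Hypothetical cookie lengths for one LSO payload. */
		const size_t frags[] = { 1448, 4096, 4096, 2048, 512 };
		const size_t mss = 1448;
		size_t segsz = 0;
		unsigned int segdesc = 0;
		int force_copy = 0;

		for (size_t i = 0; i < sizeof (frags) / sizeof (frags[0]); i++) {
			/* One cookie contributes one descriptor to the segment. */
			segsz += frags[i];
			segdesc++;

			/* Crossing the MSS restarts the per-segment counters. */
			if (segsz >= mss) {
				segsz = segsz % mss;
				segdesc = (segsz == 0) ? 0 : 1;
			}

			/*
			 * Reaching the limit before the MSS is what forces the
			 * driver to copy into a TCB instead of binding.
			 */
			if (segdesc == LSO_NUM_DESCS)
				force_copy = 1;

			(void) printf("frag %zu: segsz=%zu segdesc=%u copy=%d\n",
			    i, segsz, segdesc, force_copy);
		}
		return (0);
	}

With these stand-in lengths the descriptor count never reaches the limit, so no forced copy occurs; a run of many small cookies inside a single MSS window is what would trigger it.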
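
The ring-admission test in i40e_ring_tx() encodes the XL710 rule cited in the comment above: TAIL may never advance onto HEAD, so at most itrq_tx_ring_size - 1 descriptors can ever be outstanding, and a frame is queued only when its descriptor count fits under that reserve as well as above the recycle threshold. A minimal sketch of that test, using hypothetical local names (tx_ring_has_room, desc_free, block_thresh) rather than the driver's structures:

	#include <stdbool.h>
	#include <stdint.h>

	/*
	 * True when 'needed_desc' descriptors can be posted without the tail
	 * catching the head and without dropping below the blocking threshold.
	 */
	bool
	tx_ring_has_room(uint32_t desc_free, uint32_t block_thresh,
	    uint32_t needed_desc)
	{
		/* Too few free descriptors: stall and wait for recycling. */
		if (desc_free < block_thresh)
			return (false);

		/* Keep one descriptor in reserve so tail never lands on head. */
		if (desc_free - 1 < needed_desc)
			return (false);

		return (true);
	}

This mirrors the two-part check taken under itrq_tx_lock before any descriptors are consumed; when it fails, the driver unwinds its control blocks in the txfail path rather than partially filling the ring.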