From c61a1653a4d73dbc950dac7d96350fd6cb517486 Mon Sep 17 00:00:00 2001 From: Ryan Zezeski Date: Mon, 4 May 2020 17:50:44 +0000 Subject: 12676 want better offloads for vnics 12677 simnet has bogus mi_tx_cksum_flags 12678 mac_tx() is too eager to emulate hardware offloads Portions contributed by: Patrick Mooney Portions contributed by: Robert Mustacchi Reviewed by: Patrick Mooney Reviewed by: Andy Fiddaman Approved by: Dan McDonald --- usr/src/uts/common/io/mac/mac.c | 88 +- usr/src/uts/common/io/mac/mac_bcast.c | 13 +- usr/src/uts/common/io/mac/mac_client.c | 134 ++- usr/src/uts/common/io/mac/mac_datapath_setup.c | 2 +- usr/src/uts/common/io/mac/mac_flow.c | 3 +- usr/src/uts/common/io/mac/mac_provider.c | 96 +- usr/src/uts/common/io/mac/mac_sched.c | 91 +- usr/src/uts/common/io/mac/mac_soft_ring.c | 2 +- usr/src/uts/common/io/mac/mac_util.c | 1490 +++++++++++++++++++++--- 9 files changed, 1620 insertions(+), 299 deletions(-) (limited to 'usr/src/uts/common/io/mac') diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 76b4765de6..0a52043a15 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -1753,7 +1753,7 @@ mac_client_clear_flow_cb(mac_client_handle_t mch) flow_entry_t *flent = mcip->mci_flent; mutex_enter(&flent->fe_lock); - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; flent->fe_flags |= FE_MC_NO_DATAPATH; @@ -1936,8 +1936,7 @@ mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp) mac_client_impl_t *mcip = (mac_client_impl_t *)mch; mac_impl_t *mip = mcip->mci_mip; - MAC_TX(mip, rh, mp, mcip); - return (mp); + return (mac_provider_tx(mip, rh, mp, mcip)); } /* @@ -4712,9 +4711,9 @@ mac_group_remmac(mac_group_t *group, const uint8_t *addr) } /* - * This is the entry point for packets transmitted through the bridging code. 
- * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh' - * pointer may be NULL to select the default ring. + * This is the entry point for packets transmitted through the bridge + * code. If no bridge is in place, mac_ring_tx() transmits via the tx + * ring. The 'rh' pointer may be NULL to select the default ring. */ mblk_t * mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp) @@ -4731,8 +4730,34 @@ mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp) mac_bridge_ref_cb(mh, B_TRUE); mutex_exit(&mip->mi_bridge_lock); if (mh == NULL) { - MAC_RING_TX(mip, rh, mp, mp); + mp = mac_ring_tx((mac_handle_t)mip, rh, mp); } else { + /* + * The bridge may place this mblk on a provider's Tx + * path, a mac's Rx path, or both. Since we don't have + * enough information at this point, we can't be sure + * that the destination(s) are capable of handling the + * hardware offloads requested by the mblk. We emulate + * them here as it is the safest choice. In the + * future, if bridge performance becomes a priority, + * we can elide the emulation here and leave the + * choice up to bridge. + * + * We don't clear the DB_CKSUMFLAGS here because + * HCK_IPV4_HDRCKSUM (Tx) and HCK_IPV4_HDRCKSUM_OK + * (Rx) still have the same value. If the bridge + * receives a packet from a HCKSUM_IPHDRCKSUM NIC then + * the mac(s) it is forwarded on may calculate the + * checksum again, but incorrectly (because the + * checksum field is not zero). Until the + * HCK_IPV4_HDRCKSUM/HCK_IPV4_HDRCKSUM_OK issue is + * resovled, we leave the flag clearing in bridge + * itself. + */ + if ((DB_CKSUMFLAGS(mp) & (HCK_TX_FLAGS | HW_LSO_FLAGS)) != 0) { + mac_hw_emul(&mp, NULL, NULL, MAC_ALL_EMULS); + } + mp = mac_bridge_tx_cb(mh, rh, mp); mac_bridge_ref_cb(mh, B_FALSE); } @@ -8804,3 +8829,52 @@ mac_led_set(mac_handle_t mh, mac_led_mode_t desired) return (ret); } + +/* + * Send packets through the Tx ring ('mrh') or through the default + * handler if no ring is specified. 
Before passing the packet down to + * the MAC provider, emulate any hardware offloads which have been + * requested but are not supported by the provider. + */ +mblk_t * +mac_ring_tx(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (mrh == NULL) + mrh = mip->mi_default_tx_ring; + + if (mrh == NULL) + return (mip->mi_tx(mip->mi_driver, mp)); + else + return (mac_hwring_tx(mrh, mp)); +} + +/* + * This is the final stop before reaching the underlying MAC provider. + * This is also where the bridging hook is inserted. Packets that are + * bridged will return through mac_bridge_tx(), with rh nulled out if + * the bridge chooses to send output on a different link due to + * forwarding. + */ +mblk_t * +mac_provider_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp, + mac_client_impl_t *mcip) +{ + /* + * If there is a bound Hybrid I/O share, send packets through + * the default tx ring. When there's a bound Hybrid I/O share, + * the tx rings of this client are mapped in the guest domain + * and not accessible from here. + */ + if (mcip->mci_state_flags & MCIS_SHARE_BOUND) + rh = mip->mi_default_tx_ring; + + if (mip->mi_promisc_list != NULL) + mac_promisc_dispatch(mip, mp, mcip, B_FALSE); + + if (mip->mi_bridge_link == NULL) + return (mac_ring_tx((mac_handle_t)mip, rh, mp)); + else + return (mac_bridge_tx(mip, rh, mp)); +} diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c index 1ff33c3578..5302b89196 100644 --- a/usr/src/uts/common/io/mac/mac_bcast.c +++ b/usr/src/uts/common/io/mac/mac_bcast.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
*/ #include @@ -146,7 +147,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) uint64_t gen; uint_t i; mblk_t *mp_chain1; - flow_entry_t *flent; + flow_entry_t *flent; int err; rw_enter(&mip->mi_rw_lock, RW_READER); @@ -182,13 +183,6 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) */ if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL) break; - /* - * Fix the checksum for packets originating - * from the local machine. - */ - if ((src_mcip != NULL) && - (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL) - break; FLOW_TRY_REFHOLD(flent, err); if (err != 0) { @@ -246,7 +240,8 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) MCIP_STAT_UPDATE(src_mcip, brdcstxmt, 1); MCIP_STAT_UPDATE(src_mcip, brdcstxmtbytes, msgdsize(mp_chain)); - MAC_TX(mip, mip->mi_default_tx_ring, mp_chain, src_mcip); + mp_chain = mac_provider_tx(mip, mip->mi_default_tx_ring, + mp_chain, src_mcip); if (mp_chain != NULL) freemsgchain(mp_chain); } else { diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index 7ff05f2ab6..605cb51bf7 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -115,6 +115,7 @@ #include #include #include +#include #include #include #include @@ -1357,7 +1358,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_mip = mip; mcip->mci_upper_mip = NULL; - mcip->mci_rx_fn = mac_pkt_drop; + mcip->mci_rx_fn = mac_rx_def; mcip->mci_rx_arg = NULL; mcip->mci_rx_p_fn = NULL; mcip->mci_rx_p_arg = NULL; @@ -1629,7 +1630,7 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) void mac_rx_clear(mac_client_handle_t mch) { - mac_rx_set(mch, mac_pkt_drop, NULL); + mac_rx_set(mch, mac_rx_def, NULL); } void @@ -1641,7 +1642,7 @@ mac_rx_barrier(mac_client_handle_t mch) i_mac_perim_enter(mip); /* If a RX callback is set, quiesce and restart that datapath */ - if 
(mcip->mci_rx_fn != mac_pkt_drop) { + if (mcip->mci_rx_fn != mac_rx_def) { mac_rx_client_quiesce(mch); mac_rx_client_restart(mch); } @@ -2998,7 +2999,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip, mac_misc_stat_delete(flent); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; @@ -3578,7 +3579,9 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, srs_tx = &srs->srs_tx; if (srs_tx->st_mode == SRS_TX_DEFAULT && (srs->srs_state & SRS_ENQUEUED) == 0 && - mip->mi_nactiveclients == 1 && mp_chain->b_next == NULL) { + mip->mi_nactiveclients == 1 && + mp_chain->b_next == NULL && + (DB_CKSUMFLAGS(mp_chain) & HW_LSO) == 0) { uint64_t obytes; /* @@ -3613,7 +3616,9 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) : msgdsize(mp_chain)); - MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip); + mp_chain = mac_provider_tx(mip, srs_tx->st_arg2, mp_chain, + mcip); + if (mp_chain == NULL) { cookie = 0; SRS_TX_STAT_UPDATE(srs, opackets, 1); @@ -3625,7 +3630,74 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, mutex_exit(&srs->srs_lock); } } else { - cookie = srs_tx->st_func(srs, mp_chain, hint, flag, ret_mp); + mblk_t *mp = mp_chain; + mblk_t *new_head = NULL; + mblk_t *new_tail = NULL; + + /* + * There are occasions where the packets arriving here + * may request hardware offloads that are not + * available from the underlying MAC provider. This + * currently only happens when a packet is sent across + * the MAC-loopback path of one MAC and then forwarded + * (via IP) to another MAC that lacks one or more of + * the hardware offloads provided by the first one. + * However, in the future, we may choose to pretend + * all MAC providers support all offloads, performing + * emulation on Tx as needed. 
+ * + * We iterate each mblk in-turn, emulating hardware + * offloads as required. From this process, we create + * a new chain. The new chain may be the same as the + * original chain (no hardware emulation needed), a + * collection of new mblks (hardware emulation + * needed), or a mix. At this point, the chain is safe + * for consumption by the underlying MAC provider and + * is passed down to the SRS. + */ + while (mp != NULL) { + mblk_t *next = mp->b_next; + mblk_t *tail = NULL; + const uint16_t needed = + (DB_CKSUMFLAGS(mp) ^ mip->mi_tx_cksum_flags) & + DB_CKSUMFLAGS(mp); + + mp->b_next = NULL; + + if ((needed & (HCK_TX_FLAGS | HW_LSO_FLAGS)) != 0) { + mac_emul_t emul = 0; + + if (needed & HCK_IPV4_HDRCKSUM) + emul |= MAC_IPCKSUM_EMUL; + if (needed & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) + emul |= MAC_HWCKSUM_EMUL; + if (needed & HW_LSO) + emul = MAC_LSO_EMUL; + + mac_hw_emul(&mp, &tail, NULL, emul); + + if (mp == NULL) { + mp = next; + continue; + } + } + + if (new_head == NULL) { + new_head = mp; + } else { + new_tail->b_next = mp; + } + + new_tail = (tail == NULL) ? mp : tail; + mp = next; + } + + if (new_head == NULL) { + cookie = 0; + goto done; + } + + cookie = srs_tx->st_func(srs, new_head, hint, flag, ret_mp); } done: @@ -4026,14 +4098,15 @@ mac_client_get_effective_resources(mac_client_handle_t mch, * The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched * after classification by mac_rx_deliver(). 
*/ - static void mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, - boolean_t loopback) + boolean_t loopback, boolean_t local) { - mblk_t *mp_copy, *mp_next; + mblk_t *mp_next; if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) { + mblk_t *mp_copy; + mp_copy = copymsg(mp); if (mp_copy == NULL) return; @@ -4043,16 +4116,24 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, if (mp_copy == NULL) return; } - mp_next = NULL; - } else { - mp_copy = mp; - mp_next = mp->b_next; + + /* + * There is code upstack that can't deal with message + * chains. + */ + for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) { + mp_next = tmp->b_next; + tmp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback); + } + + return; } - mp_copy->b_next = NULL; - mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback); - if (mp_copy == mp) - mp->b_next = mp_next; + mp_next = mp->b_next; + mp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback); + mp->b_next = mp_next; } /* @@ -4094,7 +4175,7 @@ mac_is_mcast(mac_impl_t *mip, mblk_t *mp) */ void mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, - mac_client_impl_t *sender) + mac_client_impl_t *sender, boolean_t local) { mac_promisc_impl_t *mpip; mac_cb_t *mcb; @@ -4134,8 +4215,10 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, if (is_sender || mpip->mpi_type == MAC_CLIENT_PROMISC_ALL || - is_mcast) - mac_promisc_dispatch_one(mpip, mp, is_sender); + is_mcast) { + mac_promisc_dispatch_one(mpip, mp, is_sender, + local); + } } } MAC_PROMISC_WALKER_DCR(mip); @@ -4164,7 +4247,8 @@ mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain) mpip = (mac_promisc_impl_t *)mcb->mcb_objp; if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED && !is_mcast) { - mac_promisc_dispatch_one(mpip, mp, B_FALSE); + mac_promisc_dispatch_one(mpip, mp, B_FALSE, + B_FALSE); } } } @@ -4278,8 +4362,9 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) mac_impl_t *mip = (mac_impl_t 
*)mh; /* - * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM, - * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised. + * Some capabilities are restricted when there are more than one active + * clients on the MAC resource. The ones noted below are safe, + * independent of that count. */ if (mip->mi_nactiveclients > 1) { switch (cap) { @@ -4287,6 +4372,7 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) return (B_TRUE); case MAC_CAPAB_LEGACY: case MAC_CAPAB_HCKSUM: + case MAC_CAPAB_LSO: case MAC_CAPAB_NO_NATIVEVLAN: break; default: diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index e3b660c3b3..9a5f94e7d2 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -3476,7 +3476,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs) ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); + mac_drop_chain(mac_srs->srs_first, "SRS free"); mac_srs_ring_free(mac_srs); mac_srs_soft_rings_free(mac_srs); mac_srs_fanout_list_free(mac_srs); diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c index aa4985fe4c..62612122d6 100644 --- a/usr/src/uts/common/io/mac/mac_flow.c +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
*/ #include @@ -229,7 +230,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_index = -1; } (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index fbeef1fd2f..ce986fd4bf 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -115,6 +115,37 @@ mac_free(mac_register_t *mregp) kmem_free(mregp, sizeof (mac_register_t)); } +/* + * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS + * value. + */ +static uint16_t +mac_features_to_flags(mac_handle_t mh) +{ + uint16_t flags = 0; + uint32_t cap_sum = 0; + mac_capab_lso_t cap_lso; + + if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) { + if (cap_sum & HCKSUM_IPHDRCKSUM) + flags |= HCK_IPV4_HDRCKSUM; + + if (cap_sum & HCKSUM_INET_PARTIAL) + flags |= HCK_PARTIALCKSUM; + else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) + flags |= HCK_FULLCKSUM; + } + + /* + * We don't need the information stored in 'cap_lso', but we + * need to pass a non-NULL pointer to appease the driver. + */ + if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso)) + flags |= HW_LSO; + + return (flags); +} + /* * mac_register() is how drivers register new MACs with the GLDv3 * framework. 
The mregp argument is allocated by drivers using the @@ -345,9 +376,13 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp) mip, 0, &p0, TS_RUN, minclsyspri); /* - * Initialize the capabilities + * Cache the DB_CKSUMFLAGS that this MAC supports. */ + mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip); + /* + * Initialize the capabilities + */ bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t)); bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t)); @@ -689,7 +724,7 @@ mac_trill_snoop(mac_handle_t mh, mblk_t *mp) mac_impl_t *mip = (mac_impl_t *)mh; if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp, NULL); + mac_promisc_dispatch(mip, mp, NULL, B_FALSE); } /* @@ -709,7 +744,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) * this MAC, pass them a copy if appropriate. */ if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp_chain, NULL); + mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE); if (mr != NULL) { /* @@ -969,12 +1004,33 @@ mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize) } /* - * Invoked by driver as well as the framework to notify its capability change. + * The mac provider or mac frameowrk calls this function when it wants + * to notify upstream consumers that the capabilities have changed and + * that they should modify their own internal state accordingly. + * + * We currently have no regard for the fact that a provider could + * decide to drop capabilities which would invalidate pending traffic. + * For example, if one was to disable the Tx checksum offload while + * TCP/IP traffic was being sent by mac clients relying on that + * feature, then those packets would hit the write with missing or + * partial checksums. A proper solution involves not only providing + * notfication, but also performing client quiescing. 
That is, a capab + * change should be treated as an atomic transaction that forms a + * barrier between traffic relying on the current capabs and traffic + * relying on the new capabs. In practice, simnet is currently the + * only provider that could hit this, and it's an easily avoidable + * situation (and at worst it should only lead to some dropped + * packets). But if we ever want better on-the-fly capab change to + * actual hardware providers, then we should give this update + * mechanism a proper implementation. */ void mac_capab_update(mac_handle_t mh) { - /* Send MAC_NOTE_CAPAB_CHG notification */ + /* + * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream + * clients to renegotiate capabilities. + */ i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG); } @@ -1276,6 +1332,19 @@ i_mac_notify_thread(void *arg) } } + /* + * Depending on which capabs have changed, the Tx + * checksum flags may also need to be updated. + */ + if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) { + mac_perim_handle_t mph; + mac_handle_t mh = (mac_handle_t)mip; + + mac_perim_enter_by_mh(mh, &mph); + mip->mi_tx_cksum_flags = mac_features_to_flags(mh); + mac_perim_exit(mph); + } + /* * Do notification callbacks for each notification type. */ @@ -1542,15 +1611,22 @@ mac_hcksum_clone(const mblk_t *src, mblk_t *dst) ASSERT3U(DB_TYPE(dst), ==, M_DATA); /* - * Do these assignments unconditionally, rather than only when flags is - * non-zero. This protects a situation where zeroed hcksum data does - * not make the jump onto an mblk_t with stale data in those fields. + * Do these assignments unconditionally, rather than only when + * flags is non-zero. This protects a situation where zeroed + * hcksum data does not make the jump onto an mblk_t with + * stale data in those fields. It's important to copy all + * possible flags (HCK_* as well as HW_*) and not just the + * checksum specific flags. Dropping flags during a clone + * could result in dropped packets. 
If the caller has good + * reason to drop those flags then it should do it manually, + * after the clone. */ - DB_CKSUMFLAGS(dst) = (DB_CKSUMFLAGS(src) & HCK_FLAGS); + DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src); DB_CKSUMSTART(dst) = DB_CKSUMSTART(src); DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src); DB_CKSUMEND(dst) = DB_CKSUMEND(src); DB_CKSUM16(dst) = DB_CKSUM16(src); + DB_LSOMSS(dst) = DB_LSOMSS(src); } void diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index cbd5ce1e19..5b3e87dfd1 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -968,6 +968,7 @@ #include #include +#include #include #include #include @@ -1327,7 +1328,7 @@ int mac_srs_worker_wakeup_ticks = 0; * b_prev may be set to the fanout hint \ * hence can't use freemsg directly \ */ \ - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ + mac_drop_chain(mp_chain, "SRS Tx max queue"); \ DTRACE_PROBE1(tx_queued_hiwat, \ mac_soft_ring_set_t *, srs); \ enqueue = 0; \ @@ -1346,11 +1347,11 @@ int mac_srs_worker_wakeup_ticks = 0; if (!(srs->srs_type & SRST_TX)) \ mutex_exit(&srs->srs_bw->mac_bw_lock); -#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ - mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ +#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \ + mac_drop_chain((chain), (s)); \ /* increment freed stats */ \ - mac_srs->srs_tx.st_stat.mts_sdrops++; \ - cookie = (mac_tx_cookie_t)srs; \ + (srs)->srs_tx.st_stat.mts_sdrops++; \ + (cookie) = (mac_tx_cookie_t)(srs); \ } #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ @@ -2321,7 +2322,7 @@ check_again: if (smcip->mci_mip->mi_promisc_list != NULL) { mutex_exit(lock); mac_promisc_dispatch(smcip->mci_mip, - head, NULL); + head, NULL, B_FALSE); mutex_enter(lock); } } @@ -2893,7 +2894,7 @@ again: mac_srs->srs_bw->mac_bw_sz -= sz; mac_srs->srs_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_srs->srs_bw->mac_bw_lock); - mac_pkt_drop(NULL, NULL, head, B_FALSE); + 
mac_drop_chain(head, "Rx no bandwidth"); goto leave_poll; } else { mutex_exit(&mac_srs->srs_bw->mac_bw_lock); @@ -3275,9 +3276,10 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, } /* - * mac_rx_srs_process - * - * Receive side routine called from the interrupt path. + * MAC SRS receive side routine. If the data is coming from the + * network (i.e. from a NIC) then this is called in interrupt context. + * If the data is coming from a local sender (e.g. mac_tx_send() or + * bridge_forward()) then this is not called in interrupt context. * * loopback is set to force a context switch on the loopback * path between MAC clients. @@ -3337,7 +3339,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, mac_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_bw->mac_bw_lock); mutex_exit(&mac_srs->srs_lock); - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Rx no bandwidth"); return; } else { if ((mac_bw->mac_bw_sz + sz) <= @@ -3459,7 +3461,8 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); if (flag & MAC_DROP_ON_NO_DESC) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no desc"); } else { if (mac_srs->srs_first != NULL) wakeup_worker = B_FALSE; @@ -3522,7 +3525,8 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx SRS hiwat"); } else { MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, cnt, sz); @@ -3895,7 +3899,8 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, cookie = (mac_tx_cookie_t)mac_srs; *ret_mp = mp_chain; } else { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + 
MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no bandwidth"); } mutex_exit(&mac_srs->srs_lock); return (cookie); @@ -4342,7 +4347,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, msgdsize(mp)); CHECK_VID_AND_ADD_TAG(mp); - MAC_TX(mip, ring, mp, src_mcip); + mp = mac_provider_tx(mip, ring, mp, src_mcip); /* * If the driver is out of descriptors and does a @@ -4373,7 +4378,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, flow_entry_t *dst_flow_ent; void *flow_cookie; size_t pkt_size; - mblk_t *mp1; next = mp->b_next; mp->b_next = NULL; @@ -4388,44 +4392,12 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, dst_flow_ent = mac_tx_classify(mip, mp); if (dst_flow_ent != NULL) { - size_t hdrsize; - int err = 0; - - if (mip->mi_info.mi_nativemedia == DL_ETHER) { - struct ether_vlan_header *evhp = - (struct ether_vlan_header *)mp->b_rptr; - - if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) - hdrsize = sizeof (*evhp); - else - hdrsize = sizeof (struct ether_header); - } else { - mac_header_info_t mhi; - - err = mac_header_info((mac_handle_t)mip, - mp, &mhi); - if (err == 0) - hdrsize = mhi.mhi_hdrsize; - } - /* * Got a matching flow. It's either another * MAC client, or a broadcast/multicast flow. - * Make sure the packet size is within the - * allowed size. If not drop the packet and - * move to next packet. */ - if (err != 0 || - (pkt_size - hdrsize) > mip->mi_sdu_max) { - oerrors++; - DTRACE_PROBE2(loopback__drop, size_t, pkt_size, - mblk_t *, mp); - freemsg(mp); - mp = next; - FLOW_REFRELE(dst_flow_ent); - continue; - } flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); + if (flow_cookie != NULL) { /* * The vnic_bcast_send function expects @@ -4443,6 +4415,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * bypass is set. 
*/ boolean_t do_switch; + mac_client_impl_t *dst_mcip = dst_flow_ent->fe_mcip; @@ -4458,19 +4431,23 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * check is done inside the MAC_TX() * macro. */ - if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp, src_mcip); + if (mip->mi_promisc_list != NULL) { + mac_promisc_dispatch(mip, mp, src_mcip, + B_TRUE); + } do_switch = ((src_mcip->mci_state_flags & dst_mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE) != 0); - if ((mp1 = mac_fix_cksum(mp)) != NULL) { + mac_hw_emul(&mp, NULL, NULL, MAC_ALL_EMULS); + if (mp != NULL) { (dst_flow_ent->fe_cb_fn)( dst_flow_ent->fe_cb_arg1, dst_flow_ent->fe_cb_arg2, - mp1, do_switch); + mp, do_switch); } + } FLOW_REFRELE(dst_flow_ent); } else { @@ -4478,7 +4455,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * Unknown destination, send via the underlying * NIC. */ - MAC_TX(mip, ring, mp, src_mcip); + mp = mac_provider_tx(mip, ring, mp, src_mcip); if (mp != NULL) { /* * Adjust for the last packet that @@ -4827,7 +4804,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Tx softring no desc"); /* increment freed stats */ ringp->s_ring_drops += cnt; cookie = (mac_tx_cookie_t)ringp; @@ -4871,8 +4848,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, * b_prev may be set to the fanout hint * hence can't use freemsg directly */ - mac_pkt_drop(NULL, NULL, - mp_chain, B_FALSE); + mac_drop_chain(mp_chain, + "Tx softring max queue"); DTRACE_PROBE1(tx_queued_hiwat, mac_soft_ring_t *, ringp); enqueue = B_FALSE; diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index f4d2a5ee81..c8a16e6fd3 100644 --- 
a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring) ASSERT((softring->s_ring_state & (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) == (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE); + mac_drop_chain(softring->s_ring_first, "softring free"); softring->s_ring_tx_arg2 = NULL; mac_soft_ring_stat_delete(softring); mac_callback_free(softring->s_ring_notify_cb_list); diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index 924d018ad0..03da3a3504 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* @@ -48,6 +48,75 @@ #include #include #include +#include +#include +#include + +/* + * The next two functions are used for dropping packets or chains of + * packets, respectively. We could use one function for both but + * separating the use cases allows us to specify intent and prevent + * dropping more data than intended. + * + * The purpose of these functions is to aid the debugging effort, + * especially in production. Rather than use freemsg()/freemsgchain(), + * it's preferable to use these functions when dropping a packet in + * the MAC layer. These functions should only be used during + * unexpected conditions. That is, any time a packet is dropped + * outside of the regular, successful datapath. Consolidating all + * drops on these functions allows the user to trace one location and + * determine why the packet was dropped based on the msg. It also + * allows the user to inspect the packet before it is freed. Finally, + * it allows the user to avoid tracing freemsg()/freemsgchain() thus + * keeping the hot path running as efficiently as possible. 
+ * + * NOTE: At this time not all MAC drops are aggregated on these + * functions; but that is the plan. This comment should be erased once + * completed. + */ + +/*PRINTFLIKE2*/ +void +mac_drop_pkt(mblk_t *mp, const char *fmt, ...) +{ + va_list adx; + char msg[128]; + char *msgp = msg; + + ASSERT3P(mp->b_next, ==, NULL); + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); +} + +/*PRINTFLIKE2*/ +void +mac_drop_chain(mblk_t *chain, const char *fmt, ...) +{ + va_list adx; + char msg[128]; + char *msgp = msg; + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + /* + * We could use freemsgchain() for the actual freeing but + * since we are already walking the chain to fire the dtrace + * probe we might as well free the msg here too. + */ + for (mblk_t *mp = chain, *next; mp != NULL; ) { + next = mp->b_next; + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); + mp = next; + } +} /* * Copy an mblk, preserving its hardware checksum flags. @@ -89,222 +158,1272 @@ mac_copymsgchain_cksum(mblk_t *mp) } /* - * Process the specified mblk chain for proper handling of hardware - * checksum offload. This routine is invoked for loopback traffic - * between MAC clients. - * The function handles a NULL mblk chain passed as argument. + * Calculate the ULP checksum for IPv4. Return true if the calculation + * was successful, or false if an error occurred. If the later, place + * an error message into '*err'. */ -mblk_t * -mac_fix_cksum(mblk_t *mp_chain) +static boolean_t +mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha, + const char **err) { - mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; + const uint8_t proto = ipha->ipha_protocol; + size_t len; + const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha); + /* ULP offset from start of L2. 
*/ + const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz; + ipaddr_t src, dst; + uint32_t cksum; + uint16_t *up; + + /* + * We need a pointer to the ULP checksum. We're assuming the + * ULP checksum pointer resides in the first mblk. Our native + * TCP stack should always put the headers in the first mblk, + * but currently we have no way to guarantee that other + * clients don't spread headers (or even header fields) across + * mblks. + */ + switch (proto) { + case IPPROTO_TCP: + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) { + *err = "mblk doesn't contain TCP header"; + goto bail; + } + + up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz); + cksum = IP_TCP_CSUM_COMP; + break; + + case IPPROTO_UDP: + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) { + *err = "mblk doesn't contain UDP header"; + goto bail; + } + + up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz); + cksum = IP_UDP_CSUM_COMP; + break; + + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) { + *err = "mblk doesn't contain SCTP header"; + goto bail; + } + + sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset); + sctph->sh_chksum = 0; + sctph->sh_chksum = sctp_cksum(mp, ulp_offset); + return (B_TRUE); + } + + default: + *err = "unexpected protocol"; + goto bail; + + } + + /* Pseudo-header checksum. */ + src = ipha->ipha_src; + dst = ipha->ipha_dst; + len = ntohs(ipha->ipha_length) - ip_hdr_sz; + + cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); + cksum += htons(len); + + /* + * We have already accounted for the pseudo checksum above. + * Make sure the ULP checksum field is zero before computing + * the rest. + */ + *up = 0; + cksum = IP_CSUM(mp, ulp_offset, cksum); + *up = (uint16_t)(cksum ? 
cksum : ~cksum); + + return (B_TRUE); + +bail: + return (B_FALSE); +} + +/* + * Calculate the ULP checksum for IPv6. Return true if the calculation + * was successful, or false if an error occurred. If the latter, place + * an error message into '*err'. + */ +static boolean_t +mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err) +{ + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset); + const uint8_t proto = ip6h->ip6_nxt; + const uint16_t *iphs = (uint16_t *)ip6h; + /* ULP offset from start of L2. */ + uint32_t ulp_offset; + size_t len; + uint32_t cksum; + uint16_t *up; + uint16_t ip_hdr_sz; + + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) { + *err = "malformed IPv6 header"; + goto bail; + } + + ulp_offset = ip_hdr_offset + ip_hdr_sz; + + /* + * We need a pointer to the ULP checksum. We're assuming the + * ULP checksum pointer resides in the first mblk. Our native + * TCP stack should always put the headers in the first mblk, + * but currently we have no way to guarantee that other + * clients don't spread headers (or even header fields) across + * mblks.
+ */ + switch (proto) { + case IPPROTO_TCP: + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) { + *err = "mblk doesn't contain TCP header"; + goto bail; + } + + up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz); + cksum = IP_TCP_CSUM_COMP; + break; + + case IPPROTO_UDP: + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) { + *err = "mblk doesn't contain UDP header"; + goto bail; + } + + up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz); + cksum = IP_UDP_CSUM_COMP; + break; + + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) { + *err = "mblk doesn't contain SCTP header"; + goto bail; + } + + sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. + */ + sctph->sh_chksum = 0; + sctph->sh_chksum = sctp_cksum(mp, ulp_offset); + return (B_TRUE); + } + + default: + *err = "unexpected protocol"; + goto bail; + } + + /* + * The payload length includes the payload and the IPv6 + * extension headers; the idea is to subtract the extension + * header length to get the real payload length. + */ + len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN); + cksum += len; + + /* + * We accumulate the pseudo header checksum in cksum; then we + * call IP_CSUM to compute the checksum over the payload. + */ + cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] + + iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] + + iphs[16] + iphs[17] + iphs[18] + iphs[19]; + cksum = IP_CSUM(mp, ulp_offset, cksum); + + /* For UDP/IPv6 a zero UDP checksum is not allowed. 
Change to 0xffff */ + if (proto == IPPROTO_UDP && cksum == 0) + cksum = ~cksum; + + *up = (uint16_t)cksum; + + return (B_TRUE); + +bail: + return (B_FALSE); +} + +/* + * Perform software checksum on a single message, if needed. The + * emulation performed is determined by an intersection of the mblk's + * flags and the emul flags requested. The emul flags are documented + * in mac.h. + */ +static mblk_t * +mac_sw_cksum(mblk_t *mp, mac_emul_t emul) +{ + mblk_t *skipped_hdr = NULL; uint32_t flags, start, stuff, end, value; + uint32_t ip_hdr_offset; + uint16_t etype; + size_t ip_hdr_sz; + struct ether_header *ehp; + const char *err = ""; - for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { - uint16_t len; - uint32_t offset; - struct ether_header *ehp; - uint16_t sap; + /* + * This function should only be called from mac_hw_emul() + * which handles mblk chains and the shared ref case. + */ + ASSERT3P(mp->b_next, ==, NULL); - mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags); - if (flags == 0) - continue; + mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL); + + flags = DB_CKSUMFLAGS(mp); + + /* Why call this if checksum emulation isn't needed? */ + ASSERT3U(flags & (HCK_FLAGS), !=, 0); + + /* + * Ethernet, and optionally VLAN header. mac_hw_emul() has + * already verified we have enough data to read the L2 header. + */ + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) { + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + ip_hdr_offset = sizeof (struct ether_vlan_header); + } else { + etype = ntohs(ehp->ether_type); + ip_hdr_offset = sizeof (struct ether_header); + } + + /* + * If this packet isn't IP, then leave it alone. We don't want + * to affect non-IP traffic like ARP. Assume the IP header + * doesn't include any options, for now. 
We will use the + * correct size later after we know there are enough bytes to + * at least fill out the basic header. + */ + switch (etype) { + case ETHERTYPE_IP: + ip_hdr_sz = sizeof (ipha_t); + break; + case ETHERTYPE_IPV6: + ip_hdr_sz = sizeof (ip6_t); + break; + default: + return (mp); + } + + ASSERT3U(MBLKL(mp), >=, ip_hdr_offset); + + /* + * If the first mblk of this packet contains only the ethernet + * header, skip past it for now. Packets with their data + * contained in only a single mblk can then use the fastpaths + * tuned to that possibility. + */ + if (MBLKL(mp) == ip_hdr_offset) { + ip_hdr_offset -= MBLKL(mp); + /* This is guaranteed by mac_hw_emul(). */ + ASSERT3P(mp->b_cont, !=, NULL); + skipped_hdr = mp; + mp = mp->b_cont; + } + + /* + * Both full and partial checksum rely on finding the IP + * header in the current mblk. Our native TCP stack honors + * this assumption but it's prudent to guard our future + * clients that might not honor this contract. + */ + ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz); + if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) { + err = "mblk doesn't contain IP header"; + goto bail; + } + + /* + * We are about to modify the header mblk; make sure we are + * modifying our own copy. The code that follows assumes that + * the IP/ULP headers exist in this mblk (and drops the + * message if they don't). + */ + if (DB_REF(mp) > 1) { + mblk_t *tmp = copyb(mp); + + if (tmp == NULL) { + err = "copyb failed"; + goto bail; + } + + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + skipped_hdr->b_cont = tmp; + } + + tmp->b_cont = mp->b_cont; + freeb(mp); + mp = tmp; + } + + if (etype == ETHERTYPE_IP) { + ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset); + + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err)) + goto bail; + } + + /* We always update the ULP checksum flags. 
*/ + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_FULLCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } /* - * Since the processing of checksum offload for loopback - * traffic requires modification of the packet contents, - * ensure sure that we are always modifying our own copy. + * While unlikely, it's possible to write code that + * might end up calling mac_sw_cksum() twice on the + * same mblk (performing both LSO and checksum + * emulation in a single mblk chain loop -- the LSO + * emulation inserts a new chain into the existing + * chain and then the loop iterates back over the new + * segments and emulates the checksum a second time). + * Normally this wouldn't be a problem, because the + * HCK_*_OK flags are supposed to indicate that we + * don't need to perform the work. But + * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the + * same value; so we cannot use these flags to + * determine if the IP header checksum has already + * been calculated or not. For this reason, we zero + * out the checksum first. In the future, we + * should fix the HCK_* flags. */ - if (DB_REF(mp) > 1) { - mp1 = copymsg(mp); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; + if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); + flags &= ~HCK_IPV4_HDRCKSUM; + flags |= HCK_IPV4_HDRCKSUM_OK; + } + } else if (etype == ETHERTYPE_IPV6) { + /* There is no IP header checksum for IPv6. */ + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err)) + goto bail; + flags &= ~HCK_FULLCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } + } + + /* + * Partial checksum is the same for both IPv4 and IPv6.
+ */ + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + uint16_t *up, partial, cksum; + uchar_t *ipp; /* ptr to beginning of IP header */ + + ipp = mp->b_rptr + ip_hdr_offset; + up = (uint16_t *)((uchar_t *)ipp + stuff); + partial = *up; + *up = 0; + + ASSERT3S(end, >, start); + cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial); + *up = cksum != 0 ? cksum : ~cksum; + } + + /* We always update the ULP checksum flags. */ + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_PARTIALCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } + + mac_hcksum_set(mp, start, stuff, end, value, flags); + + /* Don't forget to reattach the header. */ + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); /* - * Ethernet, and optionally VLAN header. + * Duplicate the HCKSUM data into the header mblk. + * This mimics mac_add_vlan_tag which ensures that + * both the first mblk _and_ the first data bearing + * mblk possess the HCKSUM information. Consumers like + * IP will end up discarding the ether_header mblk, so + * for now, it is important that the data be available + * in both places. */ - /* LINTED: improper alignment cast */ - ehp = (struct ether_header *)mp->b_rptr; - if (ntohs(ehp->ether_type) == VLAN_TPID) { - struct ether_vlan_header *evhp; + mac_hcksum_clone(mp, skipped_hdr); + mp = skipped_hdr; + } - ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); - /* LINTED: improper alignment cast */ - evhp = (struct ether_vlan_header *)mp->b_rptr; - sap = ntohs(evhp->ether_type); - offset = sizeof (struct ether_vlan_header); + return (mp); + +bail: + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + mp = skipped_hdr; + } + + mac_drop_pkt(mp, err); + return (NULL); +} + +/* + * Build a single data segment from an LSO packet. The mblk chain + * returned, seg_head, represents the data segment and is always + * exactly seg_len bytes long. 
The lso_mp and offset input/output + * parameters track our position in the LSO packet. This function + * exists solely as a helper to mac_sw_lso(). + * + * Case A + * + * The current lso_mp is larger than the requested seg_len. The + * beginning of seg_head may start at the beginning of lso_mp or + * offset into it. In either case, a single mblk is returned, and + * *offset is updated to reflect our new position in the current + * lso_mp. + * + * +----------------------------+ + * | in *lso_mp / out *lso_mp | + * +----------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = 0 out *offset = seg_len + * + * |------ seg_len ----| + * + * + * +------------------------------+ + * | in *lso_mp / out *lso_mp | + * +------------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = N out *offset = N + seg_len + * + * |------ seg_len ----| + * + * + * + * Case B + * + * The requested seg_len consumes exactly the rest of the lso_mp. + * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr. + * The seg_head may start at the beginning of the lso_mp or at some + * offset into it. In either case we return a single mblk, reset + * *offset to zero, and walk to the next lso_mp. 
+ * + * +------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = 0 + * + * |------ seg_len ----| + * + * + * + * +----------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +----------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = N + * + * |------ seg_len ----| + * + * + * Case C + * + * The requested seg_len is greater than the current lso_mp. In + * this case we must consume LSO mblks until we have enough data to + * satisfy either case (A) or (B) above. We will return multiple + * mblks linked via b_cont, offset will be set based on the cases + * above, and lso_mp will walk forward at least one mblk, but maybe + * more. + * + * N.B. This digram is not exhaustive. The seg_head may start on + * the beginning of an lso_mp. The seg_tail may end exactly on the + * boundary of an lso_mp. And there may be two (in this case the + * middle block wouldn't exist), three, or more mblks in the + * seg_head chain. This is meant as one example of what might + * happen. The main thing to remember is that the seg_tail mblk + * must be one of case (A) or (B) above. 
+ * + * +------------------+ +----------------+ +------------------+ + * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp | + * +------------------+ +----------------+ +------------------+ + * ^ ^ ^ ^ ^ ^ + * | | | | | | + * | | | | | | + * | | | | | | + * | | | | | | + * +------------+ +----------------+ +------------+ + * | seg_head |--->| |--->| seg_tail | + * +------------+ +----------------+ +------------+ + * ^ ^ + * | | + * in *offset = N out *offset = MBLKL(seg_tail) + * + * |------------------- seg_len -------------------| + * + */ +static mblk_t * +build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len) +{ + mblk_t *seg_head, *seg_tail, *seg_mp; + + ASSERT3P(*lso_mp, !=, NULL); + ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr); + + seg_mp = dupb(*lso_mp); + if (seg_mp == NULL) + return (NULL); + + seg_head = seg_mp; + seg_tail = seg_mp; + + /* Continue where we left off from in the lso_mp. */ + seg_mp->b_rptr += *offset; + +last_mblk: + /* Case (A) */ + if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) { + *offset += seg_len; + seg_mp->b_wptr = seg_mp->b_rptr + seg_len; + return (seg_head); + } + + /* Case (B) */ + if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) { + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + return (seg_head); + } + + /* Case (C) */ + ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr); + + /* + * The current LSO mblk doesn't have enough data to satisfy + * seg_len -- continue peeling off LSO mblks to build the new + * segment message. If allocation fails we free the previously + * allocated segment mblks and return NULL. 
+ */ + while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) { + ASSERT3U(MBLKL(seg_mp), <=, seg_len); + seg_len -= MBLKL(seg_mp); + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + seg_mp = dupb(*lso_mp); + + if (seg_mp == NULL) { + freemsgchain(seg_head); + return (NULL); + } + + seg_tail->b_cont = seg_mp; + seg_tail = seg_mp; + } + + /* + * We've walked enough LSO mblks that we can now satisfy the + * remaining seg_len. At this point we need to jump back to + * determine if we have arrived at case (A) or (B). + */ + + /* Just to be paranoid that we didn't underflow. */ + ASSERT3U(seg_len, <, IP_MAXPACKET); + ASSERT3U(seg_len, >, 0); + goto last_mblk; +} + +/* + * Perform software segmentation of a single LSO message. Take an LSO + * message as input and return head/tail pointers as output. This + * function should not be invoked directly but instead through + * mac_hw_emul(). + * + * The resulting chain is comprised of multiple (nsegs) MSS sized + * segments. Each segment will consist of two or more mblks joined by + * b_cont: a header and one or more data mblks. The header mblk is + * allocated anew for each message. The first segment's header is used + * as a template for the rest with adjustments made for things such as + * ID, sequence, length, TCP flags, etc. The data mblks reference into + * the existing LSO mblk (passed in as omp) by way of dupb(). Their + * b_rptr/b_wptr values are adjusted to reference only the fraction of + * the LSO message they are responsible for. At the successful + * completion of this function the original mblk (omp) is freed, + * leaving the newly created segment chain as the only remaining + * reference to the data.
+ */ +static void +mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, + uint_t *count) +{ + uint32_t ocsum_flags, ocsum_start, ocsum_stuff; + uint32_t mss; + uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; + uint32_t oleft; + uint_t nsegs, seg; + int len; + + struct ether_vlan_header *oevh; + const ipha_t *oiph; + const tcph_t *otcph; + ipha_t *niph; + tcph_t *ntcph; + uint16_t ip_id; + uint32_t tcp_seq, tcp_sum, otcp_sum; + + uint32_t offset; + mblk_t *odatamp; + mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; + mblk_t *tmptail; + + ASSERT3P(head, !=, NULL); + ASSERT3P(tail, !=, NULL); + ASSERT3P(count, !=, NULL); + ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); + + /* Assume we are dealing with a single LSO message. */ + ASSERT3P(omp->b_next, ==, NULL); + + /* + * XXX: This is a hack to deal with mac_add_vlan_tag(). + * + * When VLANs are in play, mac_add_vlan_tag() creates a new + * mblk with just the ether_vlan_header and tacks it onto the + * front of 'omp'. This breaks the assumptions made below; + * namely that the TCP/IP headers are in the first mblk. In + * this case, since we already have to pay the cost of LSO + * emulation, we simply pull up everything. While this might + * seem irksome, keep in mind this will only apply in a couple + * of scenarios: a) an LSO-capable VLAN client sending to a + * non-LSO-capable client over the "MAC/bridge loopback" + * datapath or b) an LSO-capable VLAN client is sending to a + * client that, for whatever reason, doesn't have DLS-bypass + * enabled. Finally, we have to check for both a tagged and + * untagged sized mblk depending on if the mblk came via + * mac_promisc_dispatch() or mac_rx_deliver(). + * + * In the future, two things should be done: + * + * 1. This function should make use of some yet to be + * implemented "mblk helpers". These helper functions would + * perform all the b_cont walking for us and guarantee safe + * access to the mblk data. 
+ * + * 2. We should add some slop to the mblks so that + * mac_add_vlan_tag() can just edit the first mblk instead + * of allocating on the hot path. + */ + if (MBLKL(omp) == sizeof (struct ether_vlan_header) || + MBLKL(omp) == sizeof (struct ether_header)) { + mblk_t *tmp = msgpullup(omp, -1); + + if (tmp == NULL) { + mac_drop_pkt(omp, "failed to pull up"); + goto fail; + } + + mac_hcksum_clone(omp, tmp); + freemsg(omp); + omp = tmp; + } + + mss = DB_LSOMSS(omp); + ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + + sizeof (struct ether_vlan_header)); + opktlen = msgsize(omp); + + /* + * First, get references to the IP and TCP headers and + * determine the total TCP length (header + data). + * + * Thanks to mac_hw_emul() we know that the first mblk must + * contain (at minimum) the full L2 header. However, this + * function assumes more than that. It assumes the L2/L3/L4 + * headers are all contained in the first mblk of a message + * (i.e., no b_cont walking for headers). While this is a + * current reality (our native TCP stack and viona both + * enforce this) things may become more nuanced in the future + * (e.g. when introducing encap support or adding new + * clients). For now we guard against this case by dropping + * the packet. + */ + oevh = (struct ether_vlan_header *)omp->b_rptr; + if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) + oehlen = sizeof (struct ether_vlan_header); + else + oehlen = sizeof (struct ether_header); + + ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); + if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { + mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); + goto fail; + } + + oiph = (ipha_t *)(omp->b_rptr + oehlen); + oiphlen = IPH_HDR_LENGTH(oiph); + otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); + otcphlen = TCP_HDR_LENGTH(otcph); + + /* + * Currently we only support LSO for TCP/IPv4. 
+ */ + if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { + mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", + IPH_HDR_VERSION(oiph)); + goto fail; + } + + if (oiph->ipha_protocol != IPPROTO_TCP) { + mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", + oiph->ipha_protocol); + goto fail; + } + + if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { + mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); + goto fail; + } + + ohdrslen = oehlen + oiphlen + otcphlen; + if ((len = MBLKL(omp)) < ohdrslen) { + mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, + ohdrslen); + goto fail; + } + + /* + * Either we have data in the first mblk or it's just the + * header. In either case, we need to set rptr to the start of + * the TCP data. + */ + if (len > ohdrslen) { + odatamp = omp; + offset = ohdrslen; + } else { + ASSERT3U(len, ==, ohdrslen); + odatamp = omp->b_cont; + offset = 0; + } + + /* Make sure we still have enough data. */ + ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); + + /* + * If a MAC negotiated LSO then it must negotiate both + * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or + * HCKSUM_INET_PARTIAL; because both the IP and TCP headers + * change during LSO segmentation (only the 3 fields of the + * pseudo header checksum don't change: src, dst, proto). Thus + * we would expect these flags (HCK_IPV4_HDRCKSUM | + * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this + * function to emulate those checksums in software. However, + * that assumes a world where we only expose LSO if the + * underlying hardware exposes LSO. Moving forward the plan is + * to assume LSO in the upper layers and have MAC perform + * software LSO when the underlying provider doesn't support + * it. In such a world, if the provider doesn't support LSO + * but does support hardware checksum offload, then we could + * simply perform the segmentation and allow the hardware to + * calculate the checksums. To the hardware it's just another + * chain of non-LSO packets.
+ */ + ASSERT3S(DB_TYPE(omp), ==, M_DATA); + ocsum_flags = DB_CKSUMFLAGS(omp); + ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); + ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); + + /* + * If hardware only provides partial checksum then software + * must supply the pseudo-header checksum. In the case of LSO + * we leave the TCP length at zero to be filled in by + * hardware. This function must handle two scenarios. + * + * 1. Being called by a MAC client on the Rx path to segment + * an LSO packet and calculate the checksum. + * + * 2. Being called by a MAC provider to segment an LSO packet. + * In this case the LSO segmentation is performed in + * software (by this routine) but the MAC provider should + * still calculate the TCP/IP checksums in hardware. + * + * To elaborate on the second case: we cannot have the + * scenario where IP sends LSO packets but the underlying HW + * doesn't support checksum offload -- because in that case + * TCP/IP would calculate the checksum in software (for the + * LSO packet) but then MAC would segment the packet and have + * to redo all the checksum work. So IP should never do LSO + * if HW doesn't support both IP and TCP checksum. + */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + ocsum_start = (uint32_t)DB_CKSUMSTART(omp); + ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); + } + + odatalen = opktlen - ohdrslen; + + /* + * Subtract one to account for the case where the data length + * is evenly divisible by the MSS. Add one to account for the + * fact that the division will always result in one less + * segment than needed.
*/ + nsegs = ((odatalen - 1) / mss) + 1; + if (nsegs < 2) { + mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); + goto fail; + } + + DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, + __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, + nsegs); + + seg_chain = NULL; + tmptail = seg_chain; + oleft = odatalen; + + for (uint_t i = 0; i < nsegs; i++) { + boolean_t last_seg = ((i + 1) == nsegs); + uint32_t seg_len; + + /* + * If we fail to allocate, then drop the partially + * allocated chain as well as the LSO packet. Let the + * sender deal with the fallout. + */ + if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "failed to alloc segment header"); + goto fail; + } + ASSERT3P(nhdrmp->b_cont, ==, NULL); + + if (seg_chain == NULL) { + seg_chain = nhdrmp; } else { - sap = ntohs(ehp->ether_type); - offset = sizeof (struct ether_header); + ASSERT3P(tmptail, !=, NULL); + tmptail->b_next = nhdrmp; } - if (MBLKL(mp) <= offset) { - offset -= MBLKL(mp); - if (mp->b_cont == NULL) { - /* corrupted packet, skip it */ - if (prev != NULL) - prev->b_next = mp->b_next; - else - new_chain = mp->b_next; - mp1 = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - mp = mp1; - continue; - } - mp = mp->b_cont; + tmptail = nhdrmp; + + /* + * Calculate this segment's length. It's either the MSS + * or whatever remains for the last segment. + */ + seg_len = last_seg ? oleft : mss; + ASSERT3U(seg_len, <=, mss); + ndatamp = build_data_seg(&odatamp, &offset, seg_len); + + if (ndatamp == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "LSO failed to segment data"); + goto fail; } - if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { - ipha_t *ipha = NULL; + /* Attach data mblk to header mblk. */ + nhdrmp->b_cont = ndatamp; + DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; + ASSERT3U(seg_len, <=, oleft); + oleft -= seg_len; + } + + /* We should have consumed entire LSO msg.
*/ + ASSERT3S(oleft, ==, 0); + ASSERT3P(odatamp, ==, NULL); - /* - * In order to compute the full and header - * checksums, we need to find and parse - * the IP and/or ULP headers. - */ + /* + * All seg data mblks are referenced by the header mblks, null + * out this pointer to catch any bad derefs. + */ + ndatamp = NULL; + + /* + * Set headers and checksum for first segment. + */ + nhdrmp = seg_chain; + bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ip_id = ntohs(niph->ipha_ident); + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + tcp_seq = BE32_TO_U32(ntcph->th_seq); + tcp_seq += mss; + + /* + * The first segment shouldn't: + * + * o indicate end of data transmission (FIN), + * o indicate immediate handling of the data (PUSH). + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + + /* + * If the underlying HW provides partial checksum, then make + * sure to correct the pseudo header checksum before calling + * mac_sw_cksum(). The native TCP stack doesn't include the + * length field in the pseudo header when LSO is in play -- so + * we need to calculate it here. 
+ */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = BE16_TO_U16(ntcph->th_sum); + otcp_sum = tcp_sum; + tcp_sum += mss + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + + /* + * We may have freed the nhdrmp argument during + * checksum emulation, make sure that seg_chain + * references a valid mblk. + */ + seg_chain = nhdrmp; + } + + ASSERT3P(nhdrmp, !=, NULL); - sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + seg = 1; + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, + uint_t, seg); + seg++; + /* There better be at least 2 segs. */ + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + + /* + * Now adjust the headers of the middle segments. For each + * header we need to adjust the following. + * + * o IP ID + * o IP length + * o TCP sequence + * o TCP flags + * o cksum flags + * o cksum values (if MAC_HWCKSUM_EMUL is set) + */ + for (; seg < nsegs; seg++) { + /* + * We use seg_chain as a reference to the first seg + * header mblk -- this first header is a template for + * the rest of the segments. This copy will include + * the now updated checksum values from the first + * header. We must reset these checksum values to + * their original to make sure we produce the correct + * value. 
+ */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + tcp_seq += mss; + /* + * Just like the first segment, the middle segments + * shouldn't have these flags set. + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + + if (ocsum_flags & HCK_PARTIALCKSUM) { /* - * IP header. + * First and middle segs have same + * pseudo-header checksum. */ - if (sap != ETHERTYPE_IP) - continue; + U16_TO_BE16(tcp_sum, ntcph->th_sum); + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + } - ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); - /* LINTED: improper alignment cast */ - ipha = (ipha_t *)(mp->b_rptr + offset); - - if (flags & HCK_FULLCKSUM) { - ipaddr_t src, dst; - uint32_t cksum; - uint16_t *up; - uint8_t proto; - - /* - * Pointer to checksum field in ULP header. - */ - proto = ipha->ipha_protocol; - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - - switch (proto) { - case IPPROTO_TCP: - /* LINTED: improper alignment cast */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - case IPPROTO_UDP: - /* LINTED: improper alignment cast */ - up = IPH_UDPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - default: - cmn_err(CE_WARN, "mac_fix_cksum: " - "unexpected protocol: %d", proto); - continue; - } - - /* - * Pseudo-header checksum. 
- */ - src = ipha->ipha_src; - dst = ipha->ipha_dst; - len = ntohs(ipha->ipha_length) - - IP_SIMPLE_HDR_LENGTH; - - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - cksum += htons(len); - - /* - * The checksum value stored in the packet needs - * to be correct. Compute it here. - */ - *up = 0; - cksum += (((proto) == IPPROTO_UDP) ? - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); - cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + - offset, cksum); - *(up) = (uint16_t)(cksum ? cksum : ~cksum); - - /* - * Flag the packet so that it appears - * that the checksum has already been - * verified by the hardware. - */ - flags &= ~HCK_FULLCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; - } + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + /* We may have freed the original nhdrmp. */ + prev_nhdrmp->b_next = nhdrmp; + } - if (flags & HCK_IPV4_HDRCKSUM) { - ASSERT(ipha != NULL); - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - flags &= ~HCK_IPV4_HDRCKSUM; - flags |= HCK_IPV4_HDRCKSUM_OK; + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), + uint_t, mss, uint_t, seg); - } + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + } + + /* Make sure we are on the last segment. */ + ASSERT3U(seg, ==, nsegs); + ASSERT3P(nhdrmp->b_next, ==, NULL); + + /* + * Now we set the last segment header. The difference being + * that FIN/PSH/RST flags are allowed. 
+ */
+ bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ niph->ipha_ident = htons(++ip_id);
+ len = msgsize(nhdrmp->b_cont);
+ ASSERT3S(len, >, 0);
+ niph->ipha_length = htons(oiphlen + otcphlen + len);
+ niph->ipha_hdr_checksum = 0;
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ U32_TO_BE32(tcp_seq, ntcph->th_seq);
+
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ tcp_sum = otcp_sum;
+ tcp_sum += len + otcphlen;
+ tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ }
+
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ /* This should be the last mblk. */
+ ASSERT3P(nhdrmp->b_next, ==, NULL);
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ prev_nhdrmp->b_next = nhdrmp;
+ }
+
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
+ uint_t, seg);
+
+ /*
+ * Free the reference to the original LSO message as it is
+ * being replaced by seg_chain.
+ */
+ freemsg(omp);
+ *head = seg_chain;
+ *tail = nhdrmp;
+ *count = nsegs;
+ return;
+
+fail:
+ *head = NULL;
+ *tail = NULL;
+ *count = 0;
+}
+
+#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
+
+/*
+ * Emulate various hardware offload features in software. Take a chain
+ * of packets as input and emulate the hardware features specified in
+ * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
+ * pointer given as input, and its tail pointer is written to
+ * '*otail'. The number of packets in the new chain is written to
+ * '*ocount'. 
The 'otail' and 'ocount' arguments are optional and thus + * may be NULL. The 'mp_chain' argument may point to a NULL chain; in + * which case 'mp_chain' will simply stay a NULL chain. + * + * While unlikely, it is technically possible that this function could + * receive a non-NULL chain as input and return a NULL chain as output + * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be + * zero). This could happen if all the packets in the chain are + * dropped or if we fail to allocate new mblks. In this case, there is + * nothing for the caller to free. In any event, the caller shouldn't + * assume that '*mp_chain' is non-NULL on return. + * + * This function was written with three main use cases in mind. + * + * 1. To emulate hardware offloads when traveling mac-loopback (two + * clients on the same mac). This is wired up in mac_tx_send(). + * + * 2. To provide hardware offloads to the client when the underlying + * provider cannot. This is currently wired up in mac_tx() but we + * still only negotiate offloads when the underlying provider + * supports them. + * + * 3. To emulate real hardware in simnet. + */ +void +mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) +{ + mblk_t *head = NULL, *tail = NULL; + uint_t count = 0; + + ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); + ASSERT3P(mp_chain, !=, NULL); + + for (mblk_t *mp = *mp_chain; mp != NULL; ) { + mblk_t *tmp, *next, *tmphead, *tmptail; + struct ether_header *ehp; + uint32_t flags; + uint_t len = MBLKL(mp), l2len; + + /* Perform LSO/cksum one message at a time. */ + next = mp->b_next; + mp->b_next = NULL; + + /* + * For our sanity the first mblk should contain at + * least the full L2 header. 
+ */ + if (len < sizeof (struct ether_header)) { + mac_drop_pkt(mp, "packet too short (A): %u", len); + mp = next; + continue; } - if (flags & HCK_PARTIALCKSUM) { - uint16_t *up, partial, cksum; - uchar_t *ipp; /* ptr to beginning of IP header */ - - if (mp->b_cont != NULL) { - mblk_t *mp1; - - mp1 = msgpullup(mp, offset + end); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) + l2len = sizeof (struct ether_vlan_header); + else + l2len = sizeof (struct ether_header); - ipp = mp->b_rptr + offset; - /* LINTED: cast may result in improper alignment */ - up = (uint16_t *)((uchar_t *)ipp + stuff); - partial = *up; - *up = 0; + /* + * If the first mblk is solely the L2 header, then + * there better be more data. + */ + if (len < l2len || (len == l2len && mp->b_cont == NULL)) { + mac_drop_pkt(mp, "packet too short (C): %u", len); + mp = next; + continue; + } + + DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); + + /* + * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) + * because we don't want to mask-out the LSO flag. + */ + flags = DB_CKSUMFLAGS(mp); - cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, - end - start, partial); - cksum = ~cksum; - *up = cksum ? cksum : ~cksum; + if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { + uint_t tmpcount = 0; /* - * Since we already computed the whole checksum, - * indicate to the stack that it has already - * been verified by the hardware. + * LSO fix-up handles checksum emulation + * inline (if requested). It also frees mp. */ - flags &= ~HCK_PARTIALCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; + mac_sw_lso(mp, emul, &tmphead, &tmptail, + &tmpcount); + if (tmphead == NULL) { + /* mac_sw_lso() freed the mp. 
*/ + mp = next; + continue; + } + count += tmpcount; + } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { + tmp = mac_sw_cksum(mp, emul); + if (tmp == NULL) { + /* mac_sw_cksum() freed the mp. */ + mp = next; + continue; + } + tmphead = tmp; + tmptail = tmp; + count++; + } else { + /* There is nothing to emulate. */ + tmp = mp; + tmphead = tmp; + tmptail = tmp; + count++; + } + + /* + * The tmp mblk chain is either the start of the new + * chain or added to the tail of the new chain. + */ + if (head == NULL) { + head = tmphead; + tail = tmptail; + } else { + /* Attach the new mblk to the end of the new chain. */ + tail->b_next = tmphead; + tail = tmptail; } - mac_hcksum_set(mp, start, stuff, end, value, flags); + mp = next; } - return (new_chain); + *mp_chain = head; + + if (otail != NULL) + *otail = tail; + + if (ocount != NULL) + *ocount = count; } /* @@ -449,17 +1568,10 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain) */ /* ARGSUSED */ void -mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, +mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain, boolean_t loopback) { - mblk_t *mp1 = mp; - - while (mp1 != NULL) { - mp1->b_prev = NULL; - mp1->b_queue = NULL; - mp1 = mp1->b_next; - } - freemsgchain(mp); + freemsgchain(mp_chain); } /* -- cgit v1.2.3