author     Ryan Zezeski <rpz@joyent.com>    2018-09-21 08:48:14 -0600
committer  Ryan Zezeski <rpz@joyent.com>    2018-09-21 13:41:23 -0600
commit     6e965f095af82dd97c746e5670ab4c1620145bea (patch)
tree       42b61d39dabafd73cd14710f809850226b75ac30 /usr/src
parent     198b273b500c16488856bc36cabe112849dd6d3b (diff)
download   illumos-joyent-rpz-lso.tar.gz
OS-2340 vnics should support LSO
OS-6778 MAC loopback traffic should avoid cksum work
OS-6794 want LSO support in viona
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/inet/ip/ip_input.c             16
-rw-r--r--  usr/src/uts/common/inet/ip_impl.h                 23
-rw-r--r--  usr/src/uts/common/io/bridge.c                   192
-rw-r--r--  usr/src/uts/common/io/dls/dls_link.c              47
-rw-r--r--  usr/src/uts/common/io/mac/mac.c                    2
-rw-r--r--  usr/src/uts/common/io/mac/mac_bcast.c             10
-rw-r--r--  usr/src/uts/common/io/mac/mac_client.c            75
-rw-r--r--  usr/src/uts/common/io/mac/mac_datapath_setup.c     2
-rw-r--r--  usr/src/uts/common/io/mac/mac_flow.c               3
-rw-r--r--  usr/src/uts/common/io/mac/mac_provider.c          18
-rw-r--r--  usr/src/uts/common/io/mac/mac_sched.c             105
-rw-r--r--  usr/src/uts/common/io/mac/mac_soft_ring.c          2
-rw-r--r--  usr/src/uts/common/io/mac/mac_util.c             1323
-rw-r--r--  usr/src/uts/common/io/simnet/simnet.c              8
-rw-r--r--  usr/src/uts/common/io/vnic/vnic_dev.c             23
-rw-r--r--  usr/src/uts/common/sys/mac.h                      32
-rw-r--r--  usr/src/uts/common/sys/mac_client.h                2
-rw-r--r--  usr/src/uts/common/sys/mac_client_impl.h           3
-rw-r--r--  usr/src/uts/common/sys/mac_impl.h                 17
-rw-r--r--  usr/src/uts/common/sys/pattr.h                    20
-rw-r--r--  usr/src/uts/common/sys/vnic_impl.h                 3
-rw-r--r--  usr/src/uts/common/xen/io/xnb.c                    5
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona.c               220
23 files changed, 1717 insertions(+), 434 deletions(-)
diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c
index 88f80a926b..22c1c74391 100644
--- a/usr/src/uts/common/inet/ip/ip_input.c
+++ b/usr/src/uts/common/inet/ip/ip_input.c
@@ -57,6 +57,7 @@
#include <sys/vtrace.h>
#include <sys/isa_defs.h>
#include <sys/mac.h>
+#include <sys/mac_client.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
@@ -659,11 +660,13 @@ ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
}
/*
- * If there is a good HW IP header checksum we clear the need
+ * If the packet originated from a same-machine sender, or if
+ * there is a good HW IP header checksum, we clear the need to
* look at the IP header checksum.
*/
- if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
- ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
+ if ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) ||
+ ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
+ ILL_HCKSUM_CAPABLE(ill) && dohwcksum)) {
/* Header checksum was ok. Clear the flag */
DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
@@ -2256,12 +2259,13 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
* We apply this for all ULP protocols. Does the HW know to
* not set the flags for SCTP and other protocols.
*/
-
hck_flags = DB_CKSUMFLAGS(mp);
- if (hck_flags & HCK_FULLCKSUM_OK) {
+ if ((hck_flags & HCK_FULLCKSUM_OK) || (hck_flags & HW_LOCAL_MAC)) {
/*
- * Hardware has already verified the checksum.
+ * Either the hardware already verified the checksum,
+ * or the packet is from a same-machine sender, in
+ * which case we assume data integrity.
*/
return (B_TRUE);
}
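To see in one place the receive-side rule the two hunks above encode, here is a minimal sketch (not part of the patch). DB_CKSUMFLAGS(), HCK_FULLCKSUM_OK, and HW_LOCAL_MAC come from this change; cksum_already_trusted() and the header choices are assumptions made only for illustration.

	#include <sys/types.h>
	#include <sys/stream.h>
	#include <sys/strsubr.h>	/* DB_CKSUMFLAGS() */
	#include <sys/pattr.h>		/* HCK_* and HW_* offload flags */

	/*
	 * Hypothetical helper: skip software verification of the ULP
	 * checksum when hardware already vouched for it (HCK_FULLCKSUM_OK)
	 * or when the packet never left this machine (HW_LOCAL_MAC).
	 */
	static boolean_t
	cksum_already_trusted(mblk_t *mp)
	{
		uint32_t flags = DB_CKSUMFLAGS(mp);

		return ((flags & (HCK_FULLCKSUM_OK | HW_LOCAL_MAC)) != 0);
	}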
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index 2b37528eb9..fc90e6f217 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _INET_IP_IMPL_H
@@ -159,9 +160,27 @@ extern "C" {
#define ILL_DIRECT_CAPABLE(ill) \
(((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
-/* This macro is used by the mac layer */
+/*
+ * Determine if a mblk needs to take the "slow path", aka OTH
+ * softring. There are multiple reasons why a mblk might take the slow
+ * path.
+ *
+ * o The mblk is not a data message.
+ *
+ * o There is more than one outstanding reference to the mblk and it
+ * does not originate from a local MAC client. If the mblk does
+ * originate from a local MAC then allow it to pass through with
+ * more than one reference and leave the copying up to the consumer.
+ *
+ * o The IP header is not aligned (we assume alignment in the checksum
+ * routine).
+ *
+ * o The mblk doesn't contain enough data to populate a simple IP header.
+ */
#define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \
- (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \
+ (DB_TYPE(mp) != M_DATA || \
+ (DB_REF(mp) != 1 && ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) == 0)) || \
+ !OK_32PTR(ipha) || \
(((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr))
/*
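For orientation only, the fragment below (not part of the patch) shows the kind of Rx fanout decision the macro above drives. The function, both fanout helpers, and the header list are hypothetical; only the macro, ipha_t, and HW_LOCAL_MAC come from the source.

	#include <sys/stream.h>
	#include <sys/pattr.h>		/* HW_LOCAL_MAC */
	#include <inet/ip.h>		/* ipha_t */
	#include <inet/ip_impl.h>	/* MBLK_RX_FANOUT_SLOWPATH() */

	static void
	rx_fanout_one(mblk_t *mp, size_t mac_hdrsize)
	{
		/* The IP header starts after the MAC header. */
		ipha_t *ipha = (ipha_t *)(mp->b_rptr + mac_hdrsize);

		if (MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
			fanout_to_oth_ring(mp);		/* hypothetical */
		else
			fanout_by_ulp_hash(mp, ipha);	/* hypothetical */
	}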
diff --git a/usr/src/uts/common/io/bridge.c b/usr/src/uts/common/io/bridge.c
index 97ee9f1f0e..587de5c131 100644
--- a/usr/src/uts/common/io/bridge.c
+++ b/usr/src/uts/common/io/bridge.c
@@ -23,6 +23,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -41,6 +42,7 @@
#include <sys/modctl.h>
#include <sys/note.h>
#include <sys/param.h>
+#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/stat.h>
@@ -1692,7 +1694,8 @@ bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
* The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
*/
static mblk_t *
-reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
+reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid,
+ boolean_t keep_flags)
{
boolean_t source_has_tag = (tci != 0xFFFF);
mblk_t *mpcopy;
@@ -1704,8 +1707,13 @@ reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
if (mp == NULL)
return (mp);
- /* No forwarded packet can have hardware checksum enabled */
- DB_CKSUMFLAGS(mp) = 0;
+ /*
+ * A forwarded packet cannot have HW offloads enabled unless
+ * the destination is known to be local to the host and HW
+ * offloads haven't been emulated.
+ */
+ if (!keep_flags)
+ DB_CKSUMFLAGS(mp) = 0;
/* Get the no-modification cases out of the way first */
if (!source_has_tag && vlanid == pvid) /* 1a */
@@ -1906,17 +1914,46 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
blp->bl_trillthreads++;
mutex_exit(&blp->bl_trilllock);
update_header(mp, hdr_info, B_FALSE);
- if (is_xmit)
- mp = mac_fix_cksum(mp);
- /* all trill data frames have Inner.VLAN */
- mp = reform_vlan_header(mp, vlanid, tci, 0);
- if (mp == NULL) {
- KIINCR(bki_drops);
- fwd_unref(bfp);
- return (NULL);
+
+ if (is_xmit) {
+ mac_hw_emul(&mp, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
+
+ if (mp == NULL) {
+ KIINCR(bki_drops);
+ goto done;
+ }
}
- trill_encap_fn(tdp, blp, hdr_info, mp,
- bfp->bf_trill_nick);
+
+ while (mp != NULL) {
+ mblk_t *next = mp->b_next;
+
+ mp->b_next = NULL;
+
+ /*
+ * All trill data frames have
+ * Inner.VLAN.
+ */
+ mp = reform_vlan_header(mp, vlanid, tci,
+ 0, B_FALSE);
+
+ if (mp == NULL) {
+ /*
+ * Make sure to free
+ * any remaining
+ * segments.
+ */
+ freemsgchain(next);
+ KIINCR(bki_drops);
+ goto done;
+ }
+
+ trill_encap_fn(tdp, blp, hdr_info, mp,
+ bfp->bf_trill_nick);
+ mp = next;
+ }
+
+done:
mutex_enter(&blp->bl_trilllock);
if (--blp->bl_trillthreads == 0 &&
blp->bl_trilldata == NULL)
@@ -1958,31 +1995,68 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
mpsend = copymsg(mp);
}
- if (!from_trill && is_xmit)
- mpsend = mac_fix_cksum(mpsend);
+ /*
+ * If the destination is not local to the host
+ * then we need to emulate HW offloads because
+ * we can't guarantee the forwarding
+ * destination provides them.
+ */
+ if (!from_trill && is_xmit &&
+ !(bfp->bf_flags & BFF_LOCALADDR)) {
+ mac_hw_emul(&mpsend, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
- mpsend = reform_vlan_header(mpsend, vlanid, tci,
- blpsend->bl_pvid);
- if (mpsend == NULL) {
- KIINCR(bki_drops);
- continue;
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ continue;
+ }
+ }
+
+ /*
+ * The HW emulation above may have segmented
+ * an LSO mblk.
+ */
+ while ((mpsend != NULL) &&
+ !(bfp->bf_flags & BFF_LOCALADDR)) {
+ mblk_t *next = mpsend->b_next;
+
+ mpsend->b_next = NULL;
+ mpsend = reform_vlan_header(mpsend, vlanid, tci,
+ blpsend->bl_pvid, B_FALSE);
+
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ mpsend = next;
+ continue;
+ }
+
+ KIINCR(bki_forwards);
+ KLPINCR(blpsend, bkl_xmit);
+ MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
+ mpsend);
+ freemsg(mpsend);
+ mpsend = next;
}
- KIINCR(bki_forwards);
/*
* No need to bump up the link reference count, as
* the forwarding entry itself holds a reference to
* the link.
*/
if (bfp->bf_flags & BFF_LOCALADDR) {
+ mpsend = reform_vlan_header(mpsend, vlanid, tci,
+ blpsend->bl_pvid, B_TRUE);
+
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ continue;
+ }
+
+ KIINCR(bki_forwards);
mac_rx_common(blpsend->bl_mh, NULL, mpsend);
- } else {
- KLPINCR(blpsend, bkl_xmit);
- MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
- mpsend);
- freemsg(mpsend);
}
}
+
/*
* Handle a special case: if we're transmitting to the original
* link, then check whether the localaddr flag is set. If it
@@ -2018,7 +2092,7 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
* Inner.VLAN
*/
mpsend = reform_vlan_header(mpsend,
- vlanid, tci, 0);
+ vlanid, tci, 0, B_FALSE);
if (mpsend == NULL) {
KIINCR(bki_drops);
} else {
@@ -2069,25 +2143,57 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
mpsend = copymsg(mp);
}
- if (!from_trill && is_xmit)
- mpsend = mac_fix_cksum(mpsend);
+ /*
+ * In this case, send to all links connected
+ * to the bridge. Some of these destinations
+ * may not provide HW offload -- so just
+ * emulate it here.
+ */
+ if (!from_trill && is_xmit) {
+ mac_hw_emul(&mpsend, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
- mpsend = reform_vlan_header(mpsend, vlanid, tci,
- blpsend->bl_pvid);
- if (mpsend == NULL) {
- KIINCR(bki_drops);
- continue;
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ continue;
+ }
+ }
+
+ /*
+ * The HW emulation above may have segmented
+ * an LSO mblk.
+ */
+ while (mpsend != NULL) {
+ mblk_t *next = mpsend->b_next;
+
+ mpsend->b_next = NULL;
+ mpsend = reform_vlan_header(mpsend, vlanid, tci,
+ blpsend->bl_pvid, B_FALSE);
+
+ if (mpsend == NULL) {
+ KIINCR(bki_drops);
+ mpsend = next;
+ continue;
+ }
+
+ if (hdr_info->mhi_dsttype ==
+ MAC_ADDRTYPE_UNICAST)
+ KIINCR(bki_unknown);
+ else
+ KIINCR(bki_mbcast);
+
+ KLPINCR(blpsend, bkl_xmit);
+ if ((mpcopy = copymsg(mpsend)) != NULL) {
+ mac_rx_common(blpsend->bl_mh, NULL,
+ mpcopy);
+ }
+
+ MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
+ mpsend);
+ freemsg(mpsend);
+ mpsend = next;
}
- if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
- KIINCR(bki_unknown);
- else
- KIINCR(bki_mbcast);
- KLPINCR(blpsend, bkl_xmit);
- if ((mpcopy = copymsg(mpsend)) != NULL)
- mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
- MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend);
- freemsg(mpsend);
link_unref(blpsend);
}
}
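The forwarding hunks above all follow one shape: emulate any outstanding HW offloads with mac_hw_emul(), then handle each resulting segment on its own, because LSO emulation can turn a single mblk into a b_next chain. A condensed sketch of that shape follows (illustration only; forward_one() is a hypothetical stand-in for the per-segment work in bridge_forward(), and the header assumed to declare mac_hw_emul() is a guess).

	#include <sys/stream.h>
	#include <sys/mac.h>	/* MAC_HWCKSUM_EMUL, MAC_LSO_EMUL (assumed) */

	static void
	emulate_then_forward(mblk_t *mp)
	{
		/* May replace one LSO mblk with a chain of MSS-sized segments. */
		mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);

		while (mp != NULL) {
			mblk_t *next = mp->b_next;

			mp->b_next = NULL;
			forward_one(mp);	/* hypothetical per-segment work */
			mp = next;
		}
	}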
diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c
index 904cb47ba4..c792251052 100644
--- a/usr/src/uts/common/io/dls/dls_link.c
+++ b/usr/src/uts/common/io/dls/dls_link.c
@@ -30,6 +30,7 @@
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
+#include <sys/pattr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <sys/dld_impl.h>
@@ -162,6 +163,18 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip,
uint16_t cvid, cpri;
int err;
+ /*
+ * If this message is from a same-machine sender, then
+ * there may be HW checksum offloads to emulate.
+ */
+ if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
+ mblk_t *tmpnext = mp->b_next;
+
+ mp->b_next = NULL;
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ mp->b_next = tmpnext;
+ }
+
DLS_PREPARE_PKT(dlp->dl_mh, mp, &cmhi, err);
if (err != 0)
break;
@@ -356,6 +369,22 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
int err, rval;
/*
+ * The mac_hw_emul() function, by design, doesn't predicate on
+ * HW_LOCAL_MAC. But since we are in Rx context we know that
+ * any LSO packet must also be from a same-machine sender. We
+ * take advantage of that and forgo writing a manual loop to
+ * predicate on HW_LOCAL_MAC.
+ *
+ * But for checksum emulation we need to predicate on
+ * HW_LOCAL_MAC to avoid calling mac_hw_emul() on packets that
+ * don't need it (thanks to the fact that HCK_IPV4_HDRCKSUM
+ * and HCK_IPV4_HDRCKSUM_OK use the same value). Therefore we
+ * do the checksum emulation in the second loop and in
+ * subchain matching.
+ */
+ mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL);
+
+ /*
* Walk the packet chain.
*/
for (; mp != NULL; mp = nextp) {
@@ -364,6 +393,18 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
*/
accepted = B_FALSE;
+ /*
+ * If this message is from a same-machine sender, then
+ * there may be HW checksum offloads to emulate.
+ */
+ if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
+ mblk_t *tmpnext = mp->b_next;
+
+ mp->b_next = NULL;
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ mp->b_next = tmpnext;
+ }
+
DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
if (err != 0) {
atomic_inc_32(&(dlp->dl_unknowns));
@@ -566,7 +607,13 @@ dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
dls_head_t *dhp;
mod_hash_key_t key;
+ /*
+ * We expect to deal with only a single packet.
+ */
+ ASSERT3P(mp->b_next, ==, NULL);
+
DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
+
if (err != 0)
goto drop;
diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c
index 2176f7d2af..a63a6a5c61 100644
--- a/usr/src/uts/common/io/mac/mac.c
+++ b/usr/src/uts/common/io/mac/mac.c
@@ -1670,7 +1670,7 @@ mac_client_clear_flow_cb(mac_client_handle_t mch)
flow_entry_t *flent = mcip->mci_flent;
mutex_enter(&flent->fe_lock);
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
flent->fe_cb_arg1 = NULL;
flent->fe_cb_arg2 = NULL;
flent->fe_flags |= FE_MC_NO_DATAPATH;
diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c
index 1ff33c3578..3b674be1d0 100644
--- a/usr/src/uts/common/io/mac/mac_bcast.c
+++ b/usr/src/uts/common/io/mac/mac_bcast.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -146,7 +147,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
uint64_t gen;
uint_t i;
mblk_t *mp_chain1;
- flow_entry_t *flent;
+ flow_entry_t *flent;
int err;
rw_enter(&mip->mi_rw_lock, RW_READER);
@@ -182,13 +183,6 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
*/
if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL)
break;
- /*
- * Fix the checksum for packets originating
- * from the local machine.
- */
- if ((src_mcip != NULL) &&
- (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL)
- break;
FLOW_TRY_REFHOLD(flent, err);
if (err != 0) {
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
index da944d79d4..de5ef6121f 100644
--- a/usr/src/uts/common/io/mac/mac_client.c
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -114,6 +114,7 @@
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
+#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <sys/modhash.h>
#include <sys/mac_impl.h>
@@ -1356,7 +1357,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name,
mcip->mci_mip = mip;
mcip->mci_upper_mip = NULL;
- mcip->mci_rx_fn = mac_pkt_drop;
+ mcip->mci_rx_fn = mac_rx_def;
mcip->mci_rx_arg = NULL;
mcip->mci_rx_p_fn = NULL;
mcip->mci_rx_p_arg = NULL;
@@ -1628,7 +1629,7 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg)
void
mac_rx_clear(mac_client_handle_t mch)
{
- mac_rx_set(mch, mac_pkt_drop, NULL);
+ mac_rx_set(mch, mac_rx_def, NULL);
}
void
@@ -2969,7 +2970,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip,
mac_misc_stat_delete(flent);
/* Initialize the receiver function to a safe routine */
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
flent->fe_cb_arg1 = NULL;
flent->fe_cb_arg2 = NULL;
@@ -3590,6 +3591,13 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint,
obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) :
msgdsize(mp_chain));
+ /*
+ * There's a chance this primary client might be part
+ * of a bridge and the packet forwarded to a local
+ * receiver -- mark the packet accordingly.
+ */
+ DB_CKSUMFLAGS(mp_chain) |= HW_LOCAL_MAC;
+
MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip);
if (mp_chain == NULL) {
cookie = NULL;
@@ -4003,21 +4011,36 @@ mac_client_get_effective_resources(mac_client_handle_t mch,
* The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched
* after classification by mac_rx_deliver().
*/
-
static void
mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
- boolean_t loopback, boolean_t local)
+ boolean_t loopback)
{
- mblk_t *mp_copy, *mp_next;
+ mblk_t *mp_next;
+ boolean_t local = (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0;
if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag ||
(mpip->mpi_do_fixups && local)) {
+ mblk_t *mp_copy;
+
mp_copy = copymsg(mp);
if (mp_copy == NULL)
return;
+ /*
+ * The consumer has requested we emulate HW offloads
+ * for host-local packets.
+ */
if (mpip->mpi_do_fixups && local) {
- mp_copy = mac_fix_cksum(mp_copy);
+ /*
+ * Remember that copymsg() doesn't copy
+ * b_next, so we are only passing a single
+ * packet to mac_hw_emul(). Also keep in mind
+ * that mp_copy will become an mblk chain if
+ * the argument is an LSO message.
+ */
+ mac_hw_emul(&mp_copy, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
+
if (mp_copy == NULL)
return;
}
@@ -4027,16 +4050,24 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
if (mp_copy == NULL)
return;
}
- mp_next = NULL;
- } else {
- mp_copy = mp;
- mp_next = mp->b_next;
+
+ /*
+ * There is code upstack that can't deal with message
+ * chains.
+ */
+ for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) {
+ mp_next = tmp->b_next;
+ tmp->b_next = NULL;
+ mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback);
+ }
+
+ return;
}
- mp_copy->b_next = NULL;
- mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback);
- if (mp_copy == mp)
- mp->b_next = mp_next;
+ mp_next = mp->b_next;
+ mp->b_next = NULL;
+ mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback);
+ mp->b_next = mp_next;
}
/*
@@ -4078,7 +4109,7 @@ mac_is_mcast(mac_impl_t *mip, mblk_t *mp)
*/
void
mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
- mac_client_impl_t *sender, boolean_t local)
+ mac_client_impl_t *sender)
{
mac_promisc_impl_t *mpip;
mac_cb_t *mcb;
@@ -4119,8 +4150,7 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
if (is_sender ||
mpip->mpi_type == MAC_CLIENT_PROMISC_ALL ||
is_mcast) {
- mac_promisc_dispatch_one(mpip, mp, is_sender,
- local);
+ mac_promisc_dispatch_one(mpip, mp, is_sender);
}
}
}
@@ -4150,8 +4180,7 @@ mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain)
mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED &&
!is_mcast) {
- mac_promisc_dispatch_one(mpip, mp, B_FALSE,
- B_FALSE);
+ mac_promisc_dispatch_one(mpip, mp, B_FALSE);
}
}
}
@@ -4249,8 +4278,9 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
mac_impl_t *mip = (mac_impl_t *)mh;
/*
- * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM,
- * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised.
+ * Some capabilities are restricted when there is more than one active
+ * client on the MAC resource. The ones noted below are safe,
+ * independent of that count.
*/
if (mip->mi_nactiveclients > 1) {
switch (cap) {
@@ -4258,6 +4288,7 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
return (B_TRUE);
case MAC_CAPAB_LEGACY:
case MAC_CAPAB_HCKSUM:
+ case MAC_CAPAB_LSO:
case MAC_CAPAB_NO_NATIVEVLAN:
break;
default:
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index 6eea8b0343..70585df698 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -3496,7 +3496,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs)
ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE |
SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE));
- mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE);
+ mac_drop_chain(mac_srs->srs_first, "SRS free");
mac_srs_ring_free(mac_srs);
mac_srs_soft_rings_free(mac_srs);
mac_srs_fanout_list_free(mac_srs);
diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c
index aa4985fe4c..62612122d6 100644
--- a/usr/src/uts/common/io/mac/mac_flow.c
+++ b/usr/src/uts/common/io/mac/mac_flow.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/strsun.h>
@@ -229,7 +230,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
/* Initialize the receiver function to a safe routine */
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
flent->fe_index = -1;
}
(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
index d739fad87a..607ced864d 100644
--- a/usr/src/uts/common/io/mac/mac_provider.c
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -688,7 +688,7 @@ mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
mac_impl_t *mip = (mac_impl_t *)mh;
if (mip->mi_promisc_list != NULL)
- mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
+ mac_promisc_dispatch(mip, mp, NULL);
}
/*
@@ -708,7 +708,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
* this MAC, pass them a copy if appropriate.
*/
if (mip->mi_promisc_list != NULL)
- mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);
+ mac_promisc_dispatch(mip, mp_chain, NULL);
if (mr != NULL) {
/*
@@ -1541,11 +1541,17 @@ mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
ASSERT3U(DB_TYPE(dst), ==, M_DATA);
/*
- * Do these assignments unconditionally, rather than only when flags is
- * non-zero. This protects a situation where zeroed hcksum data does
- * not make the jump onto an mblk_t with stale data in those fields.
+ * Do these assignments unconditionally, rather than only when
+ * flags is non-zero. This protects a situation where zeroed
+ * hcksum data does not make the jump onto an mblk_t with
+ * stale data in those fields. It's important to copy all
+ * possible flags (HCK_* as well as HW_*) and not just the
+ * checksum specific flags. Dropping flags during a clone
+ * could result in dropped packets. If the caller has good
+ * reason to drop those flags then it should do it manually,
+ * after the clone.
*/
- DB_CKSUMFLAGS(dst) = (DB_CKSUMFLAGS(src) & HCK_FLAGS);
+ DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
DB_CKSUMEND(dst) = DB_CKSUMEND(src);
diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c
index 59d59287b4..e42cbd1320 100644
--- a/usr/src/uts/common/io/mac/mac_sched.c
+++ b/usr/src/uts/common/io/mac/mac_sched.c
@@ -968,6 +968,7 @@
#include <sys/types.h>
#include <sys/callb.h>
+#include <sys/pattr.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
@@ -1327,7 +1328,7 @@ int mac_srs_worker_wakeup_ticks = 0;
* b_prev may be set to the fanout hint \
* hence can't use freemsg directly \
*/ \
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \
+ mac_drop_chain(mp_chain, "SRS Tx max queue"); \
DTRACE_PROBE1(tx_queued_hiwat, \
mac_soft_ring_set_t *, srs); \
enqueue = 0; \
@@ -1346,11 +1347,11 @@ int mac_srs_worker_wakeup_ticks = 0;
if (!(srs->srs_type & SRST_TX)) \
mutex_exit(&srs->srs_bw->mac_bw_lock);
-#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
- mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
+#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \
+ mac_drop_pkt((chain), (s)); \
/* increment freed stats */ \
- mac_srs->srs_tx.st_stat.mts_sdrops++; \
- cookie = (mac_tx_cookie_t)srs; \
+ (srs)->srs_tx.st_stat.mts_sdrops++; \
+ (cookie) = (mac_tx_cookie_t)(srs); \
}
#define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
@@ -2321,7 +2322,7 @@ check_again:
if (smcip->mci_mip->mi_promisc_list != NULL) {
mutex_exit(lock);
mac_promisc_dispatch(smcip->mci_mip,
- head, NULL, B_FALSE);
+ head, NULL);
mutex_enter(lock);
}
}
@@ -2893,7 +2894,7 @@ again:
mac_srs->srs_bw->mac_bw_sz -= sz;
mac_srs->srs_bw->mac_bw_drop_bytes += sz;
mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
- mac_pkt_drop(NULL, NULL, head, B_FALSE);
+ mac_drop_chain(head, "Rx no bandwidth");
goto leave_poll;
} else {
mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
@@ -3275,9 +3276,10 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
}
/*
- * mac_rx_srs_process
- *
- * Receive side routine called from the interrupt path.
+ * MAC SRS receive side routine. If the data is coming from the
+ * network (i.e. from a NIC) then this is called in interrupt context.
+ * If the data is coming from a local sender (e.g. mac_tx_send() or
+ * bridge_forward()) then this is not called in interrupt context.
*
* loopback is set to force a context switch on the loopback
* path between MAC clients.
@@ -3337,7 +3339,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
mac_bw->mac_bw_drop_bytes += sz;
mutex_exit(&mac_bw->mac_bw_lock);
mutex_exit(&mac_srs->srs_lock);
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain, "Rx no bandwidth");
return;
} else {
if ((mac_bw->mac_bw_sz + sz) <=
@@ -3459,7 +3461,8 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
if (flag & MAC_DROP_ON_NO_DESC) {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx no desc");
} else {
if (mac_srs->srs_first != NULL)
wakeup_worker = B_FALSE;
@@ -3522,7 +3525,8 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
if (flag & MAC_DROP_ON_NO_DESC) {
if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx SRS hiwat");
} else {
MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
mp_chain, tail, cnt, sz);
@@ -3895,7 +3899,8 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
cookie = (mac_tx_cookie_t)mac_srs;
*ret_mp = mp_chain;
} else {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx no bandwidth");
}
mutex_exit(&mac_srs->srs_lock);
return (cookie);
@@ -4341,6 +4346,14 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
obytes += (mp->b_cont == NULL ? MBLKL(mp) :
msgdsize(mp));
+ /*
+ * Mark all packets as local so that a
+ * receiver can determine if a packet arrived
+ * from a local source or from the network.
+ * This allows some consumers to avoid
+ * unnecessary work like checksum computation.
+ */
+ DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC;
CHECK_VID_AND_ADD_TAG(mp);
MAC_TX(mip, ring, mp, src_mcip);
@@ -4373,7 +4386,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
flow_entry_t *dst_flow_ent;
void *flow_cookie;
size_t pkt_size;
- mblk_t *mp1;
next = mp->b_next;
mp->b_next = NULL;
@@ -4383,49 +4395,25 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
CHECK_VID_AND_ADD_TAG(mp);
/*
+ * Mark all packets as local so that a receiver can
+ * determine if a packet arrived from a local source
+ * or from the network. This allows some consumers to
+ * avoid unnecessary work like checksum computation.
+ */
+ DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC;
+
+ /*
* Find the destination.
*/
dst_flow_ent = mac_tx_classify(mip, mp);
if (dst_flow_ent != NULL) {
- size_t hdrsize;
- int err = 0;
-
- if (mip->mi_info.mi_nativemedia == DL_ETHER) {
- struct ether_vlan_header *evhp =
- (struct ether_vlan_header *)mp->b_rptr;
-
- if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
- hdrsize = sizeof (*evhp);
- else
- hdrsize = sizeof (struct ether_header);
- } else {
- mac_header_info_t mhi;
-
- err = mac_header_info((mac_handle_t)mip,
- mp, &mhi);
- if (err == 0)
- hdrsize = mhi.mhi_hdrsize;
- }
-
/*
* Got a matching flow. It's either another
* MAC client, or a broadcast/multicast flow.
- * Make sure the packet size is within the
- * allowed size. If not drop the packet and
- * move to next packet.
*/
- if (err != 0 ||
- (pkt_size - hdrsize) > mip->mi_sdu_max) {
- oerrors++;
- DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
- mblk_t *, mp);
- freemsg(mp);
- mp = next;
- FLOW_REFRELE(dst_flow_ent);
- continue;
- }
flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
+
if (flow_cookie != NULL) {
/*
* The vnic_bcast_send function expects
@@ -4443,6 +4431,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
* bypass is set.
*/
boolean_t do_switch;
+
mac_client_impl_t *dst_mcip =
dst_flow_ent->fe_mcip;
@@ -4459,20 +4448,18 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
* macro.
*/
if (mip->mi_promisc_list != NULL) {
- mac_promisc_dispatch(mip, mp, src_mcip,
- B_TRUE);
+ mac_promisc_dispatch(mip, mp, src_mcip);
}
do_switch = ((src_mcip->mci_state_flags &
dst_mcip->mci_state_flags &
MCIS_CLIENT_POLL_CAPABLE) != 0);
- if ((mp1 = mac_fix_cksum(mp)) != NULL) {
- (dst_flow_ent->fe_cb_fn)(
- dst_flow_ent->fe_cb_arg1,
- dst_flow_ent->fe_cb_arg2,
- mp1, do_switch);
- }
+ (dst_flow_ent->fe_cb_fn)(
+ dst_flow_ent->fe_cb_arg1,
+ dst_flow_ent->fe_cb_arg2,
+ mp, do_switch);
+
}
FLOW_REFRELE(dst_flow_ent);
} else {
@@ -4829,7 +4816,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
if (flag & MAC_DROP_ON_NO_DESC) {
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain, "Tx softring no desc");
/* increment freed stats */
ringp->s_ring_drops += cnt;
cookie = (mac_tx_cookie_t)ringp;
@@ -4873,8 +4860,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
* b_prev may be set to the fanout hint
* hence can't use freemsg directly
*/
- mac_pkt_drop(NULL, NULL,
- mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain,
+ "Tx softring max queue");
DTRACE_PROBE1(tx_queued_hiwat,
mac_soft_ring_t *, ringp);
enqueue = B_FALSE;
diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c
index 34f89328c3..c62bd997a8 100644
--- a/usr/src/uts/common/io/mac/mac_soft_ring.c
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c
@@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring)
ASSERT((softring->s_ring_state &
(S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) ==
(S_RING_CONDEMNED | S_RING_CONDEMNED_DONE));
- mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE);
+ mac_drop_chain(softring->s_ring_first, "softring free");
softring->s_ring_tx_arg2 = NULL;
mac_soft_ring_stat_delete(softring);
mac_callback_free(softring->s_ring_notify_cb_list);
diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c
index a877ca258c..209abacef8 100644
--- a/usr/src/uts/common/io/mac/mac_util.c
+++ b/usr/src/uts/common/io/mac/mac_util.c
@@ -48,6 +48,74 @@
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
+#include <inet/tcp.h>
+#include <inet/udp_impl.h>
+
+/*
+ * The next two functions are used for dropping packets or chains of
+ * packets, respectively. We could use one function for both but
+ * separating the use cases allows us to specify intent and prevent
+ * dropping more data than intended.
+ *
+ * The purpose of these functions is to aid the debugging effort,
+ * especially in production. Rather than use freemsg()/freemsgchain(),
+ * it's preferable to use these functions when dropping a packet in
+ * the MAC layer. These functions should only be used during
+ * unexpected conditions. That is, any time a packet is dropped
+ * outside of the regular, successful datapath. Consolidating all
+ * drops on these functions allows the user to trace one location and
+ * determine why the packet was dropped based on the msg. It also
+ * allows the user to inspect the packet before it is freed. Finally,
+ * it allows the user to avoid tracing freemsg()/freemsgchain() thus
+ * keeping the hot path running as efficiently as possible.
+ *
+ * NOTE: At this time not all MAC drops are aggregated on these
+ * functions; but that is the plan. This comment should be erased once
+ * completed.
+ */
+
+/*PRINTFLIKE2*/
+void
+mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
+{
+ va_list adx;
+ char msg[128];
+ char *msgp = msg;
+
+ ASSERT3P(mp->b_next, ==, NULL);
+
+ va_start(adx, fmt);
+ (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
+ va_end(adx);
+
+ DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
+ freemsg(mp);
+}
+
+/*PRINTFLIKE2*/
+void
+mac_drop_chain(mblk_t *chain, const char *fmt, ...)
+{
+ va_list adx;
+ char msg[128];
+ char *msgp = msg;
+
+ va_start(adx, fmt);
+ (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
+ va_end(adx);
+
+ /*
+ * We could use freemsgchain() for the actual freeing but
+ * since we are already walking the chain to fire the dtrace
+ * probe we might as well free the msg here too.
+ */
+ for (mblk_t *mp = chain, *next; mp != NULL; ) {
+ next = mp->b_next;
+ DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
+ freemsg(mp);
+ mp = next;
+ }
+}
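As a usage note (illustration only, not part of the patch): a caller that hits an unexpected condition hands the packet to these routines together with a printf-style reason, instead of calling freemsg() directly, so the drop is visible at the mac__drop probe. A hypothetical call site might look like:

	if (MBLKL(mp) < sizeof (struct ether_header)) {
		/* Unexpected runt frame; record why it was dropped. */
		mac_drop_pkt(mp, "runt frame: %ld bytes", (long)MBLKL(mp));
		return;
	}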
/*
* Copy an mblk, preserving its hardware checksum flags.
@@ -89,274 +157,1078 @@ mac_copymsgchain_cksum(mblk_t *mp)
}
/*
- * Process the specified mblk chain for proper handling of hardware
- * checksum offload. This routine is invoked for loopback traffic
- * between MAC clients.
- * The function handles a NULL mblk chain passed as argument.
+ * Perform software checksum on a single message, if needed. The
+ * emulation performed is determined by an intersection of the mblk's
+ * flags and the emul flags requested. The emul flags are documented
+ * in mac.h.
*/
-mblk_t *
-mac_fix_cksum(mblk_t *mp_chain)
+static mblk_t *
+mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
- mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
+ mblk_t *orig = mp, *skipped_hdr = NULL;
uint32_t flags, start, stuff, end, value;
+ uint16_t len;
+ uint32_t offset;
+ uint16_t etype;
+ struct ether_header *ehp;
- for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
- uint16_t len;
- uint32_t offset;
- struct ether_header *ehp;
- uint16_t sap;
- mblk_t *skipped_hdr = NULL;
+ /*
+ * This function should only be called from mac_hw_emul()
+ * which handles mblk chains and the shared ref case.
+ */
+ ASSERT3P(mp->b_next, ==, NULL);
- mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
- if (flags == 0)
- continue;
+ mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);
+
+ /*
+ * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) because
+ * we don't want to mask-out the HW_LOCAL_MAC flag.
+ */
+ flags = DB_CKSUMFLAGS(mp);
+
+ /* Why call this if checksum emulation isn't needed? */
+ ASSERT3U(flags & (HCK_FLAGS), !=, 0);
+
+ /*
+ * Ethernet, and optionally VLAN header. mac_hw_emul() has
+ * already verified we have enough data to read the L2 header.
+ */
+ ehp = (struct ether_header *)mp->b_rptr;
+ if (ntohs(ehp->ether_type) == VLAN_TPID) {
+ struct ether_vlan_header *evhp;
+
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ etype = ntohs(evhp->ether_type);
+ offset = sizeof (struct ether_vlan_header);
+ } else {
+ etype = ntohs(ehp->ether_type);
+ offset = sizeof (struct ether_header);
+ }
+
+ /*
+ * If this packet isn't IPv4, then leave it alone. We still
+ * need to add IPv6 support and we don't want to affect non-IP
+ * traffic like ARP.
+ */
+ if (etype != ETHERTYPE_IP)
+ return (mp);
+
+ ASSERT3U(MBLKL(mp), >=, offset);
+
+ /*
+ * If the first mblk of this packet contains only the ethernet
+ * header, skip past it for now. Packets with their data
+ * contained in only a single mblk can then use the fastpaths
+ * tuned to that possibility.
+ */
+ if (MBLKL(mp) == offset) {
+ offset -= MBLKL(mp);
+ /* This is guaranteed by mac_hw_emul(). */
+ ASSERT3P(mp->b_cont, !=, NULL);
+ skipped_hdr = mp;
+ mp = mp->b_cont;
+ }
+
+ /*
+ * Both full and partial checksum rely on finding the IP
+ * header in the current mblk. Our native TCP stack honors
+ * this assumption but it's prudent to guard our future
+ * clients that might not honor this contract.
+ */
+ ASSERT3U(MBLKL(mp), >=, offset + sizeof (ipha_t));
+ if (MBLKL(mp) < (offset + sizeof (ipha_t))) {
+ mac_drop_pkt(mp, "mblk doesn't contain IP header");
+ return (NULL);
+ }
+
+ /*
+ * We are about to modify the header mblk; make sure we are
+ * modifying our own copy. The code that follows assumes that
+ * the IP/ULP headers exist in this mblk (and drops the
+ * message if they don't).
+ */
+ if (DB_REF(mp) > 1) {
+ mblk_t *tmp = copyb(mp);
+
+ if (tmp == NULL) {
+ mac_drop_pkt(mp, "copyb failed");
+ return (NULL);
+ }
+
+ tmp->b_cont = mp->b_cont;
+ freeb(mp);
+ mp = tmp;
+ }
+
+ if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
+ ipha_t *ipha = (ipha_t *)(mp->b_rptr + offset);
+
+
+ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
+ ipaddr_t src, dst;
+ uint32_t cksum;
+ uint16_t *up;
+ uint8_t proto;
+
+ /*
+ * This code assumes a "simple" IP header (20
+ * bytes, no options). IPv4 options are mostly
+ * a historic artifact. The one slight
+ * exception is Router Alert, but we don't
+ * expect such a packet to land here.
+ */
+ proto = ipha->ipha_protocol;
+ ASSERT(ipha->ipha_version_and_hdr_length ==
+ IP_SIMPLE_HDR_VERSION);
+ if (ipha->ipha_version_and_hdr_length !=
+ IP_SIMPLE_HDR_VERSION) {
+ mac_drop_pkt(mp, "not simple IP header");
+ return (NULL);
+ }
+
+ /* Get a pointer to the ULP checksum. */
+ switch (proto) {
+ case IPPROTO_TCP:
+ ASSERT3U(MBLKL(mp), >=,
+ (offset + sizeof (ipha_t) +
+ sizeof (tcph_t)));
+ if (MBLKL(mp) < (offset + sizeof (ipha_t) +
+ sizeof (tcph_t))) {
+ mac_drop_pkt(mp,
+ "mblk doesn't contain TCP header");
+ return (NULL);
+ }
+
+ /* LINTED: improper alignment cast */
+ up = IPH_TCPH_CHECKSUMP(ipha,
+ IP_SIMPLE_HDR_LENGTH);
+ break;
+
+ case IPPROTO_UDP:
+ ASSERT3U(MBLKL(mp), >=,
+ (offset + sizeof (ipha_t) +
+ sizeof (udpha_t)));
+ if (MBLKL(mp) < (offset + sizeof (ipha_t) +
+ sizeof (udpha_t))) {
+ mac_drop_pkt(mp,
+ "mblk doesn't contain UDP header");
+ return (NULL);
+ }
+
+ /* LINTED: improper alignment cast */
+ up = IPH_UDPH_CHECKSUMP(ipha,
+ IP_SIMPLE_HDR_LENGTH);
+ break;
+
+ default:
+ mac_drop_pkt(orig, "unexpected protocol: %d",
+ proto);
+ return (NULL);
+ }
+
+ /* Pseudo-header checksum. */
+ src = ipha->ipha_src;
+ dst = ipha->ipha_dst;
+ len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
+
+ cksum = (dst >> 16) + (dst & 0xFFFF) +
+ (src >> 16) + (src & 0xFFFF);
+ cksum += htons(len);
+
+ /*
+ * The checksum value stored in the packet
+ * needs to be correct. Compute it here.
+ */
+ *up = 0;
+ cksum += (((proto) == IPPROTO_UDP) ?
+ IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
+ cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
+ offset, cksum);
+ *(up) = (uint16_t)(cksum ? cksum : ~cksum);
+
+ }
+
+ /* We always update the ULP checksum flags. */
+ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
+ flags &= ~HCK_FULLCKSUM;
+ flags |= HCK_FULLCKSUM_OK;
+ value = 0;
+ }
/*
- * Since the processing of checksum offload for loopback
- * traffic requires modification of the packet contents,
- * ensure sure that we are always modifying our own copy.
+ * Out of paranoia, and for the sake of correctness,
+ * we won't calculate the IP header checksum if it's
+ * already populated. While unlikely, it's possible to
+ * write code that might end up calling mac_sw_cksum()
+ * twice on the same mblk (performing both LSO and
+ * checksum emulation in a single mblk chain loop --
+ * the LSO emulation inserts a new chain into the
+ * existing chain and then the loop iterates back over
+ * the new segments and emulates the checksum a second
+ * time). Normally this wouldn't be a problem, because
+ * the HCK_*_OK flags are supposed to indicate that we
+ * don't need to perform the work. But
+ * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
+ * same value; so we cannot use these flags to
+ * determine if the IP header checksum has already
+ * been calculated or not. Luckily, if IP requests
+ * HCK_IPV4_HDRCKSUM, then the IP header checksum will
+ * be zero. So this test works just as well as
+ * checking the flag. However, in the future, we
+ * should fix the HCK_* flags.
*/
- if (DB_REF(mp) > 1) {
- mp1 = copymsg(mp);
- if (mp1 == NULL)
- continue;
- mp1->b_next = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- if (prev != NULL)
- prev->b_next = mp1;
- else
- new_chain = mp1;
- mp = mp1;
+ if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS) &&
+ ipha->ipha_hdr_checksum == 0) {
+ ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
+ flags &= ~HCK_IPV4_HDRCKSUM;
+ flags |= HCK_IPV4_HDRCKSUM_OK;
}
+ }
+
+ if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
+ uint16_t *up, partial, cksum;
+ uchar_t *ipp; /* ptr to beginning of IP header */
+
+ ASSERT3U(MBLKL(mp), >=,
+ (offset + sizeof (ipha_t) + sizeof (tcph_t)));
+ if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (tcph_t))) {
+ mac_drop_pkt(mp, "mblk doesn't contain TCP header");
+ return (NULL);
+ }
+
+ ipp = mp->b_rptr + offset;
+ /* LINTED: cast may result in improper alignment */
+ up = (uint16_t *)((uchar_t *)ipp + stuff);
+ partial = *up;
+ *up = 0;
+
+ ASSERT3S(end, >, start);
+ cksum = ~IP_CSUM_PARTIAL(mp, offset + start, partial);
+ *up = cksum != 0 ? cksum : ~cksum;
+ }
+
+ /* We always update the ULP checksum flags. */
+ if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
+ flags &= ~HCK_PARTIALCKSUM;
+ flags |= HCK_FULLCKSUM_OK;
+ value = 0;
+ }
+
+ mac_hcksum_set(mp, start, stuff, end, value, flags);
+
+ /* Don't forget to reattach the header. */
+ if (skipped_hdr != NULL) {
+ ASSERT3P(skipped_hdr->b_cont, ==, mp);
/*
- * Ethernet, and optionally VLAN header.
+ * Duplicate the HCKSUM data into the header mblk.
+ * This mimics mac_add_vlan_tag which ensures that
+ * both the first mblk _and_ the first data bearing
+ * mblk possess the HCKSUM information. Consumers like
+ * IP will end up discarding the ether_header mblk, so
+ * for now, it is important that the data be available
+ * in both places.
*/
- /* LINTED: improper alignment cast */
- ehp = (struct ether_header *)mp->b_rptr;
- if (ntohs(ehp->ether_type) == VLAN_TPID) {
- struct ether_vlan_header *evhp;
+ mac_hcksum_clone(mp, skipped_hdr);
+ mp = skipped_hdr;
+ }
- ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
- /* LINTED: improper alignment cast */
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- sap = ntohs(evhp->ether_type);
- offset = sizeof (struct ether_vlan_header);
+ return (mp);
+}
+
+/*
+ * Build a single data segment from an LSO packet. The mblk chain
+ * returned, seg_head, represents the data segment and is always
+ * exactly seg_len bytes long. The lso_mp and offset input/output
+ * parameters track our position in the LSO packet. This function
+ * exists solely as a helper to mac_sw_lso().
+ *
+ * Case A
+ *
+ * The current lso_mp is larger than the requested seg_len. The
+ * beginning of seg_head may start at the beginning of lso_mp or
+ * offset into it. In either case, a single mblk is returned, and
+ * *offset is updated to reflect our new position in the current
+ * lso_mp.
+ *
+ * +----------------------------+
+ * | in *lso_mp / out *lso_mp |
+ * +----------------------------+
+ * ^ ^
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^ ^
+ * | |
+ * in *offset = 0 out *offset = seg_len
+ *
+ * |------ seg_len ----|
+ *
+ *
+ * +------------------------------+
+ * | in *lso_mp / out *lso_mp |
+ * +------------------------------+
+ * ^ ^
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^ ^
+ * | |
+ * in *offset = N out *offset = N + seg_len
+ *
+ * |------ seg_len ----|
+ *
+ *
+ *
+ * Case B
+ *
+ * The requested seg_len consumes exactly the rest of the lso_mp.
+ * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
+ * The seg_head may start at the beginning of the lso_mp or at some
+ * offset into it. In either case we return a single mblk, reset
+ * *offset to zero, and walk to the next lso_mp.
+ *
+ * +------------------------+ +------------------------+
+ * | in *lso_mp |---------->| out *lso_mp |
+ * +------------------------+ +------------------------+
+ * ^ ^ ^
+ * | | |
+ * | | out *offset = 0
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^
+ * |
+ * in *offset = 0
+ *
+ * |------ seg_len ----|
+ *
+ *
+ *
+ * +----------------------------+ +------------------------+
+ * | in *lso_mp |---------->| out *lso_mp |
+ * +----------------------------+ +------------------------+
+ * ^ ^ ^
+ * | | |
+ * | | out *offset = 0
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^
+ * |
+ * in *offset = N
+ *
+ * |------ seg_len ----|
+ *
+ *
+ * Case C
+ *
+ * The requested seg_len is greater than the current lso_mp. In
+ * this case we must consume LSO mblks until we have enough data to
+ * satisfy either case (A) or (B) above. We will return multiple
+ * mblks linked via b_cont, offset will be set based on the cases
+ * above, and lso_mp will walk forward at least one mblk, but maybe
+ * more.
+ *
+ * N.B. This diagram is not exhaustive. The seg_head may start on
+ * the beginning of an lso_mp. The seg_tail may end exactly on the
+ * boundary of an lso_mp. And there may be two (in this case the
+ * middle block wouldn't exist), three, or more mblks in the
+ * seg_head chain. This is meant as one example of what might
+ * happen. The main thing to remember is that the seg_tail mblk
+ * must be one of case (A) or (B) above.
+ *
+ * +------------------+ +----------------+ +------------------+
+ * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp |
+ * +------------------+ +----------------+ +------------------+
+ * ^ ^ ^ ^ ^ ^
+ * | | | | | |
+ * | | | | | |
+ * | | | | | |
+ * | | | | | |
+ * +------------+ +----------------+ +------------+
+ * | seg_head |--->| |--->| seg_tail |
+ * +------------+ +----------------+ +------------+
+ * ^ ^
+ * | |
+ * in *offset = N out *offset = MBLKL(seg_tail)
+ *
+ * |------------------- seg_len -------------------|
+ *
+ */
+static mblk_t *
+build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
+{
+ mblk_t *seg_head, *seg_tail, *seg_mp;
+
+ ASSERT3P(*lso_mp, !=, NULL);
+ ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);
+
+ seg_mp = dupb(*lso_mp);
+ if (seg_mp == NULL)
+ return (NULL);
+
+ seg_head = seg_mp;
+ seg_tail = seg_mp;
+
+ /* Continue where we left off from in the lso_mp. */
+ seg_mp->b_rptr += *offset;
+
+last_mblk:
+ /* Case (A) */
+ if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
+ *offset += seg_len;
+ seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
+ return (seg_head);
+ }
+
+ /* Case (B) */
+ if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
+ *offset = 0;
+ *lso_mp = (*lso_mp)->b_cont;
+ return (seg_head);
+ }
+
+ /* Case (C) */
+ ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);
+
+ /*
+ * The current LSO mblk doesn't have enough data to satisfy
+ * seg_len -- continue peeling off LSO mblks to build the new
+ * segment message. If allocation fails we free the previously
+ * allocated segment mblks and return NULL.
+ */
+ while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
+ ASSERT3U(MBLKL(seg_mp), <=, seg_len);
+ seg_len -= MBLKL(seg_mp);
+ *offset = 0;
+ *lso_mp = (*lso_mp)->b_cont;
+ seg_mp = dupb(*lso_mp);
+
+ if (seg_mp == NULL) {
+ freemsgchain(seg_head);
+ return (NULL);
+ }
+
+ seg_tail->b_cont = seg_mp;
+ seg_tail = seg_mp;
+ }
+
+ /*
+ * We've walked enough LSO mblks that we can now satisfy the
+ * remaining seg_len. At this point we need to jump back to
+ * determine if we have arrived at case (A) or (B).
+ */
+
+ /* Just to be paranoid that we didn't underflow. */
+ ASSERT3U(seg_len, <, IP_MAXPACKET);
+ ASSERT3U(seg_len, >, 0);
+ goto last_mblk;
+}
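For orientation, the fragment below (illustration only; the real driver is the nsegs loop in mac_sw_lso() further down) shows how the helper is called repeatedly to peel MSS-sized payload segments off an LSO message. The variables odatamp, offset, left, and mss are assumed to be set up as in that loop.

	while (left > 0) {
		uint32_t seg_len = (left > mss) ? mss : left;
		mblk_t *ndatamp = build_data_seg(&odatamp, &offset, seg_len);

		if (ndatamp == NULL)
			break;		/* allocation failed; caller drops */

		/* ... attach ndatamp to a freshly built header mblk ... */
		left -= seg_len;
	}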
+
+/*
+ * Perform software segmentation of a single LSO message. Take an LSO
+ * message as input and return head/tail pointers as output. This
+ * function should not be invoked directly but instead through
+ * mac_hw_emul().
+ *
+ * The resulting chain is comprised of multiple (nsegs) MSS sized
+ * segments. Each segment will consist of two or more mblks joined by
+ * b_cont: a header and one or more data mblks. The header mblk is
+ * allocated anew for each message. The first segment's header is used
+ * as a template for the rest with adjustments made for things such as
+ * ID, sequence, length, TCP flags, etc. The data mblks reference into
+ * the existing LSO mblk (passed in as omp) by way of dupb(). Their
+ * b_rptr/b_wptr values are adjusted to reference only the fraction of
+ * the LSO message they are responsible for. At the successful
+ * completion of this function the original mblk (omp) is freed,
+ * leaving the newly created segment chain as the only remaining
+ * reference to the data.
+ */
+static void
+mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
+ uint_t *count)
+{
+ uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
+ uint32_t mss;
+ uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
+ uint32_t oleft;
+ uint_t nsegs, seg;
+ int len;
+
+ struct ether_vlan_header *oevh;
+ const ipha_t *oiph;
+ const tcph_t *otcph;
+ ipha_t *niph;
+ tcph_t *ntcph;
+ uint16_t ip_id;
+ uint32_t tcp_seq, tcp_sum, otcp_sum;
+
+ uint32_t offset;
+ mblk_t *odatamp;
+ mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
+ mblk_t *tmptail;
+
+ ASSERT3P(head, !=, NULL);
+ ASSERT3P(tail, !=, NULL);
+ ASSERT3P(count, !=, NULL);
+ ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
+
+ /* Assume we are dealing with a single LSO message. */
+ ASSERT3P(omp->b_next, ==, NULL);
+
+ mss = DB_LSOMSS(omp);
+ ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
+ sizeof (struct ether_vlan_header));
+ opktlen = msgsize(omp);
+
+ /*
+ * First, get references to the IP and TCP headers and
+ * determine the total TCP length (header + data).
+ *
+ * Thanks to mac_hw_emul() we know that the first mblk must
+ * contain (at minimum) the full L2 header. However, this
+ * function assumes more than that. It assumes the L2/L3/L4
+ * headers are all contained in the first mblk of a message
+ * (i.e., no b_cont walking for headers). While this is a
+ * current reality (our native TCP stack and viona both
+ * enforce this) things may become more nuanced in the future
+ * (e.g. when introducing encap support or adding new
+ * clients). For now we guard against this case by dropping
+ * the packet.
+ */
+ oevh = (struct ether_vlan_header *)omp->b_rptr;
+ if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
+ oehlen = sizeof (struct ether_vlan_header);
+ else
+ oehlen = sizeof (struct ether_header);
+
+ ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
+ if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
+ mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
+ goto fail;
+ }
+
+ oiph = (ipha_t *)(omp->b_rptr + oehlen);
+ oiphlen = IPH_HDR_LENGTH(oiph);
+ otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
+ otcphlen = TCP_HDR_LENGTH(otcph);
+
+ /*
+ * Currently we only support LSO for TCP/IPv4.
+ */
+ if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
+ mac_drop_pkt(omp, "LSO unsupported IP version: %uhh",
+ IPH_HDR_VERSION(oiph));
+ goto fail;
+ }
+
+ if (oiph->ipha_protocol != IPPROTO_TCP) {
+ mac_drop_pkt(omp, "LSO unsupported protocol: %uhh",
+ oiph->ipha_protocol);
+ goto fail;
+ }
+
+ if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
+ mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
+ goto fail;
+ }
+
+ ohdrslen = oehlen + oiphlen + otcphlen;
+ if ((len = MBLKL(omp)) < ohdrslen) {
+ mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
+ ohdrslen);
+ goto fail;
+ }
+
+ /*
+ * Either we have data in the first mblk or it's just the
+ * header. In either case, we need to set rptr to the start of
+ * the TCP data.
+ */
+ if (len > ohdrslen) {
+ odatamp = omp;
+ offset = ohdrslen;
+ } else {
+ ASSERT3U(len, ==, ohdrslen);
+ odatamp = omp->b_cont;
+ offset = 0;
+ }
+
+ /* Make sure we still have enough data. */
+ ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
+
+ /*
+ * If a MAC negotiated LSO then it must negotiate both
+ * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
+ * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
+ * change during LSO segmentation (only the 3 fields of the
+ * pseudo header checksum don't change: src, dst, proto). Thus
+ * we would expect these flags (HCK_IPV4_HDRCKSUM |
+ * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
+ * function to emulate those checksums in software. However,
+ * that assumes a world where we only expose LSO if the
+ * underlying hardware exposes LSO. Moving forward the plan is
+ * to assume LSO in the upper layers and have MAC perform
+ * software LSO when the underlying provider doesn't support
+ * it. In such a world, if the provider doesn't support LSO
+ * but does support hardware checksum offload, then we could
+ * simply perform the segmentation and allow the hardware to
+ * calculate the checksums. To the hardware it's just another
+ * chain of non-LSO packets.
+ */
+ ASSERT3S(DB_TYPE(omp), ==, M_DATA);
+ ocsum_flags = DB_CKSUMFLAGS(omp);
+ ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
+ ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
+
+ /*
+ * If hardware only provides partial checksum then software
+ * must supply the pseudo-header checksum. In the case of LSO
+ * we leave the TCP length at zero to be filled in by
+ * hardware. This function must handle two scenarios.
+ *
+ * 1. Being called by a MAC client on the Rx path to segment
+ * an LSO packet and calculate the checksum.
+ *
+ * 2. Being called by a MAC provider to segment an LSO packet.
+ * In this case the LSO segmentation is performed in
+ * software (by this routine) but the MAC provider should
+ * still calculate the TCP/IP checksums in hardware.
+ *
+ * To elaborate on the second case: we cannot have the
+ * scenario where IP sends LSO packets but the underlying HW
+ * doesn't support checksum offload -- because in that case
+ * TCP/IP would calculate the checksum in software (for the
+ * LSO packet) but then MAC would segment the packet and have
+ * to redo all the checksum work. So IP should never do LSO
+ * if HW doesn't support both IP and TCP checksum.
+ */
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
+ ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
+ }
+
+ odatalen = opktlen - ohdrslen;
+
+ /*
+ * Subtract one to account for the case where the data length
+ * is evenly divisible by the MSS. Add one to account for the
+ * fact that the division will always result in one less
+ * segment than needed.
+ */
+ nsegs = ((odatalen - 1) / mss) + 1;
+ if (nsegs < 2) {
+ mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
+ goto fail;
+ }
+
+ DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
+ __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
+ nsegs);
+
+ seg_chain = NULL;
+ tmptail = seg_chain;
+ oleft = odatalen;
+
+ for (uint_t i = 0; i < nsegs; i++) {
+ boolean_t last_seg = ((i + 1) == nsegs);
+ uint32_t seg_len;
+
+ /*
+ * If we fail to allocate, then drop the partially
+ * allocated chain as well as the LSO packet. Let the
+ * sender deal with the fallout.
+ */
+ if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
+ freemsgchain(seg_chain);
+ mac_drop_pkt(omp, "failed to alloc segment header");
+ goto fail;
+ }
+ ASSERT3P(nhdrmp->b_cont, ==, NULL);
+
+ if (seg_chain == NULL) {
+ seg_chain = nhdrmp;
} else {
- sap = ntohs(ehp->ether_type);
- offset = sizeof (struct ether_header);
+ ASSERT3P(tmptail, !=, NULL);
+ tmptail->b_next = nhdrmp;
}
+ tmptail = nhdrmp;
+
/*
- * If the first mblk in the chain for this packet contains only
- * the ethernet header, skip past it for now. Packets with
- * their data contained in only a single mblk can then use the
- * fastpaths tuned to that possibility.
+		 * Calculate this segment's length. It's either the MSS
+ * or whatever remains for the last segment.
*/
- if (MBLKL(mp) <= offset) {
- offset -= MBLKL(mp);
- if (mp->b_cont == NULL) {
- /* corrupted packet, skip it */
- if (prev != NULL)
- prev->b_next = mp->b_next;
- else
- new_chain = mp->b_next;
- mp1 = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- mp = mp1;
- continue;
- }
- skipped_hdr = mp;
- mp = mp->b_cont;
+ seg_len = last_seg ? oleft : mss;
+ ASSERT3U(seg_len, <=, mss);
+ ndatamp = build_data_seg(&odatamp, &offset, seg_len);
+
+ if (ndatamp == NULL) {
+ freemsgchain(seg_chain);
+ mac_drop_pkt(omp, "LSO failed to segment data");
+ goto fail;
}
- if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
- ipha_t *ipha = NULL;
+ /* Attach data mblk to header mblk. */
+ nhdrmp->b_cont = ndatamp;
+ DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
+ ASSERT3U(seg_len, <=, oleft);
+ oleft -= seg_len;
+ }
- /*
- * In order to compute the full and header
- * checksums, we need to find and parse
- * the IP and/or ULP headers.
- */
+	/* We should have consumed the entire LSO msg. */
+ ASSERT3S(oleft, ==, 0);
+ ASSERT3P(odatamp, ==, NULL);
- sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
+ /*
+	 * All seg data mblks are referenced by the header mblks; null
+ * out this pointer to catch any bad derefs.
+ */
+ ndatamp = NULL;
- /*
- * IP header.
- */
- if (sap != ETHERTYPE_IP)
- continue;
-
- ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
- /* LINTED: improper alignment cast */
- ipha = (ipha_t *)(mp->b_rptr + offset);
-
- if (flags & HCK_FULLCKSUM) {
- ipaddr_t src, dst;
- uint32_t cksum;
- uint16_t *up;
- uint8_t proto;
-
- /*
- * Pointer to checksum field in ULP header.
- */
- proto = ipha->ipha_protocol;
- ASSERT(ipha->ipha_version_and_hdr_length ==
- IP_SIMPLE_HDR_VERSION);
-
- switch (proto) {
- case IPPROTO_TCP:
- /* LINTED: improper alignment cast */
- up = IPH_TCPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- break;
-
- case IPPROTO_UDP:
- /* LINTED: improper alignment cast */
- up = IPH_UDPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- break;
-
- default:
- cmn_err(CE_WARN, "mac_fix_cksum: "
- "unexpected protocol: %d", proto);
- continue;
- }
+ /*
+ * Set headers and checksum for first segment.
+ */
+ nhdrmp = seg_chain;
+ bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
+ niph->ipha_length = htons(oiphlen + otcphlen + mss);
+ niph->ipha_hdr_checksum = 0;
+ ip_id = ntohs(niph->ipha_ident);
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ tcp_seq = BE32_TO_U32(ntcph->th_seq);
+ tcp_seq += mss;
- /*
- * Pseudo-header checksum.
- */
- src = ipha->ipha_src;
- dst = ipha->ipha_dst;
- len = ntohs(ipha->ipha_length) -
- IP_SIMPLE_HDR_LENGTH;
-
- cksum = (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- cksum += htons(len);
-
- /*
- * The checksum value stored in the packet needs
- * to be correct. Compute it here.
- */
- *up = 0;
- cksum += (((proto) == IPPROTO_UDP) ?
- IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
- cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
- offset, cksum);
- *(up) = (uint16_t)(cksum ? cksum : ~cksum);
-
- /*
- * Flag the packet so that it appears
- * that the checksum has already been
- * verified by the hardware.
- */
- flags &= ~HCK_FULLCKSUM;
- flags |= HCK_FULLCKSUM_OK;
- value = 0;
- }
+ /*
+ * The first segment shouldn't:
+ *
+ * o indicate end of data transmission (FIN),
+ * o indicate immediate handling of the data (PUSH).
+ */
+ ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
- if (flags & HCK_IPV4_HDRCKSUM) {
- ASSERT(ipha != NULL);
- ipha->ipha_hdr_checksum =
- (uint16_t)ip_csum_hdr(ipha);
- flags &= ~HCK_IPV4_HDRCKSUM;
- flags |= HCK_IPV4_HDRCKSUM_OK;
+ /*
+ * If the underlying HW provides partial checksum, then make
+ * sure to correct the pseudo header checksum before calling
+ * mac_sw_cksum(). The native TCP stack doesn't include the
+ * length field in the pseudo header when LSO is in play -- so
+ * we need to calculate it here.
+ */
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ tcp_sum = BE16_TO_U16(ntcph->th_sum);
+ otcp_sum = tcp_sum;
+ tcp_sum += mss + otcphlen;
+ tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ }
- }
- }
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ next_nhdrmp = nhdrmp->b_next;
+ nhdrmp->b_next = NULL;
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ nhdrmp->b_next = next_nhdrmp;
+ next_nhdrmp = NULL;
- if (flags & HCK_PARTIALCKSUM) {
- uint16_t *up, partial, cksum;
- uchar_t *ipp; /* ptr to beginning of IP header */
- mblk_t *old_mp = NULL;
+ /*
+ * We may have freed the nhdrmp argument during
+ * checksum emulation, make sure that seg_chain
+ * references a valid mblk.
+ */
+ seg_chain = nhdrmp;
+ }
- if (mp->b_cont != NULL) {
- mblk_t *new_mp;
+ ASSERT3P(nhdrmp, !=, NULL);
- new_mp = msgpullup(mp, offset + end);
- if (new_mp == NULL) {
- continue;
- }
- old_mp = mp;
- mp = new_mp;
- }
+ seg = 1;
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
+ uint_t, seg);
+ seg++;
- ipp = mp->b_rptr + offset;
- /* LINTED: cast may result in improper alignment */
- up = (uint16_t *)((uchar_t *)ipp + stuff);
- partial = *up;
- *up = 0;
+ /* There better be at least 2 segs. */
+ ASSERT3P(nhdrmp->b_next, !=, NULL);
+ prev_nhdrmp = nhdrmp;
+ nhdrmp = nhdrmp->b_next;
- cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
- end - start, partial);
- cksum = ~cksum;
- *up = cksum ? cksum : ~cksum;
+ /*
+ * Now adjust the headers of the middle segments. For each
+ * header we need to adjust the following.
+ *
+ * o IP ID
+ * o IP length
+ * o TCP sequence
+ * o TCP flags
+ * o cksum flags
+ * o cksum values (if MAC_HWCKSUM_EMUL is set)
+ */
+ for (; seg < nsegs; seg++) {
+ /*
+ * We use seg_chain as a reference to the first seg
+ * header mblk -- this first header is a template for
+ * the rest of the segments. This copy will include
+ * the now updated checksum values from the first
+		 * header. We must reset these checksum values to their
+		 * originals to make sure we produce the correct values.
+ */
+ bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ niph->ipha_ident = htons(++ip_id);
+		ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
+ niph->ipha_length = htons(oiphlen + otcphlen + mss);
+ niph->ipha_hdr_checksum = 0;
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ U32_TO_BE32(tcp_seq, ntcph->th_seq);
+ tcp_seq += mss;
+ /*
+ * Just like the first segment, the middle segments
+ * shouldn't have these flags set.
+ */
+ ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
/*
- * Since we already computed the whole checksum,
- * indicate to the stack that it has already
- * been verified by the hardware.
+ * First and middle segs have same
+ * pseudo-header checksum.
*/
- flags &= ~HCK_PARTIALCKSUM;
- flags |= HCK_FULLCKSUM_OK;
- value = 0;
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ }
- /*
- * If 'mp' is the result of a msgpullup(), it needs to
- * be properly reattached into the existing chain of
- * messages before continuing.
- */
- if (old_mp != NULL) {
- if (skipped_hdr != NULL) {
- /*
- * If the ethernet header was cast
- * aside before checksum calculation,
- * prepare for it to be reattached to
- * the pulled-up mblk.
- */
- skipped_hdr->b_cont = mp;
- } else {
- /* Link the new mblk into the chain. */
- mp->b_next = old_mp->b_next;
-
- if (prev != NULL)
- prev->b_next = mp;
- else
- new_chain = mp;
- }
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ next_nhdrmp = nhdrmp->b_next;
+ nhdrmp->b_next = NULL;
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ nhdrmp->b_next = next_nhdrmp;
+ next_nhdrmp = NULL;
+ /* We may have freed the original nhdrmp. */
+ prev_nhdrmp->b_next = nhdrmp;
+ }
- old_mp->b_next = NULL;
- freemsg(old_mp);
- }
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
+ uint_t, mss, uint_t, seg);
+
+ ASSERT3P(nhdrmp->b_next, !=, NULL);
+ prev_nhdrmp = nhdrmp;
+ nhdrmp = nhdrmp->b_next;
+ }
+
+ /* Make sure we are on the last segment. */
+ ASSERT3U(seg, ==, nsegs);
+ ASSERT3P(nhdrmp->b_next, ==, NULL);
+
+ /*
+	 * Now set the last segment's header. The difference is that
+	 * the FIN/PSH/RST flags are allowed.
+ */
+ bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ niph->ipha_ident = htons(++ip_id);
+ len = msgsize(nhdrmp->b_cont);
+ ASSERT3S(len, >, 0);
+ niph->ipha_length = htons(oiphlen + otcphlen + len);
+ niph->ipha_hdr_checksum = 0;
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ U32_TO_BE32(tcp_seq, ntcph->th_seq);
+
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ tcp_sum = otcp_sum;
+ tcp_sum += len + otcphlen;
+ tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ }
+
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ /* This should be the last mblk. */
+ ASSERT3P(nhdrmp->b_next, ==, NULL);
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ prev_nhdrmp->b_next = nhdrmp;
+ }
+
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
+ uint_t, seg);
+
+ /*
+ * Free the reference to the original LSO message as it is
+	 * being replaced by seg_chain.
+ */
+ freemsg(omp);
+ *head = seg_chain;
+ *tail = nhdrmp;
+ *count = nsegs;
+ return;
+
+fail:
+ *head = NULL;
+ *tail = NULL;
+ *count = 0;
+}
+
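The segment arithmetic that mac_sw_lso() applies is easy to lose among the mblk plumbing. As a rough standalone sketch (the names below are hypothetical; only the math mirrors the routine above), the segment count is a ceiling division, each segment's IP length is headers plus payload, and the TCP sequence number advances by the payload carried in the previous segment:

#include <stdint.h>

/* Hypothetical mirror of the per-segment header values computed above. */
typedef struct lso_seg {
	uint16_t ls_ip_len;	/* value written to ipha_length */
	uint32_t ls_tcp_seq;	/* value written to th_seq */
} lso_seg_t;

/* Ceiling division: datalen == 2*mss yields 2, 2*mss + 1 yields 3. */
static uint32_t
lso_nsegs(uint32_t datalen, uint32_t mss)
{
	return (((datalen - 1) / mss) + 1);
}

static void
lso_fill_segs(uint32_t datalen, uint32_t mss, uint16_t iphlen,
    uint16_t tcphlen, uint32_t start_seq, lso_seg_t *segs, uint32_t nsegs)
{
	uint32_t left = datalen;
	uint32_t seq = start_seq;

	for (uint32_t i = 0; i < nsegs; i++) {
		/* The last segment carries whatever payload remains. */
		uint32_t seg_len = (i + 1 == nsegs) ? left : mss;

		segs[i].ls_ip_len = (uint16_t)(iphlen + tcphlen + seg_len);
		segs[i].ls_tcp_seq = seq;
		seq += seg_len;
		left -= seg_len;
	}
}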
+#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
+
+/*
+ * Emulate various hardware offload features in software. Take a chain
+ * of packets as input and emulate the hardware features specified in
+ * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
+ * pointer given as input, and its tail pointer is written to
+ * '*otail'. The number of packets in the new chain is written to
+ * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
+ * may be NULL. The 'mp_chain' argument may point to a NULL chain, in
+ * which case 'mp_chain' will simply stay a NULL chain.
+ *
+ * While unlikely, it is technically possible that this function could
+ * receive a non-NULL chain as input and return a NULL chain as output
+ * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
+ * zero). This could happen if all the packets in the chain are
+ * dropped or if we fail to allocate new mblks. In this case, there is
+ * nothing for the caller to free. In any event, the caller shouldn't
+ * assume that '*mp_chain' is non-NULL on return.
+ *
+ * This function was written with two main use cases in mind.
+ *
+ * 1. A way for MAC clients to emulate hardware offloads when they
+ * can't directly handle LSO packets or packets without fully
+ * calculated checksums.
+ *
+ * 2. A way for MAC providers (drivers) to offer LSO even when the
+ * underlying HW can't or won't supply LSO offload.
+ *
+ * At the time of this writing no provider is making use of this
+ * function. However, the plan for the future is to always assume LSO
+ * is available and then add SW LSO emulation to all providers that
+ * don't support it in HW.
+ */
+void
+mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
+{
+ mblk_t *head = NULL, *tail = NULL;
+ uint_t count = 0;
+
+ ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
+ ASSERT3P(mp_chain, !=, NULL);
+
+ for (mblk_t *mp = *mp_chain; mp != NULL; ) {
+ mblk_t *tmp, *next, *tmphead, *tmptail;
+ struct ether_header *ehp;
+ uint32_t flags;
+ uint_t len = MBLKL(mp), l2len;
+
+ /* Perform LSO/cksum one message at a time. */
+ next = mp->b_next;
+ mp->b_next = NULL;
+
+ /*
+ * For our sanity the first mblk should contain at
+ * least the full L2 header.
+ */
+ if (len < sizeof (struct ether_header)) {
+ mac_drop_pkt(mp, "packet too short (A): %u", len);
+ mp = next;
+ continue;
}
- mac_hcksum_set(mp, start, stuff, end, value, flags);
+ ehp = (struct ether_header *)mp->b_rptr;
+ if (ntohs(ehp->ether_type) == VLAN_TPID)
+ l2len = sizeof (struct ether_vlan_header);
+ else
+ l2len = sizeof (struct ether_header);
/*
- * If the header was skipped over, we must seek back to it,
- * since it is that mblk that is part of any packet chain.
+ * If the first mblk is solely the L2 header, then
+ * there better be more data.
*/
- if (skipped_hdr != NULL) {
- ASSERT3P(skipped_hdr->b_cont, ==, mp);
+ if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
+ mac_drop_pkt(mp, "packet too short (C): %u", len);
+ mp = next;
+ continue;
+ }
+
+ DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);
+
+ /*
+ * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
+ * because we don't want to mask-out the LSO flag.
+ */
+ flags = DB_CKSUMFLAGS(mp);
+
+ if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
+ uint_t tmpcount = 0;
/*
- * Duplicate the HCKSUM data into the header mblk.
- * This mimics mac_add_vlan_tag which ensures that both
- * the first mblk _and_ the first data bearing mblk
- * possess the HCKSUM information. Consumers like IP
- * will end up discarding the ether_header mblk, so for
- * now, it is important that the data be available in
- * both places.
+ * LSO fix-up handles checksum emulation
+ * inline (if requested). It also frees mp.
*/
- mac_hcksum_clone(mp, skipped_hdr);
- mp = skipped_hdr;
+ mac_sw_lso(mp, emul, &tmphead, &tmptail,
+ &tmpcount);
+ count += tmpcount;
+ } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
+ tmp = mac_sw_cksum(mp, emul);
+ tmphead = tmp;
+ tmptail = tmp;
+ count++;
+ } else {
+ /* There is nothing to emulate. */
+ tmp = mp;
+ tmphead = tmp;
+ tmptail = tmp;
+ count++;
+ }
+
+ /*
+		 * The tmp mblk chain either becomes the start of the
+		 * new chain or is appended to its tail.
+ */
+ if (head == NULL) {
+ head = tmphead;
+ tail = tmptail;
+ } else {
+ /* Attach the new mblk to the end of the new chain. */
+ tail->b_next = tmphead;
+ tail = tmptail;
}
+
+ mp = next;
}
- return (new_chain);
+ *mp_chain = head;
+
+ if (otail != NULL)
+ *otail = tail;
+
+ if (ocount != NULL)
+ *ocount = count;
}
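To make the calling contract above concrete, a hedged sketch of a client-side caller follows (my_client_deliver() is hypothetical; mac_hw_emul() and the emulation flags are the interfaces introduced by this change). A client that cannot consume LSO packets or packets lacking fully calculated checksums must request both emulations, and it must tolerate the chain coming back empty:

static void
my_client_deliver(mblk_t *mp_chain)
{
	mblk_t *tail = NULL;
	uint_t cnt = 0;

	/* MAC_LSO_EMUL does not imply checksum emulation; request both. */
	mac_hw_emul(&mp_chain, &tail, &cnt, MAC_LSO_EMUL | MAC_HWCKSUM_EMUL);

	/* Every packet may have been dropped or an allocation failed. */
	if (mp_chain == NULL)
		return;

	/* ... deliver cnt packets; tail is the last mblk in the chain ... */
}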
/*
@@ -501,16 +1373,9 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain)
*/
/* ARGSUSED */
void
-mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
+mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp,
boolean_t loopback)
{
- mblk_t *mp1 = mp;
-
- while (mp1 != NULL) {
- mp1->b_prev = NULL;
- mp1->b_queue = NULL;
- mp1 = mp1->b_next;
- }
freemsgchain(mp);
}
diff --git a/usr/src/uts/common/io/simnet/simnet.c b/usr/src/uts/common/io/simnet/simnet.c
index 727fbbad8e..9bfe2fe7cf 100644
--- a/usr/src/uts/common/io/simnet/simnet.c
+++ b/usr/src/uts/common/io/simnet/simnet.c
@@ -21,6 +21,8 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -795,12 +797,6 @@ simnet_m_tx(void *arg, mblk_t *mp_chain)
continue;
}
- /* Fix mblk checksum as the pkt dest is local */
- if ((mp = mac_fix_cksum(mp)) == NULL) {
- sdev->sd_stats.xmit_errors++;
- continue;
- }
-
/* Hold reference for taskq receive processing per-pkt */
if (!simnet_thread_ref(sdev_rx)) {
freemsg(mp);
diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c
index 57c02b0808..e532a551e7 100644
--- a/usr/src/uts/common/io/vnic/vnic_dev.c
+++ b/usr/src/uts/common/io/vnic/vnic_dev.c
@@ -456,6 +456,20 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
} else {
vnic->vn_hcksum_txflags = 0;
}
+
+ /*
+ * Check for LSO capabilities. LSO implementations
+ * depend on hardware checksumming, so the same
+ * requirement is enforced here.
+ */
+ if (vnic->vn_hcksum_txflags != 0) {
+ if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_LSO,
+ &vnic->vn_cap_lso)) {
+ vnic->vn_cap_lso.lso_flags = 0;
+ }
+ } else {
+ vnic->vn_cap_lso.lso_flags = 0;
+ }
}
/* register with the MAC module */
@@ -826,6 +840,15 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
HCKSUM_INET_PARTIAL);
break;
}
+ case MAC_CAPAB_LSO: {
+ mac_capab_lso_t *cap_lso = cap_data;
+
+ if (vnic->vn_cap_lso.lso_flags == 0) {
+ return (B_FALSE);
+ }
+ *cap_lso = vnic->vn_cap_lso;
+ break;
+ }
case MAC_CAPAB_VNIC: {
mac_capab_vnic_t *vnic_capab = cap_data;
diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h
index a1ee3e3c70..6af2e0bccb 100644
--- a/usr/src/uts/common/sys/mac.h
+++ b/usr/src/uts/common/sys/mac.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright (c) 2015 Garrett D'Amore <garrett@damore.org>
*/
@@ -623,6 +623,36 @@ typedef struct mactype_register_s {
} mactype_register_t;
/*
+ * Flags to describe the hardware emulation desired from a client when
+ * calling mac_hw_emul().
+ *
+ * MAC_HWCKSUM_EMUL
+ *
+ * If an mblk is marked with HCK_* flags, then calculate those
+ * checksums and update the checksum flags.
+ *
+ * MAC_IPCKSUM_EMUL
+ *
+ * Like MAC_HWCKSUM_EMUL, except only calculate the IPv4 header
+ * checksum. We still update both the IPv4 and ULP checksum
+ * flags.
+ *
+ * MAC_LSO_EMUL
+ *
+ * If an mblk is marked with HW_LSO, then segment the LSO mblk
+ * into a new chain of mblks which reference the original data
+ * block. This flag DOES NOT imply MAC_HWCKSUM_EMUL. If the
+ * caller needs both then it must set both.
+ */
+typedef enum mac_emul {
+ MAC_HWCKSUM_EMUL = (1 << 0),
+ MAC_IPCKSUM_EMUL = (1 << 1),
+ MAC_LSO_EMUL = (1 << 2)
+} mac_emul_t;
+
+#define MAC_HWCKSUM_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL)
+
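For illustration only (the provider function below is hypothetical): because MAC_LSO_EMUL does not imply checksum emulation, a provider that segments in software but still has working checksum offload can pass MAC_LSO_EMUL alone and leave the HCK_* flags on each segment for its hardware to honor:

static void
my_provider_sw_lso(mblk_t **chain, mblk_t **tail, uint_t *cnt)
{
	/* Segment in software; hardware computes checksums per segment. */
	mac_hw_emul(chain, tail, cnt, MAC_LSO_EMUL);
}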
+/*
* Driver interface functions.
*/
extern int mac_open_by_linkid(datalink_id_t,
diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h
index b6040ad679..3290db92e6 100644
--- a/usr/src/uts/common/sys/mac_client.h
+++ b/usr/src/uts/common/sys/mac_client.h
@@ -200,6 +200,8 @@ extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *);
extern void mac_client_set_rings(mac_client_handle_t, int, int);
+extern void mac_hw_emul(mblk_t **, mblk_t **, uint_t *, mac_emul_t);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h
index d64b895304..21e8620121 100644
--- a/usr/src/uts/common/sys/mac_client_impl.h
+++ b/usr/src/uts/common/sys/mac_client_impl.h
@@ -411,8 +411,7 @@ extern int mac_tx_percpu_cnt;
extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *);
extern void mac_client_init(void);
extern void mac_client_fini(void);
-extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *,
- mac_client_impl_t *, boolean_t);
+extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, mac_client_impl_t *);
extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *);
diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h
index 17aebffc38..593322b990 100644
--- a/usr/src/uts/common/sys/mac_impl.h
+++ b/usr/src/uts/common/sys/mac_impl.h
@@ -345,7 +345,7 @@ struct mac_group_s {
if ((src_mcip)->mci_state_flags & MCIS_SHARE_BOUND) \
rhandle = (mip)->mi_default_tx_ring; \
if (mip->mi_promisc_list != NULL) \
- mac_promisc_dispatch(mip, mp, src_mcip, B_TRUE); \
+ mac_promisc_dispatch(mip, mp, src_mcip); \
/* \
* Grab the proper transmit pointer and handle. Special \
* optimization: we can test mi_bridge_link itself atomically, \
@@ -743,12 +743,23 @@ typedef struct mac_client_impl_s mac_client_impl_t;
extern void mac_init(void);
extern int mac_fini(void);
+/*
+ * MAC packet/chain drop functions to aggregate all dropped-packet
+ * debugging to a single surface.
+ */
+/*PRINTFLIKE2*/
+extern void mac_drop_pkt(mblk_t *, const char *, ...)
+ __KPRINTFLIKE(2);
+
+/*PRINTFLIKE2*/
+extern void mac_drop_chain(mblk_t *, const char *, ...)
+ __KPRINTFLIKE(2);
+
extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *);
extern boolean_t mac_ip_hdr_length_v6(ip6_t *, uint8_t *, uint16_t *,
uint8_t *, ip6_frag_t **);
extern mblk_t *mac_copymsgchain_cksum(mblk_t *);
-extern mblk_t *mac_fix_cksum(mblk_t *);
extern void mac_packet_print(mac_handle_t, mblk_t *);
extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *,
mac_header_info_t *);
@@ -853,7 +864,7 @@ extern void mac_flow_set_name(flow_entry_t *, const char *);
extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t);
extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t);
extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *);
-extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t);
+extern void mac_rx_def(void *, mac_resource_handle_t, mblk_t *, boolean_t);
extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *);
extern void i_mac_share_alloc(mac_client_impl_t *);
diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h
index 1269aeca10..587a51f0aa 100644
--- a/usr/src/uts/common/sys/pattr.h
+++ b/usr/src/uts/common/sys/pattr.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_PATTR_H
@@ -106,6 +107,25 @@ typedef struct pattr_hcksum_s {
#define HW_LSO_FLAGS HW_LSO /* All LSO flags, currently only one */
/*
+ * The packet originates from a MAC on the same machine as the
+ * receiving MAC. There are two ways this can happen.
+ *
+ * 1. MAC loopback: When a packet is destined for a MAC client on the
+ * same MAC as the sender. This datapath is taken in
+ *    mac_tx_send().
+ *
+ * 2. Bridge Fwd: When a packet is destined for a MAC client on the
+ * same bridge as the sender. This datapath is taken in
+ * bridge_forward().
+ *
+ * Presented with this flag, a receiver can then decide whether or not
+ * it needs to emulate some or all of the HW offloads that the NIC
+ * would have performed otherwise -- or whether it should accept the
+ * packet as-is.
+ */
+#define HW_LOCAL_MAC 0x100
+
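A hedged sketch of how a receiver might act on this flag (the receiving function and its boolean argument are hypothetical; DB_CKSUMFLAGS(), mac_hw_emul(), and the emulation flags are the interfaces used elsewhere in this change):

static void
my_local_rx(mblk_t **mpp, boolean_t consumer_accepts_csum_offload)
{
	if ((DB_CKSUMFLAGS(*mpp) & HW_LOCAL_MAC) == 0)
		return;	/* Off-machine traffic: the NIC did the work. */

	/*
	 * Same-machine sender: if the downstream consumer accepts a
	 * "checksum already verified" indication, only the IPv4 header
	 * checksum needs to be filled in; otherwise emulate all of the
	 * requested checksums. Note that *mpp may be NULL on return if
	 * the packet had to be dropped.
	 */
	mac_hw_emul(mpp, NULL, NULL,
	    consumer_accepts_csum_offload ? MAC_IPCKSUM_EMUL :
	    MAC_HWCKSUM_EMUL);
}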
+/*
* Structure used for zerocopy attribute.
*/
typedef struct pattr_zcopy_s {
diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h
index 1a91158da6..4c8d49c621 100644
--- a/usr/src/uts/common/sys/vnic_impl.h
+++ b/usr/src/uts/common/sys/vnic_impl.h
@@ -21,7 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_VNIC_IMPL_H
@@ -64,6 +64,7 @@ typedef struct vnic_s {
mac_notify_handle_t vn_mnh;
uint32_t vn_hcksum_txflags;
+ mac_capab_lso_t vn_cap_lso;
uint32_t vn_mtu;
link_state_t vn_ls;
} vnic_t;
diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c
index 761597653b..c21476df89 100644
--- a/usr/src/uts/common/xen/io/xnb.c
+++ b/usr/src/uts/common/xen/io/xnb.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#ifdef DEBUG
@@ -251,8 +252,8 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
* because it doesn't cover all of the interesting cases :-(
*/
mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
-
- return (mac_fix_cksum(mp));
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ return (mp);
}
mblk_t *
diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c
index c22c0bf646..d33051b7ff 100644
--- a/usr/src/uts/i86pc/io/viona/viona.c
+++ b/usr/src/uts/i86pc/io/viona/viona.c
@@ -258,27 +258,36 @@
#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0)
#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1)
+#define VIRTIO_NET_HDR_GSO_NONE 0
+#define VIRTIO_NET_HDR_GSO_TCPV4 1
#define VRING_AVAIL_F_NO_INTERRUPT 1
#define VRING_USED_F_NO_NOTIFY 1
#define BCM_NIC_DRIVER "bnxe"
+
/*
- * Host capabilities
+ * Feature bits. See section 5.1.3 of the VIRTIO 1.0 spec.
*/
-#define VIRTIO_NET_F_CSUM (1 << 0)
-#define VIRTIO_NET_F_GUEST_CSUM (1 << 1)
-#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_CSUM (1 << 0)
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1)
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */
#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24)
#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28)
#define VIRTIO_F_RING_EVENT_IDX (1 << 29)
+/*
+ * Host capabilities.
+ */
#define VIONA_S_HOSTCAPS ( \
VIRTIO_NET_F_GUEST_CSUM | \
VIRTIO_NET_F_MAC | \
+ VIRTIO_NET_F_GUEST_TSO4 | \
VIRTIO_NET_F_MRG_RXBUF | \
VIRTIO_NET_F_STATUS | \
VIRTIO_F_RING_NOTIFY_ON_EMPTY | \
@@ -787,6 +796,13 @@ viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
break;
}
val &= (VIONA_S_HOSTCAPS | link->l_features_hw);
+
+ if ((val & VIRTIO_NET_F_CSUM) == 0)
+ val &= ~VIRTIO_NET_F_HOST_TSO4;
+
+ if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
+ val &= ~VIRTIO_NET_F_GUEST_TSO4;
+
link->l_features = val;
break;
case VNA_IOC_RING_INIT:
@@ -859,6 +875,7 @@ viona_get_mac_capab(viona_link_t *link)
{
mac_handle_t mh = link->l_mh;
uint32_t cap = 0;
+ mac_capab_lso_t lso_cap;
link->l_features_hw = 0;
if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
@@ -871,6 +888,19 @@ viona_get_mac_capab(viona_link_t *link)
}
link->l_cap_csum = cap;
}
+
+ if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
+ mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
+ /*
+ * Virtio doesn't allow for negotiating a maximum LSO
+ * packet size. We have to assume that the guest may
+ * send a maximum length IP packet. Make sure the
+ * underlying MAC can handle an LSO of this size.
+ */
+ if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
+ lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
+ link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
+ }
}
static int
@@ -1827,6 +1857,7 @@ viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz)
size_t len, copied = 0;
caddr_t buf = NULL;
boolean_t end = B_FALSE;
+ const uint32_t features = ring->vr_link->l_features;
ASSERT(msz >= MIN_BUF_SIZE);
@@ -1881,9 +1912,15 @@ viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz)
copied += hdr_sz;
/* Add chksum bits, if needed */
- if ((ring->vr_link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+ if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
uint32_t cksum_flags;
+ if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+ ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+ hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+ hdr->vrh_gso_size = DB_LSOMSS(mp);
+ }
+
mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
&cksum_flags);
if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
@@ -1916,6 +1953,7 @@ viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz)
struct virtio_net_mrgrxhdr *hdr = NULL;
const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr);
boolean_t end = B_FALSE;
+ const uint32_t features = ring->vr_link->l_features;
ASSERT(msz >= MIN_BUF_SIZE);
@@ -2021,9 +2059,15 @@ viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz)
}
/* Add chksum bits, if needed */
- if ((ring->vr_link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+ if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
uint32_t cksum_flags;
+ if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+ ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+ hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+ hdr->vrh_gso_size = DB_LSOMSS(mp);
+ }
+
mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
&cksum_flags);
if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
@@ -2067,8 +2111,29 @@ viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback)
mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
const boolean_t do_merge =
((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);
+ const boolean_t guest_csum =
+ ((link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0);
+ const boolean_t guest_tso4 =
+ ((link->l_features & VIRTIO_NET_F_GUEST_TSO4) != 0);
+
size_t nrx = 0, ndrop = 0;
+ /*
+ * The mac_hw_emul() function, by design, doesn't predicate on
+ * HW_LOCAL_MAC. Since we are in Rx context we know that any
+ * LSO packet must also be from a same-machine sender. We take
+	 * advantage of that and forgo writing a manual loop to
+ * predicate on HW_LOCAL_MAC.
+ *
+ * For checksum emulation we need to predicate on HW_LOCAL_MAC
+ * to avoid calling mac_hw_emul() on packets that don't need
+ * it (thanks to the fact that HCK_IPV4_HDRCKSUM and
+ * HCK_IPV4_HDRCKSUM_OK use the same value). Therefore, we do
+ * the checksum emulation in the second loop.
+	 * the checksum emulation in the per-packet loop below.
+ if (!guest_tso4)
+ mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL);
+
while (mp != NULL) {
mblk_t *next, *pad = NULL;
size_t size;
@@ -2076,6 +2141,25 @@ viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback)
next = mp->b_next;
mp->b_next = NULL;
+
+ if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) {
+ /*
+ * The VIRTIO_NET_HDR_F_DATA_VALID flag only
+ * covers the ULP checksum -- so we still have
+ * to populate the IP header checksum.
+ */
+ if (guest_csum) {
+ mac_hw_emul(&mp, NULL, NULL, MAC_IPCKSUM_EMUL);
+ } else {
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ }
+
+ if (mp == NULL) {
+ mp = next;
+ continue;
+ }
+ }
+
size = msgsize(mp);
/*
@@ -2222,28 +2306,6 @@ viona_desb_release(viona_desb_t *dp)
mutex_exit(&ring->vr_lock);
}
-static int
-viona_mb_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
-{
- size_t mpsize;
- uint8_t *bp;
-
- mpsize = msgsize(mp);
- if (off + sizeof (uint8_t) > mpsize)
- return (-1);
-
- mpsize = MBLKL(mp);
- while (off >= mpsize) {
- mp = mp->b_cont;
- off -= mpsize;
- mpsize = MBLKL(mp);
- }
-
- bp = mp->b_rptr + off;
- *out = *bp;
- return (0);
-}
-
static boolean_t
viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
mblk_t *mp, uint32_t len)
@@ -2252,15 +2314,22 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
const struct ether_header *eth;
uint_t eth_len = sizeof (struct ether_header);
ushort_t ftype;
+ ipha_t *ipha = NULL;
uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
+ uint16_t flags = 0;
- eth = (const struct ether_header *)mp->b_rptr;
if (MBLKL(mp) < sizeof (*eth)) {
/* Buffers shorter than an ethernet header are hopeless */
return (B_FALSE);
}
+ /*
+ * This is guaranteed to be safe thanks to the header copying
+ * done in viona_tx().
+ */
+ eth = (const struct ether_header *)mp->b_rptr;
ftype = ntohs(eth->ether_type);
+
if (ftype == ETHERTYPE_VLAN) {
const struct ether_vlan_header *veth;
@@ -2271,16 +2340,80 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
}
if (ftype == ETHERTYPE_IP) {
- const size_t off = offsetof(ipha_t, ipha_protocol) + eth_len;
+ ipha = (ipha_t *)(mp->b_rptr + eth_len);
- (void) viona_mb_get_uint8(mp, off, &ipproto);
+ ipproto = ipha->ipha_protocol;
} else if (ftype == ETHERTYPE_IPV6) {
- const size_t off = offsetof(ip6_t, ip6_nxt) + eth_len;
+ ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);
- (void) viona_mb_get_uint8(mp, off, &ipproto);
+ ipproto = ip6h->ip6_nxt;
}
/*
+ * We ignore hdr_len because the spec says it can't be
+ * trusted. Besides, our own stack will determine the header
+ * boundary.
+ */
+ if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
+ (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
+ ftype == ETHERTYPE_IP) {
+ uint16_t *cksump;
+ uint32_t cksum;
+ ipaddr_t src = ipha->ipha_src;
+ ipaddr_t dst = ipha->ipha_dst;
+
+ /*
+ * Our native IP stack doesn't set the L4 length field
+ * of the pseudo header when LSO is in play. Other IP
+ * stacks, e.g. Linux, do include the length field.
+ * This is a problem because the hardware expects that
+ * the length field is not set. When it is set it will
+ * cause an incorrect TCP checksum to be generated.
+ * The reason this works in Linux is because Linux
+ * corrects the pseudo-header checksum in the driver
+ * code. In order to get the correct HW checksum we
+ * need to assume the guest's IP stack gave us a bogus
+ * TCP partial checksum and calculate it ourselves.
+ */
+ cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
+ cksum = IP_TCP_CSUM_COMP;
+ cksum += (dst >> 16) + (dst & 0xFFFF) +
+ (src >> 16) + (src & 0xFFFF);
+ cksum = (cksum & 0xFFFF) + (cksum >> 16);
+ *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
+
+ /*
+ * Since viona is a "legacy device", the data stored
+ * by the driver will be in the guest's native endian
+ * format (see sections 2.4.3 and 5.1.6.1 of the
+ * VIRTIO 1.0 spec for more info). At this time the
+ * only guests using viona are x86 and we can assume
+ * little-endian.
+ */
+ lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);
+
+ /*
+ * Hardware, like ixgbe, expects the client to request
+ * IP header checksum offload if it's sending LSO (see
+ * ixgbe_get_context()). Unfortunately, virtio makes
+ * no allowances for negotiating IP header checksum
+ * and HW offload, only TCP checksum. We add the flag
+ * and zero-out the checksum field. This mirrors the
+ * behavior of our native IP stack (which does this in
+ * the interest of HW that expects the field to be
+ * zero).
+ */
+ flags |= HCK_IPV4_HDRCKSUM;
+ ipha->ipha_hdr_checksum = 0;
+ }
+
+ /*
+ * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
+ * HW_LSO, if present, is not lost.
+ */
+ flags |= DB_CKSUMFLAGS(mp);
+
+ /*
* Partial checksum support from the NIC is ideal, since it most
* closely maps to the interface defined by virtio.
*/
@@ -2289,14 +2422,14 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
uint_t start, stuff, end;
/*
- * The lower-level driver is expecting these offsets to be
- * relative to the start of the L3 header rather than the
- * ethernet frame.
+ * MAC expects these offsets to be relative to the
+ * start of the L3 header rather than the L2 frame.
*/
start = hdr->vrh_csum_start - eth_len;
stuff = start + hdr->vrh_csum_offset;
end = len - eth_len;
- mac_hcksum_set(mp, start, stuff, end, 0, HCK_PARTIALCKSUM);
+ flags |= HCK_PARTIALCKSUM;
+ mac_hcksum_set(mp, start, stuff, end, 0, flags);
return (B_TRUE);
}
@@ -2308,7 +2441,8 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
if (ftype == ETHERTYPE_IP) {
if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
(ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
- mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
+ flags |= HCK_FULLCKSUM;
+ mac_hcksum_set(mp, 0, 0, 0, 0, flags);
return (B_TRUE);
}
@@ -2319,7 +2453,8 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
} else if (ftype == ETHERTYPE_IPV6) {
if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
(ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
- mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
+ flags |= HCK_FULLCKSUM;
+ mac_hcksum_set(mp, 0, 0, 0, 0, flags);
return (B_TRUE);
}
@@ -2463,7 +2598,12 @@ viona_tx(viona_link_t *link, viona_vring_t *ring)
mp_tail = mp;
}
- /* Request hardware checksumming, if necessary */
+ /*
+ * Request hardware checksumming, if necessary. If the guest
+ * sent an LSO packet then it must have also negotiated and
+ * requested partial checksum; therefore the LSO logic is
+ * contained within viona_tx_csum().
+ */
if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
(hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {