diff options
| author | Ryan Zezeski <rpz@joyent.com> | 2018-10-23 13:40:32 -0600 |
|---|---|---|
| committer | Ryan Zezeski <rpz@joyent.com> | 2018-10-23 13:40:32 -0600 |
| commit | 92b1263de0f0e8df3856a798e95a142d3bb7547b (patch) | |
| tree | df1e1ab0ae38327ea09288ec69fa91b6b459ef9c /usr/src | |
| parent | ffe74e17db37f9d0aa120a35a5cfcf2d7342dadb (diff) | |
| download | illumos-joyent-92b1263de0f0e8df3856a798e95a142d3bb7547b.tar.gz | |
Revert "OS-2340 vnics should support LSO" [breaks COAL]
This reverts commit 104c53876a87e773ef729efa9419a70fe24933cb.
Diffstat (limited to 'usr/src')
23 files changed, 434 insertions, 1761 deletions
diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c index 22c1c74391..88f80a926b 100644 --- a/usr/src/uts/common/inet/ip/ip_input.c +++ b/usr/src/uts/common/inet/ip/ip_input.c @@ -57,7 +57,6 @@ #include <sys/vtrace.h> #include <sys/isa_defs.h> #include <sys/mac.h> -#include <sys/mac_client.h> #include <net/if.h> #include <net/if_arp.h> #include <net/route.h> @@ -660,13 +659,11 @@ ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, } /* - * If the packet originated from a same-machine sender or - * there is a good HW IP header checksum, we clear the need + * If there is a good HW IP header checksum we clear the need * look at the IP header checksum. */ - if ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) || - ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && - ILL_HCKSUM_CAPABLE(ill) && dohwcksum)) { + if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && + ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { /* Header checksum was ok. Clear the flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; @@ -2259,13 +2256,12 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, * We apply this for all ULP protocols. Does the HW know to * not set the flags for SCTP and other protocols. */ + hck_flags = DB_CKSUMFLAGS(mp); - if ((hck_flags & HCK_FULLCKSUM_OK) || (hck_flags & HW_LOCAL_MAC)) { + if (hck_flags & HCK_FULLCKSUM_OK) { /* - * Either the hardware already verified the checksum - * or the packet is from a same-machine sender in - * which case we assume data integrity. + * Hardware has already verified the checksum. */ return (B_TRUE); } diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index fc90e6f217..2b37528eb9 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -21,7 +21,6 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2018 Joyent, Inc. */ #ifndef _INET_IP_IMPL_H @@ -160,27 +159,9 @@ extern "C" { #define ILL_DIRECT_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) -/* - * Determine if a mblk needs to take the "slow path", aka OTH - * softring. There are multiple reasons why a mblk might take the slow - * path. - * - * o The mblk is not a data message. - * - * o There is more than one outstanding reference to the mblk and it - * does not originate from a local MAC client. If the mblk does - * originate from a local MAC then allow it to pass through with - * more than one reference and leave the copying up to the consumer. - * - * o The IP header is not aligned (we assume alignment in the checksum - * routine). - * - * o The mblk doesn't contain enough data to populate a simple IP header. - */ +/* This macro is used by the mac layer */ #define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \ - (DB_TYPE(mp) != M_DATA || \ - (DB_REF(mp) != 1 && ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) == 0)) || \ - !OK_32PTR(ipha) || \ + (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \ (((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr)) /* diff --git a/usr/src/uts/common/io/bridge.c b/usr/src/uts/common/io/bridge.c index 587de5c131..97ee9f1f0e 100644 --- a/usr/src/uts/common/io/bridge.c +++ b/usr/src/uts/common/io/bridge.c @@ -23,7 +23,6 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. - * Copyright 2018 Joyent, Inc. */ /* @@ -42,7 +41,6 @@ #include <sys/modctl.h> #include <sys/note.h> #include <sys/param.h> -#include <sys/pattr.h> #include <sys/policy.h> #include <sys/sdt.h> #include <sys/stat.h> @@ -1694,8 +1692,7 @@ bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick, * The passed-in tci is the "impossible" value 0xFFFF when no tag is present. */ static mblk_t * -reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid, - boolean_t keep_flags) +reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid) { boolean_t source_has_tag = (tci != 0xFFFF); mblk_t *mpcopy; @@ -1707,13 +1704,8 @@ reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid, if (mp == NULL) return (mp); - /* - * A forwarded packet cannot have HW offloads enabled unless - * the destination is known to be local to the host and HW - * offloads haven't been emulated. - */ - if (!keep_flags) - DB_CKSUMFLAGS(mp) = 0; + /* No forwarded packet can have hardware checksum enabled */ + DB_CKSUMFLAGS(mp) = 0; /* Get the no-modification cases out of the way first */ if (!source_has_tag && vlanid == pvid) /* 1a */ @@ -1914,46 +1906,17 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, blp->bl_trillthreads++; mutex_exit(&blp->bl_trilllock); update_header(mp, hdr_info, B_FALSE); - - if (is_xmit) { - mac_hw_emul(&mp, NULL, NULL, - MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - - if (mp == NULL) { - KIINCR(bki_drops); - goto done; - } - } - - while (mp != NULL) { - mblk_t *next = mp->b_next; - - mp->b_next = NULL; - - /* - * All trill data frames have - * Inner.VLAN. - */ - mp = reform_vlan_header(mp, vlanid, tci, - 0, B_FALSE); - - if (mp == NULL) { - /* - * Make sure to free - * any remaining - * segments. - */ - freemsgchain(next); - KIINCR(bki_drops); - goto done; - } - - trill_encap_fn(tdp, blp, hdr_info, mp, - bfp->bf_trill_nick); - mp = next; + if (is_xmit) + mp = mac_fix_cksum(mp); + /* all trill data frames have Inner.VLAN */ + mp = reform_vlan_header(mp, vlanid, tci, 0); + if (mp == NULL) { + KIINCR(bki_drops); + fwd_unref(bfp); + return (NULL); } - -done: + trill_encap_fn(tdp, blp, hdr_info, mp, + bfp->bf_trill_nick); mutex_enter(&blp->bl_trilllock); if (--blp->bl_trillthreads == 0 && blp->bl_trilldata == NULL) @@ -1995,68 +1958,31 @@ done: mpsend = copymsg(mp); } - /* - * If the destination is not local to the host - * then we need to emulate HW offloads because - * we can't guarantee the forwarding - * destination provides them. - */ - if (!from_trill && is_xmit && - !(bfp->bf_flags & BFF_LOCALADDR)) { - mac_hw_emul(&mpsend, NULL, NULL, - MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; - } - } - - /* - * The HW emulation above may have segmented - * an LSO mblk. - */ - while ((mpsend != NULL) && - !(bfp->bf_flags & BFF_LOCALADDR)) { - mblk_t *next = mpsend->b_next; + if (!from_trill && is_xmit) + mpsend = mac_fix_cksum(mpsend); - mpsend->b_next = NULL; - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid, B_FALSE); - - if (mpsend == NULL) { - KIINCR(bki_drops); - mpsend = next; - continue; - } - - KIINCR(bki_forwards); - KLPINCR(blpsend, bkl_xmit); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, - mpsend); - freemsg(mpsend); - mpsend = next; + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid); + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; } + KIINCR(bki_forwards); /* * No need to bump up the link reference count, as * the forwarding entry itself holds a reference to * the link. */ if (bfp->bf_flags & BFF_LOCALADDR) { - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid, B_TRUE); - - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; - } - - KIINCR(bki_forwards); mac_rx_common(blpsend->bl_mh, NULL, mpsend); + } else { + KLPINCR(blpsend, bkl_xmit); + MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, + mpsend); + freemsg(mpsend); } } - /* * Handle a special case: if we're transmitting to the original * link, then check whether the localaddr flag is set. If it @@ -2092,7 +2018,7 @@ done: * Inner.VLAN */ mpsend = reform_vlan_header(mpsend, - vlanid, tci, 0, B_FALSE); + vlanid, tci, 0); if (mpsend == NULL) { KIINCR(bki_drops); } else { @@ -2143,57 +2069,25 @@ done: mpsend = copymsg(mp); } - /* - * In this case, send to all links connected - * to the bridge. Some of these destinations - * may not provide HW offload -- so just - * emulate it here. - */ - if (!from_trill && is_xmit) { - mac_hw_emul(&mpsend, NULL, NULL, - MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; - } - } - - /* - * The HW emulation above may have segmented - * an LSO mblk. - */ - while (mpsend != NULL) { - mblk_t *next = mpsend->b_next; - - mpsend->b_next = NULL; - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid, B_FALSE); - - if (mpsend == NULL) { - KIINCR(bki_drops); - mpsend = next; - continue; - } - - if (hdr_info->mhi_dsttype == - MAC_ADDRTYPE_UNICAST) - KIINCR(bki_unknown); - else - KIINCR(bki_mbcast); + if (!from_trill && is_xmit) + mpsend = mac_fix_cksum(mpsend); - KLPINCR(blpsend, bkl_xmit); - if ((mpcopy = copymsg(mpsend)) != NULL) { - mac_rx_common(blpsend->bl_mh, NULL, - mpcopy); - } - - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, - mpsend); - freemsg(mpsend); - mpsend = next; + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid); + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; } + if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST) + KIINCR(bki_unknown); + else + KIINCR(bki_mbcast); + KLPINCR(blpsend, bkl_xmit); + if ((mpcopy = copymsg(mpsend)) != NULL) + mac_rx_common(blpsend->bl_mh, NULL, mpcopy); + MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend); + freemsg(mpsend); link_unref(blpsend); } } diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index c792251052..904cb47ba4 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -30,7 +30,6 @@ #include <sys/sysmacros.h> #include <sys/strsubr.h> -#include <sys/pattr.h> #include <sys/strsun.h> #include <sys/vlan.h> #include <sys/dld_impl.h> @@ -163,18 +162,6 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip, uint16_t cvid, cpri; int err; - /* - * If this message is from a same-machine sender, then - * there may be HW checksum offloads to emulate. - */ - if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { - mblk_t *tmpnext = mp->b_next; - - mp->b_next = NULL; - mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); - mp->b_next = tmpnext; - } - DLS_PREPARE_PKT(dlp->dl_mh, mp, &cmhi, err); if (err != 0) break; @@ -369,22 +356,6 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, int err, rval; /* - * The mac_hw_emul() function, by design, doesn't predicate on - * HW_LOCAL_MAC. But since we are in Rx context we know that - * any LSO packet must also be from a same-machine sender. We - * take advantage of that and forgoe writing a manual loop to - * predicate on HW_LOCAL_MAC. - * - * But for checksum emulation we need to predicate on - * HW_LOCAL_MAC to avoid calling mac_hw_emul() on packets that - * don't need it (thanks to the fact that HCK_IPV4_HDRCKSUM - * and HCK_IPV4_HDRCKSUM_OK use the same value). Therefore we - * do the checksum emulation in the second loop and in - * subchain matching. - */ - mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL); - - /* * Walk the packet chain. */ for (; mp != NULL; mp = nextp) { @@ -393,18 +364,6 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, */ accepted = B_FALSE; - /* - * If this message is from a same-machine sender, then - * there may be HW checksum offloads to emulate. - */ - if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { - mblk_t *tmpnext = mp->b_next; - - mp->b_next = NULL; - mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); - mp->b_next = tmpnext; - } - DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); if (err != 0) { atomic_inc_32(&(dlp->dl_unknowns)); @@ -607,13 +566,7 @@ dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, dls_head_t *dhp; mod_hash_key_t key; - /* - * We expect to deal with only a single packet. - */ - ASSERT3P(mp->b_next, ==, NULL); - DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); - if (err != 0) goto drop; diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index a63a6a5c61..2176f7d2af 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -1670,7 +1670,7 @@ mac_client_clear_flow_cb(mac_client_handle_t mch) flow_entry_t *flent = mcip->mci_flent; mutex_enter(&flent->fe_lock); - flent->fe_cb_fn = (flow_fn_t)mac_rx_def; + flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; flent->fe_flags |= FE_MC_NO_DATAPATH; diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c index 3b674be1d0..1ff33c3578 100644 --- a/usr/src/uts/common/io/mac/mac_bcast.c +++ b/usr/src/uts/common/io/mac/mac_bcast.c @@ -21,7 +21,6 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -147,7 +146,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) uint64_t gen; uint_t i; mblk_t *mp_chain1; - flow_entry_t *flent; + flow_entry_t *flent; int err; rw_enter(&mip->mi_rw_lock, RW_READER); @@ -183,6 +182,13 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) */ if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL) break; + /* + * Fix the checksum for packets originating + * from the local machine. + */ + if ((src_mcip != NULL) && + (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL) + break; FLOW_TRY_REFHOLD(flent, err); if (err != 0) { diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index de5ef6121f..da944d79d4 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -114,7 +114,6 @@ #include <sys/stream.h> #include <sys/strsun.h> #include <sys/strsubr.h> -#include <sys/pattr.h> #include <sys/dlpi.h> #include <sys/modhash.h> #include <sys/mac_impl.h> @@ -1357,7 +1356,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_mip = mip; mcip->mci_upper_mip = NULL; - mcip->mci_rx_fn = mac_rx_def; + mcip->mci_rx_fn = mac_pkt_drop; mcip->mci_rx_arg = NULL; mcip->mci_rx_p_fn = NULL; mcip->mci_rx_p_arg = NULL; @@ -1629,7 +1628,7 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) void mac_rx_clear(mac_client_handle_t mch) { - mac_rx_set(mch, mac_rx_def, NULL); + mac_rx_set(mch, mac_pkt_drop, NULL); } void @@ -2970,7 +2969,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip, mac_misc_stat_delete(flent); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_rx_def; + flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; @@ -3591,13 +3590,6 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) : msgdsize(mp_chain)); - /* - * There's a chance this primary client might be part - * of a bridge and the packet forwarded to a local - * receiver -- mark the packet accordingly. - */ - DB_CKSUMFLAGS(mp_chain) |= HW_LOCAL_MAC; - MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip); if (mp_chain == NULL) { cookie = NULL; @@ -4011,36 +4003,21 @@ mac_client_get_effective_resources(mac_client_handle_t mch, * The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched * after classification by mac_rx_deliver(). */ + static void mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, - boolean_t loopback) + boolean_t loopback, boolean_t local) { - mblk_t *mp_next; - boolean_t local = (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0; + mblk_t *mp_copy, *mp_next; if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag || (mpip->mpi_do_fixups && local)) { - mblk_t *mp_copy; - mp_copy = copymsg(mp); if (mp_copy == NULL) return; - /* - * The consumer has requested we emulate HW offloads - * for host-local packets. - */ if (mpip->mpi_do_fixups && local) { - /* - * Remember that copymsg() doesn't copy - * b_next, so we are only passing a single - * packet to mac_hw_emul(). Also keep in mind - * that mp_copy will become an mblk chain if - * the argument is an LSO message. - */ - mac_hw_emul(&mp_copy, NULL, NULL, - MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - + mp_copy = mac_fix_cksum(mp_copy); if (mp_copy == NULL) return; } @@ -4050,24 +4027,16 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, if (mp_copy == NULL) return; } - - /* - * There is code upstack that can't deal with message - * chains. - */ - for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) { - mp_next = tmp->b_next; - tmp->b_next = NULL; - mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback); - } - - return; + mp_next = NULL; + } else { + mp_copy = mp; + mp_next = mp->b_next; } + mp_copy->b_next = NULL; - mp_next = mp->b_next; - mp->b_next = NULL; - mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback); - mp->b_next = mp_next; + mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback); + if (mp_copy == mp) + mp->b_next = mp_next; } /* @@ -4109,7 +4078,7 @@ mac_is_mcast(mac_impl_t *mip, mblk_t *mp) */ void mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, - mac_client_impl_t *sender) + mac_client_impl_t *sender, boolean_t local) { mac_promisc_impl_t *mpip; mac_cb_t *mcb; @@ -4150,7 +4119,8 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, if (is_sender || mpip->mpi_type == MAC_CLIENT_PROMISC_ALL || is_mcast) { - mac_promisc_dispatch_one(mpip, mp, is_sender); + mac_promisc_dispatch_one(mpip, mp, is_sender, + local); } } } @@ -4180,7 +4150,8 @@ mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain) mpip = (mac_promisc_impl_t *)mcb->mcb_objp; if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED && !is_mcast) { - mac_promisc_dispatch_one(mpip, mp, B_FALSE); + mac_promisc_dispatch_one(mpip, mp, B_FALSE, + B_FALSE); } } } @@ -4278,9 +4249,8 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) mac_impl_t *mip = (mac_impl_t *)mh; /* - * Some capabilities are restricted when there are more than one active - * clients on the MAC resource. The ones noted below are safe, - * independent of that count. + * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM, + * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised. */ if (mip->mi_nactiveclients > 1) { switch (cap) { @@ -4288,7 +4258,6 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) return (B_TRUE); case MAC_CAPAB_LEGACY: case MAC_CAPAB_HCKSUM: - case MAC_CAPAB_LSO: case MAC_CAPAB_NO_NATIVEVLAN: break; default: diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 70585df698..6eea8b0343 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -3496,7 +3496,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs) ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); - mac_drop_chain(mac_srs->srs_first, "SRS free"); + mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); mac_srs_ring_free(mac_srs); mac_srs_soft_rings_free(mac_srs); mac_srs_fanout_list_free(mac_srs); diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c index 62612122d6..aa4985fe4c 100644 --- a/usr/src/uts/common/io/mac/mac_flow.c +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -22,7 +22,6 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2018 Joyent, Inc. */ #include <sys/strsun.h> @@ -230,7 +229,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_rx_def; + flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; flent->fe_index = -1; } (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index cb1a76aef6..d739fad87a 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -688,7 +688,7 @@ mac_trill_snoop(mac_handle_t mh, mblk_t *mp) mac_impl_t *mip = (mac_impl_t *)mh; if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp, NULL); + mac_promisc_dispatch(mip, mp, NULL, B_FALSE); } /* @@ -708,7 +708,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) * this MAC, pass them a copy if appropriate. */ if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp_chain, NULL); + mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE); if (mr != NULL) { /* @@ -1541,22 +1541,15 @@ mac_hcksum_clone(const mblk_t *src, mblk_t *dst) ASSERT3U(DB_TYPE(dst), ==, M_DATA); /* - * Do these assignments unconditionally, rather than only when - * flags is non-zero. This protects a situation where zeroed - * hcksum data does not make the jump onto an mblk_t with - * stale data in those fields. It's important to copy all - * possible flags (HCK_* as well as HW_*) and not just the - * checksum specific flags. Dropping flags during a clone - * could result in dropped packets. If the caller has good - * reason to drop those flags then it should do it manually, - * after the clone. + * Do these assignments unconditionally, rather than only when flags is + * non-zero. This protects a situation where zeroed hcksum data does + * not make the jump onto an mblk_t with stale data in those fields. */ - DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src); + DB_CKSUMFLAGS(dst) = (DB_CKSUMFLAGS(src) & HCK_FLAGS); DB_CKSUMSTART(dst) = DB_CKSUMSTART(src); DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src); DB_CKSUMEND(dst) = DB_CKSUMEND(src); DB_CKSUM16(dst) = DB_CKSUM16(src); - DB_LSOMSS(dst) = DB_LSOMSS(src); } void diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index e42cbd1320..59d59287b4 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -968,7 +968,6 @@ #include <sys/types.h> #include <sys/callb.h> -#include <sys/pattr.h> #include <sys/sdt.h> #include <sys/strsubr.h> #include <sys/strsun.h> @@ -1328,7 +1327,7 @@ int mac_srs_worker_wakeup_ticks = 0; * b_prev may be set to the fanout hint \ * hence can't use freemsg directly \ */ \ - mac_drop_chain(mp_chain, "SRS Tx max queue"); \ + mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ DTRACE_PROBE1(tx_queued_hiwat, \ mac_soft_ring_set_t *, srs); \ enqueue = 0; \ @@ -1347,11 +1346,11 @@ int mac_srs_worker_wakeup_ticks = 0; if (!(srs->srs_type & SRST_TX)) \ mutex_exit(&srs->srs_bw->mac_bw_lock); -#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \ - mac_drop_pkt((chain), (s)); \ +#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ + mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ /* increment freed stats */ \ - (srs)->srs_tx.st_stat.mts_sdrops++; \ - (cookie) = (mac_tx_cookie_t)(srs); \ + mac_srs->srs_tx.st_stat.mts_sdrops++; \ + cookie = (mac_tx_cookie_t)srs; \ } #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ @@ -2322,7 +2321,7 @@ check_again: if (smcip->mci_mip->mi_promisc_list != NULL) { mutex_exit(lock); mac_promisc_dispatch(smcip->mci_mip, - head, NULL); + head, NULL, B_FALSE); mutex_enter(lock); } } @@ -2894,7 +2893,7 @@ again: mac_srs->srs_bw->mac_bw_sz -= sz; mac_srs->srs_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_srs->srs_bw->mac_bw_lock); - mac_drop_chain(head, "Rx no bandwidth"); + mac_pkt_drop(NULL, NULL, head, B_FALSE); goto leave_poll; } else { mutex_exit(&mac_srs->srs_bw->mac_bw_lock); @@ -3276,10 +3275,9 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, } /* - * MAC SRS receive side routine. If the data is coming from the - * network (i.e. from a NIC) then this is called in interrupt context. - * If the data is coming from a local sender (e.g. mac_tx_send() or - * bridge_forward()) then this is not called in interrupt context. + * mac_rx_srs_process + * + * Receive side routine called from the interrupt path. * * loopback is set to force a context switch on the loopback * path between MAC clients. @@ -3339,7 +3337,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, mac_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_bw->mac_bw_lock); mutex_exit(&mac_srs->srs_lock); - mac_drop_chain(mp_chain, "Rx no bandwidth"); + mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); return; } else { if ((mac_bw->mac_bw_sz + sz) <= @@ -3461,8 +3459,7 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); if (flag & MAC_DROP_ON_NO_DESC) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, - "Tx no desc"); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); } else { if (mac_srs->srs_first != NULL) wakeup_worker = B_FALSE; @@ -3525,8 +3522,7 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, - "Tx SRS hiwat"); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); } else { MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, cnt, sz); @@ -3899,8 +3895,7 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, cookie = (mac_tx_cookie_t)mac_srs; *ret_mp = mp_chain; } else { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, - "Tx no bandwidth"); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); } mutex_exit(&mac_srs->srs_lock); return (cookie); @@ -4346,14 +4341,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, obytes += (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); - /* - * Mark all packets as local so that a - * receiver can determine if a packet arrived - * from a local source or from the network. - * This allows some consumers to avoid - * unecessary work like checksum computation. - */ - DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC; CHECK_VID_AND_ADD_TAG(mp); MAC_TX(mip, ring, mp, src_mcip); @@ -4386,6 +4373,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, flow_entry_t *dst_flow_ent; void *flow_cookie; size_t pkt_size; + mblk_t *mp1; next = mp->b_next; mp->b_next = NULL; @@ -4395,25 +4383,49 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, CHECK_VID_AND_ADD_TAG(mp); /* - * Mark all packets as local so that a receiver can - * determine if a packet arrived from a local source - * or from the network. This allows some consumers to - * avoid unecessary work like checksum computation. - */ - DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC; - - /* * Find the destination. */ dst_flow_ent = mac_tx_classify(mip, mp); if (dst_flow_ent != NULL) { + size_t hdrsize; + int err = 0; + + if (mip->mi_info.mi_nativemedia == DL_ETHER) { + struct ether_vlan_header *evhp = + (struct ether_vlan_header *)mp->b_rptr; + + if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) + hdrsize = sizeof (*evhp); + else + hdrsize = sizeof (struct ether_header); + } else { + mac_header_info_t mhi; + + err = mac_header_info((mac_handle_t)mip, + mp, &mhi); + if (err == 0) + hdrsize = mhi.mhi_hdrsize; + } + /* * Got a matching flow. It's either another * MAC client, or a broadcast/multicast flow. + * Make sure the packet size is within the + * allowed size. If not drop the packet and + * move to next packet. */ + if (err != 0 || + (pkt_size - hdrsize) > mip->mi_sdu_max) { + oerrors++; + DTRACE_PROBE2(loopback__drop, size_t, pkt_size, + mblk_t *, mp); + freemsg(mp); + mp = next; + FLOW_REFRELE(dst_flow_ent); + continue; + } flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); - if (flow_cookie != NULL) { /* * The vnic_bcast_send function expects @@ -4431,7 +4443,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * bypass is set. */ boolean_t do_switch; - mac_client_impl_t *dst_mcip = dst_flow_ent->fe_mcip; @@ -4448,18 +4459,20 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * macro. */ if (mip->mi_promisc_list != NULL) { - mac_promisc_dispatch(mip, mp, src_mcip); + mac_promisc_dispatch(mip, mp, src_mcip, + B_TRUE); } do_switch = ((src_mcip->mci_state_flags & dst_mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE) != 0); - (dst_flow_ent->fe_cb_fn)( - dst_flow_ent->fe_cb_arg1, - dst_flow_ent->fe_cb_arg2, - mp, do_switch); - + if ((mp1 = mac_fix_cksum(mp)) != NULL) { + (dst_flow_ent->fe_cb_fn)( + dst_flow_ent->fe_cb_arg1, + dst_flow_ent->fe_cb_arg2, + mp1, do_switch); + } } FLOW_REFRELE(dst_flow_ent); } else { @@ -4816,7 +4829,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { - mac_drop_chain(mp_chain, "Tx softring no desc"); + mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); /* increment freed stats */ ringp->s_ring_drops += cnt; cookie = (mac_tx_cookie_t)ringp; @@ -4860,8 +4873,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, * b_prev may be set to the fanout hint * hence can't use freemsg directly */ - mac_drop_chain(mp_chain, - "Tx softring max queue"); + mac_pkt_drop(NULL, NULL, + mp_chain, B_FALSE); DTRACE_PROBE1(tx_queued_hiwat, mac_soft_ring_t *, ringp); enqueue = B_FALSE; diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index c62bd997a8..34f89328c3 100644 --- a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring) ASSERT((softring->s_ring_state & (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) == (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE)); - mac_drop_chain(softring->s_ring_first, "softring free"); + mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE); softring->s_ring_tx_arg2 = NULL; mac_soft_ring_stat_delete(softring); mac_callback_free(softring->s_ring_notify_cb_list); diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index 2529d9c8c9..a877ca258c 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -48,74 +48,6 @@ #include <inet/sadb.h> #include <inet/ipsecesp.h> #include <inet/ipsecah.h> -#include <inet/tcp.h> -#include <inet/udp_impl.h> - -/* - * The next two functions are used for dropping packets or chains of - * packets, respectively. We could use one function for both but - * separating the use cases allows us to specify intent and prevent - * dropping more data than intended. - * - * The purpose of these functions is to aid the debugging effort, - * especially in production. Rather than use freemsg()/freemsgchain(), - * it's preferable to use these functions when dropping a packet in - * the MAC layer. These functions should only be used during - * unexpected conditions. That is, any time a packet is dropped - * outside of the regular, successful datapath. Consolidating all - * drops on these functions allows the user to trace one location and - * determine why the packet was dropped based on the msg. It also - * allows the user to inspect the packet before it is freed. Finally, - * it allows the user to avoid tracing freemsg()/freemsgchain() thus - * keeping the hot path running as efficiently as possible. - * - * NOTE: At this time not all MAC drops are aggregated on these - * functions; but that is the plan. This comment should be erased once - * completed. - */ - -/*PRINTFLIKE2*/ -void -mac_drop_pkt(mblk_t *mp, const char *fmt, ...) -{ - va_list adx; - char msg[128]; - char *msgp = msg; - - ASSERT3P(mp->b_next, ==, NULL); - - va_start(adx, fmt); - (void) vsnprintf(msgp, sizeof (msg), fmt, adx); - va_end(adx); - - DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); - freemsg(mp); -} - -/*PRINTFLIKE2*/ -void -mac_drop_chain(mblk_t *chain, const char *fmt, ...) -{ - va_list adx; - char msg[128]; - char *msgp = msg; - - va_start(adx, fmt); - (void) vsnprintf(msgp, sizeof (msg), fmt, adx); - va_end(adx); - - /* - * We could use freemsgchain() for the actual freeing but - * since we are already walking the chain to fire the dtrace - * probe we might as well free the msg here too. - */ - for (mblk_t *mp = chain, *next; mp != NULL; ) { - next = mp->b_next; - DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); - freemsg(mp); - mp = next; - } -} /* * Copy an mblk, preserving its hardware checksum flags. @@ -157,1121 +89,274 @@ mac_copymsgchain_cksum(mblk_t *mp) } /* - * Perform software checksum on a single message, if needed. The - * emulation performed is determined by an intersection of the mblk's - * flags and the emul flags requested. The emul flags are documented - * in mac.h. + * Process the specified mblk chain for proper handling of hardware + * checksum offload. This routine is invoked for loopback traffic + * between MAC clients. + * The function handles a NULL mblk chain passed as argument. */ -static mblk_t * -mac_sw_cksum(mblk_t *mp, mac_emul_t emul) +mblk_t * +mac_fix_cksum(mblk_t *mp_chain) { - mblk_t *orig = mp, *skipped_hdr = NULL; + mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; uint32_t flags, start, stuff, end, value; - uint16_t len; - uint32_t offset; - uint16_t etype; - struct ether_header *ehp; - - /* - * This function should only be called from mac_hw_emul() - * which handles mblk chains and the shared ref case. - */ - ASSERT3P(mp->b_next, ==, NULL); - - mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL); - - /* - * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) because - * we don't want to mask-out the HW_LOCAL_MAC flag. - */ - flags = DB_CKSUMFLAGS(mp); - - /* Why call this if checksum emulation isn't needed? */ - ASSERT3U(flags & (HCK_FLAGS), !=, 0); - - /* - * Ethernet, and optionally VLAN header. mac_hw_emul() has - * already verified we have enough data to read the L2 header. - */ - ehp = (struct ether_header *)mp->b_rptr; - if (ntohs(ehp->ether_type) == VLAN_TPID) { - struct ether_vlan_header *evhp; - - evhp = (struct ether_vlan_header *)mp->b_rptr; - etype = ntohs(evhp->ether_type); - offset = sizeof (struct ether_vlan_header); - } else { - etype = ntohs(ehp->ether_type); - offset = sizeof (struct ether_header); - } - - /* - * If this packet isn't IPv4, then leave it alone. We still - * need to add IPv6 support and we don't want to affect non-IP - * traffic like ARP. - */ - if (etype != ETHERTYPE_IP) - return (mp); - - ASSERT3U(MBLKL(mp), >=, offset); - - /* - * If the first mblk of this packet contains only the ethernet - * header, skip past it for now. Packets with their data - * contained in only a single mblk can then use the fastpaths - * tuned to that possibility. - */ - if (MBLKL(mp) == offset) { - offset -= MBLKL(mp); - /* This is guaranteed by mac_hw_emul(). */ - ASSERT3P(mp->b_cont, !=, NULL); - skipped_hdr = mp; - mp = mp->b_cont; - } - - /* - * Both full and partial checksum rely on finding the IP - * header in the current mblk. Our native TCP stack honors - * this assumption but it's prudent to guard our future - * clients that might not honor this contract. - */ - ASSERT3U(MBLKL(mp), >=, offset + sizeof (ipha_t)); - if (MBLKL(mp) < (offset + sizeof (ipha_t))) { - mac_drop_pkt(mp, "mblk doesn't contain IP header"); - return (NULL); - } - - /* - * We are about to modify the header mblk; make sure we are - * modifying our own copy. The code that follows assumes that - * the IP/ULP headers exist in this mblk (and drops the - * message if they don't). - */ - if (DB_REF(mp) > 1) { - mblk_t *tmp = copyb(mp); - - if (tmp == NULL) { - mac_drop_pkt(mp, "copyb failed"); - return (NULL); - } - - tmp->b_cont = mp->b_cont; - freeb(mp); - mp = tmp; - } - - if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { - ipha_t *ipha = (ipha_t *)(mp->b_rptr + offset); - - - if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { - ipaddr_t src, dst; - uint32_t cksum; - uint16_t *up; - uint8_t proto; - - /* - * This code assumes a "simple" IP header (20 - * bytes, no options). IPv4 options are mostly - * a historic artifact. The one slight - * exception is Router Alert, but we don't - * expect such a packet to land here. - */ - proto = ipha->ipha_protocol; - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - if (ipha->ipha_version_and_hdr_length != - IP_SIMPLE_HDR_VERSION) { - mac_drop_pkt(mp, "not simple IP header"); - return (NULL); - } - - /* Get a pointer to the ULP checksum. */ - switch (proto) { - case IPPROTO_TCP: - ASSERT3U(MBLKL(mp), >=, - (offset + sizeof (ipha_t) + - sizeof (tcph_t))); - if (MBLKL(mp) < (offset + sizeof (ipha_t) + - sizeof (tcph_t))) { - mac_drop_pkt(mp, - "mblk doesn't contain TCP header"); - return (NULL); - } - - /* LINTED: improper alignment cast */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - case IPPROTO_UDP: - ASSERT3U(MBLKL(mp), >=, - (offset + sizeof (ipha_t) + - sizeof (udpha_t))); - if (MBLKL(mp) < (offset + sizeof (ipha_t) + - sizeof (udpha_t))) { - mac_drop_pkt(mp, - "mblk doesn't contain UDP header"); - return (NULL); - } - - /* LINTED: improper alignment cast */ - up = IPH_UDPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - default: - mac_drop_pkt(orig, "unexpected protocol: %d", - proto); - return (NULL); - } - - /* Pseudo-header checksum. */ - src = ipha->ipha_src; - dst = ipha->ipha_dst; - len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; - - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - cksum += htons(len); - - /* - * The checksum value stored in the packet - * needs to be correct. Compute it here. - */ - *up = 0; - cksum += (((proto) == IPPROTO_UDP) ? - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); - cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + - offset, cksum); - *(up) = (uint16_t)(cksum ? cksum : ~cksum); - - } - - /* We always update the ULP checksum flags. */ - if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { - flags &= ~HCK_FULLCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; - } - - /* - * Out of paranoia, and for the sake of correctness, - * we won't calulate the IP header checksum if it's - * already populated. While unlikely, it's possible to - * write code that might end up calling mac_sw_cksum() - * twice on the same mblk (performing both LSO and - * checksum emualtion in a single mblk chain loop -- - * the LSO emulation inserts a new chain into the - * existing chain and then the loop iterates back over - * the new segments and emulates the checksum a second - * time). Normally this wouldn't be a problem, because - * the HCK_*_OK flags are supposed to indicate that we - * don't need to do peform the work. But - * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the - * same value; so we cannot use these flags to - * determine if the IP header checksum has already - * been calculated or not. Luckily, if IP requests - * HCK_IPV4_HDRCKSUM, then the IP header checksum will - * be zero. So this test works just as well as - * checking the flag. However, in the future, we - * should fix the HCK_* flags. - */ - if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS) && - ipha->ipha_hdr_checksum == 0) { - ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); - flags &= ~HCK_IPV4_HDRCKSUM; - flags |= HCK_IPV4_HDRCKSUM_OK; - } - } - - if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { - uint16_t *up, partial, cksum; - uchar_t *ipp; /* ptr to beginning of IP header */ - - ASSERT3U(MBLKL(mp), >=, - (offset + sizeof (ipha_t) + sizeof (tcph_t))); - if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (tcph_t))) { - mac_drop_pkt(mp, "mblk doesn't contain TCP header"); - return (NULL); - } - - ipp = mp->b_rptr + offset; - /* LINTED: cast may result in improper alignment */ - up = (uint16_t *)((uchar_t *)ipp + stuff); - partial = *up; - *up = 0; - - ASSERT3S(end, >, start); - cksum = ~IP_CSUM_PARTIAL(mp, offset + start, partial); - *up = cksum != 0 ? cksum : ~cksum; - } - - /* We always update the ULP checksum flags. */ - if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { - flags &= ~HCK_PARTIALCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; - } - - mac_hcksum_set(mp, start, stuff, end, value, flags); + for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { + uint16_t len; + uint32_t offset; + struct ether_header *ehp; + uint16_t sap; + mblk_t *skipped_hdr = NULL; - /* Don't forget to reattach the header. */ - if (skipped_hdr != NULL) { - ASSERT3P(skipped_hdr->b_cont, ==, mp); + mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags); + if (flags == 0) + continue; /* - * Duplicate the HCKSUM data into the header mblk. - * This mimics mac_add_vlan_tag which ensures that - * both the first mblk _and_ the first data bearing - * mblk possess the HCKSUM information. Consumers like - * IP will end up discarding the ether_header mblk, so - * for now, it is important that the data be available - * in both places. + * Since the processing of checksum offload for loopback + * traffic requires modification of the packet contents, + * ensure sure that we are always modifying our own copy. */ - mac_hcksum_clone(mp, skipped_hdr); - mp = skipped_hdr; - } - - return (mp); -} - -/* - * Build a single data segment from an LSO packet. The mblk chain - * returned, seg_head, represents the data segment and is always - * exactly seg_len bytes long. The lso_mp and offset input/output - * parameters track our position in the LSO packet. This function - * exists solely as a helper to mac_sw_lso(). - * - * Case A - * - * The current lso_mp is larger than the requested seg_len. The - * beginning of seg_head may start at the beginning of lso_mp or - * offset into it. In either case, a single mblk is returned, and - * *offset is updated to reflect our new position in the current - * lso_mp. - * - * +----------------------------+ - * | in *lso_mp / out *lso_mp | - * +----------------------------+ - * ^ ^ - * | | - * | | - * | | - * +------------------------+ - * | seg_head | - * +------------------------+ - * ^ ^ - * | | - * in *offset = 0 out *offset = seg_len - * - * |------ seg_len ----| - * - * - * +------------------------------+ - * | in *lso_mp / out *lso_mp | - * +------------------------------+ - * ^ ^ - * | | - * | | - * | | - * +------------------------+ - * | seg_head | - * +------------------------+ - * ^ ^ - * | | - * in *offset = N out *offset = N + seg_len - * - * |------ seg_len ----| - * - * - * - * Case B - * - * The requested seg_len consumes exactly the rest of the lso_mp. - * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr. - * The seg_head may start at the beginning of the lso_mp or at some - * offset into it. In either case we return a single mblk, reset - * *offset to zero, and walk to the next lso_mp. - * - * +------------------------+ +------------------------+ - * | in *lso_mp |---------->| out *lso_mp | - * +------------------------+ +------------------------+ - * ^ ^ ^ - * | | | - * | | out *offset = 0 - * | | - * +------------------------+ - * | seg_head | - * +------------------------+ - * ^ - * | - * in *offset = 0 - * - * |------ seg_len ----| - * - * - * - * +----------------------------+ +------------------------+ - * | in *lso_mp |---------->| out *lso_mp | - * +----------------------------+ +------------------------+ - * ^ ^ ^ - * | | | - * | | out *offset = 0 - * | | - * +------------------------+ - * | seg_head | - * +------------------------+ - * ^ - * | - * in *offset = N - * - * |------ seg_len ----| - * - * - * Case C - * - * The requested seg_len is greater than the current lso_mp. In - * this case we must consume LSO mblks until we have enough data to - * satisfy either case (A) or (B) above. We will return multiple - * mblks linked via b_cont, offset will be set based on the cases - * above, and lso_mp will walk forward at least one mblk, but maybe - * more. - * - * N.B. This digram is not exhaustive. The seg_head may start on - * the beginning of an lso_mp. The seg_tail may end exactly on the - * boundary of an lso_mp. And there may be two (in this case the - * middle block wouldn't exist), three, or more mblks in the - * seg_head chain. This is meant as one example of what might - * happen. The main thing to remember is that the seg_tail mblk - * must be one of case (A) or (B) above. - * - * +------------------+ +----------------+ +------------------+ - * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp | - * +------------------+ +----------------+ +------------------+ - * ^ ^ ^ ^ ^ ^ - * | | | | | | - * | | | | | | - * | | | | | | - * | | | | | | - * +------------+ +----------------+ +------------+ - * | seg_head |--->| |--->| seg_tail | - * +------------+ +----------------+ +------------+ - * ^ ^ - * | | - * in *offset = N out *offset = MBLKL(seg_tail) - * - * |------------------- seg_len -------------------| - * - */ -static mblk_t * -build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len) -{ - mblk_t *seg_head, *seg_tail, *seg_mp; - - ASSERT3P(*lso_mp, !=, NULL); - ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr); - - seg_mp = dupb(*lso_mp); - if (seg_mp == NULL) - return (NULL); - - seg_head = seg_mp; - seg_tail = seg_mp; - - /* Continue where we left off from in the lso_mp. */ - seg_mp->b_rptr += *offset; - -last_mblk: - /* Case (A) */ - if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) { - *offset += seg_len; - seg_mp->b_wptr = seg_mp->b_rptr + seg_len; - return (seg_head); - } - - /* Case (B) */ - if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) { - *offset = 0; - *lso_mp = (*lso_mp)->b_cont; - return (seg_head); - } - - /* Case (C) */ - ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr); - - /* - * The current LSO mblk doesn't have enough data to satisfy - * seg_len -- continue peeling off LSO mblks to build the new - * segment message. If allocation fails we free the previously - * allocated segment mblks and return NULL. - */ - while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) { - ASSERT3U(MBLKL(seg_mp), <=, seg_len); - seg_len -= MBLKL(seg_mp); - *offset = 0; - *lso_mp = (*lso_mp)->b_cont; - seg_mp = dupb(*lso_mp); - - if (seg_mp == NULL) { - freemsgchain(seg_head); - return (NULL); - } - - seg_tail->b_cont = seg_mp; - seg_tail = seg_mp; - } - - /* - * We've walked enough LSO mblks that we can now satisfy the - * remaining seg_len. At this point we need to jump back to - * determine if we have arrived at case (A) or (B). - */ - - /* Just to be paranoid that we didn't underflow. */ - ASSERT3U(seg_len, <, IP_MAXPACKET); - ASSERT3U(seg_len, >, 0); - goto last_mblk; -} - -/* - * Perform software segmentation of a single LSO message. Take an LSO - * message as input and return head/tail pointers as output. This - * function should not be invoked directly but instead through - * mac_hw_emul(). - * - * The resulting chain is comprised of multiple (nsegs) MSS sized - * segments. Each segment will consist of two or more mblks joined by - * b_cont: a header and one or more data mblks. The header mblk is - * allocated anew for each message. The first segment's header is used - * as a template for the rest with adjustments made for things such as - * ID, sequence, length, TCP flags, etc. The data mblks reference into - * the existing LSO mblk (passed in as omp) by way of dupb(). Their - * b_rptr/b_wptr values are adjusted to reference only the fraction of - * the LSO message they are responsible for. At the successful - * completion of this function the original mblk (omp) is freed, - * leaving the newely created segment chain as the only remaining - * reference to the data. - */ -static void -mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, - uint_t *count) -{ - uint32_t ocsum_flags, ocsum_start, ocsum_stuff; - uint32_t mss; - uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; - uint32_t oleft; - uint_t nsegs, seg; - int len; - - struct ether_vlan_header *oevh; - const ipha_t *oiph; - const tcph_t *otcph; - ipha_t *niph; - tcph_t *ntcph; - uint16_t ip_id; - uint32_t tcp_seq, tcp_sum, otcp_sum; - - uint32_t offset; - mblk_t *odatamp; - mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; - mblk_t *tmptail; - - ASSERT3P(head, !=, NULL); - ASSERT3P(tail, !=, NULL); - ASSERT3P(count, !=, NULL); - ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); - - /* Assume we are dealing with a single LSO message. */ - ASSERT3P(omp->b_next, ==, NULL); - - /* - * XXX: This is a hack to deal with mac_add_vlan_tag(). - * - * When VLANs are in play, mac_add_vlan_tag() creates a new - * mblk with just the ether_vlan_header and tacks it onto the - * front of 'omp'. This breaks the assumptions made below; - * namely that the TCP/IP headers are in the first mblk. In - * this case, since we already have to pay the cost of LSO - * emulation, we simply pull up everything. While this might - * seem irksome, keep in mind this will only apply in a couple - * of scenarios: a) an LSO-capable VLAN client sending to a - * non-LSO-capable client over the "MAC/bridge loopback" - * datapath or b) an LSO-capable VLAN client is sending to a - * client that, for whatever reason, doesn't have DLS-bypass - * enabled. Finally, we have to check for both a tagged and - * untagged sized mblk depending on if the mblk came via - * mac_promisc_dispatch() or mac_rx_deliver(). - * - * In the future, two things should be done: - * - * 1. This function should make use of some yet to be - * implemented "mblk helpers". These helper functions would - * perform all the b_cont walking for us and guarantee safe - * access to the mblk data. - * - * 2. We should add some slop to the mblks so that - * mac_add_vlan_tag() can just edit the first mblk instead - * of allocating on the hot path. - */ - if (MBLKL(omp) == sizeof (struct ether_vlan_header) || - MBLKL(omp) == sizeof (struct ether_header)) { - mblk_t *tmp = msgpullup(omp, -1); - - if (tmp == NULL) { - mac_drop_pkt(omp, "failed to pull up"); - goto fail; + if (DB_REF(mp) > 1) { + mp1 = copymsg(mp); + if (mp1 == NULL) + continue; + mp1->b_next = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + if (prev != NULL) + prev->b_next = mp1; + else + new_chain = mp1; + mp = mp1; } - mac_hcksum_clone(omp, tmp); - freemsg(omp); - omp = tmp; - } - - mss = DB_LSOMSS(omp); - ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + - sizeof (struct ether_vlan_header)); - opktlen = msgsize(omp); - - /* - * First, get references to the IP and TCP headers and - * determine the total TCP length (header + data). - * - * Thanks to mac_hw_emul() we know that the first mblk must - * contain (at minimum) the full L2 header. However, this - * function assumes more than that. It assumes the L2/L3/L4 - * headers are all contained in the first mblk of a message - * (i.e., no b_cont walking for headers). While this is a - * current reality (our native TCP stack and viona both - * enforce this) things may become more nuanced in the future - * (e.g. when introducing encap support or adding new - * clients). For now we guard against this case by dropping - * the packet. - */ - oevh = (struct ether_vlan_header *)omp->b_rptr; - if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) - oehlen = sizeof (struct ether_vlan_header); - else - oehlen = sizeof (struct ether_header); - - ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); - if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { - mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); - goto fail; - } - - oiph = (ipha_t *)(omp->b_rptr + oehlen); - oiphlen = IPH_HDR_LENGTH(oiph); - otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); - otcphlen = TCP_HDR_LENGTH(otcph); - - /* - * Currently we only support LSO for TCP/IPv4. - */ - if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { - mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", - IPH_HDR_VERSION(oiph)); - goto fail; - } - - if (oiph->ipha_protocol != IPPROTO_TCP) { - mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", - oiph->ipha_protocol); - goto fail; - } - - if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { - mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); - goto fail; - } - - ohdrslen = oehlen + oiphlen + otcphlen; - if ((len = MBLKL(omp)) < ohdrslen) { - mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, - ohdrslen); - goto fail; - } - - /* - * Either we have data in the first mblk or it's just the - * header. In either case, we need to set rptr to the start of - * the TCP data. - */ - if (len > ohdrslen) { - odatamp = omp; - offset = ohdrslen; - } else { - ASSERT3U(len, ==, ohdrslen); - odatamp = omp->b_cont; - offset = 0; - } - - /* Make sure we still have enough data. */ - ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); - - /* - * If a MAC negotiated LSO then it must negotioate both - * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or - * HCKSUM_INET_PARTIAL; because both the IP and TCP headers - * change during LSO segmentation (only the 3 fields of the - * pseudo header checksum don't change: src, dst, proto). Thus - * we would expect these flags (HCK_IPV4_HDRCKSUM | - * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this - * function to emulate those checksums in software. However, - * that assumes a world where we only expose LSO if the - * underlying hardware exposes LSO. Moving forward the plan is - * to assume LSO in the upper layers and have MAC perform - * software LSO when the underlying provider doesn't support - * it. In such a world, if the provider doesn't support LSO - * but does support hardware checksum offload, then we could - * simply perform the segmentation and allow the hardware to - * calculate the checksums. To the hardware it's just another - * chain of non-LSO packets. - */ - ASSERT3S(DB_TYPE(omp), ==, M_DATA); - ocsum_flags = DB_CKSUMFLAGS(omp); - ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); - ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); - - /* - * If hardware only provides partial checksum then software - * must supply the pseudo-header checksum. In the case of LSO - * we leave the TCP length at zero to be filled in by - * hardware. This function must handle two scenarios. - * - * 1. Being called by a MAC client on the Rx path to segment - * an LSO packet and calculate the checksum. - * - * 2. Being called by a MAC provider to segment an LSO packet. - * In this case the LSO segmentation is performed in - * software (by this routine) but the MAC provider should - * still calculate the TCP/IP checksums in hardware. - * - * To elaborate on the second case: we cannot have the - * scenario where IP sends LSO packets but the underlying HW - * doesn't support checksum offload -- because in that case - * TCP/IP would calculate the checksum in software (for the - * LSO packet) but then MAC would segment the packet and have - * to redo all the checksum work. So IP should never do LSO - * if HW doesn't support both IP and TCP checksum. - */ - if (ocsum_flags & HCK_PARTIALCKSUM) { - ocsum_start = (uint32_t)DB_CKSUMSTART(omp); - ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); - } - - odatalen = opktlen - ohdrslen; - - /* - * Subtract one to account for the case where the data length - * is evenly divisble by the MSS. Add one to account for the - * fact that the division will always result in one less - * segment than needed. - */ - nsegs = ((odatalen - 1) / mss) + 1; - if (nsegs < 2) { - mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); - goto fail; - } - - DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, - __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, - nsegs); - - seg_chain = NULL; - tmptail = seg_chain; - oleft = odatalen; - - for (uint_t i = 0; i < nsegs; i++) { - boolean_t last_seg = ((i + 1) == nsegs); - uint32_t seg_len; - /* - * If we fail to allocate, then drop the partially - * allocated chain as well as the LSO packet. Let the - * sender deal with the fallout. + * Ethernet, and optionally VLAN header. */ - if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { - freemsgchain(seg_chain); - mac_drop_pkt(omp, "failed to alloc segment header"); - goto fail; - } - ASSERT3P(nhdrmp->b_cont, ==, NULL); + /* LINTED: improper alignment cast */ + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) { + struct ether_vlan_header *evhp; - if (seg_chain == NULL) { - seg_chain = nhdrmp; + ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); + /* LINTED: improper alignment cast */ + evhp = (struct ether_vlan_header *)mp->b_rptr; + sap = ntohs(evhp->ether_type); + offset = sizeof (struct ether_vlan_header); } else { - ASSERT3P(tmptail, !=, NULL); - tmptail->b_next = nhdrmp; + sap = ntohs(ehp->ether_type); + offset = sizeof (struct ether_header); } - tmptail = nhdrmp; - /* - * Calculate this segment's lengh. It's either the MSS - * or whatever remains for the last segment. + * If the first mblk in the chain for this packet contains only + * the ethernet header, skip past it for now. Packets with + * their data contained in only a single mblk can then use the + * fastpaths tuned to that possibility. */ - seg_len = last_seg ? oleft : mss; - ASSERT3U(seg_len, <=, mss); - ndatamp = build_data_seg(&odatamp, &offset, seg_len); - - if (ndatamp == NULL) { - freemsgchain(seg_chain); - mac_drop_pkt(omp, "LSO failed to segment data"); - goto fail; + if (MBLKL(mp) <= offset) { + offset -= MBLKL(mp); + if (mp->b_cont == NULL) { + /* corrupted packet, skip it */ + if (prev != NULL) + prev->b_next = mp->b_next; + else + new_chain = mp->b_next; + mp1 = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + mp = mp1; + continue; + } + skipped_hdr = mp; + mp = mp->b_cont; } - /* Attach data mblk to header mblk. */ - nhdrmp->b_cont = ndatamp; - DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; - ASSERT3U(seg_len, <=, oleft); - oleft -= seg_len; - } - - /* We should have consumed entire LSO msg. */ - ASSERT3S(oleft, ==, 0); - ASSERT3P(odatamp, ==, NULL); - - /* - * All seg data mblks are referenced by the header mblks, null - * out this pointer to catch any bad derefs. - */ - ndatamp = NULL; + if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { + ipha_t *ipha = NULL; - /* - * Set headers and checksum for first segment. - */ - nhdrmp = seg_chain; - bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); - nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; - niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); - ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); - niph->ipha_length = htons(oiphlen + otcphlen + mss); - niph->ipha_hdr_checksum = 0; - ip_id = ntohs(niph->ipha_ident); - ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); - tcp_seq = BE32_TO_U32(ntcph->th_seq); - tcp_seq += mss; - - /* - * The first segment shouldn't: - * - * o indicate end of data transmission (FIN), - * o indicate immediate handling of the data (PUSH). - */ - ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); - DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); - - /* - * If the underlying HW provides partial checksum, then make - * sure to correct the pseudo header checksum before calling - * mac_sw_cksum(). The native TCP stack doesn't include the - * length field in the pseudo header when LSO is in play -- so - * we need to calculate it here. - */ - if (ocsum_flags & HCK_PARTIALCKSUM) { - DB_CKSUMSTART(nhdrmp) = ocsum_start; - DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); - DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; - tcp_sum = BE16_TO_U16(ntcph->th_sum); - otcp_sum = tcp_sum; - tcp_sum += mss + otcphlen; - tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); - U16_TO_BE16(tcp_sum, ntcph->th_sum); - } - - if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && - (emul & MAC_HWCKSUM_EMULS)) { - next_nhdrmp = nhdrmp->b_next; - nhdrmp->b_next = NULL; - nhdrmp = mac_sw_cksum(nhdrmp, emul); - nhdrmp->b_next = next_nhdrmp; - next_nhdrmp = NULL; - - /* - * We may have freed the nhdrmp argument during - * checksum emulation, make sure that seg_chain - * references a valid mblk. - */ - seg_chain = nhdrmp; - } - - ASSERT3P(nhdrmp, !=, NULL); - - seg = 1; - DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, - (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, - (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, - uint_t, seg); - seg++; - - /* There better be at least 2 segs. */ - ASSERT3P(nhdrmp->b_next, !=, NULL); - prev_nhdrmp = nhdrmp; - nhdrmp = nhdrmp->b_next; - - /* - * Now adjust the headers of the middle segments. For each - * header we need to adjust the following. - * - * o IP ID - * o IP length - * o TCP sequence - * o TCP flags - * o cksum flags - * o cksum values (if MAC_HWCKSUM_EMUL is set) - */ - for (; seg < nsegs; seg++) { - /* - * We use seg_chain as a reference to the first seg - * header mblk -- this first header is a template for - * the rest of the segments. This copy will include - * the now updated checksum values from the first - * header. We must reset these checksum values to - * their original to make sure we produce the correct - * value. - */ - bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); - nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; - niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); - niph->ipha_ident = htons(++ip_id); - ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); - niph->ipha_length = htons(oiphlen + otcphlen + mss); - niph->ipha_hdr_checksum = 0; - ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); - U32_TO_BE32(tcp_seq, ntcph->th_seq); - tcp_seq += mss; - /* - * Just like the first segment, the middle segments - * shouldn't have these flags set. - */ - ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); - DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); - - if (ocsum_flags & HCK_PARTIALCKSUM) { /* - * First and middle segs have same - * pseudo-header checksum. + * In order to compute the full and header + * checksums, we need to find and parse + * the IP and/or ULP headers. */ - U16_TO_BE16(tcp_sum, ntcph->th_sum); - DB_CKSUMSTART(nhdrmp) = ocsum_start; - DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); - DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; - } - if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && - (emul & MAC_HWCKSUM_EMULS)) { - next_nhdrmp = nhdrmp->b_next; - nhdrmp->b_next = NULL; - nhdrmp = mac_sw_cksum(nhdrmp, emul); - nhdrmp->b_next = next_nhdrmp; - next_nhdrmp = NULL; - /* We may have freed the original nhdrmp. */ - prev_nhdrmp->b_next = nhdrmp; - } + sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; - DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, - (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, - (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), - uint_t, mss, uint_t, seg); - - ASSERT3P(nhdrmp->b_next, !=, NULL); - prev_nhdrmp = nhdrmp; - nhdrmp = nhdrmp->b_next; - } - - /* Make sure we are on the last segment. */ - ASSERT3U(seg, ==, nsegs); - ASSERT3P(nhdrmp->b_next, ==, NULL); + /* + * IP header. + */ + if (sap != ETHERTYPE_IP) + continue; + + ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); + /* LINTED: improper alignment cast */ + ipha = (ipha_t *)(mp->b_rptr + offset); + + if (flags & HCK_FULLCKSUM) { + ipaddr_t src, dst; + uint32_t cksum; + uint16_t *up; + uint8_t proto; + + /* + * Pointer to checksum field in ULP header. + */ + proto = ipha->ipha_protocol; + ASSERT(ipha->ipha_version_and_hdr_length == + IP_SIMPLE_HDR_VERSION); + + switch (proto) { + case IPPROTO_TCP: + /* LINTED: improper alignment cast */ + up = IPH_TCPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + break; + + case IPPROTO_UDP: + /* LINTED: improper alignment cast */ + up = IPH_UDPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + break; + + default: + cmn_err(CE_WARN, "mac_fix_cksum: " + "unexpected protocol: %d", proto); + continue; + } - /* - * Now we set the last segment header. The difference being - * that FIN/PSH/RST flags are allowed. - */ - bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); - nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; - niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); - niph->ipha_ident = htons(++ip_id); - len = msgsize(nhdrmp->b_cont); - ASSERT3S(len, >, 0); - niph->ipha_length = htons(oiphlen + otcphlen + len); - niph->ipha_hdr_checksum = 0; - ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); - U32_TO_BE32(tcp_seq, ntcph->th_seq); - - DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); - if (ocsum_flags & HCK_PARTIALCKSUM) { - DB_CKSUMSTART(nhdrmp) = ocsum_start; - DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); - DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; - tcp_sum = otcp_sum; - tcp_sum += len + otcphlen; - tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); - U16_TO_BE16(tcp_sum, ntcph->th_sum); - } + /* + * Pseudo-header checksum. + */ + src = ipha->ipha_src; + dst = ipha->ipha_dst; + len = ntohs(ipha->ipha_length) - + IP_SIMPLE_HDR_LENGTH; + + cksum = (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum += htons(len); + + /* + * The checksum value stored in the packet needs + * to be correct. Compute it here. + */ + *up = 0; + cksum += (((proto) == IPPROTO_UDP) ? + IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); + cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + + offset, cksum); + *(up) = (uint16_t)(cksum ? cksum : ~cksum); + + /* + * Flag the packet so that it appears + * that the checksum has already been + * verified by the hardware. + */ + flags &= ~HCK_FULLCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } - if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && - (emul & MAC_HWCKSUM_EMULS)) { - /* This should be the last mblk. */ - ASSERT3P(nhdrmp->b_next, ==, NULL); - nhdrmp = mac_sw_cksum(nhdrmp, emul); - prev_nhdrmp->b_next = nhdrmp; - } + if (flags & HCK_IPV4_HDRCKSUM) { + ASSERT(ipha != NULL); + ipha->ipha_hdr_checksum = + (uint16_t)ip_csum_hdr(ipha); + flags &= ~HCK_IPV4_HDRCKSUM; + flags |= HCK_IPV4_HDRCKSUM_OK; - DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, - (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, - (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, - uint_t, seg); + } + } - /* - * Free the reference to the original LSO message as it is - * being replaced by seg_cahin. - */ - freemsg(omp); - *head = seg_chain; - *tail = nhdrmp; - *count = nsegs; - return; - -fail: - *head = NULL; - *tail = NULL; - *count = 0; -} + if (flags & HCK_PARTIALCKSUM) { + uint16_t *up, partial, cksum; + uchar_t *ipp; /* ptr to beginning of IP header */ + mblk_t *old_mp = NULL; -#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) + if (mp->b_cont != NULL) { + mblk_t *new_mp; -/* - * Emulate various hardware offload features in software. Take a chain - * of packets as input and emulate the hardware features specified in - * 'emul'. The resulting chain's head pointer replaces the 'mp_chain' - * pointer given as input, and its tail pointer is written to - * '*otail'. The number of packets in the new chain is written to - * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus - * may be NULL. The 'mp_chain' argument may point to a NULL chain; in - * which case 'mp_chain' will simply stay a NULL chain. - * - * While unlikely, it is technically possible that this function could - * receive a non-NULL chain as input and return a NULL chain as output - * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be - * zero). This could happen if all the packets in the chain are - * dropped or if we fail to allocate new mblks. In this case, there is - * nothing for the caller to free. In any event, the caller shouldn't - * assume that '*mp_chain' is non-NULL on return. - * - * This function was written with two main use cases in mind. - * - * 1. A way for MAC clients to emulate hardware offloads when they - * can't directly handle LSO packets or packets without fully - * calculated checksums. - * - * 2. A way for MAC providers (drivers) to offer LSO even when the - * underlying HW can't or won't supply LSO offload. - * - * At the time of this writing no provider is making use of this - * function. However, the plan for the future is to always assume LSO - * is available and then add SW LSO emulation to all providers that - * don't support it in HW. - */ -void -mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) -{ - mblk_t *head = NULL, *tail = NULL; - uint_t count = 0; - - ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); - ASSERT3P(mp_chain, !=, NULL); + new_mp = msgpullup(mp, offset + end); + if (new_mp == NULL) { + continue; + } + old_mp = mp; + mp = new_mp; + } - for (mblk_t *mp = *mp_chain; mp != NULL; ) { - mblk_t *tmp, *next, *tmphead, *tmptail; - struct ether_header *ehp; - uint32_t flags; - uint_t len = MBLKL(mp), l2len; + ipp = mp->b_rptr + offset; + /* LINTED: cast may result in improper alignment */ + up = (uint16_t *)((uchar_t *)ipp + stuff); + partial = *up; + *up = 0; - /* Perform LSO/cksum one message at a time. */ - next = mp->b_next; - mp->b_next = NULL; + cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, + end - start, partial); + cksum = ~cksum; + *up = cksum ? cksum : ~cksum; - /* - * For our sanity the first mblk should contain at - * least the full L2 header. - */ - if (len < sizeof (struct ether_header)) { - mac_drop_pkt(mp, "packet too short (A): %u", len); - mp = next; - continue; - } + /* + * Since we already computed the whole checksum, + * indicate to the stack that it has already + * been verified by the hardware. + */ + flags &= ~HCK_PARTIALCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; - ehp = (struct ether_header *)mp->b_rptr; - if (ntohs(ehp->ether_type) == VLAN_TPID) - l2len = sizeof (struct ether_vlan_header); - else - l2len = sizeof (struct ether_header); + /* + * If 'mp' is the result of a msgpullup(), it needs to + * be properly reattached into the existing chain of + * messages before continuing. + */ + if (old_mp != NULL) { + if (skipped_hdr != NULL) { + /* + * If the ethernet header was cast + * aside before checksum calculation, + * prepare for it to be reattached to + * the pulled-up mblk. + */ + skipped_hdr->b_cont = mp; + } else { + /* Link the new mblk into the chain. */ + mp->b_next = old_mp->b_next; + + if (prev != NULL) + prev->b_next = mp; + else + new_chain = mp; + } - /* - * If the first mblk is solely the L2 header, then - * there better be more data. - */ - if (len < l2len || (len == l2len && mp->b_cont == NULL)) { - mac_drop_pkt(mp, "packet too short (C): %u", len); - mp = next; - continue; + old_mp->b_next = NULL; + freemsg(old_mp); + } } - DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); + mac_hcksum_set(mp, start, stuff, end, value, flags); /* - * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) - * because we don't want to mask-out the LSO flag. + * If the header was skipped over, we must seek back to it, + * since it is that mblk that is part of any packet chain. */ - flags = DB_CKSUMFLAGS(mp); - - if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { - uint_t tmpcount = 0; + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); /* - * LSO fix-up handles checksum emulation - * inline (if requested). It also frees mp. + * Duplicate the HCKSUM data into the header mblk. + * This mimics mac_add_vlan_tag which ensures that both + * the first mblk _and_ the first data bearing mblk + * possess the HCKSUM information. Consumers like IP + * will end up discarding the ether_header mblk, so for + * now, it is important that the data be available in + * both places. */ - mac_sw_lso(mp, emul, &tmphead, &tmptail, - &tmpcount); - count += tmpcount; - } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { - tmp = mac_sw_cksum(mp, emul); - tmphead = tmp; - tmptail = tmp; - count++; - } else { - /* There is nothing to emulate. */ - tmp = mp; - tmphead = tmp; - tmptail = tmp; - count++; + mac_hcksum_clone(mp, skipped_hdr); + mp = skipped_hdr; } - - /* - * The tmp mblk chain is either the start of the new - * chain or added to the tail of the new chain. - */ - if (head == NULL) { - head = tmphead; - tail = tmptail; - } else { - /* Attach the new mblk to the end of the new chain. */ - tail->b_next = tmphead; - tail = tmptail; - } - - mp = next; } - *mp_chain = head; - - if (otail != NULL) - *otail = tail; - - if (ocount != NULL) - *ocount = count; + return (new_chain); } /* @@ -1416,9 +501,16 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain) */ /* ARGSUSED */ void -mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp, +mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, boolean_t loopback) { + mblk_t *mp1 = mp; + + while (mp1 != NULL) { + mp1->b_prev = NULL; + mp1->b_queue = NULL; + mp1 = mp1->b_next; + } freemsgchain(mp); } diff --git a/usr/src/uts/common/io/simnet/simnet.c b/usr/src/uts/common/io/simnet/simnet.c index 9bfe2fe7cf..727fbbad8e 100644 --- a/usr/src/uts/common/io/simnet/simnet.c +++ b/usr/src/uts/common/io/simnet/simnet.c @@ -21,8 +21,6 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * - * Copyright 2018 Joyent, Inc. */ /* @@ -797,6 +795,12 @@ simnet_m_tx(void *arg, mblk_t *mp_chain) continue; } + /* Fix mblk checksum as the pkt dest is local */ + if ((mp = mac_fix_cksum(mp)) == NULL) { + sdev->sd_stats.xmit_errors++; + continue; + } + /* Hold reference for taskq receive processing per-pkt */ if (!simnet_thread_ref(sdev_rx)) { freemsg(mp); diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index e532a551e7..57c02b0808 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -456,20 +456,6 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, } else { vnic->vn_hcksum_txflags = 0; } - - /* - * Check for LSO capabilities. LSO implementations - * depend on hardware checksumming, so the same - * requirement is enforced here. - */ - if (vnic->vn_hcksum_txflags != 0) { - if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_LSO, - &vnic->vn_cap_lso)) { - vnic->vn_cap_lso.lso_flags = 0; - } - } else { - vnic->vn_cap_lso.lso_flags = 0; - } } /* register with the MAC module */ @@ -840,15 +826,6 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) HCKSUM_INET_PARTIAL); break; } - case MAC_CAPAB_LSO: { - mac_capab_lso_t *cap_lso = cap_data; - - if (vnic->vn_cap_lso.lso_flags == 0) { - return (B_FALSE); - } - *cap_lso = vnic->vn_cap_lso; - break; - } case MAC_CAPAB_VNIC: { mac_capab_vnic_t *vnic_capab = cap_data; diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 6af2e0bccb..a1ee3e3c70 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright (c) 2017, Joyent, Inc. * Copyright (c) 2015 Garrett D'Amore <garrett@damore.org> */ @@ -623,36 +623,6 @@ typedef struct mactype_register_s { } mactype_register_t; /* - * Flags to describe the hardware emulation desired from a client when - * calling mac_hw_emul(). - * - * MAC_HWCKSUM_EMUL - * - * If an mblk is marked with HCK_* flags, then calculate those - * checksums and update the checksum flags. - * - * MAC_IPCKSUM_EMUL - * - * Like MAC_HWCKSUM_EMUL, except only calculate the IPv4 header - * checksum. We still update both the IPv4 and ULP checksum - * flags. - * - * MAC_LSO_EMUL - * - * If an mblk is marked with HW_LSO, then segment the LSO mblk - * into a new chain of mblks which reference the original data - * block. This flag DOES NOT imply MAC_HWCKSUM_EMUL. If the - * caller needs both then it must set both. - */ -typedef enum mac_emul { - MAC_HWCKSUM_EMUL = (1 << 0), - MAC_IPCKSUM_EMUL = (1 << 1), - MAC_LSO_EMUL = (1 << 2) -} mac_emul_t; - -#define MAC_HWCKSUM_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL) - -/* * Driver interface functions. */ extern int mac_open_by_linkid(datalink_id_t, diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h index 3290db92e6..b6040ad679 100644 --- a/usr/src/uts/common/sys/mac_client.h +++ b/usr/src/uts/common/sys/mac_client.h @@ -200,8 +200,6 @@ extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *); extern void mac_client_set_rings(mac_client_handle_t, int, int); -extern void mac_hw_emul(mblk_t **, mblk_t **, uint_t *, mac_emul_t); - #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index 21e8620121..d64b895304 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -411,7 +411,8 @@ extern int mac_tx_percpu_cnt; extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); extern void mac_client_init(void); extern void mac_client_fini(void); -extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, mac_client_impl_t *); +extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, + mac_client_impl_t *, boolean_t); extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *); diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 593322b990..17aebffc38 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -345,7 +345,7 @@ struct mac_group_s { if ((src_mcip)->mci_state_flags & MCIS_SHARE_BOUND) \ rhandle = (mip)->mi_default_tx_ring; \ if (mip->mi_promisc_list != NULL) \ - mac_promisc_dispatch(mip, mp, src_mcip); \ + mac_promisc_dispatch(mip, mp, src_mcip, B_TRUE); \ /* \ * Grab the proper transmit pointer and handle. Special \ * optimization: we can test mi_bridge_link itself atomically, \ @@ -743,23 +743,12 @@ typedef struct mac_client_impl_s mac_client_impl_t; extern void mac_init(void); extern int mac_fini(void); -/* - * MAC packet/chain drop functions to aggregate all dropped-packet - * debugging to a single surface. - */ -/*PRINTFLIKE2*/ -extern void mac_drop_pkt(mblk_t *, const char *, ...) - __KPRINTFLIKE(2); - -/*PRINTFLIKE2*/ -extern void mac_drop_chain(mblk_t *, const char *, ...) - __KPRINTFLIKE(2); - extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *); extern boolean_t mac_ip_hdr_length_v6(ip6_t *, uint8_t *, uint16_t *, uint8_t *, ip6_frag_t **); extern mblk_t *mac_copymsgchain_cksum(mblk_t *); +extern mblk_t *mac_fix_cksum(mblk_t *); extern void mac_packet_print(mac_handle_t, mblk_t *); extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *, mac_header_info_t *); @@ -864,7 +853,7 @@ extern void mac_flow_set_name(flow_entry_t *, const char *); extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t); extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t); extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *); -extern void mac_rx_def(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t); extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *); extern void i_mac_share_alloc(mac_client_impl_t *); diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h index 587a51f0aa..1269aeca10 100644 --- a/usr/src/uts/common/sys/pattr.h +++ b/usr/src/uts/common/sys/pattr.h @@ -21,7 +21,6 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_PATTR_H @@ -107,25 +106,6 @@ typedef struct pattr_hcksum_s { #define HW_LSO_FLAGS HW_LSO /* All LSO flags, currently only one */ /* - * The packet originates from a MAC on the same machine as the - * receiving MAC. There are two ways this can happen. - * - * 1. MAC loopback: When a packet is destined for a MAC client on the - * same MAC as the sender. This datapath is taken in - * max_tx_send(). - * - * 2. Bridge Fwd: When a packet is destined for a MAC client on the - * same bridge as the sender. This datapath is taken in - * bridge_forward(). - * - * Presented with this flag, a receiver can then decide whether or not - * it needs to emulate some or all of the HW offloads that the NIC - * would have performed otherwise -- or whether it should accept the - * packet as-is. - */ -#define HW_LOCAL_MAC 0x100 - -/* * Structure used for zerocopy attribute. */ typedef struct pattr_zcopy_s { diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h index 4c8d49c621..1a91158da6 100644 --- a/usr/src/uts/common/sys/vnic_impl.h +++ b/usr/src/uts/common/sys/vnic_impl.h @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2018 Joyent, Inc. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_VNIC_IMPL_H @@ -64,7 +64,6 @@ typedef struct vnic_s { mac_notify_handle_t vn_mnh; uint32_t vn_hcksum_txflags; - mac_capab_lso_t vn_cap_lso; uint32_t vn_mtu; link_state_t vn_ls; } vnic_t; diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c index c21476df89..761597653b 100644 --- a/usr/src/uts/common/xen/io/xnb.c +++ b/usr/src/uts/common/xen/io/xnb.c @@ -22,7 +22,6 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2018 Joyent, Inc. */ #ifdef DEBUG @@ -252,8 +251,8 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp) * because it doesn't cover all of the interesting cases :-( */ mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM); - mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); - return (mp); + + return (mac_fix_cksum(mp)); } mblk_t * diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c index 0a8fd9f141..8266537f1a 100644 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ b/usr/src/uts/i86pc/io/viona/viona.c @@ -279,36 +279,27 @@ #define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0) #define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1) -#define VIRTIO_NET_HDR_GSO_NONE 0 -#define VIRTIO_NET_HDR_GSO_TCPV4 1 #define VRING_AVAIL_F_NO_INTERRUPT 1 #define VRING_USED_F_NO_NOTIFY 1 #define BCM_NIC_DRIVER "bnxe" - /* - * Feature bits. See section 5.1.3 of the VIRTIO 1.0 spec. + * Host capabilities */ -#define VIRTIO_NET_F_CSUM (1 << 0) -#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */ -#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */ +#define VIRTIO_NET_F_CSUM (1 << 0) +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ #define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ #define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ #define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24) #define VIRTIO_F_RING_INDIRECT_DESC (1 << 28) #define VIRTIO_F_RING_EVENT_IDX (1 << 29) -/* - * Host capabilities. - */ #define VIONA_S_HOSTCAPS ( \ VIRTIO_NET_F_GUEST_CSUM | \ VIRTIO_NET_F_MAC | \ - VIRTIO_NET_F_GUEST_TSO4 | \ VIRTIO_NET_F_MRG_RXBUF | \ VIRTIO_NET_F_STATUS | \ VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ @@ -900,13 +891,6 @@ viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) break; } val &= (VIONA_S_HOSTCAPS | link->l_features_hw); - - if ((val & VIRTIO_NET_F_CSUM) == 0) - val &= ~VIRTIO_NET_F_HOST_TSO4; - - if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) - val &= ~VIRTIO_NET_F_GUEST_TSO4; - link->l_features = val; break; case VNA_IOC_RING_INIT: @@ -979,7 +963,6 @@ viona_get_mac_capab(viona_link_t *link) { mac_handle_t mh = link->l_mh; uint32_t cap = 0; - mac_capab_lso_t lso_cap; link->l_features_hw = 0; if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) { @@ -992,19 +975,6 @@ viona_get_mac_capab(viona_link_t *link) } link->l_cap_csum = cap; } - - if ((link->l_features_hw & VIRTIO_NET_F_CSUM) && - mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) { - /* - * Virtio doesn't allow for negotiating a maximum LSO - * packet size. We have to assume that the guest may - * send a maximum length IP packet. Make sure the - * underlying MAC can handle an LSO of this size. - */ - if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) && - lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) - link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4; - } } static int @@ -2011,7 +1981,6 @@ viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) size_t len, copied = 0; caddr_t buf = NULL; boolean_t end = B_FALSE; - const uint32_t features = ring->vr_link->l_features; ASSERT(msz >= MIN_BUF_SIZE); @@ -2066,15 +2035,9 @@ viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) copied += hdr_sz; /* Add chksum bits, if needed */ - if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + if ((ring->vr_link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0) { uint32_t cksum_flags; - if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && - ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { - hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; - hdr->vrh_gso_size = DB_LSOMSS(mp); - } - mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, &cksum_flags); if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { @@ -2107,7 +2070,6 @@ viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) struct virtio_net_mrgrxhdr *hdr = NULL; const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr); boolean_t end = B_FALSE; - const uint32_t features = ring->vr_link->l_features; ASSERT(msz >= MIN_BUF_SIZE); @@ -2213,15 +2175,9 @@ viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) } /* Add chksum bits, if needed */ - if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + if ((ring->vr_link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0) { uint32_t cksum_flags; - if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && - ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { - hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; - hdr->vrh_gso_size = DB_LSOMSS(mp); - } - mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, &cksum_flags); if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { @@ -2265,29 +2221,8 @@ viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback) mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop; const boolean_t do_merge = ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0); - const boolean_t guest_csum = - ((link->l_features & VIRTIO_NET_F_GUEST_CSUM) != 0); - const boolean_t guest_tso4 = - ((link->l_features & VIRTIO_NET_F_GUEST_TSO4) != 0); - size_t nrx = 0, ndrop = 0; - /* - * The mac_hw_emul() function, by design, doesn't predicate on - * HW_LOCAL_MAC. Since we are in Rx context we know that any - * LSO packet must also be from a same-machine sender. We take - * advantage of that and forgoe writing a manual loop to - * predicate on HW_LOCAL_MAC. - * - * For checksum emulation we need to predicate on HW_LOCAL_MAC - * to avoid calling mac_hw_emul() on packets that don't need - * it (thanks to the fact that HCK_IPV4_HDRCKSUM and - * HCK_IPV4_HDRCKSUM_OK use the same value). Therefore, we do - * the checksum emulation in the second loop. - */ - if (!guest_tso4) - mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL); - while (mp != NULL) { mblk_t *next, *pad = NULL; size_t size; @@ -2295,25 +2230,6 @@ viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback) next = mp->b_next; mp->b_next = NULL; - - if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { - /* - * The VIRTIO_NET_HDR_F_DATA_VALID flag only - * covers the ULP checksum -- so we still have - * to populate the IP header checksum. - */ - if (guest_csum) { - mac_hw_emul(&mp, NULL, NULL, MAC_IPCKSUM_EMUL); - } else { - mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); - } - - if (mp == NULL) { - mp = next; - continue; - } - } - size = msgsize(mp); /* @@ -2492,6 +2408,28 @@ viona_desb_release(viona_desb_t *dp) mutex_exit(&ring->vr_lock); } +static int +viona_mb_get_uint8(mblk_t *mp, off_t off, uint8_t *out) +{ + size_t mpsize; + uint8_t *bp; + + mpsize = msgsize(mp); + if (off + sizeof (uint8_t) > mpsize) + return (-1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + + bp = mp->b_rptr + off; + *out = *bp; + return (0); +} + static boolean_t viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, mblk_t *mp, uint32_t len) @@ -2500,22 +2438,15 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, const struct ether_header *eth; uint_t eth_len = sizeof (struct ether_header); ushort_t ftype; - ipha_t *ipha = NULL; uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ - uint16_t flags = 0; + eth = (const struct ether_header *)mp->b_rptr; if (MBLKL(mp) < sizeof (*eth)) { /* Buffers shorter than an ethernet header are hopeless */ return (B_FALSE); } - /* - * This is guaranteed to be safe thanks to the header copying - * done in viona_tx(). - */ - eth = (const struct ether_header *)mp->b_rptr; ftype = ntohs(eth->ether_type); - if (ftype == ETHERTYPE_VLAN) { const struct ether_vlan_header *veth; @@ -2526,80 +2457,16 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, } if (ftype == ETHERTYPE_IP) { - ipha = (ipha_t *)(mp->b_rptr + eth_len); + const size_t off = offsetof(ipha_t, ipha_protocol) + eth_len; - ipproto = ipha->ipha_protocol; + (void) viona_mb_get_uint8(mp, off, &ipproto); } else if (ftype == ETHERTYPE_IPV6) { - ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); + const size_t off = offsetof(ip6_t, ip6_nxt) + eth_len; - ipproto = ip6h->ip6_nxt; + (void) viona_mb_get_uint8(mp, off, &ipproto); } /* - * We ignore hdr_len because the spec says it can't be - * trusted. Besides, our own stack will determine the header - * boundary. - */ - if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && - (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && - ftype == ETHERTYPE_IP) { - uint16_t *cksump; - uint32_t cksum; - ipaddr_t src = ipha->ipha_src; - ipaddr_t dst = ipha->ipha_dst; - - /* - * Our native IP stack doesn't set the L4 length field - * of the pseudo header when LSO is in play. Other IP - * stacks, e.g. Linux, do include the length field. - * This is a problem because the hardware expects that - * the length field is not set. When it is set it will - * cause an incorrect TCP checksum to be generated. - * The reason this works in Linux is because Linux - * corrects the pseudo-header checksum in the driver - * code. In order to get the correct HW checksum we - * need to assume the guest's IP stack gave us a bogus - * TCP partial checksum and calculate it ourselves. - */ - cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha)); - cksum = IP_TCP_CSUM_COMP; - cksum += (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - cksum = (cksum & 0xFFFF) + (cksum >> 16); - *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); - - /* - * Since viona is a "legacy device", the data stored - * by the driver will be in the guest's native endian - * format (see sections 2.4.3 and 5.1.6.1 of the - * VIRTIO 1.0 spec for more info). At this time the - * only guests using viona are x86 and we can assume - * little-endian. - */ - lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO); - - /* - * Hardware, like ixgbe, expects the client to request - * IP header checksum offload if it's sending LSO (see - * ixgbe_get_context()). Unfortunately, virtio makes - * no allowances for negotiating IP header checksum - * and HW offload, only TCP checksum. We add the flag - * and zero-out the checksum field. This mirrors the - * behavior of our native IP stack (which does this in - * the interest of HW that expects the field to be - * zero). - */ - flags |= HCK_IPV4_HDRCKSUM; - ipha->ipha_hdr_checksum = 0; - } - - /* - * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure - * HW_LSO, if present, is not lost. - */ - flags |= DB_CKSUMFLAGS(mp); - - /* * Partial checksum support from the NIC is ideal, since it most * closely maps to the interface defined by virtio. */ @@ -2608,14 +2475,14 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, uint_t start, stuff, end; /* - * MAC expects these offsets to be relative to the - * start of the L3 header rather than the L2 frame. + * The lower-level driver is expecting these offsets to be + * relative to the start of the L3 header rather than the + * ethernet frame. */ start = hdr->vrh_csum_start - eth_len; stuff = start + hdr->vrh_csum_offset; end = len - eth_len; - flags |= HCK_PARTIALCKSUM; - mac_hcksum_set(mp, start, stuff, end, 0, flags); + mac_hcksum_set(mp, start, stuff, end, 0, HCK_PARTIALCKSUM); return (B_TRUE); } @@ -2627,8 +2494,7 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, if (ftype == ETHERTYPE_IP) { if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 && (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { - flags |= HCK_FULLCKSUM; - mac_hcksum_set(mp, 0, 0, 0, 0, flags); + mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM); return (B_TRUE); } @@ -2639,8 +2505,7 @@ viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, } else if (ftype == ETHERTYPE_IPV6) { if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 && (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { - flags |= HCK_FULLCKSUM; - mac_hcksum_set(mp, 0, 0, 0, 0, flags); + mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM); return (B_TRUE); } @@ -2816,12 +2681,7 @@ viona_tx(viona_link_t *link, viona_vring_t *ring) dp->d_ref--; } - /* - * Request hardware checksumming, if necessary. If the guest - * sent an LSO packet then it must have also negotiated and - * requested partial checksum; therefore the LSO logic is - * contained within viona_tx_csum(). - */ + /* Request hardware checksumming, if necessary */ if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { |
