summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/dladm/dladm.c12
-rw-r--r--usr/src/lib/libdladm/common/libdloverlay.c12
-rw-r--r--usr/src/uts/common/inet/ip.h2
-rw-r--r--usr/src/uts/common/inet/ip/conn_opt.c3
-rw-r--r--usr/src/uts/common/inet/ip/ip_if.c153
-rw-r--r--usr/src/uts/common/inet/ip/ip_output.c56
-rw-r--r--usr/src/uts/common/inet/ip_if.h3
-rw-r--r--usr/src/uts/common/inet/udp/udp.c248
-rw-r--r--usr/src/uts/common/inet/udp/udp_opt_data.c5
-rw-r--r--usr/src/uts/common/inet/udp_impl.h7
-rw-r--r--usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c2
-rw-r--r--usr/src/uts/common/io/dld/dld_proto.c9
-rw-r--r--usr/src/uts/common/io/i40e/i40e_gld.c10
-rw-r--r--usr/src/uts/common/io/i40e/i40e_main.c8
-rw-r--r--usr/src/uts/common/io/i40e/i40e_transceiver.c99
-rw-r--r--usr/src/uts/common/io/mac/mac_provider.c8
-rw-r--r--usr/src/uts/common/io/overlay/overlay.c118
-rw-r--r--usr/src/uts/common/io/overlay/overlay_mux.c31
-rw-r--r--usr/src/uts/common/io/overlay/overlay_prop.c3
-rw-r--r--usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c122
-rw-r--r--usr/src/uts/common/mapfiles/mac.mapfile3
-rw-r--r--usr/src/uts/common/netinet/udp.h23
-rw-r--r--usr/src/uts/common/sys/dld.h2
-rw-r--r--usr/src/uts/common/sys/dlpi.h12
-rw-r--r--usr/src/uts/common/sys/mac_provider.h11
-rw-r--r--usr/src/uts/common/sys/overlay_common.h3
-rw-r--r--usr/src/uts/common/sys/overlay_impl.h4
-rw-r--r--usr/src/uts/common/sys/overlay_plugin.h8
-rw-r--r--usr/src/uts/common/sys/pattr.h42
-rw-r--r--usr/src/uts/common/sys/strsubr.h1
30 files changed, 892 insertions, 128 deletions
diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c
index 1edd13b0cb..b20300bfbd 100644
--- a/usr/src/cmd/dladm/dladm.c
+++ b/usr/src/cmd/dladm/dladm.c
@@ -10013,6 +10013,7 @@ print_overlay_value(char *outbuf, uint_t bufsize, uint_t type, const void *pbuf,
{
const struct in6_addr *ipv6;
struct in_addr ip;
+ const uint32_t *bval;
switch (type) {
case OVERLAY_PROP_T_INT:
@@ -10079,6 +10080,17 @@ print_overlay_value(char *outbuf, uint_t bufsize, uint_t type, const void *pbuf,
case OVERLAY_PROP_T_STRING:
(void) snprintf(outbuf, bufsize, "%s", pbuf);
break;
+ case OVERLAY_PROP_T_BOOLEAN:
+ if (psize != sizeof (uint32_t)) {
+ warn("malformed overlay boolean property: wrong byte "
+ "size %d bytes\n", psize);
+ (void) snprintf(outbuf, bufsize, "--");
+ break;
+ }
+ bval = pbuf;
+ (void) snprintf(outbuf, bufsize, "%s", *bval > 0 ? "true" :
+ "false");
+ break;
default:
abort();
}
diff --git a/usr/src/lib/libdladm/common/libdloverlay.c b/usr/src/lib/libdladm/common/libdloverlay.c
index a83105b91c..baee571ee6 100644
--- a/usr/src/lib/libdladm/common/libdloverlay.c
+++ b/usr/src/lib/libdladm/common/libdloverlay.c
@@ -82,6 +82,7 @@ dladm_overlay_parse_prop(overlay_prop_type_t type, void *buf, uint32_t *sizep,
int ret;
int64_t ival;
uint64_t uval;
+ uint32_t bval;
char *eptr;
struct in6_addr ipv6;
struct in_addr ip;
@@ -127,6 +128,17 @@ dladm_overlay_parse_prop(overlay_prop_type_t type, void *buf, uint32_t *sizep,
bcopy(&ipv6, buf, sizeof (struct in6_addr));
*sizep = sizeof (struct in6_addr);
break;
+ case OVERLAY_PROP_T_BOOLEAN:
+ /*
+ * Booleans are spelled exactly "true" or "false" and are
+ * handed back as a uint32_t holding 1 or 0.
+ */
+ if (strcmp(val, "true") == 0) {
+ bval = 1;
+ } else if (strcmp(val, "false") == 0) {
+ bval = 0;
+ } else {
+ return (DLADM_STATUS_BADARG);
+ }
+ bcopy(&bval, buf, sizeof (bval));
+ *sizep = sizeof (bval);
+ break;
default:
abort();
}
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index cc8c489c8c..733c65ea29 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -2254,6 +2254,8 @@ struct ip_xmit_attr_s {
#define IXAF_VERIFY_ZCOPY 0x400000000 /* Check Zero Copy capability */
#define IXAF_ZCOPY_CAPAB 0x800000000 /* Capable of ZEROCOPY */
+#define IXAF_SKIP_ULP_CKSUM 0x1000000000 /* Checksum IP, but skip ULP */
+
/*
* The normal flags for sending packets e.g., icmp errors
*/
diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c
index b4bff4d7b4..6b7b1a9f33 100644
--- a/usr/src/uts/common/inet/ip/conn_opt.c
+++ b/usr/src/uts/common/inet/ip/conn_opt.c
@@ -1201,6 +1201,7 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
case T_IP_TOS:
case IP_TTL:
case IP_DONTFRAG:
+ case IP_BOUND_IF:
break;
default:
return (EINVAL);
@@ -2454,7 +2455,7 @@ ip_attr_newdst(ip_xmit_attr_t *ixa)
{
ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM |
IXAF_NO_TTL_CHANGE | IXAF_IPV6_ADD_FRAGHDR |
- IXAF_NO_LOOP_ZONEID_SET);
+ IXAF_NO_LOOP_ZONEID_SET | IXAF_SKIP_ULP_CKSUM);
}
/*
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 7d3125f2a3..917e526bb1 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -19138,3 +19138,156 @@ ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
return (0);
}
+
+/*
+ * Find the ipif that owns the local address connp is bound to.
+ *
+ * Returns 0 and stores the ipif in *ipifp on success; the callers in this
+ * file release it with ipif_refrele() once they are done with it. Returns
+ * EINVAL if the conn is neither IPv4 nor IPv6, and ENOENT if no ipif matches
+ * the bound source address.
+ */
+static int
+ip_bindif_getipif(conn_t *connp, ipif_t **ipifp)
+{
+ in6_addr_t laddrv6;
+ in_addr_t laddrv4;
+ ushort_t ipvers;
+ ipif_t *ipif;
+ ip_stack_t *ipst;
+
+ ipst = connp->conn_netstack->netstack_ip;
+
+ /*
+ * The caller has made sure that this socket is bound before calling.
+ * This makes it safe to cache this data and not hold the conn lock
+ * across this operation.
+ */
+ mutex_enter(&connp->conn_lock);
+ ipvers = connp->conn_ipversion;
+ if (ipvers == IPV4_VERSION) {
+ laddrv4 = connp->conn_saddr_v4;
+ } else if (ipvers == IPV6_VERSION) {
+ laddrv6 = connp->conn_saddr_v6;
+ } else {
+ mutex_exit(&connp->conn_lock);
+ return (EINVAL);
+ }
+ mutex_exit(&connp->conn_lock);
+
+ /* Only the address family captured above selects the lookup. */
+ if (ipvers == IPV4_VERSION) {
+ ipif = ipif_lookup_addr_nondup(laddrv4, NULL, ALL_ZONES, ipst);
+ } else {
+ ipif = ipif_lookup_addr_nondup_v6(&laddrv6, NULL, ALL_ZONES,
+ ipst);
+ }
+
+ if (ipif == NULL) {
+ return (ENOENT);
+ }
+
+ *ipifp = ipif;
+ return (0);
+}
+
+/*
+ * Report, through *ifindex, the interface index of the ill that carries the
+ * address connp is bound to.
+ *
+ * Returns 0 on success, EINVAL for NULL arguments, ENOTSUP when the address
+ * lives on a VNI, IPMP or loopback ill, or whatever error the ipif lookup
+ * produced.
+ */
+int
+ip_bindif_ifindex(conn_t *connp, uint_t *ifindex)
+{
+ ipif_t *bound;
+ ill_t *ill;
+ int err;
+
+ if (connp == NULL || ifindex == NULL)
+ return (EINVAL);
+
+ err = ip_bindif_getipif(connp, &bound);
+ if (err != 0)
+ return (err);
+
+ ill = bound->ipif_ill;
+ if (IS_VNI(ill) || IS_IPMP(ill) || IS_LOOPBACK(ill)) {
+ err = ENOTSUP;
+ } else {
+ /* Read the index under the ill lock, as err is already 0. */
+ mutex_enter(&ill->ill_lock);
+ *ifindex = ill->ill_phyint->phyint_ifindex;
+ mutex_exit(&ill->ill_lock);
+ }
+
+ /* The lookup succeeded, so there is always an ipif to release. */
+ ipif_refrele(bound);
+ return (err);
+}
+
+/*
+ * Report the transmit offload capabilities of the ill that carries the
+ * address connp is bound to.
+ *
+ * On success returns 0 with *hckflags set to the ill's transmit checksum
+ * flags and *lsoflags / *lsomax set to its LSO flags and maximum; each is
+ * zero when the corresponding capability is not usable. Returns EINVAL for
+ * NULL arguments, ENOTSUP for VNI, IPMP and loopback ills, or the error from
+ * the ipif lookup.
+ */
+int
+ip_bindif_hwcaps(conn_t *connp, uint_t *hckflags, uint_t *lsoflags,
+ uint_t *lsomax)
+{
+ ipif_t *ipif;
+ int ret;
+
+ if (connp == NULL || hckflags == NULL || lsoflags == NULL ||
+ lsomax == NULL) {
+ return (EINVAL);
+ }
+
+ /*
+ * The conn-to-ipif lookup is the same one ip_bindif_ifindex() needs,
+ * so use the shared helper instead of repeating it here.
+ */
+ if ((ret = ip_bindif_getipif(connp, &ipif)) != 0) {
+ return (ret);
+ }
+
+ if (IS_VNI(ipif->ipif_ill) || IS_IPMP(ipif->ipif_ill) ||
+ IS_LOOPBACK(ipif->ipif_ill)) {
+ ret = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * XXX We should consider entering the ipsq here via ipsq_enter().
+ * There's really no good way to get a consistent snapshot of the
+ * hardware capabilities from an ill. We'll revisit this when we need
+ * to deal with getting updates.
+ */
+ if (ILL_LSO_USABLE(ipif->ipif_ill)) {
+ ill_lso_capab_t *lsop = ipif->ipif_ill->ill_lso_capab;
+ *lsoflags = lsop->ill_lso_flags;
+ *lsomax = lsop->ill_lso_max;
+ } else {
+ *lsoflags = 0;
+ *lsomax = 0;
+ }
+
+ if (ILL_HCKSUM_CAPABLE(ipif->ipif_ill)) {
+ ill_hcksum_capab_t *hck = ipif->ipif_ill->ill_hcksum_capab;
+ *hckflags = hck->ill_hcksum_txflags;
+ } else {
+ *hckflags = 0;
+ }
+
+ ret = 0;
+out:
+ /* The lookup succeeded, so ipif is always valid here. */
+ ipif_refrele(ipif);
+ return (ret);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c
index 5caa043a35..690f39e0dc 100644
--- a/usr/src/uts/common/inet/ip/ip_output.c
+++ b/usr/src/uts/common/inet/ip/ip_output.c
@@ -1604,7 +1604,7 @@ ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
}
/*
- * Calculate a checksum ignoring any hardware capabilities
+ * Calculate a checksum ignoring any hardware capabilities.
*
* Returns B_FALSE if the packet was too short for the checksum. Caller
* should free and do stats.
@@ -1621,8 +1621,14 @@ ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
ipaddr_t dst = ipha->ipha_dst;
ipaddr_t src = ipha->ipha_src;
- /* Just in case it contained garbage */
- DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
+ /*
+ * Just in case it contained garbage. There may be valid flags if this
+ * is a tunneled packet.
+ */
+ DB_CKSUMFLAGS(mp) &= ~HCK_OUTER_FLAGS;
+
+ if ((ixa->ixa_flags & IXAF_SKIP_ULP_CKSUM) != 0)
+ goto ip_hdr_cksum;
/*
* Calculate ULP checksum
@@ -1688,6 +1694,7 @@ ip_hdr_cksum:
* Calculate the ULP checksum - try to use hardware.
* In the case of MULTIRT, broadcast or multicast the
* IXAF_NO_HW_CKSUM is set in which case we use software.
+ * If IXAF_SKIP_ULP_CKSUM is set, only do the IP checksum.
*
* If the hardware supports IP header checksum offload; then clear the
* contents of IP header checksum field as expected by NIC.
@@ -1702,10 +1709,11 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
{
uint_t pktlen = ixa->ixa_pktlen;
uint16_t *cksump;
- uint16_t hck_flags;
+ uint16_t hck_flags, mp_hck_flags, ttype;
uint32_t cksum;
uint8_t protocol = ixa->ixa_protocol;
uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
+ boolean_t can_inet, can_full, can_partial;
if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
!dohwcksum) {
@@ -1713,6 +1721,13 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
}
/*
+ * If we've been asked to skip the ULP checksum, then just let IP do its
+ * business.
+ */
+ if ((ixa->ixa_flags & IXAF_SKIP_ULP_CKSUM) != 0)
+ goto ip_hdr_cksum;
+
+ /*
* Calculate ULP checksum. Note that we don't use cksump and cksum
* if the ill has FULL support.
*/
@@ -1753,11 +1768,34 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
* the payload; leave the payload checksum for the hardware to
* calculate. N.B: We only need to set up checksum info on the
* first mblk.
+ *
+ * We must check to see if an inner checksum has already been
+ * computed. If so, we need to look at different hardware flags
+ * to determine if we can perform full or partial checksums.
*/
hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
- DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
- if (hck_flags & HCKSUM_INET_FULL_V4) {
+ mp_hck_flags = DB_CKSUMFLAGS(mp);
+ ttype = (DB_TTYPEFLAGS(mp) & TTYPE_MASK) >> TTYPE_SHIFT;
+ if ((mp_hck_flags & HCK_INNER_FLAGS_NEEDED) != 0) {
+ switch (ttype) {
+ case TTYPE_VXLAN:
+ can_inet = (hck_flags & HCKSUM_TUNNEL_VXLAN_OIP) != 0;
+ can_full = (hck_flags & HCKSUM_VXLAN_FULL) != 0;
+ can_partial = (hck_flags & HCKSUM_VXLAN_PSEUDO) != 0;
+ break;
+ default:
+ can_inet = B_FALSE;
+ can_full = B_FALSE;
+ can_partial = B_FALSE;
+ }
+ } else {
+ can_inet = (hck_flags & HCKSUM_IPHDRCKSUM) != 0;
+ can_full = (hck_flags & HCKSUM_INET_FULL_V4) != 0;
+ can_partial = (hck_flags & HCKSUM_INET_PARTIAL) != 0;
+ }
+ DB_CKSUMFLAGS(mp) &= ~HCK_OUTER_FLAGS;
+ if (can_full) {
/*
* Hardware calculates pseudo-header, header and the
* payload checksums, so clear the checksum field in
@@ -1767,14 +1805,14 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
ipha->ipha_hdr_checksum = 0;
- if (hck_flags & HCKSUM_IPHDRCKSUM) {
+ if (can_inet) {
DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
} else {
ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
}
return (B_TRUE);
}
- if ((hck_flags) & HCKSUM_INET_PARTIAL) {
+ if (can_partial) {
ipaddr_t dst = ipha->ipha_dst;
ipaddr_t src = ipha->ipha_src;
/*
@@ -1803,7 +1841,7 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
ipha->ipha_hdr_checksum = 0;
- if (hck_flags & HCKSUM_IPHDRCKSUM) {
+ if (can_inet) {
DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
} else {
ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h
index a625ec9238..e705b6ee08 100644
--- a/usr/src/uts/common/inet/ip_if.h
+++ b/usr/src/uts/common/inet/ip_if.h
@@ -493,6 +493,9 @@ extern int ipif_arp_up(ipif_t *, enum ip_resolver_action, boolean_t);
extern void ipif_dup_recovery(void *);
extern void ipif_do_recovery(ipif_t *);
+extern int ip_bindif_hwcaps(conn_t *, uint_t *, uint_t *, uint_t *);
+extern int ip_bindif_ifindex(conn_t *, uint_t *);
+
/*
* Notes on reference tracing on ill, ipif, ire, nce data structures:
*
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index a88bac932c..1222c68e83 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -79,6 +79,7 @@
#include <inet/ipnet.h>
#include <sys/vxlan.h>
#include <inet/inet_hash.h>
+#include <sys/pattr.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
@@ -347,6 +348,11 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
typedef union T_primitives *t_primp_t;
+typedef enum udp_hash_type {
+ UDP_HASH_NONE,
+ UDP_HASH_VXLAN
+} udp_hash_type_t;
+
/*
* Various protocols that encapsulate UDP have no real use for the source port.
* Instead, they want to vary the source port to provide better equal-cost
@@ -369,7 +375,7 @@ typedef union T_primitives *t_primp_t;
* hashed. That should be an uncommon event.
*/
uint16_t
-udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max,
+udp_srcport_hash(mblk_t *mp, udp_hash_type_t type, uint16_t min, uint16_t max,
uint16_t def)
{
size_t szused = 0;
@@ -1566,6 +1572,47 @@ udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
return (B_TRUE);
}
+/*
+ * Fill in *optp for a UDP_TUNNEL get request.
+ *
+ * Returns sizeof (udp_tunnel_opt_t) on success; a socket that was never
+ * marked as a tunnel gets an all-zero structure. Returns -1 if hardware
+ * capabilities were requested but could not be obtained.
+ */
+static int
+udp_do_opt_tunnel_get(conn_t *connp, udp_t *udp, udp_tunnel_opt_t *optp)
+{
+ uint_t cksum_caps, lso_caps, lso_limit;
+
+ /* Snapshot the tunnel bits while the conn lock is held. */
+ mutex_enter(&connp->conn_lock);
+ bzero(optp, sizeof (udp_tunnel_opt_t));
+ if (udp->udp_tunnel != 0) {
+ optp->uto_type = UDP_TUNNEL_VXLAN;
+ if (udp->udp_vxlanhash != 0)
+ optp->uto_opts |= UDP_TUNNEL_OPT_SRCPORT_HASH;
+ if (udp->udp_tunnel_hwcap != 0)
+ optp->uto_opts |= UDP_TUNNEL_OPT_HWCAP;
+ if (udp->udp_skip_cksum != 0)
+ optp->uto_opts |= UDP_TUNNEL_OPT_RELAX_CKSUM;
+ }
+ mutex_exit(&connp->conn_lock);
+
+ /* Without the hwcap option there is nothing further to report. */
+ if ((optp->uto_opts & UDP_TUNNEL_OPT_HWCAP) == 0)
+ return (sizeof (udp_tunnel_opt_t));
+
+ /* The capability query is made after the conn lock is dropped. */
+ if (ip_bindif_hwcaps(connp, &cksum_caps, &lso_caps, &lso_limit) != 0)
+ return (-1);
+
+ optp->uto_cksum_flags = cksum_caps;
+ optp->uto_lso_flags = lso_caps;
+ optp->uto_lso_max = lso_limit;
+
+ return (sizeof (udp_tunnel_opt_t));
+}
+
/*
* This routine gets default values of certain options whose default
* values are maintained by protcol specific code
@@ -1668,11 +1715,9 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name,
*i1 = udp->udp_rcvhdr ? 1 : 0;
mutex_exit(&connp->conn_lock);
return (sizeof (int));
- case UDP_SRCPORT_HASH:
- mutex_enter(&connp->conn_lock);
- *i1 = udp->udp_vxlanhash;
- mutex_exit(&connp->conn_lock);
- return (sizeof (int));
+ case UDP_TUNNEL:
+ return (udp_do_opt_tunnel_get(connp, udp,
+ (udp_tunnel_opt_t *)ptr));
case UDP_SND_TO_CONNECTED:
mutex_enter(&connp->conn_lock);
*i1 = udp->udp_snd_to_conn ? 1 : 0;
@@ -1700,6 +1745,111 @@ udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
return (err);
}
+/*
+ * Handle a UDP_TUNNEL set request: mark the socket as a VXLAN tunnel and
+ * apply the options requested in optp->uto_opts.
+ *
+ * Returns 0 on success. Returns EINVAL for an unknown tunnel type or option
+ * bit, an unbound socket, a multicast/broadcast bind, or a relaxed-checksum
+ * request on a non-IPv4 socket; EEXIST if the socket is already a tunnel; or
+ * the error from looking up / binding to the interface when hardware
+ * capabilities were requested. On any failure the socket is left unmarked so
+ * that the request can be retried.
+ */
+static int
+udp_do_opt_tunnel_set(conn_opt_arg_t *coa, cred_t *cr, udp_tunnel_opt_t *optp)
+{
+ conn_t *connp = coa->coa_connp;
+ udp_t *udp = connp->conn_udp;
+
+ if (optp->uto_type != UDP_TUNNEL_VXLAN)
+ return (EINVAL);
+
+ if ((optp->uto_opts & ~(UDP_TUNNEL_OPT_SRCPORT_HASH |
+ UDP_TUNNEL_OPT_HWCAP | UDP_TUNNEL_OPT_RELAX_CKSUM)) != 0)
+ return (EINVAL);
+
+ mutex_enter(&connp->conn_lock);
+
+ if (udp->udp_tunnel != 0) {
+ mutex_exit(&connp->conn_lock);
+ return (EEXIST);
+ }
+
+ /*
+ * Check to make sure the caller has already called bind(2) on this
+ * socket. If not, this is not acceptable.
+ */
+ if (udp->udp_state < TS_IDLE) {
+ mutex_exit(&connp->conn_lock);
+ return (EINVAL);
+ }
+
+ /*
+ * For now, don't allow multicast / broadcast. In the future if we do
+ * interface binding with this, then that's fine.
+ */
+ if (connp->conn_mcbc_bind) {
+ mutex_exit(&connp->conn_lock);
+ return (EINVAL);
+ }
+
+ if ((optp->uto_opts & UDP_TUNNEL_OPT_RELAX_CKSUM) != 0 &&
+ connp->conn_ipversion != IPV4_VERSION) {
+ mutex_exit(&connp->conn_lock);
+ return (EINVAL);
+ }
+
+ /*
+ * Set the fact that this is tunneled. We'll leave actually fetching the
+ * information to the getsockopt.
+ */
+ udp->udp_tunnel = 1;
+
+ /*
+ * We trust that the caller has asked for strict binding.
+ */
+ if ((optp->uto_opts & UDP_TUNNEL_OPT_HWCAP) != 0) {
+ uint_t ifindex;
+ int ret;
+ t_scalar_t proto, cmd;
+
+ if (connp->conn_ipversion == IPV4_VERSION) {
+ proto = IPPROTO_IP;
+ cmd = IP_BOUND_IF;
+ } else {
+ proto = IPPROTO_IPV6;
+ cmd = IPV6_BOUND_IF;
+ }
+ mutex_exit(&connp->conn_lock);
+
+ /*
+ * Try and set up the strict binding to the listen interface.
+ */
+ ret = ip_bindif_ifindex(connp, &ifindex);
+ if (ret == 0) {
+ ret = conn_opt_set(coa, proto, cmd, sizeof (ifindex),
+ (uchar_t *)&ifindex, B_FALSE, cr);
+ }
+
+ /*
+ * Whichever step failed, undo the tunnel marking made above.
+ * Otherwise the socket would stay flagged as a tunnel and
+ * every later attempt would fail with EEXIST.
+ */
+ if (ret != 0) {
+ mutex_enter(&connp->conn_lock);
+ udp->udp_tunnel = 0;
+ mutex_exit(&connp->conn_lock);
+ return (ret);
+ }
+
+ mutex_enter(&connp->conn_lock);
+ udp->udp_tunnel_hwcap = 1;
+ }
+
+ if ((optp->uto_opts & UDP_TUNNEL_OPT_SRCPORT_HASH) != 0) {
+ udp->udp_vxlanhash = 1;
+ }
+
+ /*
+ * We only relax the checksum when using IPv4. UDP over IPv6 is required
+ * to have a checksum.
+ */
+ if ((optp->uto_opts & UDP_TUNNEL_OPT_RELAX_CKSUM) != 0 &&
+ connp->conn_ipversion == IPV4_VERSION) {
+ udp->udp_skip_cksum = 1;
+ }
+
+ mutex_exit(&connp->conn_lock);
+
+ return (0);
+}
+
/*
* This routine sets socket options.
*/
@@ -1813,31 +1963,20 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
udp->udp_rcvhdr = onoff;
mutex_exit(&connp->conn_lock);
return (0);
- case UDP_SRCPORT_HASH:
- /*
- * This should have already been verified, but double
- * check.
- */
- if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
- return (error);
- }
-
- /* First see if the val is something we understand */
- if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN)
- return (EINVAL);
-
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- udp->udp_vxlanhash = *i1;
- mutex_exit(&connp->conn_lock);
- }
- /* Fully handled this option. */
- return (0);
case UDP_SND_TO_CONNECTED:
mutex_enter(&connp->conn_lock);
udp->udp_snd_to_conn = onoff;
mutex_exit(&connp->conn_lock);
return (0);
+ case UDP_TUNNEL:
+ if (cr != kcred) {
+ return (EPERM);
+ }
+
+ if (checkonly)
+ return (0);
+ return (udp_do_opt_tunnel_set(coa, cr,
+ (udp_tunnel_opt_t *)invalp));
}
break;
}
@@ -2106,6 +2245,35 @@ udp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
}
/*
+ * If the message block that we're operating on belongs to an overlay device,
+ * then it may have information in the checksum and lso headers that we care
+ * about and need to move to the template message block.
+ */
+static void
+udp_prepend_tunnel_attr(udp_t *udp, const mblk_t *src, mblk_t *dst)
+{
+ uint16_t inner_ck;
+
+ /*
+ * Nothing is copied unless the socket is marked as a tunnel and the
+ * source block is plain data.
+ */
+ /* XXX Maybe assert? */
+ if (udp->udp_tunnel == 0 || DB_TYPE(src) != M_DATA)
+ return;
+
+ /* Carry over only the inner checksum flags; OR-ing in zero is a no-op. */
+ inner_ck = DB_CKSUMFLAGS(src) & HCK_INNER_FLAGS;
+ DB_CKSUMFLAGS(dst) |= inner_ck;
+
+ /* An LSO request travels together with its MSS. */
+ if ((DB_LSOFLAGS(src) & HW_LSO) != 0) {
+ DB_LSOFLAGS(dst) |= HW_LSO;
+ DB_LSOMSS(dst) = DB_LSOMSS(src);
+ }
+
+ /* Propagate the tunnel type bits as well. */
+ DB_TTYPEFLAGS(dst) |= (DB_TTYPEFLAGS(src) & TTYPE_MASK);
+}
+
+/*
* Setup IP and UDP headers.
* Returns NULL on allocation failure, in which case data_mp is freed.
*/
@@ -2123,7 +2291,7 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
boolean_t insert_spi = udp->udp_nat_t_endpoint;
boolean_t hash_srcport = udp->udp_vxlanhash;
uint_t ulp_hdr_len;
- uint16_t srcport;
+ uint16_t srcport, ckflags;
data_len = msgdsize(data_mp);
ulp_hdr_len = UDPH_SIZE;
@@ -2146,6 +2314,9 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
ASSERT(*errorp != 0);
return (NULL);
}
+ if (mp != data_mp) {
+ udp_prepend_tunnel_attr(udp, data_mp, mp);
+ }
data_len += ulp_hdr_len;
ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
@@ -2182,7 +2353,9 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
/* IP does the checksum if uha_checksum is non-zero */
- if (us->us_do_checksum) {
+ if (udp->udp_skip_cksum) {
+ udpha->uha_checksum = 0;
+ } else if (us->us_do_checksum) {
if (cksum == 0)
udpha->uha_checksum = 0xffff;
else
@@ -2201,6 +2374,7 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
}
/* Insert all-0s SPI now. */
+skip_cksum:
if (insert_spi)
*((uint32_t *)(udpha + 1)) = 0;
@@ -2884,6 +3058,11 @@ udp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
dstport = connp->conn_fport;
flowinfo = connp->conn_flowinfo;
}
+
+ if (udp->udp_skip_cksum != 0) {
+ ixa->ixa_flags |= IXAF_SKIP_ULP_CKSUM;
+ }
+
mutex_exit(&connp->conn_lock);
/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
@@ -3377,6 +3556,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
*errorp = ENOMEM;
return (NULL);
}
+
+ udp_prepend_tunnel_attr(udp, mp, mp1);
+
mp1->b_wptr = DB_LIM(mp1);
mp1->b_cont = mp;
mp = mp1;
@@ -3411,8 +3593,11 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
ipha->ipha_length = htons((uint16_t)pktlen);
/* IP does the checksum if uha_checksum is non-zero */
- if (us->us_do_checksum)
+ if (udp->udp_skip_cksum) {
+ udpha->uha_checksum = 0;
+ } else if (us->us_do_checksum) {
udpha->uha_checksum = htons(cksum);
+ }
/* if IP_PKTINFO specified an addres it wins over bind() */
if ((ipp->ipp_fields & IPPF_ADDR) &&
@@ -3915,6 +4100,11 @@ udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
}
}
}
+
+ if (udp->udp_skip_cksum != 0) {
+ ixa->ixa_flags |= IXAF_SKIP_ULP_CKSUM;
+ }
+
/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
ip_pkt_t *ipp = &connp->conn_xmit_ipp;
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index 847e2cdde6..ad3ea48956 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -293,9 +293,10 @@ opdes_t udp_opt_arr[] = {
},
{ UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int),
0 },
-{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 },
{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
- 0 }
+ 0 },
+{ UDP_TUNNEL, IPPROTO_UDP, 0, OA_RW, OP_CONFIG, OP_NODEFAULT,
+ sizeof (udp_tunnel_opt_t), 0 }
};
/*
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index ebba10c0f7..3fccefb119 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -179,12 +179,15 @@ typedef struct udp_s {
udp_issocket : 1, /* socket mode; sockfs is on top */
udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */
udp_rcvhdr : 1, /* UDP_RCVHDR option */
- udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */
+ udp_vxlanhash: 1, /* Perform source port hashing */
/* Because there's only VXLAN, cheat */
/* and only use a single bit */
udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */
+ udp_tunnel: 1, /* UDP_TUNNEL called */
+ udp_tunnel_hwcap: 1, /* UDP_TUNNEL asked for strict bind */
+ udp_skip_cksum: 1, /* UDP_TUNNEL asked for no checksum */
- udp_pad_to_bit_31 : 27;
+ udp_pad_to_bit_31 : 25;
/* Following 2 fields protected by the uf_lock */
struct udp_s *udp_bind_hash; /* Bind hash chain */
diff --git a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c
index 0ec67c8d19..e8fa96d1cd 100644
--- a/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c
+++ b/usr/src/uts/common/io/cxgbe/t4nex/t4_mac.c
@@ -833,7 +833,7 @@ t4_mc_getcapab(void *arg, mac_capab_t cap, void *data)
case MAC_CAPAB_HCKSUM:
if (pi->features & CXGBE_HW_CSUM) {
uint32_t *d = data;
- *d = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM;
+ *d = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM;
} else
status = B_FALSE;
break;
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index 1ee00681fc..d6f57091b4 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -1514,6 +1514,15 @@ dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags)
/* translate the flag for mac clients */
if ((mac_lso.lso_flags & LSO_TX_BASIC_TCP_IPV4) != 0)
lso->lso_flags |= DLD_LSO_BASIC_TCP_IPV4;
+ /* XXX We should probably not rely on equality */
+ if ((mac_lso.lso_flags & LSO_TX_VXLAN_TCP) != 0 &&
+ mac_lso.lso_vxlan_tcp.lso_tcpv4_max == lso->lso_max) {
+ lso->lso_flags |= DLD_LSO_VXLAN_TCP_IPV4;
+ }
+ if ((mac_lso.lso_flags & LSO_TX_VXLAN_TCP) != 0 &&
+ mac_lso.lso_vxlan_tcp.lso_tcpv6_max == lso->lso_max) {
+ lso->lso_flags |= DLD_LSO_VXLAN_TCP_IPV6;
+ }
dsp->ds_lso = B_TRUE;
dsp->ds_lso_max = lso->lso_max;
} else {
diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c
index b319459cd3..5e488c4606 100644
--- a/usr/src/uts/common/io/i40e/i40e_gld.c
+++ b/usr/src/uts/common/io/i40e/i40e_gld.c
@@ -730,14 +730,16 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
* HW checksum offload
* Inner L3 | Inner L4 | Outer L3 | Outer L4
* yes | yes | yes | only on x722
- * i.e. this HCKSUM_VXLAN_FULL_NO_OL4, except for x722, but we
- * currently don't break out x722 separately.
+ *
+ * The L4 checksum offload requires that the pseudo-header is
+ * calculated. Hence why we use HCKSUM_INET_PARTIAL and
+ * HCKSUM_VXLAN_PSEUDO_NO_OL4. Eventually we can change this so
+ * on the X722 we use HCKSUM_VXLAN_PSEUDO.
*/
-
*txflags = 0;
if (i40e->i40e_tx_hcksum_enable == B_TRUE)
*txflags = HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM |
- HCKSUM_VXLAN_FULL_NO_OL4;
+ HCKSUM_VXLAN_PSEUDO_NO_OL4;
break;
}
diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c
index c15acbb265..02ebe5218e 100644
--- a/usr/src/uts/common/io/i40e/i40e_main.c
+++ b/usr/src/uts/common/io/i40e/i40e_main.c
@@ -2799,6 +2799,14 @@ i40e_start(i40e_t *i40e, boolean_t alloc)
goto done;
}
+ /* XXX */
+ {
+ enum i40e_status_code r;
+
+ r = i40e_aq_add_udp_tunnel(hw, 4789, 0, NULL, NULL);
+ cmn_err(CE_WARN, "i40e add UDP tunnel: %x", r);
+ }
+
/*
* Finally, make sure that we're happy from an FM perspective.
*/
diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c
index 7b0e181810..72754f4071 100644
--- a/usr/src/uts/common/io/i40e/i40e_transceiver.c
+++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c
@@ -1670,6 +1670,7 @@ typedef struct mac_ether_offload_info {
/*
* The following members are used when tunneling (e.g. vxlan)
*/
+ uint8_t meoi_tun_protlen; /* Length of the tunnel protocol */
uint8_t meoi_tun_l2hlen; /* How long is the Ethernet header? */
uint16_t meoi_tun_l3proto; /* What's the Ethertype */
uint8_t meoi_tun_l3hlen; /* How long is the header? */
@@ -1758,7 +1759,7 @@ i40e_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
static int
mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi,
- boolean_t tunneled, size_t starting_off)
+ uint32_t ttype, size_t starting_off)
{
size_t off;
uint16_t ether;
@@ -1766,6 +1767,9 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi,
bzero(meoi, sizeof (mac_ether_offload_info_t));
+ if (ttype != TTYPE_NONE && ttype != TTYPE_VXLAN)
+ return (-1);
+
off = offsetof(struct ether_header, ether_type) + starting_off;
if (i40e_meoi_get_uint16(mp, off, &ether) != 0)
return (-1);
@@ -1839,7 +1843,7 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi,
meoi->meoi_l4hlen = l4len;
meoi->meoi_flags |= MEOI_L4INFO_SET;
- if (tunneled) {
+ if (ttype == TTYPE_VXLAN) {
/*
* Recursively call ourselves to obtain the tunneled L2/L3/L4
* data, using the proper starting offset to the tunneled
@@ -1854,13 +1858,14 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi,
ASSERT(starting_off == 0);
off = maclen + iplen + l4len + VXLAN_HDR_LEN;
- ret = mac_ether_offload_info(mp, &meo, B_FALSE, off);
+ ret = mac_ether_offload_info(mp, &meo, TTYPE_NONE, off);
if (ret != 0)
return (ret);
if ((meo.meoi_flags & MEOI_L2_L3_L4) != MEOI_L2_L3_L4)
return (-1);
+ meoi->meoi_tun_protlen = VXLAN_HDR_LEN;
meoi->meoi_tun_l2hlen = meo.meoi_l2hlen;
meoi->meoi_tun_l3proto = meo.meoi_l3proto;
meoi->meoi_tun_l3hlen = meo.meoi_l3hlen;
@@ -1882,14 +1887,15 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi,
* 'or' into the descriptor based on the checksum flags for this mblk_t and the
* actual information we care about.
*
- * XXX - update comment
+ * If we're using LSO or need to perform tunneling-based checksums, then we'll
+ * fill in information that will be used for the Transmit Context Descriptor.
*/
static int
i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
i40e_tx_context_t *tctx)
{
int ret;
- uint32_t chkflags, start, mss, lsoflags;
+ uint32_t chkflags, start, mss, lsoflags, ttype;
mac_ether_offload_info_t meo;
i40e_txq_stat_t *txs = &itrq->itrq_txstat;
boolean_t tunneled;
@@ -1901,6 +1907,7 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags);
mac_lso_get(mp, &mss, &lsoflags);
+ mac_tunnel_type_get(mp, &ttype);
if (chkflags == 0 && lsoflags == 0)
return (0);
@@ -1910,10 +1917,15 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
* tunneled packet.
*/
tunneled = (chkflags &
- (HCK_INNER_IPV4_HDRCKSUM_NEEDED | HCK_INNER_FULLCKSUM_NEEDED)) != 0;
+ (HCK_INNER_IPV4_HDRCKSUM_NEEDED | HCK_INNER_PSEUDO_NEEDED)) != 0;
+ if (tunneled && ttype != TTYPE_VXLAN) {
+ /* XXX kstat */
+ return (-1);
+ }
+
tctx->itc_ctx_tunneled = tunneled;
- if ((ret = mac_ether_offload_info(mp, &meo, tunneled, 0)) != 0) {
+ if ((ret = mac_ether_offload_info(mp, &meo, ttype, 0)) != 0) {
txs->itxs_hck_meoifail.value.ui64++;
return (ret);
}
@@ -1942,23 +1954,15 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
* Inner IPv4 checksum if IIPT = 11b
* Outer IPv4 checksum if EIPT = 11b
* L4 checksum if L4LEN is meaningful
- *
- * XXX JJ is the VXLAN_HDR_LEN properly accounted for?
- * XXX JJ do I need to set something in the DECTTL field?
*/
uint8_t eipt;
- uint_t l4tunlen = meo.meoi_l4hlen + meo.meoi_tun_l2hlen;
+ uint_t l4tunlen;
/*
- * Tunneling implies inner checksumming is requested, but that
- * is current only supported when the outer L4 proto is UDP.
+ * The MAC ether offload logic should have verified that we have
+ * the right information for calculating the checksums here.
+ * Make sure that this is the case.
*/
- if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0 ||
- meo.meoi_l4proto != IPPROTO_UDP ||
- (meo.meoi_flags & MEOI_TUNNEL_INFO_SET) == 0) {
- txs->itxs_hck_badl4.value.ui64++;
- return (-1);
- }
if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) {
txs->itxs_hck_nol2info.value.ui64++;
return (-1);
@@ -1968,21 +1972,31 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
return (-1);
}
- if (chkflags & (HCK_FULLCKSUM | HCK_PARTIALCKSUM)) {
+ if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0 ||
+ meo.meoi_l4proto != IPPROTO_UDP) {
+ txs->itxs_hck_badl4.value.ui64++;
+ return (-1);
+ }
+
+ if ((meo.meoi_flags & MEOI_TUNNEL_INFO_SET) == 0) {
+ /* XXX Missing kstat */
+ return (-1);
+ }
+
+ if ((chkflags & HCK_PARTIALCKSUM) != 0) {
/*
* There is no HW support for outer checksum other than
* the (outer) HCK_IPV4_HDRCKSUM.
- * Note: no kstat for invalid request.
+ * XXX missing kstat
*/
return (-1);
}
- /* L4TUNT is UDP */
- tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
-
- /* The MAC len is for the outer, irregardless of tunneling */
- tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) <<
- I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
+ /*
+ * First fill in the descriptors for the tunneling extensions.
+ */
+ l4tunlen = meo.meoi_l4hlen + meo.meoi_tun_l2hlen +
+ meo.meoi_tun_protlen;;
/* outer IP */
if (chkflags & HCK_IPV4_HDRCKSUM) {
@@ -2003,6 +2017,17 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
}
}
+ tctx->itc_ctx_tunnel_fld =
+ I40E_TXD_TNL_SET_EIPT(eipt) |
+ I40E_TXD_TNL_SET_EIPLEN(meo.meoi_l3hlen >> 2) |
+ I40E_TXD_TNL_SET_L4TUNT(I40E_TX_DESC_TNL_L4TUNT_UDP) |
+ I40E_TXD_TNL_SET_L4TUNLEN(l4tunlen >> 1) |
+ I40E_TXD_TNL_SET_DECTTL(0);
+
+ /* The MAC len is for the outer, regardless of tunneling */
+ tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) <<
+ I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
+
/* inner IP */
if (chkflags & HCK_INNER_IPV4_HDRCKSUM_NEEDED) {
/* When tunneled, IIPT applies to the inner IP (L3) */
@@ -2012,13 +2037,24 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
}
tctx->itc_data_cmdflags |=
I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
+ } else {
+ if (meo.meoi_l3proto == ETHERTYPE_IP) {
+ tctx->itc_data_cmdflags |=
+ I40E_TX_DESC_CMD_IIPT_IPV4;
+ } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) {
+ tctx->itc_data_cmdflags |=
+ I40E_TX_DESC_CMD_IIPT_IPV6;
+ } else {
+ txs->itxs_hck_badl3.value.ui64++;
+ return (-1);
+ }
}
/* set the inner IP header length */
tctx->itc_data_offsets |= (meo.meoi_tun_l3hlen >> 2) <<
I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
- if (chkflags & HCK_INNER_FULLCKSUM_NEEDED) {
+ if (chkflags & HCK_INNER_PSEUDO_NEEDED) {
/* L4T */
switch (meo.meoi_tun_l4proto) {
case IPPROTO_TCP:
@@ -2043,13 +2079,6 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp,
I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
}
- tctx->itc_ctx_tunnel_fld =
- I40E_TXD_TNL_SET_EIPT(eipt) |
- I40E_TXD_TNL_SET_EIPLEN(meo.meoi_l3hlen) |
- I40E_TXD_TNL_SET_L4TUNT(I40E_TX_DESC_TNL_L4TUNT_UDP) |
- I40E_TXD_TNL_SET_L4TUNLEN(l4tunlen) |
- I40E_TXD_TNL_SET_DECTTL(0);
-
} else {
/* Not tunneled */
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
index 73e7daac5f..a437c58b7e 100644
--- a/usr/src/uts/common/io/mac/mac_provider.c
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -1559,3 +1559,11 @@ mac_transceiver_info_set_usable(mac_transceiver_info_t *infop,
{
infop->mti_usable = usable;
}
+
+void
+mac_tunnel_type_get(const mblk_t *mp, uint32_t *typep)
+{
+ ASSERT(DB_TYPE(mp) == M_DATA);
+ /* Mask out the TTYPE_* bits of the flags word and shift them down. */
+ *typep = (DB_TTYPEFLAGS(mp) & TTYPE_MASK) >> TTYPE_SHIFT;
+}
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c
index 3f34ec3b58..36c4fd38b1 100644
--- a/usr/src/uts/common/io/overlay/overlay.c
+++ b/usr/src/uts/common/io/overlay/overlay.c
@@ -817,6 +817,7 @@
#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/vlan.h>
+#include <sys/pattr.h>
#include <sys/overlay_impl.h>
@@ -830,15 +831,17 @@ typedef enum overlay_dev_prop {
OVERLAY_DEV_P_MTU = 0,
OVERLAY_DEV_P_VNETID,
OVERLAY_DEV_P_ENCAP,
- OVERLAY_DEV_P_VARPDID
+ OVERLAY_DEV_P_VARPDID,
+ OVERLAY_DEV_P_STRICTIF
} overlay_dev_prop_t;
-#define OVERLAY_DEV_NPROPS 4
+#define OVERLAY_DEV_NPROPS 5
static const char *overlay_dev_props[] = {
"mtu",
"vnetid",
"encap",
- "varpd/id"
+ "varpd/id",
+ "mux/bound"
};
#define OVERLAY_MTU_MIN 576
@@ -973,7 +976,7 @@ overlay_m_start(void *arg)
return (ret);
mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
- (struct sockaddr *)&storage, slen, &ret);
+ (struct sockaddr *)&storage, slen, odd->odd_strictif, &ret);
if (mux == NULL)
return (ret);
@@ -984,6 +987,12 @@ overlay_m_start(void *arg)
odd->odd_flags |= OVERLAY_F_IN_MUX;
mutex_exit(&odd->odd_lock);
+ /*
+ * Now that we're in the MUX trigger MAC to rescan our capabilities,
+ * which is important for VNICs on top of us.
+ */
+ mac_capab_update(odd->odd_mh);
+
return (0);
}
@@ -1044,6 +1053,28 @@ overlay_m_unicast(void *arg, const uint8_t *macaddr)
return (0);
}
+static inline void
+overlay_tx_checksum_shift(mblk_t *source, mblk_t *target)
+{
+ uint32_t oflags, nflags = 0;
+ /* Read the flags requested on source, then reset them to 0 there. */
+ mac_hcksum_get(source, NULL, NULL, NULL, NULL, &oflags);
+ mac_hcksum_set(source, 0, 0, 0, 0, 0);
+
+ if ((oflags & HCK_IPV4_HDRCKSUM) != 0)
+ nflags |= HCK_INNER_IPV4_HDRCKSUM_NEEDED;
+ if ((oflags & HCK_FULLCKSUM) != 0) {
+ nflags |= HCK_INNER_FULLCKSUM_NEEDED;
+ } else if ((oflags & HCK_PARTIALCKSUM) != 0) {
+ nflags |= HCK_INNER_PSEUDO_NEEDED;
+ }
+
+ /*
+ * OR the HCK_INNER_* flags in by hand so existing flags on target stay.
+ */
+ DB_CKSUMFLAGS(target) |= nflags;
+}
+
mblk_t *
overlay_m_tx(void *arg, mblk_t *mp_chain)
{
@@ -1095,6 +1126,12 @@ overlay_m_tx(void *arg, mblk_t *mp_chain)
goto out;
}
+ /*
+ * Make sure any checksum flags that ended up on mp from the
+ * lower level are shifted over to ep as inner flags.
+ */
+ overlay_tx_checksum_shift(mp, ep);
+
ep->b_cont = mp;
ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
if (ret != 0)
@@ -1121,12 +1158,50 @@ overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
static boolean_t
overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
+ overlay_dev_t *odd = arg;
+
/*
- * Tell MAC we're an overlay.
+ * Always tell MAC we're an overlay.
*/
if (cap == MAC_CAPAB_OVERLAY)
return (B_TRUE);
- return (B_FALSE);
+
+ /*
+ * Check to see if this is a capability that we'd consider letting a
+ * module know how to ask the mux about.
+ */
+ switch (cap) {
+ case MAC_CAPAB_HCKSUM:
+ case MAC_CAPAB_LSO:
+ break;
+ default:
+ return (B_FALSE);
+ }
+
+ if (odd->odd_plugin->ovp_ops->ovpo_mac_capab == NULL) {
+ return (B_FALSE);
+ }
+
+ /*
+ * Once the device is present in a MUX it will know if it has the
+ * ability to offer various capabilities to underlying hardware. Check
+ * if we're in a mux and if so, offer that to the device. We can rely on
+ * the fact that MAC won't stop us while it's asking us about a
+ * capability to know that we can't be removed from a mux if we're not
+ * in it right now.
+ *
+ * Also, even if we're not in a MUX yet, we will retrigger capability
+ * scans once we are in one.
+ */
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_IN_MUX) == 0) {
+ mutex_exit(&odd->odd_lock);
+ return (B_FALSE);
+ }
+ mutex_exit(&odd->odd_lock);
+
+ return (odd->odd_plugin->ovp_ops->ovpo_mac_capab(odd->odd_pvoid,
+ cap, cap_data, odd->odd_mux->omux_ksock));
}
/* ARGSUSED */
@@ -1359,6 +1434,7 @@ overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
odd->odd_ref = 0;
odd->odd_flags = 0;
+ odd->odd_strictif = B_TRUE;
list_insert_tail(&overlay_dev_list, odd);
mutex_exit(&overlay_dev_lock);
@@ -1615,6 +1691,7 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
uint_t propid = UINT_MAX;
overlay_ioc_propinfo_t *oip = karg;
overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
+ const uint32_t def_true = 1;
odd = overlay_hold_by_dlid(oip->oipi_linkid);
if (odd == NULL)
@@ -1695,6 +1772,11 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
overlay_prop_set_nodefault(phdl);
break;
+ case OVERLAY_DEV_P_STRICTIF:
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_BOOLEAN);
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_default(phdl, &def_true, sizeof (def_true));
+ break;
default:
overlay_hold_rele(odd);
mac_perim_exit(mph);
@@ -1804,6 +1886,13 @@ overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
}
mutex_exit(&odd->odd_lock);
break;
+ case OVERLAY_DEV_P_STRICTIF:
+ mutex_enter(&odd->odd_lock);
+
+ oip->oip_size = sizeof (odd->odd_strictif);
+ bcopy(&odd->odd_strictif, oip->oip_value, oip->oip_size);
+ mutex_exit(&odd->odd_lock);
+ break;
default:
ret = ENOENT;
}
@@ -1856,6 +1945,7 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
uint_t propid = UINT_MAX;
mac_perim_handle_t mph;
uint64_t maxid, *vidp;
+ uint32_t *boolp;
if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
return (EINVAL);
@@ -1941,6 +2031,22 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
case OVERLAY_DEV_P_VARPDID:
ret = EPERM;
break;
+ case OVERLAY_DEV_P_STRICTIF:
+ if (oip->oip_size != sizeof (uint32_t)) {
+ ret = EINVAL;
+ break;
+ }
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_IN_MUX) != 0) {
+ mutex_exit(&odd->odd_lock);
+ ret = EBUSY;
+ break;
+ }
+
+ boolp = (uint32_t *)oip->oip_value;
+ odd->odd_strictif = *boolp > 0 ? B_TRUE : B_FALSE;
+ mutex_exit(&odd->odd_lock);
+ break;
default:
ret = ENOENT;
}
diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c
index 9f70e8c83e..1f330b622c 100644
--- a/usr/src/uts/common/io/overlay/overlay_mux.c
+++ b/usr/src/uts/common/io/overlay/overlay_mux.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright (c) 2018, Joyent, Inc.
*/
/*
@@ -30,6 +30,7 @@
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/tihdr.h>
+#include <sys/pattr.h>
#include <sys/overlay_impl.h>
@@ -71,6 +72,24 @@ overlay_mux_comparator(const void *a, const void *b)
}
/*
+ * Look at the checksum flags that are set on the block. Hardware may support
+ * checksumming the inner frames. If so, we need to update the checksum flags on
+ * the message block to make sure that it makes sense.
+ */
+static inline void
+overlay_recv_checksum_shift(mblk_t *mp)
+{
+ uint32_t oflags, nflags = 0;
+ /* Map HCK_INNER_*_OK results onto their non-inner equivalents. */
+ mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &oflags);
+ if ((oflags & HCK_INNER_IPV4_HDRCKSUM_OK) != 0)
+ nflags |= HCK_IPV4_HDRCKSUM_OK;
+ if ((oflags & HCK_INNER_FULLCKSUM_OK) != 0)
+ nflags |= HCK_FULLCKSUM_OK;
+ mac_hcksum_set(mp, 0, 0, 0, 0, nflags);
+}
+
+/*
* This is the central receive data path. We need to decode the packet, if we
* can, and then deliver it to the appropriate overlay.
*/
@@ -187,6 +206,8 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
mutex_exit(&odd->odd_lock);
mutex_exit(&mux->omux_lock);
+ overlay_recv_checksum_shift(mp);
+
mac_rx(odd->odd_mh, NULL, mp);
mutex_enter(&odd->odd_lock);
@@ -203,7 +224,7 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
*/
overlay_mux_t *
overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
- struct sockaddr *addr, socklen_t len, int *errp)
+ struct sockaddr *addr, socklen_t len, boolean_t strictif, int *errp)
{
int err;
overlay_mux_t *mux;
@@ -221,7 +242,8 @@ overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
len == mux->omux_alen &&
bcmp(addr, mux->omux_addr, len) == 0) {
- if (opp != mux->omux_plugin) {
+ if (opp != mux->omux_plugin ||
+ strictif != mux->omux_strictif) {
*errp = EEXIST;
return (NULL);
}
@@ -260,7 +282,7 @@ overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
* then ask it to perform any additional socket set up it'd like to do.
*/
if (opp->ovp_ops->ovpo_sockopt != NULL &&
- (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
+ (*errp = opp->ovp_ops->ovpo_sockopt(ksock, strictif)) != 0) {
mutex_exit(&overlay_mux_lock);
ksocket_close(ksock, kcred);
return (NULL);
@@ -273,6 +295,7 @@ overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
mux->omux_domain = domain;
mux->omux_family = family;
mux->omux_protocol = protocol;
+ mux->omux_strictif = strictif;
mux->omux_addr = kmem_alloc(len, KM_SLEEP);
bcopy(addr, mux->omux_addr, len);
mux->omux_alen = len;
diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c
index ba1ea2a629..159fde0f78 100644
--- a/usr/src/uts/common/io/overlay/overlay_prop.c
+++ b/usr/src/uts/common/io/overlay/overlay_prop.c
@@ -54,7 +54,8 @@ overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type)
}
int
-overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len)
+overlay_prop_set_default(overlay_prop_handle_t phdl, const void *def,
+ ssize_t len)
{
overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
index 8b4e4ecb42..a381a0c793 100644
--- a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
+++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
*/
/*
@@ -48,6 +48,9 @@
#include <inet/ip.h>
#include <netinet/in.h>
#include <sys/strsun.h>
+#include <sys/dld.h>
+#include <sys/dlpi.h>
+#include <sys/pattr.h>
#include <netinet/udp.h>
static const char *vxlan_ident = "vxlan";
@@ -64,12 +67,21 @@ static const char *vxlan_props[] = {
NULL
};
+typedef enum vxlan_capab_state {
+ VXLAN_C_UNKNOWN = 0,
+ VXLAN_C_VALID,
+ VXLAN_C_FAILED
+} vxlan_capab_state_t;
+
typedef struct vxlan {
kmutex_t vxl_lock;
overlay_handle_t vxl_oh;
uint16_t vxl_lport;
boolean_t vxl_hladdr;
struct in6_addr vxl_laddr;
+ vxlan_capab_state_t vxl_cstate;
+ int vxl_cstate_err;
+ udp_tunnel_opt_t vxl_utunnel;
} vxlan_t;
static int
@@ -77,12 +89,14 @@ vxlan_o_init(overlay_handle_t oh, void **outp)
{
vxlan_t *vxl;
- vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP);
+ vxl = kmem_zalloc(sizeof (vxlan_t), KM_SLEEP);
*outp = vxl;
mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL);
vxl->vxl_oh = oh;
vxl->vxl_lport = vxlan_defport;
vxl->vxl_hladdr = B_FALSE;
+ vxl->vxl_cstate = VXLAN_C_UNKNOWN;
+ vxl->vxl_cstate_err = 0;
return (0);
}
@@ -128,16 +142,24 @@ vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr,
}
static int
-vxlan_o_sockopt(ksocket_t ksock)
+vxlan_o_sockopt(ksocket_t ksock, boolean_t strictif)
{
- int val, err;
+ int err;
- if (vxlan_fanout == B_FALSE)
- return (0);
+ udp_tunnel_opt_t topt;
- val = UDP_HASH_VXLAN;
- err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val,
- sizeof (val), kcred);
- return (err);
+ bzero(&topt, sizeof (udp_tunnel_opt_t));
+ topt.uto_type = UDP_TUNNEL_VXLAN;
+ topt.uto_opts = UDP_TUNNEL_OPT_SRCPORT_HASH;
+ if (strictif) {
+ topt.uto_opts |= UDP_TUNNEL_OPT_HWCAP | UDP_TUNNEL_OPT_RELAX_CKSUM;
+ }
+ /* err must hold the setsockopt result itself, not the comparison. */
+ if ((err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_TUNNEL, &topt,
+ sizeof (topt), kcred)) != 0) {
+ return (err);
+ }
+
+ return (0);
}
/* ARGSUSED */
@@ -166,6 +188,13 @@ vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop,
vxh->vxlan_flags = ntohl(VXLAN_F_VDI);
vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT);
ob->b_wptr += VXLAN_HDR_LEN;
+
+ /*
+ * Make sure to set the fact that this is a VXLAN packet on this message
+ * block.
+ */
+ DB_TTYPEFLAGS(ob) |= (TTYPE_VXLAN << TTYPE_SHIFT);
+
*outp = ob;
return (0);
@@ -305,6 +334,78 @@ vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl)
return (EINVAL);
}
+static boolean_t
+vxlan_o_mac_capab(void *arg, mac_capab_t capab, void *cap_data, ksocket_t ksock)
+{
+ vxlan_t *vxl = arg;
+ boolean_t hcapab = B_FALSE;
+
+ if (capab != MAC_CAPAB_HCKSUM && capab != MAC_CAPAB_LSO)
+ return (B_FALSE);
+
+ mutex_enter(&vxl->vxl_lock);
+ if (vxl->vxl_cstate == VXLAN_C_FAILED) {
+ goto out;
+ } else if (vxl->vxl_cstate == VXLAN_C_UNKNOWN) {
+ int len = sizeof (udp_tunnel_opt_t);
+ bzero(&vxl->vxl_utunnel, sizeof (udp_tunnel_opt_t));
+ vxl->vxl_cstate_err = ksocket_getsockopt(ksock, IPPROTO_UDP,
+ UDP_TUNNEL, &vxl->vxl_utunnel, &len, kcred);
+ if (vxl->vxl_cstate_err != 0) {
+ vxl->vxl_cstate = VXLAN_C_FAILED;
+ goto out;
+ }
+
+ if (vxl->vxl_utunnel.uto_type != UDP_TUNNEL_VXLAN) {
+ vxl->vxl_cstate = VXLAN_C_FAILED;
+ vxl->vxl_cstate_err = -1;
+ goto out;
+ }
+ }
+ /* NOTE(review): nothing here sets VXLAN_C_VALID; confirm intent. */
+ switch (capab) {
+ case MAC_CAPAB_HCKSUM:
+ /*
+ * XXX Almost certainly some things are going to need the right
+ * pseudo-header on transmit.
+ */
+ if ((vxl->vxl_utunnel.uto_cksum_flags & (HCKSUM_VXLAN_FULL |
+ HCKSUM_VXLAN_PSEUDO | HCKSUM_VXLAN_PSEUDO_NO_OL4)) != 0) {
+ uint32_t *hck = cap_data;
+ *hck = HCKSUM_IPHDRCKSUM;
+ if ((vxl->vxl_utunnel.uto_cksum_flags &
+ HCKSUM_VXLAN_FULL) != 0) {
+ *hck |= HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6;
+ } else if ((vxl->vxl_utunnel.uto_cksum_flags &
+ (HCKSUM_VXLAN_PSEUDO |
+ HCKSUM_VXLAN_PSEUDO_NO_OL4)) != 0) {
+ *hck |= HCKSUM_INET_PARTIAL;
+ }
+ hcapab = B_TRUE;
+ }
+ break;
+#if 0
+ case MAC_CAPAB_LSO:
+ if ((vxl->vxl_utunnel.uto_lso_flags & DLD_LSO_VXLAN_TCP_IPV4) != 0) {
+ mac_capab_lso_t *lso = cap_data;
+ lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
+ /* XXX Check value */
+ lso->lso_basic_tcp_ipv4.lso_max =
+ vxl->vxl_utunnel.uto_lso_max - 100;
+ hcapab = B_TRUE;
+ }
+ break;
+#endif
+ default:
+ hcapab = B_FALSE;
+ break;
+ }
+
+out:
+ mutex_exit(&vxl->vxl_lock);
+ return (hcapab);
+}
+
static struct overlay_plugin_ops vxlan_o_ops = {
0,
vxlan_o_init,
@@ -315,7 +416,8 @@ static struct overlay_plugin_ops vxlan_o_ops = {
vxlan_o_sockopt,
vxlan_o_getprop,
vxlan_o_setprop,
- vxlan_o_propinfo
+ vxlan_o_propinfo,
+ vxlan_o_mac_capab
};
static struct modlmisc vxlan_modlmisc = {
diff --git a/usr/src/uts/common/mapfiles/mac.mapfile b/usr/src/uts/common/mapfiles/mac.mapfile
index d40c09b311..79a465c19b 100644
--- a/usr/src/uts/common/mapfiles/mac.mapfile
+++ b/usr/src/uts/common/mapfiles/mac.mapfile
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2017, Joyent, Inc.
+# Copyright (c) 2018, Joyent, Inc.
#
#
@@ -51,6 +51,7 @@ SYMBOL_SCOPE {
mac_rx_ring { FLAGS = EXTERN };
mac_transceiver_info_set_present { FLAGS = EXTERN };
mac_transceiver_info_set_usable { FLAGS = EXTERN };
+ mac_tunnel_type_get { FLAGS = EXTERN };
mac_tx_ring_update { FLAGS = EXTERN };
mac_tx_update { FLAGS = EXTERN };
mac_unregister { FLAGS = EXTERN };
diff --git a/usr/src/uts/common/netinet/udp.h b/usr/src/uts/common/netinet/udp.h
index 74cff75d43..a775a4fc00 100644
--- a/usr/src/uts/common/netinet/udp.h
+++ b/usr/src/uts/common/netinet/udp.h
@@ -34,15 +34,28 @@ struct udphdr {
#define UDP_EXCLBIND 0x0101 /* for internal use only */
#define UDP_RCVHDR 0x0102 /* for internal use only */
#define UDP_NAT_T_ENDPOINT 0x0103 /* for internal use only */
-#define UDP_SRCPORT_HASH 0x0104 /* for internal use only */
+#define UDP_TUNNEL 0x0104 /* for internal use only */
#define UDP_SND_TO_CONNECTED 0x0105 /* for internal use only */
+#ifdef _KERNEL
+
/*
- * Hash definitions for UDP_SRCPORT_HASH that effectively tell UDP how to go
- * handle UDP_SRCPORT_HASH.
+ * Internal structure definitions for UDP_TUNNEL.
*/
-#define UDP_HASH_DISABLE 0x0000 /* for internal use only */
-#define UDP_HASH_VXLAN 0x0001 /* for internal use only */
+#define UDP_TUNNEL_VXLAN 1
+#define UDP_TUNNEL_OPT_SRCPORT_HASH 0x01
+#define UDP_TUNNEL_OPT_HWCAP 0x02
+#define UDP_TUNNEL_OPT_RELAX_CKSUM 0x04
+
+typedef struct udp_tunnel_opt {
+ uint32_t uto_type;
+ uint32_t uto_opts;
+ uint32_t uto_cksum_flags;
+ uint32_t uto_lso_flags;
+ uint32_t uto_lso_max;
+} udp_tunnel_opt_t;
+
+#endif /* _KERNEL */
/*
* Following option in UDP_ namespace required to be exposed through
diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h
index 158a802c4a..3e74f0e03b 100644
--- a/usr/src/uts/common/sys/dld.h
+++ b/usr/src/uts/common/sys/dld.h
@@ -437,6 +437,8 @@ typedef struct dld_capab_poll_s {
*/
#define DLD_LSO_BASIC_TCP_IPV4 0x01 /* TCP LSO over IPv4 capability */
#define DLD_LSO_BASIC_TCP_IPV6 0x02 /* TCP LSO over IPv6 capability */
+#define DLD_LSO_VXLAN_TCP_IPV4 0x04 /* TCPv4 LSO encapsulated in VXLAN */
+#define DLD_LSO_VXLAN_TCP_IPV6 0x08 /* TCPv6 LSO encapsulated in VXLAN */
typedef struct dld_capab_lso_s {
uint_t lso_flags; /* capability flags */
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index b2fd131066..087b3bbc48 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -689,14 +689,20 @@ typedef struct {
#define HCKSUM_INET_FULL_ICMPV6 0x0040 /* Full 1's complement checksum */
/* ability for IPv6 ICMP packets. */
#define HCKSUM_VXLAN_FULL 0x0080 /* Inner L3/L4 & outer L3/L4 offload */
-#define HCKSUM_VXLAN_FULL_NO_OL4 0x0100 /* Same as HCKSUM_VXLAN_FULL but no */
- /* outer L4 offload */
+#define HCKSUM_VXLAN_PSEUDO 0x0100 /* Inner L3/L4 & outer L3/L4 offload */
+ /* L4 requires pseudo header */
+#define HCKSUM_VXLAN_PSEUDO_NO_OL4 0x0200 /* Same as HCKSUM_VXLAN_PSEUDO */
+ /* but no outer L4 offload */
#define HCKSUM_ALL_BUT_ENBL (HCKSUM_INET_PARTIAL | \
HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | \
HCKSUM_IPHDRCKSUM | \
HCKSUM_INET_FULL_ICMPV4 | HCKSUM_INET_FULL_ICMPV6 | \
- HCKSUM_VXLAN_FULL | HCKSUM_VXLAN_FULL_NO_OL4)
+ HCKSUM_VXLAN_FULL | HCKSUM_VXLAN_PSEUDO | \
+ HCKSUM_VXLAN_PSEUDO_NO_OL4)
+
+#define HCKSUM_TUNNEL_VXLAN_OIP (HCKSUM_VXLAN_FULL | HCKSUM_VXLAN_PSEUDO | \
+ HCKSUM_VXLAN_PSEUDO_NO_OL4)
#define HCKSUM_ALL (HCKSUM_ENABLE | HCKSUM_ALL_BUT_ENBL)
diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h
index f5c91e7933..e27ba63f2d 100644
--- a/usr/src/uts/common/sys/mac_provider.h
+++ b/usr/src/uts/common/sys/mac_provider.h
@@ -120,10 +120,16 @@ typedef struct lso_basic_tcp_ipv4_s {
t_uscalar_t lso_max; /* maximum payload */
} lso_basic_tcp_ipv4_t;
+typedef struct lso_vxlan_tcp {
+ t_uscalar_t lso_tcpv4_max; /* maximum payload */
+ t_uscalar_t lso_tcpv6_max; /* maximum payload */
+} lso_vxlan_tcp_t;
+
/*
* Currently supported flags for LSO.
*/
-#define LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */
+#define LSO_TX_BASIC_TCP_IPV4 0x01 /* TCPv4 LSO capability */
+#define LSO_TX_VXLAN_TCP 0x02 /* VXLAN LSO capability */
/*
* Future LSO capabilities can be added at the end of the mac_capab_lso_t.
@@ -136,6 +142,7 @@ typedef struct lso_basic_tcp_ipv4_s {
typedef struct mac_capab_lso_s {
t_uscalar_t lso_flags;
lso_basic_tcp_ipv4_t lso_basic_tcp_ipv4;
+ lso_vxlan_tcp_t lso_vxlan_tcp;
/* Add future lso capabilities here */
} mac_capab_lso_t;
@@ -603,6 +610,8 @@ extern void mac_transceiver_info_set_usable(
mac_transceiver_info_t *,
boolean_t);
+extern void mac_tunnel_type_get(const mblk_t *, uint32_t *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h
index d638096006..b137c24ca5 100644
--- a/usr/src/uts/common/sys/overlay_common.h
+++ b/usr/src/uts/common/sys/overlay_common.h
@@ -42,7 +42,8 @@ typedef enum overlay_prop_type {
OVERLAY_PROP_T_INT = 0x1, /* signed int */
OVERLAY_PROP_T_UINT, /* unsigned int */
OVERLAY_PROP_T_IP, /* sinaddr6 */
- OVERLAY_PROP_T_STRING /* OVERLAY_PROPS_SIZEMAX */
+ OVERLAY_PROP_T_STRING, /* OVERLAY_PROPS_SIZEMAX */
+ OVERLAY_PROP_T_BOOLEAN /* unsigned int */
} overlay_prop_type_t;
typedef enum overlay_prop_prot {
diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h
index 7fb8b8da1d..0240ac3090 100644
--- a/usr/src/uts/common/sys/overlay_impl.h
+++ b/usr/src/uts/common/sys/overlay_impl.h
@@ -61,6 +61,7 @@ typedef struct overlay_mux {
int omux_protocol; /* RO: socket protocol */
struct sockaddr *omux_addr; /* RO: socket address */
socklen_t omux_alen; /* RO: sockaddr len */
+ boolean_t omux_strictif; /* RO: strict IF bind */
kmutex_t omux_lock; /* Protects everything below */
uint_t omux_count; /* Active instances */
avl_tree_t omux_devices; /* Tree of devices */
@@ -115,6 +116,7 @@ typedef struct overlay_dev {
uint_t odd_txcount; /* protected by odd_lock */
overlay_mux_t *odd_mux; /* protected by odd_lock */
uint64_t odd_vid; /* RO if active else odd_lock */
+ boolean_t odd_strictif; /* RO if active else odd_lock */
avl_node_t odd_muxnode; /* managed by mux */
overlay_target_t *odd_target; /* See big theory statement */
char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */
@@ -167,7 +169,7 @@ extern void overlay_mux_init(void);
extern void overlay_mux_fini(void);
extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int,
- struct sockaddr *, socklen_t, int *);
+ struct sockaddr *, socklen_t, boolean_t, int *);
extern void overlay_mux_close(overlay_mux_t *);
extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *);
extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *);
diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h
index 07efaa05df..13447808ee 100644
--- a/usr/src/uts/common/sys/overlay_plugin.h
+++ b/usr/src/uts/common/sys/overlay_plugin.h
@@ -267,12 +267,14 @@ typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **);
typedef void (*overlay_plugin_fini_t)(void *);
typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *,
struct sockaddr *, socklen_t *);
-typedef int (*overlay_plugin_sockopt_t)(ksocket_t);
+typedef int (*overlay_plugin_sockopt_t)(ksocket_t, boolean_t);
typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *,
uint32_t *);
typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *,
uint32_t);
typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t);
+typedef boolean_t (*overlay_plugin_mac_capab_t)(void *, mac_capab_t, void *,
+ ksocket_t);
typedef struct overlay_plugin_ops {
uint_t ovpo_callbacks;
@@ -285,6 +287,7 @@ typedef struct overlay_plugin_ops {
overlay_plugin_getprop_t ovpo_getprop;
overlay_plugin_setprop_t ovpo_setprop;
overlay_plugin_propinfo_t ovpo_propinfo;
+ overlay_plugin_mac_capab_t ovpo_mac_capab;
} overlay_plugin_ops_t;
typedef struct overlay_plugin_register {
@@ -311,7 +314,8 @@ extern int overlay_plugin_unregister(const char *);
extern void overlay_prop_set_name(overlay_prop_handle_t, const char *);
extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t);
extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t);
-extern int overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t);
+extern int overlay_prop_set_default(overlay_prop_handle_t, const void *,
+ ssize_t);
extern void overlay_prop_set_nodefault(overlay_prop_handle_t);
extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t,
uint32_t);
diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h
index a9d999a11b..6545c5d619 100644
--- a/usr/src/uts/common/sys/pattr.h
+++ b/usr/src/uts/common/sys/pattr.h
@@ -105,26 +105,39 @@ typedef struct pattr_hcksum_s {
/* On Receive: equivalent to */
/* HCK_IPV4_HDRCKSUM_OK for the inner */
/* header */
+#define HCK_INNER_IPV4_HDRCKSUM_NEEDED 0x20 /* On Transmit: equivalent to */
+ /* HCK_IPV4_HDRCKSUM; HW calculates */
+ /* inner checksum */
#define HCK_INNER_FULLCKSUM_OK 0x40 /* On Transmit: N/A */
/* On Receive: equivalent to */
/* HCK_FULLCKSUM_OK for the inner */
/* header */
-#define HCK_INNER_IPV4_HDRCKSUM_NEEDED 0x80 /* On Transmit: equivalent to */
- /* HCK_IPV4_HDRCKSUM; HW calculates */
- /* inner checksum */
-
-#define HCK_INNER_FULLCKSUM_NEEDED 0x100 /* On Transmit: equivalent to */
+#define HCK_INNER_FULLCKSUM_NEEDED 0x40 /* On Transmit: equivalent to */
/* HCK_FULLCKSUM; HW calculates inner */
/* L4 header checksum. */
-#define HCK_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \
- HCK_FULLCKSUM | HCK_FULLCKSUM_OK | \
- HCK_INNER_IPV4_HDRCKSUM_OK | \
+#define HCK_INNER_PSEUDO_NEEDED 0x80 /* On Transmit: offload */
+ /* of the inner TCP/UDP header, but */
+ /* requires that the pseudo-header */
+ /* is filled in the checksum. Like */
+ /* HCK_PARTIALCKSUM, but no fields */
+ /* saved */
+
+#define HCK_INNER_FLAGS_NEEDED (HCK_INNER_IPV4_HDRCKSUM_NEEDED | \
+ HCK_INNER_FULLCKSUM_NEEDED | \
+ HCK_INNER_PSEUDO_NEEDED)
+
+#define HCK_INNER_FLAGS (HCK_INNER_IPV4_HDRCKSUM_OK | \
HCK_INNER_FULLCKSUM_OK | \
- HCK_INNER_IPV4_HDRCKSUM_NEEDED | \
- HCK_INNER_FULLCKSUM_NEEDED)
+ HCK_INNER_FLAGS_NEEDED)
+
+#define HCK_OUTER_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \
+ HCK_FULLCKSUM | HCK_FULLCKSUM_OK)
+
+#define HCK_FLAGS (HCK_INNER_FLAGS | HCK_OUTER_FLAGS)
+
/*
* Extended hardware offloading flags that also use hcksum_flags
*/
@@ -134,6 +147,15 @@ typedef struct pattr_hcksum_s {
#define HW_LSO_FLAGS HW_LSO /* All LSO flags, currently only one */
/*
+ * The upper three bits are used to indicate if the packet has any known
+ * tunneling information.
+ */
+#define TTYPE_MASK 0xe000
+#define TTYPE_SHIFT 13
+#define TTYPE_NONE 0x00
+#define TTYPE_VXLAN 0x01
+
+/*
* Structure used for zerocopy attribute.
*/
typedef struct pattr_zcopy_s {
diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h
index 0f29dd3675..54d39389f0 100644
--- a/usr/src/uts/common/sys/strsubr.h
+++ b/usr/src/uts/common/sys/strsubr.h
@@ -1345,6 +1345,7 @@ extern int SAMESTR(queue_t *);
#define DB_CKSUM32(mp) ((mp)->b_datap->db_cksum32)
#define DB_LSOFLAGS(mp) ((mp)->b_datap->db_struioun.cksum.flags)
#define DB_LSOMSS(mp) ((mp)->b_datap->db_struioun.cksum.pad)
+#define DB_TTYPEFLAGS(mp) ((mp)->b_datap->db_struioun.cksum.flags)
#endif /* _KERNEL */
#ifdef __cplusplus