Diffstat (limited to 'usr/src/uts/common')
-rw-r--r--	usr/src/uts/common/Makefile.files              |    1
-rw-r--r--	usr/src/uts/common/Makefile.rules              |   12
-rw-r--r--	usr/src/uts/common/io/overlay/overlay.c        |  452
-rw-r--r--	usr/src/uts/common/io/overlay/overlay_mux.c    |   15
-rw-r--r--	usr/src/uts/common/io/overlay/overlay_target.c | 1290
-rw-r--r--	usr/src/uts/common/qqcache/qqcache.c           |  444
-rw-r--r--	usr/src/uts/common/sys/Makefile                |    4
-rw-r--r--	usr/src/uts/common/sys/ethernet.h              |   14
-rw-r--r--	usr/src/uts/common/sys/overlay.h               |    4
-rw-r--r--	usr/src/uts/common/sys/overlay_common.h        |    3
-rw-r--r--	usr/src/uts/common/sys/overlay_impl.h          |   57
-rw-r--r--	usr/src/uts/common/sys/overlay_target.h        |   88
-rw-r--r--	usr/src/uts/common/sys/qqcache.h               |  176
-rw-r--r--	usr/src/uts/common/sys/qqcache_impl.h          |   72
14 files changed, 2419 insertions, 213 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 752fe56100..e973cf58ad 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -299,6 +299,7 @@ GENUNIX_OBJS += \
resolvepath.o \
retire_store.o \
process.o \
+ qqcache.o \
rlimit.o \
rmap.o \
rw.o \
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index a32c094f3b..ba8945b6fb 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -26,6 +26,7 @@
# Copyright 2020 Joyent, Inc.
# Copyright 2018 Nexenta Systems, Inc.
# Copyright (c) 2017 by Delphix. All rights reserved.
+# Copyright 2018 Joyent, Inc.
# Copyright 2020 Oxide Computer Company
#
@@ -1627,6 +1628,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/pcmcia/pcs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/qqcache/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/refhash/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -2778,6 +2787,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/nvpair/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/os/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/qqcache/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/rpc/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c
index 2ad3f4f591..c54a6e0d9c 100644
--- a/usr/src/uts/common/io/overlay/overlay.c
+++ b/usr/src/uts/common/io/overlay/overlay.c
@@ -134,6 +134,40 @@
* be sent to. In addition, they handle questions related to how to handle
* things like broadcast and multicast traffic, etc.
*
+ * ROUTING
+ *
+ * Supporting routing of packets between VLANs that exist on an overlay
+ * network requires two major changes. First, packets destined for off-VLAN
+ * destinations need to be identified. Second, we must obtain the additional
+ * information necessary to deliver the packet to its off-VLAN
+ * destination.
+ *
+ * To solve the first issue, we utilize the existing IP routing functionality.
+ * Off-VLAN destinations are given routes with next hops in the originating
+ * netstack's routing table--just like in physical networks. The system will
+ * then attempt to generate an ARP query, which will be sent out to varpd in
+ * the exact same manner as is described above for other on-VLAN destinations.
+ * The response will include a MAC address that is used for the ARP reply
+ * and is also added to our VL2 MAC->UL3 hash table, albeit
+ * with the OVERLAY_ENTRY_F_ROUTER flag set. Once this is done, the originating
+ * netstack will send off-VLAN packets to this router MAC, allowing the
+ * overlay device to identify these packets as requiring routing.
+ *
+ * Once packets with an off-VLAN destination are identified, we must determine
+ * what the destination vid, VL2 VLAN, and VL2 MAC values are for the given
+ * packet. For reasons similar to the VL2 MAC->UL3 lookup described above,
+ * we utilize the flexibility of user land to perform these lookups (also
+ * using varpd). In this instance we are attempting to find a destination VL3
+ * IP to a UL3 IP mapping (a few extra bits of information are necessary to
+ * allow for disambiguation of the destination VL3 IP for situations such as
+ * mirroring a production environment including VL3 IPs in an isolated set of
+ * VLANs). We then store these results in a VL3->UL3 hash table for future
+ * lookups.
+ *
+ * To prevent the size of both the VL2->UL3 and VL3->UL3 hash tables from
+ * growing without bound, we cap the number of entries in each hash table and
+ * utilize the 2Q algorithm to manage their contents.
+ *
* ----------
* Properties
* ----------
@@ -205,6 +239,10 @@
* UTF-8. Note that the size of the string includes the null
* terminator.
*
+ * OVERLAY_PROP_T_ETHER
+ *
+ * An ether_addr_t, which has a fixed size.
+ *
* The next thing that we apply to a property is its permission. The permissions
* are put together by the bitwise or of the following flags and values.
*
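The qqcache implementation backing both caches lands in
usr/src/uts/common/qqcache/qqcache.c, which is not reproduced in this
excerpt. As a rough illustration of the tunable that recurs below, the 2Q
parameter 'a' (the percentage of a fixed-size cache reserved for MFU
entries, i.e. entries looked up more than once), here is a minimal
userland sketch; the names and arithmetic are illustrative only, not the
qqcache API:

	#include <stdio.h>

	/*
	 * Illustrative only: how the 2Q parameter 'a' splits a cache of
	 * 'size' entries between MFU entries (seen more than once) and
	 * MRU entries (seen exactly once).
	 */
	int
	main(void)
	{
		unsigned int size = 512;		/* total entries */
		unsigned int a = 75;			/* MFU share, percent */
		unsigned int mfu = (size * a) / 100;	/* 384 MFU entries */
		unsigned int mru = size - mfu;		/* 128 MRU entries */

		(void) printf("mfu=%u mru=%u\n", mfu, mru);
		return (0);
	}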
@@ -461,7 +499,7 @@
* On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
* are much more interesting and as a result, more complicated. We primarily
* store lists of overlay_target_entry_t's which are stored in both an avl tree
- * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
+ * and a qqcache_t. The primary look up path uses the qqcache_t and the avl tree
* is only used for a few of the target ioctls used to dump data such that we
* can get a consistent iteration order for things like dladm show-overlay -t.
* The key that we use for the reference hashtable is based on the mac address
@@ -486,6 +524,28 @@
* any outstanding data to that place. For the full story on how we look that up
* will be discussed in the section on the Target Cache Lifecycle.
*
+ * For routing, everything works largely the same as it does in the non-routing
+ * situations. The major differences are that the target cache is always
+ * an OVERLAY_TARGET_DYNAMIC cache, and that an additional hash table lookup
+ * occurs. When a routed packet is sent down stack, the
+ * overlay_target_entry_t in the VL2 cache will have its
+ * OVERLAY_ENTRY_F_ROUTER flag set, which will prompt a lookup in the VL3->UL3
+ * cache (using the source VL3, source VL2 VLAN, and destination VL3 values
+ * from the packet as the lookup key). The entry returned from the cache is
+ * used to modify the source and destination VL2 MAC addresses as well as
+ * the VL2 VLAN ID, and then is encapsulated and sent to its UL3 destination.
+ * On reception, decapsulation happens exactly the same as in the non-routed
+ * case, and the packet appears just as if it was sent out the VL2 network
+ * from a router connected to it. This is done both to maintain the illusion
+ * of a physical network when sniffing packets at the instance level, and
+ * so that the mac layer sitting above the destination's overlay device
+ * (for the vnic created over the overlay) does not discard the packet because
+ * its VLAN tag does not match the VLAN tag of the destination VNIC. While
+ * some of these modifications could be split between the source and
+ * destination hosts, doing the work on the source maximizes any
+ * potential parallelism that might be present from multiple flows to a given
+ * destination.
+ *
* ------------------------
* FMA and Degraded Devices
* ------------------------
@@ -830,21 +890,62 @@ typedef enum overlay_dev_prop {
OVERLAY_DEV_P_MTU = 0,
OVERLAY_DEV_P_VNETID,
OVERLAY_DEV_P_ENCAP,
- OVERLAY_DEV_P_VARPDID
+ OVERLAY_DEV_P_VARPDID,
+ OVERLAY_DEV_P_DCID,
+ OVERLAY_DEV_P_VL2_CACHE_SIZE,
+ OVERLAY_DEV_P_VL2_CACHE_A,
+ OVERLAY_DEV_P_ROUTE_CACHE_SIZE,
+ OVERLAY_DEV_P_ROUTE_CACHE_A
} overlay_dev_prop_t;
-#define OVERLAY_DEV_NPROPS 4
+#define OVERLAY_DEV_NPROPS 9
static const char *overlay_dev_props[] = {
"mtu",
"vnetid",
"encap",
- "varpd/id"
+ "varpd/id",
+ "dcid",
+ "vl2_cache_size",
+ "_vl2_cache_a",
+ "route_cache_size",
+ "_route_cache_a"
};
+/* properties that can be changed live */
+static boolean_t overlay_dev_liveprop[] = {
+ B_FALSE, /* mtu */
+ B_FALSE, /* vnetid */
+ B_FALSE, /* encap */
+ B_FALSE, /* varpd/id */
+ B_FALSE, /* dcid */
+ B_TRUE, /* vl2_cache_size */
+ B_TRUE, /* _vl2_cache_a */
+ B_TRUE, /* route_cache_size */
+ B_TRUE /* _route_cache_a */
+};
+
+CTASSERT(ARRAY_SIZE(overlay_dev_props) == OVERLAY_DEV_NPROPS);
+CTASSERT(ARRAY_SIZE(overlay_dev_liveprop) == OVERLAY_DEV_NPROPS);
+
#define OVERLAY_MTU_MIN 576
#define OVERLAY_MTU_DEF 1400
#define OVERLAY_MTU_MAX 8900
+/* The 2Q parameter 'a' is a percentage */
+#define OVERLAY_CACHE_MAX_A 100
+/* A somewhat arbitrary default, biasing towards storing more MFU entries */
+#define OVERLAY_CACHE_A_DEF 75
+
+/* Somewhat arbitrary min and max values */
+#define OVERLAY_VL2_CACHE_MIN 256
+#define OVERLAY_VL2_CACHE_MAX 10240
+#define OVERLAY_VL2_CACHE_DEF OVERLAY_VL2_CACHE_MIN
+
+/* Somewhat arbitrary min and max values */
+#define OVERLAY_ROUTE_CACHE_MIN 256
+#define OVERLAY_ROUTE_CACHE_MAX 10240
+#define OVERLAY_ROUTE_CACHE_DEF OVERLAY_ROUTE_CACHE_MIN
+
overlay_dev_t *
overlay_hold_by_dlid(datalink_id_t id)
{
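A sketch of how the overlay_dev_liveprop table above is consulted; the
actual gate appears in overlay_i_setprop() later in this diff, and
overlay_prop_write_allowed() is a hypothetical name used here only for
illustration:

	/*
	 * Once a device is activated, only properties flagged B_TRUE in
	 * overlay_dev_liveprop may still be set; plugin properties
	 * (propid >= OVERLAY_DEV_NPROPS) may not.
	 */
	static int
	overlay_prop_write_allowed(uint_t propid, boolean_t activated)
	{
		if (!activated)
			return (0);
		if (propid < OVERLAY_DEV_NPROPS &&
		    overlay_dev_liveprop[propid])
			return (0);
		return (ENOTSUP);
	}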
@@ -1066,7 +1167,6 @@ overlay_m_tx(void *arg, mblk_t *mp_chain)
bzero(&hdr, sizeof (struct msghdr));
bzero(&einfo, sizeof (ovep_encap_info_t));
- einfo.ovdi_id = odd->odd_vid;
mp = mp_chain;
while (mp != NULL) {
socklen_t slen;
@@ -1077,7 +1177,7 @@ overlay_m_tx(void *arg, mblk_t *mp_chain)
ep = NULL;
ret = overlay_target_lookup(odd, mp,
- (struct sockaddr *)&storage, &slen);
+ (struct sockaddr *)&storage, &slen, &einfo.ovdi_id);
if (ret != OVERLAY_TARGET_OK) {
if (ret == OVERLAY_TARGET_DROP)
freemsg(mp);
@@ -1260,6 +1360,19 @@ overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
}
odd->odd_vid = oicp->oic_vnetid;
+ if (oicp->oic_dcid > UINT32_MAX) {
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EINVAL);
+ }
+ odd->odd_dcid = oicp->oic_dcid;
+
+ odd->odd_vl2sz = OVERLAY_VL2_CACHE_DEF;
+ odd->odd_vl2a = OVERLAY_CACHE_A_DEF;
+ odd->odd_routesz = OVERLAY_ROUTE_CACHE_DEF;
+ odd->odd_routea = OVERLAY_CACHE_A_DEF;
+
mac = mac_alloc(MAC_VERSION);
if (mac == NULL) {
mutex_exit(&overlay_dev_lock);
@@ -1613,6 +1726,7 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
int ret;
mac_perim_handle_t mph;
uint_t propid = UINT_MAX;
+ uint32_t def;
overlay_ioc_propinfo_t *oip = karg;
overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
@@ -1695,6 +1809,42 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
overlay_prop_set_nodefault(phdl);
break;
+ case OVERLAY_DEV_P_DCID:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_nodefault(phdl);
+ overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ def = OVERLAY_VL2_CACHE_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, OVERLAY_VL2_CACHE_MIN,
+ OVERLAY_VL2_CACHE_MAX);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ def = OVERLAY_CACHE_A_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+ def = OVERLAY_ROUTE_CACHE_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, OVERLAY_ROUTE_CACHE_MIN,
+ OVERLAY_ROUTE_CACHE_MAX);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ def = OVERLAY_CACHE_A_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A);
+ break;
default:
overlay_hold_rele(odd);
mac_perim_exit(mph);
@@ -1804,6 +1954,41 @@ overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
}
mutex_exit(&odd->odd_lock);
break;
+ case OVERLAY_DEV_P_DCID:
+ /*
+ * While it's read-only while inside of a mux, we're not in a
+ * context that can guarantee that. Therefore we always grab the
+ * overlay_dev_t's odd_lock.
+ */
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_vl2sz, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_vl2a, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_routesz, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_routea, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
default:
ret = ENOENT;
}
@@ -1845,6 +2030,146 @@ overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
mutex_exit(&odd->odd_lock);
}
+static void
+overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid)
+{
+ mutex_enter(&odd->odd_lock);
+
+ /* Simple case, not active */
+ if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ odd->odd_dcid = dcid;
+ mutex_exit(&odd->odd_lock);
+ return;
+ }
+
+ /*
+ * In the hard case, we need to set the drop flag, quiesce I/O and then
+ * we can go ahead and do everything.
+ */
+ odd->odd_flags |= OVERLAY_F_MDDROP;
+ overlay_io_wait(odd, OVERLAY_F_IOMASK);
+ mutex_exit(&odd->odd_lock);
+
+ overlay_mux_remove_dev(odd->odd_mux, odd);
+ mutex_enter(&odd->odd_lock);
+ odd->odd_dcid = dcid;
+ mutex_exit(&odd->odd_lock);
+ overlay_mux_add_dev(odd->odd_mux, odd);
+
+ mutex_enter(&odd->odd_lock);
+ ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+ odd->odd_flags &= ~OVERLAY_F_MDDROP;
+ mutex_exit(&odd->odd_lock);
+}
+
+static int
+overlay_setprop_vl2_cachesz(overlay_dev_t *odd, uint32_t sz)
+{
+ overlay_target_t *ott = NULL;
+ int ret = 0;
+
+ if (sz == 0)
+ sz = OVERLAY_VL2_CACHE_DEF;
+
+ /* Caller should have validated this */
+ ASSERT3U(sz, >=, OVERLAY_VL2_CACHE_MIN);
+ ASSERT3U(sz, <=, OVERLAY_VL2_CACHE_MAX);
+
+ mutex_enter(&odd->odd_lock);
+ ott = odd->odd_target;
+
+ /* ott_mode is RO if the target exists */
+ if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ mutex_enter(&ott->ott_lock);
+ ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_dhash, sz);
+ mutex_exit(&ott->ott_lock);
+ }
+
+ if (ret == 0)
+ odd->odd_vl2sz = sz;
+ mutex_exit(&odd->odd_lock);
+
+ return (ret);
+}
+
+static int
+overlay_setprop_vl2_cachea(overlay_dev_t *odd, uint32_t a)
+{
+ overlay_target_t *ott = NULL;
+ int ret = 0;
+
+ /* Caller should have validated this */
+ ASSERT3U(a, <=, 100);
+
+ mutex_enter(&odd->odd_lock);
+ ott = odd->odd_target;
+
+ /* ott_mode is RO if the target exists */
+ if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ mutex_enter(&ott->ott_lock);
+ ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_dhash, a);
+ mutex_exit(&ott->ott_lock);
+ }
+ if (ret == 0)
+ odd->odd_vl2a = a;
+ mutex_exit(&odd->odd_lock);
+
+ return (ret);
+}
+
+static int
+overlay_setprop_route_cachesz(overlay_dev_t *odd, uint32_t sz)
+{
+ overlay_target_t *ott = NULL;
+ int ret = 0;
+
+ if (sz == 0)
+ sz = OVERLAY_ROUTE_CACHE_DEF;
+
+ ASSERT3U(sz, >=, OVERLAY_ROUTE_CACHE_MIN);
+ ASSERT3U(sz, <=, OVERLAY_ROUTE_CACHE_MAX);
+
+ mutex_enter(&odd->odd_lock);
+ ott = odd->odd_target;
+
+ /* ott_mode is RO if the target exists */
+ if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ mutex_enter(&ott->ott_lock);
+ ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_l3dhash, sz);
+ mutex_exit(&ott->ott_lock);
+ }
+ if (ret == 0)
+ odd->odd_routesz = sz;
+ mutex_exit(&odd->odd_lock);
+
+ return (ret);
+}
+
+static int
+overlay_setprop_route_cachea(overlay_dev_t *odd, uint32_t a)
+{
+ overlay_target_t *ott = NULL;
+ int ret = 0;
+
+ /* Caller should have validated this */
+ ASSERT3U(a, <=, 100);
+
+ mutex_enter(&odd->odd_lock);
+ ott = odd->odd_target;
+
+ /* ott_mode is RO if the target exists */
+ if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ mutex_enter(&ott->ott_lock);
+ ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_l3dhash, a);
+ mutex_exit(&ott->ott_lock);
+ }
+ if (ret == 0)
+ odd->odd_routea = a;
+ mutex_exit(&odd->odd_lock);
+
+ return (ret);
+}
+
/* ARGSUSED */
static int
overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
@@ -1855,7 +2180,7 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
overlay_ioc_prop_t *oip = karg;
uint_t propid = UINT_MAX;
mac_perim_handle_t mph;
- uint64_t maxid, *vidp;
+ uint64_t maxid, *vidp, *dcidp, *vl2szp, *vl2ap, *routeszp, *routeap;
if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
return (EINVAL);
@@ -1865,31 +2190,48 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
return (ENOENT);
oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
- mac_perim_enter_by_mh(odd->odd_mh, &mph);
- mutex_enter(&odd->odd_lock);
- if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
- mac_perim_exit(mph);
- mutex_exit(&odd->odd_lock);
- return (ENOTSUP);
- }
- mutex_exit(&odd->odd_lock);
+
+ /*
+ * Currently, only certain overlay properties (and no encapsulation
+ * properties) can be changed while the overlay device is active.
+ */
if (oip->oip_id == -1) {
int i;
for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
break;
- if (i == OVERLAY_DEV_NPROPS) {
- ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
- odd->odd_pvoid, oip->oip_name,
- oip->oip_value, oip->oip_size);
- overlay_hold_rele(odd);
- mac_perim_exit(mph);
- return (ret);
- }
}
- propid = i;
+ if (i < OVERLAY_DEV_NPROPS)
+ propid = i;
+ } else if (oip->oip_id < OVERLAY_DEV_NPROPS) {
+ propid = oip->oip_id;
+ }
+
+ /*
+ * A bit tricky, but propid is initialized to UINT_MAX, so we know we
+ * have an overlay property whenever propid < OVERLAY_DEV_NPROPS,
+ * otherwise we have a plugin property.
+ */
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_ACTIVATED) &&
+ ((propid >= OVERLAY_DEV_NPROPS || !overlay_dev_liveprop[propid]))) {
+ mutex_exit(&odd->odd_lock);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_exit(&odd->odd_lock);
+
+ if (oip->oip_id == -1 && propid >= OVERLAY_DEV_NPROPS) {
+ ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+ odd->odd_pvoid, oip->oip_name,
+ oip->oip_value, oip->oip_size);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ret);
} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
@@ -1941,6 +2283,68 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
case OVERLAY_DEV_P_VARPDID:
ret = EPERM;
break;
+ case OVERLAY_DEV_P_DCID:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ dcidp = (uint64_t *)oip->oip_value;
+ if (*dcidp > UINT32_MAX) {
+ ret = EINVAL;
+ break;
+ }
+ overlay_setprop_dcid(odd, *dcidp);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ vl2szp = (uint64_t *)oip->oip_value;
+ if (*vl2szp != 0 && (*vl2szp < OVERLAY_VL2_CACHE_MIN ||
+ *vl2szp > OVERLAY_VL2_CACHE_MAX)) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_vl2_cachesz(odd, *vl2szp);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ vl2ap = (uint64_t *)oip->oip_value;
+ if (*vl2ap > OVERLAY_CACHE_MAX_A) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_vl2_cachea(odd, *vl2ap);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ routeszp = (uint64_t *)oip->oip_value;
+ if (*routeszp != 0 && (*routeszp < OVERLAY_ROUTE_CACHE_MIN ||
+ *routeszp > OVERLAY_ROUTE_CACHE_MAX)) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_route_cachesz(odd, *routeszp);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ routeap = (uint64_t *)oip->oip_value;
+ if (*routeap > OVERLAY_CACHE_MAX_A) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_route_cachea(odd, *routeap);
+ break;
default:
ret = ENOENT;
}
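Each of the four new cache properties follows the same validation shape:
the value arrives as a uint64_t, zero means "reset to the default" for
the size properties, and anything else must fall within the advertised
range. Condensed into a hypothetical predicate:

	/*
	 * E.g. overlay_cache_size_ok(*routeszp, OVERLAY_ROUTE_CACHE_MIN,
	 * OVERLAY_ROUTE_CACHE_MAX); zero resets to the default.
	 */
	static boolean_t
	overlay_cache_size_ok(uint64_t val, uint64_t min, uint64_t max)
	{
		return (val == 0 || (val >= min && val <= max));
	}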
diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c
index 0c21bb8689..2688d2791c 100644
--- a/usr/src/uts/common/io/overlay/overlay_mux.c
+++ b/usr/src/uts/common/io/overlay/overlay_mux.c
@@ -127,6 +127,18 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
freeb(fmp);
/*
+ * Looped-back VXLAN traffic tends to have a
+ * prepended IP+UDP-only mblk, followed by the data. Parsing
+ * would've made that mblk a zero-length one (rptr == wptr).
+ */
+ if (mp->b_rptr == mp->b_wptr && mp->b_cont != NULL) {
+ /* Ended up with zero-length mblk, lose it! */
+ fmp = mp;
+ mp = fmp->b_cont;
+ freeb(fmp);
+ }
+
+ /*
* Until we have VXLAN-or-other-decap HW acceleration support
* (e.g. we support NICs that reach into VXLAN-encapsulated
* packets and check the inside-VXLAN IP packets' checksums,
@@ -161,10 +173,9 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
if (rem == blkl) {
fmp = mp;
mp = fmp->b_cont;
- fmp->b_cont = NULL;
OVERLAY_FREEMSG(mp,
"freed a fmp block");
- freemsg(fmp);
+ freeb(fmp);
}
}
if (mp == NULL) {
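The freemsg() to freeb() change above matters: freeb() frees exactly one
mblk and never follows b_cont, so the explicit unlink is unnecessary. The
zero-length head trim added earlier in this file reduces to the following
sketch (the helper name is hypothetical):

	/*
	 * If parsing left a zero-length leading mblk (rptr == wptr),
	 * free just that block and continue with the rest of the chain.
	 */
	static mblk_t *
	overlay_trim_zero_head(mblk_t *mp)
	{
		if (mp != NULL && MBLKL(mp) == 0 && mp->b_cont != NULL) {
			mblk_t *fmp = mp;

			mp = fmp->b_cont;
			freeb(fmp);
		}
		return (mp);
	}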
diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c
index f4147b56d1..171acd034f 100644
--- a/usr/src/uts/common/io/overlay/overlay_target.c
+++ b/usr/src/uts/common/io/overlay/overlay_target.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -20,6 +20,8 @@
* uts/common/io/overlay/overlay.c
*/
+#include <inet/ip.h>
+#include <inet/ip6.h>
#include <sys/types.h>
#include <sys/ethernet.h>
#include <sys/kmem.h>
@@ -42,6 +44,9 @@
#include <sys/overlay_impl.h>
#include <sys/sdt.h>
+#define OVERLAY_DROP(mp, reason) \
+ DTRACE_PROBE2(overlay__drop, mblk_t *, mp, char *, reason)
+
/*
* This is total straw man, but at least it's a prime number. Here we're
* going to have to go through and do a lot of evaluation and understanding as
@@ -52,6 +57,19 @@
#define OVERLAY_HSIZE 823
/*
+ * The default size of each target cache. This is also a complete strawman
+ * whose value could change as we gain better operational experience with
+ * overlay routing.
+ */
+#define OVERLAY_CACHE_SIZE 512
+
+/*
+ * A somewhat arbitrary value. The percentage of the target cache dedicated
+ * to MFU entries (i.e. entries that have been looked up more than once).
+ */
+#define OVERLAY_CACHE_A 60
+
+/*
* We use this data structure to keep track of what requests have been actively
* allocated to a given instance so we know what to put back on the pending
* list.
@@ -69,7 +87,7 @@ typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
-typedef struct overaly_target_ioctl {
+typedef struct overlay_target_ioctl {
int oti_cmd; /* ioctl id */
boolean_t oti_write; /* ioctl requires FWRITE */
boolean_t oti_ncopyout; /* copyout data? */
@@ -144,25 +162,60 @@ overlay_entry_cache_destructor(void *buf, void *arg)
static uint64_t
overlay_mac_hash(const void *v)
{
+ const overlay_target_mac_t *m = v;
+
uint32_t crc;
- CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
+ CRC32(crc, m->otm_mac, ETHERADDRL, -1U, crc32_table);
+ CRC32(crc, &m->otm_dcid, sizeof (uint32_t), crc, crc32_table);
return (crc);
}
static int
overlay_mac_cmp(const void *a, const void *b)
{
- return (bcmp(a, b, ETHERADDRL));
+ const overlay_target_mac_t *l = a;
+ const overlay_target_mac_t *r = b;
+
+ if (l->otm_dcid != r->otm_dcid)
+ return (1);
+ return (bcmp(l->otm_mac, r->otm_mac, ETHERADDRL) != 0);
+}
+
+static uint64_t
+overlay_ip_hash(const void *v)
+{
+ const overlay_target_vl3_t *vl3 = v;
+
+ uint32_t crc;
+ CRC32(crc, &vl3->otvl3_src, sizeof (vl3->otvl3_src), -1U, crc32_table);
+ CRC32(crc, &vl3->otvl3_dst, sizeof (vl3->otvl3_dst), crc, crc32_table);
+ CRC32(crc, &vl3->otvl3_src_vlan, sizeof (vl3->otvl3_src_vlan), crc,
+ crc32_table);
+ return (crc);
+}
+
+static int
+overlay_ip_cmp(const void *a, const void *b)
+{
+ const overlay_target_vl3_t *l = a;
+ const overlay_target_vl3_t *r = b;
+
+ if (l->otvl3_src_vlan != r->otvl3_src_vlan)
+ return (1);
+ if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_src, &r->otvl3_src))
+ return (1);
+ if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_dst, &r->otvl3_dst))
+ return (1);
+ return (0);
}
-/* ARGSUSED */
static void
overlay_target_entry_dtor(void *arg)
{
overlay_target_entry_t *ote = arg;
ote->ote_flags = 0;
- bzero(ote->ote_addr, ETHERADDRL);
+ bzero(&ote->ote_u, sizeof (ote->ote_u));
ote->ote_ott = NULL;
ote->ote_odd = NULL;
freemsgchain(ote->ote_chead);
@@ -172,21 +225,76 @@ overlay_target_entry_dtor(void *arg)
kmem_cache_free(overlay_entry_cache, ote);
}
+static void
+overlay_target_entry_l2qq_dtor(void *arg)
+{
+ overlay_target_entry_t *ote = arg;
+ overlay_target_t *ott = ote->ote_ott;
+
+ ASSERT(MUTEX_HELD(&ott->ott_lock));
+ ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==, 0);
+
+ avl_remove(&ott->ott_u.ott_dyn.ott_tree, ote);
+ overlay_target_entry_dtor(ote);
+}
+
+static void
+overlay_target_entry_l3qq_dtor(void *arg)
+{
+ overlay_target_entry_t *ote = arg;
+ overlay_target_t *ott = ote->ote_ott;
+
+ ASSERT(MUTEX_HELD(&ott->ott_lock));
+ ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==,
+ OVERLAY_ENTRY_F_VL3);
+
+ avl_remove(&ott->ott_u.ott_dyn.ott_l3tree, ote);
+ overlay_target_entry_dtor(ote);
+}
+
static int
overlay_mac_avl(const void *a, const void *b)
{
+ const overlay_target_entry_t *le = a;
+ const overlay_target_entry_t *re = b;
+ const overlay_target_mac_t *lm = &le->ote_u.ote_vl2.otvl2_mac;
+ const overlay_target_mac_t *rm = &re->ote_u.ote_vl2.otvl2_mac;
int i;
- const overlay_target_entry_t *l, *r;
- l = a;
- r = b;
+
+ /* Order by DCID, then MAC */
+ if (lm->otm_dcid < rm->otm_dcid)
+ return (-1);
+ if (lm->otm_dcid > rm->otm_dcid)
+ return (1);
for (i = 0; i < ETHERADDRL; i++) {
- if (l->ote_addr[i] > r->ote_addr[i])
+ if (lm->otm_mac[i] > rm->otm_mac[i])
return (1);
- else if (l->ote_addr[i] < r->ote_addr[i])
+ else if (lm->otm_mac[i] < rm->otm_mac[i])
return (-1);
}
+ return (0);
+}
+static int
+overlay_ip_avl(const void *a, const void *b)
+{
+ const overlay_target_entry_t *l = a;
+ const overlay_target_entry_t *r = b;
+ const overlay_target_vl3_t *l_vl3 = &l->ote_u.ote_vl3;
+ const overlay_target_vl3_t *r_vl3 = &r->ote_u.ote_vl3;
+ int ret;
+
+ if ((ret = memcmp(&l_vl3->otvl3_src, &r_vl3->otvl3_src,
+ sizeof (l_vl3->otvl3_src))) != 0)
+ return (ret < 0 ? -1 : 1);
+ if ((ret = memcmp(&l_vl3->otvl3_dst, &r_vl3->otvl3_dst,
+ sizeof (l_vl3->otvl3_dst))) != 0)
+ return (ret < 0 ? -1 : 1);
+ if (l_vl3->otvl3_src_vlan < r_vl3->otvl3_src_vlan)
+ return (-1);
+ if (l_vl3->otvl3_src_vlan > r_vl3->otvl3_src_vlan)
+ return (1);
return (0);
}
@@ -233,25 +341,20 @@ overlay_target_free(overlay_dev_t *odd)
return;
if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
- refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
- avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
- overlay_target_entry_t *ote;
-
+ mutex_enter(&odd->odd_target->ott_lock);
/*
- * Our AVL tree and hashtable contain the same elements,
- * therefore we should just remove it from the tree, but then
- * delete the entries when we remove them from the hash table
- * (which happens through the refhash dtor).
+ * Our VL3 AVL tree and hashtable contain the same elements.
+ * Additionally, when an entry is removed from the 2Q cache,
+ * the entry is removed from the corresponding AVL tree.
+ * Deleting the 2Q cache will destroy any remaining entries,
+ * so all we need to do is destroy the 2Q caches.
*/
- while ((ote = avl_first(ap)) != NULL)
- avl_remove(ap, ote);
-
- avl_destroy(ap);
- for (ote = refhash_first(rp); ote != NULL;
- ote = refhash_next(rp, ote)) {
- refhash_remove(rp, ote);
- }
- refhash_destroy(rp);
+ qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_dhash);
+ qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_l3dhash);
+ ASSERT(avl_is_empty(&odd->odd_target->ott_u.ott_dyn.ott_tree));
+ ASSERT(avl_is_empty(
+ &odd->odd_target->ott_u.ott_dyn.ott_l3tree));
+ mutex_exit(&odd->odd_target->ott_lock);
}
ASSERT(odd->odd_target->ott_ocount == 0);
@@ -270,18 +373,42 @@ overlay_target_busy()
return (ret);
}
+/*
+ * Queue the target entry on the list of varpd requests. entry should be
+ * refheld for the duration of this call (this call takes its own additional
+ * hold that is released when we receive a response).
+ */
static void
overlay_target_queue(overlay_target_entry_t *entry)
{
+ overlay_target_t *ott = entry->ote_ott;
+ boolean_t is_vl3 = B_FALSE;
+
+ /*
+ * ote_ott is read-only and set at entry creation, so it can be
+ * read without ote_lock held
+ */
+ ASSERT(!MUTEX_HELD(&entry->ote_lock));
+
+ mutex_enter(&entry->ote_lock);
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
+ mutex_exit(&entry->ote_lock);
+
mutex_enter(&overlay_target_lock);
- mutex_enter(&entry->ote_ott->ott_lock);
- if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ if (ott->ott_flags & OVERLAY_T_TEARDOWN) {
+ mutex_exit(&ott->ott_lock);
mutex_exit(&overlay_target_lock);
return;
}
- entry->ote_ott->ott_ocount++;
- mutex_exit(&entry->ote_ott->ott_lock);
+ ott->ott_ocount++;
+ if (is_vl3)
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ mutex_exit(&ott->ott_lock);
list_insert_tail(&overlay_target_list, entry);
cv_signal(&overlay_target_condvar);
mutex_exit(&overlay_target_lock);
@@ -300,22 +427,446 @@ overlay_target_quiesce(overlay_target_t *ott)
}
/*
- * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
+ * Write the VL3 src/dst IP from the packet in mp into src and dst. If the
+ * addresses are IPv4 addresses, they are written as mapped addresses.
+ */
+static int
+overlay_get_vl3_ips(mblk_t *mp, struct in6_addr *src, struct in6_addr *dst)
+{
+ uint16_t sap;
+
+#if 1
+ /* Temporary until mblk helpers are integrated */
+ struct ether_vlan_header *eth = (struct ether_vlan_header *)mp->b_rptr;
+ ipha_t *iphp = (ipha_t *)(eth + 1);
+ ip6_t *ip6hp = (ip6_t *)(eth + 1);
+ size_t mlen = MBLKL(mp);
+
+ if (mlen < sizeof (struct ether_vlan_header))
+ return (EINVAL);
+ mlen -= sizeof (struct ether_vlan_header);
+
+ /* We currently don't support routing on untagged vlans */
+ if ((sap = ntohs(eth->ether_tpid)) != ETHERTYPE_VLAN)
+ return (EINVAL);
+
+ sap = ntohs(eth->ether_type);
+ if (mlen == 0) {
+ if ((mp = mp->b_cont) == NULL)
+ return (EINVAL);
+ mlen = MBLKL(mp);
+ iphp = (ipha_t *)mp->b_rptr;
+ ip6hp = (ip6_t *)mp->b_rptr;
+ }
+
+ switch (sap) {
+ case ETHERTYPE_IP:
+ if (mlen < sizeof (ipha_t))
+ return (EINVAL);
+ ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV4_VERSION);
+ IN6_IPADDR_TO_V4MAPPED(iphp->ipha_src, src);
+ IN6_IPADDR_TO_V4MAPPED(iphp->ipha_dst, dst);
+ break;
+ case ETHERTYPE_IPV6:
+ if (mlen < sizeof (ip6_t))
+ return (EINVAL);
+ ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV6_VERSION);
+ bcopy(&ip6hp->ip6_src, src, sizeof (*src));
+ bcopy(&ip6hp->ip6_dst, dst, sizeof (*dst));
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+#else
+ size_t soff, doff;
+ uint32_t v4s, v4d;
+ int i;
+
+ if (!mblk_read_uint16(mp, offsetof(struct ether_header, ether_type),
+ &sap))
+ return (EINVAL);
+
+ if (sap == ETHERTYPE_VLAN) {
+ if (!mblk_read_uint16(mp,
+ offsetof(struct ether_vlan_header, ether_type), &sap))
+ return (EINVAL);
+ soff = doff = sizeof (struct ether_vlan_header);
+ } else {
+ soff = doff = sizeof (struct ether_header);
+ }
+
+ switch (sap) {
+ case ETHERTYPE_IP:
+ soff += offsetof(ipha_t, ipha_src);
+ doff += offsetof(ipha_t, ipha_dst);
+
+ if (!mblk_read_uint32(mp, soff, &v4s) ||
+ !mblk_read_uint32(mp, doff, &v4d))
+ return (EINVAL);
+ IN6_IPADDR_TO_V4MAPPED(v4s, src);
+ IN6_IPADDR_TO_V4MAPPED(v4d, dst);
+ break;
+ case ETHERTYPE_IPV6:
+ soff += offsetof(ip6_t, ip6_src);
+ doff += offsetof(ip6_t, ip6_dst);
+
+ for (i = 0; i < 4; i++) {
+ if (!mblk_read_uint32(mp, soff, &src->s6_addr32[i]) ||
+ !mblk_read_uint32(mp, doff, &dst->s6_addr32[i]))
+ return (EINVAL);
+ soff += sizeof (uint32_t);
+ doff += sizeof (uint32_t);
+ }
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+#endif
+}
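overlay_get_vl3_ips() stores IPv4 addresses in V4-mapped form so that a
single in6_addr key covers both address families in the VL3 cache. A
small self-contained userland sketch of the mapping (a hand-rolled
equivalent of IN6_IPADDR_TO_V4MAPPED, for illustration):

	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <string.h>
	#include <stdio.h>

	/* Equivalent of IN6_IPADDR_TO_V4MAPPED, spelled out */
	static void
	v4mapped(in_addr_t v4, struct in6_addr *v6)
	{
		(void) memset(v6, 0, sizeof (*v6));
		v6->s6_addr[10] = 0xff;
		v6->s6_addr[11] = 0xff;
		(void) memcpy(&v6->s6_addr[12], &v4, sizeof (v4));
	}

	int
	main(void)
	{
		struct in_addr v4;
		struct in6_addr v6;
		char buf[INET6_ADDRSTRLEN];

		(void) inet_pton(AF_INET, "10.1.2.3", &v4);
		v4mapped(v4.s_addr, &v6);
		/* Prints ::ffff:10.1.2.3 */
		(void) printf("%s\n",
		    inet_ntop(AF_INET6, &v6, buf, sizeof (buf)));
		return (0);
	}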
+
+static int
+overlay_route(overlay_dev_t *odd, mblk_t *mp,
+ const overlay_target_route_t *route, const overlay_target_mac_t *dst_mac)
+{
+ uint16_t tci;
+
+ if (MBLKL(mp) >= sizeof (struct ether_vlan_header)) {
+ struct ether_vlan_header *evh;
+
+ evh = (struct ether_vlan_header *)mp->b_rptr;
+ tci = ntohs(evh->ether_tci);
+
+ /*
+ * Today we require all encapsulated frames to be vlan tagged.
+ * If this is relaxed in the future, we will need to allow for
+ * insertion and removal of the vlan tag as appropriate here.
+ */
+ if (ntohs(evh->ether_tpid) != ETHERTYPE_VLAN) {
+ OVERLAY_DROP(mp, "not vlan tagged");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ tci &= ~(VLAN_ID_MASK);
+ tci |= route->otr_vlan;
+ evh->ether_tci = htons(tci);
+ bcopy(dst_mac->otm_mac, &evh->ether_dhost, ETHERADDRL);
+ bcopy(route->otr_srcmac, &evh->ether_shost, ETHERADDRL);
+ return (OVERLAY_TARGET_OK);
+ }
+
+#if 1
+ /* Temporary until mblk helpers are integrated */
+ OVERLAY_DROP(mp, "ethernet header split between mblks");
+ return (OVERLAY_TARGET_DROP);
+#else
+ size_t off;
+
+ off = offsetof(struct ether_vlan_header, ether_tci);
+ if (!mblk_read_uint16(mp, off, &tci)) {
+ OVERLAY_DROP(mp, "cannot read tci");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ tci &= ~(VLAN_ID_MASK);
+ tci |= route->otr_vlan;
+
+ if (!mblk_write_uint16(mp, off, tci)) {
+ OVERLAY_DROP(mp, "cannot set routed destination vlan");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ for (int i = 0; i < ETHERADDRL; i++) {
+ if (!mblk_write_uint8(mp, i, dst_mac->otm_mac[i]) ||
+ !mblk_write_uint8(mp, i + ETHERADDRL,
+ route->otr_srcmac[i])) {
+ OVERLAY_DROP(mp, "cannot set routed macs");
+ return (OVERLAY_TARGET_DROP);
+ }
+ }
+
+ return (OVERLAY_TARGET_OK);
+#endif
+}
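overlay_route() clears only the low 12 bits of the TCI when retagging, so
the packet's priority bits survive the VLAN move. For reference, the
802.1Q TCI layout the masking relies on is PCP (3 bits), DEI (1 bit),
VLAN ID (12 bits, VLAN_ID_MASK); the helper name below is hypothetical:

	/*
	 * Retag a frame for the destination VLAN while preserving the
	 * PCP/DEI bits in the upper nibble of the TCI.
	 */
	static uint16_t
	overlay_retag(uint16_t tci, uint16_t vlan)
	{
		ASSERT3U(vlan, <=, VLAN_ID_MASK);
		return ((tci & ~VLAN_ID_MASK) | vlan);
	}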
+
+/*
+ * Attempt to add mp to the packet queue of target entry. If the queue is
+ * already full, it returns OVERLAY_TARGET_DROP, otherwise OVERLAY_TARGET_ASYNC
+ * is returned. If the entry isn't already pending a response from varpd,
+ * queue the target entry on the list of outstanding varpd requests.
+ *
+ * The entry should already be locked; since this is intended to be the
+ * final step in handling the packet in question, this function always
+ * releases ote_lock before returning. The entry should be refheld for
+ * the duration of this call.
+ */
+static int
+overlay_target_try_queue(overlay_target_entry_t *entry, mblk_t *mp)
+{
+ size_t mlen = msgsize(mp);
+ boolean_t queue = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&entry->ote_lock));
+
+ if (mlen + entry->ote_mbsize > overlay_ent_size) {
+ OVERLAY_DROP(mp, "target queue full");
+ mutex_exit(&entry->ote_lock);
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ if (entry->ote_ctail != NULL) {
+ ASSERT(entry->ote_ctail->b_next == NULL);
+ entry->ote_ctail->b_next = mp;
+ entry->ote_ctail = mp;
+ } else {
+ entry->ote_chead = mp;
+ entry->ote_ctail = mp;
+ }
+ entry->ote_mbsize += mlen;
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0) {
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ queue = B_TRUE;
+ }
+ mutex_exit(&entry->ote_lock);
+
+ if (queue)
+ overlay_target_queue(entry);
+
+ return (OVERLAY_TARGET_ASYNC);
+}
+
+/*
+ * Given the VL3 IP->VL2 mac entry (vl3e), and the corresponding VL2 MAC->UL3
+ * entry (vl2e), if both entries are valid, sets *vidp, *v6, *slenp to the
+ * correct UL3 destination and returns OVERLAY_TARGET_OK. If either of the
+ * entries is still pending a lookup (or vl2e is NULL because the entry is
+ * missing), mp is queued (if there is space) on the appropriate entry and
+ * OVERLAY_TARGET_ASYNC is returned. If the VL2 entry is flagged to drop all
+ * packets, OVERLAY_TARGET_DROP is returned.
+ *
+ * In all cases, the caller should acquire vl3e->ote_lock prior to calling
+ * overlay_route_lookup_vl2(). Because vl2e can be missing (NULL), the caller
+ * should not acquire vl2e->ote_lock prior to calling
+ * overlay_route_lookup_vl2(). vl3e->ote_lock is always dropped prior to
+ * returning.
+ */
+static int
+overlay_route_lookup_vl2(overlay_target_entry_t *vl3e,
+ overlay_target_entry_t *vl2e, uint64_t *vidp, struct sockaddr_in6 *v6,
+ socklen_t *slenp, mblk_t *mp)
+{
+ overlay_target_vl2_t *vl2p;
+ int ret;
+
+ ASSERT(MUTEX_HELD(&vl3e->ote_lock));
+
+ if (vl2e == NULL) {
+ vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID;
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ mutex_enter(&vl2e->ote_lock);
+ if (vl2e->ote_flags & (OVERLAY_ENTRY_F_DROP | OVERLAY_ENTRY_F_ROUTER)) {
+ overlay_target_entry_flags_t flags = vl2e->ote_flags;
+
+ mutex_exit(&vl2e->ote_lock);
+ mutex_exit(&vl3e->ote_lock);
+
+ if (flags & OVERLAY_ENTRY_F_DROP) {
+ OVERLAY_DROP(mp, "VL2 target marked drop");
+ } else {
+ OVERLAY_DROP(mp, "VL2 target is overlay router");
+ }
+
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ /*
+ * If the route is missing queue on the VL3 entry so a VL3->UL3
+ * lookup is done (to get the route data).
+ */
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_HAS_ROUTE) == 0) {
+ mutex_exit(&vl2e->ote_lock);
+
+ vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID;
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ /*
+ * If the VL2 target point is missing, we try to be a bit (though
+ * hopefully not too) clever. We can always queue on the VL3 entry
+ * which will trigger a VL3->UL3 lookup request (as it is effectively
+ * a superset of the VL2->UL3 lookup). However, if we know we already
+ * have an outstanding VL3->UL3 request, we queue on the VL2 entry and
+ * avoid doing another redundant lookup. We can also queue on the VL2
+ * entry when it is a local (same vnet, same DC) destination -- we
+ * currently cannot generate VL2->UL3 lookups for remote destinations,
+ * only same vnet, same DC. Queueing on the VL2 entry also allows
+ * instances on the same vlan as the queued VL2 entry to piggy back on
+ * the lookup request and avoid a redundant lookup. However if the
+ * VL2 entry is remote, we have to do a VL3->UL3 lookup.
+ */
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) {
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0 &&
+ vl2e->ote_u.ote_vl2.otvl2_mac.otm_dcid !=
+ vl2e->ote_odd->odd_dcid) {
+ mutex_exit(&vl2e->ote_lock);
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ mutex_exit(&vl3e->ote_lock);
+ /* This drops vl2e->ote_lock */
+ return (overlay_target_try_queue(vl2e, mp));
+ }
+
+ ASSERT(vl2e->ote_flags & OVERLAY_ENTRY_F_VALID);
+
+ vl2p = &vl2e->ote_u.ote_vl2;
+
+ *vidp = vl2p->otvl2_route.otr_vnet;
+ bcopy(&vl2p->otvl2_dest.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(vl2p->otvl2_dest.otp_port);
+ *slenp = sizeof (struct sockaddr_in6);
+
+ ret = overlay_route(vl2e->ote_odd, mp, &vl2p->otvl2_route,
+ &vl2p->otvl2_mac);
+ mutex_exit(&vl2e->ote_lock);
+ mutex_exit(&vl3e->ote_lock);
+ return (ret);
+}
+
+static int
+overlay_route_lookup(overlay_dev_t *odd, mblk_t *mp, uint16_t vlan,
+ struct sockaddr *sock, socklen_t *slenp, uint64_t *vidp)
+{
+ overlay_target_t *ott = odd->odd_target;
+ overlay_target_entry_t *entry, *vl2_entry = NULL;
+ struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)sock;
+ overlay_target_vl3_t vl3 = { 0 };
+ int ret = OVERLAY_TARGET_DROP;
+
+ /* overlay_target_lookup() should have set this */
+ ASSERT3U(v6->sin6_family, ==, AF_INET6);
+
+ /* We should only be called for dynamic endpoints */
+ ASSERT3U(ott->ott_mode, ==, OVERLAY_TARGET_DYNAMIC);
+
+ vl3.otvl3_src_vlan = vlan;
+ if ((ret = overlay_get_vl3_ips(mp, &vl3.otvl3_src, &vl3.otvl3_dst))
+ != OVERLAY_TARGET_OK) {
+ OVERLAY_DROP(mp, "could not read VL3 src/dst IPs");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ mutex_enter(&ott->ott_lock);
+ entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_l3dhash, &vl3);
+ if (entry == NULL) {
+ if ((entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
+ mutex_exit(&ott->ott_lock);
+ OVERLAY_DROP(mp, "failed VL3 target entry allocation");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ bcopy(&vl3, &entry->ote_u.ote_vl3, sizeof (vl3));
+ entry->ote_flags = OVERLAY_ENTRY_F_VL3;
+
+ entry->ote_chead = entry->ote_ctail = mp;
+ entry->ote_mbsize = msgsize(mp);
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+
+ entry->ote_ott = ott;
+ entry->ote_odd = odd;
+
+ qqcache_insert(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_l3tree, entry);
+ mutex_exit(&ott->ott_lock);
+
+ overlay_target_queue(entry);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ mutex_exit(&ott->ott_lock);
+ return (OVERLAY_TARGET_ASYNC);
+ }
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ mutex_enter(&entry->ote_lock);
+
+ /*
+ * A bit ugly, but if we need the VL2 entry, we want to look it up
+ * while we still hold ott_lock.
+ */
+ if ((entry->ote_flags &
+ (OVERLAY_ENTRY_F_DROP|OVERLAY_ENTRY_F_ROUTER|
+ OVERLAY_ENTRY_F_VALID)) == OVERLAY_ENTRY_F_VALID) {
+ vl2_entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &entry->ote_u.ote_vl3.otvl3_vl2);
+ if (vl2_entry != NULL)
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, vl2_entry);
+ }
+ mutex_exit(&ott->ott_lock);
+
+ ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3);
+
+ if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL3 target entry marked drop");
+ ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ /*
+ * XXX: A packet with a dst IP of an overlay router.
+ * Maybe generate an ICMP reply? For now, we drop.
+ */
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL3 target entry is router");
+ ret = OVERLAY_TARGET_DROP;
+ } else if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) {
+ /* This drops entry->ote_lock */
+ ret = overlay_target_try_queue(entry, mp);
+ } else {
+ /* This drops entry->ote_lock */
+ ret = overlay_route_lookup_vl2(entry, vl2_entry, vidp, v6,
+ slenp, mp);
+ }
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ if (vl2_entry != NULL)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, vl2_entry);
+ mutex_exit(&ott->ott_lock);
+ return (ret);
+}
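Both lookup paths in this file follow the same hold/rele discipline
around the qqcache: take the hold under ott_lock, drop ott_lock before
touching the entry's own ote_lock, and reacquire ott_lock only to release
the hold. Pulled out into a hypothetical skeleton:

	static void
	overlay_entry_use(overlay_target_t *ott, qqcache_t *qq,
	    const void *key)
	{
		overlay_target_entry_t *entry;

		mutex_enter(&ott->ott_lock);
		if ((entry = qqcache_lookup(qq, key)) != NULL)
			qqcache_hold(qq, entry);	/* pin across drop */
		mutex_exit(&ott->ott_lock);

		if (entry == NULL)
			return;

		mutex_enter(&entry->ote_lock);
		/* ... inspect flags, queue packets, etc ... */
		mutex_exit(&entry->ote_lock);

		mutex_enter(&ott->ott_lock);
		qqcache_rele(qq, entry);	/* 2Q may now evict it */
		mutex_exit(&ott->ott_lock);
	}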
+
+/*
+ * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
* OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
- * this time, say for NVGRE, we drop all packets that mcuh this.
+ * this time, say for NVGRE, we drop all packets that do not match this.
*/
int
overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
- socklen_t *slenp)
+ socklen_t *slenp, uint64_t *vidp)
{
int ret;
struct sockaddr_in6 *v6;
overlay_target_t *ott;
- mac_header_info_t mhi;
overlay_target_entry_t *entry;
+ mac_header_info_t mhi;
+ overlay_target_mac_t omac;
ASSERT(odd->odd_target != NULL);
+ /* Default to our local vid, routing may change this if necessary */
+ *vidp = odd->odd_vid;
+
/*
* At this point, the overlay device is in a mux which means that it's
* been activated. At this point, parts of the target, such as the mode
@@ -323,8 +874,10 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
* about synchronization for them.
*/
ott = odd->odd_target;
- if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+ if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) {
+ OVERLAY_DROP(mp, "plugin destination is not IP+port");
return (OVERLAY_TARGET_DROP);
+ }
v6 = (struct sockaddr_in6 *)sock;
bzero(v6, sizeof (struct sockaddr_in6));
@@ -343,76 +896,89 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
- /*
- * Note we only want the MAC address here, therefore we won't bother
- * using mac_vlan_header_info(). If any caller needs the vlan info at
- * this point, this should change to a call to mac_vlan_header_info().
- */
- if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+ if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0) {
+ OVERLAY_DROP(mp, "could not read vlan header");
return (OVERLAY_TARGET_DROP);
+ }
+
+ omac.otm_dcid = odd->odd_dcid;
+ bcopy(mhi.mhi_daddr, omac.otm_mac, ETHERADDRL);
+
mutex_enter(&ott->ott_lock);
- entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- mhi.mhi_daddr);
+ entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, &omac);
if (entry == NULL) {
+ overlay_target_vl2_t *vl2p;
+
entry = kmem_cache_alloc(overlay_entry_cache,
KM_NOSLEEP | KM_NORMALPRI);
if (entry == NULL) {
mutex_exit(&ott->ott_lock);
+ OVERLAY_DROP(mp, "VL2 target entry allocation failed");
return (OVERLAY_TARGET_DROP);
}
- bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
+
+ vl2p = &entry->ote_u.ote_vl2;
+ bcopy(mhi.mhi_daddr, vl2p->otvl2_mac.otm_mac, ETHERADDRL);
+ vl2p->otvl2_mac.otm_dcid = odd->odd_dcid;
+ vl2p->otvl2_route.otr_vnet = odd->odd_vid;
+ vl2p->otvl2_route.otr_vlan = VLAN_ID(mhi.mhi_tci);
+
entry->ote_chead = entry->ote_ctail = mp;
entry->ote_mbsize = msgsize(mp);
entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+
entry->ote_ott = ott;
entry->ote_odd = odd;
- refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
mutex_exit(&ott->ott_lock);
+
overlay_target_queue(entry);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
return (OVERLAY_TARGET_ASYNC);
}
- refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
mutex_exit(&ott->ott_lock);
mutex_enter(&entry->ote_lock);
if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL2 target marked drop");
ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ if (mhi.mhi_bindsap == ETHERTYPE_ARP) {
+ /*
+ * Send unicast ARP requests to varpd for processing.
+ * We will eventually need something similar for IPv6.
+ * This drops entry->ote_lock.
+ */
+ ret = overlay_target_try_queue(entry, mp);
+ } else {
+ mutex_exit(&entry->ote_lock);
+ ret = overlay_route_lookup(odd, mp,
+ VLAN_ID(mhi.mhi_tci), sock, slenp, vidp);
+ }
} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
- bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
- sizeof (struct in6_addr));
- v6->sin6_port = htons(entry->ote_dest.otp_port);
+ overlay_target_point_t *otp = &entry->ote_u.ote_vl2.otvl2_dest;
+
+ bcopy(&otp->otp_ip, &v6->sin6_addr, sizeof (struct in6_addr));
+ v6->sin6_port = htons(otp->otp_port);
+ mutex_exit(&entry->ote_lock);
+
*slenp = sizeof (struct sockaddr_in6);
ret = OVERLAY_TARGET_OK;
} else {
- size_t mlen = msgsize(mp);
-
- if (mlen + entry->ote_mbsize > overlay_ent_size) {
- ret = OVERLAY_TARGET_DROP;
- } else {
- if (entry->ote_ctail != NULL) {
- ASSERT(entry->ote_ctail->b_next ==
- NULL);
- entry->ote_ctail->b_next = mp;
- entry->ote_ctail = mp;
- } else {
- entry->ote_chead = mp;
- entry->ote_ctail = mp;
- }
- entry->ote_mbsize += mlen;
- if ((entry->ote_flags &
- OVERLAY_ENTRY_F_PENDING) == 0) {
- entry->ote_flags |=
- OVERLAY_ENTRY_F_PENDING;
- overlay_target_queue(entry);
- }
- ret = OVERLAY_TARGET_ASYNC;
- }
+ /* This drops entry->ote_lock */
+ ret = overlay_target_try_queue(entry, mp);
}
- mutex_exit(&entry->ote_lock);
mutex_enter(&ott->ott_lock);
- refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
mutex_exit(&ott->ott_lock);
return (ret);
@@ -437,6 +1003,7 @@ overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
if (odd->odd_flags & OVERLAY_F_ACTIVATED)
oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
oti->oti_vnetid = odd->odd_vid;
+ oti->oti_dcid = odd->odd_dcid;
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
return (0);
@@ -488,6 +1055,13 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
}
}
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_VARPD) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (EEXIST);
+ }
+
ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
ott->ott_flags = 0;
ott->ott_ocount = 0;
@@ -499,21 +1073,44 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
bcopy(&ota->ota_point, &ott->ott_u.ott_point,
sizeof (overlay_target_point_t));
} else {
- ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
+ int ret;
+
+ ret = qqcache_create(&ott->ott_u.ott_dyn.ott_dhash,
+ odd->odd_vl2sz, odd->odd_vl2a, OVERLAY_HSIZE,
overlay_mac_hash, overlay_mac_cmp,
- overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+ overlay_target_entry_l2qq_dtor,
+ sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_reflink),
- offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+ offsetof(overlay_target_entry_t, ote_u.ote_vl2.otvl2_mac),
+ KM_SLEEP);
+ if (ret != 0) {
+ mutex_exit(&odd->odd_lock);
+ kmem_cache_free(overlay_target_cache, ott);
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
+ ret = qqcache_create(&ott->ott_u.ott_dyn.ott_l3dhash,
+ odd->odd_routesz, odd->odd_routea, OVERLAY_HSIZE,
+ overlay_ip_hash, overlay_ip_cmp,
+ overlay_target_entry_l3qq_dtor,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_reflink),
+ offsetof(overlay_target_entry_t, ote_u.ote_vl3), KM_SLEEP);
+ if (ret != 0) {
+ mutex_exit(&odd->odd_lock);
+ qqcache_destroy(ott->ott_u.ott_dyn.ott_dhash);
+ kmem_cache_free(overlay_target_cache, ott);
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_avllink));
- }
- mutex_enter(&odd->odd_lock);
- if (odd->odd_flags & OVERLAY_F_VARPD) {
- mutex_exit(&odd->odd_lock);
- kmem_cache_free(overlay_target_cache, ott);
- overlay_hold_rele(odd);
- return (EEXIST);
+ avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_ip_avl,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_avllink));
}
odd->odd_flags |= OVERLAY_F_VARPD;
@@ -521,8 +1118,6 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
-
-
return (0);
}
@@ -601,7 +1196,16 @@ again:
entry = list_remove_head(&overlay_target_list);
mutex_exit(&overlay_target_lock);
mutex_enter(&entry->ote_lock);
- if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ /*
+ * Router entries may send lookups to varpd even when valid. For
+ * example, illumos systems will send unicast ARP queries to cached
+ * entries (including the router mac address). To answer those, we
+ * need to forward on the query to varpd. IPv6 will eventually
+ * need something similar for ND requests.
+ */
+ if ((entry->ote_flags &
+ (OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_ROUTER)) ==
+ OVERLAY_ENTRY_F_VALID) {
ASSERT(entry->ote_chead == NULL);
mutex_exit(&entry->ote_lock);
goto again;
@@ -637,10 +1241,23 @@ again:
otl->otl_hdrsize = mhi.mhi_hdrsize;
otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
- bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
- bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
- otl->otl_dsttype = mhi.mhi_dsttype;
- otl->otl_sap = mhi.mhi_bindsap;
+ if (entry->ote_flags & OVERLAY_ENTRY_F_VL3) {
+ overlay_targ_l3_t *l3p = &otl->otl_addru.otlu_l3;
+
+ otl->otl_l3req = B_TRUE;
+ bcopy(&entry->ote_u.ote_vl3.otvl3_src, &l3p->otl3_srcip,
+ sizeof (struct in6_addr));
+ bcopy(&entry->ote_u.ote_vl3.otvl3_dst, &l3p->otl3_dstip,
+ sizeof (struct in6_addr));
+ } else {
+ overlay_targ_l2_t *l2p = &otl->otl_addru.otlu_l2;
+
+ otl->otl_l3req = B_FALSE;
+ bcopy(mhi.mhi_daddr, l2p->otl2_dstaddr, ETHERADDRL);
+ bcopy(mhi.mhi_saddr, l2p->otl2_srcaddr, ETHERADDRL);
+ l2p->otl2_dsttype = mhi.mhi_dsttype;
+ l2p->otl2_sap = mhi.mhi_bindsap;
+ }
otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
mutex_exit(&entry->ote_lock);
@@ -651,12 +1268,128 @@ again:
return (0);
}
+static void
+overlay_target_lookup_respond_vl3(const overlay_targ_resp_t *otr,
+ overlay_target_entry_t *entry)
+{
+ overlay_target_entry_t *shared = NULL;
+ overlay_target_entry_t *vl2_entry;
+ overlay_target_t *ott = entry->ote_ott;
+ qqcache_t *mhash = ott->ott_u.ott_dyn.ott_dhash;
+ hrtime_t now = gethrtime();
+
+ ASSERT(MUTEX_HELD(&entry->ote_lock));
+ ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3);
+
+ /*
+ * A cross-{vlan,dc,vnet} packet with a destination VL3 of an overlay
+ * router IP. For now we drop these.
+ */
+ if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ entry->ote_flags |= OVERLAY_ENTRY_F_DROP;
+ return;
+ }
+
+ bcopy(&otr->otr_mac, &entry->ote_u.ote_vl3.otvl3_vl2,
+ sizeof (overlay_target_mac_t));
+
+ mutex_enter(&ott->ott_lock);
+ if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL)
+ qqcache_hold(mhash, shared);
+ mutex_exit(&ott->ott_lock);
+
+ /*
+ * Once we have the VL2 destination, we need to see if we already
+ * have an existing VL2 entry we can reuse. If not, we create a
+ * fully-formed (i.e. valid) VL2 entry that we add to the cache.
+ */
+ if (shared == NULL) {
+ vl2_entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI);
+ if (vl2_entry == NULL) {
+ /*
+ * If we can't allocate a VL2 entry for the VL3
+ * destination, we just give up for now and drain
+ * any queued packets. New packets will retry this
+ * allocation, so if the memory pressure lets up, we
+ * should recover.
+ */
+ freemsgchain(entry->ote_chead);
+ entry->ote_chead = entry->ote_ctail = NULL;
+ return;
+ }
+
+ vl2_entry->ote_ott = ott;
+ vl2_entry->ote_odd = entry->ote_odd;
+
+ bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ bcopy(&otr->otr_mac, &vl2_entry->ote_u.ote_vl2.otvl2_mac,
+ sizeof (overlay_target_mac_t));
+ bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route,
+ sizeof (overlay_target_route_t));
+ vl2_entry->ote_flags =
+ OVERLAY_ENTRY_F_HAS_ROUTE | OVERLAY_ENTRY_F_VALID;
+ vl2_entry->ote_vtime = entry->ote_vtime = now;
+
+ mutex_enter(&ott->ott_lock);
+ if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) {
+ overlay_target_entry_dtor(vl2_entry);
+ qqcache_hold(mhash, shared);
+
+ vl2_entry = shared;
+ } else {
+ qqcache_insert(mhash, vl2_entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, vl2_entry);
+ qqcache_hold(mhash, vl2_entry);
+ }
+ mutex_exit(&ott->ott_lock);
+ } else {
+ vl2_entry = shared;
+ }
+
+ mutex_enter(&vl2_entry->ote_lock);
+ if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_HAS_ROUTE)) == 0) {
+ bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route,
+ sizeof (overlay_target_route_t));
+ vl2_entry->ote_flags |= OVERLAY_ENTRY_F_HAS_ROUTE;
+ }
+
+ /*
+ * Update the VL2 entry if it doesn't have a valid destination, hasn't
+ * been marked as dropping all packets, and doesn't have an existing
+ * outstanding request. If a route and VL2 request involving the
+ * same VL2 destination are pending and the route response is processed
+ * prior to the VL2 request, we will continue to queue (on the VL2
+ * entry) until the VL2 response is received, even though we have
+ * an answer from the route response. If we set the valid flag
+ * while there's still oustanding requests, it will cause problems
+ * with the outstanding requests.
+ */
+ if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_PENDING|
+ OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_DROP)) == 0) {
+ bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ vl2_entry->ote_vtime = gethrtime();
+ vl2_entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ }
+ mutex_exit(&vl2_entry->ote_lock);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(mhash, vl2_entry);
+ mutex_exit(&ott->ott_lock);
+}
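
The allocate-then-recheck shape above (lookup, drop the lock, allocate, then
look up again under the lock before inserting) is the usual way to avoid
holding a cache lock across an allocation that may fail or sleep. A minimal
sketch of the pattern in isolation (all names here are illustrative, not the
driver's actual API):

	static entry_t *
	get_or_create(cache_t *c, kmutex_t *lock, const void *key)
	{
		entry_t *e, *fresh;

		mutex_enter(lock);
		e = cache_lookup(c, key);	/* fast path: already cached */
		mutex_exit(lock);
		if (e != NULL)
			return (e);

		/* Allocate without the lock held; this may fail or sleep. */
		if ((fresh = entry_alloc()) == NULL)
			return (NULL);

		mutex_enter(lock);
		if ((e = cache_lookup(c, key)) != NULL) {
			/* Lost the race: another thread inserted first. */
			entry_free(fresh);
		} else {
			cache_insert(c, fresh);
			e = fresh;
		}
		mutex_exit(lock);
		return (e);
	}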
+
static int
overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
{
const overlay_targ_resp_t *otr = arg;
+ overlay_target_t *ott;
overlay_target_entry_t *entry;
mblk_t *mp;
+ boolean_t is_vl3 = B_FALSE;
mutex_enter(&thdl->oth_lock);
for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
@@ -673,38 +1406,88 @@ overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&thdl->oth_lock);
mutex_enter(&entry->ote_lock);
- bcopy(&otr->otr_answer, &entry->ote_dest,
- sizeof (overlay_target_point_t));
+ ott = entry->ote_ott;
+
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
+
+ /*
+ * If we ever support a protocol that uses MAC addresses as the UL
+ * destination address, this check should probably include checking
+ * that otp_mac is also all zeros.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) &&
+ otr->otr_answer.otp_port == 0)
+ entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
+
+ if (!is_vl3) {
+ bcopy(&otr->otr_answer, &entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ entry->ote_vtime = gethrtime();
+ } else {
+ overlay_target_lookup_respond_vl3(otr, entry);
+ }
+
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+
mp = entry->ote_chead;
entry->ote_chead = NULL;
entry->ote_ctail = NULL;
entry->ote_mbsize = 0;
- entry->ote_vtime = gethrtime();
mutex_exit(&entry->ote_lock);
/*
- * For now do an in-situ drain.
+	 * For now do an in-situ drain. For VL3 entries, if we re-use
+	 * an existing VL2 entry, it is possible the VL2 lookup is still
+	 * pending (though this should be rare). In such instances, the
+	 * packets queued on the VL3 entry will get queued on the VL2 entry
+	 * until the VL2 entry is resolved.
*/
mp = overlay_m_tx(entry->ote_odd, mp);
freemsgchain(mp);
- mutex_enter(&entry->ote_ott->ott_lock);
- entry->ote_ott->ott_ocount--;
- cv_signal(&entry->ote_ott->ott_cond);
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ ott->ott_ocount--;
+ if (is_vl3)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ cv_signal(&ott->ott_cond);
+ mutex_exit(&ott->ott_lock);
return (0);
}
+static boolean_t
+overlay_target_for_varpd(overlay_dev_t *odd, mblk_t *mp)
+{
+ mac_header_info_t mhi;
+
+ /* We should have dropped runts prior to ever queueing */
+ VERIFY0(mac_vlan_header_info(odd->odd_mh, mp, &mhi));
+ if (mhi.mhi_bindsap == ETHERTYPE_ARP)
+ return (B_TRUE);
+
+ /* TODO: NDP packets */
+ return (B_FALSE);
+}
+
+typedef enum overlay_target_lookup_drop_act {
+ OTLDA_NONE,
+ OTLDA_QUEUE,
+ OTLDA_DELETE
+} overlay_target_lookup_drop_act_t;
+
static int
overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
{
const overlay_targ_resp_t *otr = arg;
+ overlay_target_t *ott;
overlay_target_entry_t *entry;
mblk_t *mp;
- boolean_t queue = B_FALSE;
+ overlay_target_lookup_drop_act_t action = OTLDA_NONE;
+ boolean_t is_vl3 = B_FALSE;
mutex_enter(&thdl->oth_lock);
for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
@@ -721,9 +1504,19 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&thdl->oth_lock);
mutex_enter(&entry->ote_lock);
+ ott = entry->ote_ott;
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
- /* Safeguard against a confused varpd */
- if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ mp = entry->ote_chead;
+
+ /*
+	 * Safeguard against a confused varpd. Packets handled by varpd
+	 * itself (e.g. ARP requests) may elicit replies that require us to
+	 * drop the queued packet, even when the entry is valid.
+ */
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) &&
+ !overlay_target_for_varpd(entry->ote_odd, mp)) {
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
DTRACE_PROBE1(overlay__target__valid__drop,
overlay_target_entry_t *, entry);
@@ -731,7 +1524,21 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
goto done;
}
- mp = entry->ote_chead;
+ /*
+ * If varpd is instructing us to drop the head mblk in a VL2 entry,
+ * this could be because it's already provided a response (e.g. an
+ * ARP reply), and the entry itself might still be used for other
+	 * purposes. VL3 entries on the other hand have no such uses. If
+ * we are told to drop the packet, there is no reason to retain
+ * the VL3 entry and we can delete it.
+ */
+ if (is_vl3) {
+ action = OTLDA_DELETE;
+ mutex_exit(&entry->ote_lock);
+ goto done;
+ }
+
+ /* Drop the first packet in entry */
if (mp != NULL) {
entry->ote_chead = mp->b_next;
mp->b_next = NULL;
@@ -739,23 +1546,34 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
entry->ote_ctail = entry->ote_chead;
entry->ote_mbsize -= msgsize(mp);
}
+
if (entry->ote_chead != NULL) {
- queue = B_TRUE;
+ action = OTLDA_QUEUE;
entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
} else {
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
}
mutex_exit(&entry->ote_lock);
- if (queue == B_TRUE)
+ if (action == OTLDA_QUEUE)
overlay_target_queue(entry);
freemsg(mp);
done:
- mutex_enter(&entry->ote_ott->ott_lock);
- entry->ote_ott->ott_ocount--;
- cv_signal(&entry->ote_ott->ott_cond);
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ ott->ott_ocount--;
+ if (action == OTLDA_DELETE) {
+ /* overlay_target_entry_dtor() will free the mblk chain */
+ qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ }
+
+ if (is_vl3)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ cv_signal(&ott->ott_cond);
+ mutex_exit(&ott->ott_lock);
return (0);
}
@@ -1083,31 +1901,35 @@ overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
sizeof (overlay_target_point_t));
} else {
overlay_target_entry_t *ote;
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
- if (ote != NULL) {
- mutex_enter(&ote->ote_lock);
- if ((ote->ote_flags &
- OVERLAY_ENTRY_F_VALID_MASK) != 0) {
- if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
- otc->otc_entry.otce_flags =
- OVERLAY_TARGET_CACHE_DROP;
- } else {
- otc->otc_entry.otce_flags = 0;
- bcopy(&ote->ote_dest,
- &otc->otc_entry.otce_dest,
- sizeof (overlay_target_point_t));
- }
- ret = 0;
+
+ if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac)) == NULL) {
+ ret = ENOENT;
+ goto done;
+ }
+
+ mutex_enter(&ote->ote_lock);
+ if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+ if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_DROP;
+ } else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_ROUTER;
} else {
- ret = ENOENT;
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ote->ote_u.ote_vl2.otvl2_dest,
+ &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
}
- mutex_exit(&ote->ote_lock);
+ ret = 0;
} else {
ret = ENOENT;
}
+ mutex_exit(&ote->ote_lock);
}
+done:
mutex_exit(&ott->ott_lock);
overlay_hold_rele(odd);
@@ -1120,53 +1942,64 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
{
overlay_dev_t *odd;
overlay_target_t *ott;
- overlay_target_entry_t *ote;
+ overlay_target_entry_t *ote, *new = NULL;
overlay_targ_cache_t *otc = arg;
mblk_t *mp = NULL;
- if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+ if (otc->otc_entry.otce_flags &
+ ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
+ return (EINVAL);
+
+ if (otc->otc_entry.otce_flags ==
+ (OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
return (EINVAL);
odd = overlay_hold_by_dlid(otc->otc_linkid);
if (odd == NULL)
return (ENOENT);
+ /*
+ * Optimistically create the new entry. If not needed, we'll free it.
+	 * This ioctl shouldn't be called rapidly enough for any potential
+	 * alloc/free churn to cause a problem.
+ */
+ new = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
+ bcopy(&otc->otc_entry.otce_mac, &new->ote_u.ote_vl2.otvl2_mac,
+ sizeof (overlay_target_mac_t));
+
mutex_enter(&odd->odd_lock);
if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
+ overlay_target_entry_dtor(new);
return (ENXIO);
}
ott = odd->odd_target;
if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
+ overlay_target_entry_dtor(new);
return (ENOTSUP);
}
+
+ new->ote_ott = ott;
+ new->ote_odd = odd;
+
mutex_enter(&ott->ott_lock);
mutex_exit(&odd->odd_lock);
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
- if (ote == NULL) {
- ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
- bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
- ote->ote_chead = ote->ote_ctail = NULL;
- ote->ote_mbsize = 0;
- ote->ote_ott = ott;
- ote->ote_odd = odd;
- mutex_enter(&ote->ote_lock);
- refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
- avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
- } else {
- mutex_enter(&ote->ote_lock);
- }
+ if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac)) == NULL)
+ ote = new;
+ mutex_enter(&ote->ote_lock);
if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
} else {
ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
- bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
+ if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER)
+ ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
+ bcopy(&otc->otc_entry.otce_dest, &ote->ote_u.ote_vl2.otvl2_dest,
sizeof (overlay_target_point_t));
mp = ote->ote_chead;
ote->ote_chead = NULL;
@@ -1175,6 +2008,10 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
ote->ote_vtime = gethrtime();
}
+ if (ote == new) {
+ qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
+ }
mutex_exit(&ote->ote_lock);
mutex_exit(&ott->ott_lock);
@@ -1185,6 +2022,9 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
overlay_hold_rele(odd);
+ if (ote != new)
+ overlay_target_entry_dtor(new);
+
return (0);
}
@@ -1217,8 +2057,11 @@ overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
mutex_enter(&ott->ott_lock);
mutex_exit(&odd->odd_lock);
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
+ if (otc->otc_entry.otce_mac.otm_dcid == 0)
+ otc->otc_entry.otce_mac.otm_dcid = odd->odd_dcid;
+
+ ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac);
if (ote != NULL) {
mutex_enter(&ote->ote_lock);
ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
@@ -1269,8 +2112,13 @@ overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
mutex_exit(&ote->ote_lock);
}
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
+
+ avl = &ott->ott_u.ott_dyn.ott_l3tree;
+ for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
+ mutex_enter(&ote->ote_lock);
+ ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+ mutex_exit(&ote->ote_lock);
+ }
mutex_exit(&ott->ott_lock);
overlay_hold_rele(odd);
@@ -1304,9 +2152,10 @@ overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
}
typedef struct overlay_targ_cache_marker {
- uint8_t otcm_mac[ETHERADDRL];
+ overlay_target_mac_t otcm_mac;
uint16_t otcm_done;
-} overlay_targ_cache_marker_t;
+} overlay_targ_cache_marker_t __aligned(8);
+CTASSERT(sizeof (overlay_targ_cache_marker_t) == 2 * sizeof (uint64_t));
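
(The marker is round-tripped through the iterator's uint64_t otci_marker[2]
field -- see overlay_targ_cache_iter_t in overlay_target.h below -- hence the
size and alignment assertions.)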
/* ARGSUSED */
static int
@@ -1356,7 +2205,7 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
if (ott->ott_mode == OVERLAY_TARGET_POINT) {
overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
- bzero(out->otce_mac, ETHERADDRL);
+ bzero(&out->otce_mac, sizeof (out->otce_mac));
out->otce_flags = 0;
bcopy(&ott->ott_u.ott_point, &out->otce_dest,
sizeof (overlay_target_point_t));
@@ -1365,7 +2214,9 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
}
avl = &ott->ott_u.ott_dyn.ott_tree;
- bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
+ lookup.ote_u.ote_vl2.otvl2_mac.otm_dcid = odd->odd_dcid;
+ bcopy(&mark->otcm_mac, &lookup.ote_u.ote_vl2.otvl2_mac,
+ sizeof (mark->otcm_mac));
ent = avl_find(avl, &lookup, &where);
/*
@@ -1390,19 +2241,21 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&ent->ote_lock);
continue;
}
- bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
+ bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &out->otce_mac,
+ sizeof (out->otce_mac));
out->otce_flags = 0;
if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
- bcopy(&ent->ote_dest, &out->otce_dest,
+ bcopy(&ent->ote_u.ote_vl2.otvl2_dest, &out->otce_dest,
sizeof (overlay_target_point_t));
written++;
mutex_exit(&ent->ote_lock);
}
if (ent != NULL) {
- bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
+ bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &mark->otcm_mac,
+ sizeof (mark->otcm_mac));
} else {
mark->otcm_done = 1;
}
@@ -1432,6 +2285,122 @@ overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
return (0);
}
+/*
+ * Take an IPv6 address + prefix length, and turn it into the network address.
+ * E.g. ::ffff:192.168.51.50/120 -> ::ffff:192.168.51.0
+ */
+static void
+overlay_in6_to_subnet(const struct in6_addr *src, struct in6_addr *dst,
+ uint8_t prefixlen)
+{
+ uint32_t val;
+
+ for (size_t i = 0; i < 4; i++) {
+ val = ntohl(src->_S6_un._S6_u32[i]);
+ val &= IN6_MASK_FROM_PREFIX(i, prefixlen);
+ dst->_S6_un._S6_u32[i] = htonl(val);
+ }
+}
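
A worked pass through the loop for the comment's own example, assuming
IN6_MASK_FROM_PREFIX(i, prefixlen) yields the host-order mask for 32-bit
word i: with src = ::ffff:192.168.51.50 and prefixlen = 120,

	word 0: 0x00000000 & 0xffffffff = 0x00000000
	word 1: 0x00000000 & 0xffffffff = 0x00000000
	word 2: 0x0000ffff & 0xffffffff = 0x0000ffff
	word 3: 0xc0a83332 & 0xffffff00 = 0xc0a83300	(192.168.51.0)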
+
+/*
+ * Find the first target entry whose source IP falls within the source subnet
+ * given by otcne. If no entries match, NULL is returned.
+ */
+static overlay_target_entry_t *
+overlay_target_cache_first_net(overlay_target_t *ott,
+ const overlay_targ_cache_net_entry_t *otcne)
+{
+ avl_tree_t *avl;
+ overlay_target_entry_t *ote;
+ struct in6_addr *start;
+ overlay_target_entry_t cmp = { 0 };
+ avl_index_t where = { 0 };
+
+ ASSERT(MUTEX_HELD(&ott->ott_lock));
+
+ avl = &ott->ott_u.ott_dyn.ott_l3tree;
+ start = &cmp.ote_u.ote_vl3.otvl3_src;
+
+ /*
+ * The first possible source address for a subnet is the network
+	 * address (e.g. 192.168.10.0 for a /24). While it normally shouldn't
+ * appear, we either start here, or at the first entry after where
+ * it would exist if present. This should be the first possible
+ * entry in the subnet. If it's not within the subnet, then we
+ * know no entries with that source subnet are present.
+ */
+ overlay_in6_to_subnet(&otcne->otcne_src, start,
+ otcne->otcne_src_prefixlen);
+
+ if ((ote = avl_find(avl, &cmp, &where)) == NULL)
+ ote = avl_nearest(avl, where, AVL_AFTER);
+
+ if (ote == NULL || !IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src,
+ &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen))
+ return (NULL);
+
+ return (ote);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_remove_net(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_cache_net_t *otcn = arg;
+ overlay_targ_cache_net_entry_t *otcne = &otcn->otcn_entry;
+ overlay_dev_t *odd = NULL;
+ overlay_target_t *ott = NULL;
+ overlay_target_entry_t *ote = NULL, *ote_next = NULL;
+ avl_tree_t *avl = NULL;
+
+ odd = overlay_hold_by_dlid(otcn->otcn_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ avl = &ott->ott_u.ott_dyn.ott_l3tree;
+
+ for (ote = overlay_target_cache_first_net(ott, otcne);
+ ote != NULL && IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src,
+ &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen);
+ ote = ote_next) {
+ ote_next = AVL_NEXT(avl, ote);
+
+ /*
+		 * Entries are sorted by src ip, dst ip, and src vlan, so
+		 * there can be entries from this src ip to destinations on
+		 * other subnets besides the one we are removing; those need
+		 * to be skipped over.
+ */
+ if (ote->ote_u.ote_vl3.otvl3_src_vlan != otcne->otcne_vlan)
+ continue;
+
+ if (!IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_dst,
+ &ote->ote_u.ote_vl3.otvl3_dst, otcne->otcne_dst_prefixlen))
+ continue;
+
+ qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, ote);
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
static overlay_target_ioctl_t overlay_target_ioctab[] = {
{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
NULL, overlay_target_info,
@@ -1492,6 +2461,9 @@ static overlay_target_ioctl_t overlay_target_ioctab[] = {
overlay_target_cache_iter,
overlay_target_cache_iter_copyout,
sizeof (overlay_targ_cache_iter_t) },
+ { OVERLAY_TARG_CACHE_REMOVE_NET, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_remove_net,
+ NULL, sizeof (overlay_targ_cache_net_t) },
{ 0 }
};
diff --git a/usr/src/uts/common/qqcache/qqcache.c b/usr/src/uts/common/qqcache/qqcache.c
new file mode 100644
index 0000000000..ccd90c3814
--- /dev/null
+++ b/usr/src/uts/common/qqcache/qqcache.c
@@ -0,0 +1,444 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/null.h>
+#include <sys/types.h>
+#include <sys/qqcache.h>
+#include <sys/qqcache_impl.h>
+#include <sys/stddef.h>
+
+/*
+ * Currently, the non _KERNEL pieces are to support testing in usr/src/test.
+ */
+#ifdef _KERNEL
+#include <sys/kmem.h>
+#define ZALLOC kmem_zalloc
+#define FREE kmem_free
+#else
+#include <umem.h>
+#define ZALLOC umem_zalloc
+#define FREE umem_free
+#endif
+
+
+/*
+ * The *_overflow functions mimic the gcc/clang intrinsic functions. Once
+ * we are using a newer compiler version that includes these as intrinsics,
+ * these can be replaced with those versions.
+ */
+static int
+uadd_overflow(const size_t a, const size_t b, size_t *sump)
+{
+ *sump = a + b;
+ if (*sump < a || *sump < b)
+ return (1);
+ return (0);
+}
+
+#define MUL_NO_OVERFLOW ((size_t)1 << (sizeof (size_t) * 4))
+
+static int
+umul_overflow(const size_t a, const size_t b, size_t *cp)
+{
+ *cp = a * b;
+
+ if ((a >= MUL_NO_OVERFLOW || b >= MUL_NO_OVERFLOW) &&
+ a != 0 && b != 0 && SIZE_MAX / a < b)
+ return (1);
+
+ return (0);
+}
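
For reference, the intrinsics the comment above refers to are
__builtin_add_overflow() and __builtin_mul_overflow() (gcc 5+, clang 3.8+).
Once those can be assumed, the two helpers could plausibly reduce to a sketch
like:

	static int
	uadd_overflow(const size_t a, const size_t b, size_t *sump)
	{
		/* The builtin returns true on overflow. */
		return (__builtin_add_overflow(a, b, sump) ? 1 : 0);
	}

	static int
	umul_overflow(const size_t a, const size_t b, size_t *cp)
	{
		return (__builtin_mul_overflow(a, b, cp) ? 1 : 0);
	}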
+
+/* Calculate the capacity of each list based on sz and a */
+static void
+qqcache_size_lists(size_t sz, size_t a, size_t *maxp)
+{
+ VERIFY3U(sz, >=, QQCACHE_NUM_LISTS);
+
+ /*
+ * The general approach is to start with list 0 being sized as a% of
+ * sz. However every other list must be able to hold at least one
+ * entry unless a == 100 (i.e. 100%). If the straight percentage
+ * leaves any of the remaining lists with zero entries, we give them
+	 * a size of 1, and then adjust list 0's size accordingly so that the
+	 * sum of all list sizes == sz (this is mostly only a concern when
+ * sz is small enough such that (100 - a)% of sz < QQCACHE_NUM_LISTS).
+ */
+ size_t list0sz = sz * a / 100;
+ size_t othersz = (sz - list0sz) / (QQCACHE_NUM_LISTS - 1);
+
+ if (list0sz == 0)
+ list0sz = 1;
+
+ if (othersz == 0 && a != 100)
+ othersz = 1;
+
+ if (list0sz + othersz * (QQCACHE_NUM_LISTS - 1) > sz)
+ list0sz = sz - othersz * (QQCACHE_NUM_LISTS - 1);
+
+ maxp[0] = list0sz;
+ for (size_t i = 1; i < QQCACHE_NUM_LISTS; i++)
+ maxp[i] = othersz;
+}
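
Two worked cases with QQCACHE_NUM_LISTS == 2 (the inputs are assumed values,
not from the source):

	sz = 100, a = 25: list0sz = 25, othersz = (100 - 25) / 1 = 75, so
	    maxp = { 25, 75 }
	sz = 10, a = 95: list0sz = 9, othersz = (10 - 9) / 1 = 1, so
	    maxp = { 9, 1 }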
+
+int
+qqcache_create(qqcache_t **qp, size_t sz, size_t a, size_t buckets,
+ qqcache_hash_fn_t hash_fn, qqcache_cmp_fn_t cmp_fn,
+ qqcache_dtor_fn_t dtor_fn, size_t elsize, size_t link_off, size_t tag_off,
+ int kmflags)
+{
+ qqcache_t *qc;
+ size_t len = 0;
+
+ if (sz < QQCACHE_MIN_SIZE)
+ return (EINVAL);
+ if (a > 100)
+ return (EINVAL);
+
+ if (umul_overflow(sizeof (qqcache_list_t), buckets, &len))
+ return (EINVAL);
+ if (uadd_overflow(sizeof (*qc), len, &len))
+ return (EINVAL);
+
+ if ((qc = ZALLOC(len, kmflags)) == NULL)
+ return (ENOMEM);
+
+ qc->qqc_hash_fn = hash_fn;
+ qc->qqc_cmp_fn = cmp_fn;
+ qc->qqc_dtor_fn = dtor_fn;
+ qc->qqc_link_off = link_off;
+ qc->qqc_tag_off = tag_off;
+ qc->qqc_nbuckets = buckets;
+ qc->qqc_size = sz;
+ qc->qqc_a = a;
+
+ qqcache_size_lists(sz, a, qc->qqc_max);
+
+ for (size_t i = 0; i < buckets; i++) {
+ list_create(&qc->qqc_buckets[i].qqcl_list, elsize,
+ offsetof(qqcache_link_t, qqln_hash_link));
+ }
+
+ for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) {
+ list_create(&qc->qqc_lists[i].qqcl_list, elsize,
+ offsetof(qqcache_link_t, qqln_list_link));
+ }
+
+ *qp = qc;
+ return (0);
+}
+
+void
+qqcache_destroy(qqcache_t *qc)
+{
+ size_t len;
+
+ if (qc == NULL)
+ return;
+
+ /* If creation succeeded, this calculation cannot overflow */
+ len = sizeof (*qc) + qc->qqc_nbuckets * sizeof (qqcache_list_t);
+
+ for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) {
+ list_t *l = &qc->qqc_lists[i].qqcl_list;
+ qqcache_link_t *lnk;
+
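+		/*
+		 * Just unlink the entries from the cache lists here; they
+		 * are freed below when the hash buckets are drained.
+		 */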
+ while ((lnk = list_remove_head(l)) != NULL)
+ ;
+ }
+
+ for (size_t i = 0; i < qc->qqc_nbuckets; i++) {
+ list_t *l = &qc->qqc_buckets[i].qqcl_list;
+ qqcache_link_t *lnk;
+
+ while ((lnk = list_remove_head(l)) != NULL) {
+ ASSERT0(lnk->qqln_refcnt);
+ qc->qqc_dtor_fn(link_to_obj(qc, lnk));
+ }
+ }
+
+ FREE(qc, len);
+}
+
+/*
+ * Removal of an entry is a two step process. qqcache_remove() removes the
+ * entry from the cache lists, and if a reference is held, sets the
+ * QQCACHE_F_DEAD flag. When there are no more references held on an entry
+ * (either none are held at the time qqcache_remove() is called, or the last
+ * reference is removed via qqcache_rele()), qqcache_delete() is called,
+ * which removes the entry from its hash bucket and calls the entry's dtor
+ * function.
+ *
+ * The main reason for the two step process is largely simplicity. If the
+ * entry remained in the cache lists w/ the QQCACHE_F_DEAD flag set, it
+ * would complicate keeping each cache list within its size limits -- either
+ * the list size must reflect the number of non-dead entries (which could be
+ * confusing during troubleshooting), or as we push things down the list, we
+ * would need to skip/ignore dead entries. The hash buckets however don't
+ * have any size limits (to impose limits would require the hash function
+ * provided by the consumer to produce a perfectly equal distribution of
+ * entries across all the hash buckets at all times). The only time we care
+ * about the QQCACHE_F_DEAD flag in the hash buckets is when trying to look
+ * up a 'dead' value, so leaving dead entries in the hash buckets does not
+ * present the same issues as leaving them in the cache lists (while still
+ * providing a way to find refheld entries).
+ */
+static void
+qqcache_delete(qqcache_t *qc, qqcache_link_t *lp)
+{
+ void *op = link_to_obj(qc, lp);
+ void *tp = obj_to_tag(qc, op);
+ uint_t n = qc->qqc_hash_fn(tp) % qc->qqc_nbuckets;
+
+ ASSERT3U(qc->qqc_buckets[n].qqcl_len, >, 0);
+ ASSERT(!list_is_empty(&qc->qqc_buckets[n].qqcl_list));
+ ASSERT(!list_link_active(&lp->qqln_list_link));
+ ASSERT(list_link_active(&lp->qqln_hash_link));
+
+ list_remove(&qc->qqc_buckets[n].qqcl_list, lp);
+ qc->qqc_buckets[n].qqcl_len--;
+ qc->qqc_dtor_fn(op);
+}
+
+void
+qqcache_remove(qqcache_t *qc, void *op)
+{
+ qqcache_link_t *lp = obj_to_link(qc, op);
+ qqcache_list_t *lst = QQCACHE_LIST(qc, lp);
+
+ ASSERT(!list_is_empty(&lst->qqcl_list));
+ ASSERT3U(lst->qqcl_len, >, 0);
+
+ list_remove(&lst->qqcl_list, lp);
+ lst->qqcl_len--;
+
+ if (lp->qqln_refcnt > 0)
+ lp->qqln_flags |= QQCACHE_F_DEAD;
+ else
+ qqcache_delete(qc, lp);
+}
+
+void
+qqcache_hold(qqcache_t *qc, void *op)
+{
+ qqcache_link_t *lp = obj_to_link(qc, op);
+
+ ++lp->qqln_refcnt;
+}
+
+void
+qqcache_rele(qqcache_t *qc, void *op)
+{
+ qqcache_link_t *lp = obj_to_link(qc, op);
+
+ VERIFY3U(lp->qqln_refcnt, >, 0);
+
+ if (--lp->qqln_refcnt == 0 && (lp->qqln_flags & QQCACHE_F_DEAD))
+ qqcache_delete(qc, lp);
+}
+
+static qqcache_link_t *
+qqcache_hash_lookup(qqcache_t *qc, const void *tp, qqcache_list_t **lpp)
+{
+ uint_t n = qc->qqc_hash_fn(tp) % qc->qqc_nbuckets;
+ qqcache_link_t *lp;
+ qqcache_list_t *bucket = &qc->qqc_buckets[n];
+ list_t *l = &bucket->qqcl_list;
+ void *cmp;
+
+ if (lpp != NULL)
+ *lpp = bucket;
+
+ for (lp = list_head(l); lp != NULL; lp = list_next(l, lp)) {
+ cmp = obj_to_tag(qc, link_to_obj(qc, lp));
+
+ if (qc->qqc_cmp_fn(cmp, tp) == 0 &&
+ !(lp->qqln_flags & QQCACHE_F_DEAD)) {
+ return (lp);
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Starting at listnum, push entries from the tail of cache list 'n' to the
+ * head of list 'n + 1', keeping each list within its size limits. Excess
+ * entries on the tail of the last list are deleted. If 'for_insert' is
+ * B_TRUE, also guarantee after this returns that there are no more than
+ * 'max - 1' entries on listnum (so there is room to insert an entry onto
+ * listnum).
+ */
+static void
+qqcache_ripple(qqcache_t *qc, uint_t listnum, boolean_t for_insert)
+{
+ VERIFY3U(listnum, <, QQCACHE_NUM_LISTS);
+
+ for (uint_t i = listnum; i < QQCACHE_NUM_LISTS; i++) {
+ qqcache_list_t *ql = &qc->qqc_lists[i];
+ qqcache_list_t *qlnext = &qc->qqc_lists[i + 1];
+ size_t max = qc->qqc_max[i];
+
+ ASSERT3U(max, >, 0);
+
+ /*
+ * If we're planning to insert an entry on list 'listnum',
+ * we bump the maximum size down by one to guarantee we
+ * have sufficient room for the entry
+ */
+ if (for_insert && i == listnum)
+ max--;
+
+ while (ql->qqcl_len > max) {
+ qqcache_link_t *lnk = list_tail(&ql->qqcl_list);
+
+ if (i + 1 < QQCACHE_NUM_LISTS) {
+ list_remove(&ql->qqcl_list, lnk);
+ ql->qqcl_len--;
+
+ ASSERT3U(lnk->qqln_listnum, ==, i);
+ lnk->qqln_listnum++;
+
+ list_insert_head(&qlnext->qqcl_list, lnk);
+ qlnext->qqcl_len++;
+ } else {
+ qqcache_remove(qc, link_to_obj(qc, lnk));
+ }
+ }
+ }
+}
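
A worked trace with assumed maxima: if qqc_max = { 3, 7 }, list 0 holds 4
entries, and list 1 holds 7 when qqcache_ripple(qc, 0, B_FALSE) runs, the
first pass moves list 0's tail entry to the head of list 1 (lengths become 3
and 8), and the second pass finds list 1 over its maximum with no successor
list, so its tail entry is removed from the cache entirely (lengths 3 and 7).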
+
+int
+qqcache_insert(qqcache_t *qc, void *obj)
+{
+ qqcache_link_t *lp = obj_to_link(qc, obj);
+ qqcache_list_t *bucket;
+
+ if (qqcache_hash_lookup(qc, obj_to_tag(qc, obj), &bucket) != NULL)
+ return (EEXIST);
+
+ list_link_init(&lp->qqln_hash_link);
+ list_link_init(&lp->qqln_list_link);
+ lp->qqln_refcnt = 0;
+ lp->qqln_flags = 0;
+ lp->qqln_listnum = QQCACHE_INSERT_LIST;
+
+ qqcache_ripple(qc, QQCACHE_INSERT_LIST, B_TRUE);
+
+ list_insert_tail(&bucket->qqcl_list, lp);
+ bucket->qqcl_len++;
+
+ list_insert_head(&qc->qqc_lists[QQCACHE_INSERT_LIST].qqcl_list, lp);
+ qc->qqc_lists[QQCACHE_INSERT_LIST].qqcl_len++;
+
+ return (0);
+}
+
+void *
+qqcache_lookup(qqcache_t *qc, const void *tp)
+{
+ qqcache_link_t *lp;
+ qqcache_list_t *src;
+ uint_t tgtnum;
+
+ if ((lp = qqcache_hash_lookup(qc, tp, NULL)) == NULL)
+ return (NULL);
+
+ src = QQCACHE_LIST(qc, lp);
+ list_remove(&src->qqcl_list, lp);
+ src->qqcl_len--;
+
+ tgtnum = (lp->qqln_listnum > 0) ? lp->qqln_listnum - 1 : 0;
+
+ if (tgtnum != lp->qqln_listnum)
+ qqcache_ripple(qc, tgtnum, B_TRUE);
+
+ lp->qqln_listnum = tgtnum;
+ list_insert_head(&qc->qqc_lists[tgtnum].qqcl_list, lp);
+ qc->qqc_lists[tgtnum].qqcl_len++;
+
+ return (link_to_obj(qc, lp));
+}
+
+int
+qqcache_adjust_size(qqcache_t *qc, size_t sz)
+{
+ if (sz < QQCACHE_MIN_SIZE)
+ return (EINVAL);
+
+ qc->qqc_size = sz;
+ qqcache_size_lists(sz, qc->qqc_a, qc->qqc_max);
+ qqcache_ripple(qc, 0, B_FALSE);
+ return (0);
+}
+
+int
+qqcache_adjust_a(qqcache_t *qc, size_t a)
+{
+ if (a > 100)
+ return (EINVAL);
+
+ qc->qqc_a = a;
+ qqcache_size_lists(qc->qqc_size, a, qc->qqc_max);
+ qqcache_ripple(qc, 0, B_FALSE);
+ return (0);
+}
+
+size_t
+qqcache_size(const qqcache_t *qc)
+{
+ return (qc->qqc_size);
+}
+
+size_t
+qqcache_a(const qqcache_t *qc)
+{
+ return (qc->qqc_a);
+}
+
+void *
+qqcache_first(qqcache_t *qc)
+{
+ for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) {
+ qqcache_list_t *l = &qc->qqc_lists[i];
+
+ if (l->qqcl_len > 0)
+ return (link_to_obj(qc, list_head(&l->qqcl_list)));
+ }
+
+ return (NULL);
+}
+
+void *
+qqcache_next(qqcache_t *qc, void *obj)
+{
+ qqcache_link_t *lp = obj_to_link(qc, obj);
+ qqcache_link_t *next;
+ qqcache_list_t *l = QQCACHE_LIST(qc, lp);
+
+ ASSERT3U(lp->qqln_listnum, <, QQCACHE_NUM_LISTS);
+
+ if ((next = list_next(&l->qqcl_list, lp)) != NULL)
+ return (link_to_obj(qc, next));
+
+ for (size_t i = lp->qqln_listnum + 1; i < QQCACHE_NUM_LISTS; i++) {
+ l = &qc->qqc_lists[i];
+ if (l->qqcl_len > 0)
+ return (link_to_obj(qc, list_head(&l->qqcl_list)));
+ }
+
+ return (NULL);
+}
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 24fdd94c11..eaf06f476c 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -28,7 +28,7 @@
# Copyright 2017 Nexenta Systems, Inc.
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
# Copyright 2019 Peter Tribble.
-# Copyright 2015, Joyent, Inc. All rights reserved.
+# Copyright 2018 Joyent, Inc.
#
include $(SRC)/uts/Makefile.uts
@@ -491,6 +491,8 @@ CHKHDRS= \
ptem.h \
ptms.h \
ptyvar.h \
+ qqcache.h \
+ qqcache_impl.h \
raidioctl.h \
ramdisk.h \
random.h \
diff --git a/usr/src/uts/common/sys/ethernet.h b/usr/src/uts/common/sys/ethernet.h
index 5b9de2f2bf..f19912bfc3 100644
--- a/usr/src/uts/common/sys/ethernet.h
+++ b/usr/src/uts/common/sys/ethernet.h
@@ -23,6 +23,8 @@
*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018, Joyent, Inc.
*/
/*
@@ -139,6 +141,18 @@ struct ether_vlan_extinfo {
#define ether_copy(a, b) (bcopy((caddr_t)a, (caddr_t)b, 6))
#endif
+/*
+ * Ethernet is-zero check
+ */
+#if defined(__sparc) || defined(__i386) || defined(__amd64)
+#define ether_is_zero(a) \
+ (((short *)a)[0] == 0 && ((short *)a)[1] == 0 && ((short *)a)[2] == 0)
+#else
+#define ether_is_zero(a) (((uint8_t *)a)[0] == 0 && ((uint8_t *)a)[1] == 0 && \
+ ((uint8_t *)a)[2] == 0 && ((uint8_t *)a)[3] == 0 && \
+ ((uint8_t *)a)[4] == 0 && ((uint8_t *)a)[5] == 0)
+#endif
+
#ifdef _KERNEL
extern int localetheraddr(struct ether_addr *, struct ether_addr *);
extern char *ether_sprintf(struct ether_addr *);
diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h
index 12d0dbca51..90f1843282 100644
--- a/usr/src/uts/common/sys/overlay.h
+++ b/usr/src/uts/common/sys/overlay.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2018, Joyent, Inc.
*/
#ifndef _SYS_OVERLAY_H
@@ -40,7 +40,7 @@ extern "C" {
typedef struct overlay_ioc_create {
datalink_id_t oic_linkid;
- uint32_t oic_filler;
+ uint32_t oic_dcid;
uint64_t oic_vnetid;
char oic_encap[MAXLINKNAMELEN];
} overlay_ioc_create_t;
diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h
index d638096006..de682a0397 100644
--- a/usr/src/uts/common/sys/overlay_common.h
+++ b/usr/src/uts/common/sys/overlay_common.h
@@ -42,7 +42,8 @@ typedef enum overlay_prop_type {
OVERLAY_PROP_T_INT = 0x1, /* signed int */
OVERLAY_PROP_T_UINT, /* unsigned int */
OVERLAY_PROP_T_IP, /* sinaddr6 */
- OVERLAY_PROP_T_STRING /* OVERLAY_PROPS_SIZEMAX */
+ OVERLAY_PROP_T_STRING, /* OVERLAY_PROPS_SIZEMAX */
+ OVERLAY_PROP_T_ETHER /* 6-byte MAC address */
} overlay_prop_type_t;
typedef enum overlay_prop_prot {
diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h
index 7fb8b8da1d..28e80d6d58 100644
--- a/usr/src/uts/common/sys/overlay_impl.h
+++ b/usr/src/uts/common/sys/overlay_impl.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_OVERLAY_IMPL_H
@@ -29,9 +29,9 @@
#include <sys/avl.h>
#include <sys/ksocket.h>
#include <sys/socket.h>
-#include <sys/refhash.h>
#include <sys/ethernet.h>
#include <sys/list.h>
+#include <sys/qqcache.h>
#ifdef __cplusplus
extern "C" {
@@ -59,7 +59,7 @@ typedef struct overlay_mux {
int omux_domain; /* RO: socket domain */
int omux_family; /* RO: socket family */
int omux_protocol; /* RO: socket protocol */
- struct sockaddr *omux_addr; /* RO: socket address */
+ struct sockaddr *omux_addr; /* RO: socket address */
socklen_t omux_alen; /* RO: sockaddr len */
kmutex_t omux_lock; /* Protects everything below */
uint_t omux_count; /* Active instances */
@@ -81,8 +81,10 @@ typedef struct overlay_target {
union { /* ott_lock */
overlay_target_point_t ott_point;
struct overlay_target_dyn {
- refhash_t *ott_dhash;
+ qqcache_t *ott_dhash;
+ qqcache_t *ott_l3dhash;
avl_tree_t ott_tree;
+ avl_tree_t ott_l3tree;
} ott_dyn;
} ott_u;
} overlay_target_t;
@@ -117,6 +119,12 @@ typedef struct overlay_dev {
uint64_t odd_vid; /* RO if active else odd_lock */
avl_node_t odd_muxnode; /* managed by mux */
overlay_target_t *odd_target; /* See big theory statement */
+ uint32_t odd_dcid; /* RO if active else odd_lock */
+ uint32_t odd_vl2sz; /* protected by odd_lock */
+ uint32_t odd_vl2a; /* protected by odd_lock */
+ uint32_t odd_routesz; /* protected by odd_lock */
+ uint32_t odd_routea; /* protected by odd_lock */
+ uint8_t odd_macaddr[ETHERADDRL]; /* RO same as odd_dcid */
char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */
} overlay_dev_t;
@@ -124,25 +132,50 @@ typedef enum overlay_target_entry_flags {
OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */
OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */
OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */
- OVERLAY_ENTRY_F_VALID_MASK = 0x06
+ OVERLAY_ENTRY_F_ROUTER = 0x08, /* VL2 router entry */
+ OVERLAY_ENTRY_F_HAS_ROUTE = 0x10,
+ OVERLAY_ENTRY_F_VALID_MASK = 0x1e,
+ OVERLAY_ENTRY_F_VL3 = 0x20, /* Is VL3 entry */
} overlay_target_entry_flags_t;
-typedef struct overlay_target_entry {
+struct overlay_target_entry;
+typedef struct overlay_target_entry overlay_target_entry_t;
+
+/*
+ * For VL3 target entries, if we need to lock both the VL3 entry and the
+ * (possibly shared with multiple VL3 entries) VL2 entry, we must always
+ * take the VL3 lock prior to the VL2 entry lock.
+ */
+typedef struct overlay_target_vl3 {
+ struct in6_addr otvl3_src;
+ struct in6_addr otvl3_dst;
+ uint16_t otvl3_src_vlan;
+ overlay_target_mac_t otvl3_vl2;
+} overlay_target_vl3_t;
+
+typedef struct overlay_target_vl2 {
+ overlay_target_route_t otvl2_route;
+ overlay_target_mac_t otvl2_mac;
+ overlay_target_point_t otvl2_dest;
+} overlay_target_vl2_t;
+
+struct overlay_target_entry {
kmutex_t ote_lock;
- refhash_link_t ote_reflink; /* hashtable link */
+ qqcache_link_t ote_reflink; /* hashtable link */
avl_node_t ote_avllink; /* iteration link */
list_node_t ote_qlink;
overlay_target_entry_flags_t ote_flags; /* RW: state flags */
- uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */
overlay_target_t *ote_ott; /* RO */
overlay_dev_t *ote_odd; /* RO */
- overlay_target_point_t ote_dest; /* RW: destination */
mblk_t *ote_chead; /* RW: blocked mb chain head */
mblk_t *ote_ctail; /* RW: blocked mb chain tail */
size_t ote_mbsize; /* RW: outstanding mblk size */
hrtime_t ote_vtime; /* RW: valid timestamp */
-} overlay_target_entry_t;
-
+ union {
+ overlay_target_vl2_t ote_vl2;
+ overlay_target_vl3_t ote_vl3;
+ } ote_u;
+};
#define OVERLAY_CTL "overlay"
@@ -186,7 +219,7 @@ extern void overlay_target_free(overlay_dev_t *);
#define OVERLAY_TARGET_DROP 1
#define OVERLAY_TARGET_ASYNC 2
extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *,
- socklen_t *);
+ socklen_t *, uint64_t *);
extern void overlay_target_quiesce(overlay_target_t *);
extern void overlay_target_fini(void);
diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h
index ae92ef3532..28c0559acb 100644
--- a/usr/src/uts/common/sys/overlay_target.h
+++ b/usr/src/uts/common/sys/overlay_target.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright (c) 2015 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
*/
#ifndef _OVERLAY_TARGET_H
@@ -29,11 +29,43 @@
extern "C" {
#endif
+/*
+ * The overlay_target_point_t structure represents the destination where
+ * encapsulated frames are sent. Currently supported virtualization protocols
+ * (i.e. vxlan) only use otp_ip and otp_port, but other methods might use
+ * a L2 address instead of an L3 address to represent a destination.
+ */
typedef struct overlay_target_point {
- uint8_t otp_mac[ETHERADDRL];
struct in6_addr otp_ip;
uint16_t otp_port;
-} overlay_target_point_t;
+ uint8_t otp_mac[ETHERADDRL];
+} overlay_target_point_t __aligned(8);
+
+/*
+ * An overlay_target_mac_t represents the overlay representation of a VL2 MAC
+ * address. With the advent of cross-DC routing, it is possible to have
+ * duplicate MAC addresses in different data centers, so the data center id
+ * is necessary to uniquely identify a MAC address.
+ *
+ * XXX: In hindsight, using a uint16_t for the DCID might have been nicer.
+ */
+typedef struct overlay_target_mac {
+ uint32_t otm_dcid;
+ uint8_t otm_mac[ETHERADDRL];
+} overlay_target_mac_t;
+
+/*
+ * The overlay_target_route_t represents the fields of the packet that
+ * have to be modified to deliver a packet to remote (routed) destinations.
+ * All three values are always populated when a packet is routed, even if
+ * some of the overlay_target_route_t values end up being the same as the
+ * original values in the packet being routed.
+ */
+typedef struct overlay_target_route {
+ uint64_t otr_vnet;
+ uint8_t otr_srcmac[ETHERADDRL];
+ uint16_t otr_vlan;
+} overlay_target_route_t;
#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8))
@@ -52,6 +84,7 @@ typedef struct overlay_targ_info {
uint32_t oti_needs;
uint64_t oti_flags;
uint64_t oti_vnetid;
+ uint32_t oti_dcid;
} overlay_targ_info_t;
/*
@@ -134,7 +167,7 @@ typedef struct overlay_targ_id {
*
* This ioctl can be used to copy data from a given request into a
* user buffer. This can be used in combination with
- * OVERLAY_TARG_INJECT to implemnt services such as a proxy-arp.
+ * OVERLAY_TARG_INJECT to implement services such as a proxy-arp.
*
*
* OVERLAY_TARG_RESEND - overlay_targ_pkt_t
@@ -152,6 +185,18 @@ typedef struct overlay_targ_id {
#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14)
#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15)
+typedef struct overlay_targ_l2 {
+ uint8_t otl2_srcaddr[ETHERADDRL];
+ uint8_t otl2_dstaddr[ETHERADDRL];
+ uint32_t otl2_dsttype;
+ uint32_t otl2_sap;
+} overlay_targ_l2_t;
+
+typedef struct overlay_targ_l3 {
+ struct in6_addr otl3_srcip;
+ struct in6_addr otl3_dstip;
+} overlay_targ_l3_t;
+
typedef struct overlay_targ_lookup {
uint64_t otl_dlid;
uint64_t otl_reqid;
@@ -159,16 +204,20 @@ typedef struct overlay_targ_lookup {
uint64_t otl_vnetid;
uint64_t otl_hdrsize;
uint64_t otl_pktsize;
- uint8_t otl_srcaddr[ETHERADDRL];
- uint8_t otl_dstaddr[ETHERADDRL];
- uint32_t otl_dsttype;
- uint32_t otl_sap;
+ union {
+ overlay_targ_l2_t otlu_l2;
+ overlay_targ_l3_t otlu_l3;
+ } otl_addru;
int32_t otl_vlan;
+ boolean_t otl_l3req;
} overlay_targ_lookup_t;
+
typedef struct overlay_targ_resp {
- uint64_t otr_reqid;
- overlay_target_point_t otr_answer;
+ uint64_t otr_reqid;
+ overlay_target_route_t otr_route; /* Ignored for VL2->UL3 requests */
+ overlay_target_mac_t otr_mac; /* Ignored for VL2->UL3 requests */
+ overlay_target_point_t otr_answer;
} overlay_targ_resp_t;
typedef struct overlay_targ_pkt {
@@ -255,6 +304,7 @@ typedef struct overlay_targ_list {
#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32)
#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33)
#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34)
+#define OVERLAY_TARG_CACHE_REMOVE_NET (OVERLAY_TARG_IOCTL | 0x35)
/*
* This is a pretty arbitrary number that we're constraining ourselves to
@@ -265,22 +315,36 @@ typedef struct overlay_targ_list {
#define OVERLAY_TARGET_ITER_MAX 500
#define OVERLAY_TARGET_CACHE_DROP 0x01
+#define OVERLAY_TARGET_CACHE_ROUTER 0x02
typedef struct overlay_targ_cache_entry {
- uint8_t otce_mac[ETHERADDRL];
+ overlay_target_mac_t otce_mac;
uint16_t otce_flags;
overlay_target_point_t otce_dest;
} overlay_targ_cache_entry_t;
+typedef struct overlay_targ_cache_net_entry {
+ struct in6_addr otcne_src;
+ struct in6_addr otcne_dst;
+ uint16_t otcne_vlan; /* src vlan */
+ uint8_t otcne_src_prefixlen;
+ uint8_t otcne_dst_prefixlen;
+} overlay_targ_cache_net_entry_t;
+
typedef struct overlay_targ_cache {
datalink_id_t otc_linkid;
overlay_targ_cache_entry_t otc_entry;
} overlay_targ_cache_t;
+typedef struct overlay_targ_cache_net {
+ datalink_id_t otcn_linkid;
+ overlay_targ_cache_net_entry_t otcn_entry;
+} overlay_targ_cache_net_t;
+
typedef struct overlay_targ_cache_iter {
datalink_id_t otci_linkid;
uint32_t otci_pad;
- uint64_t otci_marker;
+ uint64_t otci_marker[2];
uint16_t otci_count;
uint8_t otci_pad2[3];
overlay_targ_cache_entry_t otci_ents[];
diff --git a/usr/src/uts/common/sys/qqcache.h b/usr/src/uts/common/sys/qqcache.h
new file mode 100644
index 0000000000..a2244338dd
--- /dev/null
+++ b/usr/src/uts/common/sys/qqcache.h
@@ -0,0 +1,176 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+#ifndef _QQCACHE_H
+#define _QQCACHE_H
+
+#include <sys/list.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This implements a fixed-size hash table that uses the 2Q algorithm
+ * from Johnson and Shasha to manage the contents of the entries.
+ *
+ * Briefly, there are two fixed-size lists (0 and 1). New entries are
+ * added to the head of list 1, and upon subsequent access (lookup), are
+ * moved to the head of list 0. Entries that fall off the end of list 0
+ * are pushed onto the head of list 1, and entries that fall off the end
+ * of list 1 are deleted. The percentage of the total size of the cache
+ * for each list is determined by the parameter 'a', which is a percentage
+ * (0-100) of the cache size that is dedicated to list 0.
+ *
+ * This implementation does generalize this algorithm somewhat to an
+ * arbitrary number of lists (instead of just 2) via the QQCACHE_NUM_LISTS
+ * and QQCACHE_INSERT_LIST preprocessor symbols (defined in
+ * sys/qqcache_impl.h). New entries are added to list QQCACHE_INSERT_LIST
+ * and as each list gets full, the oldest entry in each list is pushed to
+ * the head of the succeeding list, and the oldest entries on the final
+ * list are removed from the cache (so each list never has more entries
+ * than its maximum size).
+ *
+ * The API itself is very similar to that of refhash. A qqcache_link_t struct
+ * is embedded within the definition of the entries that are being stored in
+ * a given qqcache_t. Functions to hash and compare the tag (key) value of
+ * an entry, as well as to destroy an entry, are supplied when the cache is
+ * created. Lookups then occur by passing a pointer to the key value being
+ * looked up. (A usage sketch follows the declarations below.)
+ *
+ * NOTE: As one can take references to entries in the cache via the
+ * qqcache_hold() function, refheld entries that are marked for deletion are
+ * not counted when tracking the cache size, and their dtor function is not
+ * called until the last reference has been released (by calling the
+ * qqcache_rele() function).
+ */
+
+typedef enum qqcache_flag {
+ QQCACHE_F_DEAD = 0x01,
+} qqcache_flag_t;
+
+typedef struct qqcache_link {
+ list_node_t qqln_hash_link; /* Hash chain bucket */
+ list_node_t qqln_list_link; /* Cache list link */
+ uint_t qqln_listnum;
+ uint_t qqln_refcnt;
+ qqcache_flag_t qqln_flags;
+} qqcache_link_t;
+
+struct qqcache;
+typedef struct qqcache qqcache_t;
+
+typedef uint64_t (*qqcache_hash_fn_t)(const void *);
+typedef int (*qqcache_cmp_fn_t)(const void *, const void *);
+typedef void (*qqcache_dtor_fn_t)(void *);
+
+/*
+ * qqcache_create(qcp, sz, a, buckets, hash_fn, cmp_fn, dtor_fn,
+ * elsize, link_off, tag_off, flags);
+ *
+ * Creates a new 2Q cache:
+ *
+ * qqcache_t **qcp A pointer to the pointer that will hold the new
+ * cache.
+ *
+ * size_t sz The size of the cache (in entries).
+ *
+ * size_t a The percentage (0-100) of the cache dedicated to
+ *				MRU entries (list 0).
+ *
+ * size_t buckets The number of hash buckets in the cache.
+ *
+ * qqcache_hash_fn_t hash_fn The function used to create a
+ * hash value for a given entry's tag
+ * value.
+ *
+ * qqcache_cmp_fn_t cmp_fn	The function used to compare the tag
+ *				values of two entries. The function
+ *				should return '0' if the two entries
+ *				are equal, '1' if they are not equal.
+ *
+ * qqcache_dtor_fn_t dtor_fn The function used to destroy/free
+ * entries.
+ *
+ * size_t elsize The size of each entry.
+ *
+ * size_t link_off The offset of the qqcache_link_t struct in the entry.
+ *
+ * size_t tag_off The offset in the entry of the tag value (used for
+ * hashing and comparison).
+ *
+ * int flags The flags passed to kmem_zalloc/umem_zalloc.
+ *
+ * Returns:
+ * 0 Success
+ * EINVAL A parameter was not valid
+ * ENOMEM The memory allocation failed (only possible when
+ * KM_NOSLEEP/UMEM_DEFAULT is passed to flags).
+ */
+extern int qqcache_create(qqcache_t **, size_t, size_t, size_t,
+ qqcache_hash_fn_t, qqcache_cmp_fn_t, qqcache_dtor_fn_t,
+ size_t, size_t, size_t, int);
+
+/* Destroy the given qqcache_t */
+extern void qqcache_destroy(qqcache_t *);
+
+/*
+ * qqcache_insert(qc, obj)
+ *
+ * qqcache_t *qc The cache to insert the item into.
+ *
+ * void *obj The object to add.
+ *
+ * Returns:
+ * 0 Success
+ * EEXIST The same entry (as determined by the cache cmp function) already
+ * exists in the cache.
+ */
+extern int qqcache_insert(qqcache_t *, void *);
+
+/* Lookup an entry with the given tag/key, or return NULL if not found */
+extern void *qqcache_lookup(qqcache_t *, const void *);
+
+/* Remove the given entry from the cache */
+extern void qqcache_remove(qqcache_t *, void *);
+
+/* Add a hold on the entry in the cache */
+extern void qqcache_hold(qqcache_t *, void *);
+
+/* Release the hold on the entry in the cache */
+extern void qqcache_rele(qqcache_t *, void *);
+
+/*
+ * Adjust the total size of the cache, or the percentage of the cache
+ * dedicated to list 0, respectively. If the new values are smaller than the
+ * current ones, entries may be evicted as necessary to keep the cache within
+ * the new limits.
+ */
+extern int qqcache_adjust_size(qqcache_t *, size_t);
+extern int qqcache_adjust_a(qqcache_t *, size_t);
+
+/* Return the current values of size or a. */
+extern size_t qqcache_size(const qqcache_t *);
+extern size_t qqcache_a(const qqcache_t *);
+
+/* Iterate through entries. */
+extern void *qqcache_first(qqcache_t *);
+extern void *qqcache_next(qqcache_t *, void *);
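
As referenced above, a minimal consumer sketch (hypothetical names, error
handling elided) showing the embedded qqcache_link_t, the tag offsets, and
the basic lifecycle:

	typedef struct host_ent {
		uint32_t	he_ip;		/* tag (key) value */
		char		he_name[64];
		qqcache_link_t	he_link;	/* embedded cache linkage */
	} host_ent_t;

	static uint64_t
	host_hash(const void *tp)
	{
		return (*(const uint32_t *)tp);
	}

	static int
	host_cmp(const void *a, const void *b)
	{
		return (*(const uint32_t *)a == *(const uint32_t *)b ? 0 : 1);
	}

	static void
	host_dtor(void *op)
	{
		kmem_free(op, sizeof (host_ent_t));
	}

	static void
	host_cache_example(void)
	{
		qqcache_t *qc;
		host_ent_t *he, *found;
		uint32_t ip = 0xc0a80101;	/* 192.168.1.1 */

		/* 128 entries total, 25% on list 0, 16 hash buckets */
		VERIFY0(qqcache_create(&qc, 128, 25, 16, host_hash,
		    host_cmp, host_dtor, sizeof (host_ent_t),
		    offsetof(host_ent_t, he_link),
		    offsetof(host_ent_t, he_ip), KM_SLEEP));

		he = kmem_zalloc(sizeof (*he), KM_SLEEP);
		he->he_ip = ip;
		VERIFY0(qqcache_insert(qc, he));

		if ((found = qqcache_lookup(qc, &ip)) != NULL) {
			qqcache_hold(qc, found);
			/* ... the hold keeps 'found' alive past eviction */
			qqcache_rele(qc, found);
		}

		qqcache_destroy(qc);	/* host_dtor runs on what remains */
	}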
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _QQCACHE_H */
diff --git a/usr/src/uts/common/sys/qqcache_impl.h b/usr/src/uts/common/sys/qqcache_impl.h
new file mode 100644
index 0000000000..f709b74d6c
--- /dev/null
+++ b/usr/src/uts/common/sys/qqcache_impl.h
@@ -0,0 +1,72 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+#ifndef _QQCACHE_IMPL_H
+#define _QQCACHE_IMPL_H
+
+#include <sys/debug.h>
+#include <sys/qqcache.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QQCACHE_NUM_LISTS 2
+#define QQCACHE_INSERT_LIST 1
+#define QQCACHE_MIN_SIZE 10
+
+CTASSERT(QQCACHE_INSERT_LIST < QQCACHE_NUM_LISTS);
+CTASSERT(QQCACHE_NUM_LISTS >= 2);
+
+typedef struct qqcache_list {
+ list_t qqcl_list;
+ size_t qqcl_len;
+} qqcache_list_t;
+
+struct qqcache {
+ qqcache_hash_fn_t qqc_hash_fn;
+ qqcache_cmp_fn_t qqc_cmp_fn;
+ qqcache_dtor_fn_t qqc_dtor_fn;
+ size_t qqc_link_off;
+ size_t qqc_tag_off;
+ size_t qqc_nbuckets;
+ size_t qqc_size;
+ size_t qqc_a;
+ size_t qqc_max[QQCACHE_NUM_LISTS];
+ qqcache_list_t qqc_lists[QQCACHE_NUM_LISTS];
+ qqcache_list_t qqc_buckets[];
+};
+
+#define QQCACHE_LIST(qqc, lnk) \
+ (&(qqc)->qqc_lists[(lnk)->qqln_listnum])
+
+#ifdef lint
+extern qqcache_link_t *obj_to_link(qqcache_t *, void *);
+extern void *link_to_obj(qqcache_t *, qqcache_link_t *);
+extern void *obj_to_tag(qqcache_t *, void *);
+#else
+#define obj_to_link(_q, _o) \
+ ((qqcache_link_t *)(((char *)(_o)) + (_q)->qqc_link_off))
+#define link_to_obj(_q, _l) \
+ ((void *)(((char *)(_l)) - (_q)->qqc_link_off))
+#define obj_to_tag(_q, _o) \
+ ((void *)(((char *)(_o)) + (_q)->qqc_tag_off))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _QQCACHE_IMPL_H */