Diffstat (limited to 'usr/src/uts/common/io')
-rw-r--r--  usr/src/uts/common/io/overlay/overlay.c         452
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_mux.c      15
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_target.c 1290
3 files changed, 1572 insertions, 185 deletions
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c
index 2ad3f4f591..c54a6e0d9c 100644
--- a/usr/src/uts/common/io/overlay/overlay.c
+++ b/usr/src/uts/common/io/overlay/overlay.c
@@ -134,6 +134,40 @@
* be sent to. In addition, they handle questions related to how to handle
* things like broadcast and multicast traffic, etc.
*
+ * ROUTING
+ *
+ * Supporting routing of packets between VLANs that exist on an overlay
+ * network requires solving two problems. First, packets destined for
+ * off-VLAN destinations need to be identified. Second, we must obtain the
+ * additional information necessary to deliver the packet to its off-VLAN
+ * destination.
+ *
+ * To solve the first problem, we utilize the existing IP routing
+ * functionality. Off-VLAN destinations are given routes with next hops in
+ * the originating netstack's routing table--just like in physical networks.
+ * The system will then attempt to generate an ARP query, which is sent out
+ * to varpd in the exact same manner as described above for on-VLAN
+ * destinations. The response includes a MAC address that is used for the
+ * ARP reply and is also added to our VL2 MAC->UL3 hash table, with the
+ * OVERLAY_ENTRY_F_ROUTER flag set. Once this is done, the originating
+ * netstack will send off-VLAN packets to this router MAC, allowing the
+ * overlay device to identify these packets as requiring routing.
+ *
+ * Once packets with an off-VLAN destination are identified, we must determine
+ * what the destination vid, VL2 VLAN, and VL2 MAC values are for the given
+ * packet. For reasons similar to the VL2 MAC->UL3 lookup described above,
+ * we utilize the flexibility of userland to perform these lookups (also
+ * using varpd). In this instance we are attempting to map a destination VL3
+ * IP to a UL3 IP (a few extra bits of information are necessary to
+ * disambiguate the destination VL3 IP in situations such as mirroring a
+ * production environment, including its VL3 IPs, in an isolated set of
+ * VLANs). We then store these results in a VL3->UL3 hash table for future
+ * lookups.
+ *
+ * To prevent the size of both the VL2->UL3 and VL3->UL3 hash tables from
+ * growing without bound, we cap the number of entries in each hash table and
+ * utilize the 2Q algorithm (via qqcache_t) to manage their contents.
+ *
* ----------
* Properties
* ----------
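
As an illustration of the two-stage lookup described in the ROUTING comment above, here is a minimal standalone userland sketch. The structs, table layouts, and single-entry "caches" are invented for the example; the real lookups go through qqcache_t-backed tables and queue to varpd on a miss.

#include <stdio.h>
#include <string.h>

#define	F_ROUTER	0x1	/* stands in for OVERLAY_ENTRY_F_ROUTER */

struct vl2_entry {
	unsigned char	mac[6];
	int		flags;
	const char	*ul3;		/* UL3 destination, as text */
};

struct vl3_entry {			/* key: src VL3, dst VL3, src vlan */
	const char	*src, *dst;
	unsigned short	vlan;
	const char	*ul3;
};

/* One-entry "caches" keep the sketch self-contained. */
static struct vl2_entry vl2tbl =
    { { 0x02, 0x08, 0x20, 0xaa, 0xbb, 0xcc }, F_ROUTER, NULL };
static struct vl3_entry vl3tbl =
    { "10.1.0.5", "10.2.0.9", 123, "192.168.50.7:4789" };

static const char *
tx_lookup(const unsigned char *dmac, const char *src, const char *dst,
    unsigned short vlan)
{
	/* First lookup: destination VL2 MAC -> UL3. */
	if (memcmp(dmac, vl2tbl.mac, 6) != 0)
		return (NULL);		/* miss: would queue to varpd */
	if (!(vl2tbl.flags & F_ROUTER))
		return (vl2tbl.ul3);	/* ordinary on-VLAN destination */

	/* Router MAC: second lookup, keyed on (src VL3, dst VL3, vlan). */
	if (strcmp(src, vl3tbl.src) == 0 && strcmp(dst, vl3tbl.dst) == 0 &&
	    vlan == vl3tbl.vlan)
		return (vl3tbl.ul3);
	return (NULL);			/* miss: would queue to varpd */
}

int
main(void)
{
	unsigned char dmac[6] = { 0x02, 0x08, 0x20, 0xaa, 0xbb, 0xcc };

	puts(tx_lookup(dmac, "10.1.0.5", "10.2.0.9", 123));
	return (0);
}
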
@@ -205,6 +239,10 @@
* UTF-8. Note that the size of the string includes the null
* terminator.
*
+ * OVERLAY_PROP_T_ETHER
+ *
+ * An ether_addr_t, which has a fixed size.
+ *
* The next thing that we apply to a property is its permission. The permissions
* are put together by the bitwise or of the following flags and values.
*
@@ -461,7 +499,7 @@
* On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
* are much more interesting and as a result, more complicated. We primarily
* store lists of overlay_target_entry_t's which are stored in both an avl tree
- * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
+ * and a qqcache_t. The primary lookup path uses the qqcache_t and the avl tree
* is only used for a few of the target ioctls used to dump data such that we
* can get a consistent iteration order for things like dladm show-overlay -t.
* The key that we use for the reference hashtable is based on the mac address
@@ -486,6 +524,28 @@
* any outstanding data to that place. For the full story on how we look that up
* will be discussed in the section on the Target Cache Lifecycle.
*
+ * For routing, everything works largely the same as it does in the
+ * non-routing case. The major differences are that the target cache is
+ * always an OVERLAY_TARGET_DYNAMIC cache, and that an additional hash table
+ * lookup occurs. When a routed packet is sent down stack, the
+ * overlay_target_entry_t in the VL2 cache will have its
+ * OVERLAY_ENTRY_F_ROUTER flag set, which will prompt a lookup in the VL3->UL3
+ * cache (using the source VL3, source VL2 VLAN, and destination VL3 values
+ * from the packet as the lookup key). The entry returned from the cache is
+ * used to modify the source and destination VL2 MAC addresses as well as
+ * the VL2 VLAN ID; the packet is then encapsulated and sent to its UL3
+ * destination. On reception, decapsulation happens exactly the same as in
+ * the non-routed case, and the packet appears just as if it had been sent
+ * out the VL2 network from a router connected to it. This is done both to
+ * maintain the illusion of a physical network when sniffing packets at the
+ * instance level, and so that the mac layer sitting above the destination's
+ * overlay device (for the vnic created over the overlay) does not discard
+ * the packet because its VLAN tag does not match the VLAN tag of the
+ * destination VNIC. While some of these modifications could be split between
+ * the source and destination hosts, doing all of the work on the source
+ * maximizes any potential parallelism from multiple flows to a given
+ * destination.
+ *
* ------------------------
* FMA and Degraded Devices
* ------------------------
@@ -830,21 +890,62 @@ typedef enum overlay_dev_prop {
OVERLAY_DEV_P_MTU = 0,
OVERLAY_DEV_P_VNETID,
OVERLAY_DEV_P_ENCAP,
- OVERLAY_DEV_P_VARPDID
+ OVERLAY_DEV_P_VARPDID,
+ OVERLAY_DEV_P_DCID,
+ OVERLAY_DEV_P_VL2_CACHE_SIZE,
+ OVERLAY_DEV_P_VL2_CACHE_A,
+ OVERLAY_DEV_P_ROUTE_CACHE_SIZE,
+ OVERLAY_DEV_P_ROUTE_CACHE_A
} overlay_dev_prop_t;
-#define OVERLAY_DEV_NPROPS 4
+#define OVERLAY_DEV_NPROPS 9
static const char *overlay_dev_props[] = {
"mtu",
"vnetid",
"encap",
- "varpd/id"
+ "varpd/id",
+ "dcid",
+ "vl2_cache_size",
+ "_vl2_cache_a",
+ "route_cache_size",
+ "_route_cache_a"
};
+/* properties that can be changed live */
+static boolean_t overlay_dev_liveprop[] = {
+ B_FALSE, /* mtu */
+ B_FALSE, /* vnetid */
+ B_FALSE, /* encap */
+ B_FALSE, /* varpd/id */
+ B_FALSE, /* dcid */
+ B_TRUE, /* vl2_cache_size */
+ B_TRUE, /* _vl2_cache_a */
+ B_TRUE, /* route_cache_size */
+ B_TRUE /* _route_cache_a */
+};
+
+CTASSERT(ARRAY_SIZE(overlay_dev_props) == OVERLAY_DEV_NPROPS);
+CTASSERT(ARRAY_SIZE(overlay_dev_liveprop) == OVERLAY_DEV_NPROPS);
+
#define OVERLAY_MTU_MIN 576
#define OVERLAY_MTU_DEF 1400
#define OVERLAY_MTU_MAX 8900
+/* The 2Q parameter 'a' is a percentage */
+#define OVERLAY_CACHE_MAX_A 100
+/* A somewhat arbitrary default, biased towards storing more MFU entries */
+#define OVERLAY_CACHE_A_DEF 75
+
+/* Somewhat arbitrary min and max values */
+#define OVERLAY_VL2_CACHE_MIN 256
+#define OVERLAY_VL2_CACHE_MAX 10240
+#define OVERLAY_VL2_CACHE_DEF OVERLAY_VL2_CACHE_MIN
+
+/* Somewhat arbitrary min and max values */
+#define OVERLAY_ROUTE_CACHE_MIN 256
+#define OVERLAY_ROUTE_CACHE_MAX 10240
+#define OVERLAY_ROUTE_CACHE_DEF OVERLAY_ROUTE_CACHE_MIN
+
overlay_dev_t *
overlay_hold_by_dlid(datalink_id_t id)
{
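
To illustrate the 2Q parameter 'a' defined above: a standalone sketch, assuming 'a' simply partitions a cache of a given size into an MFU portion of a% and an MRU remainder (the precise partitioning is internal to qqcache).

#include <stdio.h>

int
main(void)
{
	unsigned size = 256;	/* OVERLAY_VL2_CACHE_DEF */
	unsigned a = 75;	/* OVERLAY_CACHE_A_DEF, a percentage */
	unsigned mfu = size * a / 100;

	/* Prints: MFU slots: 192, MRU slots: 64 */
	printf("MFU slots: %u, MRU slots: %u\n", mfu, size - mfu);
	return (0);
}
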
@@ -1066,7 +1167,6 @@ overlay_m_tx(void *arg, mblk_t *mp_chain)
bzero(&hdr, sizeof (struct msghdr));
bzero(&einfo, sizeof (ovep_encap_info_t));
- einfo.ovdi_id = odd->odd_vid;
mp = mp_chain;
while (mp != NULL) {
socklen_t slen;
@@ -1077,7 +1177,7 @@ overlay_m_tx(void *arg, mblk_t *mp_chain)
ep = NULL;
ret = overlay_target_lookup(odd, mp,
- (struct sockaddr *)&storage, &slen);
+ (struct sockaddr *)&storage, &slen, &einfo.ovdi_id);
if (ret != OVERLAY_TARGET_OK) {
if (ret == OVERLAY_TARGET_DROP)
freemsg(mp);
@@ -1260,6 +1360,19 @@ overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
}
odd->odd_vid = oicp->oic_vnetid;
+ if (oicp->oic_dcid > UINT32_MAX) {
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EINVAL);
+ }
+ odd->odd_dcid = oicp->oic_dcid;
+
+ odd->odd_vl2sz = OVERLAY_VL2_CACHE_DEF;
+ odd->odd_vl2a = OVERLAY_CACHE_A_DEF;
+ odd->odd_routesz = OVERLAY_ROUTE_CACHE_DEF;
+ odd->odd_routea = OVERLAY_CACHE_A_DEF;
+
mac = mac_alloc(MAC_VERSION);
if (mac == NULL) {
mutex_exit(&overlay_dev_lock);
@@ -1613,6 +1726,7 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
int ret;
mac_perim_handle_t mph;
uint_t propid = UINT_MAX;
+ uint32_t def;
overlay_ioc_propinfo_t *oip = karg;
overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
@@ -1695,6 +1809,42 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
overlay_prop_set_nodefault(phdl);
break;
+ case OVERLAY_DEV_P_DCID:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_nodefault(phdl);
+ overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ def = OVERLAY_VL2_CACHE_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, OVERLAY_VL2_CACHE_MIN,
+ OVERLAY_VL2_CACHE_MAX);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ def = OVERLAY_CACHE_A_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+ def = OVERLAY_ROUTE_CACHE_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, OVERLAY_ROUTE_CACHE_MIN,
+ OVERLAY_ROUTE_CACHE_MAX);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ def = OVERLAY_CACHE_A_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A);
+ break;
default:
overlay_hold_rele(odd);
mac_perim_exit(mph);
@@ -1804,6 +1954,41 @@ overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
}
mutex_exit(&odd->odd_lock);
break;
+ case OVERLAY_DEV_P_DCID:
+ /*
+ * While it's read-only while inside of a mux, we're not in a
+ * context that can guarantee that. Therefore we always grab the
+ * overlay_dev_t's odd_lock.
+ */
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_vl2sz, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_vl2a, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_routesz, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_routea, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
default:
ret = ENOENT;
}
@@ -1845,6 +2030,146 @@ overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
mutex_exit(&odd->odd_lock);
}
+static void
+overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid)
+{
+ mutex_enter(&odd->odd_lock);
+
+ /* Simple case, not active */
+ if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ odd->odd_dcid = dcid;
+ mutex_exit(&odd->odd_lock);
+ return;
+ }
+
+ /*
+ * In the hard case, we need to set the drop flag, quiesce I/O and then
+ * we can go ahead and do everything.
+ */
+ odd->odd_flags |= OVERLAY_F_MDDROP;
+ overlay_io_wait(odd, OVERLAY_F_IOMASK);
+ mutex_exit(&odd->odd_lock);
+
+ overlay_mux_remove_dev(odd->odd_mux, odd);
+ mutex_enter(&odd->odd_lock);
+ odd->odd_dcid = dcid;
+ mutex_exit(&odd->odd_lock);
+ overlay_mux_add_dev(odd->odd_mux, odd);
+
+ mutex_enter(&odd->odd_lock);
+ ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+	odd->odd_flags &= ~OVERLAY_F_MDDROP;
+ mutex_exit(&odd->odd_lock);
+}
+
+static int
+overlay_setprop_vl2_cachesz(overlay_dev_t *odd, uint32_t sz)
+{
+ overlay_target_t *ott = NULL;
+ int ret = 0;
+
+ if (sz == 0)
+ sz = OVERLAY_VL2_CACHE_DEF;
+
+ /* Caller should have validated this */
+ ASSERT3U(sz, >=, OVERLAY_VL2_CACHE_MIN);
+ ASSERT3U(sz, <=, OVERLAY_VL2_CACHE_MAX);
+
+ mutex_enter(&odd->odd_lock);
+ ott = odd->odd_target;
+
+ /* ott_mode is RO if the target exists */
+ if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ mutex_enter(&ott->ott_lock);
+ ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_dhash, sz);
+ mutex_exit(&ott->ott_lock);
+ }
+
+ if (ret == 0)
+ odd->odd_vl2sz = sz;
+ mutex_exit(&odd->odd_lock);
+
+ return (ret);
+}
+
+static int
+overlay_setprop_vl2_cachea(overlay_dev_t *odd, uint32_t a)
+{
+ overlay_target_t *ott = NULL;
+ int ret = 0;
+
+ /* Caller should have validated this */
+ ASSERT3U(a, <=, 100);
+
+ mutex_enter(&odd->odd_lock);
+ ott = odd->odd_target;
+
+ /* ott_mode is RO if the target exists */
+ if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ mutex_enter(&ott->ott_lock);
+ ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_dhash, a);
+ mutex_exit(&ott->ott_lock);
+ }
+ if (ret == 0)
+ odd->odd_vl2a = a;
+ mutex_exit(&odd->odd_lock);
+
+ return (ret);
+}
+
+static int
+overlay_setprop_route_cachesz(overlay_dev_t *odd, uint32_t sz)
+{
+ overlay_target_t *ott = NULL;
+ int ret = 0;
+
+ if (sz == 0)
+ sz = OVERLAY_ROUTE_CACHE_DEF;
+
+ ASSERT3U(sz, >=, OVERLAY_ROUTE_CACHE_MIN);
+ ASSERT3U(sz, <=, OVERLAY_ROUTE_CACHE_MAX);
+
+ mutex_enter(&odd->odd_lock);
+ ott = odd->odd_target;
+
+ /* ott_mode is RO if the target exists */
+ if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ mutex_enter(&ott->ott_lock);
+ ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_l3dhash, sz);
+ mutex_exit(&ott->ott_lock);
+ }
+ if (ret == 0)
+ odd->odd_routesz = sz;
+ mutex_exit(&odd->odd_lock);
+
+ return (ret);
+}
+
+static int
+overlay_setprop_route_cachea(overlay_dev_t *odd, uint32_t a)
+{
+ overlay_target_t *ott = NULL;
+ int ret = 0;
+
+ /* Caller should have validated this */
+ ASSERT3U(a, <=, 100);
+
+ mutex_enter(&odd->odd_lock);
+ ott = odd->odd_target;
+
+ /* ott_mode is RO if the target exists */
+ if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ mutex_enter(&ott->ott_lock);
+ ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_l3dhash, a);
+ mutex_exit(&ott->ott_lock);
+ }
+ if (ret == 0)
+ odd->odd_routea = a;
+ mutex_exit(&odd->odd_lock);
+
+ return (ret);
+}
+
/* ARGSUSED */
static int
overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
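
overlay_setprop_dcid() above uses a quiesce pattern: set a drop flag under the lock, wait for in-flight I/O to drain, apply the change, then clear the flag. A standalone sketch of the same shape using pthreads; overlay_io_wait() is modeled with a condition variable and all names here are invented.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int inflight;	/* I/O paths would bump this and signal cv on exit */
static int dropping;	/* models OVERLAY_F_MDDROP */
static unsigned dcid = 1;

static void
set_dcid(unsigned newdcid)
{
	pthread_mutex_lock(&lock);
	dropping = 1;			/* new I/O now bounces */
	while (inflight > 0)		/* overlay_io_wait() analogue */
		pthread_cond_wait(&cv, &lock);
	dcid = newdcid;			/* safe: the device is quiesced */
	dropping = 0;
	pthread_mutex_unlock(&lock);
}

int
main(void)
{
	set_dcid(42);
	printf("dcid now %u\n", dcid);
	return (0);
}
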
@@ -1855,7 +2180,7 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
overlay_ioc_prop_t *oip = karg;
uint_t propid = UINT_MAX;
mac_perim_handle_t mph;
- uint64_t maxid, *vidp;
+ uint64_t maxid, *vidp, *dcidp, *vl2szp, *vl2ap, *routeszp, *routeap;
if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
return (EINVAL);
@@ -1865,31 +2190,48 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
return (ENOENT);
oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
- mac_perim_enter_by_mh(odd->odd_mh, &mph);
- mutex_enter(&odd->odd_lock);
- if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
- mac_perim_exit(mph);
- mutex_exit(&odd->odd_lock);
- return (ENOTSUP);
- }
- mutex_exit(&odd->odd_lock);
+
+ /*
+ * Currently, only certain overlay properties (and no encapsulation
+ * properties) can be changed while the overlay device is active.
+ */
if (oip->oip_id == -1) {
int i;
for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
break;
- if (i == OVERLAY_DEV_NPROPS) {
- ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
- odd->odd_pvoid, oip->oip_name,
- oip->oip_value, oip->oip_size);
- overlay_hold_rele(odd);
- mac_perim_exit(mph);
- return (ret);
- }
}
- propid = i;
+ if (i < OVERLAY_DEV_NPROPS)
+ propid = i;
+ } else if (oip->oip_id < OVERLAY_DEV_NPROPS) {
+ propid = oip->oip_id;
+ }
+
+ /*
+	 * A bit tricky, but propid is initialized to UINT_MAX, so we know we
+ * have an overlay property whenever propid < OVERLAY_DEV_NPROPS,
+ * otherwise we have a plugin property.
+ */
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_ACTIVATED) &&
+ ((propid >= OVERLAY_DEV_NPROPS || !overlay_dev_liveprop[propid]))) {
+ mutex_exit(&odd->odd_lock);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_exit(&odd->odd_lock);
+
+ if (oip->oip_id == -1 && propid >= OVERLAY_DEV_NPROPS) {
+ ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+ odd->odd_pvoid, oip->oip_name,
+ oip->oip_value, oip->oip_size);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ret);
} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
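
A standalone sketch of the name-to-propid resolution and live-property gate implemented above. The tables mirror overlay_dev_props/overlay_dev_liveprop; everything else is invented for the example.

#include <stdio.h>
#include <string.h>
#include <errno.h>

static const char *props[] = { "mtu", "vnetid", "encap", "varpd/id", "dcid",
    "vl2_cache_size", "_vl2_cache_a", "route_cache_size", "_route_cache_a" };
static const int liveprop[] = { 0, 0, 0, 0, 0, 1, 1, 1, 1 };
#define	NPROPS	(sizeof (props) / sizeof (props[0]))

static int
setprop_check(const char *name, int activated)
{
	unsigned i;

	for (i = 0; i < NPROPS; i++) {
		if (strcmp(props[i], name) == 0)
			break;
	}
	if (i == NPROPS)
		return (ENOENT);	/* would fall through to the plugin */
	if (activated && !liveprop[i])
		return (ENOTSUP);	/* not changeable while active */
	return (0);
}

int
main(void)
{
	printf("mtu while active -> %d\n", setprop_check("mtu", 1));
	printf("vl2_cache_size while active -> %d\n",
	    setprop_check("vl2_cache_size", 1));
	return (0);
}
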
@@ -1941,6 +2283,68 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
case OVERLAY_DEV_P_VARPDID:
ret = EPERM;
break;
+ case OVERLAY_DEV_P_DCID:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ dcidp = (uint64_t *)oip->oip_value;
+ if (*dcidp > UINT32_MAX) {
+ ret = EINVAL;
+ break;
+ }
+ overlay_setprop_dcid(odd, *dcidp);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ vl2szp = (uint64_t *)oip->oip_value;
+ if (*vl2szp != 0 && (*vl2szp < OVERLAY_VL2_CACHE_MIN ||
+ *vl2szp > OVERLAY_VL2_CACHE_MAX)) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_vl2_cachesz(odd, *vl2szp);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ vl2ap = (uint64_t *)oip->oip_value;
+ if (*vl2ap > OVERLAY_CACHE_MAX_A) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_vl2_cachea(odd, *vl2ap);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ routeszp = (uint64_t *)oip->oip_value;
+ if (*routeszp != 0 && (*routeszp < OVERLAY_ROUTE_CACHE_MIN ||
+		    *routeszp > OVERLAY_ROUTE_CACHE_MAX)) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_route_cachesz(odd, *routeszp);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ routeap = (uint64_t *)oip->oip_value;
+ if (*routeap > OVERLAY_CACHE_MAX_A) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_route_cachea(odd, *routeap);
+ break;
default:
ret = ENOENT;
}
diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c
index 0c21bb8689..2688d2791c 100644
--- a/usr/src/uts/common/io/overlay/overlay_mux.c
+++ b/usr/src/uts/common/io/overlay/overlay_mux.c
@@ -127,6 +127,18 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
freeb(fmp);
/*
+	 * Looped-back vxlan tends to arrive with a prepended
+	 * IP+UDP-only mblk, followed by the data. Parsing will have
+	 * made that first mblk zero-length (rptr == wptr).
+ */
+ if (mp->b_rptr == mp->b_wptr && mp->b_cont != NULL) {
+ /* Ended up with zero-length mblk, lose it! */
+ fmp = mp;
+ mp = fmp->b_cont;
+ freeb(fmp);
+ }
+
+ /*
* Until we have VXLAN-or-other-decap HW acceleration support
* (e.g. we support NICs that reach into VXLAN-encapsulated
* packets and check the inside-VXLAN IP packets' checksums,
@@ -161,10 +173,9 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
if (rem == blkl) {
fmp = mp;
mp = fmp->b_cont;
- fmp->b_cont = NULL;
OVERLAY_FREEMSG(mp,
"freed a fmp block");
- freemsg(fmp);
+ freeb(fmp);
}
}
if (mp == NULL) {
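
The zero-length head-mblk strip added to overlay_mux_recv() above can be modeled in userland; here mblk_t is reduced to a toy struct and freeb() to free().

#include <stdio.h>
#include <stdlib.h>

typedef struct blk {
	char		*rptr, *wptr;
	struct blk	*cont;
} blk_t;

static blk_t *
strip_empty_head(blk_t *mp)
{
	if (mp->rptr == mp->wptr && mp->cont != NULL) {
		blk_t *fmp = mp;

		mp = fmp->cont;
		free(fmp);	/* freeb() analogue: free the head only */
	}
	return (mp);
}

int
main(void)
{
	char payload[] = "inner frame";
	blk_t data = { payload, payload + sizeof (payload), NULL };
	blk_t *head = malloc(sizeof (blk_t));

	head->rptr = head->wptr = NULL;	/* fully-consumed IP+UDP mblk */
	head->cont = &data;

	printf("%s\n", strip_empty_head(head)->rptr);
	return (0);
}
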
diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c
index f4147b56d1..171acd034f 100644
--- a/usr/src/uts/common/io/overlay/overlay_target.c
+++ b/usr/src/uts/common/io/overlay/overlay_target.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -20,6 +20,8 @@
* uts/common/io/overlay/overlay.c
*/
+#include <inet/ip.h>
+#include <inet/ip6.h>
#include <sys/types.h>
#include <sys/ethernet.h>
#include <sys/kmem.h>
@@ -42,6 +44,9 @@
#include <sys/overlay_impl.h>
#include <sys/sdt.h>
+#define OVERLAY_DROP(mp, reason) \
+ DTRACE_PROBE2(overlay__drop, mblk_t *, mp, char *, reason)
+
/*
* This is total straw man, but at least it's a prime number. Here we're
* going to have to go through and do a lot of evaluation and understanding as
@@ -52,6 +57,19 @@
#define OVERLAY_HSIZE 823
/*
+ * The default size of each target cache. This is also a complete strawman
+ * whose value could change as we gain better operational experience with
+ * overlay routing.
+ */
+#define OVERLAY_CACHE_SIZE 512
+
+/*
+ * A somewhat arbitrary value. The percentage of the target cache dedicated
+ * to MFU entries (i.e. entries that have been looked up more than once).
+ */
+#define OVERLAY_CACHE_A 60
+
+/*
* We use this data structure to keep track of what requests have been actively
* allocated to a given instance so we know what to put back on the pending
* list.
@@ -69,7 +87,7 @@ typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
-typedef struct overaly_target_ioctl {
+typedef struct overlay_target_ioctl {
int oti_cmd; /* ioctl id */
boolean_t oti_write; /* ioctl requires FWRITE */
boolean_t oti_ncopyout; /* copyout data? */
@@ -144,25 +162,60 @@ overlay_entry_cache_destructor(void *buf, void *arg)
static uint64_t
overlay_mac_hash(const void *v)
{
+ const overlay_target_mac_t *m = v;
+
uint32_t crc;
- CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
+ CRC32(crc, m->otm_mac, ETHERADDRL, -1U, crc32_table);
+ CRC32(crc, &m->otm_dcid, sizeof (uint32_t), crc, crc32_table);
return (crc);
}
static int
overlay_mac_cmp(const void *a, const void *b)
{
- return (bcmp(a, b, ETHERADDRL));
+ const overlay_target_mac_t *l = a;
+ const overlay_target_mac_t *r = b;
+
+ if (l->otm_dcid != r->otm_dcid)
+ return (1);
+ return (bcmp(l->otm_mac, r->otm_mac, ETHERADDRL) != 0);
+}
+
+static uint64_t
+overlay_ip_hash(const void *v)
+{
+ const overlay_target_vl3_t *vl3 = v;
+
+ uint32_t crc;
+ CRC32(crc, &vl3->otvl3_src, sizeof (vl3->otvl3_src), -1U, crc32_table);
+ CRC32(crc, &vl3->otvl3_dst, sizeof (vl3->otvl3_dst), crc, crc32_table);
+ CRC32(crc, &vl3->otvl3_src_vlan, sizeof (vl3->otvl3_src_vlan), crc,
+ crc32_table);
+ return (crc);
+}
+
+static int
+overlay_ip_cmp(const void *a, const void *b)
+{
+ const overlay_target_vl3_t *l = a;
+ const overlay_target_vl3_t *r = b;
+
+ if (l->otvl3_src_vlan != r->otvl3_src_vlan)
+ return (1);
+ if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_src, &r->otvl3_src))
+ return (1);
+ if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_dst, &r->otvl3_dst))
+ return (1);
+ return (0);
}
-/* ARGSUSED */
static void
overlay_target_entry_dtor(void *arg)
{
overlay_target_entry_t *ote = arg;
ote->ote_flags = 0;
- bzero(ote->ote_addr, ETHERADDRL);
+ bzero(&ote->ote_u, sizeof (ote->ote_u));
ote->ote_ott = NULL;
ote->ote_odd = NULL;
freemsgchain(ote->ote_chead);
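
overlay_mac_hash() above chains the CRC across the MAC and then the DCID so that both halves of the composite key select the hash bucket. A standalone sketch of the same chaining, substituting zlib's crc32() for the kernel CRC32() macro (zlib's seed conditioning differs from the kernel's -1U initializer, so the values differ, but the chaining is the point); build with -lz.

#include <stdio.h>
#include <stdint.h>
#include <zlib.h>

static uint32_t
mac_dcid_hash(const unsigned char mac[6], uint32_t dcid)
{
	uLong crc = crc32(0L, Z_NULL, 0);

	crc = crc32(crc, mac, 6);	/* hash the VL2 MAC... */
	crc = crc32(crc, (const unsigned char *)&dcid, sizeof (dcid));
	return ((uint32_t)crc);		/* ...then fold in the DCID */
}

int
main(void)
{
	unsigned char mac[6] = { 0x02, 0x08, 0x20, 0x01, 0x02, 0x03 };

	/* Same MAC, different DCIDs: distinct hashes. */
	printf("%08x vs %08x\n", mac_dcid_hash(mac, 1), mac_dcid_hash(mac, 2));
	return (0);
}
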
@@ -172,21 +225,76 @@ overlay_target_entry_dtor(void *arg)
kmem_cache_free(overlay_entry_cache, ote);
}
+static void
+overlay_target_entry_l2qq_dtor(void *arg)
+{
+ overlay_target_entry_t *ote = arg;
+ overlay_target_t *ott = ote->ote_ott;
+
+ ASSERT(MUTEX_HELD(&ott->ott_lock));
+ ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==, 0);
+
+ avl_remove(&ott->ott_u.ott_dyn.ott_tree, ote);
+ overlay_target_entry_dtor(ote);
+}
+
+static void
+overlay_target_entry_l3qq_dtor(void *arg)
+{
+ overlay_target_entry_t *ote = arg;
+ overlay_target_t *ott = ote->ote_ott;
+
+ ASSERT(MUTEX_HELD(&ott->ott_lock));
+ ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==,
+ OVERLAY_ENTRY_F_VL3);
+
+ avl_remove(&ott->ott_u.ott_dyn.ott_l3tree, ote);
+ overlay_target_entry_dtor(ote);
+}
+
static int
overlay_mac_avl(const void *a, const void *b)
{
+ const overlay_target_entry_t *le = a;
+ const overlay_target_entry_t *re = b;
+ const overlay_target_mac_t *lm = &le->ote_u.ote_vl2.otvl2_mac;
+ const overlay_target_mac_t *rm = &re->ote_u.ote_vl2.otvl2_mac;
int i;
- const overlay_target_entry_t *l, *r;
- l = a;
- r = b;
+
+ /* Order by DCID, then MAC */
+ if (lm->otm_dcid < rm->otm_dcid)
+ return (-1);
+ if (lm->otm_dcid > rm->otm_dcid)
+ return (1);
for (i = 0; i < ETHERADDRL; i++) {
- if (l->ote_addr[i] > r->ote_addr[i])
+ if (lm->otm_mac[i] > rm->otm_mac[i])
return (1);
- else if (l->ote_addr[i] < r->ote_addr[i])
+ else if (lm->otm_mac[i] < rm->otm_mac[i])
return (-1);
}
+ return (0);
+}
+static int
+overlay_ip_avl(const void *a, const void *b)
+{
+ const overlay_target_entry_t *l = a;
+ const overlay_target_entry_t *r = b;
+ const overlay_target_vl3_t *l_vl3 = &l->ote_u.ote_vl3;
+ const overlay_target_vl3_t *r_vl3 = &r->ote_u.ote_vl3;
+ int ret;
+
+ if ((ret = memcmp(&l_vl3->otvl3_src, &r_vl3->otvl3_src,
+ sizeof (l_vl3->otvl3_src))) != 0)
+ return (ret < 0 ? -1 : 1);
+ if ((ret = memcmp(&l_vl3->otvl3_dst, &r_vl3->otvl3_dst,
+ sizeof (l_vl3->otvl3_dst))) != 0)
+ return (ret < 0 ? -1 : 1);
+ if (l_vl3->otvl3_src_vlan < r_vl3->otvl3_src_vlan)
+ return (-1);
+ if (l_vl3->otvl3_src_vlan > r_vl3->otvl3_src_vlan)
+ return (1);
return (0);
}
@@ -233,25 +341,20 @@ overlay_target_free(overlay_dev_t *odd)
return;
if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
- refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
- avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
- overlay_target_entry_t *ote;
-
+ mutex_enter(&odd->odd_target->ott_lock);
/*
- * Our AVL tree and hashtable contain the same elements,
- * therefore we should just remove it from the tree, but then
- * delete the entries when we remove them from the hash table
- * (which happens through the refhash dtor).
+		 * Each AVL tree and its corresponding 2Q cache contain the
+		 * same elements. When an entry is removed from a 2Q cache,
+		 * its dtor removes it from the corresponding AVL tree.
+		 * Destroying a 2Q cache destroys any remaining entries, so
+		 * all we need to do is destroy the two 2Q caches.
*/
- while ((ote = avl_first(ap)) != NULL)
- avl_remove(ap, ote);
-
- avl_destroy(ap);
- for (ote = refhash_first(rp); ote != NULL;
- ote = refhash_next(rp, ote)) {
- refhash_remove(rp, ote);
- }
- refhash_destroy(rp);
+ qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_dhash);
+ qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_l3dhash);
+ ASSERT(avl_is_empty(&odd->odd_target->ott_u.ott_dyn.ott_tree));
+ ASSERT(avl_is_empty(
+ &odd->odd_target->ott_u.ott_dyn.ott_l3tree));
+ mutex_exit(&odd->odd_target->ott_lock);
}
ASSERT(odd->odd_target->ott_ocount == 0);
@@ -270,18 +373,42 @@ overlay_target_busy()
return (ret);
}
+/*
+ * Queue the target entry on the list of varpd requests. entry should be
+ * refheld for the duration of this call (this call takes its own additional
+ * hold that is released when we receive a response).
+ */
static void
overlay_target_queue(overlay_target_entry_t *entry)
{
+ overlay_target_t *ott = entry->ote_ott;
+ boolean_t is_vl3 = B_FALSE;
+
+ /*
+ * ote_ott is read-only and set at entry creation, so it can be
+ * read without ote_lock held
+ */
+ ASSERT(!MUTEX_HELD(&entry->ote_lock));
+
+ mutex_enter(&entry->ote_lock);
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
+ mutex_exit(&entry->ote_lock);
+
mutex_enter(&overlay_target_lock);
- mutex_enter(&entry->ote_ott->ott_lock);
- if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ if (ott->ott_flags & OVERLAY_T_TEARDOWN) {
+ mutex_exit(&ott->ott_lock);
mutex_exit(&overlay_target_lock);
return;
}
- entry->ote_ott->ott_ocount++;
- mutex_exit(&entry->ote_ott->ott_lock);
+ ott->ott_ocount++;
+ if (is_vl3)
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ mutex_exit(&ott->ott_lock);
list_insert_tail(&overlay_target_list, entry);
cv_signal(&overlay_target_condvar);
mutex_exit(&overlay_target_lock);
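
overlay_target_queue() above pairs with the OVERLAY_ENTRY_F_PENDING flag (see overlay_target_try_queue() later in this diff): packets accumulate on the entry's chain, but the entry goes onto the varpd request list only on the first queued packet, so a single outstanding lookup covers the whole backlog. A toy model:

#include <stdio.h>

static int pending;	/* models OVERLAY_ENTRY_F_PENDING */
static int backlog;	/* models the ote_chead/ote_ctail chain length */

static int
try_queue(void)
{
	int queue = 0;

	backlog++;		/* the packet joins the entry's chain */
	if (!pending) {
		pending = 1;	/* first packet kicks off a varpd lookup */
		queue = 1;
	}
	return (queue);
}

int
main(void)
{
	int first = try_queue();
	int second = try_queue();

	/* Prints: first queues: 1, second queues: 0, backlog: 2 */
	printf("first queues: %d, second queues: %d, backlog: %d\n",
	    first, second, backlog);
	return (0);
}
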
@@ -300,22 +427,446 @@ overlay_target_quiesce(overlay_target_t *ott)
}
/*
- * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
+ * Write the VL3 src/dst IP from the packet in mp into src and dst. If the
+ * addresses are IPv4 addresses, they are written as mapped addresses.
+ */
+static int
+overlay_get_vl3_ips(mblk_t *mp, struct in6_addr *src, struct in6_addr *dst)
+{
+ uint16_t sap;
+
+#if 1
+ /* Temporary until mblk helpers are integrated */
+ struct ether_vlan_header *eth = (struct ether_vlan_header *)mp->b_rptr;
+ ipha_t *iphp = (ipha_t *)(eth + 1);
+ ip6_t *ip6hp = (ip6_t *)(eth + 1);
+ size_t mlen = MBLKL(mp);
+
+ if (mlen < sizeof (struct ether_vlan_header))
+ return (EINVAL);
+ mlen -= sizeof (struct ether_vlan_header);
+
+ /* We currently don't support routing on untagged vlans */
+ if ((sap = ntohs(eth->ether_tpid)) != ETHERTYPE_VLAN)
+ return (EINVAL);
+
+ sap = ntohs(eth->ether_type);
+ if (mlen == 0) {
+ if ((mp = mp->b_cont) == NULL)
+ return (EINVAL);
+ mlen = MBLKL(mp);
+ iphp = (ipha_t *)mp->b_rptr;
+ ip6hp = (ip6_t *)mp->b_rptr;
+ }
+
+ switch (sap) {
+ case ETHERTYPE_IP:
+ if (mlen < sizeof (ipha_t))
+ return (EINVAL);
+ ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV4_VERSION);
+ IN6_IPADDR_TO_V4MAPPED(iphp->ipha_src, src);
+ IN6_IPADDR_TO_V4MAPPED(iphp->ipha_dst, dst);
+ break;
+ case ETHERTYPE_IPV6:
+ if (mlen < sizeof (ip6_t))
+ return (EINVAL);
+ ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV6_VERSION);
+ bcopy(&ip6hp->ip6_src, src, sizeof (*src));
+ bcopy(&ip6hp->ip6_dst, dst, sizeof (*dst));
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+#else
+ size_t soff, doff;
+ uint32_t v4s, v4d;
+ int i;
+
+ if (!mblk_read_uint16(mp, offsetof(struct ether_header, ether_type),
+ &sap))
+ return (EINVAL);
+
+ if (sap == ETHERTYPE_VLAN) {
+ if (!mblk_read_uint16(mp,
+ offsetof(struct ether_vlan_header, ether_type), &sap))
+ return (EINVAL);
+ soff = doff = sizeof (struct ether_vlan_header);
+ } else {
+ soff = doff = sizeof (struct ether_header);
+ }
+
+ switch (sap) {
+ case ETHERTYPE_IP:
+ soff += offsetof(ipha_t, ipha_src);
+ doff += offsetof(ipha_t, ipha_dst);
+
+ if (!mblk_read_uint32(mp, soff, &v4s) ||
+ !mblk_read_uint32(mp, doff, &v4d))
+ return (EINVAL);
+		IN6_IPADDR_TO_V4MAPPED(v4s, src);
+		IN6_IPADDR_TO_V4MAPPED(v4d, dst);
+ break;
+ case ETHERTYPE_IPV6:
+ soff += offsetof(ip6_t, ip6_src);
+		doff += offsetof(ip6_t, ip6_dst);
+
+ for (i = 0; i < 4; i++) {
+ if (!mblk_read_uint32(mp, soff, &src->s6_addr32[i]) ||
+ !mblk_read_uint32(mp, doff, &dst->s6_addr32[i]))
+ return (EINVAL);
+ soff += sizeof (uint32_t);
+ doff += sizeof (uint32_t);
+ }
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+#endif
+}
+
+static int
+overlay_route(overlay_dev_t *odd, mblk_t *mp,
+ const overlay_target_route_t *route, const overlay_target_mac_t *dst_mac)
+{
+ uint16_t tci;
+
+ if (MBLKL(mp) >= sizeof (struct ether_vlan_header)) {
+ struct ether_vlan_header *evh;
+
+ evh = (struct ether_vlan_header *)mp->b_rptr;
+ tci = ntohs(evh->ether_tci);
+
+ /*
+ * Today we require all encapsulated frames to be vlan tagged.
+ * If this is relaxed in the future, we will need to allow for
+ * insertion and removal of the vlan tag as appropriate here.
+ */
+ if (ntohs(evh->ether_tpid) != ETHERTYPE_VLAN) {
+ OVERLAY_DROP(mp, "not vlan tagged");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ tci &= ~(VLAN_ID_MASK);
+ tci |= route->otr_vlan;
+ evh->ether_tci = htons(tci);
+ bcopy(dst_mac->otm_mac, &evh->ether_dhost, ETHERADDRL);
+ bcopy(route->otr_srcmac, &evh->ether_shost, ETHERADDRL);
+ return (OVERLAY_TARGET_OK);
+ }
+
+#if 1
+ /* Temporary until mblk helpers are integrated */
+ OVERLAY_DROP(mp, "ethernet header split between mblks");
+ return (OVERLAY_TARGET_DROP);
+#else
+ size_t off;
+
+	off = offsetof(struct ether_vlan_header, ether_tci);
+	if (!mblk_read_uint16(mp, off, &tci)) {
+		OVERLAY_DROP(mp, "cannot read tci");
+		return (OVERLAY_TARGET_DROP);
+	}
+
+	/* mblk_read_uint16 is assumed to return the TCI in host byte order */
+ tci &= ~(VLAN_ID_MASK);
+ tci |= route->otr_vlan;
+
+ if (!mblk_write_uint16(mp, off, tci)) {
+ OVERLAY_DROP(mp, "cannot set routed destination vlan");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ for (int i = 0; i < ETHERADDRL; i++) {
+		if (!mblk_write_uint8(mp, i, dst_mac->otm_mac[i]) ||
+ !mblk_write_uint8(mp, i + ETHERADDRL,
+ route->otr_srcmac[i])) {
+ OVERLAY_DROP(mp, "cannot set routed macs");
+ return (OVERLAY_TARGET_DROP);
+ }
+ }
+
+ return (OVERLAY_TARGET_OK);
+#endif
+}
+
+/*
+ * Attempt to add mp to the packet queue of the target entry. If the queue
+ * is already full, OVERLAY_TARGET_DROP is returned; otherwise
+ * OVERLAY_TARGET_ASYNC is returned. If the entry isn't already pending a
+ * response from varpd, the target entry is queued on the list of
+ * outstanding varpd requests.
+ *
+ * The entry should already be locked; since this is intended to be the
+ * final step in dealing with this entry (for the packet in question), this
+ * function always releases ote_lock before returning. The entry should be
+ * refheld for the duration of this call.
+ */
+static int
+overlay_target_try_queue(overlay_target_entry_t *entry, mblk_t *mp)
+{
+ size_t mlen = msgsize(mp);
+ boolean_t queue = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&entry->ote_lock));
+
+ if (mlen + entry->ote_mbsize > overlay_ent_size) {
+ OVERLAY_DROP(mp, "target queue full");
+ mutex_exit(&entry->ote_lock);
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ if (entry->ote_ctail != NULL) {
+ ASSERT(entry->ote_ctail->b_next == NULL);
+ entry->ote_ctail->b_next = mp;
+ entry->ote_ctail = mp;
+ } else {
+ entry->ote_chead = mp;
+ entry->ote_ctail = mp;
+ }
+ entry->ote_mbsize += mlen;
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0) {
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ queue = B_TRUE;
+ }
+ mutex_exit(&entry->ote_lock);
+
+ if (queue)
+ overlay_target_queue(entry);
+
+ return (OVERLAY_TARGET_ASYNC);
+}
+
+/*
+ * Given the VL3 IP->VL2 mac entry (vl3e), and the corresponding VL2 MAC->UL3
+ * entry (vl2e), if both entries are valid, set *vidp, *v6, and *slenp to the
+ * correct UL3 destination and return OVERLAY_TARGET_OK. If either of the
+ * entries is still pending a lookup (or vl2e is NULL because the entry is
+ * missing), mp is queued (if there is space) on the appropriate entry and
+ * OVERLAY_TARGET_ASYNC is returned. If the VL2 entry is flagged to drop all
+ * packets, OVERLAY_TARGET_DROP is returned.
+ *
+ * In all cases, the caller should acquire vl3e->ote_lock prior to calling
+ * overlay_route_lookup_vl2(). Because vl2e can be missing (NULL), the caller
+ * should not acquire vl2e->ote_lock prior to calling
+ * overlay_route_lookup_vl2(). vl3e->ote_lock is always dropped prior to
+ * returning.
+ */
+static int
+overlay_route_lookup_vl2(overlay_target_entry_t *vl3e,
+ overlay_target_entry_t *vl2e, uint64_t *vidp, struct sockaddr_in6 *v6,
+ socklen_t *slenp, mblk_t *mp)
+{
+ overlay_target_vl2_t *vl2p;
+ int ret;
+
+ ASSERT(MUTEX_HELD(&vl3e->ote_lock));
+
+ if (vl2e == NULL) {
+ vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID;
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ mutex_enter(&vl2e->ote_lock);
+ if (vl2e->ote_flags & (OVERLAY_ENTRY_F_DROP | OVERLAY_ENTRY_F_ROUTER)) {
+ overlay_target_entry_flags_t flags = vl2e->ote_flags;
+
+ mutex_exit(&vl2e->ote_lock);
+ mutex_exit(&vl3e->ote_lock);
+
+ if (flags & OVERLAY_ENTRY_F_DROP) {
+ OVERLAY_DROP(mp, "VL2 target marked drop");
+ } else {
+ OVERLAY_DROP(mp, "VL2 target is overlay router");
+ }
+
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ /*
+ * If the route is missing queue on the VL3 entry so a VL3->UL3
+ * lookup is done (to get the route data).
+ */
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_HAS_ROUTE) == 0) {
+ mutex_exit(&vl2e->ote_lock);
+
+ vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID;
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ /*
+ * If the VL2 target point is missing, we try to be a bit (though
+ * hopefully not too) clever. We can always queue on the VL3 entry
+ * which will trigger a VL3->UL3 lookup request (as it is effectively
+ * a superset of the VL2->UL3 lookup). However, if we know we already
+ * have an outstanding VL3->UL3 request, we queue on the VL2 entry and
+ * avoid doing another redundant lookup. We can also queue on the VL2
+ * entry when it is a local (same vnet, same DC) destination -- we
+ * currently cannot generate VL2->UL3 lookups for remote destinations,
+ * only same vnet, same DC. Queueing on the VL2 entry also allows
+	 * instances on the same vlan as the queued VL2 entry to piggyback on
+ * the lookup request and avoid a redundant lookup. However if the
+ * VL2 entry is remote, we have to do a VL3->UL3 lookup.
+ */
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) {
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0 &&
+ vl2e->ote_u.ote_vl2.otvl2_mac.otm_dcid !=
+ vl2e->ote_odd->odd_dcid) {
+ mutex_exit(&vl2e->ote_lock);
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ mutex_exit(&vl3e->ote_lock);
+ /* This drops vl2e->ote_lock */
+ return (overlay_target_try_queue(vl2e, mp));
+ }
+
+ ASSERT(vl2e->ote_flags & OVERLAY_ENTRY_F_VALID);
+
+ vl2p = &vl2e->ote_u.ote_vl2;
+
+ *vidp = vl2p->otvl2_route.otr_vnet;
+ bcopy(&vl2p->otvl2_dest.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(vl2p->otvl2_dest.otp_port);
+ *slenp = sizeof (struct sockaddr_in6);
+
+ ret = overlay_route(vl2e->ote_odd, mp, &vl2p->otvl2_route,
+ &vl2p->otvl2_mac);
+ mutex_exit(&vl2e->ote_lock);
+ mutex_exit(&vl3e->ote_lock);
+ return (ret);
+}
+
+static int
+overlay_route_lookup(overlay_dev_t *odd, mblk_t *mp, uint16_t vlan,
+ struct sockaddr *sock, socklen_t *slenp, uint64_t *vidp)
+{
+ overlay_target_t *ott = odd->odd_target;
+ overlay_target_entry_t *entry, *vl2_entry = NULL;
+ struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)sock;
+ overlay_target_vl3_t vl3 = { 0 };
+ int ret = OVERLAY_TARGET_DROP;
+
+ /* overlay_target_lookup() should have set this */
+ ASSERT3U(v6->sin6_family, ==, AF_INET6);
+
+ /* We should only be called for dynamic endpoints */
+ ASSERT3U(ott->ott_mode, ==, OVERLAY_TARGET_DYNAMIC);
+
+ vl3.otvl3_src_vlan = vlan;
+	if (overlay_get_vl3_ips(mp, &vl3.otvl3_src, &vl3.otvl3_dst)
+	    != 0) {
+ OVERLAY_DROP(mp, "could not read VL3 src/dst IPs");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ mutex_enter(&ott->ott_lock);
+ entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_l3dhash, &vl3);
+ if (entry == NULL) {
+ if ((entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
+ mutex_exit(&ott->ott_lock);
+ OVERLAY_DROP(mp, "failed VL3 target entry allocation");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ bcopy(&vl3, &entry->ote_u.ote_vl3, sizeof (vl3));
+ entry->ote_flags = OVERLAY_ENTRY_F_VL3;
+
+ entry->ote_chead = entry->ote_ctail = mp;
+ entry->ote_mbsize = msgsize(mp);
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+
+ entry->ote_ott = ott;
+ entry->ote_odd = odd;
+
+ qqcache_insert(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_l3tree, entry);
+ mutex_exit(&ott->ott_lock);
+
+ overlay_target_queue(entry);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ mutex_exit(&ott->ott_lock);
+ return (OVERLAY_TARGET_ASYNC);
+ }
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ mutex_enter(&entry->ote_lock);
+
+ /*
+ * A bit ugly, but if we need the VL2 entry, we want to look it up
+ * while we still hold ott_lock.
+ */
+ if ((entry->ote_flags &
+ (OVERLAY_ENTRY_F_DROP|OVERLAY_ENTRY_F_ROUTER|
+ OVERLAY_ENTRY_F_VALID)) == OVERLAY_ENTRY_F_VALID) {
+ vl2_entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &entry->ote_u.ote_vl3.otvl3_vl2);
+ if (vl2_entry != NULL)
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, vl2_entry);
+ }
+ mutex_exit(&ott->ott_lock);
+
+ ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3);
+
+ if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL3 target entry marked drop");
+ ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ /*
+ * XXX: A packet with a dst IP of an overlay router.
+ * Maybe generate an ICMP reply? For now, we drop.
+ */
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL3 target entry is router");
+ ret = OVERLAY_TARGET_DROP;
+ } else if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) {
+ /* This drops entry->ote_lock */
+ ret = overlay_target_try_queue(entry, mp);
+ } else {
+ /* This drops entry->ote_lock */
+ ret = overlay_route_lookup_vl2(entry, vl2_entry, vidp, v6,
+ slenp, mp);
+ }
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ if (vl2_entry != NULL)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, vl2_entry);
+ mutex_exit(&ott->ott_lock);
+ return (ret);
+}
+
+/*
+ * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
* OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
- * this time, say for NVGRE, we drop all packets that mcuh this.
+ * this time, say for NVGRE, we drop all packets that match this.
*/
int
overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
- socklen_t *slenp)
+ socklen_t *slenp, uint64_t *vidp)
{
int ret;
struct sockaddr_in6 *v6;
overlay_target_t *ott;
- mac_header_info_t mhi;
overlay_target_entry_t *entry;
+ mac_header_info_t mhi;
+ overlay_target_mac_t omac;
ASSERT(odd->odd_target != NULL);
+ /* Default to our local vid, routing may change this if necessary */
+ *vidp = odd->odd_vid;
+
/*
* At this point, the overlay device is in a mux which means that it's
* been activated. At this point, parts of the target, such as the mode
@@ -323,8 +874,10 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
* about synchronization for them.
*/
ott = odd->odd_target;
- if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+ if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) {
+ OVERLAY_DROP(mp, "plugin doesn't support IP or port");
return (OVERLAY_TARGET_DROP);
+ }
v6 = (struct sockaddr_in6 *)sock;
bzero(v6, sizeof (struct sockaddr_in6));
@@ -343,76 +896,89 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
- /*
- * Note we only want the MAC address here, therefore we won't bother
- * using mac_vlan_header_info(). If any caller needs the vlan info at
- * this point, this should change to a call to mac_vlan_header_info().
- */
- if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+ if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0) {
+ OVERLAY_DROP(mp, "could not read vlan header");
return (OVERLAY_TARGET_DROP);
+ }
+
+ omac.otm_dcid = odd->odd_dcid;
+ bcopy(mhi.mhi_daddr, omac.otm_mac, ETHERADDRL);
+
mutex_enter(&ott->ott_lock);
- entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- mhi.mhi_daddr);
+ entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, &omac);
if (entry == NULL) {
+ overlay_target_vl2_t *vl2p;
+
entry = kmem_cache_alloc(overlay_entry_cache,
KM_NOSLEEP | KM_NORMALPRI);
if (entry == NULL) {
mutex_exit(&ott->ott_lock);
+ OVERLAY_DROP(mp, "VL2 target entry allocation failed");
return (OVERLAY_TARGET_DROP);
}
- bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
+
+ vl2p = &entry->ote_u.ote_vl2;
+ bcopy(mhi.mhi_daddr, vl2p->otvl2_mac.otm_mac, ETHERADDRL);
+ vl2p->otvl2_mac.otm_dcid = odd->odd_dcid;
+ vl2p->otvl2_route.otr_vnet = odd->odd_vid;
+ vl2p->otvl2_route.otr_vlan = VLAN_ID(mhi.mhi_tci);
+
entry->ote_chead = entry->ote_ctail = mp;
entry->ote_mbsize = msgsize(mp);
entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+
entry->ote_ott = ott;
entry->ote_odd = odd;
- refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
mutex_exit(&ott->ott_lock);
+
overlay_target_queue(entry);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
return (OVERLAY_TARGET_ASYNC);
}
- refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
mutex_exit(&ott->ott_lock);
mutex_enter(&entry->ote_lock);
if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL2 target marked drop");
ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ if (mhi.mhi_bindsap == ETHERTYPE_ARP) {
+ /*
+ * Send unicast ARP requests to varpd for processing.
+ * We will eventually need something similar for IPv6.
+ * This drops entry->ote_lock.
+ */
+ ret = overlay_target_try_queue(entry, mp);
+ } else {
+ mutex_exit(&entry->ote_lock);
+ ret = overlay_route_lookup(odd, mp,
+ VLAN_ID(mhi.mhi_tci), sock, slenp, vidp);
+ }
} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
- bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
- sizeof (struct in6_addr));
- v6->sin6_port = htons(entry->ote_dest.otp_port);
+ overlay_target_point_t *otp = &entry->ote_u.ote_vl2.otvl2_dest;
+
+ bcopy(&otp->otp_ip, &v6->sin6_addr, sizeof (struct in6_addr));
+ v6->sin6_port = htons(otp->otp_port);
+ mutex_exit(&entry->ote_lock);
+
*slenp = sizeof (struct sockaddr_in6);
ret = OVERLAY_TARGET_OK;
} else {
- size_t mlen = msgsize(mp);
-
- if (mlen + entry->ote_mbsize > overlay_ent_size) {
- ret = OVERLAY_TARGET_DROP;
- } else {
- if (entry->ote_ctail != NULL) {
- ASSERT(entry->ote_ctail->b_next ==
- NULL);
- entry->ote_ctail->b_next = mp;
- entry->ote_ctail = mp;
- } else {
- entry->ote_chead = mp;
- entry->ote_ctail = mp;
- }
- entry->ote_mbsize += mlen;
- if ((entry->ote_flags &
- OVERLAY_ENTRY_F_PENDING) == 0) {
- entry->ote_flags |=
- OVERLAY_ENTRY_F_PENDING;
- overlay_target_queue(entry);
- }
- ret = OVERLAY_TARGET_ASYNC;
- }
+ /* This drops entry->ote_lock */
+ ret = overlay_target_try_queue(entry, mp);
}
- mutex_exit(&entry->ote_lock);
mutex_enter(&ott->ott_lock);
- refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
mutex_exit(&ott->ott_lock);
return (ret);
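
Two of the manipulations above reduce to small self-contained transforms: overlay_get_vl3_ips() stores IPv4 addresses as v4-mapped IPv6 (::ffff:a.b.c.d) so a single in6_addr keys both families, and overlay_route() rewrites only the low 12 VLAN-ID bits of the 802.1Q TCI, preserving PCP and DEI. A standalone sketch of both; the helper names are invented.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define	VLAN_ID_MASK	0x0fff

/* Model of IN6_IPADDR_TO_V4MAPPED(): store v4 as ::ffff:a.b.c.d. */
static void
v4_to_mapped(in_addr_t v4, struct in6_addr *v6)
{
	memset(v6, 0, sizeof (*v6));
	v6->s6_addr[10] = 0xff;
	v6->s6_addr[11] = 0xff;
	memcpy(&v6->s6_addr[12], &v4, sizeof (v4));
}

/* Model of the overlay_route() TCI rewrite: keep PCP/DEI, swap VLAN ID. */
static uint16_t
retag(uint16_t tci, uint16_t newvlan)
{
	tci &= ~VLAN_ID_MASK;
	tci |= (newvlan & VLAN_ID_MASK);
	return (tci);
}

int
main(void)
{
	struct in6_addr v6;
	char buf[INET6_ADDRSTRLEN];

	v4_to_mapped(inet_addr("10.1.2.3"), &v6);
	printf("mapped: %s\n", inet_ntop(AF_INET6, &v6, buf, sizeof (buf)));

	/* PCP 5, VLAN 100 retagged to VLAN 200: prints tci: a0c8 */
	printf("tci: %04x\n", retag((5 << 13) | 100, 200));
	return (0);
}
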
@@ -437,6 +1003,7 @@ overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
if (odd->odd_flags & OVERLAY_F_ACTIVATED)
oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
oti->oti_vnetid = odd->odd_vid;
+ oti->oti_dcid = odd->odd_dcid;
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
return (0);
@@ -488,6 +1055,13 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
}
}
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_VARPD) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (EEXIST);
+ }
+
ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
ott->ott_flags = 0;
ott->ott_ocount = 0;
@@ -499,21 +1073,44 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
bcopy(&ota->ota_point, &ott->ott_u.ott_point,
sizeof (overlay_target_point_t));
} else {
- ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
+ int ret;
+
+ ret = qqcache_create(&ott->ott_u.ott_dyn.ott_dhash,
+ odd->odd_vl2sz, odd->odd_vl2a, OVERLAY_HSIZE,
overlay_mac_hash, overlay_mac_cmp,
- overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+ overlay_target_entry_l2qq_dtor,
+ sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_reflink),
- offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+ offsetof(overlay_target_entry_t, ote_u.ote_vl2.otvl2_mac),
+ KM_SLEEP);
+ if (ret != 0) {
+ mutex_exit(&odd->odd_lock);
+ kmem_cache_free(overlay_target_cache, ott);
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
+ ret = qqcache_create(&ott->ott_u.ott_dyn.ott_l3dhash,
+ odd->odd_routesz, odd->odd_routea, OVERLAY_HSIZE,
+ overlay_ip_hash, overlay_ip_cmp,
+ overlay_target_entry_l3qq_dtor,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_reflink),
+ offsetof(overlay_target_entry_t, ote_u.ote_vl3), KM_SLEEP);
+ if (ret != 0) {
+ mutex_exit(&odd->odd_lock);
+			qqcache_destroy(ott->ott_u.ott_dyn.ott_dhash);
+ kmem_cache_free(overlay_target_cache, ott);
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_avllink));
- }
- mutex_enter(&odd->odd_lock);
- if (odd->odd_flags & OVERLAY_F_VARPD) {
- mutex_exit(&odd->odd_lock);
- kmem_cache_free(overlay_target_cache, ott);
- overlay_hold_rele(odd);
- return (EEXIST);
+ avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_ip_avl,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_avllink));
}
odd->odd_flags |= OVERLAY_F_VARPD;
@@ -521,8 +1118,6 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
-
-
return (0);
}
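
The failure paths in overlay_target_associate() above unwind only what has already been created, in reverse order: if creating ott_l3dhash fails, it is ott_dhash (created first) that must be destroyed. The shape of that pattern, standalone:

#include <stdio.h>
#include <stdlib.h>

static int
setup(int fail_second, void **a, void **b)
{
	if ((*a = malloc(32)) == NULL)
		return (-1);
	if (fail_second || (*b = malloc(32)) == NULL) {
		free(*a);	/* unwind only what succeeded: a, not b */
		*a = NULL;
		return (-1);
	}
	return (0);
}

int
main(void)
{
	void *a, *b;

	printf("ret %d\n", setup(1, &a, &b));
	return (0);
}
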
@@ -601,7 +1196,16 @@ again:
entry = list_remove_head(&overlay_target_list);
mutex_exit(&overlay_target_lock);
mutex_enter(&entry->ote_lock);
- if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ /*
+	 * Lookups for router entries may be sent to varpd even when the
+	 * entry is valid. For example, illumos systems will send unicast
+	 * ARP queries to cached entries (including the router mac address).
+	 * To answer those, we need to forward the query on to varpd. IPv6
+	 * will eventually need something similar for ND requests.
+ */
+ if ((entry->ote_flags &
+ (OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_ROUTER)) ==
+ OVERLAY_ENTRY_F_VALID) {
ASSERT(entry->ote_chead == NULL);
mutex_exit(&entry->ote_lock);
goto again;
@@ -637,10 +1241,23 @@ again:
otl->otl_hdrsize = mhi.mhi_hdrsize;
otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
- bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
- bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
- otl->otl_dsttype = mhi.mhi_dsttype;
- otl->otl_sap = mhi.mhi_bindsap;
+ if (entry->ote_flags & OVERLAY_ENTRY_F_VL3) {
+ overlay_targ_l3_t *l3p = &otl->otl_addru.otlu_l3;
+
+ otl->otl_l3req = B_TRUE;
+ bcopy(&entry->ote_u.ote_vl3.otvl3_src, &l3p->otl3_srcip,
+ sizeof (struct in6_addr));
+ bcopy(&entry->ote_u.ote_vl3.otvl3_dst, &l3p->otl3_dstip,
+ sizeof (struct in6_addr));
+ } else {
+ overlay_targ_l2_t *l2p = &otl->otl_addru.otlu_l2;
+
+ otl->otl_l3req = B_FALSE;
+ bcopy(mhi.mhi_daddr, l2p->otl2_dstaddr, ETHERADDRL);
+ bcopy(mhi.mhi_saddr, l2p->otl2_srcaddr, ETHERADDRL);
+ l2p->otl2_dsttype = mhi.mhi_dsttype;
+ l2p->otl2_sap = mhi.mhi_bindsap;
+ }
otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
mutex_exit(&entry->ote_lock);
@@ -651,12 +1268,128 @@ again:
return (0);
}
+static void
+overlay_target_lookup_respond_vl3(const overlay_targ_resp_t *otr,
+ overlay_target_entry_t *entry)
+{
+ overlay_target_entry_t *shared = NULL;
+ overlay_target_entry_t *vl2_entry;
+ overlay_target_t *ott = entry->ote_ott;
+ qqcache_t *mhash = ott->ott_u.ott_dyn.ott_dhash;
+ hrtime_t now = gethrtime();
+
+ ASSERT(MUTEX_HELD(&entry->ote_lock));
+ ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3);
+
+ /*
+ * A cross-{vlan,dc,vnet} packet with a destination VL3 of an overlay
+ * router IP. For now we drop these.
+ */
+ if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ entry->ote_flags |= OVERLAY_ENTRY_F_DROP;
+ return;
+ }
+
+ bcopy(&otr->otr_mac, &entry->ote_u.ote_vl3.otvl3_vl2,
+ sizeof (overlay_target_mac_t));
+
+ mutex_enter(&ott->ott_lock);
+ if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL)
+ qqcache_hold(mhash, shared);
+ mutex_exit(&ott->ott_lock);
+
+ /*
+ * Once we have the VL2 destination, we need to see if we already
+ * have an existing VL2 entry we can reuse. If not, we create a
+ * fully-formed (i.e. valid) VL2 entry that we add to the cache.
+ */
+ if (shared == NULL) {
+ vl2_entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI);
+ if (vl2_entry == NULL) {
+ /*
+ * If we can't allocate a VL2 entry for the VL3
+ * destination, we just give up for now and drain
+ * any queued packets. New packets will retry this
+ * allocation, so if the memory pressure lets up, we
+ * should recover.
+ */
+ freemsgchain(entry->ote_chead);
+ entry->ote_chead = entry->ote_ctail = NULL;
+ return;
+ }
+
+ vl2_entry->ote_ott = ott;
+ vl2_entry->ote_odd = entry->ote_odd;
+
+ bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ bcopy(&otr->otr_mac, &vl2_entry->ote_u.ote_vl2.otvl2_mac,
+ sizeof (overlay_target_mac_t));
+ bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route,
+ sizeof (overlay_target_route_t));
+ vl2_entry->ote_flags =
+ OVERLAY_ENTRY_F_HAS_ROUTE | OVERLAY_ENTRY_F_VALID;
+ vl2_entry->ote_vtime = entry->ote_vtime = now;
+
+ mutex_enter(&ott->ott_lock);
+ if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) {
+ overlay_target_entry_dtor(vl2_entry);
+ kmem_cache_free(overlay_entry_cache, vl2_entry);
+ qqcache_hold(mhash, shared);
+
+ vl2_entry = shared;
+ } else {
+ qqcache_insert(mhash, vl2_entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, vl2_entry);
+ qqcache_hold(mhash, vl2_entry);
+ }
+ mutex_exit(&ott->ott_lock);
+ } else {
+ vl2_entry = shared;
+ }
+
+ mutex_enter(&vl2_entry->ote_lock);
+ if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_HAS_ROUTE)) == 0) {
+ bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route,
+ sizeof (overlay_target_route_t));
+ vl2_entry->ote_flags |= OVERLAY_ENTRY_F_HAS_ROUTE;
+ }
+
+ /*
+ * Update the VL2 entry if it doesn't have a valid destination, hasn't
+ * been marked as dropping all packets, and doesn't have an existing
+ * outstanding request. If a route and VL2 request involving the
+ * same VL2 destination are pending and the route response is processed
+ * prior to the VL2 request, we will continue to queue (on the VL2
+ * entry) until the VL2 response is received, even though we have
+	 * an answer from the route response. Setting the valid flag while
+	 * there are still outstanding requests would cause problems for
+	 * those requests.
+ */
+ if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_PENDING|
+ OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_DROP)) == 0) {
+ bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ vl2_entry->ote_vtime = gethrtime();
+ vl2_entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ }
+ mutex_exit(&vl2_entry->ote_lock);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(mhash, vl2_entry);
+ mutex_exit(&ott->ott_lock);
+}
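
overlay_target_lookup_respond_vl3() above builds a candidate VL2 entry outside ott_lock, then re-checks the cache under the lock and frees its copy if another thread inserted one first. A standalone model of that double-checked insert, with one slot standing in for the cache:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;	/* one-slot stand-in for the VL2 cache */

static void *
find_or_insert(void)
{
	void *mine, *cur;

	pthread_mutex_lock(&lock);
	cur = slot;
	pthread_mutex_unlock(&lock);
	if (cur != NULL)
		return (cur);		/* fast path: reuse existing entry */

	mine = malloc(64);		/* build the entry unlocked */

	pthread_mutex_lock(&lock);
	if ((cur = slot) != NULL)
		free(mine);		/* lost the race; use theirs */
	else
		slot = cur = mine;
	pthread_mutex_unlock(&lock);
	return (cur);
}

int
main(void)
{
	printf("%p == %p\n", find_or_insert(), find_or_insert());
	free(slot);
	return (0);
}
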
+
static int
overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
{
const overlay_targ_resp_t *otr = arg;
+ overlay_target_t *ott;
overlay_target_entry_t *entry;
mblk_t *mp;
+ boolean_t is_vl3 = B_FALSE;
mutex_enter(&thdl->oth_lock);
for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
@@ -673,38 +1406,88 @@ overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&thdl->oth_lock);
mutex_enter(&entry->ote_lock);
- bcopy(&otr->otr_answer, &entry->ote_dest,
- sizeof (overlay_target_point_t));
+ ott = entry->ote_ott;
+
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
+
+ /*
+ * If we ever support a protocol that uses MAC addresses as the UL
+ * destination address, this check should probably include checking
+ * that otp_mac is also all zeros.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) &&
+ otr->otr_answer.otp_port == 0)
+ entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
+
+ if (!is_vl3) {
+ bcopy(&otr->otr_answer, &entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ entry->ote_vtime = gethrtime();
+ } else {
+ overlay_target_lookup_respond_vl3(otr, entry);
+ }
+
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+
mp = entry->ote_chead;
entry->ote_chead = NULL;
entry->ote_ctail = NULL;
entry->ote_mbsize = 0;
- entry->ote_vtime = gethrtime();
mutex_exit(&entry->ote_lock);
/*
- * For now do an in-situ drain.
+	 * For now do an in-situ drain. For VL3 entries, if we re-use
+	 * an existing VL2 entry, it is possible the VL2 lookup is still
+	 * pending (though this should be rare). In such instances, the packets
+ * queued on the VL3 entry will get queued on the VL2 entry until
+ * the VL2 entry is resolved.
*/
mp = overlay_m_tx(entry->ote_odd, mp);
freemsgchain(mp);
- mutex_enter(&entry->ote_ott->ott_lock);
- entry->ote_ott->ott_ocount--;
- cv_signal(&entry->ote_ott->ott_cond);
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ ott->ott_ocount--;
+ if (is_vl3)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ cv_signal(&ott->ott_cond);
+ mutex_exit(&ott->ott_lock);
return (0);
}
+static boolean_t
+overlay_target_for_varpd(overlay_dev_t *odd, mblk_t *mp)
+{
+ mac_header_info_t mhi;
+
+ /* We should have dropped runts prior to ever queueing */
+ VERIFY0(mac_vlan_header_info(odd->odd_mh, mp, &mhi));
+ if (mhi.mhi_bindsap == ETHERTYPE_ARP)
+ return (B_TRUE);
+
+ /* TODO: NDP packets */
+ return (B_FALSE);
+}
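+
+/*
+ * A minimal sketch (deliberately disabled) of how the NDP TODO above might
+ * eventually look. It assumes the IPv6 and ICMPv6 headers are contiguous in
+ * the first mblk and that no extension headers are present; a real
+ * implementation would have to handle both. overlay_target_is_ndp() is a
+ * hypothetical helper, not part of the driver.
+ */
+#if 0
+static boolean_t
+overlay_target_is_ndp(const mac_header_info_t *mhi, const mblk_t *mp)
+{
+	const ip6_t *ip6;
+	const icmp6_t *icmp6;
+
+	if (mhi->mhi_bindsap != ETHERTYPE_IPV6)
+		return (B_FALSE);
+	/* Make sure both headers actually fit in this mblk */
+	if (MBLKL(mp) < mhi->mhi_hdrsize + sizeof (ip6_t) + sizeof (icmp6_t))
+		return (B_FALSE);
+	ip6 = (const ip6_t *)(mp->b_rptr + mhi->mhi_hdrsize);
+	if (ip6->ip6_nxt != IPPROTO_ICMPV6)
+		return (B_FALSE);
+	icmp6 = (const icmp6_t *)(ip6 + 1);
+	return (icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT ||
+	    icmp6->icmp6_type == ND_NEIGHBOR_ADVERT);
+}
+#endif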
+
+typedef enum overlay_target_lookup_drop_act {
+	OTLDA_NONE,	/* Nothing further to do for this entry */
+	OTLDA_QUEUE,	/* More packets remain; re-queue the entry */
+	OTLDA_DELETE	/* VL3 entry is no longer needed; remove it */
+} overlay_target_lookup_drop_act_t;
+
static int
overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
{
const overlay_targ_resp_t *otr = arg;
+ overlay_target_t *ott;
overlay_target_entry_t *entry;
mblk_t *mp;
- boolean_t queue = B_FALSE;
+ overlay_target_lookup_drop_act_t action = OTLDA_NONE;
+ boolean_t is_vl3 = B_FALSE;
mutex_enter(&thdl->oth_lock);
for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
@@ -721,9 +1504,19 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&thdl->oth_lock);
mutex_enter(&entry->ote_lock);
+ ott = entry->ote_ott;
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
- /* Safeguard against a confused varpd */
- if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ mp = entry->ote_chead;
+
+ /*
+	 * Safeguard against a confused varpd. Packets destined for varpd
+	 * itself may receive replies (e.g. ARP replies) that require us to
+	 * drop the packet, even when the entry is valid.
+ */
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) &&
+ !overlay_target_for_varpd(entry->ote_odd, mp)) {
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
DTRACE_PROBE1(overlay__target__valid__drop,
overlay_target_entry_t *, entry);
@@ -731,7 +1524,21 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
goto done;
}
- mp = entry->ote_chead;
+ /*
+ * If varpd is instructing us to drop the head mblk in a VL2 entry,
+ * this could be because it's already provided a response (e.g. an
+ * ARP reply), and the entry itself might still be used for other
+	 * purposes. VL3 entries, on the other hand, have no such uses. If
+ * we are told to drop the packet, there is no reason to retain
+ * the VL3 entry and we can delete it.
+ */
+ if (is_vl3) {
+ action = OTLDA_DELETE;
+ mutex_exit(&entry->ote_lock);
+ goto done;
+ }
+
+ /* Drop the first packet in entry */
if (mp != NULL) {
entry->ote_chead = mp->b_next;
mp->b_next = NULL;
@@ -739,23 +1546,34 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
entry->ote_ctail = entry->ote_chead;
entry->ote_mbsize -= msgsize(mp);
}
+
if (entry->ote_chead != NULL) {
- queue = B_TRUE;
+ action = OTLDA_QUEUE;
entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
} else {
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
}
mutex_exit(&entry->ote_lock);
- if (queue == B_TRUE)
+ if (action == OTLDA_QUEUE)
overlay_target_queue(entry);
freemsg(mp);
done:
- mutex_enter(&entry->ote_ott->ott_lock);
- entry->ote_ott->ott_ocount--;
- cv_signal(&entry->ote_ott->ott_cond);
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ ott->ott_ocount--;
+ if (action == OTLDA_DELETE) {
+ /* overlay_target_entry_dtor() will free the mblk chain */
+ qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ }
+
+ if (is_vl3)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ cv_signal(&ott->ott_cond);
+ mutex_exit(&ott->ott_lock);
return (0);
}
@@ -1083,31 +1901,35 @@ overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
sizeof (overlay_target_point_t));
} else {
overlay_target_entry_t *ote;
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
- if (ote != NULL) {
- mutex_enter(&ote->ote_lock);
- if ((ote->ote_flags &
- OVERLAY_ENTRY_F_VALID_MASK) != 0) {
- if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
- otc->otc_entry.otce_flags =
- OVERLAY_TARGET_CACHE_DROP;
- } else {
- otc->otc_entry.otce_flags = 0;
- bcopy(&ote->ote_dest,
- &otc->otc_entry.otce_dest,
- sizeof (overlay_target_point_t));
- }
- ret = 0;
+
+ if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac)) == NULL) {
+ ret = ENOENT;
+ goto done;
+ }
+
+ mutex_enter(&ote->ote_lock);
+ if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+ if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_DROP;
+ } else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_ROUTER;
} else {
- ret = ENOENT;
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ote->ote_u.ote_vl2.otvl2_dest,
+ &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
}
- mutex_exit(&ote->ote_lock);
+ ret = 0;
} else {
ret = ENOENT;
}
+ mutex_exit(&ote->ote_lock);
}
+done:
mutex_exit(&ott->ott_lock);
overlay_hold_rele(odd);
@@ -1120,53 +1942,64 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
{
overlay_dev_t *odd;
overlay_target_t *ott;
- overlay_target_entry_t *ote;
+ overlay_target_entry_t *ote, *new = NULL;
overlay_targ_cache_t *otc = arg;
mblk_t *mp = NULL;
- if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+ if (otc->otc_entry.otce_flags &
+ ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
+ return (EINVAL);
+
+ if (otc->otc_entry.otce_flags ==
+ (OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
return (EINVAL);
odd = overlay_hold_by_dlid(otc->otc_linkid);
if (odd == NULL)
return (ENOENT);
+ /*
+ * Optimistically create the new entry. If not needed, we'll free it.
+	 * We shouldn't be calling this ioctl rapidly enough for any potential
+	 * alloc/free churn to cause a problem.
+ */
+ new = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
+ bcopy(&otc->otc_entry.otce_mac, &new->ote_u.ote_vl2.otvl2_mac,
+ sizeof (overlay_target_mac_t));
+
mutex_enter(&odd->odd_lock);
if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
+ overlay_target_entry_dtor(new);
return (ENXIO);
}
ott = odd->odd_target;
if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
+ overlay_target_entry_dtor(new);
return (ENOTSUP);
}
+
+ new->ote_ott = ott;
+ new->ote_odd = odd;
+
mutex_enter(&ott->ott_lock);
mutex_exit(&odd->odd_lock);
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
- if (ote == NULL) {
- ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
- bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
- ote->ote_chead = ote->ote_ctail = NULL;
- ote->ote_mbsize = 0;
- ote->ote_ott = ott;
- ote->ote_odd = odd;
- mutex_enter(&ote->ote_lock);
- refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
- avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
- } else {
- mutex_enter(&ote->ote_lock);
- }
+ if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac)) == NULL)
+ ote = new;
+ mutex_enter(&ote->ote_lock);
if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
} else {
ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
- bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
+ if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER)
+ ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
+ bcopy(&otc->otc_entry.otce_dest, &ote->ote_u.ote_vl2.otvl2_dest,
sizeof (overlay_target_point_t));
mp = ote->ote_chead;
ote->ote_chead = NULL;
@@ -1175,6 +2008,10 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
ote->ote_vtime = gethrtime();
}
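+	/*
+	 * Publish the optimistically allocated entry only now that it has
+	 * been fully initialized. ott_lock has been held since before the
+	 * lookup, so no other thread can observe a partially constructed
+	 * entry.
+	 */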
+ if (ote == new) {
+ qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
+ }
mutex_exit(&ote->ote_lock);
mutex_exit(&ott->ott_lock);
@@ -1185,6 +2022,9 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
overlay_hold_rele(odd);
+ if (ote != new)
+ overlay_target_entry_dtor(new);
+
return (0);
}
@@ -1217,8 +2057,11 @@ overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
mutex_enter(&ott->ott_lock);
mutex_exit(&odd->odd_lock);
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
+ if (otc->otc_entry.otce_mac.otm_dcid == 0)
+ otc->otc_entry.otce_mac.otm_dcid = odd->odd_dcid;
+
+ ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac);
if (ote != NULL) {
mutex_enter(&ote->ote_lock);
ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
@@ -1269,8 +2112,13 @@ overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
mutex_exit(&ote->ote_lock);
}
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
+
+ avl = &ott->ott_u.ott_dyn.ott_l3tree;
+ for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
+ mutex_enter(&ote->ote_lock);
+ ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+ mutex_exit(&ote->ote_lock);
+ }
mutex_exit(&ott->ott_lock);
overlay_hold_rele(odd);
@@ -1304,9 +2152,10 @@ overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
}
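+/*
+ * The marker round-trips through the cache iterator ioctl, so it is
+ * explicitly aligned and size-checked to keep its layout identical for
+ * 32-bit and 64-bit consumers.
+ */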
typedef struct overlay_targ_cache_marker {
- uint8_t otcm_mac[ETHERADDRL];
+ overlay_target_mac_t otcm_mac;
uint16_t otcm_done;
-} overlay_targ_cache_marker_t;
+} overlay_targ_cache_marker_t __aligned(8);
+CTASSERT(sizeof (overlay_targ_cache_marker_t) == 2 * sizeof (uint64_t));
/* ARGSUSED */
static int
@@ -1356,7 +2205,7 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
if (ott->ott_mode == OVERLAY_TARGET_POINT) {
overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
- bzero(out->otce_mac, ETHERADDRL);
+ bzero(&out->otce_mac, sizeof (out->otce_mac));
out->otce_flags = 0;
bcopy(&ott->ott_u.ott_point, &out->otce_dest,
sizeof (overlay_target_point_t));
@@ -1365,7 +2214,9 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
}
avl = &ott->ott_u.ott_dyn.ott_tree;
- bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
+ lookup.ote_u.ote_vl2.otvl2_mac.otm_dcid = odd->odd_dcid;
+ bcopy(&mark->otcm_mac, &lookup.ote_u.ote_vl2.otvl2_mac,
+ sizeof (mark->otcm_mac));
ent = avl_find(avl, &lookup, &where);
/*
@@ -1390,19 +2241,21 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&ent->ote_lock);
continue;
}
- bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
+ bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &out->otce_mac,
+ sizeof (out->otce_mac));
out->otce_flags = 0;
if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
- bcopy(&ent->ote_dest, &out->otce_dest,
+ bcopy(&ent->ote_u.ote_vl2.otvl2_dest, &out->otce_dest,
sizeof (overlay_target_point_t));
written++;
mutex_exit(&ent->ote_lock);
}
if (ent != NULL) {
- bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
+ bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &mark->otcm_mac,
+ sizeof (mark->otcm_mac));
} else {
mark->otcm_done = 1;
}
@@ -1432,6 +2285,122 @@ overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
return (0);
}
+/*
+ * Take an IPv6 address + prefix length, and turn it into the network address.
+ * E.g. ::ffff:192.168.51.50/120 -> ::ffff:192.168.51.0
+ */
+static void
+overlay_in6_to_subnet(const struct in6_addr *src, struct in6_addr *dst,
+ uint8_t prefixlen)
+{
+ uint32_t val;
+
+ for (size_t i = 0; i < 4; i++) {
+ val = ntohl(src->_S6_un._S6_u32[i]);
+ val &= IN6_MASK_FROM_PREFIX(i, prefixlen);
+ dst->_S6_un._S6_u32[i] = htonl(val);
+ }
+}
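+
+/*
+ * A worked instance of the masking above, assuming IN6_MASK_FROM_PREFIX()
+ * yields the per-word network mask (e.g. 0xffffff00 for word 3 of a /120):
+ * words 0-2 of ::ffff:192.168.51.50 keep all 32 bits, while word 3 keeps
+ * its top 24 bits (120 - 96), so 0xc0a83332 & 0xffffff00 == 0xc0a83300,
+ * i.e. ::ffff:192.168.51.0.
+ */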
+
+/*
+ * Find the first target entry whose source IP falls within the source subnet
+ * given by otcne. If no entries match, NULL is returned.
+ */
+static overlay_target_entry_t *
+overlay_target_cache_first_net(overlay_target_t *ott,
+ const overlay_targ_cache_net_entry_t *otcne)
+{
+ avl_tree_t *avl;
+ overlay_target_entry_t *ote;
+ struct in6_addr *start;
+ overlay_target_entry_t cmp = { 0 };
+ avl_index_t where = { 0 };
+
+ ASSERT(MUTEX_HELD(&ott->ott_lock));
+
+ avl = &ott->ott_u.ott_dyn.ott_l3tree;
+ start = &cmp.ote_u.ote_vl3.otvl3_src;
+
+ /*
+ * The first possible source address for a subnet is the network
+ * address (e.g. 192.160.10.0 for a /24). While it normally shouldn't
+ * appear, we either start here, or at the first entry after where
+ * it would exist if present. This should be the first possible
+ * entry in the subnet. If it's not within the subnet, then we
+ * know no entries with that source subnet are present.
+ */
+ overlay_in6_to_subnet(&otcne->otcne_src, start,
+ otcne->otcne_src_prefixlen);
+
+ if ((ote = avl_find(avl, &cmp, &where)) == NULL)
+ ote = avl_nearest(avl, where, AVL_AFTER);
+
+ if (ote == NULL || !IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src,
+ &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen))
+ return (NULL);
+
+ return (ote);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_remove_net(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_cache_net_t *otcn = arg;
+ overlay_targ_cache_net_entry_t *otcne = &otcn->otcn_entry;
+ overlay_dev_t *odd = NULL;
+ overlay_target_t *ott = NULL;
+ overlay_target_entry_t *ote = NULL, *ote_next = NULL;
+ avl_tree_t *avl = NULL;
+
+ odd = overlay_hold_by_dlid(otcn->otcn_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ avl = &ott->ott_u.ott_dyn.ott_l3tree;
+
+ for (ote = overlay_target_cache_first_net(ott, otcne);
+ ote != NULL && IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src,
+ &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen);
+ ote = ote_next) {
+ ote_next = AVL_NEXT(avl, ote);
+
+ /*
+		 * Entries are sorted by src ip, dst ip, and src vlan; there
+		 * can be entries from this src ip to destinations on other
+		 * subnets besides the one we are removing, and those need
+		 * to be skipped over.
+ */
+ if (ote->ote_u.ote_vl3.otvl3_src_vlan != otcne->otcne_vlan)
+ continue;
+
+ if (!IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_dst,
+ &ote->ote_u.ote_vl3.otvl3_dst, otcne->otcne_dst_prefixlen))
+ continue;
+
+ qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, ote);
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
static overlay_target_ioctl_t overlay_target_ioctab[] = {
{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
NULL, overlay_target_info,
@@ -1492,6 +2461,9 @@ static overlay_target_ioctl_t overlay_target_ioctab[] = {
overlay_target_cache_iter,
overlay_target_cache_iter_copyout,
sizeof (overlay_targ_cache_iter_t) },
+ { OVERLAY_TARG_CACHE_REMOVE_NET, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_remove_net,
+ NULL, sizeof (overlay_targ_cache_net_t) },
{ 0 }
};