Diffstat (limited to 'usr/src/uts/common/io'):

 usr/src/uts/common/io/overlay/overlay.c        |  452
 usr/src/uts/common/io/overlay/overlay_mux.c    |   15
 usr/src/uts/common/io/overlay/overlay_target.c | 1290
 3 files changed, 1572 insertions(+), 185 deletions(-)
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c index 2ad3f4f591..c54a6e0d9c 100644 --- a/usr/src/uts/common/io/overlay/overlay.c +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -134,6 +134,40 @@ * be sent to. In addition, they handle questions related to how to handle * things like broadcast and multicast traffic, etc. + * + * ROUTING + * + * Supporting routing of packets between VLANs that exist on an overlay + * network requires two major changes. First, packets destined for off-VLAN + * destinations need to be identified. Second, we must obtain the + * additional information necessary to deliver the packet to its off-VLAN + * destination. + * + * To solve the first issue, we utilize the existing IP routing functionality. + * Off-VLAN destinations are given routes with next hops in the originating + * netstack's routing table--just like in physical networks. The system will + * then attempt to generate an ARP query, which will be sent out to varpd in + * the exact same manner as is described above for other on-VLAN destinations. + * The response for this will include a MAC address that is used both for the + * ARP reply and for an entry added to our VL2 MAC->UL3 hash table + * with the OVERLAY_ENTRY_F_ROUTER flag set. Once this is done, the originating + * netstack will send off-VLAN packets to this router MAC, allowing the + * overlay device to identify these packets as requiring routing. + * + * Once packets with an off-VLAN destination are identified, we must determine + * what the destination vid, VL2 VLAN, and VL2 MAC values are for the given + * packet. For reasons similar to the VL2 MAC->UL3 lookup described above, + * we utilize the flexibility of userland to perform these lookups (also + * using varpd). In this instance we are attempting to find a mapping from a + * destination VL3 IP to a UL3 IP (a few extra bits of information are necessary to + * allow for disambiguation of the destination VL3 IP for situations such as + * mirroring a production environment including VL3 IPs in an isolated set of + * VLANs). We then store these results in a VL3->UL3 hash table for future + * lookups. + * + * To prevent the size of both the VL2->UL3 and VL3->UL3 hash tables from + * growing without bound, we cap the number of entries in each hash table and + * use the 2Q algorithm to manage their contents. + * * ---------- * Properties * ---------- @@ -205,6 +239,10 @@ * UTF-8. Note that the size of the string includes the null * terminator. + * + * OVERLAY_PROP_T_ETHER + * + * An ether_addr_t, which has a fixed size. + * * The next thing that we apply to a property is its permission. The permissions * are put together by the bitwise or of the following flags and values. * @@ -461,7 +499,7 @@ * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things * are much more interesting and as a result, more complicated. We primarily * store lists of overlay_target_entry_t's which are stored in both an avl tree - * and a refhash_t. The primary look up path uses the refhash_t and the avl tree + * and a qqcache_t. The primary lookup path uses the qqcache_t and the avl tree * is only used for a few of the target ioctls used to dump data such that we * can get a consistent iteration order for things like dladm show-overlay -t. * The key that we use for the reference hashtable is based on the mac address * @@ -486,6 +524,28 @@ * any outstanding data to that place.
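+ *
+ * As a concrete sketch of that lookup flow (illustrative pseudo-code
+ * only, not part of this change; the authoritative logic is
+ * overlay_target_lookup() later in this diff), the dynamic path is a
+ * lookup-or-queue pattern against the 2Q-managed cache:
+ *
+ *	entry = qqcache_lookup(dhash, &mac);
+ *	if (entry == NULL)
+ *		allocate an entry, queue the packet on it, ask varpd;
+ *	else if (entry is marked VALID)
+ *		use the cached UL3 destination;
+ *	else
+ *		queue the packet on the entry until varpd answers;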
The full story of how we look that up * is discussed in the section on the Target Cache Lifecycle. + * + * For routing, everything works largely the same as it does in the non-routing + * situations. The major differences are that the target cache is always + * an OVERLAY_TARGET_DYNAMIC cache, and that an additional hash table lookup + * occurs. When a routed packet is sent down the stack, the + * overlay_target_entry_t in the VL2 cache will have its + * OVERLAY_ENTRY_F_ROUTER flag set, which will prompt a lookup in the VL3->UL3 + * cache (using the source VL3, source VL2 VLAN, and destination VL3 values + * from the packet as the lookup key). The entry returned from the cache is + * used to modify the source and destination VL2 MAC addresses as well as + * the VL2 VLAN ID; the packet is then encapsulated and sent to its UL3 destination. + * On reception, decapsulation happens exactly the same as in the non-routed + * case, and the packet appears just as if it had been sent out the VL2 network + * from a router connected to it. This is done both to maintain the illusion + * of a physical network when sniffing packets at the instance level, and + * so that the mac layer sitting above the destination's overlay device + * (for the vnic created over the overlay) does not discard the packet because + * its VLAN tag does not match the VLAN tag of the destination VNIC. While + * some of these modifications could be split between the source and + * destination hosts, doing all of the work on the source maximizes any + * potential parallelism that might be present from multiple flows to a given + * destination. + * * ------------------------ * FMA and Degraded Devices * ------------------------ @@ -830,21 +890,62 @@ typedef enum overlay_dev_prop { OVERLAY_DEV_P_MTU = 0, OVERLAY_DEV_P_VNETID, OVERLAY_DEV_P_ENCAP, - OVERLAY_DEV_P_VARPDID + OVERLAY_DEV_P_VARPDID, + OVERLAY_DEV_P_DCID, + OVERLAY_DEV_P_VL2_CACHE_SIZE, + OVERLAY_DEV_P_VL2_CACHE_A, + OVERLAY_DEV_P_ROUTE_CACHE_SIZE, + OVERLAY_DEV_P_ROUTE_CACHE_A } overlay_dev_prop_t; -#define OVERLAY_DEV_NPROPS 4 +#define OVERLAY_DEV_NPROPS 9 static const char *overlay_dev_props[] = { "mtu", "vnetid", "encap", - "varpd/id" + "varpd/id", + "dcid", + "vl2_cache_size", + "_vl2_cache_a", + "route_cache_size", + "_route_cache_a" }; +/* properties that can be changed live */ +static boolean_t overlay_dev_liveprop[] = { + B_FALSE, /* mtu */ + B_FALSE, /* vnetid */ + B_FALSE, /* encap */ + B_FALSE, /* varpd/id */ + B_FALSE, /* dcid */ + B_TRUE, /* vl2_cache_size */ + B_TRUE, /* _vl2_cache_a */ + B_TRUE, /* route_cache_size */ + B_TRUE /* _route_cache_a */ +}; + +CTASSERT(ARRAY_SIZE(overlay_dev_props) == OVERLAY_DEV_NPROPS); +CTASSERT(ARRAY_SIZE(overlay_dev_liveprop) == OVERLAY_DEV_NPROPS); + #define OVERLAY_MTU_MIN 576 #define OVERLAY_MTU_DEF 1400 #define OVERLAY_MTU_MAX 8900 +/* The 2Q parameter 'a' is a percentage */ +#define OVERLAY_CACHE_MAX_A 100 +/* A somewhat arbitrary default, biased towards storing more MFU entries */ +#define OVERLAY_CACHE_A_DEF 75 + +/* Somewhat arbitrary min and max values */ +#define OVERLAY_VL2_CACHE_MIN 256 +#define OVERLAY_VL2_CACHE_MAX 10240 +#define OVERLAY_VL2_CACHE_DEF OVERLAY_VL2_CACHE_MIN + +/* Somewhat arbitrary min and max values */ +#define OVERLAY_ROUTE_CACHE_MIN 256 +#define OVERLAY_ROUTE_CACHE_MAX 10240 +#define OVERLAY_ROUTE_CACHE_DEF OVERLAY_ROUTE_CACHE_MIN + overlay_dev_t * overlay_hold_by_dlid(datalink_id_t id) { @@ -1066,7 +1167,6 @@ overlay_m_tx(void *arg, mblk_t *mp_chain) bzero(&hdr, sizeof (struct msghdr));
bzero(&einfo, sizeof (ovep_encap_info_t)); - einfo.ovdi_id = odd->odd_vid; mp = mp_chain; while (mp != NULL) { socklen_t slen; @@ -1077,7 +1177,7 @@ overlay_m_tx(void *arg, mblk_t *mp_chain) ep = NULL; ret = overlay_target_lookup(odd, mp, - (struct sockaddr *)&storage, &slen); + (struct sockaddr *)&storage, &slen, &einfo.ovdi_id); if (ret != OVERLAY_TARGET_OK) { if (ret == OVERLAY_TARGET_DROP) freemsg(mp); @@ -1260,6 +1360,19 @@ overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) } odd->odd_vid = oicp->oic_vnetid; + if (oicp->oic_dcid > UINT32_MAX) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + odd->odd_dcid = oicp->oic_dcid; + + odd->odd_vl2sz = OVERLAY_VL2_CACHE_DEF; + odd->odd_vl2a = OVERLAY_CACHE_A_DEF; + odd->odd_routesz = OVERLAY_ROUTE_CACHE_DEF; + odd->odd_routea = OVERLAY_CACHE_A_DEF; + mac = mac_alloc(MAC_VERSION); if (mac == NULL) { mutex_exit(&overlay_dev_lock); @@ -1613,6 +1726,7 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, int ret; mac_perim_handle_t mph; uint_t propid = UINT_MAX; + uint32_t def; overlay_ioc_propinfo_t *oip = karg; overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; @@ -1695,6 +1809,42 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); overlay_prop_set_nodefault(phdl); break; + case OVERLAY_DEV_P_DCID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + def = OVERLAY_VL2_CACHE_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, OVERLAY_VL2_CACHE_MIN, + OVERLAY_VL2_CACHE_MAX); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + def = OVERLAY_CACHE_A_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + def = OVERLAY_ROUTE_CACHE_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, OVERLAY_ROUTE_CACHE_MIN, + OVERLAY_ROUTE_CACHE_MAX); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + def = OVERLAY_CACHE_A_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A); + break; default: overlay_hold_rele(odd); mac_perim_exit(mph); @@ -1804,6 +1954,41 @@ overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, } mutex_exit(&odd->odd_lock); break; + case OVERLAY_DEV_P_DCID: + /* + * While it's read-only while inside of a mux, we're not in a + * context that can guarantee that. Therefore we always grab the + * overlay_dev_t's odd_lock. 
+ */ + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vl2sz, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vl2a, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_routesz, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_routea, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; default: ret = ENOENT; } @@ -1845,6 +2030,146 @@ overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) mutex_exit(&odd->odd_lock); } +static void +overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid) +{ + mutex_enter(&odd->odd_lock); + + /* Simple case, not active */ + if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { + odd->odd_dcid = dcid; + mutex_exit(&odd->odd_lock); + return; + } + + /* + * In the hard case, we need to set the drop flag, quiesce I/O and then + * we can go ahead and do everything. + */ + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + mutex_enter(&odd->odd_lock); + odd->odd_dcid = dcid; + mutex_exit(&odd->odd_lock); + overlay_mux_add_dev(odd->odd_mux, odd); + + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); + odd->odd_flags &= ~OVERLAY_F_MDDROP; + mutex_exit(&odd->odd_lock); +} + +static int +overlay_setprop_vl2_cachesz(overlay_dev_t *odd, uint32_t sz) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + if (sz == 0) + sz = OVERLAY_VL2_CACHE_DEF; + + /* Caller should have validated this */ + ASSERT3U(sz, >=, OVERLAY_VL2_CACHE_MIN); + ASSERT3U(sz, <=, OVERLAY_VL2_CACHE_MAX); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_dhash, sz); + mutex_exit(&ott->ott_lock); + } + + if (ret == 0) + odd->odd_vl2sz = sz; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_vl2_cachea(overlay_dev_t *odd, uint32_t a) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + /* Caller should have validated this */ + ASSERT3U(a, <=, 100); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_dhash, a); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_vl2a = a; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_route_cachesz(overlay_dev_t *odd, uint32_t sz) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + if (sz == 0) + sz = OVERLAY_ROUTE_CACHE_DEF; + + ASSERT3U(sz, >=, OVERLAY_ROUTE_CACHE_MIN); + ASSERT3U(sz, <=, OVERLAY_ROUTE_CACHE_MAX); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO
if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_l3dhash, sz); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_routesz = sz; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_route_cachea(overlay_dev_t *odd, uint32_t a) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + /* Caller should have validated this */ + ASSERT3U(a, <=, 100); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_l3dhash, a); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_routea = a; + mutex_exit(&odd->odd_lock); + + return (ret); +} + /* ARGSUSED */ static int overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, @@ -1855,7 +2180,7 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, overlay_ioc_prop_t *oip = karg; uint_t propid = UINT_MAX; mac_perim_handle_t mph; - uint64_t maxid, *vidp; + uint64_t maxid, *vidp, *dcidp, *vl2szp, *vl2ap, *routeszp, *routeap; if (oip->oip_size > OVERLAY_PROP_SIZEMAX) return (EINVAL); @@ -1865,31 +2190,48 @@ return (ENOENT); oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; - mac_perim_enter_by_mh(odd->odd_mh, &mph); - mutex_enter(&odd->odd_lock); - if (odd->odd_flags & OVERLAY_F_ACTIVATED) { - mac_perim_exit(mph); - mutex_exit(&odd->odd_lock); - return (ENOTSUP); - } - mutex_exit(&odd->odd_lock); + + /* + * Currently, only certain overlay properties (and no encapsulation + * properties) can be changed while the overlay device is active. + */ if (oip->oip_id == -1) { int i; for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) break; - if (i == OVERLAY_DEV_NPROPS) { - ret = odd->odd_plugin->ovp_ops->ovpo_setprop( - odd->odd_pvoid, oip->oip_name, - oip->oip_value, oip->oip_size); - overlay_hold_rele(odd); - mac_perim_exit(mph); - return (ret); - } } - propid = i; + if (i < OVERLAY_DEV_NPROPS) + propid = i; + } else if (oip->oip_id < OVERLAY_DEV_NPROPS) { + propid = oip->oip_id; + } + + /* + * A bit tricky, but propid is initialized to UINT_MAX, so we know we + * have an overlay property whenever propid < OVERLAY_DEV_NPROPS, + * otherwise we have a plugin property.
+ */ + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_ACTIVATED) && + ((propid >= OVERLAY_DEV_NPROPS || !overlay_dev_liveprop[propid]))) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_exit(&odd->odd_lock); + + if (oip->oip_id == -1 && propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_setprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, oip->oip_size); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; @@ -1941,6 +2283,68 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, case OVERLAY_DEV_P_VARPDID: ret = EPERM; break; + case OVERLAY_DEV_P_DCID: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + dcidp = (uint64_t *)oip->oip_value; + if (*dcidp > UINT32_MAX) { + ret = EINVAL; + break; + } + overlay_setprop_dcid(odd, *dcidp); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vl2szp = (uint64_t *)oip->oip_value; + if (*vl2szp != 0 && (*vl2szp < OVERLAY_VL2_CACHE_MIN || + *vl2szp > OVERLAY_VL2_CACHE_MAX)) { + ret = EINVAL; + break; + } + ret = overlay_setprop_vl2_cachesz(odd, *vl2szp); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vl2ap = (uint64_t *)oip->oip_value; + if (*vl2ap > OVERLAY_CACHE_MAX_A) { + ret = EINVAL; + break; + } + ret = overlay_setprop_vl2_cachea(odd, *vl2ap); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + routeszp = (uint64_t *)oip->oip_value; + if (*routeszp != 0 && (*routeszp < OVERLAY_ROUTE_CACHE_MIN || + *routeszp > OVERLAY_ROUTE_CACHE_MAX)) { + ret = EINVAL; + break; + } + ret = overlay_setprop_route_cachesz(odd, *routeszp); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + routeap = (uint64_t *)oip->oip_value; + if (*routeap > OVERLAY_CACHE_MAX_A) { + ret = EINVAL; + break; + } + ret = overlay_setprop_route_cachea(odd, *routeap); + break; default: ret = ENOENT; } diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c index 0c21bb8689..2688d2791c 100644 --- a/usr/src/uts/common/io/overlay/overlay_mux.c +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -127,6 +127,18 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, freeb(fmp); /* + * In cases of looped-back vxlan, the chain tends to have a + * prepended IP+UDP-only mblk, followed by the data. Parsing + * will have left that mblk zero-length (rptr == wptr). + */ + if (mp->b_rptr == mp->b_wptr && mp->b_cont != NULL) { + /* Ended up with zero-length mblk, lose it! */ + fmp = mp; + mp = fmp->b_cont; + freeb(fmp); + } + + /* * Until we have VXLAN-or-other-decap HW acceleration support * (e.g.
we support NICs that reach into VXLAN-encapsulated * packets and check the inside-VXLAN IP packets' checksums, @@ -161,10 +173,9 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, if (rem == blkl) { fmp = mp; mp = fmp->b_cont; - fmp->b_cont = NULL; OVERLAY_FREEMSG(mp, "freed a fmp block"); - freemsg(fmp); + freeb(fmp); } } if (mp == NULL) { diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c index f4147b56d1..171acd034f 100644 --- a/usr/src/uts/common/io/overlay/overlay_target.c +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -20,6 +20,8 @@ * uts/common/io/overlay/overlay.c */ +#include <inet/ip.h> +#include <inet/ip6.h> #include <sys/types.h> #include <sys/ethernet.h> #include <sys/kmem.h> @@ -42,6 +44,9 @@ #include <sys/overlay_impl.h> #include <sys/sdt.h> +#define OVERLAY_DROP(mp, reason) \ + DTRACE_PROBE2(overlay__drop, mblk_t *, mp, char *, reason) + /* * This is total straw man, but at least it's a prime number. Here we're * going to have to go through and do a lot of evaluation and understanding as @@ -52,6 +57,19 @@ #define OVERLAY_HSIZE 823 /* + * The default size of each target cache. This is also a complete strawman + * whose value could change as we gain better operational experience with + * overlay routing. + */ +#define OVERLAY_CACHE_SIZE 512 + +/* + * A somewhat arbitrary value. The percentage of the target cache dedicated + * to MFU entries (i.e. entries that have been looked up more than once). + */ +#define OVERLAY_CACHE_A 60 + +/* * We use this data structure to keep track of what requests have been actively * allocated to a given instance so we know what to put back on the pending * list. @@ -69,7 +87,7 @@ typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int); typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *); typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int); -typedef struct overaly_target_ioctl { +typedef struct overlay_target_ioctl { int oti_cmd; /* ioctl id */ boolean_t oti_write; /* ioctl requires FWRITE */ boolean_t oti_ncopyout; /* copyout data? 
*/ @@ -144,25 +162,60 @@ overlay_entry_cache_destructor(void *buf, void *arg) static uint64_t overlay_mac_hash(const void *v) { + const overlay_target_mac_t *m = v; + uint32_t crc; - CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + CRC32(crc, m->otm_mac, ETHERADDRL, -1U, crc32_table); + CRC32(crc, &m->otm_dcid, sizeof (uint32_t), crc, crc32_table); return (crc); } static int overlay_mac_cmp(const void *a, const void *b) { - return (bcmp(a, b, ETHERADDRL)); + const overlay_target_mac_t *l = a; + const overlay_target_mac_t *r = b; + + if (l->otm_dcid != r->otm_dcid) + return (1); + return (bcmp(l->otm_mac, r->otm_mac, ETHERADDRL) != 0); +} + +static uint64_t +overlay_ip_hash(const void *v) +{ + const overlay_target_vl3_t *vl3 = v; + + uint32_t crc; + CRC32(crc, &vl3->otvl3_src, sizeof (vl3->otvl3_src), -1U, crc32_table); + CRC32(crc, &vl3->otvl3_dst, sizeof (vl3->otvl3_dst), crc, crc32_table); + CRC32(crc, &vl3->otvl3_src_vlan, sizeof (vl3->otvl3_src_vlan), crc, + crc32_table); + return (crc); +} + +static int +overlay_ip_cmp(const void *a, const void *b) +{ + const overlay_target_vl3_t *l = a; + const overlay_target_vl3_t *r = b; + + if (l->otvl3_src_vlan != r->otvl3_src_vlan) + return (1); + if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_src, &r->otvl3_src)) + return (1); + if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_dst, &r->otvl3_dst)) + return (1); + return (0); } -/* ARGSUSED */ static void overlay_target_entry_dtor(void *arg) { overlay_target_entry_t *ote = arg; ote->ote_flags = 0; - bzero(ote->ote_addr, ETHERADDRL); + bzero(&ote->ote_u, sizeof (ote->ote_u)); ote->ote_ott = NULL; ote->ote_odd = NULL; freemsgchain(ote->ote_chead); @@ -172,21 +225,76 @@ overlay_target_entry_dtor(void *arg) kmem_cache_free(overlay_entry_cache, ote); } +static void +overlay_target_entry_l2qq_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + overlay_target_t *ott = ote->ote_ott; + + ASSERT(MUTEX_HELD(&ott->ott_lock)); + ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==, 0); + + avl_remove(&ott->ott_u.ott_dyn.ott_tree, ote); + overlay_target_entry_dtor(ote); +} + +static void +overlay_target_entry_l3qq_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + overlay_target_t *ott = ote->ote_ott; + + ASSERT(MUTEX_HELD(&ott->ott_lock)); + ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==, + OVERLAY_ENTRY_F_VL3); + + avl_remove(&ott->ott_u.ott_dyn.ott_l3tree, ote); + overlay_target_entry_dtor(ote); +} + static int overlay_mac_avl(const void *a, const void *b) { + const overlay_target_entry_t *le = a; + const overlay_target_entry_t *re = b; + const overlay_target_mac_t *lm = &le->ote_u.ote_vl2.otvl2_mac; + const overlay_target_mac_t *rm = &re->ote_u.ote_vl2.otvl2_mac; int i; - const overlay_target_entry_t *l, *r; - l = a; - r = b; + + /* Order by DCID, then MAC */ + if (lm->otm_dcid < rm->otm_dcid) + return (-1); + if (lm->otm_dcid > rm->otm_dcid) + return (1); for (i = 0; i < ETHERADDRL; i++) { - if (l->ote_addr[i] > r->ote_addr[i]) + if (lm->otm_mac[i] > rm->otm_mac[i]) return (1); - else if (l->ote_addr[i] < r->ote_addr[i]) + else if (lm->otm_mac[i] < rm->otm_mac[i]) return (-1); } + return (0); +} +static int +overlay_ip_avl(const void *a, const void *b) +{ + const overlay_target_entry_t *l = a; + const overlay_target_entry_t *r = b; + const overlay_target_vl3_t *l_vl3 = &l->ote_u.ote_vl3; + const overlay_target_vl3_t *r_vl3 = &r->ote_u.ote_vl3; + int ret; + + if ((ret = memcmp(&l_vl3->otvl3_src, &r_vl3->otvl3_src, + sizeof (l_vl3->otvl3_src))) != 0) + return (ret < 0 ? 
-1 : 1); + if ((ret = memcmp(&l_vl3->otvl3_dst, &r_vl3->otvl3_dst, + sizeof (l_vl3->otvl3_dst))) != 0) + return (ret < 0 ? -1 : 1); + if (l_vl3->otvl3_src_vlan < r_vl3->otvl3_src_vlan) + return (-1); + if (l_vl3->otvl3_src_vlan > r_vl3->otvl3_src_vlan) + return (1); return (0); } @@ -233,25 +341,20 @@ overlay_target_free(overlay_dev_t *odd) return; if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { - refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; - avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; - overlay_target_entry_t *ote; - + mutex_enter(&odd->odd_target->ott_lock); /* - * Our AVL tree and hashtable contain the same elements, - * therefore we should just remove it from the tree, but then - * delete the entries when we remove them from the hash table - * (which happens through the refhash dtor). + * Our VL3 AVL tree and hashtable contain the same elements. + * Additionally, when an entry is removed from the 2Q cache, + * the entry is removed from the corresponding AVL tree. + * Deleting the 2Q cache will destroy any remaining entries, + * so all we need to do is destroy the 2Q caches. */ - while ((ote = avl_first(ap)) != NULL) - avl_remove(ap, ote); - - avl_destroy(ap); - for (ote = refhash_first(rp); ote != NULL; - ote = refhash_next(rp, ote)) { - refhash_remove(rp, ote); - } - refhash_destroy(rp); + qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_dhash); + qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_l3dhash); + ASSERT(avl_is_empty(&odd->odd_target->ott_u.ott_dyn.ott_tree)); + ASSERT(avl_is_empty( + &odd->odd_target->ott_u.ott_dyn.ott_l3tree)); + mutex_exit(&odd->odd_target->ott_lock); } ASSERT(odd->odd_target->ott_ocount == 0); @@ -270,18 +373,42 @@ overlay_target_busy() return (ret); } +/* + * Queue the target entry on the list of varpd requests. entry should be + * refheld for the duration of this call (this call takes its own additional + * hold that is released when we receive a response). + */ static void overlay_target_queue(overlay_target_entry_t *entry) { + overlay_target_t *ott = entry->ote_ott; + boolean_t is_vl3 = B_FALSE; + + /* + * ote_ott is read-only and set at entry creation, so it can be + * read without ote_lock held + */ + ASSERT(!MUTEX_HELD(&entry->ote_lock)); + + mutex_enter(&entry->ote_lock); + if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0) + is_vl3 = B_TRUE; + mutex_exit(&entry->ote_lock); + mutex_enter(&overlay_target_lock); - mutex_enter(&entry->ote_ott->ott_lock); - if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) { - mutex_exit(&entry->ote_ott->ott_lock); + mutex_enter(&ott->ott_lock); + if (ott->ott_flags & OVERLAY_T_TEARDOWN) { + mutex_exit(&ott->ott_lock); mutex_exit(&overlay_target_lock); return; } - entry->ote_ott->ott_ocount++; - mutex_exit(&entry->ote_ott->ott_lock); + ott->ott_ocount++; + if (is_vl3) + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + else + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + + mutex_exit(&ott->ott_lock); list_insert_tail(&overlay_target_list, entry); cv_signal(&overlay_target_condvar); mutex_exit(&overlay_target_lock); @@ -300,22 +427,446 @@ overlay_target_quiesce(overlay_target_t *ott) } /* - * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP | + * Write the VL3 src/dst IP from the packet in mp into src and dst. If the + * addresses are IPv4 addresses, they are written as mapped addresses. 
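+ * For example, an IPv4 source address of 10.1.2.3 is written to src as
+ * the V4-mapped address ::ffff:10.1.2.3.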
+ */ +static int +overlay_get_vl3_ips(mblk_t *mp, struct in6_addr *src, struct in6_addr *dst) +{ + uint16_t sap; + +#if 1 + /* Temporary until mblk helpers are integrated */ + struct ether_vlan_header *eth = (struct ether_vlan_header *)mp->b_rptr; + ipha_t *iphp = (ipha_t *)(eth + 1); + ip6_t *ip6hp = (ip6_t *)(eth + 1); + size_t mlen = MBLKL(mp); + + if (mlen < sizeof (struct ether_vlan_header)) + return (EINVAL); + mlen -= sizeof (struct ether_vlan_header); + + /* We currently don't support routing on untagged vlans */ + if ((sap = ntohs(eth->ether_tpid)) != ETHERTYPE_VLAN) + return (EINVAL); + + sap = ntohs(eth->ether_type); + if (mlen == 0) { + if ((mp = mp->b_cont) == NULL) + return (EINVAL); + mlen = MBLKL(mp); + iphp = (ipha_t *)mp->b_rptr; + ip6hp = (ip6_t *)mp->b_rptr; + } + + switch (sap) { + case ETHERTYPE_IP: + if (mlen < sizeof (ipha_t)) + return (EINVAL); + ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV4_VERSION); + IN6_IPADDR_TO_V4MAPPED(iphp->ipha_src, src); + IN6_IPADDR_TO_V4MAPPED(iphp->ipha_dst, dst); + break; + case ETHERTYPE_IPV6: + if (mlen < sizeof (ip6_t)) + return (EINVAL); + ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV6_VERSION); + bcopy(&ip6hp->ip6_src, src, sizeof (*src)); + bcopy(&ip6hp->ip6_dst, dst, sizeof (*dst)); + break; + default: + return (EINVAL); + } + + return (0); +#else + size_t soff, doff; + uint32_t v4s, v4d; + int i; + + if (!mblk_read_uint16(mp, offsetof(struct ether_header, ether_type), + &sap)) + return (EINVAL); + + if (sap == ETHERTYPE_VLAN) { + if (!mblk_read_uint16(mp, + offsetof(struct ether_vlan_header, ether_type), &sap)) + return (EINVAL); + soff = doff = sizeof (struct ether_vlan_header); + } else { + soff = doff = sizeof (struct ether_header); + } + + switch (sap) { + case ETHERTYPE_IP: + soff += offsetof(ipha_t, ipha_src); + doff += offsetof(ipha_t, ipha_dst); + + if (!mblk_read_uint32(mp, soff, &v4s) || + !mblk_read_uint32(mp, doff, &v4d)) + return (EINVAL); + IN6_IPADDR_TO_V4MAPPED(v4s, src); + IN6_IPADDR_TO_V4MAPPED(v4d, dst); + break; + case ETHERTYPE_IPV6: + soff += offsetof(ip6_t, ip6_src); + doff += offsetof(ip6_t, ip6_dst); + + for (i = 0; i < 4; i++) { + if (!mblk_read_uint32(mp, soff, &src->s6_addr32[i]) || + !mblk_read_uint32(mp, doff, &dst->s6_addr32[i])) + return (EINVAL); + soff += sizeof (uint32_t); + doff += sizeof (uint32_t); + } + break; + default: + return (EINVAL); + } + + return (0); +#endif +} + +static int +overlay_route(overlay_dev_t *odd, mblk_t *mp, + const overlay_target_route_t *route, const overlay_target_mac_t *dst_mac) +{ + uint16_t tci; + + if (MBLKL(mp) >= sizeof (struct ether_vlan_header)) { + struct ether_vlan_header *evh; + + evh = (struct ether_vlan_header *)mp->b_rptr; + tci = ntohs(evh->ether_tci); + + /* + * Today we require all encapsulated frames to be vlan tagged. + * If this is relaxed in the future, we will need to allow for + * insertion and removal of the vlan tag as appropriate here.
+ */ + if (ntohs(evh->ether_tpid) != ETHERTYPE_VLAN) { + OVERLAY_DROP(mp, "not vlan tagged"); + return (OVERLAY_TARGET_DROP); + } + + tci &= ~(VLAN_ID_MASK); + tci |= route->otr_vlan; + evh->ether_tci = htons(tci); + bcopy(dst_mac->otm_mac, &evh->ether_dhost, ETHERADDRL); + bcopy(route->otr_srcmac, &evh->ether_shost, ETHERADDRL); + return (OVERLAY_TARGET_OK); + } + +#if 1 + /* Temporary until mblk helpers are integrated */ + OVERLAY_DROP(mp, "ethernet header split between mblks"); + return (OVERLAY_TARGET_DROP); +#else + size_t off; + + off = offsetof(struct ether_vlan_header, ether_tci); + if (!mblk_read_uint16(mp, off, &tci)) { + OVERLAY_DROP(mp, "cannot read tci"); + return (OVERLAY_TARGET_DROP); + } + + tci = ntohs(tci); + tci &= ~(VLAN_ID_MASK); + tci |= route->otr_vlan; + + if (!mblk_write_uint16(mp, off, htons(tci))) { + OVERLAY_DROP(mp, "cannot set routed destination vlan"); + return (OVERLAY_TARGET_DROP); + } + + for (int i = 0; i < ETHERADDRL; i++) { + if (!mblk_write_uint8(mp, i, dst_mac->otm_mac[i]) || + !mblk_write_uint8(mp, i + ETHERADDRL, + route->otr_srcmac[i])) { + OVERLAY_DROP(mp, "cannot set routed macs"); + return (OVERLAY_TARGET_DROP); + } + } + + return (OVERLAY_TARGET_OK); +#endif +} + +/* + * Attempt to add mp to the packet queue of target entry. If the queue is + * already full, it returns OVERLAY_TARGET_DROP, otherwise OVERLAY_TARGET_ASYNC + * is returned. If the entry isn't already pending a response from varpd, + * queue the target entry on the list of outstanding varpd requests. + * + * The entry should already be locked; however, since this is intended to be + * the final step in dealing with this entry (for handling the + * packet in question), it always releases ote_lock before returning. + * entry should be refheld for the duration of this call. + */ +static int +overlay_target_try_queue(overlay_target_entry_t *entry, mblk_t *mp) +{ + size_t mlen = msgsize(mp); + boolean_t queue = B_FALSE; + + ASSERT(MUTEX_HELD(&entry->ote_lock)); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + OVERLAY_DROP(mp, "target queue full"); + mutex_exit(&entry->ote_lock); + return (OVERLAY_TARGET_DROP); + } + + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + queue = B_TRUE; + } + mutex_exit(&entry->ote_lock); + + if (queue) + overlay_target_queue(entry); + + return (OVERLAY_TARGET_ASYNC); +} + +/* + * Given the VL3 IP->VL2 mac entry (vl3e), and the corresponding VL2 MAC->UL3 + * entry (vl2e), if both entries are valid, sets *vidp, *v6, *slenp to the + * correct UL3 destination and returns OVERLAY_TARGET_OK. If either of the + * entries is still pending a lookup (or vl2e is NULL because the entry is + * missing), mp is queued (if there is space) on the appropriate entry and + * OVERLAY_TARGET_ASYNC is returned. If the VL2 entry is flagged to drop all + * packets, OVERLAY_TARGET_DROP is returned. + * + * In all cases, the caller should acquire vl3e->ote_lock prior to calling + * overlay_route_lookup_vl2(). Because vl2e can be missing (NULL), the caller + * should not acquire vl2e->ote_lock prior to calling + * overlay_route_lookup_vl2(). vl3e->ote_lock is always dropped prior to + * returning.
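+ *
+ * A sketch of the expected calling pattern (illustrative only, not
+ * part of the code):
+ *
+ *	mutex_enter(&vl3e->ote_lock);
+ *	ret = overlay_route_lookup_vl2(vl3e, vl2e, &vid, v6, &slen, mp);
+ *	(vl3e->ote_lock has been dropped by the callee at this point)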
+ */ +static int +overlay_route_lookup_vl2(overlay_target_entry_t *vl3e, + overlay_target_entry_t *vl2e, uint64_t *vidp, struct sockaddr_in6 *v6, + socklen_t *slenp, mblk_t *mp) +{ + overlay_target_vl2_t *vl2p; + int ret; + + ASSERT(MUTEX_HELD(&vl3e->ote_lock)); + + if (vl2e == NULL) { + vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID; + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + mutex_enter(&vl2e->ote_lock); + if (vl2e->ote_flags & (OVERLAY_ENTRY_F_DROP | OVERLAY_ENTRY_F_ROUTER)) { + overlay_target_entry_flags_t flags = vl2e->ote_flags; + + mutex_exit(&vl2e->ote_lock); + mutex_exit(&vl3e->ote_lock); + + if (flags & OVERLAY_ENTRY_F_DROP) { + OVERLAY_DROP(mp, "VL2 target marked drop"); + } else { + OVERLAY_DROP(mp, "VL2 target is overlay router"); + } + + return (OVERLAY_TARGET_DROP); + } + + /* + * If the route is missing, queue on the VL3 entry so a VL3->UL3 + * lookup is done (to get the route data). + */ + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_HAS_ROUTE) == 0) { + mutex_exit(&vl2e->ote_lock); + + vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID; + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + /* + * If the VL2 target point is missing, we try to be a bit (though + * hopefully not too) clever. We can always queue on the VL3 entry + * which will trigger a VL3->UL3 lookup request (as it is effectively + * a superset of the VL2->UL3 lookup). However, if we know we already + * have an outstanding VL3->UL3 request, we queue on the VL2 entry and + * avoid doing another redundant lookup. We can also queue on the VL2 + * entry when it is a local (same vnet, same DC) destination -- we + * currently cannot generate VL2->UL3 lookups for remote destinations, + * only for destinations on the same vnet and in the same DC. Queueing + * on the VL2 entry also allows + * instances on the same vlan as the queued VL2 entry to piggyback on + * the lookup request and avoid a redundant lookup. However, if the + * VL2 entry is remote, we have to do a VL3->UL3 lookup.
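+ *
+ * Summarizing the cases (a restatement of the checks below, not new
+ * behavior):
+ *
+ *	VL2 lookup already outstanding		-> queue on the VL2 entry
+ *	VL2 destination local (dcid matches)	-> queue on the VL2 entry
+ *	otherwise (remote, nothing pending)	-> queue on the VL3 entry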
+ */ + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) { + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0 && + vl2e->ote_u.ote_vl2.otvl2_mac.otm_dcid != + vl2e->ote_odd->odd_dcid) { + mutex_exit(&vl2e->ote_lock); + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + mutex_exit(&vl3e->ote_lock); + /* This drops vl2e->ote_lock */ + return (overlay_target_try_queue(vl2e, mp)); + } + + ASSERT(vl2e->ote_flags & OVERLAY_ENTRY_F_VALID); + + vl2p = &vl2e->ote_u.ote_vl2; + + *vidp = vl2p->otvl2_route.otr_vnet; + bcopy(&vl2p->otvl2_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(vl2p->otvl2_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + + ret = overlay_route(vl2e->ote_odd, mp, &vl2p->otvl2_route, + &vl2p->otvl2_mac); + mutex_exit(&vl2e->ote_lock); + mutex_exit(&vl3e->ote_lock); + return (ret); +} + +static int +overlay_route_lookup(overlay_dev_t *odd, mblk_t *mp, uint16_t vlan, + struct sockaddr *sock, socklen_t *slenp, uint64_t *vidp) +{ + overlay_target_t *ott = odd->odd_target; + overlay_target_entry_t *entry, *vl2_entry = NULL; + struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)sock; + overlay_target_vl3_t vl3 = { 0 }; + int ret = OVERLAY_TARGET_DROP; + + /* overlay_target_lookup() should have set this */ + ASSERT3U(v6->sin6_family, ==, AF_INET6); + + /* We should only be called for dynamic endpoints */ + ASSERT3U(ott->ott_mode, ==, OVERLAY_TARGET_DYNAMIC); + + vl3.otvl3_src_vlan = vlan; + if ((ret = overlay_get_vl3_ips(mp, &vl3.otvl3_src, &vl3.otvl3_dst)) + != OVERLAY_TARGET_OK) { + OVERLAY_DROP(mp, "could not read VL3 src/dst IPs"); + return (OVERLAY_TARGET_DROP); + } + + mutex_enter(&ott->ott_lock); + entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_l3dhash, &vl3); + if (entry == NULL) { + if ((entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { + mutex_exit(&ott->ott_lock); + OVERLAY_DROP(mp, "failed VL3 target entry allocation"); + return (OVERLAY_TARGET_DROP); + } + + bcopy(&vl3, &entry->ote_u.ote_vl3, sizeof (vl3)); + entry->ote_flags = OVERLAY_ENTRY_F_VL3; + + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + + entry->ote_ott = ott; + entry->ote_odd = odd; + + qqcache_insert(ott->ott_u.ott_dyn.ott_l3dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_l3tree, entry); + mutex_exit(&ott->ott_lock); + + overlay_target_queue(entry); + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_ASYNC); + } + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + mutex_enter(&entry->ote_lock); + + /* + * A bit ugly, but if we need the VL2 entry, we want to look it up + * while we still hold ott_lock. + */ + if ((entry->ote_flags & + (OVERLAY_ENTRY_F_DROP|OVERLAY_ENTRY_F_ROUTER| + OVERLAY_ENTRY_F_VALID)) == OVERLAY_ENTRY_F_VALID) { + vl2_entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &entry->ote_u.ote_vl3.otvl3_vl2); + if (vl2_entry != NULL) + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, vl2_entry); + } + mutex_exit(&ott->ott_lock); + + ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3); + + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL3 target entry marked drop"); + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + /* + * XXX: A packet with a dst IP of an overlay router. 
+ * Maybe generate an ICMP reply? For now, we drop. + */ + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL3 target entry is router"); + ret = OVERLAY_TARGET_DROP; + } else if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) { + /* This drops entry->ote_lock */ + ret = overlay_target_try_queue(entry, mp); + } else { + /* This drops entry->ote_lock */ + ret = overlay_route_lookup_vl2(entry, vl2_entry, vidp, v6, + slenp, mp); + } + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + if (vl2_entry != NULL) + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, vl2_entry); + mutex_exit(&ott->ott_lock); + return (ret); +} + +/* + * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP | * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at - * this time, say for NVGRE, we drop all packets that mcuh this. + * this time, say for NVGRE, we drop all packets that match this. */ int overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, - socklen_t *slenp) + socklen_t *slenp, uint64_t *vidp) { int ret; struct sockaddr_in6 *v6; overlay_target_t *ott; - mac_header_info_t mhi; overlay_target_entry_t *entry; + mac_header_info_t mhi; + overlay_target_mac_t omac; ASSERT(odd->odd_target != NULL); + /* Default to our local vid, routing may change this if necessary */ + *vidp = odd->odd_vid; + /* * At this point, the overlay device is in a mux which means that it's * been activated. At this point, parts of the target, such as the mode @@ -323,8 +874,10 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, * about synchronization for them. */ ott = odd->odd_target; - if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) { + OVERLAY_DROP(mp, "plugin doesn't support IP or port"); return (OVERLAY_TARGET_DROP); + } v6 = (struct sockaddr_in6 *)sock; bzero(v6, sizeof (struct sockaddr_in6)); @@ -343,76 +896,89 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC); - /* - * Note we only want the MAC address here, therefore we won't bother - * using mac_vlan_header_info(). If any caller needs the vlan info at - * this point, this should change to a call to mac_vlan_header_info(). 
- */ - if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0) { + OVERLAY_DROP(mp, "could not read vlan header"); return (OVERLAY_TARGET_DROP); + } + + omac.otm_dcid = odd->odd_dcid; + bcopy(mhi.mhi_daddr, omac.otm_mac, ETHERADDRL); + mutex_enter(&ott->ott_lock); - entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - mhi.mhi_daddr); + entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, &omac); if (entry == NULL) { + overlay_target_vl2_t *vl2p; + entry = kmem_cache_alloc(overlay_entry_cache, KM_NOSLEEP | KM_NORMALPRI); if (entry == NULL) { mutex_exit(&ott->ott_lock); + OVERLAY_DROP(mp, "VL2 target entry allocation failed"); return (OVERLAY_TARGET_DROP); } - bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + + vl2p = &entry->ote_u.ote_vl2; + bcopy(mhi.mhi_daddr, vl2p->otvl2_mac.otm_mac, ETHERADDRL); + vl2p->otvl2_mac.otm_dcid = odd->odd_dcid; + vl2p->otvl2_route.otr_vnet = odd->odd_vid; + vl2p->otvl2_route.otr_vlan = VLAN_ID(mhi.mhi_tci); + entry->ote_chead = entry->ote_ctail = mp; entry->ote_mbsize = msgsize(mp); entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; entry->ote_odd = odd; - refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + + qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); return (OVERLAY_TARGET_ASYNC); } - refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); mutex_exit(&ott->ott_lock); mutex_enter(&entry->ote_lock); if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL2 target marked drop"); ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + if (mhi.mhi_bindsap == ETHERTYPE_ARP) { + /* + * Send unicast ARP requests to varpd for processing. + * We will eventually need something similar for IPv6. + * This drops entry->ote_lock. 
+ */ + ret = overlay_target_try_queue(entry, mp); + } else { + mutex_exit(&entry->ote_lock); + ret = overlay_route_lookup(odd, mp, + VLAN_ID(mhi.mhi_tci), sock, slenp, vidp); + } } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { - bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, - sizeof (struct in6_addr)); - v6->sin6_port = htons(entry->ote_dest.otp_port); + overlay_target_point_t *otp = &entry->ote_u.ote_vl2.otvl2_dest; + + bcopy(&otp->otp_ip, &v6->sin6_addr, sizeof (struct in6_addr)); + v6->sin6_port = htons(otp->otp_port); + mutex_exit(&entry->ote_lock); + *slenp = sizeof (struct sockaddr_in6); ret = OVERLAY_TARGET_OK; } else { - size_t mlen = msgsize(mp); - - if (mlen + entry->ote_mbsize > overlay_ent_size) { - ret = OVERLAY_TARGET_DROP; - } else { - if (entry->ote_ctail != NULL) { - ASSERT(entry->ote_ctail->b_next == - NULL); - entry->ote_ctail->b_next = mp; - entry->ote_ctail = mp; - } else { - entry->ote_chead = mp; - entry->ote_ctail = mp; - } - entry->ote_mbsize += mlen; - if ((entry->ote_flags & - OVERLAY_ENTRY_F_PENDING) == 0) { - entry->ote_flags |= - OVERLAY_ENTRY_F_PENDING; - overlay_target_queue(entry); - } - ret = OVERLAY_TARGET_ASYNC; - } + /* This drops entry->ote_lock */ + ret = overlay_target_try_queue(entry, mp); } - mutex_exit(&entry->ote_lock); mutex_enter(&ott->ott_lock); - refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); mutex_exit(&ott->ott_lock); return (ret); @@ -437,6 +1003,7 @@ overlay_target_info(overlay_target_hdl_t *thdl, void *arg) if (odd->odd_flags & OVERLAY_F_ACTIVATED) oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; oti->oti_vnetid = odd->odd_vid; + oti->oti_dcid = odd->odd_dcid; mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); return (0); @@ -488,6 +1055,13 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) } } + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EEXIST); + } + ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP); ott->ott_flags = 0; ott->ott_ocount = 0; @@ -499,21 +1073,44 @@ bcopy(&ota->ota_point, &ott->ott_u.ott_point, sizeof (overlay_target_point_t)); } else { - ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE, + int ret; + + ret = qqcache_create(&ott->ott_u.ott_dyn.ott_dhash, + odd->odd_vl2sz, odd->odd_vl2a, OVERLAY_HSIZE, overlay_mac_hash, overlay_mac_cmp, - overlay_target_entry_dtor, sizeof (overlay_target_entry_t), + overlay_target_entry_l2qq_dtor, + sizeof (overlay_target_entry_t), offsetof(overlay_target_entry_t, ote_reflink), - offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP); + offsetof(overlay_target_entry_t, ote_u.ote_vl2.otvl2_mac), + KM_SLEEP); + if (ret != 0) { + mutex_exit(&odd->odd_lock); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (ret); + } + + ret = qqcache_create(&ott->ott_u.ott_dyn.ott_l3dhash, + odd->odd_routesz, odd->odd_routea, OVERLAY_HSIZE, + overlay_ip_hash, overlay_ip_cmp, + overlay_target_entry_l3qq_dtor, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_reflink), + offsetof(overlay_target_entry_t, ote_u.ote_vl3), KM_SLEEP); + if (ret != 0) { + mutex_exit(&odd->odd_lock); + qqcache_destroy(ott->ott_u.ott_dyn.ott_dhash); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (ret); + } + avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl, sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_avllink)); - } - mutex_enter(&odd->odd_lock); - if (odd->odd_flags & OVERLAY_F_VARPD) { - mutex_exit(&odd->odd_lock); - kmem_cache_free(overlay_target_cache, ott); - overlay_hold_rele(odd); - return (EEXIST); + avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_ip_avl, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_avllink)); } odd->odd_flags |= OVERLAY_F_VARPD; @@ -521,8 +1118,6 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); - - return (0); } @@ -601,7 +1196,16 @@ again: entry = list_remove_head(&overlay_target_list); mutex_exit(&overlay_target_lock); mutex_enter(&entry->ote_lock); - if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + /* + * Router entries may send lookups to varpd even when valid. For + * example, illumos systems will send unicast ARP queries to cached + * entries (including the router mac address). To answer those, we + * need to forward on the query to varpd. IPv6 will eventually + * need something similar for ND requests. + */ + if ((entry->ote_flags & + (OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_ROUTER)) == + OVERLAY_ENTRY_F_VALID) { ASSERT(entry->ote_chead == NULL); mutex_exit(&entry->ote_lock); goto again; @@ -637,10 +1241,23 @@ again: otl->otl_hdrsize = mhi.mhi_hdrsize; otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; - bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); - bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); - otl->otl_dsttype = mhi.mhi_dsttype; - otl->otl_sap = mhi.mhi_bindsap; + if (entry->ote_flags & OVERLAY_ENTRY_F_VL3) { + overlay_targ_l3_t *l3p = &otl->otl_addru.otlu_l3; + + otl->otl_l3req = B_TRUE; + bcopy(&entry->ote_u.ote_vl3.otvl3_src, &l3p->otl3_srcip, + sizeof (struct in6_addr)); + bcopy(&entry->ote_u.ote_vl3.otvl3_dst, &l3p->otl3_dstip, + sizeof (struct in6_addr)); + } else { + overlay_targ_l2_t *l2p = &otl->otl_addru.otlu_l2; + + otl->otl_l3req = B_FALSE; + bcopy(mhi.mhi_daddr, l2p->otl2_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, l2p->otl2_srcaddr, ETHERADDRL); + l2p->otl2_dsttype = mhi.mhi_dsttype; + l2p->otl2_sap = mhi.mhi_bindsap; + } otl->otl_vlan = VLAN_ID(mhi.mhi_tci); mutex_exit(&entry->ote_lock); @@ -651,12 +1268,128 @@ again: return (0); } +static void +overlay_target_lookup_respond_vl3(const overlay_targ_resp_t *otr, + overlay_target_entry_t *entry) +{ + overlay_target_entry_t *shared = NULL; + overlay_target_entry_t *vl2_entry; + overlay_target_t *ott = entry->ote_ott; + qqcache_t *mhash = ott->ott_u.ott_dyn.ott_dhash; + hrtime_t now = gethrtime(); + + ASSERT(MUTEX_HELD(&entry->ote_lock)); + ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3); + + /* + * A cross-{vlan,dc,vnet} packet with a destination VL3 of an overlay + * router IP. For now we drop these. + */ + if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_DROP; + return; + } + + bcopy(&otr->otr_mac, &entry->ote_u.ote_vl3.otvl3_vl2, + sizeof (overlay_target_mac_t)); + + mutex_enter(&ott->ott_lock); + if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) + qqcache_hold(mhash, shared); + mutex_exit(&ott->ott_lock); + + /* + * Once we have the VL2 destination, we need to see if we already + * have an existing VL2 entry we can reuse. If not, we create a + * fully-formed (i.e. valid) VL2 entry that we add to the cache. 
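+ *
+ * In outline, the insertion below uses the usual allocate/re-lookup
+ * pattern to handle racing inserters (a restatement of the code that
+ * follows, not new behavior):
+ *
+ *	allocate a candidate entry without ott_lock held;
+ *	retake ott_lock and repeat the qqcache_lookup();
+ *	if the MAC is now present, discard the candidate and use the
+ *	existing entry; otherwise insert and hold the candidate.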
+ */ + if (shared == NULL) { + vl2_entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (vl2_entry == NULL) { + /* + * If we can't allocate a VL2 entry for the VL3 + * destination, we just give up for now and discard + * any queued packets. New packets will retry this + * allocation, so if the memory pressure lets up, we + * should recover. + */ + freemsgchain(entry->ote_chead); + entry->ote_chead = entry->ote_ctail = NULL; + return; + } + + vl2_entry->ote_ott = ott; + vl2_entry->ote_odd = entry->ote_odd; + + bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest, + sizeof (overlay_target_point_t)); + bcopy(&otr->otr_mac, &vl2_entry->ote_u.ote_vl2.otvl2_mac, + sizeof (overlay_target_mac_t)); + bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route, + sizeof (overlay_target_route_t)); + vl2_entry->ote_flags = + OVERLAY_ENTRY_F_HAS_ROUTE | OVERLAY_ENTRY_F_VALID; + vl2_entry->ote_vtime = entry->ote_vtime = now; + + mutex_enter(&ott->ott_lock); + if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) { + /* overlay_target_entry_dtor() frees vl2_entry */ + overlay_target_entry_dtor(vl2_entry); + qqcache_hold(mhash, shared); + + vl2_entry = shared; + } else { + qqcache_insert(mhash, vl2_entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, vl2_entry); + qqcache_hold(mhash, vl2_entry); + } + mutex_exit(&ott->ott_lock); + } else { + vl2_entry = shared; + } + + mutex_enter(&vl2_entry->ote_lock); + if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_HAS_ROUTE)) == 0) { + bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route, + sizeof (overlay_target_route_t)); + vl2_entry->ote_flags |= OVERLAY_ENTRY_F_HAS_ROUTE; + } + + /* + * Update the VL2 entry if it doesn't have a valid destination, hasn't + * been marked as dropping all packets, and doesn't have an existing + * outstanding request. If a route and VL2 request involving the + * same VL2 destination are pending and the route response is processed + * prior to the VL2 request, we will continue to queue (on the VL2 + * entry) until the VL2 response is received, even though we have + * an answer from the route response. If we set the valid flag + * while there are still outstanding requests, it will cause problems + * for those requests. + */ + if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_PENDING| + OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_DROP)) == 0) { + bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest, + sizeof (overlay_target_point_t)); + vl2_entry->ote_vtime = gethrtime(); + vl2_entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + } + mutex_exit(&vl2_entry->ote_lock); + + mutex_enter(&ott->ott_lock); + qqcache_rele(mhash, vl2_entry); + mutex_exit(&ott->ott_lock); +} + static int overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) { const overlay_targ_resp_t *otr = arg; + overlay_target_t *ott; overlay_target_entry_t *entry; mblk_t *mp; + boolean_t is_vl3 = B_FALSE; mutex_enter(&thdl->oth_lock); for (entry = list_head(&thdl->oth_outstanding); entry != NULL; @@ -673,38 +1406,88 @@ mutex_exit(&thdl->oth_lock); mutex_enter(&entry->ote_lock); - bcopy(&otr->otr_answer, &entry->ote_dest, - sizeof (overlay_target_point_t)); + ott = entry->ote_ott; + + if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0) + is_vl3 = B_TRUE; + + /* + * If we ever support a protocol that uses MAC addresses as the UL + * destination address, this check should probably include checking + * that otp_mac is also all zeros.
+ */ + if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) && + otr->otr_answer.otp_port == 0) + entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER; + + if (!is_vl3) { + bcopy(&otr->otr_answer, &entry->ote_u.ote_vl2.otvl2_dest, + sizeof (overlay_target_point_t)); + entry->ote_vtime = gethrtime(); + } else { + overlay_target_lookup_respond_vl3(otr, entry); + } + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + mp = entry->ote_chead; entry->ote_chead = NULL; entry->ote_ctail = NULL; entry->ote_mbsize = 0; - entry->ote_vtime = gethrtime(); mutex_exit(&entry->ote_lock); /* - * For now do an in-situ drain. + * For now do an in-situ drain. For VL3 entries, if we reuse + * an existing VL2 entry, it is possible the VL2 lookup is still + * pending (though this should be rare). In such instances, the packets + * queued on the VL3 entry will get queued on the VL2 entry until + * the VL2 entry is resolved. */ mp = overlay_m_tx(entry->ote_odd, mp); freemsgchain(mp); - mutex_enter(&entry->ote_ott->ott_lock); - entry->ote_ott->ott_ocount--; - cv_signal(&entry->ote_ott->ott_cond); - mutex_exit(&entry->ote_ott->ott_lock); + mutex_enter(&ott->ott_lock); + ott->ott_ocount--; + if (is_vl3) + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + else + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + cv_signal(&ott->ott_cond); + mutex_exit(&ott->ott_lock); return (0); } +static boolean_t +overlay_target_for_varpd(overlay_dev_t *odd, mblk_t *mp) +{ + mac_header_info_t mhi; + + /* We should have dropped runts prior to ever queueing */ + VERIFY0(mac_vlan_header_info(odd->odd_mh, mp, &mhi)); + if (mhi.mhi_bindsap == ETHERTYPE_ARP) + return (B_TRUE); + + /* TODO: NDP packets */ + return (B_FALSE); +} + +typedef enum overlay_target_lookup_drop_act { + OTLDA_NONE, + OTLDA_QUEUE, + OTLDA_DELETE +} overlay_target_lookup_drop_act_t; + static int overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) { const overlay_targ_resp_t *otr = arg; + overlay_target_t *ott; overlay_target_entry_t *entry; mblk_t *mp; - boolean_t queue = B_FALSE; + overlay_target_lookup_drop_act_t action = OTLDA_NONE; + boolean_t is_vl3 = B_FALSE; mutex_enter(&thdl->oth_lock); for (entry = list_head(&thdl->oth_outstanding); entry != NULL; @@ -721,9 +1504,19 @@ mutex_exit(&thdl->oth_lock); mutex_enter(&entry->ote_lock); + ott = entry->ote_ott; + if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0) + is_vl3 = B_TRUE; - /* Safeguard against a confused varpd */ - if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + mp = entry->ote_chead; + + /* + * Safeguard against a confused varpd. Packets destined for varpd + * itself (e.g. ARP requests) may be answered by varpd with an + * instruction to drop the original packet, even when the entry is + * valid. + */ + if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) && + (mp == NULL || !overlay_target_for_varpd(entry->ote_odd, mp))) { entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; DTRACE_PROBE1(overlay__target__valid__drop, overlay_target_entry_t *, entry); mutex_exit(&entry->ote_lock); goto done; } - mp = entry->ote_chead; + /* + * If varpd is instructing us to drop the head mblk in a VL2 entry, + * this could be because it's already provided a response (e.g. an + * ARP reply), and the entry itself might still be used for other + * purposes. VL3 entries on the other hand have no such uses. If + * we are told to drop the packet, there is no reason to retain + * the VL3 entry and we can delete it.
+	 */
+	if (is_vl3) {
+		action = OTLDA_DELETE;
+		mutex_exit(&entry->ote_lock);
+		goto done;
+	}
+
+	/* Drop the first packet in entry */
 	if (mp != NULL) {
 		entry->ote_chead = mp->b_next;
 		mp->b_next = NULL;
@@ -739,23 +1546,34 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
 			entry->ote_ctail = entry->ote_chead;
 		entry->ote_mbsize -= msgsize(mp);
 	}
+
 	if (entry->ote_chead != NULL) {
-		queue = B_TRUE;
+		action = OTLDA_QUEUE;
 		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
 	} else {
 		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
 	}
 	mutex_exit(&entry->ote_lock);
 
-	if (queue == B_TRUE)
+	if (action == OTLDA_QUEUE)
 		overlay_target_queue(entry);
 	freemsg(mp);
 
 done:
-	mutex_enter(&entry->ote_ott->ott_lock);
-	entry->ote_ott->ott_ocount--;
-	cv_signal(&entry->ote_ott->ott_cond);
-	mutex_exit(&entry->ote_ott->ott_lock);
+	mutex_enter(&ott->ott_lock);
+	ott->ott_ocount--;
+	if (action == OTLDA_DELETE) {
+		/* overlay_target_entry_dtor() will free the mblk chain */
+		qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+	}
+
+	if (is_vl3)
+		qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+	else
+		qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+	cv_signal(&ott->ott_cond);
+	mutex_exit(&ott->ott_lock);
 
 	return (0);
 }
 
@@ -1083,31 +1901,35 @@ overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
 		    sizeof (overlay_target_point_t));
 	} else {
 		overlay_target_entry_t *ote;
-		ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
-		    otc->otc_entry.otce_mac);
-		if (ote != NULL) {
-			mutex_enter(&ote->ote_lock);
-			if ((ote->ote_flags &
-			    OVERLAY_ENTRY_F_VALID_MASK) != 0) {
-				if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
-					otc->otc_entry.otce_flags =
-					    OVERLAY_TARGET_CACHE_DROP;
-				} else {
-					otc->otc_entry.otce_flags = 0;
-					bcopy(&ote->ote_dest,
-					    &otc->otc_entry.otce_dest,
-					    sizeof (overlay_target_point_t));
-				}
-				ret = 0;
+
+		if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+		    &otc->otc_entry.otce_mac)) == NULL) {
+			ret = ENOENT;
+			goto done;
+		}
+
+		mutex_enter(&ote->ote_lock);
+		if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+			if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
+				otc->otc_entry.otce_flags =
+				    OVERLAY_TARGET_CACHE_DROP;
+			} else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+				otc->otc_entry.otce_flags =
+				    OVERLAY_TARGET_CACHE_ROUTER;
 			} else {
-				ret = ENOENT;
+				otc->otc_entry.otce_flags = 0;
+				bcopy(&ote->ote_u.ote_vl2.otvl2_dest,
+				    &otc->otc_entry.otce_dest,
+				    sizeof (overlay_target_point_t));
 			}
-			mutex_exit(&ote->ote_lock);
+			ret = 0;
 		} else {
 			ret = ENOENT;
 		}
+		mutex_exit(&ote->ote_lock);
 	}
+done:
 	mutex_exit(&ott->ott_lock);
 
 	overlay_hold_rele(odd);
@@ -1120,53 +1942,64 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
 {
 	overlay_dev_t *odd;
 	overlay_target_t *ott;
-	overlay_target_entry_t *ote;
+	overlay_target_entry_t *ote, *new = NULL;
 	overlay_targ_cache_t *otc = arg;
 	mblk_t *mp = NULL;
 
-	if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+	if (otc->otc_entry.otce_flags &
+	    ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
+		return (EINVAL);
+
+	if (otc->otc_entry.otce_flags ==
+	    (OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
 		return (EINVAL);
 
 	odd = overlay_hold_by_dlid(otc->otc_linkid);
 	if (odd == NULL)
 		return (ENOENT);
 
+	/*
+	 * Optimistically create the new entry. If it turns out not to be
+	 * needed, we'll free it. This ioctl shouldn't be called rapidly
+	 * enough for any potential alloc/free churn to cause a problem.
+ */ + new = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(&otc->otc_entry.otce_mac, &new->ote_u.ote_vl2.otvl2_mac, + sizeof (overlay_target_mac_t)); + mutex_enter(&odd->odd_lock); if (!(odd->odd_flags & OVERLAY_F_VARPD)) { mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); + overlay_target_entry_dtor(new); return (ENXIO); } ott = odd->odd_target; if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); + overlay_target_entry_dtor(new); return (ENOTSUP); } + + new->ote_ott = ott; + new->ote_odd = odd; + mutex_enter(&ott->ott_lock); mutex_exit(&odd->odd_lock); - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); - if (ote == NULL) { - ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); - bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); - ote->ote_chead = ote->ote_ctail = NULL; - ote->ote_mbsize = 0; - ote->ote_ott = ott; - ote->ote_odd = odd; - mutex_enter(&ote->ote_lock); - refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); - avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); - } else { - mutex_enter(&ote->ote_lock); - } + if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &otc->otc_entry.otce_mac)) == NULL) + ote = new; + mutex_enter(&ote->ote_lock); if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { ote->ote_flags |= OVERLAY_ENTRY_F_DROP; } else { ote->ote_flags |= OVERLAY_ENTRY_F_VALID; - bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER) + ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_u.ote_vl2.otvl2_dest, sizeof (overlay_target_point_t)); mp = ote->ote_chead; ote->ote_chead = NULL; @@ -1175,6 +2008,10 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) ote->ote_vtime = gethrtime(); } + if (ote == new) { + qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } mutex_exit(&ote->ote_lock); mutex_exit(&ott->ott_lock); @@ -1185,6 +2022,9 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) overlay_hold_rele(odd); + if (ote != new) + overlay_target_entry_dtor(new); + return (0); } @@ -1217,8 +2057,11 @@ overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) mutex_enter(&ott->ott_lock); mutex_exit(&odd->odd_lock); - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); + if (otc->otc_entry.otce_mac.otm_dcid == 0) + otc->otc_entry.otce_mac.otm_dcid = odd->odd_dcid; + + ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &otc->otc_entry.otce_mac); if (ote != NULL) { mutex_enter(&ote->ote_lock); ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; @@ -1269,8 +2112,13 @@ overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; mutex_exit(&ote->ote_lock); } - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); + + avl = &ott->ott_u.ott_dyn.ott_l3tree; + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } mutex_exit(&ott->ott_lock); overlay_hold_rele(odd); @@ -1304,9 +2152,10 @@ overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, } typedef struct overlay_targ_cache_marker { - uint8_t otcm_mac[ETHERADDRL]; + overlay_target_mac_t otcm_mac; uint16_t otcm_done; -} overlay_targ_cache_marker_t; +} overlay_targ_cache_marker_t __aligned(8); +CTASSERT(sizeof 
(overlay_targ_cache_marker_t) == 2 * sizeof (uint64_t));
 
 /* ARGSUSED */
 static int
@@ -1356,7 +2205,7 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
 
 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
 		overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
-		bzero(out->otce_mac, ETHERADDRL);
+		bzero(&out->otce_mac, sizeof (out->otce_mac));
 		out->otce_flags = 0;
 		bcopy(&ott->ott_u.ott_point, &out->otce_dest,
 		    sizeof (overlay_target_point_t));
@@ -1365,7 +2214,9 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
 	}
 
 	avl = &ott->ott_u.ott_dyn.ott_tree;
-	bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
+	lookup.ote_u.ote_vl2.otvl2_mac.otm_dcid = odd->odd_dcid;
+	bcopy(&mark->otcm_mac, &lookup.ote_u.ote_vl2.otvl2_mac,
+	    sizeof (mark->otcm_mac));
 	ent = avl_find(avl, &lookup, &where);
 
 	/*
@@ -1390,19 +2241,21 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
 			mutex_exit(&ent->ote_lock);
 			continue;
 		}
-		bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
+		bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &out->otce_mac,
+		    sizeof (out->otce_mac));
 		out->otce_flags = 0;
 		if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
 			out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
 		if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
-			bcopy(&ent->ote_dest, &out->otce_dest,
+			bcopy(&ent->ote_u.ote_vl2.otvl2_dest, &out->otce_dest,
 			    sizeof (overlay_target_point_t));
 		written++;
 		mutex_exit(&ent->ote_lock);
 	}
 
 	if (ent != NULL) {
-		bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
+		bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &mark->otcm_mac,
+		    sizeof (mark->otcm_mac));
 	} else {
 		mark->otcm_done = 1;
 	}
@@ -1432,6 +2285,122 @@ overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
 	return (0);
 }
 
+/*
+ * Take an IPv6 address + prefix length, and turn it into the network address.
+ * E.g. ::ffff:192.168.51.50/120 -> ::ffff:192.168.51.0
+ */
+static void
+overlay_in6_to_subnet(const struct in6_addr *src, struct in6_addr *dst,
+    uint8_t prefixlen)
+{
+	uint32_t val;
+
+	for (size_t i = 0; i < 4; i++) {
+		val = ntohl(src->_S6_un._S6_u32[i]);
+		val &= IN6_MASK_FROM_PREFIX(i, prefixlen);
+		dst->_S6_un._S6_u32[i] = htonl(val);
+	}
+}
+
+/*
+ * Find the first target entry whose source IP falls within the source subnet
+ * given by otcne. If no entries match, NULL is returned.
+ */
+static overlay_target_entry_t *
+overlay_target_cache_first_net(overlay_target_t *ott,
+    const overlay_targ_cache_net_entry_t *otcne)
+{
+	avl_tree_t *avl;
+	overlay_target_entry_t *ote;
+	struct in6_addr *start;
+	overlay_target_entry_t cmp = { 0 };
+	avl_index_t where = { 0 };
+
+	ASSERT(MUTEX_HELD(&ott->ott_lock));
+
+	avl = &ott->ott_u.ott_dyn.ott_l3tree;
+	start = &cmp.ote_u.ote_vl3.otvl3_src;
+
+	/*
+	 * The first possible source address for a subnet is the network
+	 * address (e.g. 192.168.10.0 for a /24). While it normally shouldn't
+	 * appear as an entry, we start either there or at the first entry
+	 * after where it would sort if present. This gives us the first
+	 * possible entry in the subnet; if that entry is not within the
+	 * subnet, we know no entries with that source subnet are present.
+	 */
+	overlay_in6_to_subnet(&otcne->otcne_src, start,
+	    otcne->otcne_src_prefixlen);
+
+	if ((ote = avl_find(avl, &cmp, &where)) == NULL)
+		ote = avl_nearest(avl, where, AVL_AFTER);
+
+	if (ote == NULL || !IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src,
+	    &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen))
+		return (NULL);
+
+	return (ote);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_remove_net(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_cache_net_t *otcn = arg;
+	overlay_targ_cache_net_entry_t *otcne = &otcn->otcn_entry;
+	overlay_dev_t *odd = NULL;
+	overlay_target_t *ott = NULL;
+	overlay_target_entry_t *ote = NULL, *ote_next = NULL;
+	avl_tree_t *avl = NULL;
+
+	odd = overlay_hold_by_dlid(otcn->otcn_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENXIO);
+	}
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+
+	avl = &ott->ott_u.ott_dyn.ott_l3tree;
+
+	for (ote = overlay_target_cache_first_net(ott, otcne);
+	    ote != NULL && IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src,
+	    &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen);
+	    ote = ote_next) {
+		ote_next = AVL_NEXT(avl, ote);
+
+		/*
+		 * Entries are sorted by src IP, dst IP, and src VLAN. There
+		 * can be entries from this src IP to destinations on subnets
+		 * other than the one we are removing; those need to be
+		 * skipped over.
+		 */
+		if (ote->ote_u.ote_vl3.otvl3_src_vlan != otcne->otcne_vlan)
			continue;
+
+		if (!IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_dst,
+		    &ote->ote_u.ote_vl3.otvl3_dst, otcne->otcne_dst_prefixlen))
+			continue;
+
+		qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, ote);
+	}
+
+	mutex_exit(&ott->ott_lock);
+	overlay_hold_rele(odd);
+	return (0);
+}
+
 static overlay_target_ioctl_t overlay_target_ioctab[] = {
 	{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
 	    NULL, overlay_target_info,
@@ -1492,6 +2461,9 @@ static overlay_target_ioctl_t overlay_target_ioctab[] = {
 	    overlay_target_cache_iter,
 	    overlay_target_cache_iter_copyout,
 	    sizeof (overlay_targ_cache_iter_t) },
+	{ OVERLAY_TARG_CACHE_REMOVE_NET, B_TRUE, B_TRUE,
+	    NULL, overlay_target_cache_remove_net,
+	    NULL, sizeof (overlay_targ_cache_net_t) },
 	{ 0 }
 };
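A note on the cache insertion pattern used in this patch: overlay_target_cache_set() and overlay_target_lookup_respond_vl3() both allocate a candidate entry before taking ott_lock, re-check the qqcache for a racing insert under the lock, and either publish the candidate or discard it in favor of the entry that won the race. The following userland sketch (not part of the patch; the list-based cache and all names are illustrative stand-ins for the real qqcache_t/avl pair) shows the same idiom:

/*
 * Minimal sketch of "allocate optimistically, re-check under the lock,
 * free the candidate on a race". All identifiers here are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

typedef struct entry {
	struct entry	*e_next;
	char		e_key[32];
	int		e_val;
} entry_t;

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static entry_t *cache_head;

static entry_t *
cache_lookup_locked(const char *key)
{
	entry_t *e;

	for (e = cache_head; e != NULL; e = e->e_next) {
		if (strcmp(e->e_key, key) == 0)
			return (e);
	}
	return (NULL);
}

static entry_t *
cache_set(const char *key, int val)
{
	entry_t *e, *new;

	/* Allocate without holding the lock (the KM_SLEEP analogue). */
	if ((new = calloc(1, sizeof (entry_t))) == NULL)
		return (NULL);
	(void) snprintf(new->e_key, sizeof (new->e_key), "%s", key);

	pthread_mutex_lock(&cache_lock);
	if ((e = cache_lookup_locked(key)) == NULL) {
		/* No racing insert; publish the candidate entry. */
		new->e_next = cache_head;
		cache_head = e = new;
		new = NULL;
	}
	e->e_val = val;
	pthread_mutex_unlock(&cache_lock);

	/* Discard the candidate if an entry already existed (free(NULL) is a no-op). */
	free(new);
	return (e);
}

int
main(void)
{
	(void) cache_set("de:ad:be:ef:00:01", 1);	/* publishes a new entry */
	(void) cache_set("de:ad:be:ef:00:01", 2);	/* finds it; candidate is freed */
	return (0);
}

The benefit of this shape is that the allocation (which may sleep or fail) happens outside the lock, so the lock is only held for the lookup and a few pointer updates.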

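For reference, the prefix masking performed by overlay_in6_to_subnet() above can be reproduced in userland without the illumos-private _S6_un._S6_u32 view and the IN6_MASK_FROM_PREFIX() macro. This is a minimal sketch assuming only standard POSIX interfaces; in6_to_subnet() and its byte-wise mask derivation are illustrative stand-ins, not the kernel implementation:

#include <stdio.h>
#include <stdint.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Mask an IPv6 address down to its network address for a given prefix. */
static void
in6_to_subnet(const struct in6_addr *src, struct in6_addr *dst,
    uint8_t prefixlen)
{
	for (size_t i = 0; i < 16; i++) {
		uint8_t mask;

		if (prefixlen >= (i + 1) * 8)
			mask = 0xff;	/* byte entirely inside the prefix */
		else if (prefixlen <= i * 8)
			mask = 0;	/* byte entirely outside the prefix */
		else
			mask = 0xff << (8 - (prefixlen - i * 8));
		dst->s6_addr[i] = src->s6_addr[i] & mask;
	}
}

int
main(void)
{
	struct in6_addr src, net;
	char buf[INET6_ADDRSTRLEN];

	/* The example from the comment: ::ffff:192.168.51.50/120 */
	(void) inet_pton(AF_INET6, "::ffff:192.168.51.50", &src);
	in6_to_subnet(&src, &net, 120);
	(void) inet_ntop(AF_INET6, &net, buf, sizeof (buf));
	(void) printf("%s\n", buf);
	return (0);
}

Compiled and run, this should print ::ffff:192.168.51.0, matching the example in the kernel comment.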