Diffstat (limited to 'usr/src/uts/common')
-rw-r--r--  usr/src/uts/common/Makefile.files              |    1
-rw-r--r--  usr/src/uts/common/Makefile.rules              |   12
-rw-r--r--  usr/src/uts/common/io/overlay/overlay.c        |  452
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_mux.c    |   15
-rw-r--r--  usr/src/uts/common/io/overlay/overlay_target.c | 1290
-rw-r--r--  usr/src/uts/common/qqcache/qqcache.c           |  444
-rw-r--r--  usr/src/uts/common/sys/Makefile                |    4
-rw-r--r--  usr/src/uts/common/sys/ethernet.h              |   14
-rw-r--r--  usr/src/uts/common/sys/overlay.h               |    4
-rw-r--r--  usr/src/uts/common/sys/overlay_common.h        |    3
-rw-r--r--  usr/src/uts/common/sys/overlay_impl.h          |   57
-rw-r--r--  usr/src/uts/common/sys/overlay_target.h        |   88
-rw-r--r--  usr/src/uts/common/sys/qqcache.h               |  176
-rw-r--r--  usr/src/uts/common/sys/qqcache_impl.h          |   72
14 files changed, 2419 insertions, 213 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 752fe56100..e973cf58ad 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -299,6 +299,7 @@ GENUNIX_OBJS += \ resolvepath.o \ retire_store.o \ process.o \ + qqcache.o \ rlimit.o \ rmap.o \ rw.o \ diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index a32c094f3b..ba8945b6fb 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -26,6 +26,7 @@ # Copyright 2020 Joyent, Inc. # Copyright 2018 Nexenta Systems, Inc. # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2018 Joyent, Inc. # Copyright 2020 Oxide Computer Company # @@ -1627,6 +1628,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/pcmcia/pcs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/qqcache/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/refhash/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -2778,6 +2787,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/nvpair/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/os/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/qqcache/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/rpc/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c index 2ad3f4f591..c54a6e0d9c 100644 --- a/usr/src/uts/common/io/overlay/overlay.c +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -134,6 +134,40 @@ * be sent to. In addition, they handle questions related to how to handle * things like broadcast and multicast traffic, etc. * + * ROUTING + * + * Supporting routing of packets between VLANs that exist on an overlay + * network requires two major changes. First, packets destined for off-VLAN + * destinations need to be identified. Second, we must obtain the + * additional information necessary to deliver the packet to its off-VLAN + * destination. + * + * To solve the first issue, we utilize the existing IP routing functionality. + * Off-VLAN destinations are given routes with next hops in the originating + * netstack's routing table--just like in physical networks. The system will + * then attempt to generate an ARP query, which will be sent out to varpd in + * the exact same manner as is described above for on-VLAN destinations. + * The response will include a MAC address that is both used for the + * ARP reply and added to our VL2 MAC->UL3 hash table, though it is added + * with the OVERLAY_ENTRY_F_ROUTER flag set. Once this is done, the originating + * netstack will send off-VLAN packets to this router MAC, allowing the + * overlay device to identify these packets as requiring routing. + * + * Once packets with an off-VLAN destination are identified, we must determine + * what the destination vid, VL2 VLAN, and VL2 MAC values are for the given + * packet. For reasons similar to the VL2 MAC->UL3 lookup described above, + * we utilize the flexibility of userland to perform these lookups (also + * using varpd). In this instance we are attempting to find a mapping from + * the destination VL3 IP to a UL3 IP (a few extra bits of information are + * necessary to disambiguate the destination VL3 IP in situations such as + * mirroring a production environment, including its VL3 IPs, in an isolated + * set of VLANs).
We then store these results in a VL3->UL3 hash table for future + * lookups. + * + * To prevent the size of both the VL2->UL3 and VL3->UL3 hash tables from + * growing without bound, we cap the number of entries in each hash table and + * utilize the 2Q algorithm to manage their contents. + * * ---------- * Properties * ---------- @@ -205,6 +239,10 @@ * UTF-8. Note that the size of the string includes the null * terminator. * + * OVERLAY_PROP_T_ETHER + * + * An ether_addr_t, which has a fixed size. + * * The next thing that we apply to a property is its permission. The permissions * are put together by the bitwise or of the following flags and values. * @@ -461,7 +499,7 @@ * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things * are much more interesting and as a result, more complicated. We primarily * store lists of overlay_target_entry_t's which are stored in both an avl tree - * and a refhash_t. The primary look up path uses the refhash_t and the avl tree + * and a qqcache_t. The primary lookup path uses the qqcache_t and the avl tree * is only used for a few of the target ioctls used to dump data such that we * can get a consistent iteration order for things like dladm show-overlay -t. * The key that we use for the reference hashtable is based on the mac address @@ -486,6 +524,28 @@ * any outstanding data to that place. The full story on how we look that up * is discussed in the section on the Target Cache Lifecycle. + * + * For routing, everything works largely the same as it does in the non-routing + * situations. The major differences are that the target cache is always + * an OVERLAY_TARGET_DYNAMIC cache, and that an additional hash table lookup + * occurs. When a routed packet is sent down stack, the + * overlay_target_entry_t in the VL2 cache will have its + * OVERLAY_ENTRY_F_ROUTER flag set, which will prompt a lookup in the VL3->UL3 + * cache (using the source VL3, source VL2 VLAN, and destination VL3 values + * from the packet as the lookup key). The entry returned from the cache is + * used to modify the source and destination VL2 MAC addresses as well as + * the VL2 VLAN ID; the packet is then encapsulated and sent to its UL3 + * destination. + * On reception, decapsulation happens exactly the same as in the non-routed + * case, and the packet appears just as if it had been sent out the VL2 network + * from a router connected to it. This is done both to maintain the illusion + * of a physical network when sniffing packets at the instance level, and + * so that the mac layer sitting above the destination's overlay device + * (for the vnic created over the overlay) does not discard the packet because + * its VLAN tag does not match the VLAN tag of the destination VNIC. While + * some of these modifications could be split between the source and + * destination hosts, doing the work on the source maximizes any + * potential parallelism that might be present from multiple flows to a given + * destination.
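+ *
+ * As a concrete sketch of how a consumer drives one of these 2Q caches
+ * (an illustrative example built from the qqcache interfaces this patch
+ * introduces, not a verbatim copy of the driver's code), the VL2 cache
+ * is created and used roughly as follows:
+ *
+ *	qqcache_t *qc;
+ *	overlay_target_entry_t *entry;
+ *
+ *	VERIFY0(qqcache_create(&qc, OVERLAY_VL2_CACHE_DEF,
+ *	    OVERLAY_CACHE_A_DEF, OVERLAY_HSIZE, overlay_mac_hash,
+ *	    overlay_mac_cmp, overlay_target_entry_l2qq_dtor,
+ *	    sizeof (overlay_target_entry_t),
+ *	    offsetof(overlay_target_entry_t, ote_reflink),
+ *	    offsetof(overlay_target_entry_t, ote_u.ote_vl2.otvl2_mac),
+ *	    KM_SLEEP));
+ *
+ *	entry = qqcache_lookup(qc, &mac);
+ *	if (entry == NULL) {
+ *		entry = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
+ *		... initialize entry ...
+ *		qqcache_insert(qc, entry);	(may evict a cold entry)
+ *	}
+ *	qqcache_hold(qc, entry);	(pin the entry across use)
+ *	... use entry ...
+ *	qqcache_rele(qc, entry);
+ *
+ * A cache's maximum size and its 2Q 'a' parameter (the percentage of
+ * entries kept on the MFU list) can later be tuned via
+ * qqcache_adjust_size() and qqcache_adjust_a(), which is how the
+ * vl2_cache_size/_vl2_cache_a and route_cache_size/_route_cache_a
+ * properties described below take effect on a live device.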
+ * + * ------------------------ * FMA and Degraded Devices * ------------------------ @@ -830,21 +890,62 @@ typedef enum overlay_dev_prop { OVERLAY_DEV_P_MTU = 0, OVERLAY_DEV_P_VNETID, OVERLAY_DEV_P_ENCAP, - OVERLAY_DEV_P_VARPDID + OVERLAY_DEV_P_VARPDID, + OVERLAY_DEV_P_DCID, + OVERLAY_DEV_P_VL2_CACHE_SIZE, + OVERLAY_DEV_P_VL2_CACHE_A, + OVERLAY_DEV_P_ROUTE_CACHE_SIZE, + OVERLAY_DEV_P_ROUTE_CACHE_A } overlay_dev_prop_t; -#define OVERLAY_DEV_NPROPS 4 +#define OVERLAY_DEV_NPROPS 9 static const char *overlay_dev_props[] = { "mtu", "vnetid", "encap", - "varpd/id" + "varpd/id", + "dcid", + "vl2_cache_size", + "_vl2_cache_a", + "route_cache_size", + "_route_cache_a" }; +/* properties that can be changed live */ +static boolean_t overlay_dev_liveprop[] = { + B_FALSE, /* mtu */ + B_FALSE, /* vnetid */ + B_FALSE, /* encap */ + B_FALSE, /* varpd/id */ + B_FALSE, /* dcid */ + B_TRUE, /* vl2_cache_size */ + B_TRUE, /* _vl2_cache_a */ + B_TRUE, /* route_cache_size */ + B_TRUE /* _route_cache_a */ +}; + +CTASSERT(ARRAY_SIZE(overlay_dev_props) == OVERLAY_DEV_NPROPS); +CTASSERT(ARRAY_SIZE(overlay_dev_liveprop) == OVERLAY_DEV_NPROPS); + #define OVERLAY_MTU_MIN 576 #define OVERLAY_MTU_DEF 1400 #define OVERLAY_MTU_MAX 8900 +/* The 2Q parameter 'a' is a percentage */ +#define OVERLAY_CACHE_MAX_A 100 +/* A somewhat arbitrary default, biasing towards storing more MFU entries */ +#define OVERLAY_CACHE_A_DEF 75 + +/* Somewhat arbitrary min and max values */ +#define OVERLAY_VL2_CACHE_MIN 256 +#define OVERLAY_VL2_CACHE_MAX 10240 +#define OVERLAY_VL2_CACHE_DEF OVERLAY_VL2_CACHE_MIN + +/* Somewhat arbitrary min and max values */ +#define OVERLAY_ROUTE_CACHE_MIN 256 +#define OVERLAY_ROUTE_CACHE_MAX 10240 +#define OVERLAY_ROUTE_CACHE_DEF OVERLAY_ROUTE_CACHE_MIN + overlay_dev_t * overlay_hold_by_dlid(datalink_id_t id) { @@ -1066,7 +1167,6 @@ overlay_m_tx(void *arg, mblk_t *mp_chain) bzero(&hdr, sizeof (struct msghdr)); bzero(&einfo, sizeof (ovep_encap_info_t)); - einfo.ovdi_id = odd->odd_vid; mp = mp_chain; while (mp != NULL) { socklen_t slen; @@ -1077,7 +1177,7 @@ overlay_m_tx(void *arg, mblk_t *mp_chain) ep = NULL; ret = overlay_target_lookup(odd, mp, - (struct sockaddr *)&storage, &slen); + (struct sockaddr *)&storage, &slen, &einfo.ovdi_id); if (ret != OVERLAY_TARGET_OK) { if (ret == OVERLAY_TARGET_DROP) freemsg(mp); @@ -1260,6 +1360,19 @@ overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) } odd->odd_vid = oicp->oic_vnetid; + if (oicp->oic_dcid > UINT32_MAX) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + odd->odd_dcid = oicp->oic_dcid; + + odd->odd_vl2sz = OVERLAY_VL2_CACHE_DEF; + odd->odd_vl2a = OVERLAY_CACHE_A_DEF; + odd->odd_routesz = OVERLAY_ROUTE_CACHE_DEF; + odd->odd_routea = OVERLAY_CACHE_A_DEF; + mac = mac_alloc(MAC_VERSION); if (mac == NULL) { mutex_exit(&overlay_dev_lock); @@ -1613,6 +1726,7 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, int ret; mac_perim_handle_t mph; uint_t propid = UINT_MAX; + uint32_t def; overlay_ioc_propinfo_t *oip = karg; overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; @@ -1695,6 +1809,42 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); overlay_prop_set_nodefault(phdl); break; + case OVERLAY_DEV_P_DCID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + 
overlay_prop_set_nodefault(phdl); + overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + def = OVERLAY_VL2_CACHE_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, OVERLAY_VL2_CACHE_MIN, + OVERLAY_VL2_CACHE_MAX); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + def = OVERLAY_CACHE_A_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + def = OVERLAY_ROUTE_CACHE_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, OVERLAY_ROUTE_CACHE_MIN, + OVERLAY_ROUTE_CACHE_MAX); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + def = OVERLAY_CACHE_A_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A); + break; default: overlay_hold_rele(odd); mac_perim_exit(mph); @@ -1804,6 +1954,41 @@ overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, } mutex_exit(&odd->odd_lock); break; + case OVERLAY_DEV_P_DCID: + /* + * While it's read-only once inside of a mux, we're not in a + * context that can guarantee that. Therefore we always grab the + * overlay_dev_t's odd_lock. + */ + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vl2sz, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vl2a, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_routesz, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_routea, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; default: ret = ENOENT; } @@ -1845,6 +2030,146 @@ overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) mutex_exit(&odd->odd_lock); } +static void +overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid) +{ + mutex_enter(&odd->odd_lock); + + /* Simple case, not active */ + if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { + odd->odd_dcid = dcid; + mutex_exit(&odd->odd_lock); + return; + } + + /* + * In the hard case, we need to set the drop flag and quiesce I/O; then + * we can go ahead and do everything.
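+ * (This mirrors the quiesce/remove/update/re-add sequence used by
+ * overlay_setprop_vnetid() above.)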
+ */ + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + mutex_enter(&odd->odd_lock); + odd->odd_dcid = dcid; + mutex_exit(&odd->odd_lock); + overlay_mux_add_dev(odd->odd_mux, odd); + + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); + odd->odd_flags &= ~OVERLAY_F_MDDROP; + mutex_exit(&odd->odd_lock); +} + +static int +overlay_setprop_vl2_cachesz(overlay_dev_t *odd, uint32_t sz) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + if (sz == 0) + sz = OVERLAY_VL2_CACHE_DEF; + + /* Caller should have validated this */ + ASSERT3U(sz, >=, OVERLAY_VL2_CACHE_MIN); + ASSERT3U(sz, <=, OVERLAY_VL2_CACHE_MAX); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_dhash, sz); + mutex_exit(&ott->ott_lock); + } + + if (ret == 0) + odd->odd_vl2sz = sz; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_vl2_cachea(overlay_dev_t *odd, uint32_t a) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + /* Caller should have validated this */ + ASSERT3U(a, <=, 100); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_dhash, a); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_vl2a = a; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_route_cachesz(overlay_dev_t *odd, uint32_t sz) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + if (sz == 0) + sz = OVERLAY_ROUTE_CACHE_DEF; + + ASSERT3U(sz, >=, OVERLAY_ROUTE_CACHE_MIN); + ASSERT3U(sz, <=, OVERLAY_ROUTE_CACHE_MAX); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_l3dhash, sz); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_routesz = sz; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_route_cachea(overlay_dev_t *odd, uint32_t a) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + /* Caller should have validated this */ + ASSERT3U(a, <=, 100); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_l3dhash, a); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_routea = a; + mutex_exit(&odd->odd_lock); + + return (ret); +} + /* ARGSUSED */ static int overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, @@ -1855,7 +2180,7 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, overlay_ioc_prop_t *oip = karg; uint_t propid = UINT_MAX; mac_perim_handle_t mph; - uint64_t maxid, *vidp; + uint64_t maxid, *vidp, *dcidp, *vl2szp, *vl2ap, *routeszp, *routeap; if (oip->oip_size > OVERLAY_PROP_SIZEMAX) return (EINVAL); @@ -1865,31 +2190,48 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, return (ENOENT); oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; - mac_perim_enter_by_mh(odd->odd_mh, 
&mph); - mutex_enter(&odd->odd_lock); - if (odd->odd_flags & OVERLAY_F_ACTIVATED) { - mac_perim_exit(mph); - mutex_exit(&odd->odd_lock); - return (ENOTSUP); - } - mutex_exit(&odd->odd_lock); + + /* + * Currently, only certain overlay properties (and no encapsulation + * properties) can be changed while the overlay device is active. + */ if (oip->oip_id == -1) { int i; for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) break; - if (i == OVERLAY_DEV_NPROPS) { - ret = odd->odd_plugin->ovp_ops->ovpo_setprop( - odd->odd_pvoid, oip->oip_name, - oip->oip_value, oip->oip_size); - overlay_hold_rele(odd); - mac_perim_exit(mph); - return (ret); - } } - propid = i; + if (i < OVERLAY_DEV_NPROPS) + propid = i; + } else if (oip->oip_id < OVERLAY_DEV_NPROPS) { + propid = oip->oip_id; + } + + /* + * A bit tricky, but propid is initialized to UINT_MAX, so we know we + * have an overlay property whenever propid < OVERLAY_DEV_NPROPS, + * otherwise we have a plugin property. + */ + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_ACTIVATED) && + ((propid >= OVERLAY_DEV_NPROPS || !overlay_dev_liveprop[propid]))) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_exit(&odd->odd_lock); + + if (oip->oip_id == -1 && propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_setprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, oip->oip_size); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; @@ -1941,6 +2283,68 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, case OVERLAY_DEV_P_VARPDID: ret = EPERM; break; + case OVERLAY_DEV_P_DCID: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + dcidp = (uint64_t *)oip->oip_value; + if (*dcidp > UINT32_MAX) { + ret = EINVAL; + break; + } + overlay_setprop_dcid(odd, *dcidp); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vl2szp = (uint64_t *)oip->oip_value; + if (*vl2szp != 0 && (*vl2szp < OVERLAY_VL2_CACHE_MIN || + *vl2szp > OVERLAY_VL2_CACHE_MAX)) { + ret = EINVAL; + break; + } + ret = overlay_setprop_vl2_cachesz(odd, *vl2szp); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vl2ap = (uint64_t *)oip->oip_value; + if (*vl2ap > OVERLAY_CACHE_MAX_A) { + ret = EINVAL; + break; + } + ret = overlay_setprop_vl2_cachea(odd, *vl2ap); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + routeszp = (uint64_t *)oip->oip_value; + if (*routeszp != 0 && (*routeszp < OVERLAY_ROUTE_CACHE_MIN || + *routeszp > OVERLAY_ROUTE_CACHE_MAX)) { + ret = EINVAL; + break; + } + ret = overlay_setprop_route_cachesz(odd, *routeszp); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + routeap = (uint64_t *)oip->oip_value; + if (*routeap > OVERLAY_CACHE_MAX_A) { + ret = EINVAL; + break; + } + ret = overlay_setprop_route_cachea(odd, *routeap); + break; default: ret = ENOENT; } diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c index 0c21bb8689..2688d2791c 100644 --- a/usr/src/uts/common/io/overlay/overlay_mux.c +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ 
-127,6 +127,18 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, freeb(fmp); /* + * Looped-back vxlan packets tend to have a + * prepended IP+UDP-only mblk, followed by the data. Parsing + * will have made that mblk a zero-length one (rptr == wptr). + */ + if (mp->b_rptr == mp->b_wptr && mp->b_cont != NULL) { + /* Ended up with zero-length mblk, lose it! */ + fmp = mp; + mp = fmp->b_cont; + freeb(fmp); + } + + /* * Until we have VXLAN-or-other-decap HW acceleration support * (e.g. we support NICs that reach into VXLAN-encapsulated * packets and check the inside-VXLAN IP packets' checksums, @@ -161,10 +173,9 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, if (rem == blkl) { fmp = mp; mp = fmp->b_cont; - fmp->b_cont = NULL; OVERLAY_FREEMSG(mp, "freed a fmp block"); - freemsg(fmp); + freeb(fmp); } } if (mp == NULL) { diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c index f4147b56d1..171acd034f 100644 --- a/usr/src/uts/common/io/overlay/overlay_target.c +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -20,6 +20,8 @@ * uts/common/io/overlay/overlay.c */ +#include <inet/ip.h> +#include <inet/ip6.h> #include <sys/types.h> #include <sys/ethernet.h> #include <sys/kmem.h> @@ -42,6 +44,9 @@ #include <sys/overlay_impl.h> #include <sys/sdt.h> +#define OVERLAY_DROP(mp, reason) \ + DTRACE_PROBE2(overlay__drop, mblk_t *, mp, char *, reason) + /* * This is total straw man, but at least it's a prime number. Here we're * going to have to go through and do a lot of evaluation and understanding as * to how this should work and we should probably consider making it a tunable. */ #define OVERLAY_HSIZE 823 /* + * The default size of each target cache. This is also a complete strawman + * whose value could change as we gain better operational experience with + * overlay routing. + */ +#define OVERLAY_CACHE_SIZE 512 + +/* + * A somewhat arbitrary value. The percentage of the target cache dedicated + * to MFU entries (i.e. entries that have been looked up more than once). + */ +#define OVERLAY_CACHE_A 60 + +/* * We use this data structure to keep track of what requests have been actively * allocated to a given instance so we know what to put back on the pending * list. */ typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int); typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *); typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int); -typedef struct overaly_target_ioctl { +typedef struct overlay_target_ioctl { int oti_cmd; /* ioctl id */ boolean_t oti_write; /* ioctl requires FWRITE */ boolean_t oti_ncopyout; /* copyout data? 
*/ @@ -144,25 +162,60 @@ overlay_entry_cache_destructor(void *buf, void *arg) static uint64_t overlay_mac_hash(const void *v) { + const overlay_target_mac_t *m = v; + uint32_t crc; - CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + CRC32(crc, m->otm_mac, ETHERADDRL, -1U, crc32_table); + CRC32(crc, &m->otm_dcid, sizeof (uint32_t), crc, crc32_table); return (crc); } static int overlay_mac_cmp(const void *a, const void *b) { - return (bcmp(a, b, ETHERADDRL)); + const overlay_target_mac_t *l = a; + const overlay_target_mac_t *r = b; + + if (l->otm_dcid != r->otm_dcid) + return (1); + return (bcmp(l->otm_mac, r->otm_mac, ETHERADDRL) != 0); +} + +static uint64_t +overlay_ip_hash(const void *v) +{ + const overlay_target_vl3_t *vl3 = v; + + uint32_t crc; + CRC32(crc, &vl3->otvl3_src, sizeof (vl3->otvl3_src), -1U, crc32_table); + CRC32(crc, &vl3->otvl3_dst, sizeof (vl3->otvl3_dst), crc, crc32_table); + CRC32(crc, &vl3->otvl3_src_vlan, sizeof (vl3->otvl3_src_vlan), crc, + crc32_table); + return (crc); +} + +static int +overlay_ip_cmp(const void *a, const void *b) +{ + const overlay_target_vl3_t *l = a; + const overlay_target_vl3_t *r = b; + + if (l->otvl3_src_vlan != r->otvl3_src_vlan) + return (1); + if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_src, &r->otvl3_src)) + return (1); + if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_dst, &r->otvl3_dst)) + return (1); + return (0); } -/* ARGSUSED */ static void overlay_target_entry_dtor(void *arg) { overlay_target_entry_t *ote = arg; ote->ote_flags = 0; - bzero(ote->ote_addr, ETHERADDRL); + bzero(&ote->ote_u, sizeof (ote->ote_u)); ote->ote_ott = NULL; ote->ote_odd = NULL; freemsgchain(ote->ote_chead); @@ -172,21 +225,76 @@ overlay_target_entry_dtor(void *arg) kmem_cache_free(overlay_entry_cache, ote); } +static void +overlay_target_entry_l2qq_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + overlay_target_t *ott = ote->ote_ott; + + ASSERT(MUTEX_HELD(&ott->ott_lock)); + ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==, 0); + + avl_remove(&ott->ott_u.ott_dyn.ott_tree, ote); + overlay_target_entry_dtor(ote); +} + +static void +overlay_target_entry_l3qq_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + overlay_target_t *ott = ote->ote_ott; + + ASSERT(MUTEX_HELD(&ott->ott_lock)); + ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==, + OVERLAY_ENTRY_F_VL3); + + avl_remove(&ott->ott_u.ott_dyn.ott_l3tree, ote); + overlay_target_entry_dtor(ote); +} + static int overlay_mac_avl(const void *a, const void *b) { + const overlay_target_entry_t *le = a; + const overlay_target_entry_t *re = b; + const overlay_target_mac_t *lm = &le->ote_u.ote_vl2.otvl2_mac; + const overlay_target_mac_t *rm = &re->ote_u.ote_vl2.otvl2_mac; int i; - const overlay_target_entry_t *l, *r; - l = a; - r = b; + + /* Order by DCID, then MAC */ + if (lm->otm_dcid < rm->otm_dcid) + return (-1); + if (lm->otm_dcid > rm->otm_dcid) + return (1); for (i = 0; i < ETHERADDRL; i++) { - if (l->ote_addr[i] > r->ote_addr[i]) + if (lm->otm_mac[i] > rm->otm_mac[i]) return (1); - else if (l->ote_addr[i] < r->ote_addr[i]) + else if (lm->otm_mac[i] < rm->otm_mac[i]) return (-1); } + return (0); +} +static int +overlay_ip_avl(const void *a, const void *b) +{ + const overlay_target_entry_t *l = a; + const overlay_target_entry_t *r = b; + const overlay_target_vl3_t *l_vl3 = &l->ote_u.ote_vl3; + const overlay_target_vl3_t *r_vl3 = &r->ote_u.ote_vl3; + int ret; + + if ((ret = memcmp(&l_vl3->otvl3_src, &r_vl3->otvl3_src, + sizeof (l_vl3->otvl3_src))) != 0) + return (ret < 0 ? 
-1 : 1); + if ((ret = memcmp(&l_vl3->otvl3_dst, &r_vl3->otvl3_dst, + sizeof (l_vl3->otvl3_dst))) != 0) + return (ret < 0 ? -1 : 1); + if (l_vl3->otvl3_src_vlan < r_vl3->otvl3_src_vlan) + return (-1); + if (l_vl3->otvl3_src_vlan > r_vl3->otvl3_src_vlan) + return (1); return (0); } @@ -233,25 +341,20 @@ overlay_target_free(overlay_dev_t *odd) return; if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { - refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; - avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; - overlay_target_entry_t *ote; - + mutex_enter(&odd->odd_target->ott_lock); /* - * Our AVL tree and hashtable contain the same elements, - * therefore we should just remove it from the tree, but then - * delete the entries when we remove them from the hash table - * (which happens through the refhash dtor). + * Each AVL tree and its corresponding 2Q cache contain the + * same elements. Additionally, when an entry is removed from + * a 2Q cache, the entry is removed from the corresponding AVL + * tree. Deleting a 2Q cache will destroy any remaining + * entries, so all we need to do is destroy the 2Q caches. */ - while ((ote = avl_first(ap)) != NULL) - avl_remove(ap, ote); - - avl_destroy(ap); - for (ote = refhash_first(rp); ote != NULL; - ote = refhash_next(rp, ote)) { - refhash_remove(rp, ote); - } - refhash_destroy(rp); + qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_dhash); + qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_l3dhash); + ASSERT(avl_is_empty(&odd->odd_target->ott_u.ott_dyn.ott_tree)); + ASSERT(avl_is_empty( + &odd->odd_target->ott_u.ott_dyn.ott_l3tree)); + mutex_exit(&odd->odd_target->ott_lock); } ASSERT(odd->odd_target->ott_ocount == 0); @@ -270,18 +373,42 @@ overlay_target_busy() return (ret); } +/* + * Queue the target entry on the list of varpd requests. entry should be + * refheld for the duration of this call (this call takes its own additional + * hold that is released when we receive a response). + */ static void overlay_target_queue(overlay_target_entry_t *entry) { + overlay_target_t *ott = entry->ote_ott; + boolean_t is_vl3 = B_FALSE; + + /* + * ote_ott is read-only and set at entry creation, so it can be + * read without ote_lock held + */ + ASSERT(!MUTEX_HELD(&entry->ote_lock)); + + mutex_enter(&entry->ote_lock); + if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0) + is_vl3 = B_TRUE; + mutex_exit(&entry->ote_lock); + mutex_enter(&overlay_target_lock); - mutex_enter(&entry->ote_ott->ott_lock); - if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) { - mutex_exit(&entry->ote_ott->ott_lock); + mutex_enter(&ott->ott_lock); + if (ott->ott_flags & OVERLAY_T_TEARDOWN) { + mutex_exit(&ott->ott_lock); + mutex_exit(&overlay_target_lock); return; } - entry->ote_ott->ott_ocount++; - mutex_exit(&entry->ote_ott->ott_lock); + ott->ott_ocount++; + if (is_vl3) + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + else + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + + mutex_exit(&ott->ott_lock); list_insert_tail(&overlay_target_list, entry); cv_signal(&overlay_target_condvar); mutex_exit(&overlay_target_lock); @@ -300,22 +427,446 @@ overlay_target_quiesce(overlay_target_t *ott) } /* - * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP | + * Write the VL3 src/dst IP from the packet in mp into src and dst. If the + * addresses are IPv4 addresses, they are written as mapped addresses. 
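+ *
+ * For example (purely illustrative): an IPv4 source of 10.1.2.3 is
+ * written as the IPv4-mapped IPv6 address ::ffff:10.1.2.3 (via
+ * IN6_IPADDR_TO_V4MAPPED() below), so the VL3 cache can key off a single
+ * struct in6_addr representation for both address families.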
+ */ +static int +overlay_get_vl3_ips(mblk_t *mp, struct in6_addr *src, struct in6_addr *dst) +{ + uint16_t sap; + +#if 1 + /* Temporary until mblk helpers are integrated */ + struct ether_vlan_header *eth = (struct ether_vlan_header *)mp->b_rptr; + ipha_t *iphp = (ipha_t *)(eth + 1); + ip6_t *ip6hp = (ip6_t *)(eth + 1); + size_t mlen = MBLKL(mp); + + if (mlen < sizeof (struct ether_vlan_header)) + return (EINVAL); + mlen -= sizeof (struct ether_vlan_header); + + /* We currently don't support routing of untagged frames */ + if ((sap = ntohs(eth->ether_tpid)) != ETHERTYPE_VLAN) + return (EINVAL); + + sap = ntohs(eth->ether_type); + if (mlen == 0) { + if ((mp = mp->b_cont) == NULL) + return (EINVAL); + mlen = MBLKL(mp); + iphp = (ipha_t *)mp->b_rptr; + ip6hp = (ip6_t *)mp->b_rptr; + } + + switch (sap) { + case ETHERTYPE_IP: + if (mlen < sizeof (ipha_t)) + return (EINVAL); + ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV4_VERSION); + IN6_IPADDR_TO_V4MAPPED(iphp->ipha_src, src); + IN6_IPADDR_TO_V4MAPPED(iphp->ipha_dst, dst); + break; + case ETHERTYPE_IPV6: + if (mlen < sizeof (ip6_t)) + return (EINVAL); + ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV6_VERSION); + bcopy(&ip6hp->ip6_src, src, sizeof (*src)); + bcopy(&ip6hp->ip6_dst, dst, sizeof (*dst)); + break; + default: + return (EINVAL); + } + + return (0); +#else + size_t soff, doff; + uint32_t v4s, v4d; + int i; + + if (!mblk_read_uint16(mp, offsetof(struct ether_header, ether_type), + &sap)) + return (EINVAL); + + if (sap == ETHERTYPE_VLAN) { + if (!mblk_read_uint16(mp, + offsetof(struct ether_vlan_header, ether_type), &sap)) + return (EINVAL); + soff = doff = sizeof (struct ether_vlan_header); + } else { + soff = doff = sizeof (struct ether_header); + } + + switch (sap) { + case ETHERTYPE_IP: + soff += offsetof(ipha_t, ipha_src); + doff += offsetof(ipha_t, ipha_dst); + + if (!mblk_read_uint32(mp, soff, &v4s) || + !mblk_read_uint32(mp, doff, &v4d)) + return (EINVAL); + IN6_IPADDR_TO_V4MAPPED(v4s, src); + IN6_IPADDR_TO_V4MAPPED(v4d, dst); + break; + case ETHERTYPE_IPV6: + soff += offsetof(ip6_t, ip6_src); + doff += offsetof(ip6_t, ip6_dst); + + for (i = 0; i < 4; i++) { + if (!mblk_read_uint32(mp, soff, &src->s6_addr32[i]) || + !mblk_read_uint32(mp, doff, &dst->s6_addr32[i])) + return (EINVAL); + soff += sizeof (uint32_t); + doff += sizeof (uint32_t); + } + break; + default: + return (EINVAL); + } + + return (0); +#endif +} + +static int +overlay_route(overlay_dev_t *odd, mblk_t *mp, + const overlay_target_route_t *route, const overlay_target_mac_t *dst_mac) +{ + uint16_t tci; + + if (MBLKL(mp) >= sizeof (struct ether_vlan_header)) { + struct ether_vlan_header *evh; + + evh = (struct ether_vlan_header *)mp->b_rptr; + tci = ntohs(evh->ether_tci); + + /* + * Today we require all encapsulated frames to be vlan tagged. + * If this is relaxed in the future, we will need to allow for + * insertion and removal of the vlan tag as appropriate here. 
+ */ + if (ntohs(evh->ether_tpid) != ETHERTYPE_VLAN) { + OVERLAY_DROP(mp, "not vlan tagged"); + return (OVERLAY_TARGET_DROP); + } + + tci &= ~(VLAN_ID_MASK); + tci |= route->otr_vlan; + evh->ether_tci = htons(tci); + bcopy(dst_mac->otm_mac, &evh->ether_dhost, ETHERADDRL); + bcopy(route->otr_srcmac, &evh->ether_shost, ETHERADDRL); + return (OVERLAY_TARGET_OK); + } + +#if 1 + /* Temporary until mblk helpers are integrated */ + OVERLAY_DROP(mp, "ethernet header split between mblks"); + return (OVERLAY_TARGET_DROP); +#else + size_t off; + + off = offsetof(struct ether_vlan_header, ether_tci); + if (!mblk_read_uint16(mp, off, &tci)) { + OVERLAY_DROP(mp, "cannot read tci"); + return (OVERLAY_TARGET_DROP); + } + + /* This assumes the mblk helpers deal in host byte order */ + tci &= ~(VLAN_ID_MASK); + tci |= route->otr_vlan; + + if (!mblk_write_uint16(mp, off, tci)) { + OVERLAY_DROP(mp, "cannot set routed destination vlan"); + return (OVERLAY_TARGET_DROP); + } + + for (int i = 0; i < ETHERADDRL; i++) { + if (!mblk_write_uint8(mp, i, dst_mac->otm_mac[i]) || + !mblk_write_uint8(mp, i + ETHERADDRL, + route->otr_srcmac[i])) { + OVERLAY_DROP(mp, "cannot set routed macs"); + return (OVERLAY_TARGET_DROP); + } + } + + return (OVERLAY_TARGET_OK); +#endif +} + +/* + * Attempt to add mp to the packet queue of target entry. If the queue is + * already full, it returns OVERLAY_TARGET_DROP, otherwise OVERLAY_TARGET_ASYNC + * is returned. If the entry isn't already pending a response from varpd, + * queue the target entry on the list of outstanding varpd requests. + * + * Entry should already be locked; however, since it is intended that this + * should be the final step in dealing with this entry (for handling the + * packet in question), it always releases ote_lock before returning. + * entry should be refheld for the duration of this call. + */ +static int +overlay_target_try_queue(overlay_target_entry_t *entry, mblk_t *mp) +{ + size_t mlen = msgsize(mp); + boolean_t queue = B_FALSE; + + ASSERT(MUTEX_HELD(&entry->ote_lock)); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + OVERLAY_DROP(mp, "target queue full"); + mutex_exit(&entry->ote_lock); + return (OVERLAY_TARGET_DROP); + } + + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + queue = B_TRUE; + } + mutex_exit(&entry->ote_lock); + + if (queue) + overlay_target_queue(entry); + + return (OVERLAY_TARGET_ASYNC); +} + +/* + * Given the VL3 IP->VL2 mac entry (vl3e), and the corresponding VL2 MAC->UL3 + * entry (vl2e), if both entries are valid, sets *vidp, *v6, *slenp to the + * correct UL3 destination and returns OVERLAY_TARGET_OK. If either of the + * entries is still pending a lookup (or vl2e is NULL because the entry is + * missing), mp is queued (if there is space) on the appropriate entry and + * OVERLAY_TARGET_ASYNC is returned. If the VL2 entry is flagged to drop all + * packets, OVERLAY_TARGET_DROP is returned. + * + * In all cases, the caller should acquire vl3e->ote_lock prior to calling + * overlay_route_lookup_vl2(). Because vl2e can be missing (NULL), the caller + * should not acquire vl2e->ote_lock prior to calling + * overlay_route_lookup_vl2(). vl3e->ote_lock is always dropped prior to + * returning. 
+ */ +static int +overlay_route_lookup_vl2(overlay_target_entry_t *vl3e, + overlay_target_entry_t *vl2e, uint64_t *vidp, struct sockaddr_in6 *v6, + socklen_t *slenp, mblk_t *mp) +{ + overlay_target_vl2_t *vl2p; + int ret; + + ASSERT(MUTEX_HELD(&vl3e->ote_lock)); + + if (vl2e == NULL) { + vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID; + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + mutex_enter(&vl2e->ote_lock); + if (vl2e->ote_flags & (OVERLAY_ENTRY_F_DROP | OVERLAY_ENTRY_F_ROUTER)) { + overlay_target_entry_flags_t flags = vl2e->ote_flags; + + mutex_exit(&vl2e->ote_lock); + mutex_exit(&vl3e->ote_lock); + + if (flags & OVERLAY_ENTRY_F_DROP) { + OVERLAY_DROP(mp, "VL2 target marked drop"); + } else { + OVERLAY_DROP(mp, "VL2 target is overlay router"); + } + + return (OVERLAY_TARGET_DROP); + } + + /* + * If the route is missing, queue on the VL3 entry so a VL3->UL3 + * lookup is done (to get the route data). + */ + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_HAS_ROUTE) == 0) { + mutex_exit(&vl2e->ote_lock); + + vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID; + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + /* + * If the VL2 target point is missing, we try to be a bit (though + * hopefully not too) clever. We can always queue on the VL3 entry + * which will trigger a VL3->UL3 lookup request (as it is effectively + * a superset of the VL2->UL3 lookup). However, if we know we already + * have an outstanding VL3->UL3 request, we queue on the VL2 entry and + * avoid doing another redundant lookup. We can also queue on the VL2 + * entry when it is a local (same vnet, same DC) destination -- we + * currently cannot generate VL2->UL3 lookups for remote destinations, + * only same vnet, same DC. Queueing on the VL2 entry also allows + * instances on the same vlan as the queued VL2 entry to piggyback on + * the lookup request and avoid a redundant lookup. However, if the + * VL2 entry is remote, we have to do a VL3->UL3 lookup. 
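+ *
+ * To summarize the cases above (an informal sketch of the decision,
+ * not normative):
+ *
+ *	VL2 entry state				mp is queued on
+ *	---------------				---------------
+ *	flagged DROP or ROUTER			(dropped)
+ *	route data missing (no F_HAS_ROUTE)	VL3 entry
+ *	invalid, not pending, remote DCID	VL3 entry
+ *	invalid, otherwise			VL2 entry
+ *	valid					(nothing; sent via
+ *						overlay_route())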
+ */ + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) { + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0 && + vl2e->ote_u.ote_vl2.otvl2_mac.otm_dcid != + vl2e->ote_odd->odd_dcid) { + mutex_exit(&vl2e->ote_lock); + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + mutex_exit(&vl3e->ote_lock); + /* This drops vl2e->ote_lock */ + return (overlay_target_try_queue(vl2e, mp)); + } + + ASSERT(vl2e->ote_flags & OVERLAY_ENTRY_F_VALID); + + vl2p = &vl2e->ote_u.ote_vl2; + + *vidp = vl2p->otvl2_route.otr_vnet; + bcopy(&vl2p->otvl2_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(vl2p->otvl2_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + + ret = overlay_route(vl2e->ote_odd, mp, &vl2p->otvl2_route, + &vl2p->otvl2_mac); + mutex_exit(&vl2e->ote_lock); + mutex_exit(&vl3e->ote_lock); + return (ret); +} + +static int +overlay_route_lookup(overlay_dev_t *odd, mblk_t *mp, uint16_t vlan, + struct sockaddr *sock, socklen_t *slenp, uint64_t *vidp) +{ + overlay_target_t *ott = odd->odd_target; + overlay_target_entry_t *entry, *vl2_entry = NULL; + struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)sock; + overlay_target_vl3_t vl3 = { 0 }; + int ret = OVERLAY_TARGET_DROP; + + /* overlay_target_lookup() should have set this */ + ASSERT3U(v6->sin6_family, ==, AF_INET6); + + /* We should only be called for dynamic endpoints */ + ASSERT3U(ott->ott_mode, ==, OVERLAY_TARGET_DYNAMIC); + + vl3.otvl3_src_vlan = vlan; + if (overlay_get_vl3_ips(mp, &vl3.otvl3_src, &vl3.otvl3_dst) != 0) { + OVERLAY_DROP(mp, "could not read VL3 src/dst IPs"); + return (OVERLAY_TARGET_DROP); + } + + mutex_enter(&ott->ott_lock); + entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_l3dhash, &vl3); + if (entry == NULL) { + if ((entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { + mutex_exit(&ott->ott_lock); + OVERLAY_DROP(mp, "failed VL3 target entry allocation"); + return (OVERLAY_TARGET_DROP); + } + + bcopy(&vl3, &entry->ote_u.ote_vl3, sizeof (vl3)); + entry->ote_flags = OVERLAY_ENTRY_F_VL3; + + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + + entry->ote_ott = ott; + entry->ote_odd = odd; + + qqcache_insert(ott->ott_u.ott_dyn.ott_l3dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_l3tree, entry); + mutex_exit(&ott->ott_lock); + + overlay_target_queue(entry); + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_ASYNC); + } + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + mutex_enter(&entry->ote_lock); + + /* + * A bit ugly, but if we need the VL2 entry, we want to look it up + * while we still hold ott_lock. + */ + if ((entry->ote_flags & + (OVERLAY_ENTRY_F_DROP|OVERLAY_ENTRY_F_ROUTER| + OVERLAY_ENTRY_F_VALID)) == OVERLAY_ENTRY_F_VALID) { + vl2_entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &entry->ote_u.ote_vl3.otvl3_vl2); + if (vl2_entry != NULL) + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, vl2_entry); + } + mutex_exit(&ott->ott_lock); + + ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3); + + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL3 target entry marked drop"); + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + /* + * XXX: A packet with a dst IP of an overlay router. 
+ * Maybe generate an ICMP reply? For now, we drop. + */ + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL3 target entry is router"); + ret = OVERLAY_TARGET_DROP; + } else if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) { + /* This drops entry->ote_lock */ + ret = overlay_target_try_queue(entry, mp); + } else { + /* This drops entry->ote_lock */ + ret = overlay_route_lookup_vl2(entry, vl2_entry, vidp, v6, + slenp, mp); + } + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + if (vl2_entry != NULL) + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, vl2_entry); + mutex_exit(&ott->ott_lock); + return (ret); +} + +/* + * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP | * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at - * this time, say for NVGRE, we drop all packets that mcuh this. + * this time, say for NVGRE, we drop all packets that match this. */ int overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, - socklen_t *slenp) + socklen_t *slenp, uint64_t *vidp) { int ret; struct sockaddr_in6 *v6; overlay_target_t *ott; - mac_header_info_t mhi; overlay_target_entry_t *entry; + mac_header_info_t mhi; + overlay_target_mac_t omac; ASSERT(odd->odd_target != NULL); + /* Default to our local vid; routing may change this if necessary */ + *vidp = odd->odd_vid; + /* * At this point, the overlay device is in a mux which means that it's * been activated. At this point, parts of the target, such as the mode * @@ -323,8 +874,10 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, * about synchronization for them. */ ott = odd->odd_target; - if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) { + OVERLAY_DROP(mp, "plugin doesn't support IP or port"); return (OVERLAY_TARGET_DROP); + } v6 = (struct sockaddr_in6 *)sock; bzero(v6, sizeof (struct sockaddr_in6)); @@ -343,76 +896,89 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC); - /* - * Note we only want the MAC address here, therefore we won't bother - * using mac_vlan_header_info(). If any caller needs the vlan info at - * this point, this should change to a call to mac_vlan_header_info(). 
- */ - if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0) { + OVERLAY_DROP(mp, "could not read vlan header"); return (OVERLAY_TARGET_DROP); + } + + omac.otm_dcid = odd->odd_dcid; + bcopy(mhi.mhi_daddr, omac.otm_mac, ETHERADDRL); + mutex_enter(&ott->ott_lock); - entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - mhi.mhi_daddr); + entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, &omac); if (entry == NULL) { + overlay_target_vl2_t *vl2p; + entry = kmem_cache_alloc(overlay_entry_cache, KM_NOSLEEP | KM_NORMALPRI); if (entry == NULL) { mutex_exit(&ott->ott_lock); + OVERLAY_DROP(mp, "VL2 target entry allocation failed"); return (OVERLAY_TARGET_DROP); } - bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + + vl2p = &entry->ote_u.ote_vl2; + bcopy(mhi.mhi_daddr, vl2p->otvl2_mac.otm_mac, ETHERADDRL); + vl2p->otvl2_mac.otm_dcid = odd->odd_dcid; + vl2p->otvl2_route.otr_vnet = odd->odd_vid; + vl2p->otvl2_route.otr_vlan = VLAN_ID(mhi.mhi_tci); + entry->ote_chead = entry->ote_ctail = mp; entry->ote_mbsize = msgsize(mp); entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; entry->ote_odd = odd; - refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + + qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); return (OVERLAY_TARGET_ASYNC); } - refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); mutex_exit(&ott->ott_lock); mutex_enter(&entry->ote_lock); if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL2 target marked drop"); ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + if (mhi.mhi_bindsap == ETHERTYPE_ARP) { + /* + * Send unicast ARP requests to varpd for processing. + * We will eventually need something similar for IPv6. + * This drops entry->ote_lock. 
+ */ + ret = overlay_target_try_queue(entry, mp); + } else { + mutex_exit(&entry->ote_lock); + ret = overlay_route_lookup(odd, mp, + VLAN_ID(mhi.mhi_tci), sock, slenp, vidp); + } } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { - bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, - sizeof (struct in6_addr)); - v6->sin6_port = htons(entry->ote_dest.otp_port); + overlay_target_point_t *otp = &entry->ote_u.ote_vl2.otvl2_dest; + + bcopy(&otp->otp_ip, &v6->sin6_addr, sizeof (struct in6_addr)); + v6->sin6_port = htons(otp->otp_port); + mutex_exit(&entry->ote_lock); + *slenp = sizeof (struct sockaddr_in6); ret = OVERLAY_TARGET_OK; } else { /* This drops entry->ote_lock */ ret = overlay_target_try_queue(entry, mp); } - mutex_exit(&entry->ote_lock); mutex_enter(&ott->ott_lock); - refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); mutex_exit(&ott->ott_lock); return (ret); @@ -437,6 +1003,7 @@ overlay_target_info(overlay_target_hdl_t *thdl, void *arg) if (odd->odd_flags & OVERLAY_F_ACTIVATED) oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; oti->oti_vnetid = odd->odd_vid; + oti->oti_dcid = odd->odd_dcid; mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); return (0); @@ -488,6 +1055,13 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) } } + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EEXIST); + } + ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP); ott->ott_flags = 0; ott->ott_ocount = 0; @@ -499,21 +1073,44 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) bcopy(&ota->ota_point, &ott->ott_u.ott_point, sizeof (overlay_target_point_t)); } else { + int ret; + + ret = qqcache_create(&ott->ott_u.ott_dyn.ott_dhash, + odd->odd_vl2sz, odd->odd_vl2a, OVERLAY_HSIZE, overlay_mac_hash, overlay_mac_cmp, - overlay_target_entry_dtor, sizeof (overlay_target_entry_t), + overlay_target_entry_l2qq_dtor, + sizeof (overlay_target_entry_t), offsetof(overlay_target_entry_t, ote_reflink), - offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP); + offsetof(overlay_target_entry_t, ote_u.ote_vl2.otvl2_mac), + KM_SLEEP); + if (ret != 0) { + mutex_exit(&odd->odd_lock); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (ret); + } + + ret = qqcache_create(&ott->ott_u.ott_dyn.ott_l3dhash, + odd->odd_routesz, odd->odd_routea, OVERLAY_HSIZE, + overlay_ip_hash, overlay_ip_cmp, + overlay_target_entry_l3qq_dtor, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_reflink), + offsetof(overlay_target_entry_t, ote_u.ote_vl3), KM_SLEEP); + if (ret != 0) { + mutex_exit(&odd->odd_lock); + qqcache_destroy(ott->ott_u.ott_dyn.ott_dhash); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (ret); + } + avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl, sizeof (overlay_target_entry_t), 
offsetof(overlay_target_entry_t, ote_avllink)); + avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_ip_avl, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_avllink)); - } - mutex_enter(&odd->odd_lock); - if (odd->odd_flags & OVERLAY_F_VARPD) { - mutex_exit(&odd->odd_lock); - kmem_cache_free(overlay_target_cache, ott); - overlay_hold_rele(odd); - return (EEXIST); } odd->odd_flags |= OVERLAY_F_VARPD; @@ -521,8 +1118,8 @@ mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); return (0); } @@ -601,7 +1196,16 @@ again: entry = list_remove_head(&overlay_target_list); mutex_exit(&overlay_target_lock); mutex_enter(&entry->ote_lock); - if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + /* + * Lookups for router entries may be sent to varpd even when the entry + * is valid. For example, illumos systems will send unicast ARP queries + * to cached entries (including the router mac address). To answer + * those, we need to forward the query on to varpd. IPv6 will + * eventually need something similar for ND requests. + */ + if ((entry->ote_flags & + (OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_ROUTER)) == + OVERLAY_ENTRY_F_VALID) { ASSERT(entry->ote_chead == NULL); mutex_exit(&entry->ote_lock); goto again; @@ -637,10 +1241,23 @@ again: otl->otl_hdrsize = mhi.mhi_hdrsize; otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; - bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); - bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); - otl->otl_dsttype = mhi.mhi_dsttype; - otl->otl_sap = mhi.mhi_bindsap; + if (entry->ote_flags & OVERLAY_ENTRY_F_VL3) { + overlay_targ_l3_t *l3p = &otl->otl_addru.otlu_l3; + + otl->otl_l3req = B_TRUE; + bcopy(&entry->ote_u.ote_vl3.otvl3_src, &l3p->otl3_srcip, + sizeof (struct in6_addr)); + bcopy(&entry->ote_u.ote_vl3.otvl3_dst, &l3p->otl3_dstip, + sizeof (struct in6_addr)); + } else { + overlay_targ_l2_t *l2p = &otl->otl_addru.otlu_l2; + + otl->otl_l3req = B_FALSE; + bcopy(mhi.mhi_daddr, l2p->otl2_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, l2p->otl2_srcaddr, ETHERADDRL); + l2p->otl2_dsttype = mhi.mhi_dsttype; + l2p->otl2_sap = mhi.mhi_bindsap; + } otl->otl_vlan = VLAN_ID(mhi.mhi_tci); mutex_exit(&entry->ote_lock); @@ -651,12 +1268,128 @@ again: return (0); } +static void +overlay_target_lookup_respond_vl3(const overlay_targ_resp_t *otr, + overlay_target_entry_t *entry) +{ + overlay_target_entry_t *shared = NULL; + overlay_target_entry_t *vl2_entry; + overlay_target_t *ott = entry->ote_ott; + qqcache_t *mhash = ott->ott_u.ott_dyn.ott_dhash; + hrtime_t now = gethrtime(); + + ASSERT(MUTEX_HELD(&entry->ote_lock)); + ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3); + + /* + * A cross-{vlan,dc,vnet} packet with a destination VL3 of an overlay + * router IP. For now we drop these. + */ + if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_DROP; + return; + } + + bcopy(&otr->otr_mac, &entry->ote_u.ote_vl3.otvl3_vl2, + sizeof (overlay_target_mac_t)); + + mutex_enter(&ott->ott_lock); + if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) + qqcache_hold(mhash, shared); + mutex_exit(&ott->ott_lock); + + /* + * Once we have the VL2 destination, we need to see if we already + * have an existing VL2 entry we can reuse. If not, we create a + * fully-formed (i.e. valid) VL2 entry that we add to the cache. 
+ */ + if (shared == NULL) { + vl2_entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (vl2_entry == NULL) { + /* + * If we can't allocate a VL2 entry for the VL3 + * destination, we just give up for now and discard + * any queued packets. New packets will retry this + * allocation, so if the memory pressure lets up, we + * should recover. + */ + freemsgchain(entry->ote_chead); + entry->ote_chead = entry->ote_ctail = NULL; + return; + } + + vl2_entry->ote_ott = ott; + vl2_entry->ote_odd = entry->ote_odd; + + bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest, + sizeof (overlay_target_point_t)); + bcopy(&otr->otr_mac, &vl2_entry->ote_u.ote_vl2.otvl2_mac, + sizeof (overlay_target_mac_t)); + bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route, + sizeof (overlay_target_route_t)); + vl2_entry->ote_flags = + OVERLAY_ENTRY_F_HAS_ROUTE | OVERLAY_ENTRY_F_VALID; + vl2_entry->ote_vtime = entry->ote_vtime = now; + + mutex_enter(&ott->ott_lock); + if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) { + overlay_target_entry_dtor(vl2_entry); + kmem_cache_free(overlay_entry_cache, vl2_entry); + qqcache_hold(mhash, shared); + + vl2_entry = shared; + } else { + qqcache_insert(mhash, vl2_entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, vl2_entry); + qqcache_hold(mhash, vl2_entry); + } + mutex_exit(&ott->ott_lock); + } else { + vl2_entry = shared; + } + + mutex_enter(&vl2_entry->ote_lock); + if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_HAS_ROUTE)) == 0) { + bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route, + sizeof (overlay_target_route_t)); + vl2_entry->ote_flags |= OVERLAY_ENTRY_F_HAS_ROUTE; + } + + /* + * Update the VL2 entry if it doesn't have a valid destination, hasn't + * been marked as dropping all packets, and doesn't have an existing + * outstanding request. If a route and VL2 request involving the + * same VL2 destination are pending and the route response is processed + * prior to the VL2 response, we will continue to queue (on the VL2 + * entry) until the VL2 response is received, even though we have + * an answer from the route response. If we set the valid flag + * while there are still outstanding requests, it will cause problems + * with the outstanding requests. + */ + if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_PENDING| + OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_DROP)) == 0) { + bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest, + sizeof (overlay_target_point_t)); + vl2_entry->ote_vtime = gethrtime(); + vl2_entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + } + mutex_exit(&vl2_entry->ote_lock); + + mutex_enter(&ott->ott_lock); + qqcache_rele(mhash, vl2_entry); + mutex_exit(&ott->ott_lock); +} + static int overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) { const overlay_targ_resp_t *otr = arg; + overlay_target_t *ott; overlay_target_entry_t *entry; mblk_t *mp; + boolean_t is_vl3 = B_FALSE; mutex_enter(&thdl->oth_lock); for (entry = list_head(&thdl->oth_outstanding); entry != NULL; @@ -673,38 +1406,88 @@ overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) mutex_exit(&thdl->oth_lock); mutex_enter(&entry->ote_lock); - bcopy(&otr->otr_answer, &entry->ote_dest, - sizeof (overlay_target_point_t)); + ott = entry->ote_ott; + + if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0) + is_vl3 = B_TRUE; + + /* + * If we ever support a protocol that uses MAC addresses as the UL + * destination address, this check should probably include checking + * that otp_mac is also all zeros. 
+	 */
+	if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) &&
+	    otr->otr_answer.otp_port == 0)
+		entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
+
+	if (!is_vl3) {
+		bcopy(&otr->otr_answer, &entry->ote_u.ote_vl2.otvl2_dest,
+		    sizeof (overlay_target_point_t));
+		entry->ote_vtime = gethrtime();
+	} else {
+		overlay_target_lookup_respond_vl3(otr, entry);
+	}
+
 	entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
 	entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+
 	mp = entry->ote_chead;
 	entry->ote_chead = NULL;
 	entry->ote_ctail = NULL;
 	entry->ote_mbsize = 0;
-	entry->ote_vtime = gethrtime();
 	mutex_exit(&entry->ote_lock);
 
 	/*
-	 * For now do an in-situ drain.
+	 * For now do an in-situ drain. For VL3 entries, if we re-use
+	 * an existing VL2 entry, it is possible the VL2 lookup is still
+	 * pending (though this should be rare). In such instances, the
+	 * packets queued on the VL3 entry will get queued on the VL2
+	 * entry until the VL2 entry is resolved.
 	 */
 	mp = overlay_m_tx(entry->ote_odd, mp);
 	freemsgchain(mp);
 
-	mutex_enter(&entry->ote_ott->ott_lock);
-	entry->ote_ott->ott_ocount--;
-	cv_signal(&entry->ote_ott->ott_cond);
-	mutex_exit(&entry->ote_ott->ott_lock);
+	mutex_enter(&ott->ott_lock);
+	ott->ott_ocount--;
+	if (is_vl3)
+		qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+	else
+		qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+	cv_signal(&ott->ott_cond);
+	mutex_exit(&ott->ott_lock);
 
 	return (0);
 }
 
+static boolean_t
+overlay_target_for_varpd(overlay_dev_t *odd, mblk_t *mp)
+{
+	mac_header_info_t mhi;
+
+	/* We should have dropped runts prior to ever queueing */
+	VERIFY0(mac_vlan_header_info(odd->odd_mh, mp, &mhi));
+	if (mhi.mhi_bindsap == ETHERTYPE_ARP)
+		return (B_TRUE);
+
+	/* TODO: NDP packets */
+	return (B_FALSE);
+}
+
+typedef enum overlay_target_lookup_drop_act {
+	OTLDA_NONE,
+	OTLDA_QUEUE,
+	OTLDA_DELETE
+} overlay_target_lookup_drop_act_t;
+
 static int
 overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
 {
 	const overlay_targ_resp_t *otr = arg;
+	overlay_target_t *ott;
 	overlay_target_entry_t *entry;
 	mblk_t *mp;
-	boolean_t queue = B_FALSE;
+	overlay_target_lookup_drop_act_t action = OTLDA_NONE;
+	boolean_t is_vl3 = B_FALSE;
 
 	mutex_enter(&thdl->oth_lock);
 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
@@ -721,9 +1504,19 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
 	mutex_exit(&thdl->oth_lock);
 
 	mutex_enter(&entry->ote_lock);
+	ott = entry->ote_ott;
+	if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+		is_vl3 = B_TRUE;
 
-	/* Safeguard against a confused varpd */
-	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+	mp = entry->ote_chead;
+
+	/*
+	 * Safeguard against a confused varpd. Packets specifically for
+	 * varpd may receive replies (e.g. ARP replies) that require us to
+	 * drop, even when the entry is valid.
+	 */
+	if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) &&
+	    !overlay_target_for_varpd(entry->ote_odd, mp)) {
 		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
 		DTRACE_PROBE1(overlay__target__valid__drop,
 		    overlay_target_entry_t *, entry);
@@ -731,7 +1524,21 @@
 		goto done;
 	}
 
-	mp = entry->ote_chead;
+	/*
+	 * If varpd is instructing us to drop the head mblk in a VL2 entry,
+	 * this could be because it's already provided a response (e.g. an
+	 * ARP reply), and the entry itself might still be used for other
+	 * purposes. VL3 entries on the other hand have no such uses. If
+	 * we are told to drop the packet, there is no reason to retain
+	 * the VL3 entry and we can delete it.
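+	 * (The removal itself happens at the 'done' label below, under
+	 * ott_lock, once ote_lock has been dropped.)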
+ */ + if (is_vl3) { + action = OTLDA_DELETE; + mutex_exit(&entry->ote_lock); + goto done; + } + + /* Drop the first packet in entry */ if (mp != NULL) { entry->ote_chead = mp->b_next; mp->b_next = NULL; @@ -739,23 +1546,34 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) entry->ote_ctail = entry->ote_chead; entry->ote_mbsize -= msgsize(mp); } + if (entry->ote_chead != NULL) { - queue = B_TRUE; + action = OTLDA_QUEUE; entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; } else { entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; } mutex_exit(&entry->ote_lock); - if (queue == B_TRUE) + if (action == OTLDA_QUEUE) overlay_target_queue(entry); freemsg(mp); done: - mutex_enter(&entry->ote_ott->ott_lock); - entry->ote_ott->ott_ocount--; - cv_signal(&entry->ote_ott->ott_cond); - mutex_exit(&entry->ote_ott->ott_lock); + mutex_enter(&ott->ott_lock); + ott->ott_ocount--; + if (action == OTLDA_DELETE) { + /* overlay_target_entry_dtor() will free the mblk chain */ + qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, entry); + } + + if (is_vl3) + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + else + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + + cv_signal(&ott->ott_cond); + mutex_exit(&ott->ott_lock); return (0); } @@ -1083,31 +1901,35 @@ overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg) sizeof (overlay_target_point_t)); } else { overlay_target_entry_t *ote; - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); - if (ote != NULL) { - mutex_enter(&ote->ote_lock); - if ((ote->ote_flags & - OVERLAY_ENTRY_F_VALID_MASK) != 0) { - if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { - otc->otc_entry.otce_flags = - OVERLAY_TARGET_CACHE_DROP; - } else { - otc->otc_entry.otce_flags = 0; - bcopy(&ote->ote_dest, - &otc->otc_entry.otce_dest, - sizeof (overlay_target_point_t)); - } - ret = 0; + + if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &otc->otc_entry.otce_mac)) == NULL) { + ret = ENOENT; + goto done; + } + + mutex_enter(&ote->ote_lock); + if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) { + if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_DROP; + } else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_ROUTER; } else { - ret = ENOENT; + otc->otc_entry.otce_flags = 0; + bcopy(&ote->ote_u.ote_vl2.otvl2_dest, + &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); } - mutex_exit(&ote->ote_lock); + ret = 0; } else { ret = ENOENT; } + mutex_exit(&ote->ote_lock); } +done: mutex_exit(&ott->ott_lock); overlay_hold_rele(odd); @@ -1120,53 +1942,64 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) { overlay_dev_t *odd; overlay_target_t *ott; - overlay_target_entry_t *ote; + overlay_target_entry_t *ote, *new = NULL; overlay_targ_cache_t *otc = arg; mblk_t *mp = NULL; - if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP) + if (otc->otc_entry.otce_flags & + ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER)) + return (EINVAL); + + if (otc->otc_entry.otce_flags == + (OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER)) return (EINVAL); odd = overlay_hold_by_dlid(otc->otc_linkid); if (odd == NULL) return (ENOENT); + /* + * Optimistically create the new entry. If not needed, we'll free it. + * We shouldn't be calling this ioctl rapidly enough that any potential + * alloc/free churn should cause a problem. 
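+	 * Performing the allocation before any locks are taken also makes
+	 * it safe to use KM_SLEEP here.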
+ */ + new = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(&otc->otc_entry.otce_mac, &new->ote_u.ote_vl2.otvl2_mac, + sizeof (overlay_target_mac_t)); + mutex_enter(&odd->odd_lock); if (!(odd->odd_flags & OVERLAY_F_VARPD)) { mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); + overlay_target_entry_dtor(new); return (ENXIO); } ott = odd->odd_target; if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); + overlay_target_entry_dtor(new); return (ENOTSUP); } + + new->ote_ott = ott; + new->ote_odd = odd; + mutex_enter(&ott->ott_lock); mutex_exit(&odd->odd_lock); - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); - if (ote == NULL) { - ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); - bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); - ote->ote_chead = ote->ote_ctail = NULL; - ote->ote_mbsize = 0; - ote->ote_ott = ott; - ote->ote_odd = odd; - mutex_enter(&ote->ote_lock); - refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); - avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); - } else { - mutex_enter(&ote->ote_lock); - } + if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &otc->otc_entry.otce_mac)) == NULL) + ote = new; + mutex_enter(&ote->ote_lock); if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { ote->ote_flags |= OVERLAY_ENTRY_F_DROP; } else { ote->ote_flags |= OVERLAY_ENTRY_F_VALID; - bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER) + ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_u.ote_vl2.otvl2_dest, sizeof (overlay_target_point_t)); mp = ote->ote_chead; ote->ote_chead = NULL; @@ -1175,6 +2008,10 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) ote->ote_vtime = gethrtime(); } + if (ote == new) { + qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } mutex_exit(&ote->ote_lock); mutex_exit(&ott->ott_lock); @@ -1185,6 +2022,9 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) overlay_hold_rele(odd); + if (ote != new) + overlay_target_entry_dtor(new); + return (0); } @@ -1217,8 +2057,11 @@ overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) mutex_enter(&ott->ott_lock); mutex_exit(&odd->odd_lock); - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); + if (otc->otc_entry.otce_mac.otm_dcid == 0) + otc->otc_entry.otce_mac.otm_dcid = odd->odd_dcid; + + ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &otc->otc_entry.otce_mac); if (ote != NULL) { mutex_enter(&ote->ote_lock); ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; @@ -1269,8 +2112,13 @@ overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; mutex_exit(&ote->ote_lock); } - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); + + avl = &ott->ott_u.ott_dyn.ott_l3tree; + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } mutex_exit(&ott->ott_lock); overlay_hold_rele(odd); @@ -1304,9 +2152,10 @@ overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, } typedef struct overlay_targ_cache_marker { - uint8_t otcm_mac[ETHERADDRL]; + overlay_target_mac_t otcm_mac; uint16_t otcm_done; -} overlay_targ_cache_marker_t; +} overlay_targ_cache_marker_t __aligned(8); +CTASSERT(sizeof 
(overlay_targ_cache_marker_t) == 2 * sizeof (uint64_t)); /* ARGSUSED */ static int @@ -1356,7 +2205,7 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) if (ott->ott_mode == OVERLAY_TARGET_POINT) { overlay_targ_cache_entry_t *out = &iter->otci_ents[0]; - bzero(out->otce_mac, ETHERADDRL); + bzero(&out->otce_mac, sizeof (out->otce_mac)); out->otce_flags = 0; bcopy(&ott->ott_u.ott_point, &out->otce_dest, sizeof (overlay_target_point_t)); @@ -1365,7 +2214,9 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) } avl = &ott->ott_u.ott_dyn.ott_tree; - bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL); + lookup.ote_u.ote_vl2.otvl2_mac.otm_dcid = odd->odd_dcid; + bcopy(&mark->otcm_mac, &lookup.ote_u.ote_vl2.otvl2_mac, + sizeof (mark->otcm_mac)); ent = avl_find(avl, &lookup, &where); /* @@ -1390,19 +2241,21 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) mutex_exit(&ent->ote_lock); continue; } - bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL); + bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &out->otce_mac, + sizeof (out->otce_mac)); out->otce_flags = 0; if (ent->ote_flags & OVERLAY_ENTRY_F_DROP) out->otce_flags |= OVERLAY_TARGET_CACHE_DROP; if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) - bcopy(&ent->ote_dest, &out->otce_dest, + bcopy(&ent->ote_u.ote_vl2.otvl2_dest, &out->otce_dest, sizeof (overlay_target_point_t)); written++; mutex_exit(&ent->ote_lock); } if (ent != NULL) { - bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL); + bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &mark->otcm_mac, + sizeof (mark->otcm_mac)); } else { mark->otcm_done = 1; } @@ -1432,6 +2285,122 @@ overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize, return (0); } +/* + * Take an IPv6 address + prefix length, and turn it into the network address. + * E.g. ::ffff:192.168.51.50/120 -> ::ffff:192.168.51.0 + */ +static void +overlay_in6_to_subnet(const struct in6_addr *src, struct in6_addr *dst, + uint8_t prefixlen) +{ + uint32_t val; + + for (size_t i = 0; i < 4; i++) { + val = ntohl(src->_S6_un._S6_u32[i]); + val &= IN6_MASK_FROM_PREFIX(i, prefixlen); + dst->_S6_un._S6_u32[i] = htonl(val); + } +} + +/* + * Find the first target entry whose source IP falls within the source subnet + * given by otcne. If no entries match, NULL is returned. + */ +static overlay_target_entry_t * +overlay_target_cache_first_net(overlay_target_t *ott, + const overlay_targ_cache_net_entry_t *otcne) +{ + avl_tree_t *avl; + overlay_target_entry_t *ote; + struct in6_addr *start; + overlay_target_entry_t cmp = { 0 }; + avl_index_t where = { 0 }; + + ASSERT(MUTEX_HELD(&ott->ott_lock)); + + avl = &ott->ott_u.ott_dyn.ott_l3tree; + start = &cmp.ote_u.ote_vl3.otvl3_src; + + /* + * The first possible source address for a subnet is the network + * address (e.g. 192.160.10.0 for a /24). While it normally shouldn't + * appear, we either start here, or at the first entry after where + * it would exist if present. This should be the first possible + * entry in the subnet. If it's not within the subnet, then we + * know no entries with that source subnet are present. 
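+	 * For example, when searching for 192.168.51.0/24, if the first
+	 * entry at or after 192.168.51.0 has a source IP of 192.168.52.9,
+	 * the tree cannot contain any source within 192.168.51.0/24.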
+ */ + overlay_in6_to_subnet(&otcne->otcne_src, start, + otcne->otcne_src_prefixlen); + + if ((ote = avl_find(avl, &cmp, &where)) == NULL) + ote = avl_nearest(avl, where, AVL_AFTER); + + if (ote == NULL || !IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src, + &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen)) + return (NULL); + + return (ote); +} + +/* ARGSUSED */ +static int +overlay_target_cache_remove_net(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_cache_net_t *otcn = arg; + overlay_targ_cache_net_entry_t *otcne = &otcn->otcn_entry; + overlay_dev_t *odd = NULL; + overlay_target_t *ott = NULL; + overlay_target_entry_t *ote = NULL, *ote_next = NULL; + avl_tree_t *avl = NULL; + + odd = overlay_hold_by_dlid(otcn->otcn_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + avl = &ott->ott_u.ott_dyn.ott_l3tree; + + for (ote = overlay_target_cache_first_net(ott, otcne); + ote != NULL && IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src, + &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen); + ote = ote_next) { + ote_next = AVL_NEXT(avl, ote); + + /* + * Entries are sorted by src ip, dst ip, src vlan, there can + * be entries from this src ip to destinations on other + * subnets besides the one we are removing that will need to + * be skipped over. + */ + if (ote->ote_u.ote_vl3.otvl3_src_vlan != otcne->otcne_vlan) + continue; + + if (!IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_dst, + &ote->ote_u.ote_vl3.otvl3_dst, otcne->otcne_dst_prefixlen)) + continue; + + qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, ote); + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + return (0); +} + static overlay_target_ioctl_t overlay_target_ioctab[] = { { OVERLAY_TARG_INFO, B_TRUE, B_TRUE, NULL, overlay_target_info, @@ -1492,6 +2461,9 @@ static overlay_target_ioctl_t overlay_target_ioctab[] = { overlay_target_cache_iter, overlay_target_cache_iter_copyout, sizeof (overlay_targ_cache_iter_t) }, + { OVERLAY_TARG_CACHE_REMOVE_NET, B_TRUE, B_TRUE, + NULL, overlay_target_cache_remove_net, + NULL, sizeof (overlay_targ_cache_net_t) }, { 0 } }; diff --git a/usr/src/uts/common/qqcache/qqcache.c b/usr/src/uts/common/qqcache/qqcache.c new file mode 100644 index 0000000000..ccd90c3814 --- /dev/null +++ b/usr/src/uts/common/qqcache/qqcache.c @@ -0,0 +1,444 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. + */ + +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/null.h> +#include <sys/types.h> +#include <sys/qqcache.h> +#include <sys/qqcache_impl.h> +#include <sys/stddef.h> + +/* + * Currently, the non _KERNEL pieces are to support testing in usr/src/test. 
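+ * In user land, umem stands in for kmem via the ZALLOC/FREE macros
+ * below; the cache logic itself is identical in both environments.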
+ */
+#ifdef _KERNEL
+#include <sys/kmem.h>
+#define	ZALLOC	kmem_zalloc
+#define	FREE	kmem_free
+#else
+#include <umem.h>
+#define	ZALLOC	umem_zalloc
+#define	FREE	umem_free
+#endif
+
+
+/*
+ * The *_overflow functions mimic the gcc/clang intrinsic functions. Once
+ * we are using a newer compiler version that includes these as intrinsics,
+ * these can be replaced with those versions.
+ */
+static int
+uadd_overflow(const size_t a, const size_t b, size_t *sump)
+{
+	*sump = a + b;
+	if (*sump < a || *sump < b)
+		return (1);
+	return (0);
+}
+
+#define	MUL_NO_OVERFLOW	((size_t)1 << (sizeof (size_t) * 4))
+
+static int
+umul_overflow(const size_t a, const size_t b, size_t *cp)
+{
+	*cp = a * b;
+
+	if ((a >= MUL_NO_OVERFLOW || b >= MUL_NO_OVERFLOW) &&
+	    a != 0 && b != 0 && SIZE_MAX / a < b)
+		return (1);
+
+	return (0);
+}
+
+/* Calculate the capacity of each list based on sz and a */
+static void
+qqcache_size_lists(size_t sz, size_t a, size_t *maxp)
+{
+	VERIFY3U(sz, >=, QQCACHE_NUM_LISTS);
+
+	/*
+	 * The general approach is to start with list 0 being sized as a% of
+	 * sz. However, every other list must be able to hold at least one
+	 * entry unless a == 100 (i.e. 100%). If the straight percentage
+	 * leaves any of the remaining lists with zero entries, we give them
+	 * a size of 1, and then adjust list0's size accordingly so that the
+	 * sum of all list sizes == sz (this is mostly a concern when sz is
+	 * small enough that (100 - a)% of sz < QQCACHE_NUM_LISTS).
+	 */
+	size_t list0sz = sz * a / 100;
+	size_t othersz = (sz - list0sz) / (QQCACHE_NUM_LISTS - 1);
+
+	if (list0sz == 0)
+		list0sz = 1;
+
+	if (othersz == 0 && a != 100)
+		othersz = 1;
+
+	if (list0sz + othersz * (QQCACHE_NUM_LISTS - 1) > sz)
+		list0sz = sz - othersz * (QQCACHE_NUM_LISTS - 1);
+
+	maxp[0] = list0sz;
+	for (size_t i = 1; i < QQCACHE_NUM_LISTS; i++)
+		maxp[i] = othersz;
+}
+
+int
+qqcache_create(qqcache_t **qp, size_t sz, size_t a, size_t buckets,
+    qqcache_hash_fn_t hash_fn, qqcache_cmp_fn_t cmp_fn,
+    qqcache_dtor_fn_t dtor_fn, size_t elsize, size_t link_off, size_t tag_off,
+    int kmflags)
+{
+	qqcache_t *qc;
+	size_t len = 0;
+
+	if (sz < QQCACHE_MIN_SIZE)
+		return (EINVAL);
+	if (a > 100)
+		return (EINVAL);
+
+	if (umul_overflow(sizeof (qqcache_list_t), buckets, &len))
+		return (EINVAL);
+	if (uadd_overflow(sizeof (*qc), len, &len))
+		return (EINVAL);
+
+	if ((qc = ZALLOC(len, kmflags)) == NULL)
+		return (ENOMEM);
+
+	qc->qqc_hash_fn = hash_fn;
+	qc->qqc_cmp_fn = cmp_fn;
+	qc->qqc_dtor_fn = dtor_fn;
+	qc->qqc_link_off = link_off;
+	qc->qqc_tag_off = tag_off;
+	qc->qqc_nbuckets = buckets;
+	qc->qqc_size = sz;
+	qc->qqc_a = a;
+
+	qqcache_size_lists(sz, a, qc->qqc_max);
+
+	for (size_t i = 0; i < buckets; i++) {
+		list_create(&qc->qqc_buckets[i].qqcl_list, elsize,
+		    offsetof(qqcache_link_t, qqln_hash_link));
+	}
+
+	for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) {
+		list_create(&qc->qqc_lists[i].qqcl_list, elsize,
+		    offsetof(qqcache_link_t, qqln_list_link));
+	}
+
+	*qp = qc;
+	return (0);
+}
+
+void
+qqcache_destroy(qqcache_t *qc)
+{
+	size_t len;
+
+	if (qc == NULL)
+		return;
+
+	/* If creation succeeded, this calculation cannot overflow */
+	len = sizeof (*qc) + qc->qqc_nbuckets * sizeof (qqcache_list_t);
+
+	for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) {
+		list_t *l = &qc->qqc_lists[i].qqcl_list;
+		qqcache_link_t *lnk;
+
+		while ((lnk = list_remove_head(l)) != NULL)
+			;
+	}
+
+	for (size_t i = 0; i < qc->qqc_nbuckets; i++) {
+		list_t *l = &qc->qqc_buckets[i].qqcl_list;
+		qqcache_link_t *lnk;
+
+		while ((lnk = list_remove_head(l)) != NULL) {
+			ASSERT0(lnk->qqln_refcnt);
+			qc->qqc_dtor_fn(link_to_obj(qc, lnk));
+		}
+	}
+
+	FREE(qc, len);
+}
+
+/*
+ * Removal of an entry is a two step process. qqcache_remove() removes the
+ * entry from the cache lists, and if a reference is held, sets the
+ * QQCACHE_F_DEAD flag. When there are no more references held on an entry
+ * (either none are held at the time qqcache_remove() is called, or the last
+ * reference is removed via qqcache_rele()), qqcache_delete() is called, which
+ * removes the entry from its hash bucket and calls the entry's dtor function.
+ *
+ * The main reason for the two step process is largely simplicity. If the
+ * entry remains in the cache lists w/ the QQCACHE_F_DEAD flag set, it
+ * complicates keeping each cache within its size limits -- either the
+ * list size must reflect the number of non-dead entries (which could be
+ * confusing during troubleshooting), or as we push things down the list, we
+ * would need to skip/ignore dead entries. The hash buckets however don't
+ * have any size limits (to impose limits would require the hash function
+ * provided by the consumer to produce perfectly equal distribution of entries
+ * across all the hash buckets at all times). The only time we care about
+ * the QQCACHE_F_DEAD flag in the hash buckets is when trying to look up a
+ * 'dead' value, so leaving dead entries in the hash buckets does not present
+ * the same issues as leaving them in the cache lists would (while still
+ * providing a way to find refheld entries).
+ */
+static void
+qqcache_delete(qqcache_t *qc, qqcache_link_t *lp)
+{
+	void *op = link_to_obj(qc, lp);
+	void *tp = obj_to_tag(qc, op);
+	uint_t n = qc->qqc_hash_fn(tp) % qc->qqc_nbuckets;
+
+	ASSERT3U(qc->qqc_buckets[n].qqcl_len, >, 0);
+	ASSERT(!list_is_empty(&qc->qqc_buckets[n].qqcl_list));
+	ASSERT(!list_link_active(&lp->qqln_list_link));
+	ASSERT(list_link_active(&lp->qqln_hash_link));
+
+	list_remove(&qc->qqc_buckets[n].qqcl_list, lp);
+	qc->qqc_buckets[n].qqcl_len--;
+	qc->qqc_dtor_fn(op);
+}
+
+void
+qqcache_remove(qqcache_t *qc, void *op)
+{
+	qqcache_link_t *lp = obj_to_link(qc, op);
+	qqcache_list_t *lst = QQCACHE_LIST(qc, lp);
+
+	ASSERT(!list_is_empty(&lst->qqcl_list));
+	ASSERT3U(lst->qqcl_len, >, 0);
+
+	list_remove(&lst->qqcl_list, lp);
+	lst->qqcl_len--;
+
+	if (lp->qqln_refcnt > 0)
+		lp->qqln_flags |= QQCACHE_F_DEAD;
+	else
+		qqcache_delete(qc, lp);
+}
+
+void
+qqcache_hold(qqcache_t *qc, void *op)
+{
+	qqcache_link_t *lp = obj_to_link(qc, op);
+
+	++lp->qqln_refcnt;
+}
+
+void
+qqcache_rele(qqcache_t *qc, void *op)
+{
+	qqcache_link_t *lp = obj_to_link(qc, op);
+
+	VERIFY3U(lp->qqln_refcnt, >, 0);
+
+	if (--lp->qqln_refcnt == 0 && (lp->qqln_flags & QQCACHE_F_DEAD))
+		qqcache_delete(qc, lp);
+}
+
+static qqcache_link_t *
+qqcache_hash_lookup(qqcache_t *qc, const void *tp, qqcache_list_t **lpp)
+{
+	uint_t n = qc->qqc_hash_fn(tp) % qc->qqc_nbuckets;
+	qqcache_link_t *lp;
+	qqcache_list_t *bucket = &qc->qqc_buckets[n];
+	list_t *l = &bucket->qqcl_list;
+	void *cmp;
+
+	if (lpp != NULL)
+		*lpp = bucket;
+
+	for (lp = list_head(l); lp != NULL; lp = list_next(l, lp)) {
+		cmp = obj_to_tag(qc, link_to_obj(qc, lp));
+
+		if (qc->qqc_cmp_fn(cmp, tp) == 0 &&
+		    !(lp->qqln_flags & QQCACHE_F_DEAD)) {
+			return (lp);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Starting at listnum, push entries from the tail of cache list 'n' to the
+ * head of list 'n + 1', keeping each list within its size limits. Excess
+ * entries on the tail of the last list are deleted.
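+ * For example, with QQCACHE_NUM_LISTS == 2 and list sizes of {3, 7}, a
+ * promotion into a full list 0 pushes list 0's oldest entry onto the
+ * head of list 1, and if that overflows list 1, the entry at the tail
+ * of list 1 falls out of the cache entirely.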
If 'for_insert' is + * B_TRUE, also guarantee after this returns that there are no more than + * 'max - 1' entries on listnum (so there is room to insert an entry onto + * listnum). + */ +static void +qqcache_ripple(qqcache_t *qc, uint_t listnum, boolean_t for_insert) +{ + VERIFY3U(listnum, <, QQCACHE_NUM_LISTS); + + for (uint_t i = listnum; i < QQCACHE_NUM_LISTS; i++) { + qqcache_list_t *ql = &qc->qqc_lists[i]; + qqcache_list_t *qlnext = &qc->qqc_lists[i + 1]; + size_t max = qc->qqc_max[i]; + + ASSERT3U(max, >, 0); + + /* + * If we're planning to insert an entry on list 'listnum', + * we bump the maximum size down by one to guarantee we + * have sufficient room for the entry + */ + if (for_insert && i == listnum) + max--; + + while (ql->qqcl_len > max) { + qqcache_link_t *lnk = list_tail(&ql->qqcl_list); + + if (i + 1 < QQCACHE_NUM_LISTS) { + list_remove(&ql->qqcl_list, lnk); + ql->qqcl_len--; + + ASSERT3U(lnk->qqln_listnum, ==, i); + lnk->qqln_listnum++; + + list_insert_head(&qlnext->qqcl_list, lnk); + qlnext->qqcl_len++; + } else { + qqcache_remove(qc, link_to_obj(qc, lnk)); + } + } + } +} + +int +qqcache_insert(qqcache_t *qc, void *obj) +{ + qqcache_link_t *lp = obj_to_link(qc, obj); + qqcache_list_t *bucket; + + if (qqcache_hash_lookup(qc, obj_to_tag(qc, obj), &bucket) != NULL) + return (EEXIST); + + list_link_init(&lp->qqln_hash_link); + list_link_init(&lp->qqln_list_link); + lp->qqln_refcnt = 0; + lp->qqln_flags = 0; + lp->qqln_listnum = QQCACHE_INSERT_LIST; + + qqcache_ripple(qc, QQCACHE_INSERT_LIST, B_TRUE); + + list_insert_tail(&bucket->qqcl_list, lp); + bucket->qqcl_len++; + + list_insert_head(&qc->qqc_lists[QQCACHE_INSERT_LIST].qqcl_list, lp); + qc->qqc_lists[QQCACHE_INSERT_LIST].qqcl_len++; + + return (0); +} + +void * +qqcache_lookup(qqcache_t *qc, const void *tp) +{ + qqcache_link_t *lp; + qqcache_list_t *src; + uint_t tgtnum; + + if ((lp = qqcache_hash_lookup(qc, tp, NULL)) == NULL) + return (NULL); + + src = QQCACHE_LIST(qc, lp); + list_remove(&src->qqcl_list, lp); + src->qqcl_len--; + + tgtnum = (lp->qqln_listnum > 0) ? 
lp->qqln_listnum - 1 : 0; + + if (tgtnum != lp->qqln_listnum) + qqcache_ripple(qc, tgtnum, B_TRUE); + + lp->qqln_listnum = tgtnum; + list_insert_head(&qc->qqc_lists[tgtnum].qqcl_list, lp); + qc->qqc_lists[tgtnum].qqcl_len++; + + return (link_to_obj(qc, lp)); +} + +int +qqcache_adjust_size(qqcache_t *qc, size_t sz) +{ + if (sz < QQCACHE_MIN_SIZE) + return (EINVAL); + + qc->qqc_size = sz; + qqcache_size_lists(sz, qc->qqc_a, qc->qqc_max); + qqcache_ripple(qc, 0, B_FALSE); + return (0); +} + +int +qqcache_adjust_a(qqcache_t *qc, size_t a) +{ + if (a > 100) + return (EINVAL); + + qc->qqc_a = a; + qqcache_size_lists(qc->qqc_size, a, qc->qqc_max); + qqcache_ripple(qc, 0, B_FALSE); + return (0); +} + +size_t +qqcache_size(const qqcache_t *qc) +{ + return (qc->qqc_size); +} + +size_t +qqcache_a(const qqcache_t *qc) +{ + return (qc->qqc_a); +} + +void * +qqcache_first(qqcache_t *qc) +{ + for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) { + qqcache_list_t *l = &qc->qqc_lists[i]; + + if (l->qqcl_len > 0) + return (link_to_obj(qc, list_head(&l->qqcl_list))); + } + + return (NULL); +} + +void * +qqcache_next(qqcache_t *qc, void *obj) +{ + qqcache_link_t *lp = obj_to_link(qc, obj); + qqcache_link_t *next; + qqcache_list_t *l = QQCACHE_LIST(qc, lp); + + ASSERT3U(lp->qqln_listnum, <, QQCACHE_NUM_LISTS); + + if ((next = list_next(&l->qqcl_list, lp)) != NULL) + return (link_to_obj(qc, next)); + + for (size_t i = lp->qqln_listnum + 1; i < QQCACHE_NUM_LISTS; i++) { + l = &qc->qqc_lists[i]; + if (l->qqcl_len > 0) + return (link_to_obj(qc, list_head(&l->qqcl_list))); + } + + return (NULL); +} diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 24fdd94c11..eaf06f476c 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -28,7 +28,7 @@ # Copyright 2017 Nexenta Systems, Inc. # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright 2019 Peter Tribble. -# Copyright 2015, Joyent, Inc. All rights reserved. +# Copyright 2018 Joyent, Inc. # include $(SRC)/uts/Makefile.uts @@ -491,6 +491,8 @@ CHKHDRS= \ ptem.h \ ptms.h \ ptyvar.h \ + qqcache.h \ + qqcache_impl.h \ raidioctl.h \ ramdisk.h \ random.h \ diff --git a/usr/src/uts/common/sys/ethernet.h b/usr/src/uts/common/sys/ethernet.h index 5b9de2f2bf..f19912bfc3 100644 --- a/usr/src/uts/common/sys/ethernet.h +++ b/usr/src/uts/common/sys/ethernet.h @@ -23,6 +23,8 @@ * * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018, Joyent, Inc. */ /* @@ -139,6 +141,18 @@ struct ether_vlan_extinfo { #define ether_copy(a, b) (bcopy((caddr_t)a, (caddr_t)b, 6)) #endif +/* + * Ethernet is-zero check + */ +#if defined(__sparc) || defined(__i386) || defined(__amd64) +#define ether_is_zero(a) \ + (((short *)a)[0] == 0 && ((short *)a)[1] == 0 && ((short *)a)[2] == 0) +#else +#define ether_is_zero(a) (((uint8_t *)a)[0] == 0 && ((uint8_t *)a)[1] == 0 && \ + ((uint8_t *)a)[2] == 0 && ((uint8_t *)a)[3] == 0 && \ + ((uint8_t *)a)[4] == 0 && ((uint8_t *)a)[5] == 0) +#endif + #ifdef _KERNEL extern int localetheraddr(struct ether_addr *, struct ether_addr *); extern char *ether_sprintf(struct ether_addr *); diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h index 12d0dbca51..90f1843282 100644 --- a/usr/src/uts/common/sys/overlay.h +++ b/usr/src/uts/common/sys/overlay.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2018, Joyent, Inc. 
*/ #ifndef _SYS_OVERLAY_H @@ -40,7 +40,7 @@ extern "C" { typedef struct overlay_ioc_create { datalink_id_t oic_linkid; - uint32_t oic_filler; + uint32_t oic_dcid; uint64_t oic_vnetid; char oic_encap[MAXLINKNAMELEN]; } overlay_ioc_create_t; diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h index d638096006..de682a0397 100644 --- a/usr/src/uts/common/sys/overlay_common.h +++ b/usr/src/uts/common/sys/overlay_common.h @@ -42,7 +42,8 @@ typedef enum overlay_prop_type { OVERLAY_PROP_T_INT = 0x1, /* signed int */ OVERLAY_PROP_T_UINT, /* unsigned int */ OVERLAY_PROP_T_IP, /* sinaddr6 */ - OVERLAY_PROP_T_STRING /* OVERLAY_PROPS_SIZEMAX */ + OVERLAY_PROP_T_STRING, /* OVERLAY_PROPS_SIZEMAX */ + OVERLAY_PROP_T_ETHER /* 6-byte MAC address */ } overlay_prop_type_t; typedef enum overlay_prop_prot { diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h index 7fb8b8da1d..28e80d6d58 100644 --- a/usr/src/uts/common/sys/overlay_impl.h +++ b/usr/src/uts/common/sys/overlay_impl.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_OVERLAY_IMPL_H @@ -29,9 +29,9 @@ #include <sys/avl.h> #include <sys/ksocket.h> #include <sys/socket.h> -#include <sys/refhash.h> #include <sys/ethernet.h> #include <sys/list.h> +#include <sys/qqcache.h> #ifdef __cplusplus extern "C" { @@ -59,7 +59,7 @@ typedef struct overlay_mux { int omux_domain; /* RO: socket domain */ int omux_family; /* RO: socket family */ int omux_protocol; /* RO: socket protocol */ - struct sockaddr *omux_addr; /* RO: socket address */ + struct sockaddr *omux_addr; /* RO: socket address */ socklen_t omux_alen; /* RO: sockaddr len */ kmutex_t omux_lock; /* Protects everything below */ uint_t omux_count; /* Active instances */ @@ -81,8 +81,10 @@ typedef struct overlay_target { union { /* ott_lock */ overlay_target_point_t ott_point; struct overlay_target_dyn { - refhash_t *ott_dhash; + qqcache_t *ott_dhash; + qqcache_t *ott_l3dhash; avl_tree_t ott_tree; + avl_tree_t ott_l3tree; } ott_dyn; } ott_u; } overlay_target_t; @@ -117,6 +119,12 @@ typedef struct overlay_dev { uint64_t odd_vid; /* RO if active else odd_lock */ avl_node_t odd_muxnode; /* managed by mux */ overlay_target_t *odd_target; /* See big theory statement */ + uint32_t odd_dcid; /* RO if active else odd_lock */ + uint32_t odd_vl2sz; /* protected by odd_lock */ + uint32_t odd_vl2a; /* protected by odd_lock */ + uint32_t odd_routesz; /* protected by odd_lock */ + uint32_t odd_routea; /* protected by odd_lock */ + uint8_t odd_macaddr[ETHERADDRL]; /* RO same as odd_dcid */ char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */ } overlay_dev_t; @@ -124,25 +132,50 @@ typedef enum overlay_target_entry_flags { OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */ OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */ OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */ - OVERLAY_ENTRY_F_VALID_MASK = 0x06 + OVERLAY_ENTRY_F_ROUTER = 0x08, /* VL2 router entry */ + OVERLAY_ENTRY_F_HAS_ROUTE = 0x10, + OVERLAY_ENTRY_F_VALID_MASK = 0x1e, + OVERLAY_ENTRY_F_VL3 = 0x20, /* Is VL3 entry */ } overlay_target_entry_flags_t; -typedef struct overlay_target_entry { +struct overlay_target_entry; +typedef struct overlay_target_entry overlay_target_entry_t; + +/* + * For VL3 target entries, if we need to lock both the VL3 entry and the + * (possibly shared with multiple VL3 entries) VL2 entry, we must always + * take the VL3 lock prior to the VL2 entry lock. 
+ */ +typedef struct overlay_target_vl3 { + struct in6_addr otvl3_src; + struct in6_addr otvl3_dst; + uint16_t otvl3_src_vlan; + overlay_target_mac_t otvl3_vl2; +} overlay_target_vl3_t; + +typedef struct overlay_target_vl2 { + overlay_target_route_t otvl2_route; + overlay_target_mac_t otvl2_mac; + overlay_target_point_t otvl2_dest; +} overlay_target_vl2_t; + +struct overlay_target_entry { kmutex_t ote_lock; - refhash_link_t ote_reflink; /* hashtable link */ + qqcache_link_t ote_reflink; /* hashtable link */ avl_node_t ote_avllink; /* iteration link */ list_node_t ote_qlink; overlay_target_entry_flags_t ote_flags; /* RW: state flags */ - uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */ overlay_target_t *ote_ott; /* RO */ overlay_dev_t *ote_odd; /* RO */ - overlay_target_point_t ote_dest; /* RW: destination */ mblk_t *ote_chead; /* RW: blocked mb chain head */ mblk_t *ote_ctail; /* RW: blocked mb chain tail */ size_t ote_mbsize; /* RW: outstanding mblk size */ hrtime_t ote_vtime; /* RW: valid timestamp */ -} overlay_target_entry_t; - + union { + overlay_target_vl2_t ote_vl2; + overlay_target_vl3_t ote_vl3; + } ote_u; +}; #define OVERLAY_CTL "overlay" @@ -186,7 +219,7 @@ extern void overlay_target_free(overlay_dev_t *); #define OVERLAY_TARGET_DROP 1 #define OVERLAY_TARGET_ASYNC 2 extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *, - socklen_t *); + socklen_t *, uint64_t *); extern void overlay_target_quiesce(overlay_target_t *); extern void overlay_target_fini(void); diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h index ae92ef3532..28c0559acb 100644 --- a/usr/src/uts/common/sys/overlay_target.h +++ b/usr/src/uts/common/sys/overlay_target.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2015 Joyent, Inc. + * Copyright (c) 2018 Joyent, Inc. */ #ifndef _OVERLAY_TARGET_H @@ -29,11 +29,43 @@ extern "C" { #endif +/* + * The overlay_target_point_t structure represents the destination where + * encapsulated frames are sent. Currently supported virtualization protocols + * (i.e. vxlan) only use otp_ip and otp_port, but other methods might use + * a L2 address instead of an L3 address to represent a destination. + */ typedef struct overlay_target_point { - uint8_t otp_mac[ETHERADDRL]; struct in6_addr otp_ip; uint16_t otp_port; -} overlay_target_point_t; + uint8_t otp_mac[ETHERADDRL]; +} overlay_target_point_t __aligned(8); + +/* + * An overlay_target_mac_t represents the overlay representation of a VL2 MAC + * address. With the advent of cross-DC routing, it is possible to have + * duplicate MAC addresses in different data centers, so the data center id + * is necessary to uniquely identify a MAC address. + * + * XXX: In hindsight, using a uint16_t for the DCID might have been nicer. + */ +typedef struct overlay_target_mac { + uint32_t otm_dcid; + uint8_t otm_mac[ETHERADDRL]; +} overlay_target_mac_t; + +/* + * The overlay_target_route_t represents the fields of the packet that + * have to be modified to deliver a packet to remote (routed) destinations. + * All three values are always populated when a packet is routed, even if + * some of the overlay_target_route_t values end up being the same as the + * original values in the packet being routed. 
+ */ +typedef struct overlay_target_route { + uint64_t otr_vnet; + uint8_t otr_srcmac[ETHERADDRL]; + uint16_t otr_vlan; +} overlay_target_route_t; #define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8)) @@ -52,6 +84,7 @@ typedef struct overlay_targ_info { uint32_t oti_needs; uint64_t oti_flags; uint64_t oti_vnetid; + uint32_t oti_dcid; } overlay_targ_info_t; /* @@ -134,7 +167,7 @@ typedef struct overlay_targ_id { * * This ioctl can be used to copy data from a given request into a * user buffer. This can be used in combination with - * OVERLAY_TARG_INJECT to implemnt services such as a proxy-arp. + * OVERLAY_TARG_INJECT to implement services such as a proxy-arp. * * * OVERLAY_TARG_RESEND - overlay_targ_pkt_t @@ -152,6 +185,18 @@ typedef struct overlay_targ_id { #define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14) #define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15) +typedef struct overlay_targ_l2 { + uint8_t otl2_srcaddr[ETHERADDRL]; + uint8_t otl2_dstaddr[ETHERADDRL]; + uint32_t otl2_dsttype; + uint32_t otl2_sap; +} overlay_targ_l2_t; + +typedef struct overlay_targ_l3 { + struct in6_addr otl3_srcip; + struct in6_addr otl3_dstip; +} overlay_targ_l3_t; + typedef struct overlay_targ_lookup { uint64_t otl_dlid; uint64_t otl_reqid; @@ -159,16 +204,20 @@ typedef struct overlay_targ_lookup { uint64_t otl_vnetid; uint64_t otl_hdrsize; uint64_t otl_pktsize; - uint8_t otl_srcaddr[ETHERADDRL]; - uint8_t otl_dstaddr[ETHERADDRL]; - uint32_t otl_dsttype; - uint32_t otl_sap; + union { + overlay_targ_l2_t otlu_l2; + overlay_targ_l3_t otlu_l3; + } otl_addru; int32_t otl_vlan; + boolean_t otl_l3req; } overlay_targ_lookup_t; + typedef struct overlay_targ_resp { - uint64_t otr_reqid; - overlay_target_point_t otr_answer; + uint64_t otr_reqid; + overlay_target_route_t otr_route; /* Ignored for VL2->UL3 requests */ + overlay_target_mac_t otr_mac; /* Ignored for VL2->UL3 requests */ + overlay_target_point_t otr_answer; } overlay_targ_resp_t; typedef struct overlay_targ_pkt { @@ -255,6 +304,7 @@ typedef struct overlay_targ_list { #define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32) #define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33) #define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34) +#define OVERLAY_TARG_CACHE_REMOVE_NET (OVERLAY_TARG_IOCTL | 0x35) /* * This is a pretty arbitrary number that we're constraining ourselves to @@ -265,22 +315,36 @@ typedef struct overlay_targ_list { #define OVERLAY_TARGET_ITER_MAX 500 #define OVERLAY_TARGET_CACHE_DROP 0x01 +#define OVERLAY_TARGET_CACHE_ROUTER 0x02 typedef struct overlay_targ_cache_entry { - uint8_t otce_mac[ETHERADDRL]; + overlay_target_mac_t otce_mac; uint16_t otce_flags; overlay_target_point_t otce_dest; } overlay_targ_cache_entry_t; +typedef struct overlay_targ_cache_net_entry { + struct in6_addr otcne_src; + struct in6_addr otcne_dst; + uint16_t otcne_vlan; /* src vlan */ + uint8_t otcne_src_prefixlen; + uint8_t otcne_dst_prefixlen; +} overlay_targ_cache_net_entry_t; + typedef struct overlay_targ_cache { datalink_id_t otc_linkid; overlay_targ_cache_entry_t otc_entry; } overlay_targ_cache_t; +typedef struct overlay_targ_cache_net { + datalink_id_t otcn_linkid; + overlay_targ_cache_net_entry_t otcn_entry; +} overlay_targ_cache_net_t; + typedef struct overlay_targ_cache_iter { datalink_id_t otci_linkid; uint32_t otci_pad; - uint64_t otci_marker; + uint64_t otci_marker[2]; uint16_t otci_count; uint8_t otci_pad2[3]; overlay_targ_cache_entry_t otci_ents[]; diff --git a/usr/src/uts/common/sys/qqcache.h 
b/usr/src/uts/common/sys/qqcache.h
new file mode 100644
index 0000000000..a2244338dd
--- /dev/null
+++ b/usr/src/uts/common/sys/qqcache.h
@@ -0,0 +1,176 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+#ifndef _QQCACHE_H
+#define	_QQCACHE_H
+
+#include <sys/list.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This implements a fixed-size hash table that uses the 2Q algorithm
+ * from Johnson and Shasha to manage the contents of the entries.
+ *
+ * Briefly, there are two fixed-size lists (0 and 1). New entries are
+ * added to the head of list 1, and upon subsequent access (lookup), are
+ * moved to the head of list 0. Entries that fall off the end of list 0
+ * are pushed onto the head of list 1, and entries that fall off the end
+ * of list 1 are deleted. The split between the two lists is determined
+ * by the parameter 'a', the percentage (0-100) of the total cache size
+ * that is dedicated to list 0.
+ *
+ * This implementation generalizes the algorithm somewhat to an
+ * arbitrary number of lists (instead of just 2) via the QQCACHE_NUM_LISTS
+ * and QQCACHE_INSERT_LIST preprocessor symbols (defined in
+ * sys/qqcache_impl.h). New entries are added to list QQCACHE_INSERT_LIST
+ * and, as each list fills, the oldest entry in each list is pushed to
+ * the head of the succeeding list, and the oldest entries are removed
+ * from the cache (so each list never has more entries than its maximum
+ * size).
+ *
+ * The API itself is very similar to that of refhash. A qqcache_link_t struct
+ * is embedded within the definition of the entries that are being stored in
+ * a given qqcache_t. Functions to hash and compare the tag (key) value of
+ * an entry, as well as to destroy an entry, are supplied when the cache is
+ * created. Lookups then occur by passing a pointer to the key value
+ * being looked up.
+ *
+ * NOTE: As one can take references to entries in the cache via the
+ * qqcache_hold() function, refheld entries that are marked for deletion are
+ * not counted when tracking the cache size, and their dtor function is not
+ * called until the last reference has been released (by calling the
+ * qqcache_rele() function).
+ */
+
+typedef enum qqcache_flag {
+	QQCACHE_F_DEAD	= 0x01,
+} qqcache_flag_t;
+
+typedef struct qqcache_link {
+	list_node_t	qqln_hash_link;	/* Hash chain bucket */
+	list_node_t	qqln_list_link;	/* Cache list link */
+	uint_t		qqln_listnum;
+	uint_t		qqln_refcnt;
+	qqcache_flag_t	qqln_flags;
+} qqcache_link_t;
+
+struct qqcache;
+typedef struct qqcache qqcache_t;
+
+typedef uint64_t (*qqcache_hash_fn_t)(const void *);
+typedef int (*qqcache_cmp_fn_t)(const void *, const void *);
+typedef void (*qqcache_dtor_fn_t)(void *);
+
+/*
+ * qqcache_create(qcp, sz, a, buckets, hash_fn, cmp_fn, dtor_fn,
+ *	elsize, link_off, tag_off, flags);
+ *
+ * Creates a new 2Q cache:
+ *
+ * qqcache_t **qcp	A pointer to the pointer that will hold the new
+ *			cache.
+ *
+ * size_t sz		The size of the cache (in entries).
+ * + * size_t a The percentage (0-100) of the cache dedicated to + * MRU entries (list 0); + * + * size_t buckets The number of hash buckets in the cache. + * + * qqcache_hash_fn_t hash_fn The function used to create a + * hash value for a given entry's tag + * value. + * + * qqcache_cmp_fn_t cmp_fn The function used to compare the two + * tag values of two entries. The function + * should return '0' if the two entries + * are equal, '1' if they are not equal. + * + * qqcache_dtor_fn_t dtor_fn The function used to destroy/free + * entries. + * + * size_t elsize The size of each entry. + * + * size_t link_off The offset of the qqcache_link_t struct in the entry. + * + * size_t tag_off The offset in the entry of the tag value (used for + * hashing and comparison). + * + * int flags The flags passed to kmem_zalloc/umem_zalloc. + * + * Returns: + * 0 Success + * EINVAL A parameter was not valid + * ENOMEM The memory allocation failed (only possible when + * KM_NOSLEEP/UMEM_DEFAULT is passed to flags). + */ +extern int qqcache_create(qqcache_t **, size_t, size_t, size_t, + qqcache_hash_fn_t, qqcache_cmp_fn_t, qqcache_dtor_fn_t, + size_t, size_t, size_t, int); + +/* Destroy the given qqcache_t */ +extern void qqcache_destroy(qqcache_t *); + +/* + * qqcache_insert(qc, obj) + * + * qqcache_t *qc The cache to insert the item into. + * + * void *obj The object to add. + * + * Returns: + * 0 Success + * EEXIST The same entry (as determined by the cache cmp function) already + * exists in the cache. + */ +extern int qqcache_insert(qqcache_t *, void *); + +/* Lookup an entry with the given tag/key, or return NULL if not found */ +extern void *qqcache_lookup(qqcache_t *, const void *); + +/* Remove the given entry from the cache */ +extern void qqcache_remove(qqcache_t *, void *); + +/* Add a hold on the entry in the cache */ +extern void qqcache_hold(qqcache_t *, void *); + +/* Release the hold on the entry in the cache */ +extern void qqcache_rele(qqcache_t *, void *); + +/* + * Adjust the size and percentage of the cache for list 0. If new values are + * smaller than current values, entries may be evicted as necessary to reduce + * the size of the cache to the given size. + */ +extern int qqcache_adjust_size(qqcache_t *, size_t); +extern int qqcache_adjust_a(qqcache_t *, size_t); + +/* Return the current values of size or a. */ +extern size_t qqcache_size(const qqcache_t *); +extern size_t qqcache_a(const qqcache_t *); + +/* Iterate through entries. */ +extern void *qqcache_first(qqcache_t *); +extern void *qqcache_next(qqcache_t *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _QQCACHE_H */ diff --git a/usr/src/uts/common/sys/qqcache_impl.h b/usr/src/uts/common/sys/qqcache_impl.h new file mode 100644 index 0000000000..f709b74d6c --- /dev/null +++ b/usr/src/uts/common/sys/qqcache_impl.h @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. 
+ */
+
+#ifndef _QQCACHE_IMPL_H
+#define	_QQCACHE_IMPL_H
+
+#include <sys/debug.h>
+#include <sys/qqcache.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	QQCACHE_NUM_LISTS	2
+#define	QQCACHE_INSERT_LIST	1
+#define	QQCACHE_MIN_SIZE	10
+
+CTASSERT(QQCACHE_INSERT_LIST < QQCACHE_NUM_LISTS);
+CTASSERT(QQCACHE_NUM_LISTS >= 2);
+
+typedef struct qqcache_list {
+	list_t		qqcl_list;
+	size_t		qqcl_len;
+} qqcache_list_t;
+
+struct qqcache {
+	qqcache_hash_fn_t	qqc_hash_fn;
+	qqcache_cmp_fn_t	qqc_cmp_fn;
+	qqcache_dtor_fn_t	qqc_dtor_fn;
+	size_t			qqc_link_off;
+	size_t			qqc_tag_off;
+	size_t			qqc_nbuckets;
+	size_t			qqc_size;
+	size_t			qqc_a;
+	size_t			qqc_max[QQCACHE_NUM_LISTS];
+	qqcache_list_t		qqc_lists[QQCACHE_NUM_LISTS];
+	qqcache_list_t		qqc_buckets[];
+};
+
+#define	QQCACHE_LIST(qqc, lnk)	\
+	(&(qqc)->qqc_lists[(lnk)->qqln_listnum])
+
+#ifdef lint
+extern qqcache_link_t *obj_to_link(qqcache_t *, void *);
+extern void *link_to_obj(qqcache_t *, qqcache_link_t *);
+extern void *obj_to_tag(qqcache_t *, void *);
+#else
+#define	obj_to_link(_q, _o)	\
+	((qqcache_link_t *)(((char *)(_o)) + (_q)->qqc_link_off))
+#define	link_to_obj(_q, _l)	\
+	((void *)(((char *)(_l)) - (_q)->qqc_link_off))
+#define	obj_to_tag(_q, _o)	\
+	((void *)(((char *)(_o)) + (_q)->qqc_tag_off))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _QQCACHE_IMPL_H */
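
To make the qqcache API above concrete, here is a minimal user-space sketch of a
consumer, built against the non-_KERNEL (umem) variant of qqcache.c from this
change. The entry type demo_ent_t, the demo_* functions, and all of the sizes
are hypothetical; a real consumer would supply its own.

/*
 * Hypothetical qqcache consumer; build (for example) with:
 *	gcc -o qqdemo qqdemo.c qqcache.c -lumem
 */
#include <sys/qqcache.h>
#include <umem.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct demo_ent {
	qqcache_link_t	de_link;	/* embedded cache linkage */
	uint32_t	de_key;		/* tag used for hash/compare */
	uint32_t	de_val;
} demo_ent_t;

static uint64_t
demo_hash(const void *tp)
{
	return (*(const uint32_t *)tp);
}

static int
demo_cmp(const void *a, const void *b)
{
	return (*(const uint32_t *)a == *(const uint32_t *)b ? 0 : 1);
}

static void
demo_dtor(void *op)
{
	umem_free(op, sizeof (demo_ent_t));
}

int
main(void)
{
	qqcache_t *qc;
	demo_ent_t *ep;
	uint32_t key;

	/* 16 entries total; a = 25, so 4 go to list 0 and 12 to list 1 */
	if (qqcache_create(&qc, 16, 25, 7, demo_hash, demo_cmp, demo_dtor,
	    sizeof (demo_ent_t), offsetof(demo_ent_t, de_link),
	    offsetof(demo_ent_t, de_key), UMEM_DEFAULT) != 0)
		abort();

	for (uint32_t i = 0; i < 32; i++) {
		ep = umem_zalloc(sizeof (demo_ent_t), UMEM_DEFAULT);
		ep->de_key = i;
		ep->de_val = i * i;
		/* new entries land on the head of list 1 */
		if (qqcache_insert(qc, ep) != 0)
			demo_dtor(ep);
	}

	/* a hit moves the entry to list 0 (the MRU list) */
	key = 30;
	if ((ep = qqcache_lookup(qc, &key)) != NULL)
		(void) printf("30 -> %u\n", ep->de_val);

	/* early keys aged out of the tail of list 1 long ago */
	key = 0;
	if (qqcache_lookup(qc, &key) == NULL)
		(void) printf("0 was evicted\n");

	qqcache_destroy(qc);
	return (0);
}

Note how eviction falls out of qqcache_insert() itself: once list 1 is full,
each new insert pushes the oldest entry off its tail and runs that entry's
dtor, so the consumer never explicitly trims the cache.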

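The subnet masking that overlay_in6_to_subnet() performs above can also be
shown stand-alone. This sketch open-codes the mask byte by byte rather than
using IN6_MASK_FROM_PREFIX(); in6_to_subnet() and the addresses used are
illustrative only.

#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static void
in6_to_subnet(const struct in6_addr *src, struct in6_addr *dst,
    uint8_t prefixlen)
{
	for (int i = 0; i < 16; i++) {
		/* how many bits of this byte the prefix covers */
		int bits = (int)prefixlen - i * 8;
		uint8_t mask;

		if (bits >= 8)
			mask = 0xff;
		else if (bits <= 0)
			mask = 0;
		else
			mask = (uint8_t)(0xff << (8 - bits));

		dst->s6_addr[i] = src->s6_addr[i] & mask;
	}
}

int
main(void)
{
	struct in6_addr in, out;
	char buf[INET6_ADDRSTRLEN];

	/* ::ffff:192.168.51.50/120 -> ::ffff:192.168.51.0 */
	(void) inet_pton(AF_INET6, "::ffff:192.168.51.50", &in);
	in6_to_subnet(&in, &out, 120);
	(void) printf("%s\n", inet_ntop(AF_INET6, &out, buf, sizeof (buf)));
	return (0);
}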