diff options
49 files changed, 5318 insertions, 641 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile index 4e3dd8259a..9b11174c49 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile @@ -22,7 +22,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# Copyright (c) 2018, Joyent, Inc. +# Copyright 2018 Joyent, Inc. # PROG= snoop diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c index a0768c2234..3da8c57f44 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c @@ -322,6 +322,11 @@ do_svp_log_ack(void *data, int len) case SVP_LOG_VL3: rlen = sizeof (svp_log_vl3_t); break; +#if 0 /* XXX KEBE SAYS ROUTE */ + case SVP_LOG_ROUTE: + rlen = sizeof (svp_log_route_t); + break; +#endif default: /* * If we don't know the type of log record we have, @@ -362,6 +367,33 @@ do_svp_log_ack(void *data, int len) ntohl(u.vl3->svl3_vnetid)); u.vl3++; break; +#if 0 /* XXX KEBE SAYS ROUTE */ + case SVP_LOG_ROUTE: + show_printf("%8s Source Vnet = %u", "", + ntohl(u.vr->svlr_src_vnetid)); + show_printf("%8s Source VLAN = %hu", "", + ntohs(u.vr->svlr_src_vlan)); + + prefixlen = u.vr->svlr_src_prefixlen; + is_host = prefixlen == 128 ? B_TRUE : B_FALSE; + show_printf("%8s Source %s = %s", "", + is_host ? "address" : "subnet", + svp_addr_str(u.vr->svlr_srcip, &prefixlen)); + show_printf("%8s Destination DC id = %u", "", + ntohl(u.vr->svlr_dcid)); + show_printf("%8s Destination Vnet = %u", "", + ntohl(u.vr->svlr_dst_vnetid)); + show_printf("%8s Destination VLAN = %hu", "", + ntohs(u.vr->svlr_dst_vlan)); + + prefixlen = u.vr->svlr_dst_prefixlen; + is_host = prefixlen == 128 ? B_TRUE : B_FALSE; + show_printf("%8s Destination %s = %s", "", + is_host ? 
"address" : "subnet", + svp_addr_str(u.vr->svlr_dstip, &prefixlen)); + u.vr++; + break; +#endif } len -= rlen; @@ -423,6 +455,39 @@ do_svp_shootdown(void *data, int len) ether_ntoa((struct ether_addr *)sd->svsd_mac)); } +#if 0 /* XXX KEBE SAYS ROUTE */ +static void +do_svp_route_req(void *data, int len) +{ + svp_route_req_t *req = data; + + show_printf("Vnet = %u", ntohl(req->srr_vnetid)); + show_printf("VLAN = %hu", ntohs(req->srr_vlan)); + show_printf("Source Address = %s", svp_addr_str(req->srr_srcip, NULL)); + show_printf("Destination Address = %s", svp_addr_str(req->srr_dstip, + NULL)); +} + +static void +do_svp_route_ack(void *data, int len) +{ + svp_route_ack_t *ack = data; + + show_printf("Status = %s", svp_status_str(ntohl(ack->sra_status))); + show_printf("Remote DC Id = %u", ntohl(ack->sra_dcid)); + show_printf("Remote Vnet = %u", ntohl(ack->sra_vnetid)); + show_printf("Remote VLAN = %hu", ntohs(ack->sra_vlan)); + show_printf("Remote UL3 Address = %s", svp_addr_str(ack->sra_ip, NULL)); + show_printf("Remote UL3 Port = %hu", ntohs(ack->sra_port)); + show_printf("Source MAC Address = %s", + ether_ntoa((struct ether_addr *)ack->sra_srcmac)); + show_printf("Destination MAC Address = %s", + ether_ntoa((struct ether_addr *)ack->sra_dstmac)); + show_printf("Source IP Prefix = %hhu", ack->sra_src_pfx); + show_printf("Destination IP Prefix = %hhu", ack->sra_dst_pfx); +} +#endif + static struct svp_len_tbl { uint16_t slt_op; size_t slt_len; @@ -441,6 +506,10 @@ static struct svp_len_tbl { { SVP_R_LOG_RM, sizeof (svp_lrm_req_t) }, { SVP_R_LOG_RM_ACK, sizeof (svp_lrm_ack_t) }, { SVP_R_SHOOTDOWN, sizeof (svp_shootdown_t) }, +#if 0 /* XXX KEBE SAYS ROUTE */ + { SVP_R_ROUTE_REQ, sizeof (svp_route_req_t) }, + { SVP_R_ROUTE_ACK, sizeof (svp_route_ack_t) } +#endif }; static boolean_t @@ -548,6 +617,14 @@ interpret_svp(int flags, char *data, int fraglen) case SVP_R_SHOOTDOWN: do_svp_shootdown(req, fraglen); break; +#if 0 /* XXX KEBE SAYS ROUTE */ + case SVP_R_ROUTE_REQ: + 
do_svp_route_req(req, fraglen); + break; + case SVP_R_ROUTE_ACK: + do_svp_route_ack(req, fraglen); + break; +#endif } show_space(); diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c index c59926be94..590f693a66 100644 --- a/usr/src/cmd/dladm/dladm.c +++ b/usr/src/cmd/dladm/dladm.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2016 Nexenta Systems, Inc. * Copyright 2020 Peter Tribble. */ @@ -420,13 +420,14 @@ static cmd_t cmds[] = { " show-bridge -t [-p] [-o <field>,...] [-s [-i <interval>]]" " <bridge>\n" }, { "create-overlay", do_create_overlay, - " create-overlay [-t] -e <encap> -s <search> -v <vnetid>\n" + " create-overlay [-t] [-d <dcid>] -e <encap> -s <search> " + "-v <vnetid>\n" "\t\t [ -p <prop>=<value>[,...]] <overlay>" }, { "delete-overlay", do_delete_overlay, " delete-overlay <overlay>" }, { "modify-overlay", do_modify_overlay, - " modify-overlay -d mac | -f | -s mac=ip:port " - "<overlay>" }, + " modify-overlay -d [dcid/]mac | -f | -s [dcid/]mac=ip:port " + " | -p prop=value[,...] <overlay>" }, { "show-overlay", do_show_overlay, " show-overlay [-f | -t] [[-p] -o <field>,...] 
" "[<overlay>]\n" }, @@ -1464,12 +1465,14 @@ static const struct option overlay_create_lopts[] = { { "search", required_argument, NULL, 's' }, { "temporary", no_argument, NULL, 't' }, { "vnetid", required_argument, NULL, 'v' }, + { "dcid", required_argument, NULL, 'd' }, { NULL, 0, NULL, 0 } }; static const struct option overlay_modify_lopts[] = { { "delete-entry", required_argument, NULL, 'd' }, { "flush-table", no_argument, NULL, 'f' }, + { "prop", required_argument, NULL, 'p' }, { "set-entry", required_argument, NULL, 's' }, { NULL, 0, NULL, 0 } }; @@ -9892,15 +9895,26 @@ do_create_overlay(int argc, char *argv[], const char *use) char name[MAXLINKNAMELEN]; dladm_status_t status; uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST; + uint32_t dcid = 0; uint64_t vid; boolean_t havevid = B_FALSE; char propstr[DLADM_STRSIZE]; dladm_arg_list_t *proplist = NULL; bzero(propstr, sizeof (propstr)); - while ((opt = getopt_long(argc, argv, ":te:v:p:s:", + while ((opt = getopt_long(argc, argv, ":td:e:v:p:s:", overlay_create_lopts, NULL)) != -1) { switch (opt) { + case 'd': + errno = 0; + dcid = strtoul(optarg, &endp, 10); + if (*endp != '\0' || (dcid == 0 && errno == EINVAL)) + die("couldn't parse datacenter id: %s", + optarg); + /* XXX If we go 64-bit, add check for > UINT32_MAX. 
*/ + if (dcid == ULONG_MAX && errno == ERANGE) + die("datacenter id too large: %s", optarg); + break; case 'e': encap = optarg; break; @@ -9917,6 +9931,7 @@ do_create_overlay(int argc, char *argv[], const char *use) die("property list too long '%s'", propstr); break; case 'v': + errno = 0; vid = strtoul(optarg, &endp, 10); if (*endp != '\0' || (vid == 0 && errno == EINVAL)) die("couldn't parse virtual networkd id: %s", @@ -9959,7 +9974,7 @@ do_create_overlay(int argc, char *argv[], const char *use) != DLADM_STATUS_OK) die("invalid overlay property"); - status = dladm_overlay_create(handle, name, encap, search, vid, + status = dladm_overlay_create(handle, name, encap, search, vid, dcid, proplist, &errlist, flags); dladm_free_props(proplist); if (status != DLADM_STATUS_OK) { @@ -9989,7 +10004,7 @@ do_delete_overlay(int argc, char *argv[], const char *use) typedef struct showoverlay_state { ofmt_handle_t sho_ofmt; - const char *sho_linkname; + const char *sho_linkname; dladm_overlay_propinfo_handle_t sho_info; uint8_t sho_value[DLADM_OVERLAY_PROP_SIZEMAX]; uint32_t sho_size; @@ -10080,6 +10095,12 @@ print_overlay_value(char *outbuf, uint_t bufsize, uint_t type, const void *pbuf, case OVERLAY_PROP_T_STRING: (void) snprintf(outbuf, bufsize, "%s", pbuf); break; + case OVERLAY_PROP_T_ETHER: + if (ether_ntoa_r((struct ether_addr *)pbuf, outbuf) == NULL) { + warn("malformed overlay ethernet property\n"); + (void) snprintf(outbuf, bufsize, "--"); + } + break; default: abort(); } @@ -10428,7 +10449,7 @@ do_show_overlay(int argc, char *argv[], const char *use) int i, opt; datalink_id_t linkid = DATALINK_ALL_LINKID; dladm_status_t status; - int (*funcp)(dladm_handle_t, datalink_id_t, void *); + int (*funcp)(dladm_handle_t, datalink_id_t, void *); char *fields_str = NULL; const ofmt_field_t *fieldsp; ofmt_status_t oferr; @@ -10498,17 +10519,54 @@ do_show_overlay(int argc, char *argv[], const char *use) } static void +parse_overlay_mac(const char *s, uint32_t *dcidp, struct 
ether_addr *ep) +{ + const char *slash; + + *dcidp = 0; + + if ((slash = strchr(s, '/')) != NULL) { + ulong_t dcval = 0; + size_t slen = (size_t)(slash - s) + 1; + + /* + * If present the dcid must be at least 1 digit, and <= + * UINT32_MAX (10 digits + 1 for NUL). + */ + if (slen < 2 || slen > 11) + die("invalid mac specification: %s\n", s); + + char dcstr[slen]; + + (void) strlcpy(dcstr, s, slen); + errno = 0; + if ((dcval = strtoul(dcstr, NULL, 10)) == 0 && errno != 0) + die("invalid data center id: %s\n", dcstr); + /* XXX if we become 64-bit, check for results > UINT32_MAX */ + + *dcidp = (uint32_t)dcval; + /* Move s past '/' */ + s = slash + 1; + } + + if (ether_aton_r(s, ep) == NULL) + die("invalid mac specification: %s\n", s); +} + +static void do_modify_overlay(int argc, char *argv[], const char *use) { int opt, ocnt = 0; - boolean_t flush, set, delete; + boolean_t flush, set, delete, setprop; + uint32_t dcid = 0; struct ether_addr e; char *dest; datalink_id_t linkid = DATALINK_ALL_LINKID; dladm_status_t status; + char propstr[DLADM_STRSIZE] = { 0 }; - flush = set = delete = B_FALSE; - while ((opt = getopt_long(argc, argv, ":fd:s:", overlay_modify_lopts, + flush = set = delete = setprop = B_FALSE; + while ((opt = getopt_long(argc, argv, ":fd:p:s:", overlay_modify_lopts, NULL)) != -1) { switch (opt) { case 'd': @@ -10516,8 +10574,7 @@ do_modify_overlay(int argc, char *argv[], const char *use) die_optdup('d'); delete = B_TRUE; ocnt++; - if (ether_aton_r(optarg, &e) == NULL) - die("invalid mac address: %s\n", optarg); + parse_overlay_mac(optarg, &dcid, &e); break; case 'f': if (flush == B_TRUE) @@ -10525,6 +10582,16 @@ do_modify_overlay(int argc, char *argv[], const char *use) flush = B_TRUE; ocnt++; break; + case 'p': + if (setprop == B_TRUE) + die_optdup('p'); + setprop = B_TRUE; + (void) strlcat(propstr, optarg, DLADM_STRSIZE); + if (strlcat(propstr, ",", DLADM_STRSIZE) >= + DLADM_STRSIZE) + die("property list too long '%s'", propstr); + ocnt++; + break; 
case 's': if (set == B_TRUE) die_optdup('s'); @@ -10536,8 +10603,7 @@ do_modify_overlay(int argc, char *argv[], const char *use) if (dest == NULL) die("malformed value, expected mac=dest, " "got: %s\n", optarg); - if (ether_aton_r(optarg, &e) == NULL) - die("invalid mac address: %s\n", optarg); + parse_overlay_mac(optarg, &dcid, &e); break; default: die_opterr(optopt, opt, use); @@ -10545,9 +10611,9 @@ do_modify_overlay(int argc, char *argv[], const char *use) } if (ocnt == 0) - die("need to specify one of -d, -f, or -s"); + die("need to specify one of -d, -f, -p, or -s"); if (ocnt > 1) - die("only one of -d, -f, or -s may be used"); + die("only one of -d, -f, -p, or -s may be used"); if (argv[optind] == NULL) die("missing required overlay device\n"); @@ -10568,17 +10634,43 @@ do_modify_overlay(int argc, char *argv[], const char *use) } if (delete == B_TRUE) { - status = dladm_overlay_cache_delete(handle, linkid, &e); + status = dladm_overlay_cache_delete(handle, linkid, dcid, &e); if (status != DLADM_STATUS_OK) die_dlerr(status, "failed to flush target %s from " "overlay target cache %s", optarg, argv[optind]); } if (set == B_TRUE) { - status = dladm_overlay_cache_set(handle, linkid, &e, dest); + status = dladm_overlay_cache_set(handle, linkid, dcid, &e, + dest); if (status != DLADM_STATUS_OK) die_dlerr(status, "failed to set target %s for overlay " "target cache %s", optarg, argv[optind]); } + if (setprop == B_TRUE) { + dladm_arg_list_t *proplist = NULL; + uint_t i; + + if (dladm_parse_link_props(propstr, &proplist, B_FALSE) + != DLADM_STATUS_OK) + die("invalid overlay property"); + + for (i = 0; i < proplist->al_count; i++) { + dladm_status_t status; + + status = dladm_overlay_setprop(handle, linkid, + proplist->al_info[i].ai_name, + proplist->al_info[i].ai_val, + proplist->al_info[i].ai_count); + + if (status != DLADM_STATUS_OK) { + die_dlerr(status, "failed to set property %s " + "for overlay device %s", + proplist->al_info[i].ai_name, argv[optind]); + } + } + 
+ dladm_free_props(proplist); + } } diff --git a/usr/src/cmd/mdb/common/modules/genunix/Makefile.files b/usr/src/cmd/mdb/common/modules/genunix/Makefile.files index d371cf70fe..05ab8fe59c 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/Makefile.files +++ b/usr/src/cmd/mdb/common/modules/genunix/Makefile.files @@ -71,6 +71,7 @@ GENUNIX_SRCS = \ nvpair.c \ pci.c \ pg.c \ + qqcache.c \ rctl.c \ refhash.c \ refstr.c \ diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c index 32370ba7e1..e0f21979e9 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c +++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c @@ -98,6 +98,7 @@ #include "nvpair.h" #include "pci.h" #include "pg.h" +#include "qqcache.h" #include "rctl.h" #include "refhash.h" #include "sobj.h" @@ -4788,6 +4789,12 @@ static const mdb_walker_t walkers[] = { { "pcie_bus", "walk all pcie_bus_t's", pcie_bus_walk_init, pcie_bus_walk_step, NULL }, + /* from qqcache.c */ + { QQCACHE_WALK_NAME, QQCACHE_WALK_DESC, + qqcache_walk_init_cache, qqcache_walk_step, qqcache_walk_fini }, + { QQCACHE_HASH_WALK_NAME, QQCACHE_HASH_WALK_DESC, + qqcache_walk_init_hash, qqcache_walk_step, qqcache_walk_fini }, + /* from rctl.c */ { "rctl_dict_list", "walk all rctl_dict_entry_t's from rctl_lists", rctl_dict_walk_init, rctl_dict_walk_step, NULL }, diff --git a/usr/src/cmd/mdb/common/modules/genunix/qqcache.c b/usr/src/cmd/mdb/common/modules/genunix/qqcache.c new file mode 100644 index 0000000000..a2ba1463b9 --- /dev/null +++ b/usr/src/cmd/mdb/common/modules/genunix/qqcache.c @@ -0,0 +1,117 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. + */ + +#include <mdb/mdb_modapi.h> +#include <mdb/mdb_ctf.h> + +#include <sys/qqcache.h> +#include <sys/qqcache_impl.h> + +#include "qqcache.h" + +typedef struct qqcache_walk_data { + size_t qwd_link_off; +} qqcache_walk_data_t; + +typedef struct mdb_qqcache { + size_t qqc_link_off; + size_t qqc_nbuckets; +} mdb_qqcache_t; + +static int +qqcache_walk_init(mdb_walk_state_t *wsp, boolean_t use_hash) +{ + qqcache_walk_data_t *qwd; + uintptr_t base; + size_t i, n, qqc_list_sz; + int cache_off, bucket_off, list_off; + mdb_qqcache_t qc; + + /* mdb_ctf_offsetof_by_name will print any errors */ + cache_off = mdb_ctf_offsetof_by_name("qqcache_t", "qqc_lists"); + if (cache_off == -1) + return (WALK_ERR); + + bucket_off = mdb_ctf_offsetof_by_name("qqcache_t", "qqc_buckets"); + if (bucket_off == -1) + return (WALK_ERR); + + list_off = mdb_ctf_offsetof_by_name("qqcache_list_t", "qqcl_list"); + if (list_off == -1) + return (WALK_ERR); + + /* mdb_ctf_sizeof_by_name will print any errors */ + qqc_list_sz = mdb_ctf_sizeof_by_name("qqcache_list_t"); + if (qqc_list_sz == -1) + return (WALK_ERR); + + if (mdb_ctf_vread(&qc, "qqcache_t", "mdb_qqcache_t", wsp->walk_addr, + 0) == -1) { + mdb_warn("failed to read qqcache_t at %#lx", wsp->walk_addr); + return (WALK_ERR); + } + + qwd = wsp->walk_data = mdb_zalloc(sizeof (*qwd), UM_SLEEP); + qwd->qwd_link_off = qc.qqc_link_off; + + if (use_hash) { + base = wsp->walk_addr + bucket_off; + n = qc.qqc_nbuckets; + } else { + base = wsp->walk_addr + cache_off; + n = QQCACHE_NUM_LISTS; + } + + for (i = 0; i < n; i++) { + wsp->walk_addr = base + i * qqc_list_sz + list_off; + + if (mdb_layered_walk("list", wsp) == -1) { + mdb_warn("can't walk qqcache_t"); + mdb_free(qwd, sizeof (*qwd)); + return (WALK_ERR); + } + } + + return (WALK_NEXT); +} + +int +qqcache_walk_init_cache(mdb_walk_state_t *wsp) +{ + return 
(qqcache_walk_init(wsp, B_FALSE)); +} + +int +qqcache_walk_init_hash(mdb_walk_state_t *wsp) +{ + return (qqcache_walk_init(wsp, B_TRUE)); +} + +int +qqcache_walk_step(mdb_walk_state_t *wsp) +{ + qqcache_walk_data_t *qwd = wsp->walk_data; + uintptr_t addr = wsp->walk_addr - qwd->qwd_link_off; + + return (wsp->walk_callback(addr, wsp->walk_layer, wsp->walk_cbdata)); +} + +void +qqcache_walk_fini(mdb_walk_state_t *wsp) +{ + qqcache_walk_data_t *qwd = wsp->walk_data; + + mdb_free(qwd, sizeof (*qwd)); +} diff --git a/usr/src/cmd/mdb/common/modules/genunix/qqcache.h b/usr/src/cmd/mdb/common/modules/genunix/qqcache.h new file mode 100644 index 0000000000..c0d1d14fe6 --- /dev/null +++ b/usr/src/cmd/mdb/common/modules/genunix/qqcache.h @@ -0,0 +1,40 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent Inc. 
+ */ + +#ifndef _MDB_QQCACHE_H +#define _MDB_QQCACHE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define QQCACHE_WALK_NAME "qqcache" +#define QQCACHE_WALK_DESC "walk a qqcache (2Q cache)" + +#define QQCACHE_HASH_WALK_NAME "qqhash" +#define QQCACHE_HASH_WALK_DESC "walk a qqcache (2Q cache) via the hash buckets" + +struct mdb_walk_state; + +extern int qqcache_walk_init_cache(struct mdb_walk_state *); +extern int qqcache_walk_init_hash(struct mdb_walk_state *); +extern int qqcache_walk_step(struct mdb_walk_state *); +extern void qqcache_walk_fini(struct mdb_walk_state *); + +#ifdef __cplusplus +} +#endif + +#endif /* _MDB_QQCACHE_H */ diff --git a/usr/src/lib/libdladm/common/libdloverlay.c b/usr/src/lib/libdladm/common/libdloverlay.c index a83105b91c..db58da0a34 100644 --- a/usr/src/lib/libdladm/common/libdloverlay.c +++ b/usr/src/lib/libdladm/common/libdloverlay.c @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2015 Joyent, Inc. + * Copyright (c) 2018 Joyent, Inc. */ #include <libdladm_impl.h> @@ -127,6 +127,11 @@ dladm_overlay_parse_prop(overlay_prop_type_t type, void *buf, uint32_t *sizep, bcopy(&ipv6, buf, sizeof (struct in6_addr)); *sizep = sizeof (struct in6_addr); break; + case OVERLAY_PROP_T_ETHER: + if (ether_aton_r(val, (struct ether_addr *)buf) == NULL) + return (DLADM_STATUS_BADARG); + *sizep = ETHERADDRL; + break; default: abort(); } @@ -203,16 +208,16 @@ dladm_overlay_setprop(dladm_handle_t handle, datalink_id_t linkid, prop.oip_linkid = linkid; prop.oip_id = info.oipi_id; prop.oip_name[0] = '\0'; - if ((ret = dladm_overlay_parse_prop(info.oipi_type, prop.oip_value, + if ((status = dladm_overlay_parse_prop(info.oipi_type, prop.oip_value, &prop.oip_size, valp[0])) != DLADM_STATUS_OK) - return (ret); + return (status); status = DLADM_STATUS_OK; ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_SETPROP, &prop); if (ret != 0) status = dladm_errno2status(errno); - return (ret); + return (status); } /* @@ -475,7 +480,7 @@ dladm_overlay_walk_prop(dladm_handle_t 
handle, datalink_id_t linkid, dladm_status_t dladm_overlay_create(dladm_handle_t handle, const char *name, - const char *encap, const char *search, uint64_t vid, + const char *encap, const char *search, uint64_t vid, uint32_t dcid, dladm_arg_list_t *props, dladm_errlist_t *errs, uint32_t flags) { int ret, i; @@ -495,6 +500,7 @@ dladm_overlay_create(dladm_handle_t handle, const char *name, bzero(&oic, sizeof (oic)); oic.oic_linkid = linkid; oic.oic_vnetid = vid; + oic.oic_dcid = dcid; (void) strlcpy(oic.oic_encap, encap, MAXLINKNAMELEN); status = DLADM_STATUS_OK; @@ -542,8 +548,7 @@ dladm_overlay_create(dladm_handle_t handle, const char *name, return (dladm_errno2status(ret)); } - if ((ret = libvarpd_c_instance_create(vch, linkid, search, - &id)) != 0) { + if ((ret = libvarpd_c_instance_create(vch, linkid, search, &id)) != 0) { (void) dladm_errlist_append(errs, "failed to create varpd instance: %s", strerror(ret)); libvarpd_c_destroy(vch); @@ -708,7 +713,7 @@ dladm_overlay_cache_flush(dladm_handle_t handle, datalink_id_t linkid) /* ARGSUSED */ dladm_status_t dladm_overlay_cache_delete(dladm_handle_t handle, datalink_id_t linkid, - const struct ether_addr *key) + uint32_t dcid, const struct ether_addr *key) { int ret; uint64_t varpdid; @@ -722,7 +727,7 @@ dladm_overlay_cache_delete(dladm_handle_t handle, datalink_id_t linkid, return (dladm_errno2status(ret)); } - ret = libvarpd_c_instance_cache_delete(chdl, varpdid, key); + ret = libvarpd_c_instance_cache_delete(chdl, varpdid, dcid, key); libvarpd_c_destroy(chdl); return (dladm_errno2status(ret)); @@ -731,7 +736,7 @@ dladm_overlay_cache_delete(dladm_handle_t handle, datalink_id_t linkid, /* ARGSUSED */ dladm_status_t dladm_overlay_cache_set(dladm_handle_t handle, datalink_id_t linkid, - const struct ether_addr *key, char *val) + uint32_t dcid, const struct ether_addr *key, char *val) { int ret; uint_t dest; @@ -836,7 +841,7 @@ dladm_overlay_cache_set(dladm_handle_t handle, datalink_id_t linkid, } send: - ret = 
libvarpd_c_instance_cache_set(chdl, varpdid, key, &vcp); + ret = libvarpd_c_instance_cache_set(chdl, varpdid, dcid, key, &vcp); libvarpd_c_destroy(chdl); return (dladm_errno2status(ret)); diff --git a/usr/src/lib/libdladm/common/libdloverlay.h b/usr/src/lib/libdladm/common/libdloverlay.h index 39b01ccae3..e058cb7349 100644 --- a/usr/src/lib/libdladm/common/libdloverlay.h +++ b/usr/src/lib/libdladm/common/libdloverlay.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2015 Joyent, Inc. + * Copyright (c) 2018 Joyent, Inc. */ #ifndef _LIBDLOVERLAY_H @@ -45,8 +45,8 @@ typedef struct dladm_overlay_status { } dladm_overlay_status_t; extern dladm_status_t dladm_overlay_create(dladm_handle_t, const char *, - const char *, const char *, uint64_t, dladm_arg_list_t *, dladm_errlist_t *, - uint32_t); + const char *, const char *, uint64_t, uint32_t, dladm_arg_list_t *, + dladm_errlist_t *, uint32_t); extern dladm_status_t dladm_overlay_delete(dladm_handle_t, datalink_id_t); typedef void (*dladm_overlay_status_f)(dladm_handle_t, datalink_id_t, @@ -56,9 +56,9 @@ extern dladm_status_t dladm_overlay_status(dladm_handle_t, datalink_id_t, extern dladm_status_t dladm_overlay_cache_flush(dladm_handle_t, datalink_id_t); extern dladm_status_t dladm_overlay_cache_delete(dladm_handle_t, datalink_id_t, - const struct ether_addr *); + uint32_t, const struct ether_addr *); extern dladm_status_t dladm_overlay_cache_set(dladm_handle_t, datalink_id_t, - const struct ether_addr *, char *); + uint32_t, const struct ether_addr *, char *); extern dladm_status_t dladm_overlay_cache_get(dladm_handle_t, datalink_id_t, const struct ether_addr *, dladm_overlay_point_t *); @@ -72,6 +72,8 @@ extern dladm_status_t dladm_overlay_prop_info(dladm_overlay_propinfo_handle_t, const mac_propval_range_t **); extern dladm_status_t dladm_overlay_get_prop(dladm_handle_t, datalink_id_t, dladm_overlay_propinfo_handle_t, void *buf, size_t *bufsize); +extern dladm_status_t dladm_overlay_setprop(dladm_handle_t, datalink_id_t, + 
const char *, char *const *, uint_t); typedef int (*dladm_overlay_prop_f)(dladm_handle_t, datalink_id_t, dladm_overlay_propinfo_handle_t, void *); diff --git a/usr/src/lib/libdladm/common/mapfile-vers b/usr/src/lib/libdladm/common/mapfile-vers index 589bbf5330..3b595920f7 100644 --- a/usr/src/lib/libdladm/common/mapfile-vers +++ b/usr/src/lib/libdladm/common/mapfile-vers @@ -281,6 +281,7 @@ SYMBOL_VERSION SUNWprivate_1.1 { dladm_overlay_status; dladm_overlay_prop_info; dladm_overlay_get_prop; + dladm_overlay_setprop; dladm_overlay_walk_prop; dladm_overlay_cache_set; diff --git a/usr/src/lib/varpd/files/Makefile.com b/usr/src/lib/varpd/files/Makefile.com index dd8009d002..79a06e7d56 100644 --- a/usr/src/lib/varpd/files/Makefile.com +++ b/usr/src/lib/varpd/files/Makefile.com @@ -23,6 +23,7 @@ include ../../Makefile.plugin LIBS = $(DYNLIB) LDLIBS += -lc -lumem -lnvpair -lsocket -lcustr +LDLIBS += -lcmdutils -lavl -lbunyan CPPFLAGS += -I../common LINTFLAGS += -erroff=E_BAD_PTR_CAST_ALIGN diff --git a/usr/src/lib/varpd/files/common/libvarpd_files.c b/usr/src/lib/varpd/files/common/libvarpd_files.c index 812919a07d..90ef1c34ce 100644 --- a/usr/src/lib/varpd/files/common/libvarpd_files.c +++ b/usr/src/lib/varpd/files/common/libvarpd_files.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2018, Joyent, Inc. */ /* @@ -28,7 +28,7 @@ * The plug-in only has a single property, which is the location of the JSON * file. The JSON file itself looks something like: * - * { + * { * "aa:bb:cc:dd:ee:ff": { * "arp": "10.23.69.1", * "ndp": "2600:3c00::f03c:91ff:fe96:a264", @@ -36,7 +36,42 @@ * "port": 8080 * }, * ... - * } + * + * "local-subnet1": { + * "prefix": "192.168.1.0/24", + * "vlan": 123 + * }, + * ... + * + * "remote-subnet1": { + * "dcid": 11223344, + * "prefix": "10.21.10.0/24", + * "vnet": 5340123, + * "vlan": 789, + * "routermac": "12:34:56:78:aa:bb", + * "macs": { + * "aa:bb:cc:dd:ee:ff": { + * "arp": "192.168.50.22", + * ... 
+ * } + * } + * }, + * ... + * "attach-group1": [ + * "remote-subnet1", + * "remote-subnet2", + * "local-subnet1", + * ... + * ], + * ... + * + * Entries for performing VL3 routing (local-, remote-, and attach-) must + * all start with their respective prefixes (local-, remote-, or attach-) to + * identify the type of entry. Names of entries are limited to + * FABRIC_NAME_MAX-1 characters. + * + * NOTE: This isn't very sophisticated, so attachment entries need to appear + * after the entries referenced in it. */ #include <libvarpd_provider.h> @@ -47,32 +82,264 @@ #include <strings.h> #include <assert.h> #include <limits.h> +#include <sys/avl.h> +#include <sys/debug.h> +#include <sys/list.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <libnvpair.h> +#include <stddef.h> #include <unistd.h> #include <sys/mman.h> #include <sys/ethernet.h> #include <sys/socket.h> +#include <sys/vlan.h> #include <netinet/in.h> #include <arpa/inet.h> #include <libvarpd_files_json.h> +#define FABRIC_NAME_MAX 64 +struct varpd_files_attach; +typedef struct varpd_files_attach varpd_files_attach_t; + +typedef struct varpd_files_fabric { + avl_node_t vafs_avlnode; + list_node_t vafs_attached_node; + varpd_files_attach_t *vafs_attach; + char vafs_name[FABRIC_NAME_MAX]; + struct in6_addr vafs_addr; + uint64_t vafs_vnet; + uint32_t vafs_dcid; + uint16_t vafs_vlan; + uint8_t vafs_prefixlen; + uint8_t vafs_routermac[ETHERADDRL]; +} varpd_files_fabric_t; + +struct varpd_files_attach { + list_node_t vfa_node; + char vfa_name[FABRIC_NAME_MAX]; + list_t vfa_fabrics; +}; + +typedef struct varpd_files_if { + avl_node_t vfi_macnode; + avl_node_t vfi_ipnode; + avl_node_t vfi_ndpnode; + struct in6_addr vfi_ip; + struct in6_addr vfi_llocalip; /* IPv6 link local if specified */ + uint64_t vfi_vnet; + uint32_t vfi_dcid; + uint16_t vfi_vlan; + uint8_t vfi_mac[ETHERADDRL]; + uint8_t vfi_dhcp[ETHERADDRL]; /* dhcp-proxy MAC address */ + boolean_t vfi_has_dhcp; + boolean_t 
vfi_has_lladdr; + overlay_target_point_t vfi_dest; +} varpd_files_if_t; + typedef struct varpd_files { overlay_plugin_dest_t vaf_dest; /* RO */ varpd_provider_handle_t *vaf_hdl; /* RO */ char *vaf_path; /* WO */ - nvlist_t *vaf_nvl; /* WO */ uint64_t vaf_nmisses; /* Atomic */ uint64_t vaf_narp; /* Atomic */ + + /* These hold varpd_files_fabric_t's */ + avl_tree_t vaf_fabrics; /* WO */ + list_t vaf_attached; /* WO */ + + /* These hold varpd_files_if_t */ + avl_tree_t vaf_macs; /* WO */ + avl_tree_t vaf_ips; /* WO */ + avl_tree_t vaf_ndp; /* WO */ + + uint64_t vaf_vnet; /* RO */ + uint32_t vaf_dcid; /* RO */ } varpd_files_t; static const char *varpd_files_props[] = { "files/config" }; +static bunyan_logger_t *files_bunyan; + +/* + * Try to convert a string to an IP address or IP address + prefix. We first + * try to convert as an IPv6 address, and if that fails, we try to convert as + * an IPv4 address and then wrap it in an IPv6 address. + * + * To parse an address+prefix length (e.g. 192.168.0.1/24), prefixlen must be + * non-NULL. If prefixlen is not NULL and a lone address is supplied, + * *prefixlen will be set to 128. If prefixlen is NULL, only a lone address + * can be successfully parsed. + * + * Note: if this is a wrapped IPv4 address with a prefix, *prefixlen is adjusted + * to reflect the value as an IPv6 address, e.g. 192.168.1.0/24 will have a + * prefixlen of 120 (96 + 24). + * + */ +static int +str_to_ip(const char *s, struct in6_addr *v6, uint8_t *prefixlen) +{ + const char *slash; /* he is real */ + char addrstr[INET6_ADDRSTRLEN] = { 0 }; + size_t addrlen; + boolean_t is_v4 = B_FALSE; + + slash = strchr(s, '/'); + + if (prefixlen != NULL) { + addrlen = (slash != NULL) ? 
(size_t)(slash - s) : strlen(s); + } else { + if (slash != NULL) + return (EINVAL); + addrlen = strlen(s); + } + + if (addrlen >= sizeof (addrstr)) /* keep room for the NUL */ + return (EINVAL); + + bcopy(s, addrstr, addrlen); + + if (inet_pton(AF_INET6, addrstr, v6) != 1) { + uint32_t v4; + + if (inet_pton(AF_INET, addrstr, &v4) != 1) + return (EINVAL); + + IN6_IPADDR_TO_V4MAPPED(v4, v6); + is_v4 = B_TRUE; + } + + if (prefixlen != NULL) { + if (slash == NULL) { + *prefixlen = is_v4 ? 32 : 128; + } else { + unsigned long mask = 0; + + errno = 0; + mask = strtoul(slash + 1, NULL, 10); + if (errno != 0) + return (EINVAL); + + if (is_v4) { + if (mask > 32) + return (EINVAL); + mask += 96; + } + + if (mask > 128) + return (EINVAL); + + *prefixlen = (uint8_t)mask; + } + } + + return (0); +} + +static int +varpd_files_if_mac_avl(const void *a, const void *b) +{ + const varpd_files_if_t *l = a; + const varpd_files_if_t *r = b; + int i; + + if (l->vfi_dcid < r->vfi_dcid) + return (-1); + if (l->vfi_dcid > r->vfi_dcid) + return (1); + + for (i = 0; i < ETHERADDRL; i++) { + if (l->vfi_mac[i] < r->vfi_mac[i]) + return (-1); + if (l->vfi_mac[i] > r->vfi_mac[i]) + return (1); + } + + return (0); +} + +static int +varpd_files_if_ip_avl(const void *a, const void *b) +{ + const varpd_files_if_t *l = a; + const varpd_files_if_t *r = b; + int i; + + if (l->vfi_vnet < r->vfi_vnet) + return (-1); + if (l->vfi_vnet > r->vfi_vnet) + return (1); + if (l->vfi_vlan < r->vfi_vlan) + return (-1); + if (l->vfi_vlan > r->vfi_vlan) + return (1); + for (i = 0; i < sizeof (struct in6_addr); i++) { + if (l->vfi_ip.s6_addr[i] < r->vfi_ip.s6_addr[i]) + return (-1); + if (l->vfi_ip.s6_addr[i] > r->vfi_ip.s6_addr[i]) + return (1); + } + return (0); +} + +static int +varpd_files_if_ndp_avl(const void *a, const void *b) +{ + const varpd_files_if_t *l = a; + const varpd_files_if_t *r = b; + int i; + + VERIFY(l->vfi_has_lladdr); + VERIFY(r->vfi_has_lladdr); + + for (i = 0; i < sizeof (struct in6_addr); i++) { + if 
(l->vfi_llocalip.s6_addr[i] < r->vfi_llocalip.s6_addr[i]) + return (-1); + if (l->vfi_llocalip.s6_addr[i] > r->vfi_llocalip.s6_addr[i]) + return (1); + } + return (0); +} + +static int +varpd_files_fabric_avl(const void *a, const void *b) +{ + const varpd_files_fabric_t *l = a; + const varpd_files_fabric_t *r = b; + int i; + + /* + * Sort by dcid, vnet, vlan, subnet. With subnet last, we can use + * avl_nearest() to find the fabric for an IP (given the other pieces + * of information). + */ + if (l->vafs_dcid < r->vafs_dcid) + return (-1); + if (l->vafs_dcid > r->vafs_dcid) + return (1); + if (l->vafs_vnet < r->vafs_vnet) + return (-1); + if (l->vafs_vnet > r->vafs_vnet) + return (1); + if (l->vafs_vlan < r->vafs_vlan) + return (-1); + if (l->vafs_vlan > r->vafs_vlan) + return (1); + + for (i = 0; i < sizeof (struct in6_addr); i++) { + if (l->vafs_addr.s6_addr[i] < r->vafs_addr.s6_addr[i]) + return (-1); + if (l->vafs_addr.s6_addr[i] > r->vafs_addr.s6_addr[i]) + return (1); + } + + return (0); +} + static boolean_t varpd_files_valid_dest(overlay_plugin_dest_t dest) { @@ -94,64 +361,674 @@ varpd_files_create(varpd_provider_handle_t *hdl, void **outp, if (varpd_files_valid_dest(dest) == B_FALSE) return (ENOTSUP); - vaf = umem_alloc(sizeof (varpd_files_t), UMEM_DEFAULT); + vaf = umem_zalloc(sizeof (varpd_files_t), UMEM_DEFAULT); if (vaf == NULL) return (ENOMEM); - bzero(vaf, sizeof (varpd_files_t)); vaf->vaf_dest = dest; - vaf->vaf_path = NULL; - vaf->vaf_nvl = NULL; vaf->vaf_hdl = hdl; + vaf->vaf_dcid = libvarpd_plugin_dcid(hdl); + vaf->vaf_vnet = libvarpd_plugin_vnetid(hdl); + avl_create(&vaf->vaf_macs, varpd_files_if_mac_avl, + sizeof (varpd_files_if_t), offsetof(varpd_files_if_t, vfi_macnode)); + avl_create(&vaf->vaf_ips, varpd_files_if_ip_avl, + sizeof (varpd_files_if_t), offsetof(varpd_files_if_t, vfi_ipnode)); + avl_create(&vaf->vaf_ndp, varpd_files_if_ndp_avl, + sizeof (varpd_files_if_t), offsetof(varpd_files_if_t, vfi_ndpnode)); + 
avl_create(&vaf->vaf_fabrics, varpd_files_fabric_avl, + sizeof (varpd_files_fabric_t), + offsetof(varpd_files_fabric_t, vafs_avlnode)); + list_create(&vaf->vaf_attached, sizeof (varpd_files_attach_t), + offsetof(varpd_files_attach_t, vfa_node)); *outp = vaf; return (0); } +static varpd_files_fabric_t * +varpd_files_fabric_getbyname(varpd_files_t *vaf, const char *name) +{ + varpd_files_fabric_t *fab = NULL; + + for (fab = avl_first(&vaf->vaf_fabrics); fab != NULL; + fab = AVL_NEXT(&vaf->vaf_fabrics, fab)) { + if (strcmp(fab->vafs_name, name) == 0) + return (fab); + } + + return (NULL); +} + static int -varpd_files_normalize_nvlist(varpd_files_t *vaf, nvlist_t *nvl) +varpd_files_convert_attached(varpd_files_t *vaf, nvlist_t *att) { + nvlist_t *nvl = NULL; + nvpair_t *nvp = NULL; int ret; - nvlist_t *out; - nvpair_t *pair; - if ((ret = nvlist_alloc(&out, NV_UNIQUE_NAME, 0)) != 0) + while ((nvp = nvlist_next_nvpair(att, nvp)) != NULL) { + varpd_files_attach_t *att; + char **nets = NULL; + uint32_t i, n; + + if (nvpair_type(nvp) != DATA_TYPE_NVLIST) { + (void) bunyan_error(files_bunyan, + "attached fabric group value is not an nvlist", + BUNYAN_T_STRING, "group", nvpair_name(nvp), + BUNYAN_T_END); + return (EINVAL); + } + + if ((ret = nvpair_value_nvlist(nvp, &nvl)) != 0) { + (void) bunyan_error(files_bunyan, + "unexpected error retrieving attached fabric group", + BUNYAN_T_STRING, "group", nvpair_name(nvp), + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_END); + return (EINVAL); + } + + if ((ret = nvlist_lookup_boolean(nvl, ".__json_array")) != 0) { + (void) bunyan_error(files_bunyan, + "group value does not appear to be a JSON array", + BUNYAN_T_STRING, "group", nvpair_name(nvp), + BUNYAN_T_END); + return (EINVAL); + } + + if ((ret = nvlist_lookup_uint32(nvl, "length", &n)) != 0) { + (void) bunyan_error(files_bunyan, + "unexpected error obtain group array length", + BUNYAN_T_STRING, "group", nvpair_name(nvp), + BUNYAN_T_STRING, "errmsg", strerror(ret), + 
BUNYAN_T_END); + return (ret); + } + + if ((nets = calloc(n, sizeof (char *))) == NULL) { + (void) bunyan_error(files_bunyan, + "out of memory", BUNYAN_T_END); + return (ENOMEM); + } + + /* + * Note, we are just storing references to the names in + * nets, so we only need to call free(nets), and not on + * each entry (e.g. free(nets[0])). We strlcpy() it out, + * so we don't need to worry about it going away before we + * done with it. + */ + for (i = 0; i < n; i++) { + char buf[11]; /* largest uint32_t val + NUL */ + + (void) snprintf(buf, sizeof (buf), "%u", i); + ret = nvlist_lookup_string(nvl, buf, &nets[i]); + if (ret != 0) { + (void) bunyan_error(files_bunyan, + "unexpected error lookup up group array " + "value", + BUNYAN_T_STRING, "group", nvpair_name(nvp), + BUNYAN_T_UINT32, "index", i, + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_END); + free(nets); + return (ret); + } + } + + if ((att = umem_zalloc(sizeof (*att), UMEM_DEFAULT)) == NULL) { + (void) bunyan_error(files_bunyan, "out of memory", + BUNYAN_T_END); + free(nets); + return (ENOMEM); + } + + if (strlcpy(att->vfa_name, nvpair_name(nvp), + sizeof (att->vfa_name)) >= sizeof (att->vfa_name)) { + (void) bunyan_error(files_bunyan, + "attached fabric group name is too long", + BUNYAN_T_STRING, "group", nvpair_name(nvp), + BUNYAN_T_UINT32, "len", + (uint32_t)strlen(nvpair_name(nvp)), + BUNYAN_T_UINT32, "maxlen", + (uint32_t)sizeof (att->vfa_name) - 1, + BUNYAN_T_END); + umem_free(att, sizeof (*att)); + free(nets); + return (EOVERFLOW); + } + + list_create(&att->vfa_fabrics, sizeof (varpd_files_fabric_t), + offsetof(varpd_files_fabric_t, vafs_attached_node)); + + list_insert_tail(&vaf->vaf_attached, att); + + for (i = 0; i < n; i++) { + varpd_files_fabric_t *fab; + + fab = varpd_files_fabric_getbyname(vaf, nets[i]); + if (fab == NULL) { + (void) bunyan_error(files_bunyan, + "subnet name not found", + BUNYAN_T_STRING, "subnet", nets[i], + BUNYAN_T_STRING, "group", nvpair_name(nvp), + BUNYAN_T_END); 
+ free(nets); + return (ENOENT); + } + + if (fab->vafs_attach != NULL) { + (void) bunyan_error(files_bunyan, + "subnet already attached to another group", + BUNYAN_T_STRING, "subnet", nets[i], + BUNYAN_T_STRING, "group", nvpair_name(nvp), + BUNYAN_T_STRING, "existing_group", + fab->vafs_attach->vfa_name, + BUNYAN_T_END); + free(nets); + return (EBUSY); + } + + fab->vafs_attach = att; + list_insert_tail(&att->vfa_fabrics, fab); + } + free(nets); + } + + return (0); +} + +static int +varpd_files_convert_fabrics(varpd_files_t *vaf, nvpair_t *fpair) +{ + nvlist_t *nvl = NULL; + nvpair_t *nvp = NULL; + int ret; + + ASSERT(strcmp(nvpair_name(fpair), "fabrics") == 0); + + if (nvpair_type(fpair) != DATA_TYPE_NVLIST) { + (void) bunyan_error(files_bunyan, + "'fabrics' value is not an nvlist", BUNYAN_T_END); + return (EINVAL); + } + + if ((ret = nvpair_value_nvlist(fpair, &nvl)) != 0) { + (void) bunyan_error(files_bunyan, + "unexpected error reading value of 'fabrics'", + BUNYAN_T_STRING, "errmsg", strerror(errno), + BUNYAN_T_END); return (ret); + } - for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; - pair = nvlist_next_nvpair(nvl, pair)) { - char *name, fname[ETHERADDRSTRL]; - nvlist_t *data; - struct ether_addr ether, *e; - e = ðer; + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + struct in6_addr ip = { 0 }; + varpd_files_fabric_t *fab = NULL; + varpd_files_if_t *vl2 = NULL; + nvlist_t *vnvl = NULL; + int32_t i32; + char *s; + + if (strcmp(nvpair_name(nvp), "attached-fabrics") == 0) { + if (nvpair_type(nvp) != DATA_TYPE_NVLIST) { + (void) bunyan_error(files_bunyan, + "'attached-fabrics' value is not an nvlist", + BUNYAN_T_END); + return (EINVAL); + } + + if ((ret = nvpair_value_nvlist(nvp, &vnvl)) != 0) { + (void) bunyan_error(files_bunyan, + "unexpected error in 'attached-fabrics' " + "value", + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_END); + return (ret); + } + ret = varpd_files_convert_attached(vaf, vnvl); + if (ret != 0) { + return (ret); 
+ } + continue; + } - if (nvpair_type(pair) != DATA_TYPE_NVLIST) { - nvlist_free(out); + if (nvpair_type(nvp) != DATA_TYPE_NVLIST) { + (void) bunyan_error(files_bunyan, + "subnet value is not an nvlist", + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); return (EINVAL); } - name = nvpair_name(pair); - if ((ret = nvpair_value_nvlist(pair, &data)) != 0) { - nvlist_free(out); + if ((ret = nvpair_value_nvlist(nvp, &vnvl)) != 0) { + (void) bunyan_error(files_bunyan, + "unexpected error reading subnet value", + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + return (ret); + } + + if ((fab = umem_zalloc(sizeof (*fab), UMEM_DEFAULT)) == NULL) { + (void) bunyan_error(files_bunyan, "out of memory", + BUNYAN_T_END); + return (ENOMEM); + } + /* Default to our vid if none is given */ + fab->vafs_vnet = vaf->vaf_vnet; + + if (strlcpy(fab->vafs_name, nvpair_name(nvp), + sizeof (fab->vafs_name)) >= sizeof (fab->vafs_name)) { + (void) bunyan_error(files_bunyan, + "subnet name is too long", + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_UINT32, "length", + (uint32_t)strlen(nvpair_name(nvp)), + BUNYAN_T_UINT32, "maxlen", + (uint32_t)sizeof (fab->vafs_name) - 1, + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (EOVERFLOW); + } + + if ((ret = nvlist_lookup_string(vnvl, "prefix", &s)) != 0) { + (void) bunyan_error(files_bunyan, + "'prefix' value is missing from subnet", + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); return (EINVAL); } + if ((ret = str_to_ip(s, &fab->vafs_addr, + &fab->vafs_prefixlen)) != 0) { + (void) bunyan_error(files_bunyan, + "prefix value is not valid", + BUNYAN_T_STRING, "prefix", s, + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (ret); + } + /* XXX: Make sure it's the subnet address */ + + if ((ret = nvlist_lookup_int32(vnvl, "vlan", &i32)) != 0) { + (void) bunyan_error(files_bunyan, + "'vlan' value is 
missing", + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (EINVAL); + } + if (i32 < 0 || i32 > VLAN_ID_MAX) { + (void) bunyan_error(files_bunyan, + "vlan value is out of range (0-4094)", + BUNYAN_T_INT32, "vlan", i32, + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (ERANGE); + } + fab->vafs_vlan = (uint16_t)i32; + + if ((ret = nvlist_lookup_string(vnvl, "routerip", &s)) != 0) { + (void) bunyan_error(files_bunyan, + "'routerip' value is missing", + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (EINVAL); + } + if ((ret = str_to_ip(s, &ip, NULL)) != 0) { + (void) bunyan_error(files_bunyan, + "'routerip' value is not an IP", + BUNYAN_T_STRING, "routerip", s, + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (ret); + } - if (ether_aton_r(name, e) == NULL) { - nvlist_free(out); + if ((ret = nvlist_lookup_string(vnvl, "routermac", &s)) != 0) { + (void) bunyan_error(files_bunyan, + "'routermac' value is missing from subnet", + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (EINVAL); + } + if (ether_aton_r(s, + (struct ether_addr *)fab->vafs_routermac) == NULL) { + (void) bunyan_error(files_bunyan, + "'routermac' is not a valid MAC address", + BUNYAN_T_STRING, "mac", s, + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); return (EINVAL); } - if (ether_ntoa_r(e, fname) == NULL) { - nvlist_free(out); + /* + * XXX: Because of the quirks of javascript, representing + * integers > INT32_MAX in json becomes dicey. Should we + * just use a string instead? 
+ */ + switch (ret = nvlist_lookup_int32(vnvl, "dcid", &i32)) { + case 0: + fab->vafs_dcid = (uint32_t)i32; + break; + case ENOENT: + fab->vafs_dcid = vaf->vaf_dcid; + break; + default: + (void) bunyan_error(files_bunyan, + "unexpected error processing 'dcid' value", + BUNYAN_T_STRING, "errmsg", strerror(errno), + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (ret); + } + + switch (ret = nvlist_lookup_string(vnvl, "vid", &s)) { + case ENOENT: + fab->vafs_vnet = vaf->vaf_vnet; + break; + case 0: + errno = 0; + if ((fab->vafs_vnet = strtoul(s, NULL, 10)) != 0 || + errno == 0) + break; + ret = errno; + (void) bunyan_error(files_bunyan, + "unable to parse 'vid' as a number", + BUNYAN_T_STRING, "vid", s, + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (ret); + default: + (void) bunyan_error(files_bunyan, + "unexpected error processing 'vid' value", + BUNYAN_T_STRING, "errmsg", strerror(errno), + BUNYAN_T_STRING, "subnet", nvpair_name(nvp), + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (ret); + } + + /* Make sure router ip is in subnet */ + if (!IN6_ARE_PREFIXEDADDR_EQUAL(&ip, &fab->vafs_addr, + fab->vafs_prefixlen)) { + void *ipp = &fab->vafs_addr; + bunyan_type_t type = + IN6_IS_ADDR_V4MAPPED(&fab->vafs_addr) ? + BUNYAN_T_IP : BUNYAN_T_IP6; + + (void) bunyan_error(files_bunyan, + "'routerip' value is not within subnet", + type, "routerip", ipp, + BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); + return (EINVAL); + } + + /* + * Add VL2 entry for overlay router on this fabric. + * Use umem_zalloc so vl2->vfi_dest (UL3 address) is all zeros. 
+ */ + if ((vl2 = umem_zalloc(sizeof (*vl2), UMEM_DEFAULT)) == NULL) { + (void) bunyan_error(files_bunyan, + "out of memory", BUNYAN_T_END); + umem_free(fab, sizeof (*fab)); return (ENOMEM); } - if ((ret = nvlist_add_nvlist(out, fname, data)) != 0) { - nvlist_free(out); + bcopy(&ip, &vl2->vfi_ip, sizeof (struct in6_addr)); + bcopy(fab->vafs_routermac, vl2->vfi_mac, ETHERADDRL); + vl2->vfi_dcid = fab->vafs_dcid; + vl2->vfi_vnet = fab->vafs_vnet; + vl2->vfi_vlan = fab->vafs_vlan; + avl_add(&vaf->vaf_macs, vl2); + avl_add(&vaf->vaf_ips, vl2); + + avl_add(&vaf->vaf_fabrics, fab); + } + + return (0); +} + +static int +varpd_files_convert_nvlist(varpd_files_t *vaf, nvlist_t *data, uint_t level) +{ + nvpair_t *nvp = NULL; + nvlist_t *nvl = NULL; + char *name; + int ret; + + while ((nvp = nvlist_next_nvpair(data, nvp)) != NULL) { + varpd_files_if_t *ifp = NULL; + char *s; + int32_t i32; + + name = nvpair_name(nvp); + + (void) bunyan_debug(files_bunyan, "processing key", + BUNYAN_T_STRING, "key", name, + BUNYAN_T_END); + + if (nvpair_type(nvp) != DATA_TYPE_NVLIST) { + (void) bunyan_error(files_bunyan, + "value is not a hash (nvlist)", + BUNYAN_T_STRING, "key", name, + BUNYAN_T_END); + return (EINVAL); + } + + if ((ret = nvpair_value_nvlist(nvp, &nvl)) != 0) { + (void) bunyan_error(files_bunyan, + "unexpected error reading values for mac entry", + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_END); + return (ret); + } + + if (strcmp(name, "fabrics") == 0) { + if (level > 0) { + (void) bunyan_error(files_bunyan, + "'fabrics' can only appear at the top-most " + "level", BUNYAN_T_END); + return (EINVAL); + } + ret = varpd_files_convert_fabrics(vaf, nvp); + if (ret != 0) { + return (ret); + } + continue; + } + + if ((ifp = umem_zalloc(sizeof (*ifp), UMEM_DEFAULT)) == NULL) { + (void) bunyan_error(files_bunyan, + "out of memory", BUNYAN_T_END); + return (ENOMEM); + } + ifp->vfi_dcid = vaf->vaf_dcid; + + struct ether_addr *ep = (struct 
ether_addr *)ifp->vfi_mac; + if (ether_aton_r(name, ep) == NULL) { + (void) bunyan_error(files_bunyan, "invalid MAC address", + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); return (EINVAL); } + + if ((ret = nvlist_lookup_int32(nvl, "vlan", &i32)) != 0) { + (void) bunyan_error(files_bunyan, + "'vlan' entry is missing", + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + if (i32 < 0 || i32 > VLAN_ID_MAX) { + (void) bunyan_error(files_bunyan, + "vlan value is out of range (0-4094)", + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_INT32, "vlan", i32, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ERANGE); + } + ifp->vfi_vlan = (uint16_t)i32; + + if ((ret = nvlist_lookup_string(nvl, "arp", &s)) != 0) { + (void) bunyan_error(files_bunyan, + "'arp' entry is missing", + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + if ((ret = str_to_ip(s, &ifp->vfi_ip, NULL)) != 0) { + (void) bunyan_error(files_bunyan, + "'arp' value is not an IP address", + BUNYAN_T_STRING, "arp", s, + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + + if ((ret = nvlist_lookup_string(nvl, "ip", &s)) != 0) { + (void) bunyan_error(files_bunyan, + "'ip' entry is missing", + BUNYAN_T_STRING, "ip", s, + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + if ((ret = str_to_ip(s, &ifp->vfi_dest.otp_ip, NULL)) != 0) { + (void) bunyan_error(files_bunyan, + "'ip' value is not a IP address", + BUNYAN_T_STRING, "ip", s, + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + + if (vaf->vaf_dest & OVERLAY_PLUGIN_D_PORT) { + ret = nvlist_lookup_int32(nvl, "port", &i32); + if (ret != 0) { + (void) bunyan_error(files_bunyan, + "'port' value is required, but is missing", + 
BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + + if (i32 <= 0 || i32 > UINT16_MAX) { + (void) bunyan_error(files_bunyan, + "'port' value is out of range (0-65535)", + BUNYAN_T_INT32, "port", i32, + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ERANGE); + } + ifp->vfi_dest.otp_port = i32; + } + + switch (ret = nvlist_lookup_string(nvl, "ndp", &s)) { + case 0: + ret = str_to_ip(s, &ifp->vfi_llocalip, NULL); + if (ret != 0) { + (void) bunyan_error(files_bunyan, + "'ndp' value is not an IP", + BUNYAN_T_STRING, "ndp", s, + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + /* ifp is not in any tree yet; free it here. */ + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + ifp->vfi_has_lladdr = B_TRUE; + break; + case ENOENT: + /* Ok if missing */ + break; + default: + /* The nvlist lookup returns the error in ret; errno is not set. */ + (void) bunyan_error(files_bunyan, + "unexpected error processing 'ndp' value", + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + + switch (ret = nvlist_lookup_string(nvl, "dhcp-proxy", &s)) { + case 0: + ep = (struct ether_addr *)&ifp->vfi_dhcp; + if (ether_aton_r(s, ep) == NULL) { + (void) bunyan_error(files_bunyan, + "value of 'dhcp-proxy' is not a " + "MAC address", + BUNYAN_T_STRING, "dhcp-proxy", s, + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (EINVAL); + } + ifp->vfi_has_dhcp = B_TRUE; + break; + case ENOENT: + /* Ok if missing */ + break; + default: + (void) bunyan_error(files_bunyan, + "unexpected error reading 'dhcp-proxy' value", + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + + switch (ret = nvlist_lookup_string(nvl, "vid", &s)) { + case ENOENT: + ifp->vfi_vnet = vaf->vaf_vnet; + break; + case 0: + errno = 0; + if ((ifp->vfi_vnet = strtoul(s, NULL, 10)) != 0 || + errno == 0) + break; + ret = errno; + (void) bunyan_error(files_bunyan, +
"unable to parse 'vid' as a number", + BUNYAN_T_STRING, "vid", s, + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + default: + (void) bunyan_error(files_bunyan, + "unexpected error processing 'vid' value", + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_STRING, "mac", name, + BUNYAN_T_END); + umem_free(ifp, sizeof (*ifp)); + return (ret); + } + + /* + * Index the entry by MAC and by IP; local entries that have a + * link-local address are also indexed for NDP lookups. + */ + avl_add(&vaf->vaf_macs, ifp); + avl_add(&vaf->vaf_ips, ifp); + if (ifp->vfi_has_lladdr && (ifp->vfi_dcid == vaf->vaf_dcid)) + avl_add(&vaf->vaf_ndp, ifp); } - vaf->vaf_nvl = out; return (0); } @@ -163,17 +1040,29 @@ varpd_files_start(void *arg) struct stat st; nvlist_t *nvl; varpd_files_t *vaf = arg; + nvlist_parse_json_error_t jerr = { 0 }; if (vaf->vaf_path == NULL) return (EAGAIN); - if ((fd = open(vaf->vaf_path, O_RDONLY)) < 0) + if ((fd = open(vaf->vaf_path, O_RDONLY)) < 0) { + /* Save errno before logging has a chance to overwrite it. */ + ret = errno; + (void) bunyan_error(files_bunyan, + "Cannot read destination data", + BUNYAN_T_STRING, "path", vaf->vaf_path, + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_END); - return (errno); + return (ret); + } if (fstat(fd, &st) != 0) { ret = errno; if (close(fd) != 0) abort(); + (void) bunyan_error(files_bunyan, + "could not determine status of file (stat(2) failed)", + BUNYAN_T_STRING, "path", vaf->vaf_path, + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_END); return (ret); } @@ -183,15 +1072,30 @@ varpd_files_start(void *arg) ret = errno; if (close(fd) != 0) abort(); + /* Report the saved mmap(2) error; close(2) may have changed errno. */ + (void) bunyan_error(files_bunyan, + "could not load destination data (mmap(2) failed)", + BUNYAN_T_STRING, "path", vaf->vaf_path, + BUNYAN_T_STRING, "errmsg", strerror(ret), + BUNYAN_T_END); return (ret); } - ret = nvlist_parse_json(maddr, st.st_size, &nvl, - NVJSON_FORCE_INTEGER, NULL); - if (ret == 0) { - ret = varpd_files_normalize_nvlist(vaf, nvl); + if ((ret = nvlist_parse_json(maddr, st.st_size, &nvl, + NVJSON_FORCE_INTEGER, &jerr)) != 0) { + (void) bunyan_error(files_bunyan, +
"could not parse destination JSON file", + BUNYAN_T_STRING, "path", vaf->vaf_path, + BUNYAN_T_STRING, "parse_msg", jerr.nje_message, + BUNYAN_T_UINT32, "pos", (uint32_t)jerr.nje_pos, + BUNYAN_T_INT32, "errno", (int32_t)jerr.nje_errno, + BUNYAN_T_STRING, "errmsg", strerror(jerr.nje_errno), + BUNYAN_T_END); + } else { + ret = varpd_files_convert_nvlist(vaf, nvl, 0); nvlist_free(nvl); + nvl = NULL; } + if (munmap(maddr, st.st_size) != 0) abort(); if (close(fd) != 0) @@ -204,9 +1108,38 @@ static void varpd_files_stop(void *arg) { varpd_files_t *vaf = arg; + varpd_files_if_t *vif; + varpd_files_attach_t *att; + varpd_files_fabric_t *fab; + + /* + * VL2 data appears in the IP and MAC trees, and also in the NDP tree + * when it has a link-local address, so free an entry only after it + * has been removed from every tree. + */ + while ((vif = avl_first(&vaf->vaf_ndp)) != NULL) + avl_remove(&vaf->vaf_ndp, vif); + + while ((vif = avl_first(&vaf->vaf_ips)) != NULL) + avl_remove(&vaf->vaf_ips, vif); + + while ((vif = avl_first(&vaf->vaf_macs)) != NULL) { + avl_remove(&vaf->vaf_macs, vif); + umem_free(vif, sizeof (*vif)); + } - nvlist_free(vaf->vaf_nvl); - vaf->vaf_nvl = NULL; + /* + * A fabric could be unattached, and not appear in any attachment + * group. Therefore, remove the fabrics from all the attached groups, + * then free them after removing from the global list of fabrics.
+ */ + while ((att = list_remove_head(&vaf->vaf_attached)) != NULL) { + do { + fab = list_remove_head(&att->vfa_fabrics); + } while (fab != NULL); + umem_free(att, sizeof (*att)); + } + + while ((fab = avl_first(&vaf->vaf_fabrics)) != NULL) { + avl_remove(&vaf->vaf_fabrics, fab); + umem_free(fab, sizeof (*fab)); + } } static void @@ -214,114 +1147,199 @@ varpd_files_destroy(void *arg) { varpd_files_t *vaf = arg; - assert(vaf->vaf_nvl == NULL); if (vaf->vaf_path != NULL) { umem_free(vaf->vaf_path, strlen(vaf->vaf_path) + 1); vaf->vaf_path = NULL; } + + /* Tear down every tree/list that varpd_files_create() set up. */ + avl_destroy(&vaf->vaf_fabrics); + avl_destroy(&vaf->vaf_macs); + avl_destroy(&vaf->vaf_ips); + avl_destroy(&vaf->vaf_ndp); + list_destroy(&vaf->vaf_attached); + umem_free(vaf, sizeof (varpd_files_t)); } -static void -varpd_files_lookup(void *arg, varpd_query_handle_t *qh, - const overlay_targ_lookup_t *otl, overlay_target_point_t *otp) +static varpd_files_fabric_t * +varpd_files_find_dstfab(varpd_files_t *vaf, varpd_files_attach_t *att, + const struct in6_addr *dst) { - char macstr[ETHERADDRSTRL], *ipstr; - nvlist_t *nvl; - varpd_files_t *vaf = arg; - int32_t port; - static const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + varpd_files_fabric_t *net = NULL; - /* We don't support a default */ - if (otl == NULL) { - libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); - return; + for (net = list_head(&att->vfa_fabrics); net != NULL; + net = list_next(&att->vfa_fabrics, net)) { + if (IN6_ARE_PREFIXEDADDR_EQUAL(dst, &net->vafs_addr, + net->vafs_prefixlen)) { + return (net); + } } - if (otl->otl_sap == ETHERTYPE_ARP) { - libvarpd_plugin_proxy_arp(vaf->vaf_hdl, qh, otl); - return; - } + return (NULL); +} - if (otl->otl_sap == ETHERTYPE_IPV6 && - otl->otl_dstaddr[0] == 0x33 && - otl->otl_dstaddr[1] == 0x33) { - libvarpd_plugin_proxy_ndp(vaf->vaf_hdl, qh, otl); - return; - } +static varpd_files_attach_t * +varpd_files_find_attach(varpd_files_t *vaf, const struct in6_addr *src, + uint16_t vlan, overlay_target_route_t *otr) +{ +
varpd_files_fabric_t *fab; + varpd_files_fabric_t lookup = { + .vafs_vnet = vaf->vaf_vnet, + .vafs_dcid = vaf->vaf_dcid, + .vafs_vlan = vlan, + .vafs_addr = *src + }; + avl_index_t where = 0; - if (otl->otl_sap == ETHERTYPE_IP && - bcmp(otl->otl_dstaddr, bcast, ETHERADDRL) == 0) { - char *mac; - struct ether_addr a, *addr; + /* + * Since fabrics are sorted by subnet address last, any given IP + * potentially in a fabric subnet should lie between two adjacent + * fabric entries in the tree. Find where such an IP would go in + * the tree, and the entry before the insertion point should be the + * fabric (if it is present). + */ + fab = avl_find(&vaf->vaf_fabrics, &lookup, &where); + if (fab != NULL) { + /* + * Someone requested the subnet address. E.g. if the fabric + * is 192.168.10.0/24, someone asked for 192.168.10.0. Treat + * as not found. + */ + return (NULL); + } - addr = &a; - if (ether_ntoa_r((struct ether_addr *)otl->otl_srcaddr, - macstr) == NULL) { - libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); - return; - } + fab = avl_nearest(&vaf->vaf_fabrics, where, AVL_BEFORE); + if (fab == NULL) { + return (NULL); + } + + /* + * dcid, vnet and vlan sort ahead of the subnet address, so the + * preceding entry may belong to a different dcid/vnet/vlan. Such an + * entry is not a match even if its prefix happens to cover src. + */ + if (fab->vafs_dcid != lookup.vafs_dcid || + fab->vafs_vnet != lookup.vafs_vnet || + fab->vafs_vlan != lookup.vafs_vlan) { + return (NULL); + } - if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) { - libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); - return; - } + /* Still must verify that the address lies in the range of the subnet */ + if (!IN6_ARE_PREFIXEDADDR_EQUAL(&fab->vafs_addr, src, + fab->vafs_prefixlen)) { + return (NULL); + } - if (nvlist_lookup_string(nvl, "dhcp-proxy", &mac) != 0) { - libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); - return; - } + return (fab->vafs_attach); +} - if (ether_aton_r(mac, addr) == NULL) { - libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); - return; - } +static void +varpd_files_lookup_l3(varpd_files_t *vaf, varpd_query_handle_t *qh, + const overlay_targ_lookup_t *otl, overlay_target_point_t *otp, + overlay_target_route_t *otr, overlay_target_mac_t *otm) +{ + const struct in6_addr *dst_ip; + const struct in6_addr *src_ip; +
varpd_files_attach_t *attach = NULL; + varpd_files_fabric_t *fab = NULL; + varpd_files_if_t *ifp = NULL; - libvarpd_plugin_proxy_dhcp(vaf->vaf_hdl, qh, otl); - return; - } + dst_ip = &otl->otl_addru.otlu_l3.otl3_dstip; + src_ip = &otl->otl_addru.otlu_l3.otl3_srcip; - if (ether_ntoa_r((struct ether_addr *)otl->otl_dstaddr, - macstr) == NULL) { + if ((attach = varpd_files_find_attach(vaf, src_ip, otl->otl_vlan, + otr)) == NULL) { libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); return; } - if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) { + if ((fab = varpd_files_find_dstfab(vaf, attach, dst_ip)) == NULL) { libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); return; } - if (nvlist_lookup_int32(nvl, "port", &port) != 0) { + varpd_files_if_t lookup = { 0 }; + + lookup.vfi_vnet = fab->vafs_vnet; + lookup.vfi_vlan = fab->vafs_vlan; + bcopy(dst_ip, &lookup.vfi_ip, sizeof (struct in6_addr)); + + if ((ifp = avl_find(&vaf->vaf_ips, &lookup, NULL)) == NULL) { libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); return; } - if (port <= 0 || port > UINT16_MAX) { + otr->otr_vnet = fab->vafs_vnet; + otr->otr_vlan = fab->vafs_vlan; + bcopy(fab->vafs_routermac, otr->otr_srcmac, ETHERADDRL); + + otm->otm_dcid = fab->vafs_dcid; + bcopy(ifp->vfi_mac, otm->otm_mac, ETHERADDRL); + + bcopy(&ifp->vfi_dest, otp, sizeof (*otp)); + + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_OK); +} + +static void +varpd_files_lookup(void *arg, varpd_query_handle_t *qh, + const overlay_targ_lookup_t *otl, overlay_target_point_t *otp, + overlay_target_route_t *otr, overlay_target_mac_t *otm) +{ + varpd_files_t *vaf = arg; + static const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + varpd_files_if_t *ifp = NULL; + varpd_files_if_t lookup = { .vfi_dcid = vaf->vaf_dcid }; + + + /* We don't support a default */ + if (otl == NULL) { libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); return; } - otp->otp_port = port; - if (nvlist_lookup_string(nvl, "ip", &ipstr) != 0) { - 
libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + /* + * Shuffle off L3 lookups to their own codepath. + */ + if (otl->otl_l3req) { + varpd_files_lookup_l3(vaf, qh, otl, otp, otr, otm); return; } /* - * Try to parse it as a v6 address and then if it's not, try to - * transform it into a v4 address which we'll then wrap it into a v4 - * mapped address. + * At this point, the traditional overlay_target_point_t is all that + * needs filling in. Zero-out the otr and otm for safety. */ - if (inet_pton(AF_INET6, ipstr, &otp->otp_ip) != 1) { - uint32_t v4; - if (inet_pton(AF_INET, ipstr, &v4) != 1) { + bzero(otr, sizeof (*otr)); + bzero(otm, sizeof (*otm)); + + if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_ARP) { + libvarpd_plugin_proxy_arp(vaf->vaf_hdl, qh, otl); + return; + } + + if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_IPV6 && + otl->otl_addru.otlu_l2.otl2_dstaddr[0] == 0x33 && + otl->otl_addru.otlu_l2.otl2_dstaddr[1] == 0x33) { + libvarpd_plugin_proxy_ndp(vaf->vaf_hdl, qh, otl); + return; + } + + if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_IP && + bcmp(otl->otl_addru.otlu_l2.otl2_dstaddr, bcast, ETHERADDRL) == 0) { + bcopy(otl->otl_addru.otlu_l2.otl2_srcaddr, lookup.vfi_mac, + ETHERADDRL); + + if ((ifp = avl_find(&vaf->vaf_macs, &lookup, NULL)) == NULL) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + if (!ifp->vfi_has_dhcp) { libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); return; } - IN6_IPADDR_TO_V4MAPPED(v4, &otp->otp_ip); + + libvarpd_plugin_proxy_dhcp(vaf->vaf_hdl, qh, otl); + return; + } + + bcopy(otl->otl_addru.otlu_l2.otl2_dstaddr, lookup.vfi_mac, ETHERADDRL); + if ((ifp = avl_find(&vaf->vaf_macs, &lookup, NULL)) == NULL) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; } + bcopy(&ifp->vfi_dest, otp, sizeof (*otp)); + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_OK); } @@ -344,6 +1362,7 @@ varpd_files_propinfo(void *arg, uint_t propid, varpd_prop_handle_t *vph) libvarpd_prop_set_prot(vph, 
OVERLAY_PROP_PERM_RRW); libvarpd_prop_set_type(vph, OVERLAY_PROP_T_STRING); libvarpd_prop_set_nodefault(vph); + return (0); } @@ -361,7 +1380,6 @@ varpd_files_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep) return (EOVERFLOW); *sizep = len; (void) strlcpy(buf, vaf->vaf_path, *sizep); - } else { *sizep = 0; } @@ -457,12 +1475,17 @@ varpd_files_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, static void varpd_files_proxy_arp(void *arg, varpd_arp_handle_t *vah, int kind, - const struct sockaddr *sock, uint8_t *out) + const struct sockaddr *sock, uint16_t vlan, uint8_t *out) { varpd_files_t *vaf = arg; const struct sockaddr_in *ip; const struct sockaddr_in6 *ip6; - nvpair_t *pair; + varpd_files_if_t *ifp = NULL; + varpd_files_if_t lookup = { + .vfi_vnet = vaf->vaf_vnet, + .vfi_dcid = vaf->vaf_dcid, + .vfi_vlan = vlan + }; if (kind != VARPD_QTYPE_ETHERNET) { libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP); @@ -476,56 +1499,23 @@ varpd_files_proxy_arp(void *arg, varpd_arp_handle_t *vah, int kind, ip = (const struct sockaddr_in *)sock; ip6 = (const struct sockaddr_in6 *)sock; - for (pair = nvlist_next_nvpair(vaf->vaf_nvl, NULL); pair != NULL; - pair = nvlist_next_nvpair(vaf->vaf_nvl, pair)) { - char *mac, *ipstr; - nvlist_t *data; - struct in_addr ia; - struct in6_addr ia6; - struct ether_addr ether, *e; - e = ðer; - - if (nvpair_type(pair) != DATA_TYPE_NVLIST) - continue; - - mac = nvpair_name(pair); - if (nvpair_value_nvlist(pair, &data) != 0) - continue; - - - if (sock->sa_family == AF_INET) { - if (nvlist_lookup_string(data, "arp", &ipstr) != 0) - continue; - if (inet_pton(AF_INET, ipstr, &ia) != 1) - continue; - - if (bcmp(&ia, &ip->sin_addr, - sizeof (struct in_addr)) != 0) - continue; - } else { - if (nvlist_lookup_string(data, "ndp", &ipstr) != 0) - continue; - - if (inet_pton(AF_INET6, ipstr, &ia6) != 1) - continue; - - if (bcmp(&ia6, &ip6->sin6_addr, - sizeof (struct in6_addr)) != 0) - continue; - } - - if (ether_aton_r(mac, e) == NULL) 
{ - libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP); - return; - } + if (sock->sa_family == AF_INET) { + IN6_IPADDR_TO_V4MAPPED(ip->sin_addr.s_addr, &lookup.vfi_ip); + ifp = avl_find(&vaf->vaf_ips, &lookup, NULL); + } else { + bcopy(&ip6->sin6_addr, &lookup.vfi_llocalip, + sizeof (struct in6_addr)); + /* + * varpd_files_if_ndp_avl() VERIFYs vfi_has_lladdr on both + * operands, including the search key. + */ + lookup.vfi_has_lladdr = B_TRUE; + ifp = avl_find(&vaf->vaf_ndp, &lookup, NULL); + } - bcopy(e, out, ETHERADDRL); - libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_OK); + if (ifp == NULL) { + libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP); return; } - libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP); + bcopy(ifp->vfi_mac, out, ETHERADDRL); + libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_OK); } static void @@ -533,38 +1523,28 @@ varpd_files_proxy_dhcp(void *arg, varpd_dhcp_handle_t *vdh, int type, const overlay_targ_lookup_t *otl, uint8_t *out) { varpd_files_t *vaf = arg; - nvlist_t *nvl; - char macstr[ETHERADDRSTRL], *mac; - struct ether_addr a, *addr; + varpd_files_if_t *ifp = NULL; + varpd_files_if_t lookup = { + .vfi_dcid = vaf->vaf_dcid + }; - addr = &a; if (type != VARPD_QTYPE_ETHERNET) { libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); return; } + /* + * Copy the whole source MAC into the search key; a scalar initializer + * would only set the first byte of the array. + */ + bcopy(otl->otl_addru.otlu_l2.otl2_srcaddr, lookup.vfi_mac, + ETHERADDRL); + - if (ether_ntoa_r((struct ether_addr *)otl->otl_srcaddr, - macstr) == NULL) { + if ((ifp = avl_find(&vaf->vaf_macs, &lookup, NULL)) == NULL) { libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); return; } - if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) { + if (!ifp->vfi_has_dhcp) { libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); return; } - if (nvlist_lookup_string(nvl, "dhcp-proxy", &mac) != 0) { - libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); - return; - } - - if (ether_aton_r(mac, addr) == NULL) { - libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); - return; - } - - bcopy(addr, out, ETHERADDRL); + bcopy(ifp->vfi_dhcp, out, ETHERADDRL); libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_OK); } @@ -586,6 +1566,27 @@ static const varpd_plugin_ops_t varpd_files_ops = {
varpd_files_proxy_dhcp }; +static int +files_bunyan_init(void) +{ + int ret; + + if ((ret = bunyan_init("files", &files_bunyan)) != 0) + return (ret); + ret = bunyan_stream_add(files_bunyan, "stderr", BUNYAN_L_INFO, + bunyan_stream_fd, (void *)STDERR_FILENO); + if (ret != 0) + bunyan_fini(files_bunyan); + return (ret); +} + +static void +files_bunyan_fini(void) +{ + if (files_bunyan != NULL) + bunyan_fini(files_bunyan); +} + #pragma init(varpd_files_init) static void varpd_files_init(void) @@ -593,9 +1594,14 @@ varpd_files_init(void) int err; varpd_plugin_register_t *vpr; + if (files_bunyan_init() != 0) + return; + vpr = libvarpd_plugin_alloc(VARPD_CURRENT_VERSION, &err); - if (vpr == NULL) + if (vpr == NULL) { + files_bunyan_fini(); return; + } vpr->vpr_mode = OVERLAY_TARGET_DYNAMIC; vpr->vpr_name = "files"; diff --git a/usr/src/lib/varpd/libvarpd/Makefile b/usr/src/lib/varpd/libvarpd/Makefile index 2a4f8f070c..7fb91078e3 100644 --- a/usr/src/lib/varpd/libvarpd/Makefile +++ b/usr/src/lib/varpd/libvarpd/Makefile @@ -29,7 +29,8 @@ TYPELIST = \ varpd_persist_header_t \ overlay_targ_cache_entry_t \ overlay_targ_cache_t \ - overlay_targ_cache_iter_t + overlay_targ_cache_iter_t \ + overlay_targ_resp_t all := TARGET = all clean := TARGET = clean diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd.c b/usr/src/lib/varpd/libvarpd/common/libvarpd.c index e4460089cc..9de3602e62 100644 --- a/usr/src/lib/varpd/libvarpd/common/libvarpd.c +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd.c @@ -150,6 +150,7 @@ libvarpd_instance_create(varpd_handle_t *vhp, datalink_id_t linkid, varpd_instance_t *inst, lookup; overlay_plugin_dest_t dest; uint64_t vid; + uint32_t dcid; /* * We should really have our own errnos. 
@@ -158,7 +159,8 @@ libvarpd_instance_create(varpd_handle_t *vhp, datalink_id_t linkid, if (plugin == NULL) return (ENOENT); - if ((ret = libvarpd_overlay_info(vip, linkid, &dest, NULL, &vid)) != 0) + if ((ret = libvarpd_overlay_info(vip, linkid, &dest, NULL, &vid, + &dcid)) != 0) return (ret); inst = umem_alloc(sizeof (varpd_instance_t), UMEM_DEFAULT); @@ -175,6 +177,7 @@ libvarpd_instance_create(varpd_handle_t *vhp, datalink_id_t linkid, inst->vri_dest = dest; inst->vri_plugin = plugin; inst->vri_impl = vip; + inst->vri_dcid = dcid; inst->vri_flags = 0; if ((ret = plugin->vpp_ops->vpo_create((varpd_provider_handle_t *)inst, &inst->vri_private, dest)) != 0) { @@ -217,6 +220,13 @@ libvarpd_plugin_vnetid(varpd_provider_handle_t *vhp) return (inst->vri_vnetid); } +uint32_t +libvarpd_plugin_dcid(varpd_provider_handle_t *vhp) +{ + varpd_instance_t *inst = (varpd_instance_t *)vhp; + return (inst->vri_dcid); +} + varpd_instance_handle_t * libvarpd_instance_lookup(varpd_handle_t *vhp, uint64_t id) { diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c index df69207fe0..a32889e8a2 100644 --- a/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c @@ -43,7 +43,7 @@ typedef struct varpd_arp_query { varpd_query_handle_t *vaq_query; const overlay_targ_lookup_t *vaq_otl; ip6_t *vaq_ip6; - nd_neighbor_solicit_t *vaq_ns; + nd_neighbor_solicit_t *vaq_ns; } varpd_arp_query_t; typedef struct varpd_dhcp_query { @@ -75,7 +75,7 @@ libvarpd_plugin_proxy_arp(varpd_provider_handle_t *hdl, } vaq->vaq_bsize = sizeof (vaq->vaq_buf); - if (otl->otl_sap != ETHERTYPE_ARP) { + if (otl->otl_addru.otlu_l2.otl2_sap != ETHERTYPE_ARP) { libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); umem_free(vaq, sizeof (varpd_arp_query_t)); return; @@ -151,7 +151,7 @@ libvarpd_plugin_proxy_arp(varpd_provider_handle_t *hdl, inst->vri_plugin->vpp_ops->vpo_arp(inst->vri_private, (varpd_arp_handle_t 
*)vaq, VARPD_QTYPE_ETHERNET, - (struct sockaddr *)ip, vaq->vaq_lookup); + (struct sockaddr *)ip, otl->otl_vlan, vaq->vaq_lookup); } static void @@ -248,8 +248,8 @@ libvarpd_plugin_proxy_ndp(varpd_provider_handle_t *hdl, } vaq->vaq_bsize = sizeof (vaq->vaq_buf); - if (otl->otl_dstaddr[0] != 0x33 || - otl->otl_dstaddr[1] != 0x33) { + if (otl->otl_addru.otlu_l2.otl2_dstaddr[0] != 0x33 || + otl->otl_addru.otlu_l2.otl2_dstaddr[1] != 0x33) { libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); umem_free(vaq, sizeof (varpd_arp_query_t)); return; @@ -388,7 +388,7 @@ libvarpd_plugin_proxy_ndp(varpd_provider_handle_t *hdl, vaq->vaq_ip6 = v6hdr; inst->vri_plugin->vpp_ops->vpo_arp(inst->vri_private, (varpd_arp_handle_t *)vaq, VARPD_QTYPE_ETHERNET, - (struct sockaddr *)s6, vaq->vaq_lookup); + (struct sockaddr *)s6, otl->otl_vlan, vaq->vaq_lookup); } static void @@ -505,13 +505,14 @@ libvarpd_plugin_proxy_dhcp(varpd_provider_handle_t *hdl, } vdq->vdq_bsize = sizeof (vdq->vdq_buf); - if (otl->otl_sap != ETHERTYPE_IP) { + if (otl->otl_addru.otlu_l2.otl2_sap != ETHERTYPE_IP) { libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); umem_free(vdq, sizeof (varpd_dhcp_query_t)); return; } - if (bcmp(otl->otl_dstaddr, libvarpd_arp_bcast, ETHERADDRL) != 0) { + if (bcmp(otl->otl_addru.otlu_l2.otl2_dstaddr, libvarpd_arp_bcast, + ETHERADDRL) != 0) { libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); umem_free(vdq, sizeof (varpd_dhcp_query_t)); return; diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c index 18e220259c..c6cc812dcf 100644 --- a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c @@ -476,7 +476,7 @@ libvarpd_c_instance_cache_flush(varpd_client_handle_t *chp, uint64_t cid) int libvarpd_c_instance_cache_delete(varpd_client_handle_t *chp, uint64_t cid, - const struct ether_addr *key) + uint32_t dcid, const struct ether_addr *key) { int ret; 
varpd_client_arg_t carg; @@ -489,6 +489,7 @@ libvarpd_c_instance_cache_delete(varpd_client_handle_t *chp, uint64_t cid, carg.vca_command = VARPD_CLIENT_CACHE_DELETE; carg.vca_errno = 0; vctcap->vtca_id = cid; + vctcap->vtca_dcid = dcid; bcopy(key, vctcap->vtca_key, ETHERADDRL); ret = libvarpd_c_door_call(client, &carg, 0); @@ -532,7 +533,8 @@ libvarpd_c_instance_cache_get(varpd_client_handle_t *chp, uint64_t cid, int libvarpd_c_instance_cache_set(varpd_client_handle_t *chp, uint64_t cid, - const struct ether_addr *key, const varpd_client_cache_entry_t *entry) + uint32_t dcid, const struct ether_addr *key, + const varpd_client_cache_entry_t *entry) { int ret; varpd_client_arg_t carg; @@ -545,6 +547,7 @@ libvarpd_c_instance_cache_set(varpd_client_handle_t *chp, uint64_t cid, carg.vca_command = VARPD_CLIENT_CACHE_SET; carg.vca_errno = 0; vctcap->vtca_id = cid; + vctcap->vtca_dcid = dcid; bcopy(key, vctcap->vtca_key, ETHERADDRL); bcopy(entry, &vctcap->vtca_entry, sizeof (varpd_client_cache_entry_t)); @@ -604,14 +607,17 @@ libvarpd_c_instance_cache_walk(varpd_client_handle_t *chp, uint64_t cid, for (i = 0; i < vctwap->vtcw_count; i++) { varpd_client_cache_entry_t ent; + overlay_targ_cache_entry_t *otce; - ent.vcp_flags = vctwap->vtcw_ents[i].otce_flags; - bcopy(vctwap->vtcw_ents[i].otce_dest.otp_mac, - &ent.vcp_mac, ETHERADDRL); - ent.vcp_ip = vctwap->vtcw_ents[i].otce_dest.otp_ip; - ent.vcp_port = vctwap->vtcw_ents[i].otce_dest.otp_port; + otce = &vctwap->vtcw_ents[i]; + + ent.vcp_flags = otce->otce_flags; + bcopy(otce->otce_dest.otp_mac, &ent.vcp_mac, + ETHERADDRL); + ent.vcp_ip = otce->otce_dest.otp_ip; + ent.vcp_port = otce->otce_dest.otp_port; ret = func(chp, cid, - (struct ether_addr *)vctwap->vtcw_ents[i].otce_mac, + (struct ether_addr *)otce->otce_mac.otm_mac, &ent, arg); if (ret != 0) { ret = 0; diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h index 459711b385..335385b262 100644 --- 
a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _LIBVARPD_CLIENT_H @@ -73,11 +73,11 @@ extern int libvarpd_c_instance_target_mode(varpd_client_handle_t *, uint64_t, uint_t *, uint_t *); extern int libvarpd_c_instance_cache_flush(varpd_client_handle_t *, uint64_t); extern int libvarpd_c_instance_cache_delete(varpd_client_handle_t *, uint64_t, - const struct ether_addr *); + uint32_t, const struct ether_addr *); extern int libvarpd_c_instance_cache_get(varpd_client_handle_t *, uint64_t, const struct ether_addr *, varpd_client_cache_entry_t *); extern int libvarpd_c_instance_cache_set(varpd_client_handle_t *, uint64_t, - const struct ether_addr *, const varpd_client_cache_entry_t *); + uint32_t, const struct ether_addr *, const varpd_client_cache_entry_t *); typedef int (*varpd_client_cache_f)(varpd_client_handle_t *, uint64_t, const struct ether_addr *, const varpd_client_cache_entry_t *, void *); diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c index f684e031a8..d58445d1b7 100644 --- a/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c @@ -288,7 +288,7 @@ libvarpd_door_f_delete(varpd_impl_t *vip, varpd_client_arg_t *vcap, if (ihp == NULL) return (ENOENT); return (libvarpd_overlay_cache_delete((varpd_instance_t *)ihp, - vtcap->vtca_key)); + vtcap->vtca_dcid, vtcap->vtca_key)); } /* ARGSUSED */ @@ -321,7 +321,7 @@ libvarpd_door_f_set(varpd_impl_t *vip, varpd_client_arg_t *vcap, return (ENOENT); return (libvarpd_overlay_cache_set((varpd_instance_t *)ihp, - vtcap->vtca_key, &vtcap->vtca_entry)); + vtcap->vtca_dcid, vtcap->vtca_key, &vtcap->vtca_entry)); } /* ARGSUSED */ @@ -337,7 +337,7 @@ libvarpd_door_f_walk(varpd_impl_t *vip, varpd_client_arg_t *vcap, return (ENOENT); return 
(libvarpd_overlay_cache_walk_fill((varpd_instance_t *)ihp, - &vctwp->vtcw_marker, &vctwp->vtcw_count, vctwp->vtcw_ents)); + vctwp->vtcw_marker, &vctwp->vtcw_count, vctwp->vtcw_ents)); } static libvarpd_door_f *libvarpd_door_table[] = { diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h index 60f0dc5fff..e18fc3c4d2 100644 --- a/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _LIBVARPD_IMPL_H @@ -77,6 +77,7 @@ typedef struct varpd_instance { varpd_impl_t *vri_impl; /* RO */ varpd_plugin_t *vri_plugin; /* RO */ void *vri_private; /* RO */ + uint32_t vri_dcid; /* RO */ mutex_t vri_lock; varpd_instance_flags_t vri_flags; /* vri_lock */ } varpd_instance_t; @@ -113,7 +114,7 @@ typedef struct varpd_client_propinfo_arg { uint8_t vcfa_pad[4]; char vcfa_name[LIBVARPD_PROP_NAMELEN]; uint8_t vcfa_default[LIBVARPD_PROP_SIZEMAX]; - uint8_t vcfa_poss[LIBVARPD_PROP_SIZEMAX]; + uint8_t vcfa_poss[LIBVARPD_PROP_SIZEMAX] __aligned(8); } varpd_client_propinfo_arg_t; typedef struct varpd_client_prop_arg { @@ -137,6 +138,7 @@ typedef struct varpd_client_target_mode_arg { typedef struct varpd_client_target_cache_arg { uint64_t vtca_id; + uint32_t vtca_dcid; uint8_t vtca_key[ETHERADDRL]; uint8_t vtca_pad[2]; varpd_client_cache_entry_t vtca_entry; @@ -144,7 +146,7 @@ typedef struct varpd_client_target_cache_arg { typedef struct varpd_client_target_walk_arg { uint64_t vtcw_id; - uint64_t vtcw_marker; + uint64_t vtcw_marker[2]; uint64_t vtcw_count; overlay_targ_cache_entry_t vtcw_ents[]; } varpd_client_target_walk_arg_t; @@ -210,7 +212,7 @@ extern int libvarpd_dirwalk(varpd_impl_t *, const char *, const char *, extern int libvarpd_overlay_init(varpd_impl_t *); extern void libvarpd_overlay_fini(varpd_impl_t *); extern int libvarpd_overlay_info(varpd_impl_t *, datalink_id_t, - 
overlay_plugin_dest_t *, uint64_t *, uint64_t *); + overlay_plugin_dest_t *, uint64_t *, uint64_t *, uint32_t *); extern int libvarpd_overlay_associate(varpd_instance_t *); extern int libvarpd_overlay_disassociate(varpd_instance_t *); extern int libvarpd_overlay_degrade(varpd_instance_t *, const char *); @@ -228,12 +230,12 @@ typedef int (*libvarpd_overlay_iter_f)(varpd_impl_t *, datalink_id_t, void *); extern int libvarpd_overlay_iter(varpd_impl_t *, libvarpd_overlay_iter_f, void *); extern int libvarpd_overlay_cache_flush(varpd_instance_t *); -extern int libvarpd_overlay_cache_delete(varpd_instance_t *, const uint8_t *); -extern int libvarpd_overlay_cache_delete(varpd_instance_t *, const uint8_t *); +extern int libvarpd_overlay_cache_delete(varpd_instance_t *, uint32_t, + const uint8_t *); extern int libvarpd_overlay_cache_get(varpd_instance_t *, const uint8_t *, varpd_client_cache_entry_t *); -extern int libvarpd_overlay_cache_set(varpd_instance_t *, const uint8_t *, - const varpd_client_cache_entry_t *); +extern int libvarpd_overlay_cache_set(varpd_instance_t *, uint32_t, + const uint8_t *, const varpd_client_cache_entry_t *); extern int libvarpd_overlay_cache_walk_fill(varpd_instance_t *, uint64_t *, uint64_t *, overlay_targ_cache_entry_t *); diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c index 124e3c5791..8ee12a455e 100644 --- a/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -53,7 +53,8 @@ libvarpd_overlay_fini(varpd_impl_t *vip) int libvarpd_overlay_info(varpd_impl_t *vip, datalink_id_t linkid, - overlay_plugin_dest_t *destp, uint64_t *flags, uint64_t *vnetid) + overlay_plugin_dest_t *destp, uint64_t *flags, uint64_t *vnetid, + uint32_t *dcid) { overlay_targ_info_t oti; @@ -67,6 +68,8 @@ libvarpd_overlay_info(varpd_impl_t *vip, datalink_id_t linkid, *flags = oti.oti_flags; if (vnetid != NULL) *vnetid = oti.oti_vnetid; + if (dcid != NULL) + *dcid = oti.oti_dcid; return (0); } @@ -252,6 +255,7 @@ libvarpd_overlay_lookup_handle(varpd_impl_t *vip) vqp = umem_cache_alloc(vip->vdi_qcache, UMEM_DEFAULT); otl = &vqp->vq_lookup; otr = &vqp->vq_response; + /* * abort doesn't really help here that much, maybe we can instead try * and for a reap or something? @@ -280,7 +284,8 @@ libvarpd_overlay_lookup_handle(varpd_impl_t *vip) vqp->vq_instance = inst; inst->vri_plugin->vpp_ops->vpo_lookup(inst->vri_private, - (varpd_query_handle_t *)vqp, otl, &otr->otr_answer); + (varpd_query_handle_t *)vqp, otl, &otr->otr_answer, + &otr->otr_route, &otr->otr_mac); } void @@ -387,7 +392,8 @@ libvarpd_overlay_cache_flush(varpd_instance_t *inst) } int -libvarpd_overlay_cache_delete(varpd_instance_t *inst, const uint8_t *key) +libvarpd_overlay_cache_delete(varpd_instance_t *inst, uint32_t dcid, + const uint8_t *key) { int ret; overlay_targ_cache_t cache; @@ -395,7 +401,8 @@ libvarpd_overlay_cache_delete(varpd_instance_t *inst, const uint8_t *key) bzero(&cache, sizeof (overlay_targ_cache_t)); cache.otc_linkid = inst->vri_linkid; - bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL); + cache.otc_entry.otce_mac.otm_dcid = dcid; + bcopy(key, cache.otc_entry.otce_mac.otm_mac, ETHERADDRL); ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_REMOVE, &cache); if (ret != 0 && errno == EFAULT) @@ -412,12 +419,11 @@ libvarpd_overlay_cache_get(varpd_instance_t *inst, const uint8_t *key, varpd_client_cache_entry_t *entry) { int ret; - overlay_targ_cache_t cache; + 
overlay_targ_cache_t cache = { 0 }; varpd_impl_t *vip = inst->vri_impl; - bzero(&cache, sizeof (overlay_targ_cache_t)); cache.otc_linkid = inst->vri_linkid; - bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL); + bcopy(key, cache.otc_entry.otce_mac.otm_mac, ETHERADDRL); ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_GET, &cache); if (ret != 0 && errno == EFAULT) @@ -434,16 +440,16 @@ libvarpd_overlay_cache_get(varpd_instance_t *inst, const uint8_t *key, } int -libvarpd_overlay_cache_set(varpd_instance_t *inst, const uint8_t *key, - const varpd_client_cache_entry_t *entry) +libvarpd_overlay_cache_set(varpd_instance_t *inst, uint32_t dcid, + const uint8_t *key, const varpd_client_cache_entry_t *entry) { int ret; - overlay_targ_cache_t cache; + overlay_targ_cache_t cache = { 0 }; varpd_impl_t *vip = inst->vri_impl; - bzero(&cache, sizeof (overlay_targ_cache_t)); cache.otc_linkid = inst->vri_linkid; - bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL); + cache.otc_entry.otce_mac.otm_dcid = dcid; + bcopy(key, cache.otc_entry.otce_mac.otm_mac, ETHERADDRL); bcopy(&entry->vcp_mac, cache.otc_entry.otce_dest.otp_mac, ETHERADDRL); cache.otc_entry.otce_flags = entry->vcp_flags; cache.otc_entry.otce_dest.otp_ip = entry->vcp_ip; @@ -477,7 +483,8 @@ libvarpd_overlay_cache_walk_fill(varpd_instance_t *inst, uint64_t *markerp, return (ENOMEM); iter->otci_linkid = inst->vri_linkid; - iter->otci_marker = *markerp; + iter->otci_marker[0] = markerp[0]; + iter->otci_marker[1] = markerp[1]; iter->otci_count = *countp; ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_ITER, iter); if (ret != 0 && errno == EFAULT) @@ -487,7 +494,8 @@ libvarpd_overlay_cache_walk_fill(varpd_instance_t *inst, uint64_t *markerp, goto out; } - *markerp = iter->otci_marker; + markerp[0] = iter->otci_marker[0]; + markerp[1] = iter->otci_marker[1]; *countp = iter->otci_count; bcopy(iter->otci_ents, ents, *countp * sizeof (overlay_targ_cache_entry_t)); @@ -523,18 +531,20 @@ 
libvarpd_inject_varp(varpd_provider_handle_t *vph, const uint8_t *mac, const overlay_target_point_t *otp) { int ret; - overlay_targ_cache_t otc; + overlay_targ_cache_t otc = { 0 }; varpd_instance_t *inst = (varpd_instance_t *)vph; varpd_impl_t *vip = inst->vri_impl; if (otp == NULL) { - (void) libvarpd_overlay_cache_delete(inst, mac); + (void) libvarpd_overlay_cache_delete(inst, 0, mac); return; } otc.otc_linkid = inst->vri_linkid; otc.otc_entry.otce_flags = 0; - bcopy(mac, otc.otc_entry.otce_mac, ETHERADDRL); + if (IN6_IS_ADDR_UNSPECIFIED(&otp->otp_ip) && otp->otp_port == 0) + otc.otc_entry.otce_flags |= OVERLAY_TARGET_CACHE_ROUTER; + bcopy(mac, otc.otc_entry.otce_mac.otm_mac, ETHERADDRL); bcopy(otp, &otc.otc_entry.otce_dest, sizeof (overlay_target_point_t)); ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_SET, &otc); @@ -552,6 +562,34 @@ libvarpd_inject_varp(varpd_provider_handle_t *vph, const uint8_t *mac, } void +libvarpd_route_flush(varpd_provider_handle_t *vph, uint8_t *srcip, + uint8_t *dstip, uint8_t src_prefixlen, uint8_t dst_prefixlen, + uint16_t vlan_id) +{ + varpd_instance_t *inst = (varpd_instance_t *)vph; + varpd_impl_t *vip = inst->vri_impl; + overlay_targ_cache_net_t otcn; + overlay_targ_cache_net_entry_t *otcne; + int ret; + + otcn.otcn_linkid = inst->vri_linkid; + otcne = &otcn.otcn_entry; + bcopy(srcip, &otcne->otcne_src, sizeof (in6_addr_t)); + bcopy(dstip, &otcne->otcne_dst, sizeof (in6_addr_t)); + otcne->otcne_vlan = vlan_id; + otcne->otcne_src_prefixlen = src_prefixlen; + otcne->otcne_dst_prefixlen = dst_prefixlen; + + ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_REMOVE_NET, &otcn); + if (ret != 0) { + /* XXX KEBE ASKS, any harmless error cases? 
*/ + libvarpd_panic("received bad errno from " + "OVERLAY_TARG_CACHE_REMOVE_NET: %d - %s", errno, + strerror(errno)); + } +} + +void libvarpd_fma_degrade(varpd_provider_handle_t *vph, const char *msg) { int ret; diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c index 27cc802a9c..f8b1fcedfc 100644 --- a/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. All rights reserved. */ /* @@ -281,7 +281,7 @@ libvarpd_persist_restore_instance(varpd_impl_t *vip, nvlist_t *nvl) int err; nvlist_t *pvl; uint64_t id, flags, vid; - uint32_t linkid, dest, mode; + uint32_t linkid, dest, mode, dcid; char *pluginstr; varpd_plugin_t *plugin; overlay_plugin_dest_t adest; @@ -312,7 +312,8 @@ libvarpd_persist_restore_instance(varpd_impl_t *vip, nvlist_t *nvl) if (plugin->vpp_mode != mode) return (EINVAL); - if (libvarpd_overlay_info(vip, linkid, &adest, &flags, &vid) != 0) + if (libvarpd_overlay_info(vip, linkid, &adest, &flags, &vid, + &dcid) != 0) return (EINVAL); if (dest != adest) @@ -334,6 +335,7 @@ libvarpd_persist_restore_instance(varpd_impl_t *vip, nvlist_t *nvl) inst->vri_dest = dest; inst->vri_plugin = plugin; inst->vri_impl = vip; + inst->vri_dcid = dcid; inst->vri_flags = 0; if (plugin->vpp_ops->vpo_restore(pvl, (varpd_provider_handle_t *)inst, dest, &inst->vri_private) != 0) { diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h index 64fa99d308..ab198919d7 100644 --- a/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _LIBVARPD_PROVIDER_H @@ -315,11 +315,12 @@ typedef void (*varpd_plugin_destroy_f)(void *); #define VARPD_LOOKUP_DROP (-1) typedef int (*varpd_plugin_default_f)(void *, overlay_target_point_t *); typedef void (*varpd_plugin_lookup_f)(void *, varpd_query_handle_t *, - const overlay_targ_lookup_t *, overlay_target_point_t *); + const overlay_targ_lookup_t *, overlay_target_point_t *, + overlay_target_route_t *, overlay_target_mac_t *); #define VARPD_QTYPE_ETHERNET 0x0 typedef void (*varpd_plugin_arp_f)(void *, varpd_arp_handle_t *, int, - const struct sockaddr *, uint8_t *); + const struct sockaddr *, uint16_t, uint8_t *); typedef void (*varpd_plugin_dhcp_f)(void *, varpd_dhcp_handle_t *, int, const overlay_targ_lookup_t *, uint8_t *); @@ -373,6 +374,7 @@ extern const bunyan_logger_t *libvarpd_plugin_bunyan(varpd_provider_handle_t *); * Misc. Information APIs */ extern uint64_t libvarpd_plugin_vnetid(varpd_provider_handle_t *); +extern uint32_t libvarpd_plugin_dcid(varpd_provider_handle_t *); /* * Lookup Replying query and proxying @@ -411,6 +413,8 @@ extern void libvarpd_inject_arp(varpd_provider_handle_t *, const uint16_t, const uint8_t *, const struct in_addr *, const uint8_t *); extern void libvarpd_fma_degrade(varpd_provider_handle_t *, const char *); extern void libvarpd_fma_restore(varpd_provider_handle_t *); +extern void libvarpd_route_flush(varpd_provider_handle_t *, uint8_t *, + uint8_t *, uint8_t, uint8_t, uint16_t vlan_id); #ifdef __cplusplus } diff --git a/usr/src/lib/varpd/libvarpd/common/mapfile-plugin b/usr/src/lib/varpd/libvarpd/common/mapfile-plugin index 8cef7f669f..f51dfd9129 100644 --- a/usr/src/lib/varpd/libvarpd/common/mapfile-plugin +++ b/usr/src/lib/varpd/libvarpd/common/mapfile-plugin @@ -10,7 +10,7 @@ # # -# Copyright 2015 Joyent, Inc. +# Copyright 2018 Joyent, Inc. 
# # @@ -39,6 +39,7 @@ SYMBOL_SCOPE { libvarpd_panic { FLAGS = EXTERN }; libvarpd_plugin_alloc { FLAGS = EXTERN }; libvarpd_plugin_arp_reply { FLAGS = EXTERN }; + libvarpd_plugin_dcid { FLAGS = EXTERN }; libvarpd_plugin_dhcp_reply { FLAGS = EXTERN }; libvarpd_plugin_free { FLAGS = EXTERN }; libvarpd_plugin_proxy_arp { FLAGS = EXTERN }; @@ -54,4 +55,5 @@ SYMBOL_SCOPE { libvarpd_prop_set_nodefault { FLAGS = EXTERN }; libvarpd_prop_set_range_uint32 { FLAGS = EXTERN }; libvarpd_prop_set_rangestr { FLAGS = EXTERN }; + libvarpd_route_flush { FLAGS = EXTERN }; }; diff --git a/usr/src/lib/varpd/libvarpd/common/mapfile-vers b/usr/src/lib/varpd/libvarpd/common/mapfile-vers index 7aa930cb54..3eb74972e5 100644 --- a/usr/src/lib/varpd/libvarpd/common/mapfile-vers +++ b/usr/src/lib/varpd/libvarpd/common/mapfile-vers @@ -10,7 +10,7 @@ # # -# Copyright 2015 Joyent, Inc. +# Copyright 2018 Joyent, Inc. # # @@ -65,6 +65,8 @@ SYMBOL_VERSION SUNWprivate { libvarpd_inject_varp; libvarpd_inject_arp; + libvarpd_route_flush; + libvarpd_instance_activate; libvarpd_instance_create; libvarpd_instance_destroy; @@ -82,6 +84,7 @@ SYMBOL_VERSION SUNWprivate { libvarpd_plugin_free; libvarpd_plugin_arp_reply; libvarpd_plugin_dhcp_reply; + libvarpd_plugin_dcid; libvarpd_plugin_query_reply; libvarpd_plugin_proxy_arp; libvarpd_plugin_proxy_dhcp; diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp.c b/usr/src/lib/varpd/svp/common/libvarpd_svp.c index 58828065a1..1e9ea979d7 100644 --- a/usr/src/lib/varpd/svp/common/libvarpd_svp.c +++ b/usr/src/lib/varpd/svp/common/libvarpd_svp.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2018, Joyent, Inc. */ /* @@ -217,20 +217,24 @@ * | * v Socket Error, * +----------------+ still in DNS - * +----------------<---| SVP_CS_INITIAL |<----------------------*-----+ - * | +----------------+ | - * | System | | - * | Connection . . . . . success * Successful | - * | failed . 
| connect() | - * | +----*---------+ | +-----------*--+ | - * | | | | | | | - * | V ^ v ^ V ^ - * | +----------------+ +-------------------+ +---------------+ - * +<-| SVP_CS_BACKOFF | | SVP_CS_CONNECTING | | SVP_CS_ACTIVE | - * | +----------------+ +-------------------+ +---------------+ - * | V ^ V V V - * | Backoff wait * | | | * Removed - * v interval +--------------+ +-----------------<-----+ | from DNS + * +----------------<---| SVP_CS_INITIAL |<----------------------*--------+ + * | +----------------+ | + * | System | | + * | Connection . . . . . success * Successful | + * | failed . | connect() | + * | . | +-------------------+ | + * | +----*---------+ | +-*>| SVP_CS_VERSIONING + | + * | | | | | +-------------------+ | + * | | | | | V V Set version | + * | | | | | | * based on | + * | | | | | | | SVP_R_PONG | + * | V ^ v ^ | V ^ + * | +----------------+ +-------------------+ | +---------------+ + * +<-| SVP_CS_BACKOFF | | SVP_CS_CONNECTING | | | SVP_CS_ACTIVE | + * | +----------------+ +-------------------+ | +---------------+ + * | V ^ V | V V + * | Backoff wait * | | | | * Removed + * v interval +--------------+ +-----------------<+----+ | from DNS * | finished | | * | V | * | | V @@ -311,7 +315,7 @@ * * The shoot down information needs to be done on a per-backend basis. The * general design is that we'll have a single query for this which can fire on a - * 5-10s period, we randmoize the latter part to give us a bit more load + * 5-10s period, we randomize the latter part to give us a bit more load * spreading. If we complete because there's no work to do, then we wait the * normal period. If we complete, but there's still work to do, we'll go again * after a second. 
@@ -360,7 +364,8 @@ static umem_cache_t *svp_lookup_cache; typedef enum svp_lookup_type { SVP_L_UNKNOWN = 0x0, SVP_L_VL2 = 0x1, - SVP_L_VL3 = 0x2 + SVP_L_VL3 = 0x2, + SVP_L_ROUTE = 0x3 } svp_lookup_type_t; typedef struct svp_lookup { @@ -374,6 +379,12 @@ typedef struct svp_lookup { varpd_arp_handle_t *svl_vah; uint8_t *svl_out; } svl_vl3; + struct svl_lookup_route { + varpd_query_handle_t *svl_handle; + overlay_target_point_t *svl_point; + overlay_target_route_t *svl_route; + overlay_target_mac_t *svl_mac; + } svl_route; } svl_u; svp_query_t svl_query; } svp_lookup_t; @@ -382,7 +393,9 @@ static const char *varpd_svp_props[] = { "svp/host", "svp/port", "svp/underlay_ip", - "svp/underlay_port" + "svp/underlay_port", + "svp/dcid", + "svp/router_oui" }; static const uint8_t svp_bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; @@ -429,7 +442,8 @@ static void svp_vl3_lookup_cb(svp_t *svp, svp_status_t status, const uint8_t *vl2mac, const struct in6_addr *uip, const uint16_t uport, void *arg) { - overlay_target_point_t point; + /* Initialize address-holders to 0 for comparisons-to-zeroes later. */ + overlay_target_point_t point = { 0 }; svp_lookup_t *svl = arg; assert(svp != NULL); @@ -486,17 +500,67 @@ svp_shootdown_cb(svp_t *svp, const uint8_t *vl2mac, const struct in6_addr *uip, const uint16_t uport) { /* - * We should probably do a conditional invlaidation here. + * We should probably do a conditional invalidation here. 
 */ libvarpd_inject_varp(svp->svp_hdl, vl2mac, NULL); } +static void +svp_route_lookup_cb(svp_t *svp, svp_status_t status, uint32_t dcid, + uint32_t vnetid, uint16_t vlan, uint8_t *srcmac, uint8_t *dstmac, + uint16_t ul3_port, uint8_t *ul3_addr, uint8_t srcpfx, uint8_t dstpfx, + void *arg) +{ + svp_lookup_t *svl = arg; + overlay_target_point_t *otp; + overlay_target_route_t *otr; + overlay_target_mac_t *otm; + + if (status != SVP_S_OK) { + libvarpd_plugin_query_reply(svl->svl_u.svl_route.svl_handle, + VARPD_LOOKUP_DROP); + umem_cache_free(svp_lookup_cache, svl); + return; + } + + otp = svl->svl_u.svl_route.svl_point; + bcopy(ul3_addr, &otp->otp_ip, sizeof (struct in6_addr)); + otp->otp_port = ul3_port; + + otr = svl->svl_u.svl_route.svl_route; + otr->otr_vnet = vnetid; + otr->otr_vlan = vlan; + bcopy(srcmac, otr->otr_srcmac, ETHERADDRL); + + otm = svl->svl_u.svl_route.svl_mac; + otm->otm_dcid = dcid; + bcopy(dstmac, otm->otm_mac, ETHERADDRL); + + libvarpd_plugin_query_reply(svl->svl_u.svl_route.svl_handle, + VARPD_LOOKUP_OK); + umem_cache_free(svp_lookup_cache, svl); +} + +/* + * Tell the overlay instance to flush out entries matching this route. + * See libvarpd_route_flush() for more. 
 + */ +static void +svp_route_shootdown_cb(svp_t *svp, uint8_t *srcip, uint8_t *dstip, + uint8_t src_prefixlen, uint8_t dst_prefixlen, uint16_t vlan_id) +{ + libvarpd_route_flush(svp->svp_hdl, srcip, dstip, src_prefixlen, + dst_prefixlen, vlan_id); +} + static svp_cb_t svp_defops = { svp_vl2_lookup_cb, svp_vl3_lookup_cb, svp_vl2_invalidate_cb, svp_vl3_inject_cb, - svp_shootdown_cb + svp_shootdown_cb, + svp_route_lookup_cb, + svp_route_shootdown_cb }; static boolean_t @@ -587,23 +651,89 @@ varpd_svp_destroy(void *arg) } static void +varpd_svp_lookup_l3(svp_t *svp, varpd_query_handle_t *vqh, + const overlay_targ_lookup_t *otl, overlay_target_point_t *otp, + overlay_target_route_t *otr, overlay_target_mac_t *otm) +{ + svp_lookup_t *slp; + /* uint32_t type; */ + const struct in6_addr *src = &otl->otl_addru.otlu_l3.otl3_srcip, + *dst = &otl->otl_addru.otlu_l3.otl3_dstip; + + /* + * otl is an L3 request, so we have src/dst IPs for the inner packet. + * We also have the vlan. + * + * Assume kernel's overlay module is caching well, so we are directly + * going to query (i.e. no caching up here of actual destinations). + * + * Our existing remote server (svp_remote), but with the new message + * SVP_R_ROUTE_REQ. 
+ */ + + if (IN6_IS_ADDR_V4MAPPED(src)) { + if (!IN6_IS_ADDR_V4MAPPED(dst)) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + return; + } + /* type = SVP_VL3_IP; */ + } else { + if (IN6_IS_ADDR_V4MAPPED(dst)) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + return; + } + /* type = SVP_VL3_IPV6; */ + } + + slp = umem_cache_alloc(svp_lookup_cache, UMEM_DEFAULT); + if (slp == NULL) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + return; + } + + slp->svl_type = SVP_L_ROUTE; + slp->svl_u.svl_route.svl_handle = vqh; + slp->svl_u.svl_route.svl_point = otp; + slp->svl_u.svl_route.svl_route = otr; + slp->svl_u.svl_route.svl_mac = otm; + + svp_remote_route_lookup(svp, &slp->svl_query, src, dst, + otl->otl_vnetid, (uint16_t)otl->otl_vlan, slp); +} + +static void varpd_svp_lookup(void *arg, varpd_query_handle_t *vqh, - const overlay_targ_lookup_t *otl, overlay_target_point_t *otp) + const overlay_targ_lookup_t *otl, overlay_target_point_t *otp, + overlay_target_route_t *otr, overlay_target_mac_t *otm) { svp_lookup_t *slp; svp_t *svp = arg; /* + * Shuffle off L3 lookups to their own codepath. + */ + if (otl->otl_l3req) { + varpd_svp_lookup_l3(svp, vqh, otl, otp, otr, otm); + return; + } + /* + * At this point, the traditional overlay_target_point_t is all that + * needs filling in. Zero-out the otr for safety. + */ + bzero(otr, sizeof (*otr)); + + + /* * Check if this is something that we need to proxy, eg. arp or ndp. 
*/ - if (otl->otl_sap == ETHERTYPE_ARP) { + if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_ARP) { libvarpd_plugin_proxy_arp(svp->svp_hdl, vqh, otl); return; } - if (otl->otl_dstaddr[0] == 0x33 && - otl->otl_dstaddr[1] == 0x33) { - if (otl->otl_sap == ETHERTYPE_IPV6) { + if (otl->otl_addru.otlu_l2.otl2_dstaddr[0] == 0x33 && + otl->otl_addru.otlu_l2.otl2_dstaddr[1] == 0x33) { + if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_IPV6) { libvarpd_plugin_proxy_ndp(svp->svp_hdl, vqh, otl); } else { libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); @@ -617,8 +747,9 @@ varpd_svp_lookup(void *arg, varpd_query_handle_t *vqh, * handle broadcast and if the multicast bit is set, lowest bit of the * first octet of the MAC, then we drop it now. */ - if (bcmp(otl->otl_dstaddr, svp_bcast, ETHERADDRL) == 0 || - (otl->otl_dstaddr[0] & 0x01) == 0x01) { + if (bcmp(otl->otl_addru.otlu_l2.otl2_dstaddr, svp_bcast, + ETHERADDRL) == 0 || + (otl->otl_addru.otlu_l2.otl2_dstaddr[0] & 0x01) == 0x01) { libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); return; } @@ -639,7 +770,8 @@ varpd_svp_lookup(void *arg, varpd_query_handle_t *vqh, slp->svl_u.svl_vl2.svl_handle = vqh; slp->svl_u.svl_vl2.svl_point = otp; - svp_remote_vl2_lookup(svp, &slp->svl_query, otl->otl_dstaddr, slp); + svp_remote_vl2_lookup(svp, &slp->svl_query, + otl->otl_addru.otlu_l2.otl2_dstaddr, slp); } /* ARGSUSED */ @@ -687,6 +819,21 @@ varpd_svp_propinfo(void *arg, uint_t propid, varpd_prop_handle_t *vph) sizeof (svp_defuport)); libvarpd_prop_set_range_uint32(vph, 1, UINT16_MAX); break; + case 4: + /* svp/dcid */ + libvarpd_prop_set_name(vph, varpd_svp_props[4]); + libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW); + libvarpd_prop_set_type(vph, OVERLAY_PROP_T_UINT); + libvarpd_prop_set_nodefault(vph); + libvarpd_prop_set_range_uint32(vph, 1, UINT32_MAX - 1); + break; + case 5: + /* svp/router_oui */ + libvarpd_prop_set_name(vph, varpd_svp_props[5]); + libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW); + 
libvarpd_prop_set_type(vph, OVERLAY_PROP_T_ETHER); + libvarpd_prop_set_nodefault(vph); + break; default: return (EINVAL); } @@ -733,14 +880,13 @@ varpd_svp_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep) bcopy(&val, buf, sizeof (uint64_t)); *sizep = sizeof (uint64_t); } - mutex_exit(&svp->svp_lock); return (0); } /* svp/underlay_ip */ if (strcmp(pname, varpd_svp_props[2]) == 0) { - if (*sizep > sizeof (struct in6_addr)) + if (*sizep < sizeof (struct in6_addr)) return (EOVERFLOW); mutex_enter(&svp->svp_lock); if (svp->svp_huip == B_FALSE) { @@ -749,6 +895,7 @@ varpd_svp_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep) bcopy(&svp->svp_uip, buf, sizeof (struct in6_addr)); *sizep = sizeof (struct in6_addr); } + mutex_exit(&svp->svp_lock); return (0); } @@ -772,6 +919,42 @@ varpd_svp_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep) return (0); } + /* svp/dcid */ + if (strcmp(pname, varpd_svp_props[4]) == 0) { + uint64_t val; + + if (*sizep < sizeof (uint64_t)) + return (EOVERFLOW); + + mutex_enter(&svp->svp_lock); + if (svp->svp_uport == 0) { + *sizep = 0; + } else { + val = svp->svp_dcid; + bcopy(&val, buf, sizeof (uint64_t)); + *sizep = sizeof (uint64_t); + } + + mutex_exit(&svp->svp_lock); + return (0); + } + + /* svp/router_oui */ + if (strcmp(pname, varpd_svp_props[5]) == 0) { + if (*sizep < ETHERADDRL) + return (EOVERFLOW); + mutex_enter(&svp->svp_lock); + + if (ether_is_zero(&svp->svp_router_oui)) { + *sizep = 0; + } else { + bcopy(&svp->svp_router_oui, buf, ETHERADDRL); + *sizep = ETHERADDRL; + } + + mutex_exit(&svp->svp_lock); + return (0); + } return (EINVAL); } @@ -857,6 +1040,36 @@ varpd_svp_setprop(void *arg, const char *pname, const void *buf, return (0); } + /* svp/dcid */ + if (strcmp(pname, varpd_svp_props[4]) == 0) { + const uint64_t *valp = buf; + if (size < sizeof (uint64_t)) + return (EOVERFLOW); + + if (*valp == 0 || *valp > UINT32_MAX - 1) + return (EINVAL); + + mutex_enter(&svp->svp_lock); + 
svp->svp_dcid = (uint32_t)*valp; + mutex_exit(&svp->svp_lock); + + return (0); + } + + /* svp/router_oui */ + if (strcmp(pname, varpd_svp_props[5]) == 0) { + if (size < ETHERADDRL) + return (EOVERFLOW); + mutex_enter(&svp->svp_lock); + bcopy(buf, &svp->svp_router_oui, ETHERADDRL); + /* Zero-out the low three bytes. */ + svp->svp_router_oui[3] = 0; + svp->svp_router_oui[4] = 0; + svp->svp_router_oui[5] = 0; + mutex_exit(&svp->svp_lock); + return (0); + } + return (EINVAL); } @@ -867,6 +1080,7 @@ varpd_svp_save(void *arg, nvlist_t *nvp) svp_t *svp = arg; mutex_enter(&svp->svp_lock); + /* svp/host */ if (svp->svp_host != NULL) { if ((ret = nvlist_add_string(nvp, varpd_svp_props[0], svp->svp_host)) != 0) { @@ -875,6 +1089,7 @@ varpd_svp_save(void *arg, nvlist_t *nvp) } } + /* svp/port */ if (svp->svp_port != 0) { if ((ret = nvlist_add_uint16(nvp, varpd_svp_props[1], svp->svp_port)) != 0) { @@ -883,6 +1098,7 @@ varpd_svp_save(void *arg, nvlist_t *nvp) } } + /* svp/underlay_ip */ if (svp->svp_huip == B_TRUE) { char buf[INET6_ADDRSTRLEN]; @@ -898,6 +1114,7 @@ varpd_svp_save(void *arg, nvlist_t *nvp) } } + /* svp/underlay_port */ if (svp->svp_uport != 0) { if ((ret = nvlist_add_uint16(nvp, varpd_svp_props[3], svp->svp_uport)) != 0) { @@ -906,6 +1123,32 @@ varpd_svp_save(void *arg, nvlist_t *nvp) } } + /* svp/dcid */ + if (svp->svp_dcid != 0) { + if ((ret = nvlist_add_uint32(nvp, varpd_svp_props[4], + svp->svp_dcid)) != 0) { + mutex_exit(&svp->svp_lock); + return (ret); + } + } + + /* svp/router_oui */ + if (!ether_is_zero(&svp->svp_router_oui)) { + char buf[ETHERADDRSTRL]; + + if (ether_ntoa_r((struct ether_addr *)&svp->svp_router_oui, + buf) == NULL) { + libvarpd_panic("unexpected ether_ntoa_r failure: %d", + errno); + } + + if ((ret = nvlist_add_string(nvp, varpd_svp_props[5], + buf)) != 0) { + mutex_exit(&svp->svp_lock); + return (ret); + } + } + mutex_exit(&svp->svp_lock); return (0); } @@ -916,7 +1159,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, 
{ int ret; svp_t *svp; - char *ipstr, *hstr; + char *ipstr, *hstr, *etherstr; if (varpd_svp_valid_dest(dest) == B_FALSE) return (ENOTSUP); @@ -924,6 +1167,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, if ((ret = varpd_svp_create(hdl, (void **)&svp, dest)) != 0) return (ret); + /* svp/host */ if ((ret = nvlist_lookup_string(nvp, varpd_svp_props[0], &hstr)) != 0) { if (ret != ENOENT) { @@ -937,6 +1181,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, (void) strlcpy(svp->svp_host, hstr, blen); } + /* svp/port */ if ((ret = nvlist_lookup_uint16(nvp, varpd_svp_props[1], &svp->svp_port)) != 0) { if (ret != ENOENT) { @@ -946,6 +1191,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, svp->svp_port = 0; } + /* svp/underlay_ip */ if ((ret = nvlist_lookup_string(nvp, varpd_svp_props[2], &ipstr)) != 0) { if (ret != ENOENT) { @@ -968,6 +1214,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, svp->svp_huip = B_TRUE; } + /* svp/underlay_port */ if ((ret = nvlist_lookup_uint16(nvp, varpd_svp_props[3], &svp->svp_uport)) != 0) { if (ret != ENOENT) { @@ -977,6 +1224,29 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, svp->svp_uport = 0; } + /* svp/dcid */ + if ((ret = nvlist_lookup_uint32(nvp, varpd_svp_props[4], + &svp->svp_dcid)) != 0) { + if (ret != ENOENT) { + varpd_svp_destroy(svp); + return (ret); + } + svp->svp_dcid = 0; + } + + /* svp/router_oui */ + if ((ret = nvlist_lookup_string(nvp, varpd_svp_props[5], + &etherstr)) != 0) { + if (ret != ENOENT) { + varpd_svp_destroy(svp); + return (ret); + } + bzero(&svp->svp_router_oui, ETHERADDRL); + } else if (ether_aton_r(etherstr, + (struct ether_addr *)&svp->svp_router_oui) == NULL) { + libvarpd_panic("unexpected ether_aton_r failure: %d", errno); + } + svp->svp_hdl = hdl; *outp = svp; return (0); @@ -984,7 +1254,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, static void varpd_svp_arp(void *arg, varpd_arp_handle_t *vah, int 
type, - const struct sockaddr *sock, uint8_t *out) + const struct sockaddr *sock, uint16_t vlan __unused, uint8_t *out) { svp_t *svp = arg; svp_lookup_t *svl; diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp.h b/usr/src/lib/varpd/svp/common/libvarpd_svp.h index 8192b842ce..348996898e 100644 --- a/usr/src/lib/varpd/svp/common/libvarpd_svp.h +++ b/usr/src/lib/varpd/svp/common/libvarpd_svp.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _LIBVARPD_SVP_H @@ -74,6 +74,8 @@ typedef union svp_query_data { svp_vl3_ack_t sdq_vl3a; svp_log_req_t sdq_logr; svp_lrm_ack_t sdq_lrma; + svp_route_req_t sqd_rr; + svp_route_ack_t sqd_ra; } svp_query_data_t; typedef void (*svp_query_f)(svp_query_t *, void *); @@ -116,14 +118,16 @@ typedef enum svp_conn_state { SVP_CS_CONNECTING = 0x02, SVP_CS_BACKOFF = 0x03, SVP_CS_ACTIVE = 0x04, - SVP_CS_WINDDOWN = 0x05 + SVP_CS_WINDDOWN = 0x05, + SVP_CS_VERSIONING = 0x06 } svp_conn_state_t; typedef enum svp_conn_error { SVP_CE_NONE = 0x00, SVP_CE_ASSOCIATE = 0x01, SVP_CE_NOPOLLOUT = 0x02, - SVP_CE_SOCKET = 0x03 + SVP_CE_SOCKET = 0x03, + SVP_CE_VERSION_PONG = 0x04 } svp_conn_error_t; typedef enum svp_conn_flags { @@ -164,6 +168,7 @@ struct svp_conn { list_t sc_queries; svp_conn_out_t sc_output; svp_conn_in_t sc_input; + uint_t sc_version; }; typedef enum svp_remote_state { @@ -245,6 +250,11 @@ typedef void (*svp_vl3_inject_f)(svp_t *, const uint16_t, const struct in6_addr *, const uint8_t *, const uint8_t *); typedef void (*svp_shootdown_f)(svp_t *, const uint8_t *, const struct in6_addr *, const uint16_t uport); +typedef void (*svp_route_lookup_f)(svp_t *, svp_status_t, uint32_t, uint32_t, + uint16_t, uint8_t *, uint8_t *, uint16_t, uint8_t *, uint8_t, uint8_t, + void *); +typedef void (*svp_route_shootdown_f)(svp_t *, uint8_t *, uint8_t *, uint8_t, + uint8_t, uint16_t); typedef struct svp_cb { svp_vl2_lookup_f scb_vl2_lookup; @@ -252,6 +262,8 @@ typedef struct svp_cb { 
svp_vl2_invalidation_f scb_vl2_invalidate; svp_vl3_inject_f scb_vl3_inject; svp_shootdown_f scb_shootdown; + svp_route_lookup_f scb_route_lookup; + svp_route_shootdown_f scb_route_shootdown; } svp_cb_t; /* @@ -268,8 +280,11 @@ struct svp { char *svp_host; /* svp_lock */ uint16_t svp_port; /* svp_lock */ uint16_t svp_uport; /* svp_lock */ + uint32_t svp_dcid; /* svp_lock (but write-once?) */ boolean_t svp_huip; /* svp_lock */ struct in6_addr svp_uip; /* svp_lock */ + /* NOTE: lower-3 bytes are 0s. */ + uint8_t svp_router_oui[6]; /* svp_lock (but write-once?) */ }; extern bunyan_logger_t *svp_bunyan; @@ -283,6 +298,10 @@ extern void svp_remote_vl3_lookup(svp_t *, svp_query_t *, const struct sockaddr *, void *); extern void svp_remote_vl2_lookup(svp_t *, svp_query_t *, const uint8_t *, void *); +extern void svp_remote_route_lookup(svp_t *, svp_query_t *, + const struct in6_addr *, const struct in6_addr *, uint32_t, + uint16_t, void *); + /* * Init functions @@ -332,6 +351,7 @@ extern void svp_remote_resolved(svp_remote_t *, struct addrinfo *); extern void svp_host_queue(svp_remote_t *); extern void svp_query_release(svp_query_t *); extern void svp_query_crc32(svp_req_t *, void *, size_t); +extern id_t svp_id_alloc(void); /* * Shootdown related @@ -339,11 +359,13 @@ extern void svp_query_crc32(svp_req_t *, void *, size_t); extern void svp_remote_shootdown_vl3(svp_remote_t *, svp_log_vl3_t *, svp_sdlog_t *); extern void svp_remote_shootdown_vl2(svp_remote_t *, svp_log_vl2_t *); +extern void svp_remote_shootdown_route(svp_remote_t *, svp_log_route_t *); extern void svp_remote_log_request(svp_remote_t *, svp_query_t *, void *, size_t); extern void svp_remote_lrm_request(svp_remote_t *, svp_query_t *, void *, size_t); -extern void svp_shootdown_logr_cb(svp_remote_t *, svp_status_t, void *, size_t); +extern void svp_shootdown_logr_cb(svp_remote_t *, svp_status_t, void *, size_t, + uint16_t); extern void svp_shootdown_lrm_cb(svp_remote_t *, svp_status_t); extern void 
svp_shootdown_vl3_cb(svp_status_t, svp_log_vl3_t *, svp_sdlog_t *); extern int svp_shootdown_init(svp_remote_t *); diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c b/usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c index 4d10d1dba4..af0fe07e52 100644 --- a/usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c +++ b/usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -40,9 +40,12 @@ typedef enum svp_conn_act { SVP_RA_DEGRADE = 0x01, SVP_RA_RESTORE = 0x02, SVP_RA_ERROR = 0x03, - SVP_RA_CLEANUP = 0x04 + SVP_RA_CLEANUP = 0x04, + SVP_RA_FIND_VERSION = 0x05 } svp_conn_act_t; +static svp_conn_act_t svp_conn_poll_connect(port_event_t *, svp_conn_t *); + static void svp_conn_inject(svp_conn_t *scp) { @@ -90,6 +93,75 @@ svp_conn_restore(svp_conn_t *scp) srp->sr_ndconns--; } +static svp_conn_act_t +svp_conn_pong_handler(svp_conn_t *scp, svp_query_t *sqp) +{ + uint16_t remote_version = ntohs(scp->sc_input.sci_req.svp_ver); + + if (scp->sc_cstate == SVP_CS_VERSIONING) { + /* Transition VERSIONING -> ACTIVE. */ + assert(scp->sc_version == 0); + if (remote_version == 0 || remote_version > SVP_CURRENT_VERSION) + return (SVP_RA_ERROR); + scp->sc_version = remote_version; + scp->sc_cstate = SVP_CS_ACTIVE; + } + + return (SVP_RA_NONE); +} + +static void +svp_conn_ping_cb(svp_query_t *sqp, void *arg) +{ + size_t len = (size_t)arg; + + assert(len == sizeof (svp_query_t)); + umem_free(sqp, len); +} + +static svp_conn_act_t +svp_conn_ping_version(svp_conn_t *scp) +{ + svp_remote_t *srp = scp->sc_remote; + svp_query_t *sqp = umem_zalloc(sizeof (svp_query_t), UMEM_DEFAULT); + int ret; + + assert(MUTEX_HELD(&srp->sr_lock)); + assert(MUTEX_HELD(&scp->sc_lock)); + assert(scp->sc_cstate == SVP_CS_CONNECTING); + + if (sqp == NULL) + return (SVP_RA_ERROR); + + /* Only set things that need to be non-0/non-NULL. 
*/ + sqp->sq_state = SVP_QUERY_INIT; + sqp->sq_func = svp_conn_ping_cb; + sqp->sq_arg = (void *)sizeof (svp_query_t); + sqp->sq_header.svp_op = htons(SVP_R_PING); + sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION); + sqp->sq_header.svp_id = svp_id_alloc(); + if (sqp->sq_header.svp_id == -1) { + umem_free(sqp, sizeof (svp_query_t)); + return (SVP_RA_ERROR); + } + + scp->sc_cstate = SVP_CS_VERSIONING; + /* Set the event flags now... */ + scp->sc_event.se_events = POLLIN | POLLRDNORM | POLLHUP | POLLOUT; + /* ...so I can just queue it up directly... */ + svp_conn_queue(scp, sqp); + /* ... and then associate the event port myself. */ + ret = svp_event_associate(&scp->sc_event, scp->sc_socket); + if (ret == 0) + return (SVP_RA_RESTORE); + scp->sc_error = SVP_CE_ASSOCIATE; + scp->sc_errno = ret; + scp->sc_cstate = SVP_CS_ERROR; + list_remove(&scp->sc_queries, sqp); + umem_free(sqp, sizeof (svp_query_t)); + return (SVP_RA_DEGRADE); +} + static void svp_conn_add(svp_conn_t *scp) { @@ -180,6 +252,9 @@ svp_conn_connect(svp_conn_t *scp) if (scp->sc_cstate == SVP_CS_INITIAL) scp->sc_nbackoff = 0; + /* New connect means we need to know the version. */ + scp->sc_version = 0; + scp->sc_socket = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0); if (scp->sc_socket == -1) { scp->sc_error = SVP_CE_SOCKET; @@ -252,57 +327,53 @@ svp_conn_connect(svp_conn_t *scp) } } - /* - * We've connected. Successfully move ourselves to the bound - * state and start polling. - */ - scp->sc_cstate = SVP_CS_ACTIVE; - scp->sc_event.se_events = POLLIN | POLLRDNORM | POLLHUP; - ret = svp_event_associate(&scp->sc_event, scp->sc_socket); - if (ret == 0) - return (SVP_RA_RESTORE); - scp->sc_error = SVP_CE_ASSOCIATE; - scp->sc_cstate = SVP_CS_ERROR; - - return (SVP_RA_DEGRADE); + /* Immediately successful connection, move to SVP_CS_VERSIONING. */ + return (svp_conn_poll_connect(NULL, scp)); } /* - * This should be the first call we get after a connect. 
If we have successfully - * connected, we should see a writeable event. We may also see an error or a - * hang up. In either of these cases, we transition to error mode. If there is - * also a readable event, we ignore it at the moment and just let a - * reassociation pick it up so we can simplify the set of state transitions that - * we have. + * This should be the first call we get after a successful synchronous + * connect, or a completed (failed or successful) asynchronous connect. A + * non-NULL port-event indicates asynchronous completion, a NULL port-event + * indicates a successful synchronous connect. + * + * If we have successfully connected, we should see a writeable event. In the + * asynchronous case, we may also see an error or a hang up. For either hang + * up or error, we transition to error mode. If there is also a readable event + * (i.e. incoming data), we ignore it at the moment and just let a + * reassociation pick it up so we can simplify the set of state transitions + * that we have. */ static svp_conn_act_t svp_conn_poll_connect(port_event_t *pe, svp_conn_t *scp) { - int ret, err; - socklen_t sl = sizeof (err); - if (!(pe->portev_events & POLLOUT)) { - scp->sc_errno = 0; - scp->sc_error = SVP_CE_NOPOLLOUT; - scp->sc_cstate = SVP_CS_ERROR; - return (SVP_RA_DEGRADE); - } + int ret; - ret = getsockopt(scp->sc_socket, SOL_SOCKET, SO_ERROR, &err, &sl); - if (ret != 0) - libvarpd_panic("unanticipated getsockopt error"); - if (err != 0) { - return (svp_conn_backoff(scp)); + if (pe != NULL) { + int err; + socklen_t sl = sizeof (err); + + /* + * These bits only matter if we're notified of an + * asynchronous connection completion. 
+ */ + if (!(pe->portev_events & POLLOUT)) { + scp->sc_errno = 0; + scp->sc_error = SVP_CE_NOPOLLOUT; + scp->sc_cstate = SVP_CS_ERROR; + return (SVP_RA_DEGRADE); + } + + ret = getsockopt(scp->sc_socket, SOL_SOCKET, SO_ERROR, &err, + &sl); + if (ret != 0) + libvarpd_panic("unanticipated getsockopt error"); + if (err != 0) { + return (svp_conn_backoff(scp)); + } } - scp->sc_cstate = SVP_CS_ACTIVE; - scp->sc_event.se_events = POLLIN | POLLRDNORM | POLLHUP; - ret = svp_event_associate(&scp->sc_event, scp->sc_socket); - if (ret == 0) - return (SVP_RA_RESTORE); - scp->sc_error = SVP_CE_ASSOCIATE; - scp->sc_errno = ret; - scp->sc_cstate = SVP_CS_ERROR; - return (SVP_RA_DEGRADE); + return (SVP_RA_FIND_VERSION); } static svp_conn_act_t @@ -357,7 +428,7 @@ svp_conn_pollout(svp_conn_t *scp) do { ret = writev(scp->sc_socket, iov, nvecs); - } while (ret == -1 && errno == EAGAIN); + } while (ret == -1 && errno == EINTR); if (ret == -1) { switch (errno) { case EAGAIN: @@ -387,7 +458,7 @@ static boolean_t svp_conn_pollin_validate(svp_conn_t *scp) { svp_query_t *sqp; - uint32_t nsize; + uint32_t nsize, expected_size = 0; uint16_t nvers, nop; svp_req_t *resp = &scp->sc_input.sci_req; @@ -397,19 +468,40 @@ svp_conn_pollin_validate(svp_conn_t *scp) nop = ntohs(resp->svp_op); nsize = ntohl(resp->svp_size); - if (nvers != SVP_CURRENT_VERSION) { - (void) bunyan_warn(svp_bunyan, "unsupported version", + /* + * A peer that's messing with post-connection version changes is + * likely a broken peer. 
+ */ + if (scp->sc_cstate != SVP_CS_VERSIONING && nvers != scp->sc_version) { + (void) bunyan_warn(svp_bunyan, "version mismatch", BUNYAN_T_IP, "remote_ip", &scp->sc_addr, BUNYAN_T_INT32, "remote_port", scp->sc_remote->sr_rport, - BUNYAN_T_INT32, "version", nvers, + BUNYAN_T_INT32, "peer version", nvers, + BUNYAN_T_INT32, "our version", scp->sc_version, BUNYAN_T_INT32, "operation", nop, BUNYAN_T_INT32, "response_id", resp->svp_id, BUNYAN_T_END); return (B_FALSE); } - if (nop != SVP_R_VL2_ACK && nop != SVP_R_VL3_ACK && - nop != SVP_R_LOG_ACK && nop != SVP_R_LOG_RM_ACK) { + switch (nop) { + case SVP_R_VL2_ACK: + expected_size = sizeof (svp_vl2_ack_t); + break; + case SVP_R_VL3_ACK: + expected_size = sizeof (svp_vl3_ack_t); + break; + case SVP_R_LOG_RM_ACK: + expected_size = sizeof (svp_lrm_ack_t); + break; + case SVP_R_ROUTE_ACK: + expected_size = sizeof (svp_route_ack_t); + break; + case SVP_R_LOG_ACK: + case SVP_R_PONG: + /* No expected size (LOG_ACK) or size is 0 (PONG). */ + break; + default: (void) bunyan_warn(svp_bunyan, "unsupported operation", BUNYAN_T_IP, "remote_ip", &scp->sc_addr, BUNYAN_T_INT32, "remote_port", scp->sc_remote->sr_rport, @@ -445,26 +537,9 @@ svp_conn_pollin_validate(svp_conn_t *scp) return (B_FALSE); } - if ((nop == SVP_R_VL2_ACK && nsize != sizeof (svp_vl2_ack_t)) || - (nop == SVP_R_VL3_ACK && nsize != sizeof (svp_vl3_ack_t)) || - (nop == SVP_R_LOG_RM_ACK && nsize != sizeof (svp_lrm_ack_t))) { - (void) bunyan_warn(svp_bunyan, "response size too large", - BUNYAN_T_IP, "remote_ip", &scp->sc_addr, - BUNYAN_T_INT32, "remote_port", scp->sc_remote->sr_rport, - BUNYAN_T_INT32, "version", nvers, - BUNYAN_T_INT32, "operation", nop, - BUNYAN_T_INT32, "response_id", resp->svp_id, - BUNYAN_T_INT32, "response_size", nsize, - BUNYAN_T_INT32, "expected_size", nop == SVP_R_VL2_ACK ? 
- sizeof (svp_vl2_ack_t) : sizeof (svp_vl3_ack_t), - BUNYAN_T_INT32, "query_state", sqp->sq_state, - BUNYAN_T_END); - return (B_FALSE); - } - /* - * The valid size is anything <= to what the user requested, but at - * least svp_log_ack_t bytes large. + * For LOG_ACK, the valid size is anything <= to what the user + * requested, but at least svp_log_ack_t bytes large. */ if (nop == SVP_R_LOG_ACK) { const char *msg = NULL; @@ -487,12 +562,26 @@ svp_conn_pollin_validate(svp_conn_t *scp) BUNYAN_T_END); return (B_FALSE); } + } else if (nsize != expected_size) { + /* For other ops, we know the expected size. */ + (void) bunyan_warn(svp_bunyan, "response size too large", + BUNYAN_T_IP, "remote_ip", &scp->sc_addr, + BUNYAN_T_INT32, "remote_port", scp->sc_remote->sr_rport, + BUNYAN_T_INT32, "version", nvers, + BUNYAN_T_INT32, "operation", nop, + BUNYAN_T_INT32, "response_id", resp->svp_id, + BUNYAN_T_INT32, "response_size", nsize, + BUNYAN_T_INT32, "expected_size", expected_size, + BUNYAN_T_INT32, "query_state", sqp->sq_state, + BUNYAN_T_END); + return (B_FALSE); } sqp->sq_size = nsize; scp->sc_input.sci_query = sqp; if (nop == SVP_R_VL2_ACK || nop == SVP_R_VL3_ACK || - nop == SVP_R_LOG_RM_ACK) { + nop == SVP_R_LOG_RM_ACK || nop == SVP_R_ROUTE_ACK || + nop == SVP_R_PONG) { sqp->sq_wdata = &sqp->sq_wdun; sqp->sq_wsize = sizeof (svp_query_data_t); } else { @@ -582,7 +671,7 @@ svp_conn_pollin(svp_conn_t *scp) default: libvarpd_panic("unexpeted read errno: %d", errno); } - } else if (ret == 0) { + } else if (ret == 0 && total - off > 0) { /* Try to reconnect to the remote host */ return (SVP_RA_ERROR); } @@ -626,6 +715,20 @@ svp_conn_pollin(svp_conn_t *scp) } else if (nop == SVP_R_LOG_RM_ACK) { svp_lrm_ack_t *svra = sqp->sq_wdata; sqp->sq_status = ntohl(svra->svra_status); + } else if (nop == SVP_R_ROUTE_ACK) { + svp_route_ack_t *sra = sqp->sq_wdata; + sqp->sq_status = ntohl(sra->sra_status); + } else if (nop == SVP_R_PONG) { + /* + * Handle the PONG versioning-capture here, as 
we need + * the version number, the scp_lock held, and the ability + * to error out. + */ + svp_conn_act_t cbret; + + cbret = svp_conn_pong_handler(scp, sqp); + if (cbret != SVP_RA_NONE) + return (cbret); } else { libvarpd_panic("unhandled nop: %d", nop); } @@ -737,6 +840,7 @@ svp_conn_handler(port_event_t *pe, void *arg) assert(pe != NULL); ret = svp_conn_poll_connect(pe, scp); break; + case SVP_CS_VERSIONING: case SVP_CS_ACTIVE: case SVP_CS_WINDDOWN: assert(pe != NULL); @@ -774,6 +878,9 @@ out: mutex_enter(&srp->sr_lock); mutex_enter(&scp->sc_lock); + if (ret == SVP_RA_FIND_VERSION) + ret = svp_conn_ping_version(scp); + if (ret == SVP_RA_ERROR) ret = svp_conn_reset(scp); @@ -1014,7 +1121,8 @@ void svp_conn_queue(svp_conn_t *scp, svp_query_t *sqp) { assert(MUTEX_HELD(&scp->sc_lock)); - assert(scp->sc_cstate == SVP_CS_ACTIVE); + assert(scp->sc_cstate == SVP_CS_ACTIVE || + scp->sc_cstate == SVP_CS_VERSIONING); sqp->sq_acttime = -1; list_insert_tail(&scp->sc_queries, sqp); diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h b/usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h index 16dbdbec05..25a626afd1 100644 --- a/usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h +++ b/usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _LIBVARPD_SVP_PROT_H @@ -34,7 +34,13 @@ extern "C" { */ #define SVP_VERSION_ONE 1 -#define SVP_CURRENT_VERSION SVP_VERSION_ONE +#define SVP_VERSION_TWO 2 +/* + * Bump this to 2. Version 1 SVP is a subset of version 2, and can be + * determined using an SVP_R_PING as part of connection establishment. + * Version-2 specific changes will be highlighted (look for "v2"). + */ +#define SVP_CURRENT_VERSION SVP_VERSION_TWO typedef struct svp_req { uint16_t svp_ver; @@ -44,6 +50,10 @@ typedef struct svp_req { uint32_t svp_crc32; } svp_req_t; +/* + * Unless specified, all message types function identically between v1 and v2 + * of SVP. 
+ */ typedef enum svp_op { SVP_R_UNKNOWN = 0x00, SVP_R_PING = 0x01, @@ -54,11 +64,13 @@ typedef enum svp_op { SVP_R_VL3_ACK = 0x06, SVP_R_BULK_REQ = 0x07, SVP_R_BULK_ACK = 0x08, - SVP_R_LOG_REQ = 0x09, - SVP_R_LOG_ACK = 0x0A, + SVP_R_LOG_REQ = 0x09, /* v2 introduces new log type */ + SVP_R_LOG_ACK = 0x0A, /* See svp_log_route_t */ SVP_R_LOG_RM = 0x0B, SVP_R_LOG_RM_ACK = 0x0C, - SVP_R_SHOOTDOWN = 0x0D + SVP_R_SHOOTDOWN = 0x0D, + SVP_R_ROUTE_REQ = 0x0E, /* v2 only */ + SVP_R_ROUTE_ACK = 0x0F /* v2 only */ } svp_op_t; typedef enum svp_status { @@ -70,7 +82,7 @@ typedef enum svp_status { } svp_status_t; /* - * A client issues the SVP_R_VL2_REQ whenever it needs to perform a VLS->UL3 + * A client issues the SVP_R_VL2_REQ whenever it needs to perform a VL2->UL3 * lookup. Requests have the following structure: */ typedef struct svp_vl2_req { @@ -169,7 +181,8 @@ typedef struct svp_log_req { */ typedef enum svp_log_type { SVP_LOG_VL2 = 0x01, - SVP_LOG_VL3 = 0x02 + SVP_LOG_VL3 = 0x02, + SVP_LOG_ROUTE = 0x03 /* v2 only */ } svp_log_type_t; typedef struct svp_log_vl2 { @@ -189,6 +202,24 @@ typedef struct svp_log_vl3 { uint32_t svl3_vnetid; } svp_log_vl3_t; +/* + * This log entry only appears on v2 connections. + */ +typedef struct svp_log_route { + uint32_t svlr_type; /* Should be SVP_LOG_ROUTE */ + uint8_t svlr_id[16]; /* 16-byte UUID */ + uint32_t svlr_src_vnetid; /* Source VXLAN vnetid. */ + uint32_t svlr_dst_vnetid; /* Dest. VXLAN vnetid. */ + uint32_t svlr_dcid; /* Remote/dest Data Center ID. */ + uint8_t svlr_srcip[16]; /* Source IP address base. */ + uint8_t svlr_dstip[16]; /* Destination IP address base. */ + uint16_t svlr_src_vlan; /* Source VLAN id. */ + uint16_t svlr_dst_vlan; /* Destination VLAN id. */ + uint8_t svlr_src_prefixlen; /* Source IP prefix length. */ + uint8_t svlr_dst_prefixlen; /* Dest. IP prefix length. */ + uint16_t svlr_pad; /* So we can be aligned... 
*/ +} svp_log_route_t; + typedef struct svp_log_ack { uint32_t svla_status; uint8_t svla_data[]; @@ -229,6 +260,41 @@ typedef struct svp_shootdown { uint32_t svsd_vnetid; } svp_shootdown_t; +/* + * A route-request (SVP_R_ROUTE_REQ) queries the local SVP server to get a + * far-remote (i.e. another Triton Data Center, nee. SDC) SVP server for + * far-remote networks. Modern overlay modules will request IP destinations + * for remote-Triton networks, but they must know how to reach the + * remote-Triton SVP server. + * + * NOTE: SVP_R_ROUTE_{REQ,ACK} are only present in SVP v2. + */ +typedef struct svp_route_req { + uint32_t srr_vnetid; /* Requester's vnet ID. */ + uint16_t srr_vlan; /* Requester's VLAN ID. */ + uint16_t srr_pad; /* Zero on xmit, ignore on receipt. */ + uint8_t srr_srcip[16]; /* VL3 Source IP. */ + uint8_t srr_dstip[16]; /* VL3 Destination IP. */ +} svp_route_req_t; + +/* + * The far-remote Triton Data Center will answer with the requisite information + * to send overlay packets to the appropriate far-remote CNs. + */ +typedef struct svp_route_ack { + uint32_t sra_status; /* Status. */ + uint32_t sra_dcid; /* Far-remote Data Center ID. */ + uint32_t sra_vnetid; /* Far-remote vnet ID. */ + uint16_t sra_vlan; /* Far-remote VLAN ID. */ + uint16_t sra_port; /* Destination UL3 port. */ + uint8_t sra_ip[16]; /* Destination UL3 address. */ + uint8_t sra_srcmac[ETHERADDRL]; /* Far-remote VL2 source. */ + uint8_t sra_dstmac[ETHERADDRL]; /* Far-remote VL2 dest. */ + uint8_t sra_src_pfx; /* Far-remote VL3 source prefix */ + uint8_t sra_dst_pfx; /* Far-remote VL3 dest. prefix */ + uint16_t sra_pad; /* Must be explicit to 4-bytes. 
*/ +} svp_route_ack_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c b/usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c index 99775f93c0..cbb5572265 100644 --- a/usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c +++ b/usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c @@ -48,6 +48,12 @@ static svp_timer_t svp_dns_timer; static id_space_t *svp_idspace; static int svp_dns_timer_rate = 30; /* seconds */ +id_t +svp_id_alloc(void) +{ + return (id_alloc(svp_idspace)); +} + static void svp_remote_mkfmamsg(svp_remote_t *srp, svp_degrade_state_t state, char *buf, size_t buflen) @@ -245,6 +251,8 @@ svp_remote_attach(svp_remote_t *srp, svp_t *svp) libvarpd_panic("missing callback scb_vl2_invalidate"); if (svp->svp_cb.scb_vl3_inject == NULL) libvarpd_panic("missing callback scb_vl3_inject"); + if (svp->svp_cb.scb_route_lookup == NULL) + libvarpd_panic("missing callback scb_route_lookup"); check.svp_vid = svp->svp_vid; if (avl_find(&srp->sr_tree, &check, &where) != NULL) @@ -277,8 +285,41 @@ svp_remote_detach(svp_t *svp) } /* - * Walk the list of connections and find the first one that's available, the - * move it to the back of the list so it's less likely to be used again. + * See if the request can be sent over the connection's supported version. + * Scribble the version in the request itself. NOTE that we do not check the + * version that already exists in sqp->sq_header.svp_ver, as we may be called + * from svp_remote_reassign() (and change versions when arriving at a new + * connection). + */ +static boolean_t +svp_outbound_version_check(int version, svp_query_t *sqp) +{ + uint16_t op = htons(sqp->sq_header.svp_op); + + /* + * As of v1 -> v2, we really only need to restrict SVP_R_ROUTE_REQ + * as v2-only. Reflect that here. + * + * NOTE that if any message semantics change between versions, + * (e.g. "in v3 SVP_R_VL2_REQ takes on additional work"), we'll + * need to more-deeply inspect the query. 
It's possible that the + * svp_op space is big enough to just continue op-only inspections. + */ + + assert(version > 0 && version <= SVP_CURRENT_VERSION); + + if (op != SVP_R_ROUTE_REQ || version >= SVP_VERSION_TWO) { + sqp->sq_header.svp_ver = htons(version); + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Walk the list of connections and find the first one that's available AND + * version-appropriate for the message, then move the matched connection to + * the back of the list so it's less likely to be used again. */ static boolean_t svp_remote_conn_queue(svp_remote_t *srp, svp_query_t *sqp) @@ -289,7 +330,8 @@ svp_remote_conn_queue(svp_remote_t *srp, svp_query_t *sqp) for (scp = list_head(&srp->sr_conns); scp != NULL; scp = list_next(&srp->sr_conns, scp)) { mutex_enter(&scp->sc_lock); - if (scp->sc_cstate != SVP_CS_ACTIVE) { + if (scp->sc_cstate != SVP_CS_ACTIVE || + !svp_outbound_version_check(scp->sc_version, sqp)) { mutex_exit(&scp->sc_lock); continue; } @@ -329,14 +371,13 @@ svp_remote_vl2_lookup(svp_t *svp, svp_query_t *sqp, const uint8_t *mac, sqp->sq_arg = arg; sqp->sq_svp = svp; sqp->sq_state = SVP_QUERY_INIT; - sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION); sqp->sq_header.svp_op = htons(SVP_R_VL2_REQ); sqp->sq_header.svp_size = htonl(sizeof (svp_vl2_req_t)); sqp->sq_header.svp_id = id_alloc(svp_idspace); if (sqp->sq_header.svp_id == (id_t)-1) libvarpd_panic("failed to allcoate from svp_idspace: %d", errno); - sqp->sq_header.svp_crc32 = htonl(0); + sqp->sq_header.svp_crc32 = 0; sqp->sq_rdata = vl2r; sqp->sq_rsize = sizeof (svp_vl2_req_t); sqp->sq_wdata = NULL; @@ -352,6 +393,67 @@ svp_remote_vl2_lookup(svp_t *svp, svp_query_t *sqp, const uint8_t *mac, } static void +svp_remote_route_lookup_cb(svp_query_t *sqp, void *arg) +{ + svp_t *svp = sqp->sq_svp; + svp_route_ack_t *sra = (svp_route_ack_t *)sqp->sq_wdata; + + /* + * Do the ntoh*()-ing here. 
+ */ + if (sqp->sq_status == SVP_S_OK) { + svp->svp_cb.scb_route_lookup(svp, ntohl(sqp->sq_status), + ntohl(sra->sra_dcid), ntohl(sra->sra_vnetid), + ntohs(sra->sra_vlan), sra->sra_srcmac, sra->sra_dstmac, + ntohs(sra->sra_port), sra->sra_ip, sra->sra_src_pfx, + sra->sra_dst_pfx, arg); + } else { + svp->svp_cb.scb_route_lookup(svp, sqp->sq_status, + 0, 0, 0, NULL, NULL, 0, NULL, 0, 0, arg); + } +} + +void +svp_remote_route_lookup(svp_t *svp, svp_query_t *sqp, + const struct in6_addr *src, const struct in6_addr *dst, uint32_t vnetid, + uint16_t vlan, void *arg) +{ + svp_remote_t *srp; + svp_route_req_t *srr = &sqp->sq_rdun.sqd_rr; + + srp = svp->svp_remote; + sqp->sq_func = svp_remote_route_lookup_cb; + sqp->sq_arg = arg; + sqp->sq_svp = svp; + sqp->sq_state = SVP_QUERY_INIT; + sqp->sq_header.svp_op = htons(SVP_R_ROUTE_REQ); + sqp->sq_header.svp_size = htonl(sizeof (svp_route_req_t)); + sqp->sq_header.svp_id = id_alloc(svp_idspace); + if (sqp->sq_header.svp_id == (id_t)-1) + libvarpd_panic("failed to allcoate from svp_idspace: %d", + errno); + sqp->sq_header.svp_crc32 = 0; + sqp->sq_rdata = srr; + sqp->sq_rsize = sizeof (svp_route_req_t); + sqp->sq_wdata = NULL; + sqp->sq_wsize = 0; + + bcopy(src, srr->srr_srcip, sizeof (struct in6_addr)); + bcopy(dst, srr->srr_dstip, sizeof (struct in6_addr)); + /* Caller should've checked both are the same type... 
*/ + srr->srr_vnetid = htonl(vnetid); + srr->srr_vlan = htons(vlan); + srr->srr_pad = 0; + + mutex_enter(&srp->sr_lock); + if (!svp_remote_conn_queue(srp, sqp)) { + sqp->sq_status = SVP_S_FATAL; + sqp->sq_func(sqp, arg); + } + mutex_exit(&srp->sr_lock); +} + +static void svp_remote_vl3_lookup_cb(svp_query_t *sqp, void *arg) { svp_t *svp = sqp->sq_svp; @@ -378,14 +480,13 @@ svp_remote_vl3_common(svp_remote_t *srp, svp_query_t *sqp, sqp->sq_func = func; sqp->sq_arg = arg; sqp->sq_state = SVP_QUERY_INIT; - sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION); sqp->sq_header.svp_op = htons(SVP_R_VL3_REQ); sqp->sq_header.svp_size = htonl(sizeof (svp_vl3_req_t)); sqp->sq_header.svp_id = id_alloc(svp_idspace); if (sqp->sq_header.svp_id == (id_t)-1) libvarpd_panic("failed to allcoate from svp_idspace: %d", errno); - sqp->sq_header.svp_crc32 = htonl(0); + sqp->sq_header.svp_crc32 = 0; sqp->sq_rdata = vl3r; sqp->sq_rsize = sizeof (svp_vl3_req_t); sqp->sq_wdata = NULL; @@ -441,13 +542,22 @@ static void svp_remote_log_request_cb(svp_query_t *sqp, void *arg) { svp_remote_t *srp = sqp->sq_arg; + uint16_t version; + + /* + * Version in request is set in this sqp's read-data/sq_header by + * now. 
+ */ + assert(sqp->sq_header.svp_op == htons(SVP_R_LOG_REQ)); + assert(sqp->sq_header.svp_ver != 0); + version = htons(sqp->sq_header.svp_ver); assert(sqp->sq_wdata != NULL); if (sqp->sq_status == SVP_S_OK) svp_shootdown_logr_cb(srp, sqp->sq_status, sqp->sq_wdata, - sqp->sq_size); + sqp->sq_size, version); else - svp_shootdown_logr_cb(srp, sqp->sq_status, NULL, 0); + svp_shootdown_logr_cb(srp, sqp->sq_status, NULL, 0, 0); } void @@ -460,14 +570,13 @@ svp_remote_log_request(svp_remote_t *srp, svp_query_t *sqp, void *buf, sqp->sq_func = svp_remote_log_request_cb; sqp->sq_state = SVP_QUERY_INIT; sqp->sq_arg = srp; - sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION); sqp->sq_header.svp_op = htons(SVP_R_LOG_REQ); sqp->sq_header.svp_size = htonl(sizeof (svp_log_req_t)); sqp->sq_header.svp_id = id_alloc(svp_idspace); if (sqp->sq_header.svp_id == (id_t)-1) libvarpd_panic("failed to allcoate from svp_idspace: %d", errno); - sqp->sq_header.svp_crc32 = htonl(0); + sqp->sq_header.svp_crc32 = 0; sqp->sq_rdata = logr; sqp->sq_rsize = sizeof (svp_log_req_t); sqp->sq_wdata = buf; @@ -485,7 +594,7 @@ svp_remote_log_request(svp_remote_t *srp, svp_query_t *sqp, void *buf, mutex_exit(&srp->sr_lock); if (queued == B_FALSE) - svp_shootdown_logr_cb(srp, SVP_S_FATAL, NULL, 0); + svp_shootdown_logr_cb(srp, SVP_S_FATAL, NULL, 0, 0); } static void @@ -506,14 +615,13 @@ svp_remote_lrm_request(svp_remote_t *srp, svp_query_t *sqp, void *buf, sqp->sq_func = svp_remote_lrm_request_cb; sqp->sq_state = SVP_QUERY_INIT; sqp->sq_arg = srp; - sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION); sqp->sq_header.svp_op = htons(SVP_R_LOG_RM); sqp->sq_header.svp_size = htonl(buflen); sqp->sq_header.svp_id = id_alloc(svp_idspace); if (sqp->sq_header.svp_id == (id_t)-1) libvarpd_panic("failed to allcoate from svp_idspace: %d", errno); - sqp->sq_header.svp_crc32 = htonl(0); + sqp->sq_header.svp_crc32 = 0; sqp->sq_rdata = buf; sqp->sq_rsize = buflen; sqp->sq_wdata = NULL; @@ -533,7 +641,7 @@ 
svp_remote_lrm_request(svp_remote_t *srp, svp_query_t *sqp, void *buf, mutex_exit(&srp->sr_lock); if (queued == B_FALSE) - svp_shootdown_logr_cb(srp, SVP_S_FATAL, NULL, 0); + svp_shootdown_logr_cb(srp, SVP_S_FATAL, NULL, 0, 0); } /* ARGSUSED */ @@ -795,6 +903,21 @@ svp_remote_shootdown_vl2(svp_remote_t *srp, svp_log_vl2_t *svl2) mutex_exit(&srp->sr_lock); } +void +svp_remote_shootdown_route(svp_remote_t *srp, svp_log_route_t *svlr) +{ + svp_t *svp, lookup; + + lookup.svp_vid = ntohl(svlr->svlr_src_vnetid); + mutex_enter(&srp->sr_lock); + if ((svp = avl_find(&srp->sr_tree, &lookup, NULL)) != NULL) { + svp->svp_cb.scb_route_shootdown(svp, svlr->svlr_srcip, + svlr->svlr_dstip, svlr->svlr_src_prefixlen, + svlr->svlr_dst_prefixlen, htons(svlr->svlr_src_vlan)); + } + mutex_exit(&srp->sr_lock); +} + int svp_remote_init(void) { diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c b/usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c index 76afb2519f..eacc927b4f 100644 --- a/usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c +++ b/usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c @@ -154,7 +154,7 @@ svp_shootdown_logr_shoot(void *data, svp_log_type_t type, void *arg) svp_remote_t *srp = sdl->sdl_remote; svp_lrm_req_t *svrr = sdl->sdl_logrm; - if (type != SVP_LOG_VL2 && type != SVP_LOG_VL3) + if (type != SVP_LOG_VL2 && type != SVP_LOG_VL3 && type != SVP_LOG_ROUTE) libvarpd_panic("encountered unknown type: %d\n", type); if (type == SVP_LOG_VL2) { @@ -165,12 +165,21 @@ svp_shootdown_logr_shoot(void *data, svp_log_type_t type, void *arg) UUID_LEN); svrr->svrr_count++; mutex_exit(&sdl->sdl_lock); - } else { + } else if (type == SVP_LOG_VL3) { svp_log_vl3_t *svl3 = data; /* Take a hold for the duration of this request */ svp_shootdown_ref(sdl); svp_remote_shootdown_vl3(srp, svl3, sdl); + } else { + svp_log_route_t *svlr = data; + + svp_remote_shootdown_route(srp, svlr); + mutex_enter(&sdl->sdl_lock); + bcopy(svlr->svlr_id, &svrr->svrr_ids[svrr->svrr_count * 
16], + UUID_LEN); + svrr->svrr_count++; + mutex_exit(&sdl->sdl_lock); } return (0); @@ -187,13 +196,11 @@ svp_shootdown_logr_count(void *data, svp_log_type_t type, void *arg) static int svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len, - int (*cb)(void *, svp_log_type_t, void *), void *arg) + int (*cb)(void *, svp_log_type_t, void *), void *arg, uint16_t version) { int ret; off_t cboff = 0; uint32_t *typep, type; - svp_log_vl2_t *svl2; - svp_log_vl3_t *svl3; /* Adjust for initial status word */ assert(len >= sizeof (uint32_t)); @@ -202,6 +209,7 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len, while (len > 0) { size_t opsz; + char *typestring; if (len < sizeof (uint32_t)) { (void) bunyan_warn(svp_bunyan, @@ -216,30 +224,20 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len, typep = buf + cboff; type = ntohl(*typep); - if (type == SVP_LOG_VL2) { + switch (type) { + case SVP_LOG_VL2: opsz = sizeof (svp_log_vl2_t); - if (len < opsz) { - (void) bunyan_warn(svp_bunyan, - "not enough data for svp_log_vl2_t", - BUNYAN_T_STRING, "remote_host", - srp->sr_hostname, - BUNYAN_T_INT32, "remote_port", - srp->sr_rport, - BUNYAN_T_INT32, "response_size", - cboff + len, - BUNYAN_T_INT32, "response_offset", cboff, - BUNYAN_T_END); - return (-1); - } - svl2 = (void *)typep; - if ((ret = cb(svl2, type, arg)) != 0) - return (ret); - } else if (type == SVP_LOG_VL3) { - + typestring = "svp_log_vl2_t"; + break; + case SVP_LOG_VL3: opsz = sizeof (svp_log_vl3_t); - if (len < opsz) { + typestring = "svp_log_vl3_t"; + break; + case SVP_LOG_ROUTE: + if (version < SVP_VERSION_TWO) { (void) bunyan_warn(svp_bunyan, - "not enough data for svp_log_vl3_t", + "insufficient version for SVP_LOG_ROUTE", + BUNYAN_T_UINT32, "version", version, BUNYAN_T_STRING, "remote_host", srp->sr_hostname, BUNYAN_T_INT32, "remote_port", @@ -250,10 +248,10 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len, BUNYAN_T_END); return (-1); } - svl3 = (void 
*)typep; - if ((ret = cb(svl3, type, arg)) != 0) - return (ret); - } else { + opsz = sizeof (svp_log_route_t); + typestring = "svp_log_route_t"; + break; + default: (void) bunyan_warn(svp_bunyan, "unknown log structure type", BUNYAN_T_STRING, "remote_host", @@ -265,6 +263,20 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len, BUNYAN_T_END); return (-1); } + if (len < opsz) { + (void) bunyan_warn(svp_bunyan, + "not enough data for message type", + BUNYAN_T_STRING, "msg_type", typestring, + BUNYAN_T_STRING, "remote_host", srp->sr_hostname, + BUNYAN_T_INT32, "remote_port", srp->sr_rport, + BUNYAN_T_INT32, "response_size", cboff + len, + BUNYAN_T_INT32, "response_offset", cboff, + BUNYAN_T_END); + return (-1); + } + if ((ret = cb((void *)typep, type, arg)) != 0) + return (ret); + len -= opsz; cboff += opsz; } @@ -274,7 +286,7 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len, void svp_shootdown_logr_cb(svp_remote_t *srp, svp_status_t status, void *cbdata, - size_t cbsize) + size_t cbsize, uint16_t version) { uint_t count; svp_sdlog_t *sdl = &srp->sr_shoot; @@ -301,7 +313,7 @@ svp_shootdown_logr_cb(svp_remote_t *srp, svp_status_t status, void *cbdata, */ count = 0; if ((svp_shootdown_logr_iter(srp, cbdata, cbsize, - svp_shootdown_logr_count, &count)) != 0) { + svp_shootdown_logr_count, &count, version)) != 0) { mutex_enter(&sdl->sdl_lock); sdl->sdl_flags &= ~SVP_SD_RUNNING; svp_shootdown_schedule(sdl, B_FALSE); @@ -337,7 +349,7 @@ svp_shootdown_logr_cb(svp_remote_t *srp, svp_status_t status, void *cbdata, * is how many entries we have to remove. */ (void) svp_shootdown_logr_iter(srp, cbdata, cbsize, - svp_shootdown_logr_shoot, sdl); + svp_shootdown_logr_shoot, sdl, version); /* * Now that we're done with our work, release the hold. 
If we don't have diff --git a/usr/src/man/man1m/dladm.1m b/usr/src/man/man1m/dladm.1m index 77bf045e08..9912269dfb 100644 --- a/usr/src/man/man1m/dladm.1m +++ b/usr/src/man/man1m/dladm.1m @@ -176,7 +176,7 @@ dladm \- administer data links .LP .nf -\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR \fB-v\fR \fIvnetid\fR [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR +\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR \fB-v\fR \fIvnetid\fR [\fB-d\fR \fIdcid\fR] [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR \fBdladm delete-overlay\fR \fIoverlay\fR \fBdladm modify-overlay\fR \fB-d\fR \fImac\fR | \fB-f\fR | \fB-s\fR \fImac=ip:port\fR \fIoverlay\fR \fBdladm show-overlay\fR [ \fB-f\fR | \fB-t\fR ] [[\fB-p\fR] \fB-o\fR \fIfield\fR[,...]] [\fIoverlay\fR] @@ -4443,8 +4443,8 @@ The tunnel destination address. .sp .ne 2 .na -\fBdladm create-overlay\fR \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR -\fB-v\fR \fIvnetid\fR [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR +\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR +\fB-v\fR \fIvnetid\fR [\fB-d\fR \fIdcid\fR] [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR .ad .sp .6 .RS 4n @@ -4501,6 +4501,16 @@ determines how non-local targets are found and where packets are directed to. .sp .ne 2 .na +\fB\fB-d\fR \fIdcid\fR +.ad +.sp .6 +.RS 4n +Set the datacenter id to \fIdcid\fR. 
+.RE + +.sp +.ne 2 +.na \fB\fB-p\fR \fIprop\fR=\fIvalue\fR,..., \fB--prop\fR \fIprop\fR=\fIvalue\fR,...\fR .ad diff --git a/usr/src/test/os-tests/runfiles/default.run b/usr/src/test/os-tests/runfiles/default.run index 72158c8bc2..927c24ad7f 100644 --- a/usr/src/test/os-tests/runfiles/default.run +++ b/usr/src/test/os-tests/runfiles/default.run @@ -83,6 +83,7 @@ tests = ['acquire-compare', 'kmc-update'] [/opt/os-tests/tests/OS-6097.32] [/opt/os-tests/tests/OS-6097.64] +[/opt/os-tests/tests/qqcache] [/opt/os-tests/tests/ddi_ufm] user = root diff --git a/usr/src/test/os-tests/tests/Makefile b/usr/src/test/os-tests/tests/Makefile index 7396e135c9..3ec39ef440 100644 --- a/usr/src/test/os-tests/tests/Makefile +++ b/usr/src/test/os-tests/tests/Makefile @@ -23,6 +23,7 @@ SUBDIRS = \ libtopo \ pf_key \ poll \ + qqcache \ sdevfs \ secflags \ sigqueue \ diff --git a/usr/src/test/os-tests/tests/qqcache/Makefile b/usr/src/test/os-tests/tests/qqcache/Makefile new file mode 100644 index 0000000000..43843b36fb --- /dev/null +++ b/usr/src/test/os-tests/tests/qqcache/Makefile @@ -0,0 +1,60 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. 
+# + +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/test/Makefile.com + +PROG = qqcache + +LOCAL_OBJS = qqcache-test.o +COMMON_OBJS = qqcache.o +COMMON_DIR = $(SRC)/uts/common/qqcache +OBJS = $(LOCAL_OBJS) $(COMMON_OBJS) + +ROOTOPTPKG = $(ROOT)/opt/os-tests +TESTDIR = $(ROOTOPTPKG)/tests/qqcache + +CMDS = $(PROG:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +CSTD = $(CSTD_GNU99) +LDLIBS += -lumem -lcmdutils + +all: $(PROG) + +install: all $(CMDS) + +lint: + +clobber: clean + -$(RM) $(PROG) $(OBJS) + +clean: + -$(RM) $(CLEANFILES) + +$(CMDS): $(TESTDIR) $(PROG) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDLIBS) + +%.o: $(COMMON_DIR)/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) diff --git a/usr/src/test/os-tests/tests/qqcache/qqcache-test.c b/usr/src/test/os-tests/tests/qqcache/qqcache-test.c new file mode 100644 index 0000000000..2606ffffb8 --- /dev/null +++ b/usr/src/test/os-tests/tests/qqcache/qqcache-test.c @@ -0,0 +1,380 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. 
+ */ +#include <err.h> +#include <errno.h> +#include <limits.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <string.h> +#include <sys/debug.h> +#include <sys/list.h> +#include <sys/types.h> +#include <sys/qqcache.h> +#include <sys/qqcache_impl.h> +#include <umem.h> + +/* Some arbitrary sizes */ +#define INITIAL_CACHE_SIZE 12 +#define INITIAL_CACHE_A 25 +#define CACHE_HSIZE 11 + +#define OUTPUT_WIDTH 80 + +/* + * If we extend the implementation to use more lists, the test code will need + * to be updated accordingly + */ +CTASSERT(QQCACHE_NUM_LISTS == 2); + +typedef struct entry { + uint_t e_val; + qqcache_link_t e_link; +} entry_t; + +enum { + ITER_ERROR = -1, + ITER_OK = 0, + ITER_STOP = 1 +}; + +static uint64_t entry_hash(const void *); +static int entry_cmp(const void *, const void *); +static void entry_dtor(void *); +static entry_t *entry_new(uint_t val); + +static void expect(qqcache_t *, uint_t *, size_t, uint_t *, size_t, int); +static void expect_val(qqcache_t *, const entry_t *, uint_t); +static void dump_cache(qqcache_t *); +static int iter_list(qqcache_t *, size_t, int (*)(void *, void *), void *); +static int xprintf(FILE *, const char *, ...); + +int +main(void) +{ + qqcache_t *qc; + uint_t val; + + VERIFY0(qqcache_create(&qc, INITIAL_CACHE_SIZE, INITIAL_CACHE_A, + CACHE_HSIZE, entry_hash, entry_cmp, entry_dtor, sizeof (entry_t), + offsetof(entry_t, e_link), offsetof(entry_t, e_val), UMEM_DEFAULT)); + + /* Create a few entries */ + VERIFY0(qqcache_insert(qc, entry_new(5))); + VERIFY0(qqcache_insert(qc, entry_new(4))); + VERIFY0(qqcache_insert(qc, entry_new(3))); + VERIFY0(qqcache_insert(qc, entry_new(2))); + VERIFY0(qqcache_insert(qc, entry_new(1))); + expect(qc, NULL, 0, (uint_t[]){1, 2, 3, 4, 5}, 5, __LINE__); + + /* Adding a duplicate should fail */ + { + entry_t *e = entry_new(3); + VERIFY3S(qqcache_insert(qc, e), ==, EEXIST); + entry_dtor(e); + } + + VERIFY0(qqcache_insert(qc, entry_new(10))); + 
VERIFY0(qqcache_insert(qc, entry_new(9))); + VERIFY0(qqcache_insert(qc, entry_new(8))); + VERIFY0(qqcache_insert(qc, entry_new(7))); + /* This should bump the LRU entry (5) from the list */ + VERIFY0(qqcache_insert(qc, entry_new(6))); + expect(qc, NULL, 0, + (uint_t[]){6, 7, 8, 9, 10, 1, 2, 3, 4}, 9, __LINE__); + + /* Lookup a few entries to move them to the MFU list */ + val = 3; + expect_val(qc, qqcache_lookup(qc, &val), 3); + expect(qc, (uint_t[]) {3}, 1, + (uint_t[]){6, 7, 8, 9, 10, 1, 2, 4}, 8, __LINE__); + + val = 8; + expect_val(qc, qqcache_lookup(qc, &val), 8); + expect(qc, (uint_t[]) {8, 3}, 2, + (uint_t[]){6, 7, 9, 10, 1, 2, 4}, 7, __LINE__); + + /* Now move 3 back to the head of list 0 */ + val = 3; + expect_val(qc, qqcache_lookup(qc, &val), 3); + expect(qc, (uint_t[]) {3, 8}, 2, + (uint_t[]){6, 7, 9, 10, 1, 2, 4}, 7, __LINE__); + + val = 7; + expect_val(qc, qqcache_lookup(qc, &val), 7); + expect(qc, (uint_t[]) {7, 3, 8}, 3, + (uint_t[]){6, 9, 10, 1, 2, 4}, 6, __LINE__); + + /* This should push 8 from the MFU back onto the MRU */ + val = 10; + expect_val(qc, qqcache_lookup(qc, &val), 10); + expect(qc, (uint_t[]) {10, 7, 3}, 3, + (uint_t[]){8, 6, 9, 1, 2, 4}, 6, __LINE__); + + /* Add some more values */ + VERIFY0(qqcache_insert(qc, entry_new(11))); + VERIFY0(qqcache_insert(qc, entry_new(12))); + VERIFY0(qqcache_insert(qc, entry_new(13))); + VERIFY0(qqcache_insert(qc, entry_new(14))); + VERIFY0(qqcache_insert(qc, entry_new(15))); + expect(qc, (uint_t[]) {10, 7, 3}, 3, + (uint_t[]){15, 14, 13, 12, 11, 8, 6, 9, 1, 2}, 9, __LINE__); + + VERIFY0(qqcache_adjust_size(qc, INITIAL_CACHE_SIZE + 4)); + expect(qc, (uint_t[]) {10, 7, 3}, 3, + (uint_t[]){15, 14, 13, 12, 11, 8, 6, 9, 1, 2}, 9, __LINE__); + + VERIFY0(qqcache_insert(qc, entry_new(16))); + VERIFY0(qqcache_insert(qc, entry_new(17))); + VERIFY0(qqcache_insert(qc, entry_new(18))); + VERIFY0(qqcache_insert(qc, entry_new(19))); + expect(qc, (uint_t[]) {10, 7, 3}, 3, + (uint_t[]) {19, 18, 17, 16, 15, 14, 13, 12, 
11, 8, 6, 9}, 12, + __LINE__); + + VERIFY0(qqcache_adjust_size(qc, INITIAL_CACHE_SIZE - 2)); + expect(qc, (uint_t[]) {10, 7}, 2, + (uint_t[]){3, 19, 18, 17, 16, 15, 14, 13}, 8, __LINE__); + + VERIFY3S(qqcache_adjust_size(qc, 2), ==, EINVAL); + + VERIFY0(qqcache_adjust_a(qc, 50)); + expect(qc, (uint_t[]) {10, 7}, 2, + (uint_t[]){3, 19, 18, 17, 16}, 5, __LINE__); + + qqcache_destroy(qc); + return (0); +} + +struct cmp_arg { + qqcache_t *qc; + uint_t *vals; + size_t i; + size_t listnum; + int linenum; +}; + +static int +cmp_cb(void *op, void *arg) +{ + entry_t *e = op; + struct cmp_arg *ca = arg; + uint_t val = ca->vals[ca->i++]; + + if (e->e_val == val) + return (ITER_OK); + + (void) xprintf(stderr, "Line %d: Unexpected value in list %zu.\n", + ca->linenum, ca->listnum); + (void) xprintf(stderr, " Expected: %u\n Actual: %u\n", val, + e->e_val); + + return (ITER_ERROR); +} + +static void +cmp_list(qqcache_t *qc, size_t listnum, uint_t *vals, size_t n, int linenum) +{ + qqcache_list_t *l = &qc->qqc_lists[listnum]; + struct cmp_arg arg = { + .qc = qc, + .vals = vals, + .i = 0, + .listnum = listnum, + .linenum = linenum + }; + + if (l->qqcl_len != n) { + (void) xprintf(stderr, + "Line %d: Unexpected length for list %zu.\n" + " Length: %zu\n" + " Expected: %zu\n\n", linenum, listnum, l->qqcl_len, n); + dump_cache(qc); + } + + if (iter_list(qc, listnum, cmp_cb, &arg) != ITER_OK) { + dump_cache(qc); + exit(1); + } +} + +static void +expect(qqcache_t *qc, uint_t *l0, size_t l0sz, uint_t *l1, size_t l1sz, + int linenum) +{ + cmp_list(qc, 0, l0, l0sz, linenum); + cmp_list(qc, 1, l1, l1sz, linenum); +} + +static void +expect_val(qqcache_t *qc, const entry_t *e, uint_t val) +{ + char buf[2][64]; + if (e == NULL && val == UINT_MAX) + return; + + if (e != NULL && e->e_val == val) + return; + + if (e != NULL) + (void) snprintf(buf[0], sizeof (buf[0]), "%u", e->e_val); + else + (void) strlcpy(buf[0], "<NULL>", sizeof (buf[0])); + + if (val != UINT_MAX) + (void) snprintf(buf[1], 
sizeof (buf[1]), "%u", val); + else + (void) strlcpy(buf[1], "<NONE>", sizeof (buf[1])); + + (void) xprintf(stderr, "Unexpected value in list:\n"); + (void) xprintf(stderr, " Found: %s\n Expected: %s\n", + buf[0], buf[1]); + dump_cache(qc); + exit(1); +} + +struct dump_args { + int prefixlen; + int col; + boolean_t nl; +}; + +static int +dump_entry(void *ep, void *arg) +{ + entry_t *e = ep; + struct dump_args *da = arg; + char buf[64] = { 0 }; + int n; + + n = snprintf(buf, sizeof (buf), "%u", e->e_val); + /* buf should be large enough to hold an unsigned val */ + VERIFY3S(n, >, 0); + VERIFY3S(n, <, sizeof (buf)); + + if (da->col + n + 2 > OUTPUT_WIDTH) { + da->col = xprintf(stderr, "\n%*s", da->prefixlen, "") - 1; + da->nl = B_TRUE; + } else if (!da->nl) { + da->col += xprintf(stderr, ", "); + } + + da->col += xprintf(stderr, "%s", buf); + da->nl = B_FALSE; + + return (ITER_OK); +} + +static void +dump_cache(qqcache_t *qc) +{ + (void) xprintf(stderr, "Cache contents:\n"); + + for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) { + qqcache_list_t *l = &qc->qqc_lists[i]; + struct dump_args args = { + .nl = B_TRUE + }; + + args.col = args.prefixlen = + xprintf(stderr, "List %zu (%zu/%zu): ", i, l->qqcl_len, + qc->qqc_max[i]); + + (void) iter_list(qc, i, dump_entry, &args); + VERIFY(fputc('\n', stderr)); + } +} + +static int +iter_list(qqcache_t *qc, size_t listnum, int (*cb)(void *, void *), + void *arg) +{ + qqcache_list_t *l = &qc->qqc_lists[listnum]; + void *lp; + int ret; + + for (lp = list_head(&l->qqcl_list); lp != NULL; + lp = list_next(&l->qqcl_list, lp)) { + if ((ret = cb(link_to_obj(qc, lp), arg)) != ITER_OK) + return (ret); + } + + return (ITER_OK); +} + +/* + * A small wrapper around vfprintf(3C) so caller doesn't need to deal with + * errors or negative return values. + */ +static int +xprintf(FILE *f, const char *fmt, ...) 
+{ + int n; + va_list ap; + + va_start(ap, fmt); + n = vfprintf(f, fmt, ap); + va_end(ap); + + if (n < 0 || ferror(f)) + err(EXIT_FAILURE, "\nUnable to write output"); + + return (n); +} + +static entry_t * +entry_new(uint_t val) +{ + entry_t *e = calloc(1, sizeof (*e)); + + VERIFY3P(e, !=, NULL); + e->e_val = val; + return (e); +} + +static uint64_t +entry_hash(const void *p) +{ + const uint_t *vp = p; + uint64_t val = *vp; + return (val); +} + +static int +entry_cmp(const void *a, const void *b) +{ + const uint_t *l = a; + const uint_t *r = b; + return ((*l == *r) ? 0 : 1); +} + +static void +entry_dtor(void *arg) +{ + free(arg); +} + +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); +} diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 752fe56100..e973cf58ad 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -299,6 +299,7 @@ GENUNIX_OBJS += \ resolvepath.o \ retire_store.o \ process.o \ + qqcache.o \ rlimit.o \ rmap.o \ rw.o \ diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index a32c094f3b..ba8945b6fb 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -26,6 +26,7 @@ # Copyright 2020 Joyent, Inc. # Copyright 2018 Nexenta Systems, Inc. # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2018 Joyent, Inc. 
 # Copyright 2020 Oxide Computer Company # @@ -1627,6 +1628,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/pcmcia/pcs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/qqcache/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/refhash/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -2778,6 +2787,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/nvpair/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/os/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/qqcache/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/rpc/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c index 2ad3f4f591..c54a6e0d9c 100644 --- a/usr/src/uts/common/io/overlay/overlay.c +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -134,6 +134,40 @@ * be sent to. In addition, they handle questions related to how to handle * things like broadcast and multicast traffic, etc. * + * ROUTING + * + * Supporting routing of packets between VLANs that exist on an overlay + * network requires two major differences. First, packets destined for off-VLAN + * destinations need to be identified. Second, we must obtain the + * additional information necessary to deliver the packet to its off-VLAN + * destination. + * + * To solve the first issue, we utilize the existing IP routing functionality. + * Off-VLAN destinations are given routes with next hops in the originating + * netstack's routing table--just like in physical networks. The system will + * then attempt to generate an ARP query, which will be sent out to varpd in + * the exact same manner as is described above for other on-VLAN destinations.
+ * The response for this will include a MAC address that is both used for the + * ARP reply, and is added to our VL2 MAC->UL3 hash table, but is added + * with the OVERLAY_ENTRY_F_ROUTER flag set. Once this is done, the originating + * netstack will send off-VLAN packets to this router MAC, allowing the + * overlay device to identify these packets as requiring routing. + * + * Once packets with an off-VLAN destination are identified, we must determine + * what the destination vid, VL2 VLAN, and VL2 MAC values are for the given + * packet. For reasons similar to the VL2 MAC->UL3 lookup described above, + * we utilize the flexibility of user land to perform these lookups (also + * using varpd). In this instance we are attempting to find a destination VL3 + * IP to a UL3 IP mapping (a few extra bits of information are necessary to + * allow for disambiguation of the destination VL3 IP for situations such as + * mirroring a production environment including VL3 IPs in an isolated set of + * VLANs). We then store these results in a VL3->UL3 hash table for future + * lookups. + * + * To prevent the size of both the VL2->UL3 and VL3->UL3 hash tables from + * growing without bound, we cap the number of entries in each hash table and + * utilize the ARC algorithm to manage their contents. + * * ---------- * Properties * ---------- @@ -205,6 +239,10 @@ * UTF-8. Note that the size of the string includes the null * terminator. * + * OVERLAY_PROP_T_ETHER + * + * An ether_addr_t, which has a fixed size. + * * The next thing that we apply to a property is its permission. The permissions * are put together by the bitwise or of the following flags and values. * @@ -461,7 +499,7 @@ * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things * are much more interesting and as a result, more complicated. We primarily * store lists of overlay_target_entry_t's which are stored in both an avl tree - * and a refhash_t. 
 The primary look up path uses the qqcache_t and the avl tree + * is only used for a few of the target ioctls used to dump data such that we * can get a consistent iteration order for things like dladm show-overlay -t. * The key that we use for the reference hashtable is based on the mac address @@ -486,6 +524,28 @@ * any outstanding data to that place. For the full story on how we look that up * will be discussed in the section on the Target Cache Lifecycle. * + * For routing, everything works largely the same as it does in the non-routing + * situations. The major differences are that the target cache is always + * an OVERLAY_TARGET_DYNAMIC cache, and that an additional hash table lookup + * occurs. When a routed packet is sent down stack, the + * overlay_target_entry_t in the VL2 cache will have its + * OVERLAY_ENTRY_F_ROUTER flag set, which will prompt a lookup in the VL3->UL3 + * cache (using the source VL3, source VL2 VLAN, and destination VL3 values + * from the packet as the lookup key). The entry returned from the cache is + * used to modify the source and destination VL2 MAC addresses as well as + * the VL2 VLAN ID, and then is encapsulated and sent to its UL3 destination. + * On reception, decapsulation happens exactly the same as in the non-routed + * case, and the packet appears just as if it was sent out the VL2 network + * from a router connected to it. This is done both to maintain the illusion + * of a physical network when sniffing packets at the instance level, and + * so that the mac layer sitting above the destination's overlay device + * (for the vnic created over the overlay) does not discard the packet because + * its VLAN tag does not match the VLAN tag of the destination VNIC.
 While + * some of these modifications could be split between the source and + * destination hosts, by doing the work on the source, it maximizes any + * potential parallelism that might be present from multiple flows to a given + * destination. + * * ------------------------ * FMA and Degraded Devices * ------------------------ @@ -830,21 +890,62 @@ typedef enum overlay_dev_prop { OVERLAY_DEV_P_MTU = 0, OVERLAY_DEV_P_VNETID, OVERLAY_DEV_P_ENCAP, - OVERLAY_DEV_P_VARPDID + OVERLAY_DEV_P_VARPDID, + OVERLAY_DEV_P_DCID, + OVERLAY_DEV_P_VL2_CACHE_SIZE, + OVERLAY_DEV_P_VL2_CACHE_A, + OVERLAY_DEV_P_ROUTE_CACHE_SIZE, + OVERLAY_DEV_P_ROUTE_CACHE_A } overlay_dev_prop_t; -#define OVERLAY_DEV_NPROPS 4 +#define OVERLAY_DEV_NPROPS 9 static const char *overlay_dev_props[] = { "mtu", "vnetid", "encap", - "varpd/id" + "varpd/id", + "dcid", + "vl2_cache_size", + "_vl2_cache_a", + "route_cache_size", + "_route_cache_a" }; +/* properties that can be changed live */ +static boolean_t overlay_dev_liveprop[] = { + B_FALSE, /* mtu */ + B_FALSE, /* vnetid */ + B_FALSE, /* encap */ + B_FALSE, /* varpd/id */ + B_FALSE, /* dcid */ + B_TRUE, /* vl2_cache_size */ + B_TRUE, /* _vl2_cache_a */ + B_TRUE, /* route_cache_size */ + B_TRUE /* _route_cache_a */ +}; + +CTASSERT(ARRAY_SIZE(overlay_dev_props) == OVERLAY_DEV_NPROPS); +CTASSERT(ARRAY_SIZE(overlay_dev_liveprop) == OVERLAY_DEV_NPROPS); + #define OVERLAY_MTU_MIN 576 #define OVERLAY_MTU_DEF 1400 #define OVERLAY_MTU_MAX 8900 +/* The 2Q parameter 'a' is a percentage */ +#define OVERLAY_CACHE_MAX_A 100 +/* A somewhat arbitrary default, biasing towards storing more MFU entries */ +#define OVERLAY_CACHE_A_DEF 75 + +/* Somewhat arbitrary min and max values */ +#define OVERLAY_VL2_CACHE_MIN 256 +#define OVERLAY_VL2_CACHE_MAX 10240 +#define OVERLAY_VL2_CACHE_DEF OVERLAY_VL2_CACHE_MIN + +/* Somewhat arbitrary min and max values */ +#define OVERLAY_ROUTE_CACHE_MIN 256 +#define OVERLAY_ROUTE_CACHE_MAX 10240 +#define OVERLAY_ROUTE_CACHE_DEF
OVERLAY_ROUTE_CACHE_MIN + overlay_dev_t * overlay_hold_by_dlid(datalink_id_t id) { @@ -1066,7 +1167,6 @@ overlay_m_tx(void *arg, mblk_t *mp_chain) bzero(&hdr, sizeof (struct msghdr)); bzero(&einfo, sizeof (ovep_encap_info_t)); - einfo.ovdi_id = odd->odd_vid; mp = mp_chain; while (mp != NULL) { socklen_t slen; @@ -1077,7 +1177,7 @@ overlay_m_tx(void *arg, mblk_t *mp_chain) ep = NULL; ret = overlay_target_lookup(odd, mp, - (struct sockaddr *)&storage, &slen); + (struct sockaddr *)&storage, &slen, &einfo.ovdi_id); if (ret != OVERLAY_TARGET_OK) { if (ret == OVERLAY_TARGET_DROP) freemsg(mp); @@ -1260,6 +1360,19 @@ overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) } odd->odd_vid = oicp->oic_vnetid; + if (oicp->oic_dcid > UINT32_MAX) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + odd->odd_dcid = oicp->oic_dcid; + + odd->odd_vl2sz = OVERLAY_VL2_CACHE_DEF; + odd->odd_vl2a = OVERLAY_CACHE_A_DEF; + odd->odd_routesz = OVERLAY_ROUTE_CACHE_DEF; + odd->odd_routea = OVERLAY_CACHE_A_DEF; + mac = mac_alloc(MAC_VERSION); if (mac == NULL) { mutex_exit(&overlay_dev_lock); @@ -1613,6 +1726,7 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, int ret; mac_perim_handle_t mph; uint_t propid = UINT_MAX; + uint32_t def; overlay_ioc_propinfo_t *oip = karg; overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; @@ -1695,6 +1809,42 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); overlay_prop_set_nodefault(phdl); break; + case OVERLAY_DEV_P_DCID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + def = OVERLAY_VL2_CACHE_DEF; + overlay_prop_set_prot(phdl, 
OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, OVERLAY_VL2_CACHE_MIN, + OVERLAY_VL2_CACHE_MAX); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + def = OVERLAY_CACHE_A_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + def = OVERLAY_ROUTE_CACHE_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, OVERLAY_ROUTE_CACHE_MIN, + OVERLAY_ROUTE_CACHE_MAX); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + def = OVERLAY_CACHE_A_DEF; + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A); + break; default: overlay_hold_rele(odd); mac_perim_exit(mph); @@ -1804,6 +1954,41 @@ overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, } mutex_exit(&odd->odd_lock); break; + case OVERLAY_DEV_P_DCID: + /* + * While it's read-only while inside of a mux, we're not in a + * context that can guarantee that. Therefore we always grab the + * overlay_dev_t's odd_lock. 
+ */ + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vl2sz, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vl2a, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_routesz, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_routea, oip->oip_value, sizeof (uint32_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint32_t); + break; default: ret = ENOENT; } @@ -1845,6 +2030,146 @@ overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) mutex_exit(&odd->odd_lock); } +static void +overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid) +{ + mutex_enter(&odd->odd_lock); + + /* Simple case, not active */ + if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { + odd->odd_dcid = dcid; + mutex_exit(&odd->odd_lock); + return; + } + + /* + * In the hard case, we need to set the drop flag, quiesce I/O and then + * we can go ahead and do everything. 
+ */ + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + mutex_enter(&odd->odd_lock); + odd->odd_dcid = dcid; + mutex_exit(&odd->odd_lock); + overlay_mux_add_dev(odd->odd_mux, odd); + + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); + odd->odd_flags &= ~OVERLAY_F_IN_MUX; + mutex_exit(&odd->odd_lock); +} + +static int +overlay_setprop_vl2_cachesz(overlay_dev_t *odd, uint32_t sz) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + if (sz == 0) + sz = OVERLAY_VL2_CACHE_DEF; + + /* Caller should have validated this */ + ASSERT3U(sz, >=, OVERLAY_VL2_CACHE_MIN); + ASSERT3U(sz, <=, OVERLAY_VL2_CACHE_MAX); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_dhash, sz); + mutex_exit(&ott->ott_lock); + } + + if (ret == 0) + odd->odd_vl2sz = sz; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_vl2_cachea(overlay_dev_t *odd, uint32_t a) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + /* Caller should have validated this */ + ASSERT3U(a, <=, 100); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_dhash, a); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_vl2a = a; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_route_cachesz(overlay_dev_t *odd, uint32_t sz) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + if (sz == 0) + sz = OVERLAY_ROUTE_CACHE_DEF; + + ASSERT3U(sz, >=, OVERLAY_ROUTE_CACHE_MIN); + ASSERT3U(sz, <=, OVERLAY_ROUTE_CACHE_MAX); + + 
mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_size(ott->ott_u.ott_dyn.ott_l3dhash, sz); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_routesz = sz; + mutex_exit(&odd->odd_lock); + + return (ret); +} + +static int +overlay_setprop_route_cachea(overlay_dev_t *odd, uint32_t a) +{ + overlay_target_t *ott = NULL; + int ret = 0; + + /* Caller should have validated this */ + ASSERT3U(a, <=, 100); + + mutex_enter(&odd->odd_lock); + ott = odd->odd_target; + + /* ott_mode is RO if the target exists */ + if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + mutex_enter(&ott->ott_lock); + ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_l3dhash, a); + mutex_exit(&ott->ott_lock); + } + if (ret == 0) + odd->odd_routea = a; + mutex_exit(&odd->odd_lock); + + return (ret); +} + /* ARGSUSED */ static int overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, @@ -1855,7 +2180,7 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, overlay_ioc_prop_t *oip = karg; uint_t propid = UINT_MAX; mac_perim_handle_t mph; - uint64_t maxid, *vidp; + uint64_t maxid, *vidp, *dcidp, *vl2szp, *vl2ap, *routeszp, *routeap; if (oip->oip_size > OVERLAY_PROP_SIZEMAX) return (EINVAL); @@ -1865,31 +2190,48 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, return (ENOENT); oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; - mac_perim_enter_by_mh(odd->odd_mh, &mph); - mutex_enter(&odd->odd_lock); - if (odd->odd_flags & OVERLAY_F_ACTIVATED) { - mac_perim_exit(mph); - mutex_exit(&odd->odd_lock); - return (ENOTSUP); - } - mutex_exit(&odd->odd_lock); + + /* + * Currently, only certain overlay properties (and no encapsulation + * properties) can be changed while the overlay device is active. 
+ */ if (oip->oip_id == -1) { int i; for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) break; - if (i == OVERLAY_DEV_NPROPS) { - ret = odd->odd_plugin->ovp_ops->ovpo_setprop( - odd->odd_pvoid, oip->oip_name, - oip->oip_value, oip->oip_size); - overlay_hold_rele(odd); - mac_perim_exit(mph); - return (ret); - } } - propid = i; + if (i < OVERLAY_DEV_NPROPS) + propid = i; + } else if (oip->oip_id < OVERLAY_DEV_NPROPS) { + propid = oip->oip_id; + } + + /* + * A bit tricky, but propid is initialized to UINT_MAX, so we know we + * have an overlay property whenever propid < OVERLAY_DEV_NPROPS, + * otherwise we have a plugin property. + */ + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_ACTIVATED) && + ((propid >= OVERLAY_DEV_NPROPS || !overlay_dev_liveprop[propid]))) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_exit(&odd->odd_lock); + + if (oip->oip_id == -1 && propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_setprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, oip->oip_size); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; @@ -1941,6 +2283,68 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, case OVERLAY_DEV_P_VARPDID: ret = EPERM; break; + case OVERLAY_DEV_P_DCID: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + dcidp = (uint64_t *)oip->oip_value; + if (*dcidp > UINT32_MAX) { + ret = EINVAL; + break; + } + overlay_setprop_dcid(odd, *dcidp); + break; + case OVERLAY_DEV_P_VL2_CACHE_SIZE: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vl2szp = (uint64_t *)oip->oip_value; + if (*vl2szp != 0 && (*vl2szp < OVERLAY_VL2_CACHE_MIN || + *vl2szp > OVERLAY_VL2_CACHE_MAX)) { + ret = EINVAL; + break; +
} + ret = overlay_setprop_vl2_cachesz(odd, *vl2szp); + break; + case OVERLAY_DEV_P_VL2_CACHE_A: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vl2ap = (uint64_t *)oip->oip_value; + if (*vl2ap > OVERLAY_CACHE_MAX_A) { + ret = EINVAL; + break; + } + ret = overlay_setprop_vl2_cachea(odd, *vl2ap); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_SIZE: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + routeszp = (uint64_t *)oip->oip_value; + if (*routeszp != 0 && (*routeszp < OVERLAY_ROUTE_CACHE_MIN || + *routeszp > OVERLAY_ROUTE_CACHE_MAX)) { + ret = EINVAL; + break; + } + ret = overlay_setprop_route_cachesz(odd, *routeszp); + break; + case OVERLAY_DEV_P_ROUTE_CACHE_A: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + routeap = (uint64_t *)oip->oip_value; + if (*routeap > OVERLAY_CACHE_MAX_A) { + ret = EINVAL; + break; + } + ret = overlay_setprop_route_cachea(odd, *routeap); + break; default: ret = ENOENT; } diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c index 0c21bb8689..2688d2791c 100644 --- a/usr/src/uts/common/io/overlay/overlay_mux.c +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -127,6 +127,18 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, freeb(fmp); /* + * In cases of looped-back vxlan, that tends to have a + * prepended IP+UDP-only mblk, followed by the data. Parsing + * would've made that mblk a zero-length one (rptr == wptr). + */ + if (mp->b_rptr == mp->b_wptr && mp->b_cont != NULL) { + /* Ended up with zero-length mblk, lose it! */ + fmp = mp; + mp = fmp->b_cont; + freeb(fmp); + } + + /* * Until we have VXLAN-or-other-decap HW acceleration support * (e.g.
we support NICs that reach into VXLAN-encapsulated * packets and check the inside-VXLAN IP packets' checksums, @@ -161,10 +173,9 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, if (rem == blkl) { fmp = mp; mp = fmp->b_cont; - fmp->b_cont = NULL; OVERLAY_FREEMSG(mp, "freed a fmp block"); - freemsg(fmp); + freeb(fmp); } } if (mp == NULL) { diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c index f4147b56d1..171acd034f 100644 --- a/usr/src/uts/common/io/overlay/overlay_target.c +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -20,6 +20,8 @@ * uts/common/io/overlay/overlay.c */ +#include <inet/ip.h> +#include <inet/ip6.h> #include <sys/types.h> #include <sys/ethernet.h> #include <sys/kmem.h> @@ -42,6 +44,9 @@ #include <sys/overlay_impl.h> #include <sys/sdt.h> +#define OVERLAY_DROP(mp, reason) \ + DTRACE_PROBE2(overlay__drop, mblk_t *, mp, char *, reason) + /* * This is total straw man, but at least it's a prime number. Here we're * going to have to go through and do a lot of evaluation and understanding as @@ -52,6 +57,19 @@ #define OVERLAY_HSIZE 823 /* + * The default size of each target cache. This is also a complete strawman + * whose value could change as we gain better operational experience with + * overlay routing. + */ +#define OVERLAY_CACHE_SIZE 512 + +/* + * A somewhat arbitrary value. The percentage of the target cache dedicated + * to MFU entries (i.e. entries that have been looked up more than once). + */ +#define OVERLAY_CACHE_A 60 + +/* * We use this data structure to keep track of what requests have been actively * allocated to a given instance so we know what to put back on the pending * list. 
@@ -69,7 +87,7 @@ typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int); typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *); typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int); -typedef struct overaly_target_ioctl { +typedef struct overlay_target_ioctl { int oti_cmd; /* ioctl id */ boolean_t oti_write; /* ioctl requires FWRITE */ boolean_t oti_ncopyout; /* copyout data? */ @@ -144,25 +162,60 @@ overlay_entry_cache_destructor(void *buf, void *arg) static uint64_t overlay_mac_hash(const void *v) { + const overlay_target_mac_t *m = v; + uint32_t crc; - CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + CRC32(crc, m->otm_mac, ETHERADDRL, -1U, crc32_table); + CRC32(crc, &m->otm_dcid, sizeof (uint32_t), crc, crc32_table); return (crc); } static int overlay_mac_cmp(const void *a, const void *b) { - return (bcmp(a, b, ETHERADDRL)); + const overlay_target_mac_t *l = a; + const overlay_target_mac_t *r = b; + + if (l->otm_dcid != r->otm_dcid) + return (1); + return (bcmp(l->otm_mac, r->otm_mac, ETHERADDRL) != 0); +} + +static uint64_t +overlay_ip_hash(const void *v) +{ + const overlay_target_vl3_t *vl3 = v; + + uint32_t crc; + CRC32(crc, &vl3->otvl3_src, sizeof (vl3->otvl3_src), -1U, crc32_table); + CRC32(crc, &vl3->otvl3_dst, sizeof (vl3->otvl3_dst), crc, crc32_table); + CRC32(crc, &vl3->otvl3_src_vlan, sizeof (vl3->otvl3_src_vlan), crc, + crc32_table); + return (crc); +} + +static int +overlay_ip_cmp(const void *a, const void *b) +{ + const overlay_target_vl3_t *l = a; + const overlay_target_vl3_t *r = b; + + if (l->otvl3_src_vlan != r->otvl3_src_vlan) + return (1); + if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_src, &r->otvl3_src)) + return (1); + if (!IN6_ARE_ADDR_EQUAL(&l->otvl3_dst, &r->otvl3_dst)) + return (1); + return (0); } -/* ARGSUSED */ static void overlay_target_entry_dtor(void *arg) { overlay_target_entry_t *ote = arg; ote->ote_flags = 0; - bzero(ote->ote_addr, ETHERADDRL); + bzero(&ote->ote_u, sizeof (ote->ote_u)); 
ote->ote_ott = NULL; ote->ote_odd = NULL; freemsgchain(ote->ote_chead); @@ -172,21 +225,76 @@ overlay_target_entry_dtor(void *arg) kmem_cache_free(overlay_entry_cache, ote); } +static void +overlay_target_entry_l2qq_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + overlay_target_t *ott = ote->ote_ott; + + ASSERT(MUTEX_HELD(&ott->ott_lock)); + ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==, 0); + + avl_remove(&ott->ott_u.ott_dyn.ott_tree, ote); + overlay_target_entry_dtor(ote); +} + +static void +overlay_target_entry_l3qq_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + overlay_target_t *ott = ote->ote_ott; + + ASSERT(MUTEX_HELD(&ott->ott_lock)); + ASSERT3S((ote->ote_flags & OVERLAY_ENTRY_F_VL3), ==, + OVERLAY_ENTRY_F_VL3); + + avl_remove(&ott->ott_u.ott_dyn.ott_l3tree, ote); + overlay_target_entry_dtor(ote); +} + static int overlay_mac_avl(const void *a, const void *b) { + const overlay_target_entry_t *le = a; + const overlay_target_entry_t *re = b; + const overlay_target_mac_t *lm = &le->ote_u.ote_vl2.otvl2_mac; + const overlay_target_mac_t *rm = &re->ote_u.ote_vl2.otvl2_mac; int i; - const overlay_target_entry_t *l, *r; - l = a; - r = b; + + /* Order by DCID, then MAC */ + if (lm->otm_dcid < rm->otm_dcid) + return (-1); + if (lm->otm_dcid > rm->otm_dcid) + return (1); for (i = 0; i < ETHERADDRL; i++) { - if (l->ote_addr[i] > r->ote_addr[i]) + if (lm->otm_mac[i] > rm->otm_mac[i]) return (1); - else if (l->ote_addr[i] < r->ote_addr[i]) + else if (lm->otm_mac[i] < rm->otm_mac[i]) return (-1); } + return (0); +} +static int +overlay_ip_avl(const void *a, const void *b) +{ + const overlay_target_entry_t *l = a; + const overlay_target_entry_t *r = b; + const overlay_target_vl3_t *l_vl3 = &l->ote_u.ote_vl3; + const overlay_target_vl3_t *r_vl3 = &r->ote_u.ote_vl3; + int ret; + + if ((ret = memcmp(&l_vl3->otvl3_src, &r_vl3->otvl3_src, + sizeof (l_vl3->otvl3_src))) != 0) + return (ret < 0 ? 
-1 : 1); + if ((ret = memcmp(&l_vl3->otvl3_dst, &r_vl3->otvl3_dst, + sizeof (l_vl3->otvl3_dst))) != 0) + return (ret < 0 ? -1 : 1); + if (l_vl3->otvl3_src_vlan < r_vl3->otvl3_src_vlan) + return (-1); + if (l_vl3->otvl3_src_vlan > r_vl3->otvl3_src_vlan) + return (1); return (0); } @@ -233,25 +341,20 @@ overlay_target_free(overlay_dev_t *odd) return; if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { - refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; - avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; - overlay_target_entry_t *ote; - + mutex_enter(&odd->odd_target->ott_lock); /* - * Our AVL tree and hashtable contain the same elements, - * therefore we should just remove it from the tree, but then - * delete the entries when we remove them from the hash table - * (which happens through the refhash dtor). + * Our VL3 AVL tree and hashtable contain the same elements. + * Additionally, when an entry is removed from the 2Q cache, + * the entry is removed from the corresponding AVL tree. + * Deleting the 2Q cache will destroy any remaining entries, + * so all we need to do is destroy the 2Q caches. */ - while ((ote = avl_first(ap)) != NULL) - avl_remove(ap, ote); - - avl_destroy(ap); - for (ote = refhash_first(rp); ote != NULL; - ote = refhash_next(rp, ote)) { - refhash_remove(rp, ote); - } - refhash_destroy(rp); + qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_dhash); + qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_l3dhash); + ASSERT(avl_is_empty(&odd->odd_target->ott_u.ott_dyn.ott_tree)); + ASSERT(avl_is_empty( + &odd->odd_target->ott_u.ott_dyn.ott_l3tree)); + mutex_exit(&odd->odd_target->ott_lock); } ASSERT(odd->odd_target->ott_ocount == 0); @@ -270,18 +373,42 @@ overlay_target_busy() return (ret); } +/* + * Queue the target entry on the list of varpd requests. entry should be + * refheld for the duration of this call (this call takes its own additional + * hold that is released when we receive a response). 
+ */ static void overlay_target_queue(overlay_target_entry_t *entry) { + overlay_target_t *ott = entry->ote_ott; + boolean_t is_vl3 = B_FALSE; + + /* + * ote_ott is read-only and set at entry creation, so it can be + * read without ote_lock held + */ + ASSERT(!MUTEX_HELD(&entry->ote_lock)); + + mutex_enter(&entry->ote_lock); + if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0) + is_vl3 = B_TRUE; + mutex_exit(&entry->ote_lock); + mutex_enter(&overlay_target_lock); - mutex_enter(&entry->ote_ott->ott_lock); - if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) { - mutex_exit(&entry->ote_ott->ott_lock); + mutex_enter(&ott->ott_lock); + if (ott->ott_flags & OVERLAY_T_TEARDOWN) { + mutex_exit(&ott->ott_lock); mutex_exit(&overlay_target_lock); return; } - entry->ote_ott->ott_ocount++; - mutex_exit(&entry->ote_ott->ott_lock); + ott->ott_ocount++; + if (is_vl3) + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + else + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + + mutex_exit(&ott->ott_lock); list_insert_tail(&overlay_target_list, entry); cv_signal(&overlay_target_condvar); mutex_exit(&overlay_target_lock); @@ -300,22 +427,446 @@ overlay_target_quiesce(overlay_target_t *ott) } /* - * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP | + * Write the VL3 src/dst IP from the packet in mp into src and dst. If the + * addresses are IPv4 addresses, they are written as mapped addresses. 
+ */ +static int +overlay_get_vl3_ips(mblk_t *mp, struct in6_addr *src, struct in6_addr *dst) +{ + uint16_t sap; + +#if 1 + /* Temporary until mblk helpers are integrated */ + struct ether_vlan_header *eth = (struct ether_vlan_header *)mp->b_rptr; + ipha_t *iphp = (ipha_t *)(eth + 1); + ip6_t *ip6hp = (ip6_t *)(eth + 1); + size_t mlen = MBLKL(mp); + + if (mlen < sizeof (struct ether_vlan_header)) + return (EINVAL); + mlen -= sizeof (struct ether_vlan_header); + + /* We currently don't support routing on untagged vlans */ + if ((sap = ntohs(eth->ether_tpid)) != ETHERTYPE_VLAN) + return (EINVAL); + + sap = ntohs(eth->ether_type); + if (mlen == 0) { + if ((mp = mp->b_cont) == NULL) + return (EINVAL); + mlen = MBLKL(mp); + iphp = (ipha_t *)mp->b_rptr; + ip6hp = (ip6_t *)mp->b_rptr; + } + + switch (sap) { + case ETHERTYPE_IP: + if (mlen < sizeof (ipha_t)) + return (EINVAL); + ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV4_VERSION); + IN6_IPADDR_TO_V4MAPPED(iphp->ipha_src, src); + IN6_IPADDR_TO_V4MAPPED(iphp->ipha_dst, dst); + break; + case ETHERTYPE_IPV6: + if (mlen < sizeof (ip6_t)) + return (EINVAL); + ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV6_VERSION); + bcopy(&ip6hp->ip6_src, src, sizeof (*src)); + bcopy(&ip6hp->ip6_dst, dst, sizeof (*dst)); + break; + default: + return (EINVAL); + } + + return (0); +#else + size_t soff, doff; + uint32_t v4s, v4d; + int i; + + if (!mblk_read_uint16(mp, offsetof(struct ether_header, ether_type), + &sap)) + return (EINVAL); + + if (sap == ETHERTYPE_VLAN) { + if (!mblk_read_uint16(mp, + offsetof(struct ether_vlan_header, ether_type), &sap)) + return (EINVAL); + soff = doff = sizeof (struct ether_vlan_header); + } else { + soff = doff = sizeof (struct ether_header); + } + + switch (sap) { + case ETHERTYPE_IP: + soff += offsetof(ipha_t, ipha_src); + doff += offsetof(ipha_t, ipha_dst); + + if (!mblk_read_uint32(mp, soff, &v4s) || + !mblk_read_uint32(mp, doff, &v4d)) + return (EINVAL); + IN6_IPADDR_TO_V4MAPPED(&v4s, src); + 
IN6_IPADDR_TO_V4MAPPED(&v4d, dst); + break; + case ETHERTYPE_IPV6: + soff += offsetof(ip6_t, ip6_src); + doff += offsetof(ip6_t, ip6_dst); + + for (i = 0; i < 4; i++) { + if (!mblk_read_uint32(mp, soff, &src->s6_addr32[i]) || + !mblk_read_uint32(mp, doff, &dst->s6_addr32[i])) + return (EINVAL); + soff += sizeof (uint32_t); + doff += sizeof (uint32_t); + } + break; + default: + return (EINVAL); + } + + return (0); +#endif +} + +static int +overlay_route(overlay_dev_t *odd, mblk_t *mp, + const overlay_target_route_t *route, const overlay_target_mac_t *dst_mac) +{ + uint16_t tci; + + if (MBLKL(mp) >= sizeof (struct ether_vlan_header)) { + struct ether_vlan_header *evh; + + evh = (struct ether_vlan_header *)mp->b_rptr; + tci = ntohs(evh->ether_tci); + + /* + * Today we require all encapsulated frames to be vlan tagged. + * If this is relaxed in the future, we will need to allow for + * insertion and removal of the vlan tag as appropriate here. + */ + if (ntohs(evh->ether_tpid) != ETHERTYPE_VLAN) { + OVERLAY_DROP(mp, "not vlan tagged"); + return (OVERLAY_TARGET_DROP); + } + + tci &= ~(VLAN_ID_MASK); + tci |= route->otr_vlan; + evh->ether_tci = htons(tci); + bcopy(dst_mac->otm_mac, &evh->ether_dhost, ETHERADDRL); + bcopy(route->otr_srcmac, &evh->ether_shost, ETHERADDRL); + return (OVERLAY_TARGET_OK); + } + +#if 1 + /* Temporary until mblk helpers are integrated */ + OVERLAY_DROP(mp, "ethernet header split between mblks"); + return (OVERLAY_TARGET_DROP); +#else + size_t off; + + off = offsetof(struct ether_vlan_header, ether_tpid); + if (!mblk_read_uint16(mp, off, &tci)) { + OVERLAY_DROP(mp, "cannot read tpid"); + return (OVERLAY_TARGET_DROP); + } + + tci = ntohs(evh->ether_tci); + tci &= ~(VLAN_ID_MASK); + tci |= route->otr_vlan; + + if (!mblk_write_uint16(mp, off, tci)) { + OVERLAY_DROP(mp, "cannot set routed destination vlan"); + return (OVERLAY_TARGET_DROP); + } + + for (int i = 0; i < ETHERADDRL; i++) { + if (!mblk_write_uint8(mp, i, dst_mac->otm_mac[i]) || +
!mblk_write_uint8(mp, i + ETHERADDRL, + route->otr_srcmac[i])) { + OVERLAY_DROP(mp, "cannot set routed macs"); + return (OVERLAY_TARGET_DROP); + } + } + + return (OVERLAY_TARGET_OK); +#endif +} + +/* + * Attempt to add mp to the packet queue of target entry. If the queue is + * already full, it returns OVERLAY_TARGET_DROP, otherwise OVERLAY_TARGET_ASYNC + * is returned. If the entry isn't already pending a response from varpd, + * queue the target entry on the list of outstanding varpd requests. + * + * Entry should already be locked, however since it is intended that this + * should be the final step in dealing with this entry (for handling the + * packet in question), it always releases ote_lock before returning. + * entry should be refheld for the duration of this call. + */ +static int +overlay_target_try_queue(overlay_target_entry_t *entry, mblk_t *mp) +{ + size_t mlen = msgsize(mp); + boolean_t queue = B_FALSE; + + ASSERT(MUTEX_HELD(&entry->ote_lock)); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + OVERLAY_DROP(mp, "target queue full"); + mutex_exit(&entry->ote_lock); + return (OVERLAY_TARGET_DROP); + } + + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + queue = B_TRUE; + } + mutex_exit(&entry->ote_lock); + + if (queue) + overlay_target_queue(entry); + + return (OVERLAY_TARGET_ASYNC); +} + +/* + * Given the VL3 IP->VL2 mac entry (vl3e), and the corresponding VL2 MAC->UL3 + * entry (vl2e), if both entries are valid, sets *vidp, *v6, *slenp to the + * correct UL3 destination and return OVERLAY_TARGET_OK. 
If either of the + * entries are still pending lookups (or vl2e is NULL because the entry is + * missing), mp is queued (if there is space) on the appropriate entry and + * OVERLAY_TARGET_ASYNC is returned. If the VL2 entry is flagged to drop all + * packets, OVERLAY_TARGET_DROP is returned. + * + * In all cases, the caller should acquire vl3e->ote_lock prior to calling + * overlay_route_lookup_vl2(). Because vl2e can be missing (NULL), the caller + * should not acquire vl2e->ote_lock prior to calling + * overlay_route_lookup_vl2(). vl3e->ote_lock is always dropped prior to + * returning. + */ +static int +overlay_route_lookup_vl2(overlay_target_entry_t *vl3e, + overlay_target_entry_t *vl2e, uint64_t *vidp, struct sockaddr_in6 *v6, + socklen_t *slenp, mblk_t *mp) +{ + overlay_target_vl2_t *vl2p; + int ret; + + ASSERT(MUTEX_HELD(&vl3e->ote_lock)); + + if (vl2e == NULL) { + vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID; + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + mutex_enter(&vl2e->ote_lock); + if (vl2e->ote_flags & (OVERLAY_ENTRY_F_DROP | OVERLAY_ENTRY_F_ROUTER)) { + overlay_target_entry_flags_t flags = vl2e->ote_flags; + + mutex_exit(&vl2e->ote_lock); + mutex_exit(&vl3e->ote_lock); + + if (flags & OVERLAY_ENTRY_F_DROP) { + OVERLAY_DROP(mp, "VL2 target marked drop"); + } else { + OVERLAY_DROP(mp, "VL2 target is overlay router"); + } + + return (OVERLAY_TARGET_DROP); + } + + /* + * If the route is missing queue on the VL3 entry so a VL3->UL3 + * lookup is done (to get the route data). + */ + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_HAS_ROUTE) == 0) { + mutex_exit(&vl2e->ote_lock); + + vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID; + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + /* + * If the VL2 target point is missing, we try to be a bit (though + * hopefully not too) clever.
We can always queue on the VL3 entry + * which will trigger a VL3->UL3 lookup request (as it is effectively + * a superset of the VL2->UL3 lookup). However, if we know we already + * have an outstanding VL3->UL3 request, we queue on the VL2 entry and + * avoid doing another redundant lookup. We can also queue on the VL2 + * entry when it is a local (same vnet, same DC) destination -- we + * currently cannot generate VL2->UL3 lookups for remote destinations, + * only same vnet, same DC. Queueing on the VL2 entry also allows + * instances on the same vlan as the queued VL2 entry to piggy back on + * the lookup request and avoid a redundant lookup. However if the + * VL2 entry is remote, we have to do a VL3->UL3 lookup. + */ + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) { + if ((vl2e->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0 && + vl2e->ote_u.ote_vl2.otvl2_mac.otm_dcid != + vl2e->ote_odd->odd_dcid) { + mutex_exit(&vl2e->ote_lock); + /* This drops vl3e->ote_lock */ + return (overlay_target_try_queue(vl3e, mp)); + } + + mutex_exit(&vl3e->ote_lock); + /* This drops vl2e->ote_lock */ + return (overlay_target_try_queue(vl2e, mp)); + } + + ASSERT(vl2e->ote_flags & OVERLAY_ENTRY_F_VALID); + + vl2p = &vl2e->ote_u.ote_vl2; + + *vidp = vl2p->otvl2_route.otr_vnet; + bcopy(&vl2p->otvl2_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(vl2p->otvl2_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + + ret = overlay_route(vl2e->ote_odd, mp, &vl2p->otvl2_route, + &vl2p->otvl2_mac); + mutex_exit(&vl2e->ote_lock); + mutex_exit(&vl3e->ote_lock); + return (ret); +} + +static int +overlay_route_lookup(overlay_dev_t *odd, mblk_t *mp, uint16_t vlan, + struct sockaddr *sock, socklen_t *slenp, uint64_t *vidp) +{ + overlay_target_t *ott = odd->odd_target; + overlay_target_entry_t *entry, *vl2_entry = NULL; + struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)sock; + overlay_target_vl3_t vl3 = { 0 }; + int ret = OVERLAY_TARGET_DROP; + + /* 
overlay_target_lookup() should have set this */ + ASSERT3U(v6->sin6_family, ==, AF_INET6); + + /* We should only be called for dynamic endpoints */ + ASSERT3U(ott->ott_mode, ==, OVERLAY_TARGET_DYNAMIC); + + vl3.otvl3_src_vlan = vlan; + if ((ret = overlay_get_vl3_ips(mp, &vl3.otvl3_src, &vl3.otvl3_dst)) + != OVERLAY_TARGET_OK) { + OVERLAY_DROP(mp, "could not read VL3 src/dst IPs"); + return (OVERLAY_TARGET_DROP); + } + + mutex_enter(&ott->ott_lock); + entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_l3dhash, &vl3); + if (entry == NULL) { + if ((entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { + mutex_exit(&ott->ott_lock); + OVERLAY_DROP(mp, "failed VL3 target entry allocation"); + return (OVERLAY_TARGET_DROP); + } + + bcopy(&vl3, &entry->ote_u.ote_vl3, sizeof (vl3)); + entry->ote_flags = OVERLAY_ENTRY_F_VL3; + + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + + entry->ote_ott = ott; + entry->ote_odd = odd; + + qqcache_insert(ott->ott_u.ott_dyn.ott_l3dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_l3tree, entry); + mutex_exit(&ott->ott_lock); + + overlay_target_queue(entry); + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_ASYNC); + } + qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry); + mutex_enter(&entry->ote_lock); + + /* + * A bit ugly, but if we need the VL2 entry, we want to look it up + * while we still hold ott_lock. 
+ */ + if ((entry->ote_flags & + (OVERLAY_ENTRY_F_DROP|OVERLAY_ENTRY_F_ROUTER| + OVERLAY_ENTRY_F_VALID)) == OVERLAY_ENTRY_F_VALID) { + vl2_entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &entry->ote_u.ote_vl3.otvl3_vl2); + if (vl2_entry != NULL) + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, vl2_entry); + } + mutex_exit(&ott->ott_lock); + + ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3); + + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL3 target entry marked drop"); + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + /* + * XXX: A packet with a dst IP of an overlay router. + * Maybe generate an ICMP reply? For now, we drop. + */ + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL3 target entry is router"); + ret = OVERLAY_TARGET_DROP; + } else if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) { + /* This drops entry->ote_lock */ + ret = overlay_target_try_queue(entry, mp); + } else { + /* This drops entry->ote_lock */ + ret = overlay_route_lookup_vl2(entry, vl2_entry, vidp, v6, + slenp, mp); + } + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + if (vl2_entry != NULL) + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, vl2_entry); + mutex_exit(&ott->ott_lock); + return (ret); +} + +/* + * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP | * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at - * this time, say for NVGRE, we drop all packets that mcuh this. + * this time, say for NVGRE, we drop all packets that match this. 
*/ int overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, - socklen_t *slenp) + socklen_t *slenp, uint64_t *vidp) { int ret; struct sockaddr_in6 *v6; overlay_target_t *ott; - mac_header_info_t mhi; overlay_target_entry_t *entry; + mac_header_info_t mhi; + overlay_target_mac_t omac; ASSERT(odd->odd_target != NULL); + /* Default to our local vid, routing may change this if necessary */ + *vidp = odd->odd_vid; + /* * At this point, the overlay device is in a mux which means that it's * been activated. At this point, parts of the target, such as the mode @@ -323,8 +874,10 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, * about synchronization for them. */ ott = odd->odd_target; - if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) { + OVERLAY_DROP(mp, "plugin doesn't support IP or port"); return (OVERLAY_TARGET_DROP); + } v6 = (struct sockaddr_in6 *)sock; bzero(v6, sizeof (struct sockaddr_in6)); @@ -343,76 +896,89 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC); - /* - * Note we only want the MAC address here, therefore we won't bother - * using mac_vlan_header_info(). If any caller needs the vlan info at - * this point, this should change to a call to mac_vlan_header_info(). 
- */ - if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0) { + OVERLAY_DROP(mp, "could not read vlan header"); return (OVERLAY_TARGET_DROP); + } + + omac.otm_dcid = odd->odd_dcid; + bcopy(mhi.mhi_daddr, omac.otm_mac, ETHERADDRL); + mutex_enter(&ott->ott_lock); - entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - mhi.mhi_daddr); + entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, &omac); if (entry == NULL) { + overlay_target_vl2_t *vl2p; + entry = kmem_cache_alloc(overlay_entry_cache, KM_NOSLEEP | KM_NORMALPRI); if (entry == NULL) { mutex_exit(&ott->ott_lock); + OVERLAY_DROP(mp, "VL2 target entry allocation failed"); return (OVERLAY_TARGET_DROP); } - bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + + vl2p = &entry->ote_u.ote_vl2; + bcopy(mhi.mhi_daddr, vl2p->otvl2_mac.otm_mac, ETHERADDRL); + vl2p->otvl2_mac.otm_dcid = odd->odd_dcid; + vl2p->otvl2_route.otr_vnet = odd->odd_vid; + vl2p->otvl2_route.otr_vlan = VLAN_ID(mhi.mhi_tci); + entry->ote_chead = entry->ote_ctail = mp; entry->ote_mbsize = msgsize(mp); entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; entry->ote_odd = odd; - refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + + qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + + mutex_enter(&ott->ott_lock); + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); return (OVERLAY_TARGET_ASYNC); } - refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry); mutex_exit(&ott->ott_lock); mutex_enter(&entry->ote_lock); if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + mutex_exit(&entry->ote_lock); + OVERLAY_DROP(mp, "VL2 target marked drop"); ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + if (mhi.mhi_bindsap == 
ETHERTYPE_ARP) { + /* + * Send unicast ARP requests to varpd for processing. + * We will eventually need something similar for IPv6. + * This drops entry->ote_lock. + */ + ret = overlay_target_try_queue(entry, mp); + } else { + mutex_exit(&entry->ote_lock); + ret = overlay_route_lookup(odd, mp, + VLAN_ID(mhi.mhi_tci), sock, slenp, vidp); + } } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { - bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, - sizeof (struct in6_addr)); - v6->sin6_port = htons(entry->ote_dest.otp_port); + overlay_target_point_t *otp = &entry->ote_u.ote_vl2.otvl2_dest; + + bcopy(&otp->otp_ip, &v6->sin6_addr, sizeof (struct in6_addr)); + v6->sin6_port = htons(otp->otp_port); + mutex_exit(&entry->ote_lock); + *slenp = sizeof (struct sockaddr_in6); ret = OVERLAY_TARGET_OK; } else { - size_t mlen = msgsize(mp); - - if (mlen + entry->ote_mbsize > overlay_ent_size) { - ret = OVERLAY_TARGET_DROP; - } else { - if (entry->ote_ctail != NULL) { - ASSERT(entry->ote_ctail->b_next == - NULL); - entry->ote_ctail->b_next = mp; - entry->ote_ctail = mp; - } else { - entry->ote_chead = mp; - entry->ote_ctail = mp; - } - entry->ote_mbsize += mlen; - if ((entry->ote_flags & - OVERLAY_ENTRY_F_PENDING) == 0) { - entry->ote_flags |= - OVERLAY_ENTRY_F_PENDING; - overlay_target_queue(entry); - } - ret = OVERLAY_TARGET_ASYNC; - } + /* This drops entry->ote_lock */ + ret = overlay_target_try_queue(entry, mp); } - mutex_exit(&entry->ote_lock); mutex_enter(&ott->ott_lock); - refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); mutex_exit(&ott->ott_lock); return (ret); @@ -437,6 +1003,7 @@ overlay_target_info(overlay_target_hdl_t *thdl, void *arg) if (odd->odd_flags & OVERLAY_F_ACTIVATED) oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; oti->oti_vnetid = odd->odd_vid; + oti->oti_dcid = odd->odd_dcid; mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); return (0); @@ -488,6 +1055,13 @@ overlay_target_associate(overlay_target_hdl_t 
*thdl, void *arg) } } + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EEXIST); + } + ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP); ott->ott_flags = 0; ott->ott_ocount = 0; @@ -499,21 +1073,44 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) bcopy(&ota->ota_point, &ott->ott_u.ott_point, sizeof (overlay_target_point_t)); } else { - ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE, + int ret; + + ret = qqcache_create(&ott->ott_u.ott_dyn.ott_dhash, + odd->odd_vl2sz, odd->odd_vl2a, OVERLAY_HSIZE, overlay_mac_hash, overlay_mac_cmp, - overlay_target_entry_dtor, sizeof (overlay_target_entry_t), + overlay_target_entry_l2qq_dtor, + sizeof (overlay_target_entry_t), offsetof(overlay_target_entry_t, ote_reflink), - offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP); + offsetof(overlay_target_entry_t, ote_u.ote_vl2.otvl2_mac), + KM_SLEEP); + if (ret != 0) { + mutex_exit(&odd->odd_lock); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (ret); + } + + ret = qqcache_create(&ott->ott_u.ott_dyn.ott_l3dhash, + odd->odd_routesz, odd->odd_routea, OVERLAY_HSIZE, + overlay_ip_hash, overlay_ip_cmp, + overlay_target_entry_l3qq_dtor, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_reflink), + offsetof(overlay_target_entry_t, ote_u.ote_vl3), KM_SLEEP); + if (ret != 0) { + mutex_exit(&odd->odd_lock); + qqcache_destroy(ott->ott_u.ott_dyn.ott_l3dhash); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (ret); + } + avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl, sizeof (overlay_target_entry_t), offsetof(overlay_target_entry_t, ote_avllink)); - } - mutex_enter(&odd->odd_lock); - if (odd->odd_flags & OVERLAY_F_VARPD) { - mutex_exit(&odd->odd_lock); - kmem_cache_free(overlay_target_cache, ott); - overlay_hold_rele(odd); - return (EEXIST); + 
avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_ip_avl, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_avllink)); } odd->odd_flags |= OVERLAY_F_VARPD; @@ -521,8 +1118,6 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); - - return (0); } @@ -601,7 +1196,16 @@ again: entry = list_remove_head(&overlay_target_list); mutex_exit(&overlay_target_lock); mutex_enter(&entry->ote_lock); - if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + /* + * Router entries may send lookups to varpd even when valid. For + * example, illumos systems will send unicast ARP queries to cached + * entries (including the router mac address). To answer those, we + * need to forward on the query to varpd. IPv6 will eventually + * need something similar for ND requests. + */ + if ((entry->ote_flags & + (OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_ROUTER)) == + OVERLAY_ENTRY_F_VALID) { ASSERT(entry->ote_chead == NULL); mutex_exit(&entry->ote_lock); goto again; @@ -637,10 +1241,23 @@ again: otl->otl_hdrsize = mhi.mhi_hdrsize; otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; - bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); - bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); - otl->otl_dsttype = mhi.mhi_dsttype; - otl->otl_sap = mhi.mhi_bindsap; + if (entry->ote_flags & OVERLAY_ENTRY_F_VL3) { + overlay_targ_l3_t *l3p = &otl->otl_addru.otlu_l3; + + otl->otl_l3req = B_TRUE; + bcopy(&entry->ote_u.ote_vl3.otvl3_src, &l3p->otl3_srcip, + sizeof (struct in6_addr)); + bcopy(&entry->ote_u.ote_vl3.otvl3_dst, &l3p->otl3_dstip, + sizeof (struct in6_addr)); + } else { + overlay_targ_l2_t *l2p = &otl->otl_addru.otlu_l2; + + otl->otl_l3req = B_FALSE; + bcopy(mhi.mhi_daddr, l2p->otl2_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, l2p->otl2_srcaddr, ETHERADDRL); + l2p->otl2_dsttype = mhi.mhi_dsttype; + l2p->otl2_sap = mhi.mhi_bindsap; + } otl->otl_vlan = VLAN_ID(mhi.mhi_tci); mutex_exit(&entry->ote_lock); @@ 
-651,12 +1268,128 @@ again: return (0); } +static void +overlay_target_lookup_respond_vl3(const overlay_targ_resp_t *otr, + overlay_target_entry_t *entry) +{ + overlay_target_entry_t *shared = NULL; + overlay_target_entry_t *vl2_entry; + overlay_target_t *ott = entry->ote_ott; + qqcache_t *mhash = ott->ott_u.ott_dyn.ott_dhash; + hrtime_t now = gethrtime(); + + ASSERT(MUTEX_HELD(&entry->ote_lock)); + ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3); + + /* + * A cross-{vlan,dc,vnet} packet with a destination VL3 of an overlay + * router IP. For now we drop these. + */ + if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_DROP; + return; + } + + bcopy(&otr->otr_mac, &entry->ote_u.ote_vl3.otvl3_vl2, + sizeof (overlay_target_mac_t)); + + mutex_enter(&ott->ott_lock); + if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) + qqcache_hold(mhash, shared); + mutex_exit(&ott->ott_lock); + + /* + * Once we have the VL2 destination, we need to see if we already + * have an existing VL2 entry we can reuse. If not, we create a + * fully-formed (i.e. valid) VL2 entry that we add to the cache. + */ + if (shared == NULL) { + vl2_entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (vl2_entry == NULL) { + /* + * If we can't allocate a VL2 entry for the VL3 + * destination, we just give up for now and drain + * any queued packets. New packets will retry this + * allocation, so if the memory pressure lets up, we + * should recover. 
+ */ + freemsgchain(entry->ote_chead); + entry->ote_chead = entry->ote_ctail = NULL; + return; + } + + vl2_entry->ote_ott = ott; + vl2_entry->ote_odd = entry->ote_odd; + + bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest, + sizeof (overlay_target_point_t)); + bcopy(&otr->otr_mac, &vl2_entry->ote_u.ote_vl2.otvl2_mac, + sizeof (overlay_target_mac_t)); + bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route, + sizeof (overlay_target_route_t)); + vl2_entry->ote_flags = + OVERLAY_ENTRY_F_HAS_ROUTE | OVERLAY_ENTRY_F_VALID; + vl2_entry->ote_vtime = entry->ote_vtime = now; + + mutex_enter(&ott->ott_lock); + if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) { + overlay_target_entry_dtor(vl2_entry); + kmem_cache_free(overlay_entry_cache, vl2_entry); + qqcache_hold(mhash, shared); + + vl2_entry = shared; + } else { + qqcache_insert(mhash, vl2_entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, vl2_entry); + qqcache_hold(mhash, vl2_entry); + } + mutex_exit(&ott->ott_lock); + } else { + vl2_entry = shared; + } + + mutex_enter(&vl2_entry->ote_lock); + if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_HAS_ROUTE)) == 0) { + bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route, + sizeof (overlay_target_route_t)); + vl2_entry->ote_flags |= OVERLAY_ENTRY_F_HAS_ROUTE; + } + + /* + * Update the VL2 entry if it doesn't have a valid destination, hasn't + * been marked as dropping all packets, and doesn't have an existing + * outstanding request. If a route and VL2 request involving the + * same VL2 destination are pending and the route response is processed + * prior to the VL2 request, we will continue to queue (on the VL2 + * entry) until the VL2 response is received, even though we have + * an answer from the route response. If we set the valid flag + * while there's still outstanding requests, it will cause problems + * with the outstanding requests. 
+ */ + if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_PENDING| + OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_DROP)) == 0) { + bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest, + sizeof (overlay_target_point_t)); + vl2_entry->ote_vtime = gethrtime(); + vl2_entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + } + mutex_exit(&vl2_entry->ote_lock); + + mutex_enter(&ott->ott_lock); + qqcache_rele(mhash, vl2_entry); + mutex_exit(&ott->ott_lock); +} + static int overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) { const overlay_targ_resp_t *otr = arg; + overlay_target_t *ott; overlay_target_entry_t *entry; mblk_t *mp; + boolean_t is_vl3 = B_FALSE; mutex_enter(&thdl->oth_lock); for (entry = list_head(&thdl->oth_outstanding); entry != NULL; @@ -673,38 +1406,88 @@ overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) mutex_exit(&thdl->oth_lock); mutex_enter(&entry->ote_lock); - bcopy(&otr->otr_answer, &entry->ote_dest, - sizeof (overlay_target_point_t)); + ott = entry->ote_ott; + + if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0) + is_vl3 = B_TRUE; + + /* + * If we ever support a protocol that uses MAC addresses as the UL + * destination address, this check should probably include checking + * that otp_mac is also all zeros. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) && + otr->otr_answer.otp_port == 0) + entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER; + + if (!is_vl3) { + bcopy(&otr->otr_answer, &entry->ote_u.ote_vl2.otvl2_dest, + sizeof (overlay_target_point_t)); + entry->ote_vtime = gethrtime(); + } else { + overlay_target_lookup_respond_vl3(otr, entry); + } + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + mp = entry->ote_chead; entry->ote_chead = NULL; entry->ote_ctail = NULL; entry->ote_mbsize = 0; - entry->ote_vtime = gethrtime(); mutex_exit(&entry->ote_lock); /* - * For now do an in-situ drain. + * For now do an in-situ drain. 
For VL3 entries, if we re-use + * an existing VL2 entry, it is possible the VL2 lookup is still + * pending (though should be rare). In such instances, the packets + * queued on the VL3 entry will get queued on the VL2 entry until + * the VL2 entry is resolved. */ mp = overlay_m_tx(entry->ote_odd, mp); freemsgchain(mp); - mutex_enter(&entry->ote_ott->ott_lock); - entry->ote_ott->ott_ocount--; - cv_signal(&entry->ote_ott->ott_cond); - mutex_exit(&entry->ote_ott->ott_lock); + mutex_enter(&ott->ott_lock); + ott->ott_ocount--; + if (is_vl3) + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + else + qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + cv_signal(&ott->ott_cond); + mutex_exit(&ott->ott_lock); return (0); } +static boolean_t +overlay_target_for_varpd(overlay_dev_t *odd, mblk_t *mp) +{ + mac_header_info_t mhi; + + /* We should have dropped runts prior to ever queueing */ + VERIFY0(mac_vlan_header_info(odd->odd_mh, mp, &mhi)); + if (mhi.mhi_bindsap == ETHERTYPE_ARP) + return (B_TRUE); + + /* TODO: NDP packets */ + return (B_FALSE); +} + +typedef enum overlay_target_lookup_drop_act { + OTLDA_NONE, + OTLDA_QUEUE, + OTLDA_DELETE +} overlay_target_lookup_drop_act_t; + static int overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) { const overlay_targ_resp_t *otr = arg; + overlay_target_t *ott; overlay_target_entry_t *entry; mblk_t *mp; - boolean_t queue = B_FALSE; + overlay_target_lookup_drop_act_t action = OTLDA_NONE; + boolean_t is_vl3 = B_FALSE; mutex_enter(&thdl->oth_lock); for (entry = list_head(&thdl->oth_outstanding); entry != NULL; @@ -721,9 +1504,19 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) mutex_exit(&thdl->oth_lock); mutex_enter(&entry->ote_lock); + ott = entry->ote_ott; + if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0) + is_vl3 = B_TRUE; - /* Safeguard against a confused varpd */ - if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + mp = entry->ote_chead; + + /* + * Safeguard against a confused varpd. 
Packets specifically for + * varpd may receive replies (e.g. ARP replies) that require us to + * drop, even when the entry is valid. + */ + if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) && + !overlay_target_for_varpd(entry->ote_odd, mp)) { entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; DTRACE_PROBE1(overlay__target__valid__drop, overlay_target_entry_t *, entry); @@ -731,7 +1524,21 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) goto done; } - mp = entry->ote_chead; + /* + * If varpd is instructing us to drop the head mblk in a VL2 entry, + * this could be because it's already provided a response (e.g. an + * ARP reply), and the entry itself might still be used for other + * purposes. VL3 entries on the other hand have no such uses. If + * we are told to drop the packet, there is no reason to retain + * the VL3 entry and we can delete it. + */ + if (is_vl3) { + action = OTLDA_DELETE; + mutex_exit(&entry->ote_lock); + goto done; + } + + /* Drop the first packet in entry */ if (mp != NULL) { entry->ote_chead = mp->b_next; mp->b_next = NULL; @@ -739,23 +1546,34 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) entry->ote_ctail = entry->ote_chead; entry->ote_mbsize -= msgsize(mp); } + if (entry->ote_chead != NULL) { - queue = B_TRUE; + action = OTLDA_QUEUE; entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; } else { entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; } mutex_exit(&entry->ote_lock); - if (queue == B_TRUE) + if (action == OTLDA_QUEUE) overlay_target_queue(entry); freemsg(mp); done: - mutex_enter(&entry->ote_ott->ott_lock); - entry->ote_ott->ott_ocount--; - cv_signal(&entry->ote_ott->ott_cond); - mutex_exit(&entry->ote_ott->ott_lock); + mutex_enter(&ott->ott_lock); + ott->ott_ocount--; + if (action == OTLDA_DELETE) { + /* overlay_target_entry_dtor() will free the mblk chain */ + qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, entry); + } + + if (is_vl3) + qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry); + else + 
qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + + cv_signal(&ott->ott_cond); + mutex_exit(&ott->ott_lock); return (0); } @@ -1083,31 +1901,35 @@ overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg) sizeof (overlay_target_point_t)); } else { overlay_target_entry_t *ote; - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); - if (ote != NULL) { - mutex_enter(&ote->ote_lock); - if ((ote->ote_flags & - OVERLAY_ENTRY_F_VALID_MASK) != 0) { - if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { - otc->otc_entry.otce_flags = - OVERLAY_TARGET_CACHE_DROP; - } else { - otc->otc_entry.otce_flags = 0; - bcopy(&ote->ote_dest, - &otc->otc_entry.otce_dest, - sizeof (overlay_target_point_t)); - } - ret = 0; + + if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &otc->otc_entry.otce_mac)) == NULL) { + ret = ENOENT; + goto done; + } + + mutex_enter(&ote->ote_lock); + if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) { + if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_DROP; + } else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_ROUTER; } else { - ret = ENOENT; + otc->otc_entry.otce_flags = 0; + bcopy(&ote->ote_u.ote_vl2.otvl2_dest, + &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); } - mutex_exit(&ote->ote_lock); + ret = 0; } else { ret = ENOENT; } + mutex_exit(&ote->ote_lock); } +done: mutex_exit(&ott->ott_lock); overlay_hold_rele(odd); @@ -1120,53 +1942,64 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) { overlay_dev_t *odd; overlay_target_t *ott; - overlay_target_entry_t *ote; + overlay_target_entry_t *ote, *new = NULL; overlay_targ_cache_t *otc = arg; mblk_t *mp = NULL; - if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP) + if (otc->otc_entry.otce_flags & + ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER)) + return (EINVAL); + + if (otc->otc_entry.otce_flags == + 
(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER)) return (EINVAL); odd = overlay_hold_by_dlid(otc->otc_linkid); if (odd == NULL) return (ENOENT); + /* + * Optimistically create the new entry. If not needed, we'll free it. + * We shouldn't be calling this ioctl rapidly enough that any potential + * alloc/free churn should cause a problem. + */ + new = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(&otc->otc_entry.otce_mac, &new->ote_u.ote_vl2.otvl2_mac, + sizeof (overlay_target_mac_t)); + mutex_enter(&odd->odd_lock); if (!(odd->odd_flags & OVERLAY_F_VARPD)) { mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); + overlay_target_entry_dtor(new); return (ENXIO); } ott = odd->odd_target; if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { mutex_exit(&odd->odd_lock); overlay_hold_rele(odd); + overlay_target_entry_dtor(new); return (ENOTSUP); } + + new->ote_ott = ott; + new->ote_odd = odd; + mutex_enter(&ott->ott_lock); mutex_exit(&odd->odd_lock); - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); - if (ote == NULL) { - ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); - bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); - ote->ote_chead = ote->ote_ctail = NULL; - ote->ote_mbsize = 0; - ote->ote_ott = ott; - ote->ote_odd = odd; - mutex_enter(&ote->ote_lock); - refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); - avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); - } else { - mutex_enter(&ote->ote_lock); - } + if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &otc->otc_entry.otce_mac)) == NULL) + ote = new; + mutex_enter(&ote->ote_lock); if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { ote->ote_flags |= OVERLAY_ENTRY_F_DROP; } else { ote->ote_flags |= OVERLAY_ENTRY_F_VALID; - bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER) + ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_u.ote_vl2.otvl2_dest, sizeof 
(overlay_target_point_t)); mp = ote->ote_chead; ote->ote_chead = NULL; @@ -1175,6 +2008,10 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) ote->ote_vtime = gethrtime(); } + if (ote == new) { + qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } mutex_exit(&ote->ote_lock); mutex_exit(&ott->ott_lock); @@ -1185,6 +2022,9 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) overlay_hold_rele(odd); + if (ote != new) + overlay_target_entry_dtor(new); + return (0); } @@ -1217,8 +2057,11 @@ overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) mutex_enter(&ott->ott_lock); mutex_exit(&odd->odd_lock); - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); + if (otc->otc_entry.otce_mac.otm_dcid == 0) + otc->otc_entry.otce_mac.otm_dcid = odd->odd_dcid; + + ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, + &otc->otc_entry.otce_mac); if (ote != NULL) { mutex_enter(&ote->ote_lock); ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; @@ -1269,8 +2112,13 @@ overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; mutex_exit(&ote->ote_lock); } - ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, - otc->otc_entry.otce_mac); + + avl = &ott->ott_u.ott_dyn.ott_l3tree; + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } mutex_exit(&ott->ott_lock); overlay_hold_rele(odd); @@ -1304,9 +2152,10 @@ overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, } typedef struct overlay_targ_cache_marker { - uint8_t otcm_mac[ETHERADDRL]; + overlay_target_mac_t otcm_mac; uint16_t otcm_done; -} overlay_targ_cache_marker_t; +} overlay_targ_cache_marker_t __aligned(8); +CTASSERT(sizeof (overlay_targ_cache_marker_t) == 2 * sizeof (uint64_t)); /* ARGSUSED */ static int @@ -1356,7 
+2205,7 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) if (ott->ott_mode == OVERLAY_TARGET_POINT) { overlay_targ_cache_entry_t *out = &iter->otci_ents[0]; - bzero(out->otce_mac, ETHERADDRL); + bzero(&out->otce_mac, sizeof (out->otce_mac)); out->otce_flags = 0; bcopy(&ott->ott_u.ott_point, &out->otce_dest, sizeof (overlay_target_point_t)); @@ -1365,7 +2214,9 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) } avl = &ott->ott_u.ott_dyn.ott_tree; - bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL); + lookup.ote_u.ote_vl2.otvl2_mac.otm_dcid = odd->odd_dcid; + bcopy(&mark->otcm_mac, &lookup.ote_u.ote_vl2.otvl2_mac, + sizeof (mark->otcm_mac)); ent = avl_find(avl, &lookup, &where); /* @@ -1390,19 +2241,21 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) mutex_exit(&ent->ote_lock); continue; } - bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL); + bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &out->otce_mac, + sizeof (out->otce_mac)); out->otce_flags = 0; if (ent->ote_flags & OVERLAY_ENTRY_F_DROP) out->otce_flags |= OVERLAY_TARGET_CACHE_DROP; if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) - bcopy(&ent->ote_dest, &out->otce_dest, + bcopy(&ent->ote_u.ote_vl2.otvl2_dest, &out->otce_dest, sizeof (overlay_target_point_t)); written++; mutex_exit(&ent->ote_lock); } if (ent != NULL) { - bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL); + bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &mark->otcm_mac, + sizeof (mark->otcm_mac)); } else { mark->otcm_done = 1; } @@ -1432,6 +2285,122 @@ overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize, return (0); } +/* + * Take an IPv6 address + prefix length, and turn it into the network address. + * E.g. 
::ffff:192.168.51.50/120 -> ::ffff:192.168.51.0 + */ +static void +overlay_in6_to_subnet(const struct in6_addr *src, struct in6_addr *dst, + uint8_t prefixlen) +{ + uint32_t val; + + for (size_t i = 0; i < 4; i++) { + val = ntohl(src->_S6_un._S6_u32[i]); + val &= IN6_MASK_FROM_PREFIX(i, prefixlen); + dst->_S6_un._S6_u32[i] = htonl(val); + } +} + +/* + * Find the first target entry whose source IP falls within the source subnet + * given by otcne. If no entries match, NULL is returned. + */ +static overlay_target_entry_t * +overlay_target_cache_first_net(overlay_target_t *ott, + const overlay_targ_cache_net_entry_t *otcne) +{ + avl_tree_t *avl; + overlay_target_entry_t *ote; + struct in6_addr *start; + overlay_target_entry_t cmp = { 0 }; + avl_index_t where = { 0 }; + + ASSERT(MUTEX_HELD(&ott->ott_lock)); + + avl = &ott->ott_u.ott_dyn.ott_l3tree; + start = &cmp.ote_u.ote_vl3.otvl3_src; + + /* + * The first possible source address for a subnet is the network + * address (e.g. 192.160.10.0 for a /24). While it normally shouldn't + * appear, we either start here, or at the first entry after where + * it would exist if present. This should be the first possible + * entry in the subnet. If it's not within the subnet, then we + * know no entries with that source subnet are present. 
+ */ + overlay_in6_to_subnet(&otcne->otcne_src, start, + otcne->otcne_src_prefixlen); + + if ((ote = avl_find(avl, &cmp, &where)) == NULL) + ote = avl_nearest(avl, where, AVL_AFTER); + + if (ote == NULL || !IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src, + &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen)) + return (NULL); + + return (ote); +} + +/* ARGSUSED */ +static int +overlay_target_cache_remove_net(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_cache_net_t *otcn = arg; + overlay_targ_cache_net_entry_t *otcne = &otcn->otcn_entry; + overlay_dev_t *odd = NULL; + overlay_target_t *ott = NULL; + overlay_target_entry_t *ote = NULL, *ote_next = NULL; + avl_tree_t *avl = NULL; + + odd = overlay_hold_by_dlid(otcn->otcn_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + avl = &ott->ott_u.ott_dyn.ott_l3tree; + + for (ote = overlay_target_cache_first_net(ott, otcne); + ote != NULL && IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src, + &ote->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen); + ote = ote_next) { + ote_next = AVL_NEXT(avl, ote); + + /* + * Entries are sorted by src ip, dst ip, src vlan, there can + * be entries from this src ip to destinations on other + * subnets besides the one we are removing that will need to + * be skipped over. 
+ */ + if (ote->ote_u.ote_vl3.otvl3_src_vlan != otcne->otcne_vlan) + continue; + + if (!IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_dst, + &ote->ote_u.ote_vl3.otvl3_dst, otcne->otcne_dst_prefixlen)) + continue; + + qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, ote); + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + return (0); +} + static overlay_target_ioctl_t overlay_target_ioctab[] = { { OVERLAY_TARG_INFO, B_TRUE, B_TRUE, NULL, overlay_target_info, @@ -1492,6 +2461,9 @@ static overlay_target_ioctl_t overlay_target_ioctab[] = { overlay_target_cache_iter, overlay_target_cache_iter_copyout, sizeof (overlay_targ_cache_iter_t) }, + { OVERLAY_TARG_CACHE_REMOVE_NET, B_TRUE, B_TRUE, + NULL, overlay_target_cache_remove_net, + NULL, sizeof (overlay_targ_cache_net_t) }, { 0 } }; diff --git a/usr/src/uts/common/qqcache/qqcache.c b/usr/src/uts/common/qqcache/qqcache.c new file mode 100644 index 0000000000..ccd90c3814 --- /dev/null +++ b/usr/src/uts/common/qqcache/qqcache.c @@ -0,0 +1,444 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. + */ + +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/null.h> +#include <sys/types.h> +#include <sys/qqcache.h> +#include <sys/qqcache_impl.h> +#include <sys/stddef.h> + +/* + * Currently, the non _KERNEL pieces are to support testing in usr/src/test. + */ +#ifdef _KERNEL +#include <sys/kmem.h> +#define ZALLOC kmem_zalloc +#define FREE kmem_free +#else +#include <umem.h> +#define ZALLOC umem_zalloc +#define FREE umem_free +#endif + + +/* + * The *_overflow functions mimic the gcc/clang intrinsic functions. 
Once + * we are using a newer compiler version that includes these as intrinsics, + * these can be replaced with those versions. + */ +static int +uadd_overflow(const size_t a, const size_t b, size_t *sump) +{ + *sump = a + b; + if (*sump < a || *sump < b) + return (1); + return (0); +} + +#define MUL_NO_OVERFLOW ((size_t)1 << (sizeof (size_t) * 4)) + +static int +umul_overflow(const size_t a, const size_t b, size_t *cp) +{ + *cp = a * b; + + if ((a >= MUL_NO_OVERFLOW || b >= MUL_NO_OVERFLOW) && + a != 0 && b != 0 && SIZE_MAX / a < b) + return (1); + + return (0); +} + +/* Calculate the capacity of each list based on sz and a */ +static void +qqcache_size_lists(size_t sz, size_t a, size_t *maxp) +{ + VERIFY3U(sz, >=, QQCACHE_NUM_LISTS); + + /* + * The general approach is to start with list 0 being sized as a% of + * sz. However every other list must be able to hold at least one + * entry unless a == 100 (i.e. 100%). If the straight percentage + * leaves any of the remaining lists with zero entries, we give them + * a size of 1, and then adjust list0's size accordingly so that the + * sum of all list sizes == sz (this is mostly only a concern where + * sz is small enough such that (100 - a)% of sz < QQCACHE_NUM_LISTS). 
+ */ + size_t list0sz = sz * a / 100; + size_t othersz = (sz - list0sz) / (QQCACHE_NUM_LISTS - 1); + + if (list0sz == 0) + list0sz = 1; + + if (othersz == 0 && a != 100) + othersz = 1; + + if (list0sz + othersz * (QQCACHE_NUM_LISTS - 1) > sz) + list0sz = sz - othersz * (QQCACHE_NUM_LISTS - 1); + + maxp[0] = list0sz; + for (size_t i = 1; i < QQCACHE_NUM_LISTS; i++) + maxp[i] = othersz; +} + +int +qqcache_create(qqcache_t **qp, size_t sz, size_t a, size_t buckets, + qqcache_hash_fn_t hash_fn, qqcache_cmp_fn_t cmp_fn, + qqcache_dtor_fn_t dtor_fn, size_t elsize, size_t link_off, size_t tag_off, + int kmflags) +{ + qqcache_t *qc; + size_t len = 0; + + if (sz < QQCACHE_MIN_SIZE) + return (EINVAL); + if (a > 100) + return (EINVAL); + + if (umul_overflow(sizeof (qqcache_list_t), buckets, &len)) + return (EINVAL); + if (uadd_overflow(sizeof (*qc), len, &len)) + return (EINVAL); + + if ((qc = ZALLOC(len, kmflags)) == NULL) + return (ENOMEM); + + qc->qqc_hash_fn = hash_fn; + qc->qqc_cmp_fn = cmp_fn; + qc->qqc_dtor_fn = dtor_fn; + qc->qqc_link_off = link_off; + qc->qqc_tag_off = tag_off; + qc->qqc_nbuckets = buckets; + qc->qqc_size = sz; + qc->qqc_a = a; + + qqcache_size_lists(sz, a, qc->qqc_max); + + for (size_t i = 0; i < buckets; i++) { + list_create(&qc->qqc_buckets[i].qqcl_list, elsize, + offsetof(qqcache_link_t, qqln_hash_link)); + } + + for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) { + list_create(&qc->qqc_lists[i].qqcl_list, elsize, + offsetof(qqcache_link_t, qqln_list_link)); + } + + *qp = qc; + return (0); +} + +void +qqcache_destroy(qqcache_t *qc) +{ + size_t len; + + if (qc == NULL) + return; + + /* If creation succeeded, this calculation cannot overflow */ + len = sizeof (*qc) + qc->qqc_nbuckets * sizeof (qqcache_list_t); + + for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) { + list_t *l = &qc->qqc_lists[i].qqcl_list; + qqcache_link_t *lnk; + + while ((lnk = list_remove_head(l)) != NULL) + ; + } + + for (size_t i = 0; i < qc->qqc_nbuckets; i++) { + list_t *l = 
&qc->qqc_buckets[i].qqcl_list; + qqcache_link_t *lnk; + + while ((lnk = list_remove_head(l)) != NULL) { + ASSERT0(lnk->qqln_refcnt); + qc->qqc_dtor_fn(link_to_obj(qc, lnk)); + } + } + + FREE(qc, len); +} + +/* + * Removal of an entry is a two step process. qqcache_remove() removes the + * entry from the cache lists, and if a reference is held, sets the + * QQCACHE_F_DEAD flag. When there are no more references held on an entry, + * (either none are held at the time qqcache_remove() is called, or the last + * reference is removed via qqcache_rele()), qqcache_delete() is called which + * removes the entry from its hash bucket and calls the entry's dtor function. + * + * The main reason for the two step process is largely simplicity. If the + * entry remains in the cache lists w/ the QQCACHE_F_DEAD flag set, it + * complicates keeping each cache within its size limits -- either the + * list size must reflect the number of non-dead entries (which could be + * confusing during troubleshooting), or as we push things down the list, we + * would need to skip/ignore dead entries. The hash buckets however don't + * have any size limits (to impose limits would require the hash function + * provided by the consumer to produce perfectly equal distribution of entries + * across all the hash buckets at all times). The only time we care about + * the QQCACHE_F_DEAD flag in the hash buckets is when trying to lookup a + * 'dead' value, so leaving the entries in there does not present the same + * issues as leaving them in the cache lists (while still providing a way to + * find refheld entries). 
+ */ +static void +qqcache_delete(qqcache_t *qc, qqcache_link_t *lp) +{ + void *op = link_to_obj(qc, lp); + void *tp = obj_to_tag(qc, op); + uint_t n = qc->qqc_hash_fn(tp) % qc->qqc_nbuckets; + + ASSERT3U(qc->qqc_buckets[n].qqcl_len, >, 0); + ASSERT(!list_is_empty(&qc->qqc_buckets[n].qqcl_list)); + ASSERT(!list_link_active(&lp->qqln_list_link)); + ASSERT(list_link_active(&lp->qqln_hash_link)); + + list_remove(&qc->qqc_buckets[n].qqcl_list, lp); + qc->qqc_buckets[n].qqcl_len--; + qc->qqc_dtor_fn(op); +} + +void +qqcache_remove(qqcache_t *qc, void *op) +{ + qqcache_link_t *lp = obj_to_link(qc, op); + qqcache_list_t *lst = QQCACHE_LIST(qc, lp); + + ASSERT(!list_is_empty(&lst->qqcl_list)); + ASSERT3U(lst->qqcl_len, >, 0); + + list_remove(&lst->qqcl_list, lp); + lst->qqcl_len--; + + if (lp->qqln_refcnt > 0) + lp->qqln_flags |= QQCACHE_F_DEAD; + else + qqcache_delete(qc, lp); +} + +void +qqcache_hold(qqcache_t *qc, void *op) +{ + qqcache_link_t *lp = obj_to_link(qc, op); + + ++lp->qqln_refcnt; +} + +void +qqcache_rele(qqcache_t *qc, void *op) +{ + qqcache_link_t *lp = obj_to_link(qc, op); + + VERIFY3U(lp->qqln_refcnt, >, 0); + + if (--lp->qqln_refcnt == 0 && (lp->qqln_flags & QQCACHE_F_DEAD)) + qqcache_delete(qc, lp); +} + +static qqcache_link_t * +qqcache_hash_lookup(qqcache_t *qc, const void *tp, qqcache_list_t **lpp) +{ + uint_t n = qc->qqc_hash_fn(tp) % qc->qqc_nbuckets; + qqcache_link_t *lp; + qqcache_list_t *bucket = &qc->qqc_buckets[n]; + list_t *l = &bucket->qqcl_list; + void *cmp; + + if (lpp != NULL) + *lpp = bucket; + + for (lp = list_head(l); lp != NULL; lp = list_next(l, lp)) { + cmp = obj_to_tag(qc, link_to_obj(qc, lp)); + + if (qc->qqc_cmp_fn(cmp, tp) == 0 && + !(lp->qqln_flags & QQCACHE_F_DEAD)) { + return (lp); + } + } + + return (NULL); +} + +/* + * Starting at listnum, push entries from the tail of cache list 'n' to the + * head of list 'n + 1', keeping each list within its size limit. Excess + * entries on the tail of the last list are deleted. 
If 'for_insert' is + * B_TRUE, also guarantee after this returns that there are no more than + * 'max - 1' entries on listnum (so there is room to insert an entry onto + * listnum). + */ +static void +qqcache_ripple(qqcache_t *qc, uint_t listnum, boolean_t for_insert) +{ + VERIFY3U(listnum, <, QQCACHE_NUM_LISTS); + + for (uint_t i = listnum; i < QQCACHE_NUM_LISTS; i++) { + qqcache_list_t *ql = &qc->qqc_lists[i]; + qqcache_list_t *qlnext = &qc->qqc_lists[i + 1]; + size_t max = qc->qqc_max[i]; + + ASSERT3U(max, >, 0); + + /* + * If we're planning to insert an entry on list 'listnum', + * we bump the maximum size down by one to guarantee we + * have sufficient room for the entry + */ + if (for_insert && i == listnum) + max--; + + while (ql->qqcl_len > max) { + qqcache_link_t *lnk = list_tail(&ql->qqcl_list); + + if (i + 1 < QQCACHE_NUM_LISTS) { + list_remove(&ql->qqcl_list, lnk); + ql->qqcl_len--; + + ASSERT3U(lnk->qqln_listnum, ==, i); + lnk->qqln_listnum++; + + list_insert_head(&qlnext->qqcl_list, lnk); + qlnext->qqcl_len++; + } else { + qqcache_remove(qc, link_to_obj(qc, lnk)); + } + } + } +} + +int +qqcache_insert(qqcache_t *qc, void *obj) +{ + qqcache_link_t *lp = obj_to_link(qc, obj); + qqcache_list_t *bucket; + + if (qqcache_hash_lookup(qc, obj_to_tag(qc, obj), &bucket) != NULL) + return (EEXIST); + + list_link_init(&lp->qqln_hash_link); + list_link_init(&lp->qqln_list_link); + lp->qqln_refcnt = 0; + lp->qqln_flags = 0; + lp->qqln_listnum = QQCACHE_INSERT_LIST; + + qqcache_ripple(qc, QQCACHE_INSERT_LIST, B_TRUE); + + list_insert_tail(&bucket->qqcl_list, lp); + bucket->qqcl_len++; + + list_insert_head(&qc->qqc_lists[QQCACHE_INSERT_LIST].qqcl_list, lp); + qc->qqc_lists[QQCACHE_INSERT_LIST].qqcl_len++; + + return (0); +} + +void * +qqcache_lookup(qqcache_t *qc, const void *tp) +{ + qqcache_link_t *lp; + qqcache_list_t *src; + uint_t tgtnum; + + if ((lp = qqcache_hash_lookup(qc, tp, NULL)) == NULL) + return (NULL); + + src = QQCACHE_LIST(qc, lp); + 
list_remove(&src->qqcl_list, lp); + src->qqcl_len--; + + tgtnum = (lp->qqln_listnum > 0) ? lp->qqln_listnum - 1 : 0; + + if (tgtnum != lp->qqln_listnum) + qqcache_ripple(qc, tgtnum, B_TRUE); + + lp->qqln_listnum = tgtnum; + list_insert_head(&qc->qqc_lists[tgtnum].qqcl_list, lp); + qc->qqc_lists[tgtnum].qqcl_len++; + + return (link_to_obj(qc, lp)); +} + +int +qqcache_adjust_size(qqcache_t *qc, size_t sz) +{ + if (sz < QQCACHE_MIN_SIZE) + return (EINVAL); + + qc->qqc_size = sz; + qqcache_size_lists(sz, qc->qqc_a, qc->qqc_max); + qqcache_ripple(qc, 0, B_FALSE); + return (0); +} + +int +qqcache_adjust_a(qqcache_t *qc, size_t a) +{ + if (a > 100) + return (EINVAL); + + qc->qqc_a = a; + qqcache_size_lists(qc->qqc_size, a, qc->qqc_max); + qqcache_ripple(qc, 0, B_FALSE); + return (0); +} + +size_t +qqcache_size(const qqcache_t *qc) +{ + return (qc->qqc_size); +} + +size_t +qqcache_a(const qqcache_t *qc) +{ + return (qc->qqc_a); +} + +void * +qqcache_first(qqcache_t *qc) +{ + for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) { + qqcache_list_t *l = &qc->qqc_lists[i]; + + if (l->qqcl_len > 0) + return (link_to_obj(qc, list_head(&l->qqcl_list))); + } + + return (NULL); +} + +void * +qqcache_next(qqcache_t *qc, void *obj) +{ + qqcache_link_t *lp = obj_to_link(qc, obj); + qqcache_link_t *next; + qqcache_list_t *l = QQCACHE_LIST(qc, lp); + + ASSERT3U(lp->qqln_listnum, <, QQCACHE_NUM_LISTS); + + if ((next = list_next(&l->qqcl_list, lp)) != NULL) + return (link_to_obj(qc, next)); + + for (size_t i = lp->qqln_listnum + 1; i < QQCACHE_NUM_LISTS; i++) { + l = &qc->qqc_lists[i]; + if (l->qqcl_len > 0) + return (link_to_obj(qc, list_head(&l->qqcl_list))); + } + + return (NULL); +} diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 24fdd94c11..eaf06f476c 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -28,7 +28,7 @@ # Copyright 2017 Nexenta Systems, Inc. 
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright 2019 Peter Tribble. -# Copyright 2015, Joyent, Inc. All rights reserved. +# Copyright 2018 Joyent, Inc. # include $(SRC)/uts/Makefile.uts @@ -491,6 +491,8 @@ CHKHDRS= \ ptem.h \ ptms.h \ ptyvar.h \ + qqcache.h \ + qqcache_impl.h \ raidioctl.h \ ramdisk.h \ random.h \ diff --git a/usr/src/uts/common/sys/ethernet.h b/usr/src/uts/common/sys/ethernet.h index 5b9de2f2bf..f19912bfc3 100644 --- a/usr/src/uts/common/sys/ethernet.h +++ b/usr/src/uts/common/sys/ethernet.h @@ -23,6 +23,8 @@ * * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018, Joyent, Inc. */ /* @@ -139,6 +141,18 @@ struct ether_vlan_extinfo { #define ether_copy(a, b) (bcopy((caddr_t)a, (caddr_t)b, 6)) #endif +/* + * Ethernet is-zero check + */ +#if defined(__sparc) || defined(__i386) || defined(__amd64) +#define ether_is_zero(a) \ + (((short *)a)[0] == 0 && ((short *)a)[1] == 0 && ((short *)a)[2] == 0) +#else +#define ether_is_zero(a) (((uint8_t *)a)[0] == 0 && ((uint8_t *)a)[1] == 0 && \ + ((uint8_t *)a)[2] == 0 && ((uint8_t *)a)[3] == 0 && \ + ((uint8_t *)a)[4] == 0 && ((uint8_t *)a)[5] == 0) +#endif + #ifdef _KERNEL extern int localetheraddr(struct ether_addr *, struct ether_addr *); extern char *ether_sprintf(struct ether_addr *); diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h index 12d0dbca51..90f1843282 100644 --- a/usr/src/uts/common/sys/overlay.h +++ b/usr/src/uts/common/sys/overlay.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2018, Joyent, Inc. 
*/ #ifndef _SYS_OVERLAY_H @@ -40,7 +40,7 @@ extern "C" { typedef struct overlay_ioc_create { datalink_id_t oic_linkid; - uint32_t oic_filler; + uint32_t oic_dcid; uint64_t oic_vnetid; char oic_encap[MAXLINKNAMELEN]; } overlay_ioc_create_t; diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h index d638096006..de682a0397 100644 --- a/usr/src/uts/common/sys/overlay_common.h +++ b/usr/src/uts/common/sys/overlay_common.h @@ -42,7 +42,8 @@ typedef enum overlay_prop_type { OVERLAY_PROP_T_INT = 0x1, /* signed int */ OVERLAY_PROP_T_UINT, /* unsigned int */ OVERLAY_PROP_T_IP, /* sinaddr6 */ - OVERLAY_PROP_T_STRING /* OVERLAY_PROPS_SIZEMAX */ + OVERLAY_PROP_T_STRING, /* OVERLAY_PROPS_SIZEMAX */ + OVERLAY_PROP_T_ETHER /* 6-byte MAC address */ } overlay_prop_type_t; typedef enum overlay_prop_prot { diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h index 7fb8b8da1d..28e80d6d58 100644 --- a/usr/src/uts/common/sys/overlay_impl.h +++ b/usr/src/uts/common/sys/overlay_impl.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _SYS_OVERLAY_IMPL_H @@ -29,9 +29,9 @@ #include <sys/avl.h> #include <sys/ksocket.h> #include <sys/socket.h> -#include <sys/refhash.h> #include <sys/ethernet.h> #include <sys/list.h> +#include <sys/qqcache.h> #ifdef __cplusplus extern "C" { @@ -59,7 +59,7 @@ typedef struct overlay_mux { int omux_domain; /* RO: socket domain */ int omux_family; /* RO: socket family */ int omux_protocol; /* RO: socket protocol */ - struct sockaddr *omux_addr; /* RO: socket address */ + struct sockaddr *omux_addr; /* RO: socket address */ socklen_t omux_alen; /* RO: sockaddr len */ kmutex_t omux_lock; /* Protects everything below */ uint_t omux_count; /* Active instances */ @@ -81,8 +81,10 @@ typedef struct overlay_target { union { /* ott_lock */ overlay_target_point_t ott_point; struct overlay_target_dyn { - refhash_t *ott_dhash; + qqcache_t *ott_dhash; + qqcache_t *ott_l3dhash; avl_tree_t ott_tree; + avl_tree_t ott_l3tree; } ott_dyn; } ott_u; } overlay_target_t; @@ -117,6 +119,12 @@ typedef struct overlay_dev { uint64_t odd_vid; /* RO if active else odd_lock */ avl_node_t odd_muxnode; /* managed by mux */ overlay_target_t *odd_target; /* See big theory statement */ + uint32_t odd_dcid; /* RO if active else odd_lock */ + uint32_t odd_vl2sz; /* protected by odd_lock */ + uint32_t odd_vl2a; /* protected by odd_lock */ + uint32_t odd_routesz; /* protected by odd_lock */ + uint32_t odd_routea; /* protected by odd_lock */ + uint8_t odd_macaddr[ETHERADDRL]; /* RO same as odd_dcid */ char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */ } overlay_dev_t; @@ -124,25 +132,50 @@ typedef enum overlay_target_entry_flags { OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */ OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */ OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */ - OVERLAY_ENTRY_F_VALID_MASK = 0x06 + OVERLAY_ENTRY_F_ROUTER = 0x08, /* VL2 router entry */ + OVERLAY_ENTRY_F_HAS_ROUTE = 0x10, + OVERLAY_ENTRY_F_VALID_MASK = 0x1e, + OVERLAY_ENTRY_F_VL3 = 0x20, /* 
Is VL3 entry */ } overlay_target_entry_flags_t; -typedef struct overlay_target_entry { +struct overlay_target_entry; +typedef struct overlay_target_entry overlay_target_entry_t; + +/* + * For VL3 target entries, if we need to lock both the VL3 entry and the + * (possibly shared with multiple VL3 entries) VL2 entry, we must always + * take the VL3 lock prior to the VL2 entry lock. + */ +typedef struct overlay_target_vl3 { + struct in6_addr otvl3_src; + struct in6_addr otvl3_dst; + uint16_t otvl3_src_vlan; + overlay_target_mac_t otvl3_vl2; +} overlay_target_vl3_t; + +typedef struct overlay_target_vl2 { + overlay_target_route_t otvl2_route; + overlay_target_mac_t otvl2_mac; + overlay_target_point_t otvl2_dest; +} overlay_target_vl2_t; + +struct overlay_target_entry { kmutex_t ote_lock; - refhash_link_t ote_reflink; /* hashtable link */ + qqcache_link_t ote_reflink; /* hashtable link */ avl_node_t ote_avllink; /* iteration link */ list_node_t ote_qlink; overlay_target_entry_flags_t ote_flags; /* RW: state flags */ - uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */ overlay_target_t *ote_ott; /* RO */ overlay_dev_t *ote_odd; /* RO */ - overlay_target_point_t ote_dest; /* RW: destination */ mblk_t *ote_chead; /* RW: blocked mb chain head */ mblk_t *ote_ctail; /* RW: blocked mb chain tail */ size_t ote_mbsize; /* RW: outstanding mblk size */ hrtime_t ote_vtime; /* RW: valid timestamp */ -} overlay_target_entry_t; - + union { + overlay_target_vl2_t ote_vl2; + overlay_target_vl3_t ote_vl3; + } ote_u; +}; #define OVERLAY_CTL "overlay" @@ -186,7 +219,7 @@ extern void overlay_target_free(overlay_dev_t *); #define OVERLAY_TARGET_DROP 1 #define OVERLAY_TARGET_ASYNC 2 extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *, - socklen_t *); + socklen_t *, uint64_t *); extern void overlay_target_quiesce(overlay_target_t *); extern void overlay_target_fini(void); diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h index 
ae92ef3532..28c0559acb 100644 --- a/usr/src/uts/common/sys/overlay_target.h +++ b/usr/src/uts/common/sys/overlay_target.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2015 Joyent, Inc. + * Copyright (c) 2018 Joyent, Inc. */ #ifndef _OVERLAY_TARGET_H @@ -29,11 +29,43 @@ extern "C" { #endif +/* + * The overlay_target_point_t structure represents the destination where + * encapsulated frames are sent. Currently supported virtualization protocols + * (i.e. vxlan) only use otp_ip and otp_port, but other methods might use + * a L2 address instead of an L3 address to represent a destination. + */ typedef struct overlay_target_point { - uint8_t otp_mac[ETHERADDRL]; struct in6_addr otp_ip; uint16_t otp_port; -} overlay_target_point_t; + uint8_t otp_mac[ETHERADDRL]; +} overlay_target_point_t __aligned(8); + +/* + * An overlay_target_mac_t represents the overlay representation of a VL2 MAC + * address. With the advent of cross-DC routing, it is possible to have + * duplicate MAC addresses in different data centers, so the data center id + * is necessary to uniquely identify a MAC address. + * + * XXX: In hindsight, using a uint16_t for the DCID might have been nicer. + */ +typedef struct overlay_target_mac { + uint32_t otm_dcid; + uint8_t otm_mac[ETHERADDRL]; +} overlay_target_mac_t; + +/* + * The overlay_target_route_t represents the fields of the packet that + * have to be modified to deliver a packet to remote (routed) destinations. + * All three values are always populated when a packet is routed, even if + * some of the overlay_target_route_t values end up being the same as the + * original values in the packet being routed. 
+ */ +typedef struct overlay_target_route { + uint64_t otr_vnet; + uint8_t otr_srcmac[ETHERADDRL]; + uint16_t otr_vlan; +} overlay_target_route_t; #define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8)) @@ -52,6 +84,7 @@ typedef struct overlay_targ_info { uint32_t oti_needs; uint64_t oti_flags; uint64_t oti_vnetid; + uint32_t oti_dcid; } overlay_targ_info_t; /* @@ -134,7 +167,7 @@ typedef struct overlay_targ_id { * * This ioctl can be used to copy data from a given request into a * user buffer. This can be used in combination with - * OVERLAY_TARG_INJECT to implemnt services such as a proxy-arp. + * OVERLAY_TARG_INJECT to implement services such as a proxy-arp. * * * OVERLAY_TARG_RESEND - overlay_targ_pkt_t @@ -152,6 +185,18 @@ typedef struct overlay_targ_id { #define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14) #define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15) +typedef struct overlay_targ_l2 { + uint8_t otl2_srcaddr[ETHERADDRL]; + uint8_t otl2_dstaddr[ETHERADDRL]; + uint32_t otl2_dsttype; + uint32_t otl2_sap; +} overlay_targ_l2_t; + +typedef struct overlay_targ_l3 { + struct in6_addr otl3_srcip; + struct in6_addr otl3_dstip; +} overlay_targ_l3_t; + typedef struct overlay_targ_lookup { uint64_t otl_dlid; uint64_t otl_reqid; @@ -159,16 +204,20 @@ typedef struct overlay_targ_lookup { uint64_t otl_vnetid; uint64_t otl_hdrsize; uint64_t otl_pktsize; - uint8_t otl_srcaddr[ETHERADDRL]; - uint8_t otl_dstaddr[ETHERADDRL]; - uint32_t otl_dsttype; - uint32_t otl_sap; + union { + overlay_targ_l2_t otlu_l2; + overlay_targ_l3_t otlu_l3; + } otl_addru; int32_t otl_vlan; + boolean_t otl_l3req; } overlay_targ_lookup_t; + typedef struct overlay_targ_resp { - uint64_t otr_reqid; - overlay_target_point_t otr_answer; + uint64_t otr_reqid; + overlay_target_route_t otr_route; /* Ignored for VL2->UL3 requests */ + overlay_target_mac_t otr_mac; /* Ignored for VL2->UL3 requests */ + overlay_target_point_t otr_answer; } overlay_targ_resp_t; typedef struct overlay_targ_pkt 
{ @@ -255,6 +304,7 @@ typedef struct overlay_targ_list { #define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32) #define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33) #define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34) +#define OVERLAY_TARG_CACHE_REMOVE_NET (OVERLAY_TARG_IOCTL | 0x35) /* * This is a pretty arbitrary number that we're constraining ourselves to @@ -265,22 +315,36 @@ typedef struct overlay_targ_list { #define OVERLAY_TARGET_ITER_MAX 500 #define OVERLAY_TARGET_CACHE_DROP 0x01 +#define OVERLAY_TARGET_CACHE_ROUTER 0x02 typedef struct overlay_targ_cache_entry { - uint8_t otce_mac[ETHERADDRL]; + overlay_target_mac_t otce_mac; uint16_t otce_flags; overlay_target_point_t otce_dest; } overlay_targ_cache_entry_t; +typedef struct overlay_targ_cache_net_entry { + struct in6_addr otcne_src; + struct in6_addr otcne_dst; + uint16_t otcne_vlan; /* src vlan */ + uint8_t otcne_src_prefixlen; + uint8_t otcne_dst_prefixlen; +} overlay_targ_cache_net_entry_t; + typedef struct overlay_targ_cache { datalink_id_t otc_linkid; overlay_targ_cache_entry_t otc_entry; } overlay_targ_cache_t; +typedef struct overlay_targ_cache_net { + datalink_id_t otcn_linkid; + overlay_targ_cache_net_entry_t otcn_entry; +} overlay_targ_cache_net_t; + typedef struct overlay_targ_cache_iter { datalink_id_t otci_linkid; uint32_t otci_pad; - uint64_t otci_marker; + uint64_t otci_marker[2]; uint16_t otci_count; uint8_t otci_pad2[3]; overlay_targ_cache_entry_t otci_ents[]; diff --git a/usr/src/uts/common/sys/qqcache.h b/usr/src/uts/common/sys/qqcache.h new file mode 100644 index 0000000000..a2244338dd --- /dev/null +++ b/usr/src/uts/common/sys/qqcache.h @@ -0,0 +1,176 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. + */ + +#ifndef _QQCACHE_H +#define _QQCACHE_H + +#include <sys/list.h> +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This implements a fixed-size hash table that uses the 2Q algorithm + * from Johnson and Shasha to manage the contents of the entries. + * + * Briefly, there are two fixed-size lists (0 and 1). New entries are + * added to the head of list 1, and upon subsequent access (lookup), are + * moved to the head of list 0. Entries that fall off the end of list 0 + * are pushed onto the head of list 1, and entries that fall off the end + * of list 1 are deleted. The percentage of the total size of the cache + * for each list is determined by the parameter 'a', which is a percentage + * (0-100) of the cache size that is dedicated to list 0. + * + * This implementation does generalize this algorithm somewhat to an + * arbitrary number of lists (instead of just 2) via the QQCACHE_NUM_LISTS + * and QQCACHE_INSERT_LIST preprocessor symbols (defined in + * sys/qqcache_impl.h). New entries are added to list QQCACHE_INSERT_LIST + * and as each list gets full, the oldest entry in each list is pushed to + * the head of the succeeding list, and the oldest entries of the last list + * are removed from the cache (so no list ever has more entries than its + * maximum size). + * + * The API itself is very similar to that of refhash. A qqcache_link_t struct + * is embedded within the definition of the entries that are being stored in + * a given qqcache_t. Functions to hash/compare the tag (key) value of an + * entry, as well as to destroy an entry, are supplied when the cache is + * created. Lookups then occur by passing a pointer to the key value + * being looked up. 
+ * + * NOTE: As one can take references to entries in the cache via the + * qqcache_hold() function, refheld entries that are marked for deletion are + * not counted when tracking the cache size, and their dtor function is not + * called until the last reference has been released (by calling the + * qqcache_rele() function). + */ + +typedef enum qqcache_flag { + QQCACHE_F_DEAD = 0x01, +} qqcache_flag_t; + +typedef struct qqcache_link { + list_node_t qqln_hash_link; /* Hash chain bucket */ + list_node_t qqln_list_link; /* Cache list link */ + uint_t qqln_listnum; + uint_t qqln_refcnt; + qqcache_flag_t qqln_flags; +} qqcache_link_t; + +struct qqcache; +typedef struct qqcache qqcache_t; + +typedef uint64_t (*qqcache_hash_fn_t)(const void *); +typedef int (*qqcache_cmp_fn_t)(const void *, const void *); +typedef void (*qqcache_dtor_fn_t)(void *); + +/* + * qqcache_create(qcp, sz, a, buckets, hash_fn, cmp_fn, dtor_fn, + * elsize, link_off, tag_off, flags); + * + * Creates a new 2Q cache: + * + * qqcache_t **qcp A pointer to the pointer that will hold the new + * cache. + * + * size_t sz The size of the cache (in entries). + * + * size_t a The percentage (0-100) of the cache dedicated to + * MRU entries (list 0). + * + * size_t buckets The number of hash buckets in the cache. + * + * qqcache_hash_fn_t hash_fn The function used to create a + * hash value for a given entry's tag + * value. + * + * qqcache_cmp_fn_t cmp_fn The function used to compare the two + * tag values of two entries. The function + * should return '0' if the two entries + * are equal, non-zero if they are not. + * + * qqcache_dtor_fn_t dtor_fn The function used to destroy/free + * entries. + * + * size_t elsize The size of each entry. + * + * size_t link_off The offset of the qqcache_link_t struct in the entry. + * + * size_t tag_off The offset in the entry of the tag value (used for + * hashing and comparison). + * + * int flags The flags passed to kmem_zalloc/umem_zalloc. 
+ * + * Returns: + * 0 Success + * EINVAL A parameter was not valid + * ENOMEM The memory allocation failed (only possible when + * KM_NOSLEEP/UMEM_DEFAULT is passed to flags). + */ +extern int qqcache_create(qqcache_t **, size_t, size_t, size_t, + qqcache_hash_fn_t, qqcache_cmp_fn_t, qqcache_dtor_fn_t, + size_t, size_t, size_t, int); + +/* Destroy the given qqcache_t */ +extern void qqcache_destroy(qqcache_t *); + +/* + * qqcache_insert(qc, obj) + * + * qqcache_t *qc The cache to insert the item into. + * + * void *obj The object to add. + * + * Returns: + * 0 Success + * EEXIST The same entry (as determined by the cache cmp function) already + * exists in the cache. + */ +extern int qqcache_insert(qqcache_t *, void *); + +/* Lookup an entry with the given tag/key, or return NULL if not found */ +extern void *qqcache_lookup(qqcache_t *, const void *); + +/* Remove the given entry from the cache */ +extern void qqcache_remove(qqcache_t *, void *); + +/* Add a hold on the entry in the cache */ +extern void qqcache_hold(qqcache_t *, void *); + +/* Release the hold on the entry in the cache */ +extern void qqcache_rele(qqcache_t *, void *); + +/* + * Adjust the size and percentage of the cache for list 0. If new values are + * smaller than current values, entries may be evicted as necessary to reduce + * the size of the cache to the given size. + */ +extern int qqcache_adjust_size(qqcache_t *, size_t); +extern int qqcache_adjust_a(qqcache_t *, size_t); + +/* Return the current values of size or a. */ +extern size_t qqcache_size(const qqcache_t *); +extern size_t qqcache_a(const qqcache_t *); + +/* Iterate through entries. 
*/ +extern void *qqcache_first(qqcache_t *); +extern void *qqcache_next(qqcache_t *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _QQCACHE_H */ diff --git a/usr/src/uts/common/sys/qqcache_impl.h b/usr/src/uts/common/sys/qqcache_impl.h new file mode 100644 index 0000000000..f709b74d6c --- /dev/null +++ b/usr/src/uts/common/sys/qqcache_impl.h @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. + */ + +#ifndef _QQCACHE_IMPL_H +#define _QQCACHE_IMPL_H + +#include <sys/debug.h> +#include <sys/qqcache.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define QQCACHE_NUM_LISTS 2 +#define QQCACHE_INSERT_LIST 1 +#define QQCACHE_MIN_SIZE 10 + +CTASSERT(QQCACHE_INSERT_LIST < QQCACHE_NUM_LISTS); +CTASSERT(QQCACHE_NUM_LISTS >= 2); + +typedef struct qqcache_list { + list_t qqcl_list; + size_t qqcl_len; +} qqcache_list_t; + +struct qqcache { + qqcache_hash_fn_t qqc_hash_fn; + qqcache_cmp_fn_t qqc_cmp_fn; + qqcache_dtor_fn_t qqc_dtor_fn; + size_t qqc_link_off; + size_t qqc_tag_off; + size_t qqc_nbuckets; + size_t qqc_size; + size_t qqc_a; + size_t qqc_max[QQCACHE_NUM_LISTS]; + qqcache_list_t qqc_lists[QQCACHE_NUM_LISTS]; + qqcache_list_t qqc_buckets[]; +}; + +#define QQCACHE_LIST(qqc, lnk) \ + (&(qqc)->qqc_lists[(lnk)->qqln_listnum]) + +#ifdef lint +extern qqcache_link_t *obj_to_link(qqcache_t *, void *); +extern void *link_to_obj(qqcache_t *, qqcache_link_t *); +extern void *obj_to_tag(qqcache_t *, void *); +#else +#define obj_to_link(_q, _o) \ + ((qqcache_link_t *)(((char *)(_o)) + (_q)->qqc_link_off)) +#define link_to_obj(_q, _l) \ + ((void *)(((char 
*)(_l)) - (_q)->qqc_link_off)) +#define obj_to_tag(_q, _o) \ + ((void *)(((char *)(_o)) + (_q)->qqc_tag_off)) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _QQCACHE_IMPL_H */ |