summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile 2
-rw-r--r-- usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c 77
-rw-r--r-- usr/src/cmd/dladm/dladm.c 130
-rw-r--r-- usr/src/cmd/mdb/common/modules/genunix/Makefile.files 1
-rw-r--r-- usr/src/cmd/mdb/common/modules/genunix/genunix.c 7
-rw-r--r-- usr/src/cmd/mdb/common/modules/genunix/qqcache.c 117
-rw-r--r-- usr/src/cmd/mdb/common/modules/genunix/qqcache.h 40
-rw-r--r-- usr/src/lib/libdladm/common/libdloverlay.c 27
-rw-r--r-- usr/src/lib/libdladm/common/libdloverlay.h 12
-rw-r--r-- usr/src/lib/libdladm/common/mapfile-vers 1
-rw-r--r-- usr/src/lib/varpd/files/Makefile.com 1
-rw-r--r-- usr/src/lib/varpd/files/common/libvarpd_files.c 1340
-rw-r--r-- usr/src/lib/varpd/libvarpd/Makefile 3
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd.c 12
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c 17
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd_client.c 22
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd_client.h 6
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd_door.c 6
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h 18
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c 74
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c 8
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h 10
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/mapfile-plugin 4
-rw-r--r-- usr/src/lib/varpd/libvarpd/common/mapfile-vers 5
-rw-r--r-- usr/src/lib/varpd/svp/common/libvarpd_svp.c 336
-rw-r--r-- usr/src/lib/varpd/svp/common/libvarpd_svp.h 30
-rw-r--r-- usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c 252
-rw-r--r-- usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h 80
-rw-r--r-- usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c 153
-rw-r--r-- usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c 78
-rw-r--r-- usr/src/man/man1m/dladm.1m 16
-rw-r--r-- usr/src/test/os-tests/runfiles/default.run 1
-rw-r--r-- usr/src/test/os-tests/tests/Makefile 1
-rw-r--r-- usr/src/test/os-tests/tests/qqcache/Makefile 60
-rw-r--r-- usr/src/test/os-tests/tests/qqcache/qqcache-test.c 380
-rw-r--r-- usr/src/uts/common/Makefile.files 1
-rw-r--r-- usr/src/uts/common/Makefile.rules 12
-rw-r--r-- usr/src/uts/common/io/overlay/overlay.c 452
-rw-r--r-- usr/src/uts/common/io/overlay/overlay_mux.c 15
-rw-r--r-- usr/src/uts/common/io/overlay/overlay_target.c 1290
-rw-r--r-- usr/src/uts/common/qqcache/qqcache.c 444
-rw-r--r-- usr/src/uts/common/sys/Makefile 4
-rw-r--r-- usr/src/uts/common/sys/ethernet.h 14
-rw-r--r-- usr/src/uts/common/sys/overlay.h 4
-rw-r--r-- usr/src/uts/common/sys/overlay_common.h 3
-rw-r--r-- usr/src/uts/common/sys/overlay_impl.h 57
-rw-r--r-- usr/src/uts/common/sys/overlay_target.h 88
-rw-r--r-- usr/src/uts/common/sys/qqcache.h 176
-rw-r--r-- usr/src/uts/common/sys/qqcache_impl.h 72
49 files changed, 5318 insertions, 641 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile
index 4e3dd8259a..9b11174c49 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile
@@ -22,7 +22,7 @@
#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-# Copyright (c) 2018, Joyent, Inc.
+# Copyright 2018 Joyent, Inc.
#
PROG= snoop
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c
index a0768c2234..3da8c57f44 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_svp.c
@@ -322,6 +322,11 @@ do_svp_log_ack(void *data, int len)
case SVP_LOG_VL3:
rlen = sizeof (svp_log_vl3_t);
break;
+#if 0 /* XXX KEBE SAYS ROUTE */
+ case SVP_LOG_ROUTE:
+ rlen = sizeof (svp_log_route_t);
+ break;
+#endif
default:
/*
* If we don't know the type of log record we have,
@@ -362,6 +367,33 @@ do_svp_log_ack(void *data, int len)
ntohl(u.vl3->svl3_vnetid));
u.vl3++;
break;
+#if 0 /* XXX KEBE SAYS ROUTE */
+ case SVP_LOG_ROUTE:
+ show_printf("%8s Source Vnet = %u", "",
+ ntohl(u.vr->svlr_src_vnetid));
+ show_printf("%8s Source VLAN = %hu", "",
+ ntohs(u.vr->svlr_src_vlan));
+
+ prefixlen = u.vr->svlr_src_prefixlen;
+ is_host = prefixlen == 128 ? B_TRUE : B_FALSE;
+ show_printf("%8s Source %s = %s", "",
+ is_host ? "address" : "subnet",
+ svp_addr_str(u.vr->svlr_srcip, &prefixlen));
+ show_printf("%8s Destination DC id = %u", "",
+ ntohl(u.vr->svlr_dcid));
+ show_printf("%8s Destination Vnet = %u", "",
+ ntohl(u.vr->svlr_dst_vnetid));
+ show_printf("%8s Destination VLAN = %hu", "",
+ ntohs(u.vr->svlr_dst_vlan));
+
+ prefixlen = u.vr->svlr_dst_prefixlen;
+ is_host = prefixlen == 128 ? B_TRUE : B_FALSE;
+ show_printf("%8s Destination %s = %s", "",
+ is_host ? "address" : "subnet",
+ svp_addr_str(u.vr->svlr_dstip, &prefixlen));
+ u.vr++;
+ break;
+#endif
}
len -= rlen;
@@ -423,6 +455,39 @@ do_svp_shootdown(void *data, int len)
ether_ntoa((struct ether_addr *)sd->svsd_mac));
}
+#if 0 /* XXX KEBE SAYS ROUTE */
+/*
+ * Print the fields of an SVP route request: the vnet id and VLAN (converted
+ * from network byte order) followed by the source and destination addresses.
+ * 'len' is accepted for symmetry with the other do_svp_*() printers but is
+ * not consulted here.
+ */
+static void
+do_svp_route_req(void *data, int len)
+{
+	svp_route_req_t *rq = data;
+	uint32_t vnet = ntohl(rq->srr_vnetid);
+	uint16_t vlan = ntohs(rq->srr_vlan);
+
+	show_printf("Vnet = %u", vnet);
+	show_printf("VLAN = %hu", vlan);
+	show_printf("Source Address = %s", svp_addr_str(rq->srr_srcip, NULL));
+	show_printf("Destination Address = %s",
+	    svp_addr_str(rq->srr_dstip, NULL));
+}
+
+/*
+ * Print the fields of an SVP route acknowledgement.  Multi-byte integers are
+ * converted from network byte order before printing; the two prefix lengths
+ * are single bytes and are printed as-is.  'len' is not consulted here.
+ */
+static void
+do_svp_route_ack(void *data, int len)
+{
+	svp_route_ack_t *reply = data;
+	uint32_t status = ntohl(reply->sra_status);
+	struct ether_addr *smac = (struct ether_addr *)reply->sra_srcmac;
+	struct ether_addr *dmac = (struct ether_addr *)reply->sra_dstmac;
+
+	show_printf("Status = %s", svp_status_str(status));
+	show_printf("Remote DC Id = %u", ntohl(reply->sra_dcid));
+	show_printf("Remote Vnet = %u", ntohl(reply->sra_vnetid));
+	show_printf("Remote VLAN = %hu", ntohs(reply->sra_vlan));
+	show_printf("Remote UL3 Address = %s",
+	    svp_addr_str(reply->sra_ip, NULL));
+	show_printf("Remote UL3 Port = %hu", ntohs(reply->sra_port));
+	/* One ether_ntoa() per call: its result is only used immediately. */
+	show_printf("Source MAC Address = %s", ether_ntoa(smac));
+	show_printf("Destination MAC Address = %s", ether_ntoa(dmac));
+	show_printf("Source IP Prefix = %hhu", reply->sra_src_pfx);
+	show_printf("Destination IP Prefix = %hhu", reply->sra_dst_pfx);
+}
+#endif
+
static struct svp_len_tbl {
uint16_t slt_op;
size_t slt_len;
@@ -441,6 +506,10 @@ static struct svp_len_tbl {
{ SVP_R_LOG_RM, sizeof (svp_lrm_req_t) },
{ SVP_R_LOG_RM_ACK, sizeof (svp_lrm_ack_t) },
{ SVP_R_SHOOTDOWN, sizeof (svp_shootdown_t) },
+#if 0 /* XXX KEBE SAYS ROUTE */
+ { SVP_R_ROUTE_REQ, sizeof (svp_route_req_t) },
+ { SVP_R_ROUTE_ACK, sizeof (svp_route_ack_t) }
+#endif
};
static boolean_t
@@ -548,6 +617,14 @@ interpret_svp(int flags, char *data, int fraglen)
case SVP_R_SHOOTDOWN:
do_svp_shootdown(req, fraglen);
break;
+#if 0 /* XXX KEBE SAYS ROUTE */
+ case SVP_R_ROUTE_REQ:
+ do_svp_route_req(req, fraglen);
+ break;
+ case SVP_R_ROUTE_ACK:
+ do_svp_route_ack(req, fraglen);
+ break;
+#endif
}
show_space();
diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c
index c59926be94..590f693a66 100644
--- a/usr/src/cmd/dladm/dladm.c
+++ b/usr/src/cmd/dladm/dladm.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2016 Nexenta Systems, Inc.
* Copyright 2020 Peter Tribble.
*/
@@ -420,13 +420,14 @@ static cmd_t cmds[] = {
" show-bridge -t [-p] [-o <field>,...] [-s [-i <interval>]]"
" <bridge>\n" },
{ "create-overlay", do_create_overlay,
- " create-overlay [-t] -e <encap> -s <search> -v <vnetid>\n"
+ " create-overlay [-t] [-d <dcid>] -e <encap> -s <search> "
+ "-v <vnetid>\n"
"\t\t [ -p <prop>=<value>[,...]] <overlay>" },
{ "delete-overlay", do_delete_overlay,
" delete-overlay <overlay>" },
{ "modify-overlay", do_modify_overlay,
- " modify-overlay -d mac | -f | -s mac=ip:port "
- "<overlay>" },
+ " modify-overlay -d [dcid/]mac | -f | -s [dcid/]mac=ip:port "
+ " | -p prop=value[,...] <overlay>" },
{ "show-overlay", do_show_overlay,
" show-overlay [-f | -t] [[-p] -o <field>,...] "
"[<overlay>]\n" },
@@ -1464,12 +1465,14 @@ static const struct option overlay_create_lopts[] = {
{ "search", required_argument, NULL, 's' },
{ "temporary", no_argument, NULL, 't' },
{ "vnetid", required_argument, NULL, 'v' },
+ { "dcid", optional_argument, NULL, 'd' },
{ NULL, 0, NULL, 0 }
};
static const struct option overlay_modify_lopts[] = {
{ "delete-entry", required_argument, NULL, 'd' },
{ "flush-table", no_argument, NULL, 'f' },
+ { "prop", required_argument, NULL, 'p' },
{ "set-entry", required_argument, NULL, 's' },
{ NULL, 0, NULL, 0 }
};
@@ -9892,15 +9895,26 @@ do_create_overlay(int argc, char *argv[], const char *use)
char name[MAXLINKNAMELEN];
dladm_status_t status;
uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST;
+ uint32_t dcid = 0;
uint64_t vid;
boolean_t havevid = B_FALSE;
char propstr[DLADM_STRSIZE];
dladm_arg_list_t *proplist = NULL;
bzero(propstr, sizeof (propstr));
- while ((opt = getopt_long(argc, argv, ":te:v:p:s:",
+ while ((opt = getopt_long(argc, argv, ":td:e:v:p:s:",
overlay_create_lopts, NULL)) != -1) {
switch (opt) {
+ case 'd':
+ errno = 0;
+ dcid = strtoul(optarg, &endp, 10);
+ if (*endp != '\0' || (dcid == 0 && errno == EINVAL))
+ die("couldn't parse datacenter id: %s",
+ optarg);
+ /* XXX If we go 64-bit, add check for > UINT32_MAX. */
+ if (dcid == ULONG_MAX && errno == ERANGE)
+ die("datacenter id too large: %s", optarg);
+ break;
case 'e':
encap = optarg;
break;
@@ -9917,6 +9931,7 @@ do_create_overlay(int argc, char *argv[], const char *use)
die("property list too long '%s'", propstr);
break;
case 'v':
+ errno = 0;
vid = strtoul(optarg, &endp, 10);
if (*endp != '\0' || (vid == 0 && errno == EINVAL))
die("couldn't parse virtual networkd id: %s",
@@ -9959,7 +9974,7 @@ do_create_overlay(int argc, char *argv[], const char *use)
!= DLADM_STATUS_OK)
die("invalid overlay property");
- status = dladm_overlay_create(handle, name, encap, search, vid,
+ status = dladm_overlay_create(handle, name, encap, search, vid, dcid,
proplist, &errlist, flags);
dladm_free_props(proplist);
if (status != DLADM_STATUS_OK) {
@@ -9989,7 +10004,7 @@ do_delete_overlay(int argc, char *argv[], const char *use)
typedef struct showoverlay_state {
ofmt_handle_t sho_ofmt;
- const char *sho_linkname;
+ const char *sho_linkname;
dladm_overlay_propinfo_handle_t sho_info;
uint8_t sho_value[DLADM_OVERLAY_PROP_SIZEMAX];
uint32_t sho_size;
@@ -10080,6 +10095,12 @@ print_overlay_value(char *outbuf, uint_t bufsize, uint_t type, const void *pbuf,
case OVERLAY_PROP_T_STRING:
(void) snprintf(outbuf, bufsize, "%s", pbuf);
break;
+ case OVERLAY_PROP_T_ETHER:
+ if (ether_ntoa_r((struct ether_addr *)pbuf, outbuf) == NULL) {
+ warn("malformed overlay ethernet property\n");
+ (void) snprintf(outbuf, bufsize, "--");
+ }
+ break;
default:
abort();
}
@@ -10428,7 +10449,7 @@ do_show_overlay(int argc, char *argv[], const char *use)
int i, opt;
datalink_id_t linkid = DATALINK_ALL_LINKID;
dladm_status_t status;
- int (*funcp)(dladm_handle_t, datalink_id_t, void *);
+ int (*funcp)(dladm_handle_t, datalink_id_t, void *);
char *fields_str = NULL;
const ofmt_field_t *fieldsp;
ofmt_status_t oferr;
@@ -10498,17 +10519,54 @@ do_show_overlay(int argc, char *argv[], const char *use)
}
+/*
+ * Parse an overlay MAC specification of the form "[dcid/]mac".
+ *
+ * s	 the string to parse
+ * dcidp receives the data center id, or 0 when no "dcid/" prefix is present
+ * ep	 receives the parsed ethernet address
+ *
+ * The dcid, when present, must be a decimal number of 1 to 10 characters
+ * that fits in a uint32_t.  Any malformed input terminates the program via
+ * die(); the function only returns on success.
+ */
static void
+parse_overlay_mac(const char *s, uint32_t *dcidp, struct ether_addr *ep)
+{
+	const char *slash;
+
+	*dcidp = 0;
+
+	if ((slash = strchr(s, '/')) != NULL) {
+		/* UINT32_MAX is 10 decimal digits; +1 for the NUL. */
+		char dcstr[11];
+		char *endp;
+		unsigned long long dcval;
+		size_t slen = (size_t)(slash - s) + 1;
+
+		/*
+		 * If present the dcid must be at least 1 digit and must fit
+		 * (with its NUL) in dcstr.
+		 */
+		if (slen < 2 || slen > sizeof (dcstr))
+			die("invalid mac specification: %s\n", s);
+
+		(void) strlcpy(dcstr, s, slen);
+
+		/*
+		 * Parse into a type that is 64 bits wide in both ILP32 and
+		 * LP64 builds so the range check below is always meaningful,
+		 * and insist that the whole string was consumed so trailing
+		 * junk such as "12x" is rejected rather than read as 12.
+		 */
+		errno = 0;
+		dcval = strtoull(dcstr, &endp, 10);
+		if (errno != 0 || endp == dcstr || *endp != '\0' ||
+		    (unsigned long long)(uint32_t)dcval != dcval)
+			die("invalid data center id: %s\n", dcstr);
+
+		*dcidp = (uint32_t)dcval;
+		/* Move s past '/' */
+		s = slash + 1;
+	}
+
+	if (ether_aton_r(s, ep) == NULL)
+		die("invalid mac specification: %s\n", s);
+}
+
+static void
do_modify_overlay(int argc, char *argv[], const char *use)
{
int opt, ocnt = 0;
- boolean_t flush, set, delete;
+ boolean_t flush, set, delete, setprop;
+ uint32_t dcid = 0;
struct ether_addr e;
char *dest;
datalink_id_t linkid = DATALINK_ALL_LINKID;
dladm_status_t status;
+ char propstr[DLADM_STRSIZE] = { 0 };
- flush = set = delete = B_FALSE;
- while ((opt = getopt_long(argc, argv, ":fd:s:", overlay_modify_lopts,
+ flush = set = delete = setprop = B_FALSE;
+ while ((opt = getopt_long(argc, argv, ":fd:p:s:", overlay_modify_lopts,
NULL)) != -1) {
switch (opt) {
case 'd':
@@ -10516,8 +10574,7 @@ do_modify_overlay(int argc, char *argv[], const char *use)
die_optdup('d');
delete = B_TRUE;
ocnt++;
- if (ether_aton_r(optarg, &e) == NULL)
- die("invalid mac address: %s\n", optarg);
+ parse_overlay_mac(optarg, &dcid, &e);
break;
case 'f':
if (flush == B_TRUE)
@@ -10525,6 +10582,16 @@ do_modify_overlay(int argc, char *argv[], const char *use)
flush = B_TRUE;
ocnt++;
break;
+ case 'p':
+ if (setprop == B_TRUE)
+ die_optdup('p');
+ setprop = B_TRUE;
+ (void) strlcat(propstr, optarg, DLADM_STRSIZE);
+ if (strlcat(propstr, ",", DLADM_STRSIZE) >=
+ DLADM_STRSIZE)
+ die("property list too long '%s'", propstr);
+ ocnt++;
+ break;
case 's':
if (set == B_TRUE)
die_optdup('s');
@@ -10536,8 +10603,7 @@ do_modify_overlay(int argc, char *argv[], const char *use)
if (dest == NULL)
die("malformed value, expected mac=dest, "
"got: %s\n", optarg);
- if (ether_aton_r(optarg, &e) == NULL)
- die("invalid mac address: %s\n", optarg);
+ parse_overlay_mac(optarg, &dcid, &e);
break;
default:
die_opterr(optopt, opt, use);
@@ -10545,9 +10611,9 @@ do_modify_overlay(int argc, char *argv[], const char *use)
}
if (ocnt == 0)
- die("need to specify one of -d, -f, or -s");
+ die("need to specify one of -d, -f, -p, or -s");
if (ocnt > 1)
- die("only one of -d, -f, or -s may be used");
+ die("only one of -d, -f, -p, or -s may be used");
if (argv[optind] == NULL)
die("missing required overlay device\n");
@@ -10568,17 +10634,43 @@ do_modify_overlay(int argc, char *argv[], const char *use)
}
if (delete == B_TRUE) {
- status = dladm_overlay_cache_delete(handle, linkid, &e);
+ status = dladm_overlay_cache_delete(handle, linkid, dcid, &e);
if (status != DLADM_STATUS_OK)
die_dlerr(status, "failed to flush target %s from "
"overlay target cache %s", optarg, argv[optind]);
}
if (set == B_TRUE) {
- status = dladm_overlay_cache_set(handle, linkid, &e, dest);
+ status = dladm_overlay_cache_set(handle, linkid, dcid, &e,
+ dest);
if (status != DLADM_STATUS_OK)
die_dlerr(status, "failed to set target %s for overlay "
"target cache %s", optarg, argv[optind]);
}
+ if (setprop == B_TRUE) {
+ dladm_arg_list_t *proplist = NULL;
+ uint_t i;
+
+ if (dladm_parse_link_props(propstr, &proplist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid overlay property");
+
+ for (i = 0; i < proplist->al_count; i++) {
+ dladm_status_t status;
+
+ status = dladm_overlay_setprop(handle, linkid,
+ proplist->al_info[i].ai_name,
+ proplist->al_info[i].ai_val,
+ proplist->al_info[i].ai_count);
+
+ if (status != DLADM_STATUS_OK) {
+ die_dlerr(status, "failed to set property %s "
+ "for overlay device %s",
+ proplist->al_info[i].ai_name, argv[optind]);
+ }
+ }
+
+ dladm_free_props(proplist);
+ }
}
diff --git a/usr/src/cmd/mdb/common/modules/genunix/Makefile.files b/usr/src/cmd/mdb/common/modules/genunix/Makefile.files
index d371cf70fe..05ab8fe59c 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/Makefile.files
+++ b/usr/src/cmd/mdb/common/modules/genunix/Makefile.files
@@ -71,6 +71,7 @@ GENUNIX_SRCS = \
nvpair.c \
pci.c \
pg.c \
+ qqcache.c \
rctl.c \
refhash.c \
refstr.c \
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index 32370ba7e1..e0f21979e9 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -98,6 +98,7 @@
#include "nvpair.h"
#include "pci.h"
#include "pg.h"
+#include "qqcache.h"
#include "rctl.h"
#include "refhash.h"
#include "sobj.h"
@@ -4788,6 +4789,12 @@ static const mdb_walker_t walkers[] = {
{ "pcie_bus", "walk all pcie_bus_t's", pcie_bus_walk_init,
pcie_bus_walk_step, NULL },
+ /* from qqcache.c */
+ { QQCACHE_WALK_NAME, QQCACHE_WALK_DESC,
+ qqcache_walk_init_cache, qqcache_walk_step, qqcache_walk_fini },
+ { QQCACHE_HASH_WALK_NAME, QQCACHE_HASH_WALK_DESC,
+ qqcache_walk_init_hash, qqcache_walk_step, qqcache_walk_fini },
+
/* from rctl.c */
{ "rctl_dict_list", "walk all rctl_dict_entry_t's from rctl_lists",
rctl_dict_walk_init, rctl_dict_walk_step, NULL },
diff --git a/usr/src/cmd/mdb/common/modules/genunix/qqcache.c b/usr/src/cmd/mdb/common/modules/genunix/qqcache.c
new file mode 100644
index 0000000000..a2ba1463b9
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/genunix/qqcache.c
@@ -0,0 +1,117 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+#include <mdb/mdb_modapi.h>
+#include <mdb/mdb_ctf.h>
+
+#include <sys/qqcache.h>
+#include <sys/qqcache_impl.h>
+
+#include "qqcache.h"
+
+/*
+ * Private per-walk state.  qqcache_walk_init() allocates one of these and
+ * stores it in wsp->walk_data; qqcache_walk_fini() frees it.
+ */
+typedef struct qqcache_walk_data {
+ /* Copy of qqc_link_off; qqcache_walk_step() subtracts it from walk_addr. */
+ size_t qwd_link_off;
+} qqcache_walk_data_t;
+
+/*
+ * Debugger-side view of the target's qqcache_t, filled in by mdb_ctf_vread()
+ * in qqcache_walk_init().  Only the members the walkers need are listed.
+ */
+typedef struct mdb_qqcache {
+ size_t qqc_link_off;
+ size_t qqc_nbuckets;
+} mdb_qqcache_t;
+
+/*
+ * Shared setup for the two qqcache walkers.  Looks up the layout of
+ * qqcache_t / qqcache_list_t via CTF, reads the cache header at
+ * wsp->walk_addr, and then layers a "list" walk over every qqcache_list_t
+ * in either the qqc_buckets array (use_hash) or the qqc_lists array
+ * (!use_hash).
+ *
+ * Returns WALK_NEXT on success and WALK_ERR on any failure.
+ */
+static int
+qqcache_walk_init(mdb_walk_state_t *wsp, boolean_t use_hash)
+{
+	qqcache_walk_data_t *state;
+	mdb_qqcache_t cache;
+	uintptr_t first;
+	size_t idx, count, elem_sz;
+	int lists_off, buckets_off, node_off;
+
+	/* The CTF offset/size lookups report their own errors. */
+	lists_off = mdb_ctf_offsetof_by_name("qqcache_t", "qqc_lists");
+	if (lists_off == -1)
+		return (WALK_ERR);
+
+	buckets_off = mdb_ctf_offsetof_by_name("qqcache_t", "qqc_buckets");
+	if (buckets_off == -1)
+		return (WALK_ERR);
+
+	node_off = mdb_ctf_offsetof_by_name("qqcache_list_t", "qqcl_list");
+	if (node_off == -1)
+		return (WALK_ERR);
+
+	elem_sz = mdb_ctf_sizeof_by_name("qqcache_list_t");
+	if (elem_sz == -1)
+		return (WALK_ERR);
+
+	if (mdb_ctf_vread(&cache, "qqcache_t", "mdb_qqcache_t",
+	    wsp->walk_addr, 0) == -1) {
+		mdb_warn("failed to read qqcache_t at %#lx", wsp->walk_addr);
+		return (WALK_ERR);
+	}
+
+	state = mdb_zalloc(sizeof (*state), UM_SLEEP);
+	state->qwd_link_off = cache.qqc_link_off;
+	wsp->walk_data = state;
+
+	/* Pick which array of qqcache_list_t's to iterate over. */
+	if (use_hash) {
+		first = wsp->walk_addr + buckets_off;
+		count = cache.qqc_nbuckets;
+	} else {
+		first = wsp->walk_addr + lists_off;
+		count = QQCACHE_NUM_LISTS;
+	}
+
+	for (idx = 0; idx < count; idx++) {
+		/* Address of the list embedded in the idx'th array element. */
+		wsp->walk_addr = first + idx * elem_sz + node_off;
+
+		if (mdb_layered_walk("list", wsp) != -1)
+			continue;
+
+		mdb_warn("can't walk qqcache_t");
+		mdb_free(state, sizeof (*state));
+		return (WALK_ERR);
+	}
+
+	return (WALK_NEXT);
+}
+
+/*
+ * Walker init entry point that iterates the cache via its qqc_lists array.
+ */
+int
+qqcache_walk_init_cache(mdb_walk_state_t *wsp)
+{
+	const boolean_t via_hash = B_FALSE;
+
+	return (qqcache_walk_init(wsp, via_hash));
+}
+
+/*
+ * Walker init entry point that iterates the cache via its qqc_buckets array.
+ */
+int
+qqcache_walk_init_hash(mdb_walk_state_t *wsp)
+{
+	const boolean_t via_hash = B_TRUE;
+
+	return (qqcache_walk_init(wsp, via_hash));
+}
+
+/*
+ * Step function shared by both walkers.  The layered "list" walk supplies an
+ * address in wsp->walk_addr; back it up by the link offset saved at init
+ * time before handing it to the walk callback.
+ */
+int
+qqcache_walk_step(mdb_walk_state_t *wsp)
+{
+	const qqcache_walk_data_t *state = wsp->walk_data;
+	uintptr_t entry = wsp->walk_addr;
+
+	entry -= state->qwd_link_off;
+
+	return (wsp->walk_callback(entry, wsp->walk_layer, wsp->walk_cbdata));
+}
+
+/*
+ * Release the per-walk state allocated by qqcache_walk_init().
+ */
+void
+qqcache_walk_fini(mdb_walk_state_t *wsp)
+{
+	mdb_free(wsp->walk_data, sizeof (qqcache_walk_data_t));
+}
diff --git a/usr/src/cmd/mdb/common/modules/genunix/qqcache.h b/usr/src/cmd/mdb/common/modules/genunix/qqcache.h
new file mode 100644
index 0000000000..c0d1d14fe6
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/genunix/qqcache.h
@@ -0,0 +1,40 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent Inc.
+ */
+
+#ifndef _MDB_QQCACHE_H
+#define _MDB_QQCACHE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Name and description of the walker that uses qqcache_walk_init_cache(). */
+#define QQCACHE_WALK_NAME "qqcache"
+#define QQCACHE_WALK_DESC "walk a qqcache (2Q cache)"
+
+/* Name and description of the walker that uses qqcache_walk_init_hash(). */
+#define QQCACHE_HASH_WALK_NAME "qqhash"
+#define QQCACHE_HASH_WALK_DESC "walk a qqcache (2Q cache) via the hash buckets"
+
+/* Forward declaration so this header does not need the mdb module API. */
+struct mdb_walk_state;
+
+/* Walker callbacks; both walkers share the same step and fini routines. */
+extern int qqcache_walk_init_cache(struct mdb_walk_state *);
+extern int qqcache_walk_init_hash(struct mdb_walk_state *);
+extern int qqcache_walk_step(struct mdb_walk_state *);
+extern void qqcache_walk_fini(struct mdb_walk_state *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MDB_QQCACHE_H */
diff --git a/usr/src/lib/libdladm/common/libdloverlay.c b/usr/src/lib/libdladm/common/libdloverlay.c
index a83105b91c..db58da0a34 100644
--- a/usr/src/lib/libdladm/common/libdloverlay.c
+++ b/usr/src/lib/libdladm/common/libdloverlay.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright (c) 2015 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
*/
#include <libdladm_impl.h>
@@ -127,6 +127,11 @@ dladm_overlay_parse_prop(overlay_prop_type_t type, void *buf, uint32_t *sizep,
bcopy(&ipv6, buf, sizeof (struct in6_addr));
*sizep = sizeof (struct in6_addr);
break;
+ case OVERLAY_PROP_T_ETHER:
+ if (ether_aton_r(val, (struct ether_addr *)buf) == NULL)
+ return (DLADM_STATUS_BADARG);
+ *sizep = ETHERADDRL;
+ break;
default:
abort();
}
@@ -203,16 +208,16 @@ dladm_overlay_setprop(dladm_handle_t handle, datalink_id_t linkid,
prop.oip_linkid = linkid;
prop.oip_id = info.oipi_id;
prop.oip_name[0] = '\0';
- if ((ret = dladm_overlay_parse_prop(info.oipi_type, prop.oip_value,
+ if ((status = dladm_overlay_parse_prop(info.oipi_type, prop.oip_value,
&prop.oip_size, valp[0])) != DLADM_STATUS_OK)
- return (ret);
+ return (status);
status = DLADM_STATUS_OK;
ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_SETPROP, &prop);
if (ret != 0)
status = dladm_errno2status(errno);
- return (ret);
+ return (status);
}
/*
@@ -475,7 +480,7 @@ dladm_overlay_walk_prop(dladm_handle_t handle, datalink_id_t linkid,
dladm_status_t
dladm_overlay_create(dladm_handle_t handle, const char *name,
- const char *encap, const char *search, uint64_t vid,
+ const char *encap, const char *search, uint64_t vid, uint32_t dcid,
dladm_arg_list_t *props, dladm_errlist_t *errs, uint32_t flags)
{
int ret, i;
@@ -495,6 +500,7 @@ dladm_overlay_create(dladm_handle_t handle, const char *name,
bzero(&oic, sizeof (oic));
oic.oic_linkid = linkid;
oic.oic_vnetid = vid;
+ oic.oic_dcid = dcid;
(void) strlcpy(oic.oic_encap, encap, MAXLINKNAMELEN);
status = DLADM_STATUS_OK;
@@ -542,8 +548,7 @@ dladm_overlay_create(dladm_handle_t handle, const char *name,
return (dladm_errno2status(ret));
}
- if ((ret = libvarpd_c_instance_create(vch, linkid, search,
- &id)) != 0) {
+ if ((ret = libvarpd_c_instance_create(vch, linkid, search, &id)) != 0) {
(void) dladm_errlist_append(errs,
"failed to create varpd instance: %s", strerror(ret));
libvarpd_c_destroy(vch);
@@ -708,7 +713,7 @@ dladm_overlay_cache_flush(dladm_handle_t handle, datalink_id_t linkid)
/* ARGSUSED */
dladm_status_t
dladm_overlay_cache_delete(dladm_handle_t handle, datalink_id_t linkid,
- const struct ether_addr *key)
+ uint32_t dcid, const struct ether_addr *key)
{
int ret;
uint64_t varpdid;
@@ -722,7 +727,7 @@ dladm_overlay_cache_delete(dladm_handle_t handle, datalink_id_t linkid,
return (dladm_errno2status(ret));
}
- ret = libvarpd_c_instance_cache_delete(chdl, varpdid, key);
+ ret = libvarpd_c_instance_cache_delete(chdl, varpdid, dcid, key);
libvarpd_c_destroy(chdl);
return (dladm_errno2status(ret));
@@ -731,7 +736,7 @@ dladm_overlay_cache_delete(dladm_handle_t handle, datalink_id_t linkid,
/* ARGSUSED */
dladm_status_t
dladm_overlay_cache_set(dladm_handle_t handle, datalink_id_t linkid,
- const struct ether_addr *key, char *val)
+ uint32_t dcid, const struct ether_addr *key, char *val)
{
int ret;
uint_t dest;
@@ -836,7 +841,7 @@ dladm_overlay_cache_set(dladm_handle_t handle, datalink_id_t linkid,
}
send:
- ret = libvarpd_c_instance_cache_set(chdl, varpdid, key, &vcp);
+ ret = libvarpd_c_instance_cache_set(chdl, varpdid, dcid, key, &vcp);
libvarpd_c_destroy(chdl);
return (dladm_errno2status(ret));
diff --git a/usr/src/lib/libdladm/common/libdloverlay.h b/usr/src/lib/libdladm/common/libdloverlay.h
index 39b01ccae3..e058cb7349 100644
--- a/usr/src/lib/libdladm/common/libdloverlay.h
+++ b/usr/src/lib/libdladm/common/libdloverlay.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright (c) 2015 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
*/
#ifndef _LIBDLOVERLAY_H
@@ -45,8 +45,8 @@ typedef struct dladm_overlay_status {
} dladm_overlay_status_t;
extern dladm_status_t dladm_overlay_create(dladm_handle_t, const char *,
- const char *, const char *, uint64_t, dladm_arg_list_t *, dladm_errlist_t *,
- uint32_t);
+ const char *, const char *, uint64_t, uint32_t, dladm_arg_list_t *,
+ dladm_errlist_t *, uint32_t);
extern dladm_status_t dladm_overlay_delete(dladm_handle_t, datalink_id_t);
typedef void (*dladm_overlay_status_f)(dladm_handle_t, datalink_id_t,
@@ -56,9 +56,9 @@ extern dladm_status_t dladm_overlay_status(dladm_handle_t, datalink_id_t,
extern dladm_status_t dladm_overlay_cache_flush(dladm_handle_t, datalink_id_t);
extern dladm_status_t dladm_overlay_cache_delete(dladm_handle_t, datalink_id_t,
- const struct ether_addr *);
+ uint32_t, const struct ether_addr *);
extern dladm_status_t dladm_overlay_cache_set(dladm_handle_t, datalink_id_t,
- const struct ether_addr *, char *);
+ uint32_t, const struct ether_addr *, char *);
extern dladm_status_t dladm_overlay_cache_get(dladm_handle_t, datalink_id_t,
const struct ether_addr *, dladm_overlay_point_t *);
@@ -72,6 +72,8 @@ extern dladm_status_t dladm_overlay_prop_info(dladm_overlay_propinfo_handle_t,
const mac_propval_range_t **);
extern dladm_status_t dladm_overlay_get_prop(dladm_handle_t, datalink_id_t,
dladm_overlay_propinfo_handle_t, void *buf, size_t *bufsize);
+extern dladm_status_t dladm_overlay_setprop(dladm_handle_t, datalink_id_t,
+ const char *, char *const *, uint_t);
typedef int (*dladm_overlay_prop_f)(dladm_handle_t, datalink_id_t,
dladm_overlay_propinfo_handle_t, void *);
diff --git a/usr/src/lib/libdladm/common/mapfile-vers b/usr/src/lib/libdladm/common/mapfile-vers
index 589bbf5330..3b595920f7 100644
--- a/usr/src/lib/libdladm/common/mapfile-vers
+++ b/usr/src/lib/libdladm/common/mapfile-vers
@@ -281,6 +281,7 @@ SYMBOL_VERSION SUNWprivate_1.1 {
dladm_overlay_status;
dladm_overlay_prop_info;
dladm_overlay_get_prop;
+ dladm_overlay_setprop;
dladm_overlay_walk_prop;
dladm_overlay_cache_set;
diff --git a/usr/src/lib/varpd/files/Makefile.com b/usr/src/lib/varpd/files/Makefile.com
index dd8009d002..79a06e7d56 100644
--- a/usr/src/lib/varpd/files/Makefile.com
+++ b/usr/src/lib/varpd/files/Makefile.com
@@ -23,6 +23,7 @@ include ../../Makefile.plugin
LIBS = $(DYNLIB)
LDLIBS += -lc -lumem -lnvpair -lsocket -lcustr
+LDLIBS += -lcmdutils -lavl -lbunyan
CPPFLAGS += -I../common
LINTFLAGS += -erroff=E_BAD_PTR_CAST_ALIGN
diff --git a/usr/src/lib/varpd/files/common/libvarpd_files.c b/usr/src/lib/varpd/files/common/libvarpd_files.c
index 812919a07d..90ef1c34ce 100644
--- a/usr/src/lib/varpd/files/common/libvarpd_files.c
+++ b/usr/src/lib/varpd/files/common/libvarpd_files.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2018, Joyent, Inc.
*/
/*
@@ -28,7 +28,7 @@
* The plug-in only has a single property, which is the location of the JSON
* file. The JSON file itself looks something like:
*
- * {
+ * {
* "aa:bb:cc:dd:ee:ff": {
* "arp": "10.23.69.1",
* "ndp": "2600:3c00::f03c:91ff:fe96:a264",
@@ -36,7 +36,42 @@
* "port": 8080
* },
* ...
- * }
+ *
+ * "local-subnet1": {
+ * "prefix": "192.168.1.0/24",
+ * "vlan": 123
+ * },
+ * ...
+ *
+ * "remote-subnet1": {
+ * "dcid": 11223344,
+ * "prefix": "10.21.10.0/24",
+ * "vnet": 5340123,
+ * "vlan": 789,
+ * "routermac": "12:34:56:78:aa:bb",
+ * "macs": {
+ * "aa:bb:cc:dd:ee:ff": {
+ * "arp": "192.168.50.22",
+ * ...
+ * }
+ * }
+ * },
+ * ...
+ * "attach-group1": [
+ * "remote-subnet1",
+ * "remote-subnet2",
+ * "local-subnet1",
+ * ...
+ * ],
+ * ...
+ *
+ * Entries for performing VL3 routing (local-, remote-, and attach-) must
+ * all start with their respective prefixes (local-, remote-, or attach-) to
+ * identify the type of entry. Names of entries are limited to
+ * FABRIC_NAME_MAX-1 characters.
+ *
+ * NOTE: This isn't very sophisticated, so attachment entries need to appear
+ * after the entries referenced in it.
*/
#include <libvarpd_provider.h>
@@ -47,32 +82,264 @@
#include <strings.h>
#include <assert.h>
#include <limits.h>
+#include <sys/avl.h>
+#include <sys/debug.h>
+#include <sys/list.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <libnvpair.h>
+#include <stddef.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/ethernet.h>
#include <sys/socket.h>
+#include <sys/vlan.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <libvarpd_files_json.h>
+#define FABRIC_NAME_MAX 64
+struct varpd_files_attach;
+typedef struct varpd_files_attach varpd_files_attach_t;
+
+/*
+ * One named fabric (subnet) entry.
+ * NOTE(review): presumably built from the "local-*" / "remote-*" objects in
+ * the JSON file described at the top of this file -- confirm against the
+ * parsing code, which is not visible here.
+ */
+typedef struct varpd_files_fabric {
+ avl_node_t vafs_avlnode;
+ list_node_t vafs_attached_node;
+ varpd_files_attach_t *vafs_attach;
+ char vafs_name[FABRIC_NAME_MAX]; /* NUL-terminated entry name */
+ struct in6_addr vafs_addr;
+ uint64_t vafs_vnet;
+ uint32_t vafs_dcid;
+ uint16_t vafs_vlan;
+ uint8_t vafs_prefixlen;
+ uint8_t vafs_routermac[ETHERADDRL];
+} varpd_files_fabric_t;
+
+/*
+ * A named group of fabrics.
+ * NOTE(review): presumably corresponds to an "attach-*" array in the JSON
+ * file, with vfa_fabrics linking the member fabrics -- confirm against the
+ * parsing code.
+ */
+struct varpd_files_attach {
+ list_node_t vfa_node;
+ char vfa_name[FABRIC_NAME_MAX]; /* NUL-terminated group name */
+ list_t vfa_fabrics;
+};
+
+/*
+ * One interface (MAC address) entry.  It carries three AVL linkage nodes so
+ * the same object can sit in several trees at once.
+ * NOTE(review): presumably these are the vaf_macs, vaf_ips and vaf_ndp trees
+ * of varpd_files_t -- confirm against the tree-creation code.
+ */
+typedef struct varpd_files_if {
+ avl_node_t vfi_macnode;
+ avl_node_t vfi_ipnode;
+ avl_node_t vfi_ndpnode;
+ struct in6_addr vfi_ip;
+ struct in6_addr vfi_llocalip; /* IPv6 link local if specified */
+ uint64_t vfi_vnet;
+ uint32_t vfi_dcid;
+ uint16_t vfi_vlan;
+ uint8_t vfi_mac[ETHERADDRL];
+ uint8_t vfi_dhcp[ETHERADDRL]; /* dhcp-proxy MAC address */
+ boolean_t vfi_has_dhcp; /* presumably: vfi_dhcp is valid -- confirm */
+ boolean_t vfi_has_lladdr; /* presumably: vfi_llocalip is valid -- confirm */
+ overlay_target_point_t vfi_dest;
+} varpd_files_if_t;
+
+/*
+ * Per-instance state of the files plug-in.  The trailing annotations (RO, WO,
+ * Atomic) are the original author's access notes for each member.
+ */
typedef struct varpd_files {
overlay_plugin_dest_t vaf_dest; /* RO */
varpd_provider_handle_t *vaf_hdl; /* RO */
char *vaf_path; /* WO */
- nvlist_t *vaf_nvl; /* WO */
uint64_t vaf_nmisses; /* Atomic */
uint64_t vaf_narp; /* Atomic */
+
+ /* These hold varpd_files_fabric_t's */
+ avl_tree_t vaf_fabrics; /* WO */
+ list_t vaf_attached; /* WO */
+
+ /* These hold varpd_files_if_t */
+ avl_tree_t vaf_macs; /* WO */
+ avl_tree_t vaf_ips; /* WO */
+ avl_tree_t vaf_ndp; /* WO */
+
+ uint64_t vaf_vnet; /* RO */
+ uint32_t vaf_dcid; /* RO */
} varpd_files_t;
static const char *varpd_files_props[] = {
"files/config"
};
+static bunyan_logger_t *files_bunyan;
+
+/*
+ * Try to convert a string to an IP address or IP address + prefix.  We first
+ * try to convert as an IPv6 address, and if that fails, we try to convert as
+ * an IPv4 address and then wrap it in an IPv4-mapped IPv6 address.
+ *
+ * To parse an address+prefix length (e.g. 192.168.0.1/24), prefixlen must be
+ * non-NULL.  If prefixlen is not NULL and a lone address is supplied,
+ * *prefixlen will be set to 128 (a host prefix, for IPv4 and IPv6 alike).
+ * If prefixlen is NULL, only a lone address can be successfully parsed.
+ *
+ * Note: if this is a wrapped IPv4 address with a prefix, *prefixlen is adjusted
+ * to reflect the value as an IPv6 address, e.g. 192.168.1.0/24 will have a
+ * prefixlen of 120 (96 + 24).
+ *
+ * Returns 0 on success and EINVAL if the string is not a valid address or
+ * address/prefix.  *prefixlen is only written on success.
+ */
+static int
+str_to_ip(const char *s, struct in6_addr *v6, uint8_t *prefixlen)
+{
+	const char *slash = strchr(s, '/');
+	char addrstr[INET6_ADDRSTRLEN] = { 0 };
+	size_t addrlen;
+	boolean_t is_v4 = B_FALSE;
+	unsigned long mask;
+	char *endp = NULL;
+
+	/* A prefix is only acceptable when the caller asked for one. */
+	if (slash != NULL && prefixlen == NULL)
+		return (EINVAL);
+
+	addrlen = (slash != NULL) ? (size_t)(slash - s) : strlen(s);
+
+	/*
+	 * addrstr is zero filled; insist on at least one spare byte so the
+	 * copy below always leaves it NUL terminated for inet_pton().
+	 */
+	if (addrlen >= sizeof (addrstr))
+		return (EINVAL);
+
+	bcopy(s, addrstr, addrlen);
+
+	if (inet_pton(AF_INET6, addrstr, v6) != 1) {
+		uint32_t v4;
+
+		if (inet_pton(AF_INET, addrstr, &v4) != 1)
+			return (EINVAL);
+
+		IN6_IPADDR_TO_V4MAPPED(v4, v6);
+		is_v4 = B_TRUE;
+	}
+
+	if (prefixlen == NULL)
+		return (0);
+
+	if (slash == NULL) {
+		/*
+		 * The address is always stored as 128 bits, so a lone address
+		 * is a /128 even when it started life as IPv4.
+		 */
+		*prefixlen = 128;
+		return (0);
+	}
+
+	/*
+	 * strtoul() skips white space and accepts a sign; require the prefix
+	 * to start with a digit and to consume the rest of the string.
+	 */
+	if (slash[1] < '0' || slash[1] > '9')
+		return (EINVAL);
+
+	errno = 0;
+	mask = strtoul(slash + 1, &endp, 10);
+	if (errno != 0 || *endp != '\0')
+		return (EINVAL);
+
+	if (is_v4) {
+		if (mask > 32)
+			return (EINVAL);
+		mask += 96;
+	}
+
+	if (mask > 128)
+		return (EINVAL);
+
+	*prefixlen = (uint8_t)mask;
+	return (0);
+}
+
+/*
+ * AVL comparator for vaf_macs.  Entries are ordered by dcid first and then
+ * by MAC address, comparing the address one byte at a time from index 0.
+ */
+static int
+varpd_files_if_mac_avl(const void *a, const void *b)
+{
+	const varpd_files_if_t *lhs = a;
+	const varpd_files_if_t *rhs = b;
+	int idx;
+
+	if (lhs->vfi_dcid != rhs->vfi_dcid)
+		return (lhs->vfi_dcid < rhs->vfi_dcid ? -1 : 1);
+
+	for (idx = 0; idx < ETHERADDRL; idx++) {
+		if (lhs->vfi_mac[idx] == rhs->vfi_mac[idx])
+			continue;
+		return (lhs->vfi_mac[idx] < rhs->vfi_mac[idx] ? -1 : 1);
+	}
+
+	return (0);
+}
+
+/*
+ * AVL comparator for vaf_ips.  Entries are ordered by vnet id, then vlan,
+ * then by the bytes of vfi_ip starting at index 0.
+ */
+static int
+varpd_files_if_ip_avl(const void *a, const void *b)
+{
+	const varpd_files_if_t *lhs = a;
+	const varpd_files_if_t *rhs = b;
+	int idx;
+
+	if (lhs->vfi_vnet != rhs->vfi_vnet)
+		return (lhs->vfi_vnet < rhs->vfi_vnet ? -1 : 1);
+
+	if (lhs->vfi_vlan != rhs->vfi_vlan)
+		return (lhs->vfi_vlan < rhs->vfi_vlan ? -1 : 1);
+
+	for (idx = 0; idx < sizeof (struct in6_addr); idx++) {
+		uint8_t lbyte = lhs->vfi_ip.s6_addr[idx];
+		uint8_t rbyte = rhs->vfi_ip.s6_addr[idx];
+
+		if (lbyte != rbyte)
+			return (lbyte < rbyte ? -1 : 1);
+	}
+
+	return (0);
+}
+
+/*
+ * AVL comparator for vaf_ndp.  Both entries must carry a link-local address
+ * (this is VERIFY()ed); they are ordered by the bytes of vfi_llocalip
+ * starting at index 0.
+ */
+static int
+varpd_files_if_ndp_avl(const void *a, const void *b)
+{
+	const varpd_files_if_t *lhs = a;
+	const varpd_files_if_t *rhs = b;
+	int idx;
+
+	VERIFY(lhs->vfi_has_lladdr);
+	VERIFY(rhs->vfi_has_lladdr);
+
+	for (idx = 0; idx < sizeof (struct in6_addr); idx++) {
+		uint8_t lbyte = lhs->vfi_llocalip.s6_addr[idx];
+		uint8_t rbyte = rhs->vfi_llocalip.s6_addr[idx];
+
+		if (lbyte != rbyte)
+			return (lbyte < rbyte ? -1 : 1);
+	}
+
+	return (0);
+}
+
+/*
+ * AVL comparator for vaf_fabrics.
+ *
+ * The key order is dcid, vnet, vlan and finally the subnet address.  Keeping
+ * the subnet address last lets avl_nearest() locate the fabric for an IP
+ * once the other pieces of information are known.
+ */
+static int
+varpd_files_fabric_avl(const void *a, const void *b)
+{
+	const varpd_files_fabric_t *lhs = a;
+	const varpd_files_fabric_t *rhs = b;
+	int idx;
+
+	if (lhs->vafs_dcid != rhs->vafs_dcid)
+		return (lhs->vafs_dcid < rhs->vafs_dcid ? -1 : 1);
+
+	if (lhs->vafs_vnet != rhs->vafs_vnet)
+		return (lhs->vafs_vnet < rhs->vafs_vnet ? -1 : 1);
+
+	if (lhs->vafs_vlan != rhs->vafs_vlan)
+		return (lhs->vafs_vlan < rhs->vafs_vlan ? -1 : 1);
+
+	for (idx = 0; idx < sizeof (struct in6_addr); idx++) {
+		uint8_t lbyte = lhs->vafs_addr.s6_addr[idx];
+		uint8_t rbyte = rhs->vafs_addr.s6_addr[idx];
+
+		if (lbyte != rbyte)
+			return (lbyte < rbyte ? -1 : 1);
+	}
+
+	return (0);
+}
+
static boolean_t
varpd_files_valid_dest(overlay_plugin_dest_t dest)
{
@@ -94,64 +361,674 @@ varpd_files_create(varpd_provider_handle_t *hdl, void **outp,
if (varpd_files_valid_dest(dest) == B_FALSE)
return (ENOTSUP);
- vaf = umem_alloc(sizeof (varpd_files_t), UMEM_DEFAULT);
+ vaf = umem_zalloc(sizeof (varpd_files_t), UMEM_DEFAULT);
if (vaf == NULL)
return (ENOMEM);
- bzero(vaf, sizeof (varpd_files_t));
vaf->vaf_dest = dest;
- vaf->vaf_path = NULL;
- vaf->vaf_nvl = NULL;
vaf->vaf_hdl = hdl;
+ vaf->vaf_dcid = libvarpd_plugin_dcid(hdl);
+ vaf->vaf_vnet = libvarpd_plugin_vnetid(hdl);
+ avl_create(&vaf->vaf_macs, varpd_files_if_mac_avl,
+ sizeof (varpd_files_if_t), offsetof(varpd_files_if_t, vfi_macnode));
+ avl_create(&vaf->vaf_ips, varpd_files_if_ip_avl,
+ sizeof (varpd_files_if_t), offsetof(varpd_files_if_t, vfi_ipnode));
+ avl_create(&vaf->vaf_ndp, varpd_files_if_ndp_avl,
+ sizeof (varpd_files_if_t), offsetof(varpd_files_if_t, vfi_ndpnode));
+ avl_create(&vaf->vaf_fabrics, varpd_files_fabric_avl,
+ sizeof (varpd_files_fabric_t),
+ offsetof(varpd_files_fabric_t, vafs_avlnode));
+ list_create(&vaf->vaf_attached, sizeof (varpd_files_attach_t),
+ offsetof(varpd_files_attach_t, vfa_node));
*outp = vaf;
return (0);
}
+/*
+ * Walk every fabric in vaf_fabrics and return the first one whose name
+ * matches exactly, or NULL when no fabric carries that name.  The tree is
+ * not keyed by name, so this is a linear scan.
+ */
+static varpd_files_fabric_t *
+varpd_files_fabric_getbyname(varpd_files_t *vaf, const char *name)
+{
+	avl_tree_t *tree = &vaf->vaf_fabrics;
+	varpd_files_fabric_t *cur = avl_first(tree);
+
+	while (cur != NULL && strcmp(cur->vafs_name, name) != 0)
+		cur = AVL_NEXT(tree, cur);
+
+	return (cur);
+}
+
static int
-varpd_files_normalize_nvlist(varpd_files_t *vaf, nvlist_t *nvl)
+varpd_files_convert_attached(varpd_files_t *vaf, nvlist_t *att)
{
+ nvlist_t *nvl = NULL;
+ nvpair_t *nvp = NULL;
int ret;
- nvlist_t *out;
- nvpair_t *pair;
- if ((ret = nvlist_alloc(&out, NV_UNIQUE_NAME, 0)) != 0)
+ while ((nvp = nvlist_next_nvpair(att, nvp)) != NULL) {
+ varpd_files_attach_t *att;
+ char **nets = NULL;
+ uint32_t i, n;
+
+ if (nvpair_type(nvp) != DATA_TYPE_NVLIST) {
+ (void) bunyan_error(files_bunyan,
+ "attached fabric group value is not an nvlist",
+ BUNYAN_T_STRING, "group", nvpair_name(nvp),
+ BUNYAN_T_END);
+ return (EINVAL);
+ }
+
+ if ((ret = nvpair_value_nvlist(nvp, &nvl)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "unexpected error retrieving attached fabric group",
+ BUNYAN_T_STRING, "group", nvpair_name(nvp),
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
+ return (EINVAL);
+ }
+
+ if ((ret = nvlist_lookup_boolean(nvl, ".__json_array")) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "group value does not appear to be a JSON array",
+ BUNYAN_T_STRING, "group", nvpair_name(nvp),
+ BUNYAN_T_END);
+ return (EINVAL);
+ }
+
+ if ((ret = nvlist_lookup_uint32(nvl, "length", &n)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "unexpected error obtain group array length",
+ BUNYAN_T_STRING, "group", nvpair_name(nvp),
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
+ return (ret);
+ }
+
+ if ((nets = calloc(n, sizeof (char *))) == NULL) {
+ (void) bunyan_error(files_bunyan,
+ "out of memory", BUNYAN_T_END);
+ return (ENOMEM);
+ }
+
+ /*
+ * Note, we are just storing references to the names in
+ * nets, so we only need to call free(nets), and not on
+ * each entry (e.g. free(nets[0])). We strlcpy() it out,
+ * so we don't need to worry about it going away before we
+ * done with it.
+ */
+ for (i = 0; i < n; i++) {
+ char buf[11]; /* largest uint32_t val + NUL */
+
+ (void) snprintf(buf, sizeof (buf), "%u", i);
+ ret = nvlist_lookup_string(nvl, buf, &nets[i]);
+ if (ret != 0) {
+ (void) bunyan_error(files_bunyan,
+ "unexpected error lookup up group array "
+ "value",
+ BUNYAN_T_STRING, "group", nvpair_name(nvp),
+ BUNYAN_T_UINT32, "index", i,
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
+ free(nets);
+ return (ret);
+ }
+ }
+
+ if ((att = umem_zalloc(sizeof (*att), UMEM_DEFAULT)) == NULL) {
+ (void) bunyan_error(files_bunyan, "out of memory",
+ BUNYAN_T_END);
+ free(nets);
+ return (ENOMEM);
+ }
+
+ if (strlcpy(att->vfa_name, nvpair_name(nvp),
+ sizeof (att->vfa_name)) >= sizeof (att->vfa_name)) {
+ (void) bunyan_error(files_bunyan,
+ "attached fabric group name is too long",
+ BUNYAN_T_STRING, "group", nvpair_name(nvp),
+ BUNYAN_T_UINT32, "len",
+ (uint32_t)strlen(nvpair_name(nvp)),
+ BUNYAN_T_UINT32, "maxlen",
+ (uint32_t)sizeof (att->vfa_name) - 1,
+ BUNYAN_T_END);
+ umem_free(att, sizeof (*att));
+ free(nets);
+ return (EOVERFLOW);
+ }
+
+ list_create(&att->vfa_fabrics, sizeof (varpd_files_fabric_t),
+ offsetof(varpd_files_fabric_t, vafs_attached_node));
+
+ list_insert_tail(&vaf->vaf_attached, att);
+
+ for (i = 0; i < n; i++) {
+ varpd_files_fabric_t *fab;
+
+ fab = varpd_files_fabric_getbyname(vaf, nets[i]);
+ if (fab == NULL) {
+ (void) bunyan_error(files_bunyan,
+ "subnet name not found",
+ BUNYAN_T_STRING, "subnet", nets[i],
+ BUNYAN_T_STRING, "group", nvpair_name(nvp),
+ BUNYAN_T_END);
+ free(nets);
+ return (ENOENT);
+ }
+
+ if (fab->vafs_attach != NULL) {
+ (void) bunyan_error(files_bunyan,
+ "subnet already attached to another group",
+ BUNYAN_T_STRING, "subnet", nets[i],
+ BUNYAN_T_STRING, "group", nvpair_name(nvp),
+ BUNYAN_T_STRING, "existing_group",
+ fab->vafs_attach->vfa_name,
+ BUNYAN_T_END);
+ free(nets);
+ return (EBUSY);
+ }
+
+ fab->vafs_attach = att;
+ list_insert_tail(&att->vfa_fabrics, fab);
+ }
+ free(nets);
+ }
+
+ return (0);
+}
+
+/*
+ * Process the top-level "fabrics" object.  Each pair other than
+ * "attached-fabrics" describes one subnet ("prefix", "vlan", "routerip",
+ * "routermac" and optionally "dcid" and "vid").  For every subnet a
+ * varpd_files_fabric_t is added to vaf_fabrics and a varpd_files_if_t for
+ * its router is added to vaf_macs and vaf_ips.  "attached-fabrics" is
+ * handed to varpd_files_convert_attached().
+ *
+ * Returns 0 on success or an errno value.  Entries added before a failure
+ * stay in the trees.
+ *
+ * NOTE(review): avl_add() does not tolerate duplicate keys, so a config
+ * that repeats a subnet or router address is not rejected gracefully here.
+ */
+static int
+varpd_files_convert_fabrics(varpd_files_t *vaf, nvpair_t *fpair)
+{
+ nvlist_t *nvl = NULL;
+ nvpair_t *nvp = NULL;
+ int ret;
+
+ ASSERT(strcmp(nvpair_name(fpair), "fabrics") == 0);
+
+ if (nvpair_type(fpair) != DATA_TYPE_NVLIST) {
+ (void) bunyan_error(files_bunyan,
+ "'fabrics' value is not an nvlist", BUNYAN_T_END);
+ return (EINVAL);
+ }
+
+ if ((ret = nvpair_value_nvlist(fpair, &nvl)) != 0) {
+ /* The nvpair API returns its error; errno is not set */
+ (void) bunyan_error(files_bunyan,
+ "unexpected error reading value of 'fabrics'",
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
 return (ret);
+ }
- for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
- pair = nvlist_next_nvpair(nvl, pair)) {
- char *name, fname[ETHERADDRSTRL];
- nvlist_t *data;
- struct ether_addr ether, *e;
- e = &ether;
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ struct in6_addr ip = { 0 };
+ varpd_files_fabric_t *fab = NULL;
+ varpd_files_if_t *vl2 = NULL;
+ nvlist_t *vnvl = NULL;
+ int32_t i32;
+ char *s;
+ char *endp;
+
+ if (strcmp(nvpair_name(nvp), "attached-fabrics") == 0) {
+ if (nvpair_type(nvp) != DATA_TYPE_NVLIST) {
+ (void) bunyan_error(files_bunyan,
+ "'attached-fabrics' value is not an nvlist",
+ BUNYAN_T_END);
+ return (EINVAL);
+ }
+
+ if ((ret = nvpair_value_nvlist(nvp, &vnvl)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "unexpected error in 'attached-fabrics' "
+ "value",
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
+ return (ret);
+ }
+ ret = varpd_files_convert_attached(vaf, vnvl);
+ if (ret != 0) {
+ return (ret);
+ }
+ continue;
+ }
- if (nvpair_type(pair) != DATA_TYPE_NVLIST) {
- nvlist_free(out);
+ if (nvpair_type(nvp) != DATA_TYPE_NVLIST) {
+ (void) bunyan_error(files_bunyan,
+ "subnet value is not an nvlist",
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
 return (EINVAL);
 }
- name = nvpair_name(pair);
- if ((ret = nvpair_value_nvlist(pair, &data)) != 0) {
- nvlist_free(out);
+ if ((ret = nvpair_value_nvlist(nvp, &vnvl)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "unexpected error reading subnet value",
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ return (ret);
+ }
+
+ if ((fab = umem_zalloc(sizeof (*fab), UMEM_DEFAULT)) == NULL) {
+ (void) bunyan_error(files_bunyan, "out of memory",
+ BUNYAN_T_END);
+ return (ENOMEM);
+ }
+ /* Default to our vid if none is given */
+ fab->vafs_vnet = vaf->vaf_vnet;
+
+ if (strlcpy(fab->vafs_name, nvpair_name(nvp),
+ sizeof (fab->vafs_name)) >= sizeof (fab->vafs_name)) {
+ (void) bunyan_error(files_bunyan,
+ "subnet name is too long",
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_UINT32, "length",
+ (uint32_t)strlen(nvpair_name(nvp)),
+ BUNYAN_T_UINT32, "maxlen",
+ (uint32_t)sizeof (fab->vafs_name) - 1,
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (EOVERFLOW);
+ }
+
+ if ((ret = nvlist_lookup_string(vnvl, "prefix", &s)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'prefix' value is missing from subnet",
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
 return (EINVAL);
 }
+ if ((ret = str_to_ip(s, &fab->vafs_addr,
+ &fab->vafs_prefixlen)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "prefix value is not valid",
+ BUNYAN_T_STRING, "prefix", s,
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (ret);
+ }
+ /* XXX: Make sure it's the subnet address */
+
+ if ((ret = nvlist_lookup_int32(vnvl, "vlan", &i32)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'vlan' value is missing",
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (EINVAL);
+ }
+ if (i32 < 0 || i32 > VLAN_ID_MAX) {
+ (void) bunyan_error(files_bunyan,
+ "vlan value is out of range (0-4094)",
+ BUNYAN_T_INT32, "vlan", i32,
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (ERANGE);
+ }
+ fab->vafs_vlan = (uint16_t)i32;
+
+ if ((ret = nvlist_lookup_string(vnvl, "routerip", &s)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'routerip' value is missing",
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (EINVAL);
+ }
+ if ((ret = str_to_ip(s, &ip, NULL)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'routerip' value is not an IP",
+ BUNYAN_T_STRING, "routerip", s,
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (ret);
+ }
- if (ether_aton_r(name, e) == NULL) {
- nvlist_free(out);
+ if ((ret = nvlist_lookup_string(vnvl, "routermac", &s)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'routermac' value is missing from subnet",
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (EINVAL);
+ }
+ if (ether_aton_r(s,
+ (struct ether_addr *)fab->vafs_routermac) == NULL) {
+ (void) bunyan_error(files_bunyan,
+ "'routermac' is not a valid MAC address",
+ BUNYAN_T_STRING, "mac", s,
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
 return (EINVAL);
 }
- if (ether_ntoa_r(e, fname) == NULL) {
- nvlist_free(out);
+ /*
+ * XXX: Because of the quirks of javascript, representing
+ * integers > INT32_MAX in json becomes dicey. Should we
+ * just use a string instead?
+ */
+ switch (ret = nvlist_lookup_int32(vnvl, "dcid", &i32)) {
+ case 0:
+ fab->vafs_dcid = (uint32_t)i32;
+ break;
+ case ENOENT:
+ fab->vafs_dcid = vaf->vaf_dcid;
+ break;
+ default:
+ (void) bunyan_error(files_bunyan,
+ "unexpected error processing 'dcid' value",
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (ret);
+ }
+
+ switch (ret = nvlist_lookup_string(vnvl, "vid", &s)) {
+ case ENOENT:
+ fab->vafs_vnet = vaf->vaf_vnet;
+ break;
+ case 0:
+ /*
+ * Accept the value only if the whole string converted
+ * without error; this also rejects overflow (ERANGE)
+ * and trailing junk.
+ */
+ errno = 0;
+ endp = s;
+ fab->vafs_vnet = strtoul(s, &endp, 10);
+ if (errno == 0 && endp != s && *endp == '\0')
+ break;
+ ret = (errno != 0) ? errno : EINVAL;
+ (void) bunyan_error(files_bunyan,
+ "unable to parse 'vid' as a number",
+ BUNYAN_T_STRING, "vid", s,
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (ret);
+ default:
+ (void) bunyan_error(files_bunyan,
+ "unexpected error processing 'vid' value",
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_STRING, "subnet", nvpair_name(nvp),
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (ret);
+ }
+
+ /* Make sure router ip is in subnet */
+ if (!IN6_ARE_PREFIXEDADDR_EQUAL(&ip, &fab->vafs_addr,
+ fab->vafs_prefixlen)) {
+ /*
+ * Log the router address that failed the check (not
+ * the subnet address). For a v4-mapped address hand
+ * BUNYAN_T_IP the embedded IPv4 address, which is the
+ * last four bytes of the in6_addr.
+ */
+ boolean_t is_v4 = IN6_IS_ADDR_V4MAPPED(&ip) ?
+ B_TRUE : B_FALSE;
+ void *ipp = is_v4 ? (void *)&ip.s6_addr[12] :
+ (void *)&ip;
+ bunyan_type_t type = is_v4 ?
+ BUNYAN_T_IP : BUNYAN_T_IP6;
+
+ (void) bunyan_error(files_bunyan,
+ "'routerip' value is not within subnet",
+ type, "routerip", ipp,
+ BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
+ return (EINVAL);
+ }
+
+ /*
+ * Add VL2 entry for overlay router on this fabric.
+ * Use umem_zalloc so vl2->vfi_dest (UL3 address) is all zeros.
+ */
+ if ((vl2 = umem_zalloc(sizeof (*vl2), UMEM_DEFAULT)) == NULL) {
+ (void) bunyan_error(files_bunyan,
+ "out of memory", BUNYAN_T_END);
+ umem_free(fab, sizeof (*fab));
 return (ENOMEM);
 }
- if ((ret = nvlist_add_nvlist(out, fname, data)) != 0) {
- nvlist_free(out);
+ bcopy(&ip, &vl2->vfi_ip, sizeof (struct in6_addr));
+ bcopy(fab->vafs_routermac, vl2->vfi_mac, ETHERADDRL);
+ vl2->vfi_dcid = fab->vafs_dcid;
+ vl2->vfi_vnet = fab->vafs_vnet;
+ vl2->vfi_vlan = fab->vafs_vlan;
+ avl_add(&vaf->vaf_macs, vl2);
+ avl_add(&vaf->vaf_ips, vl2);
+
+ avl_add(&vaf->vaf_fabrics, fab);
+ }
+
+ return (0);
+}
+
+/*
+ * Convert the parsed configuration into the in-memory lookup trees.
+ *
+ * Every pair in data must be an nvlist.  The pair named "fabrics" (only
+ * accepted when level is 0) is handed to varpd_files_convert_fabrics().
+ * Any other pair is keyed by a MAC address and must supply "vlan", "arp"
+ * and "ip" ("port" too when the plugin destination includes a port);
+ * "ndp", "dhcp-proxy" and "vid" are optional.  Each such entry becomes a
+ * varpd_files_if_t in vaf_macs and vaf_ips, and also in vaf_ndp when it has
+ * a link-local address and belongs to our dcid.
+ *
+ * Returns 0 on success or an errno value.  Entries added before a failure
+ * stay in the trees; an entry that fails validation is freed here.
+ *
+ * NOTE(review): avl_add() does not tolerate duplicate keys, so a config
+ * that repeats a MAC or IP is not rejected gracefully here.
+ */
+static int
+varpd_files_convert_nvlist(varpd_files_t *vaf, nvlist_t *data, uint_t level)
+{
+ nvpair_t *nvp = NULL;
+ nvlist_t *nvl = NULL;
+ char *name;
+ int ret;
+
+ while ((nvp = nvlist_next_nvpair(data, nvp)) != NULL) {
+ varpd_files_if_t *ifp = NULL;
+ char *s;
+ char *endp;
+ int32_t i32;
+
+ name = nvpair_name(nvp);
+
+ (void) bunyan_debug(files_bunyan, "processing key",
+ BUNYAN_T_STRING, "key", name,
+ BUNYAN_T_END);
+
+ if (nvpair_type(nvp) != DATA_TYPE_NVLIST) {
+ (void) bunyan_error(files_bunyan,
+ "value is not a hash (nvlist)",
+ BUNYAN_T_STRING, "key", name,
+ BUNYAN_T_END);
+ return (EINVAL);
+ }
+
+ if ((ret = nvpair_value_nvlist(nvp, &nvl)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "unexpected error reading values for mac entry",
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
+ return (ret);
+ }
+
+ if (strcmp(name, "fabrics") == 0) {
+ if (level > 0) {
+ (void) bunyan_error(files_bunyan,
+ "'fabrics' can only appear at the top-most "
+ "level", BUNYAN_T_END);
+ return (EINVAL);
+ }
+ ret = varpd_files_convert_fabrics(vaf, nvp);
+ if (ret != 0) {
+ return (ret);
+ }
+ continue;
+ }
+
+ if ((ifp = umem_zalloc(sizeof (*ifp), UMEM_DEFAULT)) == NULL) {
+ (void) bunyan_error(files_bunyan,
+ "out of memory", BUNYAN_T_END);
+ return (ENOMEM);
+ }
+ ifp->vfi_dcid = vaf->vaf_dcid;
+
+ struct ether_addr *ep = (struct ether_addr *)ifp->vfi_mac;
+ if (ether_aton_r(name, ep) == NULL) {
+ (void) bunyan_error(files_bunyan, "invalid MAC address",
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
 return (EINVAL);
 }
+
+ if ((ret = nvlist_lookup_int32(nvl, "vlan", &i32)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'vlan' entry is missing",
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+ if (i32 < 0 || i32 > VLAN_ID_MAX) {
+ (void) bunyan_error(files_bunyan,
+ "vlan value is out of range (0-4094)",
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_INT32, "vlan", i32,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ERANGE);
+ }
+ ifp->vfi_vlan = (uint16_t)i32;
+
+ if ((ret = nvlist_lookup_string(nvl, "arp", &s)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'arp' entry is missing",
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+ if ((ret = str_to_ip(s, &ifp->vfi_ip, NULL)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'arp' value is not an IP address",
+ BUNYAN_T_STRING, "arp", s,
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+
+ if ((ret = nvlist_lookup_string(nvl, "ip", &s)) != 0) {
+ /*
+ * s still holds the 'arp' string here, so do not log
+ * it as the (missing) 'ip' value.
+ */
+ (void) bunyan_error(files_bunyan,
+ "'ip' entry is missing",
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+ if ((ret = str_to_ip(s, &ifp->vfi_dest.otp_ip, NULL)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'ip' value is not a IP address",
+ BUNYAN_T_STRING, "ip", s,
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+
+ if (vaf->vaf_dest & OVERLAY_PLUGIN_D_PORT) {
+ ret = nvlist_lookup_int32(nvl, "port", &i32);
+ if (ret != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'port' value is required, but is missing",
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+
+ /* Port 0 is rejected as well, hence 1-65535 */
+ if (i32 <= 0 || i32 > UINT16_MAX) {
+ (void) bunyan_error(files_bunyan,
+ "'port' value is out of range (1-65535)",
+ BUNYAN_T_INT32, "port", i32,
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ERANGE);
+ }
+ ifp->vfi_dest.otp_port = i32;
+ }
+
+ switch (ret = nvlist_lookup_string(nvl, "ndp", &s)) {
+ case 0:
+ ret = str_to_ip(s, &ifp->vfi_llocalip, NULL);
+ if (ret != 0) {
+ (void) bunyan_error(files_bunyan,
+ "'ndp' value is not an IP",
+ BUNYAN_T_STRING, "ndp", s,
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ /* ifp is not in any tree yet; don't leak it */
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+ ifp->vfi_has_lladdr = B_TRUE;
+ break;
+ case ENOENT:
+ /* Ok if missing */
+ break;
+ default:
+ (void) bunyan_error(files_bunyan,
+ "unexpected error processing 'ndp' value",
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+
+ switch (ret = nvlist_lookup_string(nvl, "dhcp-proxy", &s)) {
+ case 0:
+ ep = (struct ether_addr *)&ifp->vfi_dhcp;
+ if (ether_aton_r(s, ep) == NULL) {
+ (void) bunyan_error(files_bunyan,
+ "value of 'dhcp-proxy' is not a "
+ "MAC address",
+ BUNYAN_T_STRING, "dhcp-proxy", s,
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (EINVAL);
+ }
+ ifp->vfi_has_dhcp = B_TRUE;
+ break;
+ case ENOENT:
+ /* Ok if missing */
+ break;
+ default:
+ (void) bunyan_error(files_bunyan,
+ "unexpected error reading 'dhcp-proxy' value",
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+
+ switch (ret = nvlist_lookup_string(nvl, "vid", &s)) {
+ case ENOENT:
+ ifp->vfi_vnet = vaf->vaf_vnet;
+ break;
+ case 0:
+ /*
+ * Accept the value only if the whole string converted
+ * without error; this also rejects overflow (ERANGE)
+ * and trailing junk.
+ */
+ errno = 0;
+ endp = s;
+ ifp->vfi_vnet = strtoul(s, &endp, 10);
+ if (errno == 0 && endp != s && *endp == '\0')
+ break;
+ ret = (errno != 0) ? errno : EINVAL;
+ (void) bunyan_error(files_bunyan,
+ "unable to parse 'vid' as a number",
+ BUNYAN_T_STRING, "vid", s,
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ default:
+ (void) bunyan_error(files_bunyan,
+ "unexpected error processing 'vid' value",
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_STRING, "mac", name,
+ BUNYAN_T_END);
+ umem_free(ifp, sizeof (*ifp));
+ return (ret);
+ }
+
+ /*
+ * The entry is complete; index it. Only entries with a
+ * link-local address in our own dcid are answerable via NDP.
+ */
+ avl_add(&vaf->vaf_macs, ifp);
+ avl_add(&vaf->vaf_ips, ifp);
+ if (ifp->vfi_has_lladdr && (ifp->vfi_dcid == vaf->vaf_dcid))
+ avl_add(&vaf->vaf_ndp, ifp);
 }
- vaf->vaf_nvl = out;
 return (0);
}
@@ -163,17 +1040,29 @@ varpd_files_start(void *arg)
struct stat st;
nvlist_t *nvl;
varpd_files_t *vaf = arg;
+ nvlist_parse_json_error_t jerr = { 0 };
if (vaf->vaf_path == NULL)
return (EAGAIN);
- if ((fd = open(vaf->vaf_path, O_RDONLY)) < 0)
+ if ((fd = open(vaf->vaf_path, O_RDONLY)) < 0) {
+ (void) bunyan_error(files_bunyan,
+ "Cannot read destination data",
+ BUNYAN_T_STRING, "path", vaf->vaf_path,
+ BUNYAN_T_STRING, "errmsg", strerror(errno),
+ BUNYAN_T_END);
return (errno);
+ }
if (fstat(fd, &st) != 0) {
ret = errno;
if (close(fd) != 0)
abort();
+ (void) bunyan_error(files_bunyan,
+ "could not determine status of file (stat(2) failed)",
+ BUNYAN_T_STRING, "path", vaf->vaf_path,
+ BUNYAN_T_STRING, "errmsg", strerror(ret),
+ BUNYAN_T_END);
return (ret);
}
@@ -183,15 +1072,30 @@ varpd_files_start(void *arg)
ret = errno;
if (close(fd) != 0)
abort();
+ (void) bunyan_error(files_bunyan,
+ "could not load destination data (mmap(2) failed)",
+ BUNYAN_T_STRING, "path", vaf->vaf_path,
+ BUNYAN_T_STRING, "errmsg", strerror(errno),
+ BUNYAN_T_END);
return (ret);
}
- ret = nvlist_parse_json(maddr, st.st_size, &nvl,
- NVJSON_FORCE_INTEGER, NULL);
- if (ret == 0) {
- ret = varpd_files_normalize_nvlist(vaf, nvl);
+ if ((ret = nvlist_parse_json(maddr, st.st_size, &nvl,
+ NVJSON_FORCE_INTEGER, &jerr)) != 0) {
+ (void) bunyan_error(files_bunyan,
+ "could not parse destination JSON file",
+ BUNYAN_T_STRING, "path", vaf->vaf_path,
+ BUNYAN_T_STRING, "parse_msg", jerr.nje_message,
+ BUNYAN_T_UINT32, "pos", (uint32_t)jerr.nje_pos,
+ BUNYAN_T_INT32, "errno", (int32_t)jerr.nje_errno,
+ BUNYAN_T_STRING, "errmsg", strerror(jerr.nje_errno),
+ BUNYAN_T_END);
+ } else {
+ ret = varpd_files_convert_nvlist(vaf, nvl, 0);
nvlist_free(nvl);
+ nvl = NULL;
}
+
if (munmap(maddr, st.st_size) != 0)
abort();
if (close(fd) != 0)
@@ -204,9 +1108,38 @@ static void
varpd_files_stop(void *arg)
{
 varpd_files_t *vaf = arg;
+ varpd_files_if_t *vif;
+ varpd_files_attach_t *att;
+ varpd_files_fabric_t *fab;
+
+ /*
+ * VL2 data appears in vaf_macs and vaf_ips, and some entries are in
+ * vaf_ndp as well. Unlink each entry from every tree it can be in
+ * and free it only while emptying the last tree (vaf_macs), so that
+ * no tree is left pointing at freed memory.
+ */
+ while ((vif = avl_first(&vaf->vaf_ndp)) != NULL)
+ avl_remove(&vaf->vaf_ndp, vif);
+
+ while ((vif = avl_first(&vaf->vaf_ips)) != NULL)
+ avl_remove(&vaf->vaf_ips, vif);
+
+ while ((vif = avl_first(&vaf->vaf_macs)) != NULL) {
+ avl_remove(&vaf->vaf_macs, vif);
+ umem_free(vif, sizeof (*vif));
+ }
- nvlist_free(vaf->vaf_nvl);
- vaf->vaf_nvl = NULL;
+ /*
+ * A fabric could be unattached, and not appear in any attachment
+ * group. Therefore, remove the fabrics from all the attached groups,
+ * then free them after removing from the global list of fabrics.
+ */
+ while ((att = list_remove_head(&vaf->vaf_attached)) != NULL) {
+ while ((fab = list_remove_head(&att->vfa_fabrics)) != NULL)
+ fab->vafs_attach = NULL;
+ /* Pairs with the list_create() done when the group was built */
+ list_destroy(&att->vfa_fabrics);
+ umem_free(att, sizeof (*att));
+ }
+
+ while ((fab = avl_first(&vaf->vaf_fabrics)) != NULL) {
+ avl_remove(&vaf->vaf_fabrics, fab);
+ umem_free(fab, sizeof (*fab));
+ }
}
static void
@@ -214,114 +1147,199 @@ varpd_files_destroy(void *arg)
{
varpd_files_t *vaf = arg;
- assert(vaf->vaf_nvl == NULL);
if (vaf->vaf_path != NULL) {
umem_free(vaf->vaf_path, strlen(vaf->vaf_path) + 1);
vaf->vaf_path = NULL;
}
+
+ avl_destroy(&vaf->vaf_fabrics);
+ avl_destroy(&vaf->vaf_macs);
+ avl_destroy(&vaf->vaf_ips);
+ list_destroy(&vaf->vaf_attached);
+
umem_free(vaf, sizeof (varpd_files_t));
}
-static void
-varpd_files_lookup(void *arg, varpd_query_handle_t *qh,
- const overlay_targ_lookup_t *otl, overlay_target_point_t *otp)
+/*
+ * Search the fabrics of attachment group att for one whose subnet contains
+ * dst. Returns the first match in list order, or NULL if none of the
+ * group's subnets contains the address. vaf is not consulted.
+ */
+static varpd_files_fabric_t *
+varpd_files_find_dstfab(varpd_files_t *vaf, varpd_files_attach_t *att,
+ const struct in6_addr *dst)
{
- char macstr[ETHERADDRSTRL], *ipstr;
- nvlist_t *nvl;
- varpd_files_t *vaf = arg;
- int32_t port;
- static const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+ list_t *members = &att->vfa_fabrics;
+ varpd_files_fabric_t *cand = list_head(members);
- /* We don't support a default */
- if (otl == NULL) {
- libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
- return;
+ while (cand != NULL) {
+ if (IN6_ARE_PREFIXEDADDR_EQUAL(dst, &cand->vafs_addr,
+ cand->vafs_prefixlen))
+ return (cand);
+ cand = list_next(members, cand);
 }
- if (otl->otl_sap == ETHERTYPE_ARP) {
- libvarpd_plugin_proxy_arp(vaf->vaf_hdl, qh, otl);
- return;
- }
+ return (NULL);
+}
- if (otl->otl_sap == ETHERTYPE_IPV6 &&
- otl->otl_dstaddr[0] == 0x33 &&
- otl->otl_dstaddr[1] == 0x33) {
- libvarpd_plugin_proxy_ndp(vaf->vaf_hdl, qh, otl);
- return;
- }
+/*
+ * Find the attachment group of the fabric that src belongs to, looking only
+ * at fabrics with our own dcid and vnet id and the given vlan.
+ *
+ * Returns the fabric's group (NULL if the fabric is not attached to one),
+ * or NULL when no such fabric contains src or src is exactly a subnet
+ * address. otr is currently unused.
+ */
+static varpd_files_attach_t *
+varpd_files_find_attach(varpd_files_t *vaf, const struct in6_addr *src,
+ uint16_t vlan, overlay_target_route_t *otr)
+{
+ varpd_files_fabric_t *fab;
+ varpd_files_fabric_t lookup = {
+ .vafs_vnet = vaf->vaf_vnet,
+ .vafs_dcid = vaf->vaf_dcid,
+ .vafs_vlan = vlan,
+ .vafs_addr = *src
+ };
+ avl_index_t where = 0;
- if (otl->otl_sap == ETHERTYPE_IP &&
- bcmp(otl->otl_dstaddr, bcast, ETHERADDRL) == 0) {
- char *mac;
- struct ether_addr a, *addr;
+ /*
+ * Since fabrics are sorted by subnet address last, any given IP
+ * potentially in a fabric subnet should lie between two adjacent
+ * fabric entries in the tree. Find where such an IP would go in
+ * the tree, and the entry before the insertion point should be the
+ * fabric (if it is present).
+ */
+ fab = avl_find(&vaf->vaf_fabrics, &lookup, &where);
+ if (fab != NULL) {
+ /*
+ * Someone requested the subnet address. E.g. if the fabric
+ * is 192.168.10.0/24, someone asked for 192.168.10.0. Treat
+ * as not found.
+ */
+ return (NULL);
 }
- addr = &a;
- if (ether_ntoa_r((struct ether_addr *)otl->otl_srcaddr,
- macstr) == NULL) {
- libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
- return;
- }
+ fab = avl_nearest(&vaf->vaf_fabrics, where, AVL_BEFORE);
+ if (fab == NULL) {
+ return (NULL);
 }
- if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) {
- libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
- return;
- }
+ /*
+ * The preceding entry is only a candidate: it may belong to a
+ * different dcid, vnet or vlan (those sort ahead of the address), and
+ * even when they match the address must still lie within its subnet.
+ */
+ if (fab->vafs_dcid != lookup.vafs_dcid ||
+ fab->vafs_vnet != lookup.vafs_vnet ||
+ fab->vafs_vlan != lookup.vafs_vlan ||
+ !IN6_ARE_PREFIXEDADDR_EQUAL(&fab->vafs_addr, src,
+ fab->vafs_prefixlen)) {
+ return (NULL);
 }
- if (nvlist_lookup_string(nvl, "dhcp-proxy", &mac) != 0) {
- libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
- return;
- }
+ return (fab->vafs_attach);
+}
- if (ether_aton_r(mac, addr) == NULL) {
- libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
- return;
- }
+/*
+ * Answer an L3 (routed) lookup. The source IP and vlan select the
+ * attachment group, the destination IP selects a fabric within that group,
+ * and the destination IP is then looked up in vaf_ips using that fabric's
+ * vnet id and vlan. On success otr, otm and otp are filled in and the
+ * query is answered with VARPD_LOOKUP_OK; if any step finds nothing the
+ * query is answered with VARPD_LOOKUP_DROP.
+ */
+static void
+varpd_files_lookup_l3(varpd_files_t *vaf, varpd_query_handle_t *qh,
+ const overlay_targ_lookup_t *otl, overlay_target_point_t *otp,
+ overlay_target_route_t *otr, overlay_target_mac_t *otm)
+{
+ const struct in6_addr *dst_ip = &otl->otl_addru.otlu_l3.otl3_dstip;
+ const struct in6_addr *src_ip = &otl->otl_addru.otlu_l3.otl3_srcip;
+ varpd_files_attach_t *grp;
+ varpd_files_fabric_t *dfab;
+ varpd_files_if_t *target;
- libvarpd_plugin_proxy_dhcp(vaf->vaf_hdl, qh, otl);
- return;
- }
+ /* Which group of attached fabrics does the sender live in? */
+ grp = varpd_files_find_attach(vaf, src_ip, otl->otl_vlan, otr);
- if (ether_ntoa_r((struct ether_addr *)otl->otl_dstaddr,
- macstr) == NULL) {
+ if (grp == NULL) {
 libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
 return;
 }
- if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) {
+ dfab = varpd_files_find_dstfab(vaf, grp, dst_ip);
+ if (dfab == NULL) {
 libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
 return;
 }
- if (nvlist_lookup_int32(nvl, "port", &port) != 0) {
+ /* Search key for vaf_ips: (vnet, vlan, IP); the rest stays zero */
+ varpd_files_if_t key = {
+ .vfi_vnet = dfab->vafs_vnet,
+ .vfi_vlan = dfab->vafs_vlan,
+ .vfi_ip = *dst_ip
+ };
+
+ target = avl_find(&vaf->vaf_ips, &key, NULL);
+ if (target == NULL) {
 libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
 return;
 }
- if (port <= 0 || port > UINT16_MAX) {
+ /* Routing details come from the destination fabric ... */
+ otr->otr_vnet = dfab->vafs_vnet;
+ otr->otr_vlan = dfab->vafs_vlan;
+ bcopy(dfab->vafs_routermac, otr->otr_srcmac, ETHERADDRL);
+ otm->otm_dcid = dfab->vafs_dcid;
+
+ /* ... while the MAC and underlay destination come from the entry */
+ bcopy(target->vfi_mac, otm->otm_mac, ETHERADDRL);
+ bcopy(&target->vfi_dest, otp, sizeof (*otp));
+
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_OK);
+}
+
+/*
+ * Plugin lookup entry point. L3 requests are handed to
+ * varpd_files_lookup_l3(). For L2 requests, ARP, IPv6 multicast (33:33
+ * destination) and IPv4 broadcast frames are passed to the ARP, NDP and
+ * DHCP proxies respectively (DHCP only when the sending MAC has a
+ * "dhcp-proxy" entry); anything else is resolved by destination MAC in
+ * vaf_macs. Unresolvable requests are answered with VARPD_LOOKUP_DROP.
+ */
+static void
+varpd_files_lookup(void *arg, varpd_query_handle_t *qh,
+ const overlay_targ_lookup_t *otl, overlay_target_point_t *otp,
+ overlay_target_route_t *otr, overlay_target_mac_t *otm)
+{
+ static const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+ varpd_files_t *vaf = arg;
+ varpd_files_if_t key = { .vfi_dcid = vaf->vaf_dcid };
+ varpd_files_if_t *ent = NULL;
+
+ /* A NULL otl asks for a default target, which we don't have */
+ if (otl == NULL) {
 libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
 return;
 }
- otp->otp_port = port;
- if (nvlist_lookup_string(nvl, "ip", &ipstr) != 0) {
- libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ /* L3 lookups have a codepath of their own */
+ if (otl->otl_l3req) {
+ varpd_files_lookup_l3(vaf, qh, otl, otp, otr, otm);
 return;
 }
 /*
- * Try to parse it as a v6 address and then if it's not, try to
- * transform it into a v4 address which we'll then wrap it into a v4
- * mapped address.
+ * From here on only the traditional overlay_target_point_t is
+ * filled in, so clear otr and otm to be safe.
 */
- if (inet_pton(AF_INET6, ipstr, &otp->otp_ip) != 1) {
- uint32_t v4;
- if (inet_pton(AF_INET, ipstr, &v4) != 1) {
+ bzero(otr, sizeof (*otr));
+ bzero(otm, sizeof (*otm));
+
+ if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_ARP) {
+ libvarpd_plugin_proxy_arp(vaf->vaf_hdl, qh, otl);
+ return;
+ }
+
+ if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_IPV6 &&
+ otl->otl_addru.otlu_l2.otl2_dstaddr[0] == 0x33 &&
+ otl->otl_addru.otlu_l2.otl2_dstaddr[1] == 0x33) {
+ libvarpd_plugin_proxy_ndp(vaf->vaf_hdl, qh, otl);
+ return;
+ }
+
+ if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_IP &&
+ bcmp(otl->otl_addru.otlu_l2.otl2_dstaddr, bcast, ETHERADDRL) == 0) {
+ /* Broadcast: proxy DHCP if the sender is set up for it */
+ bcopy(otl->otl_addru.otlu_l2.otl2_srcaddr, key.vfi_mac,
+ ETHERADDRL);
+
+ ent = avl_find(&vaf->vaf_macs, &key, NULL);
+ if (ent == NULL || !ent->vfi_has_dhcp) {
 libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
 return;
 }
- IN6_IPADDR_TO_V4MAPPED(v4, &otp->otp_ip);
+
+ libvarpd_plugin_proxy_dhcp(vaf->vaf_hdl, qh, otl);
+ return;
+ }
+
+ /* Plain unicast: resolve the destination MAC within our dcid */
+ bcopy(otl->otl_addru.otlu_l2.otl2_dstaddr, key.vfi_mac, ETHERADDRL);
+ ent = avl_find(&vaf->vaf_macs, &key, NULL);
+ if (ent == NULL) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
 }
+ bcopy(&ent->vfi_dest, otp, sizeof (*otp));
+
 libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_OK);
}
@@ -344,6 +1362,7 @@ varpd_files_propinfo(void *arg, uint_t propid, varpd_prop_handle_t *vph)
libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW);
libvarpd_prop_set_type(vph, OVERLAY_PROP_T_STRING);
libvarpd_prop_set_nodefault(vph);
+
return (0);
}
@@ -361,7 +1380,6 @@ varpd_files_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep)
return (EOVERFLOW);
*sizep = len;
(void) strlcpy(buf, vaf->vaf_path, *sizep);
-
} else {
*sizep = 0;
}
@@ -457,12 +1475,17 @@ varpd_files_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
static void
varpd_files_proxy_arp(void *arg, varpd_arp_handle_t *vah, int kind,
- const struct sockaddr *sock, uint8_t *out)
+ const struct sockaddr *sock, uint16_t vlan, uint8_t *out)
{
varpd_files_t *vaf = arg;
const struct sockaddr_in *ip;
const struct sockaddr_in6 *ip6;
- nvpair_t *pair;
+ varpd_files_if_t *ifp = NULL;
+ varpd_files_if_t lookup = {
+ .vfi_vnet = vaf->vaf_vnet,
+ .vfi_dcid = vaf->vaf_dcid,
+ .vfi_vlan = vlan
+ };
if (kind != VARPD_QTYPE_ETHERNET) {
libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
@@ -476,56 +1499,23 @@ varpd_files_proxy_arp(void *arg, varpd_arp_handle_t *vah, int kind,
ip = (const struct sockaddr_in *)sock;
ip6 = (const struct sockaddr_in6 *)sock;
- for (pair = nvlist_next_nvpair(vaf->vaf_nvl, NULL); pair != NULL;
- pair = nvlist_next_nvpair(vaf->vaf_nvl, pair)) {
- char *mac, *ipstr;
- nvlist_t *data;
- struct in_addr ia;
- struct in6_addr ia6;
- struct ether_addr ether, *e;
- e = &ether;
-
- if (nvpair_type(pair) != DATA_TYPE_NVLIST)
- continue;
-
- mac = nvpair_name(pair);
- if (nvpair_value_nvlist(pair, &data) != 0)
- continue;
-
-
- if (sock->sa_family == AF_INET) {
- if (nvlist_lookup_string(data, "arp", &ipstr) != 0)
- continue;
- if (inet_pton(AF_INET, ipstr, &ia) != 1)
- continue;
-
- if (bcmp(&ia, &ip->sin_addr,
- sizeof (struct in_addr)) != 0)
- continue;
- } else {
- if (nvlist_lookup_string(data, "ndp", &ipstr) != 0)
- continue;
-
- if (inet_pton(AF_INET6, ipstr, &ia6) != 1)
- continue;
-
- if (bcmp(&ia6, &ip6->sin6_addr,
- sizeof (struct in6_addr)) != 0)
- continue;
- }
-
- if (ether_aton_r(mac, e) == NULL) {
- libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
- return;
- }
+ if (sock->sa_family == AF_INET) {
+ IN6_IPADDR_TO_V4MAPPED(ip->sin_addr.s_addr, &lookup.vfi_ip);
+ ifp = avl_find(&vaf->vaf_ips, &lookup, NULL);
+ } else {
+ bcopy(&ip6->sin6_addr, &lookup.vfi_llocalip,
+ sizeof (struct in6_addr));
+ ifp = avl_find(&vaf->vaf_ndp, &lookup, NULL);
+ }
- bcopy(e, out, ETHERADDRL);
- libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_OK);
+ if (ifp == NULL) {
+ libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
return;
}
- libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
+ bcopy(ifp->vfi_mac, out, ETHERADDRL);
+ libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_OK);
}
+/*
+ * DHCP proxy callback. Finds the interface whose MAC matches the source
+ * address of the lookup request and, if it has a DHCP proxy configured
+ * (vfi_has_dhcp), copies vfi_dhcp into out and replies VARPD_LOOKUP_OK.
+ * A non-Ethernet query type, an unknown MAC, or an interface without a
+ * DHCP proxy is answered with VARPD_LOOKUP_DROP.
+ */
static void
@@ -533,38 +1523,28 @@ varpd_files_proxy_dhcp(void *arg, varpd_dhcp_handle_t *vdh, int type,
const overlay_targ_lookup_t *otl, uint8_t *out)
{
varpd_files_t *vaf = arg;
- nvlist_t *nvl;
- char macstr[ETHERADDRSTRL], *mac;
- struct ether_addr a, *addr;
+ varpd_files_if_t *ifp = NULL;
+ varpd_files_if_t lookup = {
+ .vfi_dcid = vaf->vaf_dcid
+ };
- addr = &a;
if (type != VARPD_QTYPE_ETHERNET) {
libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP);
return;
}
- if (ether_ntoa_r((struct ether_addr *)otl->otl_srcaddr,
- macstr) == NULL) {
+ /*
+ * vfi_mac is used as an ETHERADDRL-byte buffer, so the key has to be
+ * copied in; a scalar initializer ("= *srcaddr") sets only byte 0.
+ */
+ bcopy(otl->otl_addru.otlu_l2.otl2_srcaddr, lookup.vfi_mac, ETHERADDRL);
+ if ((ifp = avl_find(&vaf->vaf_macs, &lookup, NULL)) == NULL) {
libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP);
return;
}
- if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) {
+ if (!ifp->vfi_has_dhcp) {
libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP);
return;
}
- if (nvlist_lookup_string(nvl, "dhcp-proxy", &mac) != 0) {
- libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP);
- return;
- }
-
- if (ether_aton_r(mac, addr) == NULL) {
- libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP);
- return;
- }
-
- bcopy(addr, out, ETHERADDRL);
+ bcopy(ifp->vfi_dhcp, out, ETHERADDRL);
libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_OK);
}
@@ -586,6 +1566,27 @@ static const varpd_plugin_ops_t varpd_files_ops = {
varpd_files_proxy_dhcp
};
+/*
+ * Create the plugin's bunyan logger ("files") and attach an INFO-level
+ * stream that writes to stderr. Returns 0 on success, otherwise the error
+ * from bunyan_init() or bunyan_stream_add(). On failure the logger is
+ * finalized and files_bunyan is reset to NULL.
+ */
+static int
+files_bunyan_init(void)
+{
+ int ret;
+
+ if ((ret = bunyan_init("files", &files_bunyan)) != 0)
+ return (ret);
+ ret = bunyan_stream_add(files_bunyan, "stderr", BUNYAN_L_INFO,
+ bunyan_stream_fd, (void *)STDERR_FILENO);
+ if (ret != 0) {
+ bunyan_fini(files_bunyan);
+ /* Keep the NULL check in files_bunyan_fini() meaningful. */
+ files_bunyan = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * Tear down the plugin's bunyan logger if one exists. The pointer is
+ * cleared afterwards so a second call is a no-op rather than a second
+ * bunyan_fini() on the same logger.
+ */
+static void
+files_bunyan_fini(void)
+{
+ if (files_bunyan != NULL) {
+ bunyan_fini(files_bunyan);
+ files_bunyan = NULL;
+ }
+}
+
#pragma init(varpd_files_init)
static void
varpd_files_init(void)
@@ -593,9 +1594,14 @@ varpd_files_init(void)
int err;
varpd_plugin_register_t *vpr;
+ if (files_bunyan_init() != 0)
+ return;
+
vpr = libvarpd_plugin_alloc(VARPD_CURRENT_VERSION, &err);
- if (vpr == NULL)
+ if (vpr == NULL) {
+ files_bunyan_fini();
return;
+ }
vpr->vpr_mode = OVERLAY_TARGET_DYNAMIC;
vpr->vpr_name = "files";
diff --git a/usr/src/lib/varpd/libvarpd/Makefile b/usr/src/lib/varpd/libvarpd/Makefile
index 2a4f8f070c..7fb91078e3 100644
--- a/usr/src/lib/varpd/libvarpd/Makefile
+++ b/usr/src/lib/varpd/libvarpd/Makefile
@@ -29,7 +29,8 @@ TYPELIST = \
varpd_persist_header_t \
overlay_targ_cache_entry_t \
overlay_targ_cache_t \
- overlay_targ_cache_iter_t
+ overlay_targ_cache_iter_t \
+ overlay_targ_resp_t
all := TARGET = all
clean := TARGET = clean
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd.c b/usr/src/lib/varpd/libvarpd/common/libvarpd.c
index e4460089cc..9de3602e62 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd.c
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd.c
@@ -150,6 +150,7 @@ libvarpd_instance_create(varpd_handle_t *vhp, datalink_id_t linkid,
varpd_instance_t *inst, lookup;
overlay_plugin_dest_t dest;
uint64_t vid;
+ uint32_t dcid;
/*
* We should really have our own errnos.
@@ -158,7 +159,8 @@ libvarpd_instance_create(varpd_handle_t *vhp, datalink_id_t linkid,
if (plugin == NULL)
return (ENOENT);
- if ((ret = libvarpd_overlay_info(vip, linkid, &dest, NULL, &vid)) != 0)
+ if ((ret = libvarpd_overlay_info(vip, linkid, &dest, NULL, &vid,
+ &dcid)) != 0)
return (ret);
inst = umem_alloc(sizeof (varpd_instance_t), UMEM_DEFAULT);
@@ -175,6 +177,7 @@ libvarpd_instance_create(varpd_handle_t *vhp, datalink_id_t linkid,
inst->vri_dest = dest;
inst->vri_plugin = plugin;
inst->vri_impl = vip;
+ inst->vri_dcid = dcid;
inst->vri_flags = 0;
if ((ret = plugin->vpp_ops->vpo_create((varpd_provider_handle_t *)inst,
&inst->vri_private, dest)) != 0) {
@@ -217,6 +220,13 @@ libvarpd_plugin_vnetid(varpd_provider_handle_t *vhp)
return (inst->vri_vnetid);
}
+/*
+ * Return the data center id (vri_dcid) recorded in the instance behind the
+ * given provider handle.
+ */
+uint32_t
+libvarpd_plugin_dcid(varpd_provider_handle_t *vhp)
+{
+ varpd_instance_t *inst = (varpd_instance_t *)vhp;
+ return (inst->vri_dcid);
+}
+
varpd_instance_handle_t *
libvarpd_instance_lookup(varpd_handle_t *vhp, uint64_t id)
{
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c
index df69207fe0..a32889e8a2 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c
@@ -43,7 +43,7 @@ typedef struct varpd_arp_query {
varpd_query_handle_t *vaq_query;
const overlay_targ_lookup_t *vaq_otl;
ip6_t *vaq_ip6;
- nd_neighbor_solicit_t *vaq_ns;
+ nd_neighbor_solicit_t *vaq_ns;
} varpd_arp_query_t;
typedef struct varpd_dhcp_query {
@@ -75,7 +75,7 @@ libvarpd_plugin_proxy_arp(varpd_provider_handle_t *hdl,
}
vaq->vaq_bsize = sizeof (vaq->vaq_buf);
- if (otl->otl_sap != ETHERTYPE_ARP) {
+ if (otl->otl_addru.otlu_l2.otl2_sap != ETHERTYPE_ARP) {
libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
umem_free(vaq, sizeof (varpd_arp_query_t));
return;
@@ -151,7 +151,7 @@ libvarpd_plugin_proxy_arp(varpd_provider_handle_t *hdl,
inst->vri_plugin->vpp_ops->vpo_arp(inst->vri_private,
(varpd_arp_handle_t *)vaq, VARPD_QTYPE_ETHERNET,
- (struct sockaddr *)ip, vaq->vaq_lookup);
+ (struct sockaddr *)ip, otl->otl_vlan, vaq->vaq_lookup);
}
static void
@@ -248,8 +248,8 @@ libvarpd_plugin_proxy_ndp(varpd_provider_handle_t *hdl,
}
vaq->vaq_bsize = sizeof (vaq->vaq_buf);
- if (otl->otl_dstaddr[0] != 0x33 ||
- otl->otl_dstaddr[1] != 0x33) {
+ if (otl->otl_addru.otlu_l2.otl2_dstaddr[0] != 0x33 ||
+ otl->otl_addru.otlu_l2.otl2_dstaddr[1] != 0x33) {
libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
umem_free(vaq, sizeof (varpd_arp_query_t));
return;
@@ -388,7 +388,7 @@ libvarpd_plugin_proxy_ndp(varpd_provider_handle_t *hdl,
vaq->vaq_ip6 = v6hdr;
inst->vri_plugin->vpp_ops->vpo_arp(inst->vri_private,
(varpd_arp_handle_t *)vaq, VARPD_QTYPE_ETHERNET,
- (struct sockaddr *)s6, vaq->vaq_lookup);
+ (struct sockaddr *)s6, otl->otl_vlan, vaq->vaq_lookup);
}
static void
@@ -505,13 +505,14 @@ libvarpd_plugin_proxy_dhcp(varpd_provider_handle_t *hdl,
}
vdq->vdq_bsize = sizeof (vdq->vdq_buf);
- if (otl->otl_sap != ETHERTYPE_IP) {
+ if (otl->otl_addru.otlu_l2.otl2_sap != ETHERTYPE_IP) {
libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
umem_free(vdq, sizeof (varpd_dhcp_query_t));
return;
}
- if (bcmp(otl->otl_dstaddr, libvarpd_arp_bcast, ETHERADDRL) != 0) {
+ if (bcmp(otl->otl_addru.otlu_l2.otl2_dstaddr, libvarpd_arp_bcast,
+ ETHERADDRL) != 0) {
libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
umem_free(vdq, sizeof (varpd_dhcp_query_t));
return;
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c
index 18e220259c..c6cc812dcf 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c
@@ -476,7 +476,7 @@ libvarpd_c_instance_cache_flush(varpd_client_handle_t *chp, uint64_t cid)
int
libvarpd_c_instance_cache_delete(varpd_client_handle_t *chp, uint64_t cid,
- const struct ether_addr *key)
+ uint32_t dcid, const struct ether_addr *key)
{
int ret;
varpd_client_arg_t carg;
@@ -489,6 +489,7 @@ libvarpd_c_instance_cache_delete(varpd_client_handle_t *chp, uint64_t cid,
carg.vca_command = VARPD_CLIENT_CACHE_DELETE;
carg.vca_errno = 0;
vctcap->vtca_id = cid;
+ vctcap->vtca_dcid = dcid;
bcopy(key, vctcap->vtca_key, ETHERADDRL);
ret = libvarpd_c_door_call(client, &carg, 0);
@@ -532,7 +533,8 @@ libvarpd_c_instance_cache_get(varpd_client_handle_t *chp, uint64_t cid,
int
libvarpd_c_instance_cache_set(varpd_client_handle_t *chp, uint64_t cid,
- const struct ether_addr *key, const varpd_client_cache_entry_t *entry)
+ uint32_t dcid, const struct ether_addr *key,
+ const varpd_client_cache_entry_t *entry)
{
int ret;
varpd_client_arg_t carg;
@@ -545,6 +547,7 @@ libvarpd_c_instance_cache_set(varpd_client_handle_t *chp, uint64_t cid,
carg.vca_command = VARPD_CLIENT_CACHE_SET;
carg.vca_errno = 0;
vctcap->vtca_id = cid;
+ vctcap->vtca_dcid = dcid;
bcopy(key, vctcap->vtca_key, ETHERADDRL);
bcopy(entry, &vctcap->vtca_entry, sizeof (varpd_client_cache_entry_t));
@@ -604,14 +607,17 @@ libvarpd_c_instance_cache_walk(varpd_client_handle_t *chp, uint64_t cid,
for (i = 0; i < vctwap->vtcw_count; i++) {
varpd_client_cache_entry_t ent;
+ overlay_targ_cache_entry_t *otce;
- ent.vcp_flags = vctwap->vtcw_ents[i].otce_flags;
- bcopy(vctwap->vtcw_ents[i].otce_dest.otp_mac,
- &ent.vcp_mac, ETHERADDRL);
- ent.vcp_ip = vctwap->vtcw_ents[i].otce_dest.otp_ip;
- ent.vcp_port = vctwap->vtcw_ents[i].otce_dest.otp_port;
+ otce = &vctwap->vtcw_ents[i];
+
+ ent.vcp_flags = otce->otce_flags;
+ bcopy(otce->otce_dest.otp_mac, &ent.vcp_mac,
+ ETHERADDRL);
+ ent.vcp_ip = otce->otce_dest.otp_ip;
+ ent.vcp_port = otce->otce_dest.otp_port;
ret = func(chp, cid,
- (struct ether_addr *)vctwap->vtcw_ents[i].otce_mac,
+ (struct ether_addr *)otce->otce_mac.otm_mac,
&ent, arg);
if (ret != 0) {
ret = 0;
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h
index 459711b385..335385b262 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _LIBVARPD_CLIENT_H
@@ -73,11 +73,11 @@ extern int libvarpd_c_instance_target_mode(varpd_client_handle_t *, uint64_t,
uint_t *, uint_t *);
extern int libvarpd_c_instance_cache_flush(varpd_client_handle_t *, uint64_t);
extern int libvarpd_c_instance_cache_delete(varpd_client_handle_t *, uint64_t,
- const struct ether_addr *);
+ uint32_t, const struct ether_addr *);
extern int libvarpd_c_instance_cache_get(varpd_client_handle_t *, uint64_t,
const struct ether_addr *, varpd_client_cache_entry_t *);
extern int libvarpd_c_instance_cache_set(varpd_client_handle_t *, uint64_t,
- const struct ether_addr *, const varpd_client_cache_entry_t *);
+ uint32_t, const struct ether_addr *, const varpd_client_cache_entry_t *);
typedef int (*varpd_client_cache_f)(varpd_client_handle_t *, uint64_t,
const struct ether_addr *, const varpd_client_cache_entry_t *, void *);
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c
index f684e031a8..d58445d1b7 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c
@@ -288,7 +288,7 @@ libvarpd_door_f_delete(varpd_impl_t *vip, varpd_client_arg_t *vcap,
if (ihp == NULL)
return (ENOENT);
return (libvarpd_overlay_cache_delete((varpd_instance_t *)ihp,
- vtcap->vtca_key));
+ vtcap->vtca_dcid, vtcap->vtca_key));
}
/* ARGSUSED */
@@ -321,7 +321,7 @@ libvarpd_door_f_set(varpd_impl_t *vip, varpd_client_arg_t *vcap,
return (ENOENT);
return (libvarpd_overlay_cache_set((varpd_instance_t *)ihp,
- vtcap->vtca_key, &vtcap->vtca_entry));
+ vtcap->vtca_dcid, vtcap->vtca_key, &vtcap->vtca_entry));
}
/* ARGSUSED */
@@ -337,7 +337,7 @@ libvarpd_door_f_walk(varpd_impl_t *vip, varpd_client_arg_t *vcap,
return (ENOENT);
return (libvarpd_overlay_cache_walk_fill((varpd_instance_t *)ihp,
- &vctwp->vtcw_marker, &vctwp->vtcw_count, vctwp->vtcw_ents));
+ vctwp->vtcw_marker, &vctwp->vtcw_count, vctwp->vtcw_ents));
}
static libvarpd_door_f *libvarpd_door_table[] = {
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h
index 60f0dc5fff..e18fc3c4d2 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _LIBVARPD_IMPL_H
@@ -77,6 +77,7 @@ typedef struct varpd_instance {
varpd_impl_t *vri_impl; /* RO */
varpd_plugin_t *vri_plugin; /* RO */
void *vri_private; /* RO */
+ uint32_t vri_dcid; /* RO */
mutex_t vri_lock;
varpd_instance_flags_t vri_flags; /* vri_lock */
} varpd_instance_t;
@@ -113,7 +114,7 @@ typedef struct varpd_client_propinfo_arg {
uint8_t vcfa_pad[4];
char vcfa_name[LIBVARPD_PROP_NAMELEN];
uint8_t vcfa_default[LIBVARPD_PROP_SIZEMAX];
- uint8_t vcfa_poss[LIBVARPD_PROP_SIZEMAX];
+ uint8_t vcfa_poss[LIBVARPD_PROP_SIZEMAX] __aligned(8);
} varpd_client_propinfo_arg_t;
typedef struct varpd_client_prop_arg {
@@ -137,6 +138,7 @@ typedef struct varpd_client_target_mode_arg {
typedef struct varpd_client_target_cache_arg {
uint64_t vtca_id;
+ uint32_t vtca_dcid;
uint8_t vtca_key[ETHERADDRL];
uint8_t vtca_pad[2];
varpd_client_cache_entry_t vtca_entry;
@@ -144,7 +146,7 @@ typedef struct varpd_client_target_cache_arg {
typedef struct varpd_client_target_walk_arg {
uint64_t vtcw_id;
- uint64_t vtcw_marker;
+ uint64_t vtcw_marker[2];
uint64_t vtcw_count;
overlay_targ_cache_entry_t vtcw_ents[];
} varpd_client_target_walk_arg_t;
@@ -210,7 +212,7 @@ extern int libvarpd_dirwalk(varpd_impl_t *, const char *, const char *,
extern int libvarpd_overlay_init(varpd_impl_t *);
extern void libvarpd_overlay_fini(varpd_impl_t *);
extern int libvarpd_overlay_info(varpd_impl_t *, datalink_id_t,
- overlay_plugin_dest_t *, uint64_t *, uint64_t *);
+ overlay_plugin_dest_t *, uint64_t *, uint64_t *, uint32_t *);
extern int libvarpd_overlay_associate(varpd_instance_t *);
extern int libvarpd_overlay_disassociate(varpd_instance_t *);
extern int libvarpd_overlay_degrade(varpd_instance_t *, const char *);
@@ -228,12 +230,12 @@ typedef int (*libvarpd_overlay_iter_f)(varpd_impl_t *, datalink_id_t, void *);
extern int libvarpd_overlay_iter(varpd_impl_t *, libvarpd_overlay_iter_f,
void *);
extern int libvarpd_overlay_cache_flush(varpd_instance_t *);
-extern int libvarpd_overlay_cache_delete(varpd_instance_t *, const uint8_t *);
-extern int libvarpd_overlay_cache_delete(varpd_instance_t *, const uint8_t *);
+extern int libvarpd_overlay_cache_delete(varpd_instance_t *, uint32_t,
+ const uint8_t *);
extern int libvarpd_overlay_cache_get(varpd_instance_t *, const uint8_t *,
varpd_client_cache_entry_t *);
-extern int libvarpd_overlay_cache_set(varpd_instance_t *, const uint8_t *,
- const varpd_client_cache_entry_t *);
+extern int libvarpd_overlay_cache_set(varpd_instance_t *, uint32_t,
+ const uint8_t *, const varpd_client_cache_entry_t *);
extern int libvarpd_overlay_cache_walk_fill(varpd_instance_t *, uint64_t *,
uint64_t *, overlay_targ_cache_entry_t *);
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c
index 124e3c5791..8ee12a455e 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -53,7 +53,8 @@ libvarpd_overlay_fini(varpd_impl_t *vip)
int
libvarpd_overlay_info(varpd_impl_t *vip, datalink_id_t linkid,
- overlay_plugin_dest_t *destp, uint64_t *flags, uint64_t *vnetid)
+ overlay_plugin_dest_t *destp, uint64_t *flags, uint64_t *vnetid,
+ uint32_t *dcid)
{
overlay_targ_info_t oti;
@@ -67,6 +68,8 @@ libvarpd_overlay_info(varpd_impl_t *vip, datalink_id_t linkid,
*flags = oti.oti_flags;
if (vnetid != NULL)
*vnetid = oti.oti_vnetid;
+ if (dcid != NULL)
+ *dcid = oti.oti_dcid;
return (0);
}
@@ -252,6 +255,7 @@ libvarpd_overlay_lookup_handle(varpd_impl_t *vip)
vqp = umem_cache_alloc(vip->vdi_qcache, UMEM_DEFAULT);
otl = &vqp->vq_lookup;
otr = &vqp->vq_response;
+
/*
* abort doesn't really help here that much, maybe we can instead try
* and for a reap or something?
@@ -280,7 +284,8 @@ libvarpd_overlay_lookup_handle(varpd_impl_t *vip)
vqp->vq_instance = inst;
inst->vri_plugin->vpp_ops->vpo_lookup(inst->vri_private,
- (varpd_query_handle_t *)vqp, otl, &otr->otr_answer);
+ (varpd_query_handle_t *)vqp, otl, &otr->otr_answer,
+ &otr->otr_route, &otr->otr_mac);
}
void
@@ -387,7 +392,8 @@ libvarpd_overlay_cache_flush(varpd_instance_t *inst)
}
int
-libvarpd_overlay_cache_delete(varpd_instance_t *inst, const uint8_t *key)
+libvarpd_overlay_cache_delete(varpd_instance_t *inst, uint32_t dcid,
+ const uint8_t *key)
{
int ret;
overlay_targ_cache_t cache;
@@ -395,7 +401,8 @@ libvarpd_overlay_cache_delete(varpd_instance_t *inst, const uint8_t *key)
bzero(&cache, sizeof (overlay_targ_cache_t));
cache.otc_linkid = inst->vri_linkid;
- bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL);
+ cache.otc_entry.otce_mac.otm_dcid = dcid;
+ bcopy(key, cache.otc_entry.otce_mac.otm_mac, ETHERADDRL);
ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_REMOVE, &cache);
if (ret != 0 && errno == EFAULT)
@@ -412,12 +419,11 @@ libvarpd_overlay_cache_get(varpd_instance_t *inst, const uint8_t *key,
varpd_client_cache_entry_t *entry)
{
int ret;
- overlay_targ_cache_t cache;
+ overlay_targ_cache_t cache = { 0 };
varpd_impl_t *vip = inst->vri_impl;
- bzero(&cache, sizeof (overlay_targ_cache_t));
cache.otc_linkid = inst->vri_linkid;
- bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL);
+ bcopy(key, cache.otc_entry.otce_mac.otm_mac, ETHERADDRL);
ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_GET, &cache);
if (ret != 0 && errno == EFAULT)
@@ -434,16 +440,16 @@ libvarpd_overlay_cache_get(varpd_instance_t *inst, const uint8_t *key,
}
int
-libvarpd_overlay_cache_set(varpd_instance_t *inst, const uint8_t *key,
- const varpd_client_cache_entry_t *entry)
+libvarpd_overlay_cache_set(varpd_instance_t *inst, uint32_t dcid,
+ const uint8_t *key, const varpd_client_cache_entry_t *entry)
{
int ret;
- overlay_targ_cache_t cache;
+ overlay_targ_cache_t cache = { 0 };
varpd_impl_t *vip = inst->vri_impl;
- bzero(&cache, sizeof (overlay_targ_cache_t));
cache.otc_linkid = inst->vri_linkid;
- bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL);
+ cache.otc_entry.otce_mac.otm_dcid = dcid;
+ bcopy(key, cache.otc_entry.otce_mac.otm_mac, ETHERADDRL);
bcopy(&entry->vcp_mac, cache.otc_entry.otce_dest.otp_mac, ETHERADDRL);
cache.otc_entry.otce_flags = entry->vcp_flags;
cache.otc_entry.otce_dest.otp_ip = entry->vcp_ip;
@@ -477,7 +483,8 @@ libvarpd_overlay_cache_walk_fill(varpd_instance_t *inst, uint64_t *markerp,
return (ENOMEM);
iter->otci_linkid = inst->vri_linkid;
- iter->otci_marker = *markerp;
+ iter->otci_marker[0] = markerp[0];
+ iter->otci_marker[1] = markerp[1];
iter->otci_count = *countp;
ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_ITER, iter);
if (ret != 0 && errno == EFAULT)
@@ -487,7 +494,8 @@ libvarpd_overlay_cache_walk_fill(varpd_instance_t *inst, uint64_t *markerp,
goto out;
}
- *markerp = iter->otci_marker;
+ markerp[0] = iter->otci_marker[0];
+ markerp[1] = iter->otci_marker[1];
*countp = iter->otci_count;
bcopy(iter->otci_ents, ents,
*countp * sizeof (overlay_targ_cache_entry_t));
@@ -523,18 +531,20 @@ libvarpd_inject_varp(varpd_provider_handle_t *vph, const uint8_t *mac,
const overlay_target_point_t *otp)
{
int ret;
- overlay_targ_cache_t otc;
+ overlay_targ_cache_t otc = { 0 };
varpd_instance_t *inst = (varpd_instance_t *)vph;
varpd_impl_t *vip = inst->vri_impl;
if (otp == NULL) {
- (void) libvarpd_overlay_cache_delete(inst, mac);
+ (void) libvarpd_overlay_cache_delete(inst, 0, mac);
return;
}
otc.otc_linkid = inst->vri_linkid;
otc.otc_entry.otce_flags = 0;
- bcopy(mac, otc.otc_entry.otce_mac, ETHERADDRL);
+ if (IN6_IS_ADDR_UNSPECIFIED(&otp->otp_ip) && otp->otp_port == 0)
+ otc.otc_entry.otce_flags |= OVERLAY_TARGET_CACHE_ROUTER;
+ bcopy(mac, otc.otc_entry.otce_mac.otm_mac, ETHERADDRL);
bcopy(otp, &otc.otc_entry.otce_dest, sizeof (overlay_target_point_t));
ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_SET, &otc);
@@ -552,6 +562,34 @@ libvarpd_inject_varp(varpd_provider_handle_t *vph, const uint8_t *mac,
}
+/*
+ * Ask the overlay device to remove cached entries for a route. The route
+ * is described by source and destination addresses (each read as
+ * sizeof (in6_addr_t) bytes), their prefix lengths and a VLAN id, and is
+ * handed to the kernel through the OVERLAY_TARG_CACHE_REMOVE_NET ioctl on
+ * the instance's link. Any ioctl failure is treated as fatal.
+ */
void
+libvarpd_route_flush(varpd_provider_handle_t *vph, uint8_t *srcip,
+ uint8_t *dstip, uint8_t src_prefixlen, uint8_t dst_prefixlen,
+ uint16_t vlan_id)
+{
+ varpd_instance_t *inst = (varpd_instance_t *)vph;
+ varpd_impl_t *vip = inst->vri_impl;
+ /* Zeroed so no uninitialized stack bytes are passed to the ioctl. */
+ overlay_targ_cache_net_t otcn = { 0 };
+ overlay_targ_cache_net_entry_t *otcne;
+ int ret;
+
+ otcn.otcn_linkid = inst->vri_linkid;
+ otcne = &otcn.otcn_entry;
+ bcopy(srcip, &otcne->otcne_src, sizeof (in6_addr_t));
+ bcopy(dstip, &otcne->otcne_dst, sizeof (in6_addr_t));
+ otcne->otcne_vlan = vlan_id;
+ otcne->otcne_src_prefixlen = src_prefixlen;
+ otcne->otcne_dst_prefixlen = dst_prefixlen;
+
+ ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_REMOVE_NET, &otcn);
+ if (ret != 0) {
+ /* XXX KEBE ASKS, any harmless error cases? */
+ libvarpd_panic("received bad errno from "
+ "OVERLAY_TARG_CACHE_REMOVE_NET: %d - %s", errno,
+ strerror(errno));
+ }
+}
+
+void
libvarpd_fma_degrade(varpd_provider_handle_t *vph, const char *msg)
{
int ret;
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c
index 27cc802a9c..f8b1fcedfc 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc. All rights reserved.
*/
/*
@@ -281,7 +281,7 @@ libvarpd_persist_restore_instance(varpd_impl_t *vip, nvlist_t *nvl)
int err;
nvlist_t *pvl;
uint64_t id, flags, vid;
- uint32_t linkid, dest, mode;
+ uint32_t linkid, dest, mode, dcid;
char *pluginstr;
varpd_plugin_t *plugin;
overlay_plugin_dest_t adest;
@@ -312,7 +312,8 @@ libvarpd_persist_restore_instance(varpd_impl_t *vip, nvlist_t *nvl)
if (plugin->vpp_mode != mode)
return (EINVAL);
- if (libvarpd_overlay_info(vip, linkid, &adest, &flags, &vid) != 0)
+ if (libvarpd_overlay_info(vip, linkid, &adest, &flags, &vid,
+ &dcid) != 0)
return (EINVAL);
if (dest != adest)
@@ -334,6 +335,7 @@ libvarpd_persist_restore_instance(varpd_impl_t *vip, nvlist_t *nvl)
inst->vri_dest = dest;
inst->vri_plugin = plugin;
inst->vri_impl = vip;
+ inst->vri_dcid = dcid;
inst->vri_flags = 0;
if (plugin->vpp_ops->vpo_restore(pvl, (varpd_provider_handle_t *)inst,
dest, &inst->vri_private) != 0) {
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h
index 64fa99d308..ab198919d7 100644
--- a/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _LIBVARPD_PROVIDER_H
@@ -315,11 +315,12 @@ typedef void (*varpd_plugin_destroy_f)(void *);
#define VARPD_LOOKUP_DROP (-1)
typedef int (*varpd_plugin_default_f)(void *, overlay_target_point_t *);
typedef void (*varpd_plugin_lookup_f)(void *, varpd_query_handle_t *,
- const overlay_targ_lookup_t *, overlay_target_point_t *);
+ const overlay_targ_lookup_t *, overlay_target_point_t *,
+ overlay_target_route_t *, overlay_target_mac_t *);
#define VARPD_QTYPE_ETHERNET 0x0
typedef void (*varpd_plugin_arp_f)(void *, varpd_arp_handle_t *, int,
- const struct sockaddr *, uint8_t *);
+ const struct sockaddr *, uint16_t, uint8_t *);
typedef void (*varpd_plugin_dhcp_f)(void *, varpd_dhcp_handle_t *, int,
const overlay_targ_lookup_t *, uint8_t *);
@@ -373,6 +374,7 @@ extern const bunyan_logger_t *libvarpd_plugin_bunyan(varpd_provider_handle_t *);
* Misc. Information APIs
*/
extern uint64_t libvarpd_plugin_vnetid(varpd_provider_handle_t *);
+extern uint32_t libvarpd_plugin_dcid(varpd_provider_handle_t *);
/*
* Lookup Replying query and proxying
@@ -411,6 +413,8 @@ extern void libvarpd_inject_arp(varpd_provider_handle_t *, const uint16_t,
const uint8_t *, const struct in_addr *, const uint8_t *);
extern void libvarpd_fma_degrade(varpd_provider_handle_t *, const char *);
extern void libvarpd_fma_restore(varpd_provider_handle_t *);
+extern void libvarpd_route_flush(varpd_provider_handle_t *, uint8_t *,
+ uint8_t *, uint8_t, uint8_t, uint16_t vlan_id);
#ifdef __cplusplus
}
diff --git a/usr/src/lib/varpd/libvarpd/common/mapfile-plugin b/usr/src/lib/varpd/libvarpd/common/mapfile-plugin
index 8cef7f669f..f51dfd9129 100644
--- a/usr/src/lib/varpd/libvarpd/common/mapfile-plugin
+++ b/usr/src/lib/varpd/libvarpd/common/mapfile-plugin
@@ -10,7 +10,7 @@
#
#
-# Copyright 2015 Joyent, Inc.
+# Copyright 2018 Joyent, Inc.
#
#
@@ -39,6 +39,7 @@ SYMBOL_SCOPE {
libvarpd_panic { FLAGS = EXTERN };
libvarpd_plugin_alloc { FLAGS = EXTERN };
libvarpd_plugin_arp_reply { FLAGS = EXTERN };
+ libvarpd_plugin_dcid { FLAGS = EXTERN };
libvarpd_plugin_dhcp_reply { FLAGS = EXTERN };
libvarpd_plugin_free { FLAGS = EXTERN };
libvarpd_plugin_proxy_arp { FLAGS = EXTERN };
@@ -54,4 +55,5 @@ SYMBOL_SCOPE {
libvarpd_prop_set_nodefault { FLAGS = EXTERN };
libvarpd_prop_set_range_uint32 { FLAGS = EXTERN };
libvarpd_prop_set_rangestr { FLAGS = EXTERN };
+ libvarpd_route_flush { FLAGS = EXTERN };
};
diff --git a/usr/src/lib/varpd/libvarpd/common/mapfile-vers b/usr/src/lib/varpd/libvarpd/common/mapfile-vers
index 7aa930cb54..3eb74972e5 100644
--- a/usr/src/lib/varpd/libvarpd/common/mapfile-vers
+++ b/usr/src/lib/varpd/libvarpd/common/mapfile-vers
@@ -10,7 +10,7 @@
#
#
-# Copyright 2015 Joyent, Inc.
+# Copyright 2018 Joyent, Inc.
#
#
@@ -65,6 +65,8 @@ SYMBOL_VERSION SUNWprivate {
libvarpd_inject_varp;
libvarpd_inject_arp;
+ libvarpd_route_flush;
+
libvarpd_instance_activate;
libvarpd_instance_create;
libvarpd_instance_destroy;
@@ -82,6 +84,7 @@ SYMBOL_VERSION SUNWprivate {
libvarpd_plugin_free;
libvarpd_plugin_arp_reply;
libvarpd_plugin_dhcp_reply;
+ libvarpd_plugin_dcid;
libvarpd_plugin_query_reply;
libvarpd_plugin_proxy_arp;
libvarpd_plugin_proxy_dhcp;
diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp.c b/usr/src/lib/varpd/svp/common/libvarpd_svp.c
index 58828065a1..1e9ea979d7 100644
--- a/usr/src/lib/varpd/svp/common/libvarpd_svp.c
+++ b/usr/src/lib/varpd/svp/common/libvarpd_svp.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2018, Joyent, Inc.
*/
/*
@@ -217,20 +217,24 @@
* |
* v Socket Error,
* +----------------+ still in DNS
- * +----------------<---| SVP_CS_INITIAL |<----------------------*-----+
- * | +----------------+ |
- * | System | |
- * | Connection . . . . . success * Successful |
- * | failed . | connect() |
- * | +----*---------+ | +-----------*--+ |
- * | | | | | | |
- * | V ^ v ^ V ^
- * | +----------------+ +-------------------+ +---------------+
- * +<-| SVP_CS_BACKOFF | | SVP_CS_CONNECTING | | SVP_CS_ACTIVE |
- * | +----------------+ +-------------------+ +---------------+
- * | V ^ V V V
- * | Backoff wait * | | | * Removed
- * v interval +--------------+ +-----------------<-----+ | from DNS
+ * +----------------<---| SVP_CS_INITIAL |<----------------------*--------+
+ * | +----------------+ |
+ * | System | |
+ * | Connection . . . . . success * Successful |
+ * | failed . | connect() |
+ * | . | +-------------------+ |
+ * | +----*---------+ | +-*>| SVP_CS_VERSIONING + |
+ * | | | | | +-------------------+ |
+ * | | | | | V V Set version |
+ * | | | | | | * based on |
+ * | | | | | | | SVP_R_PONG |
+ * | V ^ v ^ | V ^
+ * | +----------------+ +-------------------+ | +---------------+
+ * +<-| SVP_CS_BACKOFF | | SVP_CS_CONNECTING | | | SVP_CS_ACTIVE |
+ * | +----------------+ +-------------------+ | +---------------+
+ * | V ^ V | V V
+ * | Backoff wait * | | | | * Removed
+ * v interval +--------------+ +-----------------<+----+ | from DNS
* | finished | |
* | V |
* | | V
@@ -311,7 +315,7 @@
*
* The shoot down information needs to be done on a per-backend basis. The
* general design is that we'll have a single query for this which can fire on a
- * 5-10s period, we randmoize the latter part to give us a bit more load
+ * 5-10s period, we randomize the latter part to give us a bit more load
* spreading. If we complete because there's no work to do, then we wait the
* normal period. If we complete, but there's still work to do, we'll go again
* after a second.
@@ -360,7 +364,8 @@ static umem_cache_t *svp_lookup_cache;
typedef enum svp_lookup_type {
SVP_L_UNKNOWN = 0x0,
SVP_L_VL2 = 0x1,
- SVP_L_VL3 = 0x2
+ SVP_L_VL3 = 0x2,
+ SVP_L_ROUTE = 0x3
} svp_lookup_type_t;
typedef struct svp_lookup {
@@ -374,6 +379,12 @@ typedef struct svp_lookup {
varpd_arp_handle_t *svl_vah;
uint8_t *svl_out;
} svl_vl3;
+ struct svl_lookup_route {
+ varpd_query_handle_t *svl_handle;
+ overlay_target_point_t *svl_point;
+ overlay_target_route_t *svl_route;
+ overlay_target_mac_t *svl_mac;
+ } svl_route;
} svl_u;
svp_query_t svl_query;
} svp_lookup_t;
@@ -382,7 +393,9 @@ static const char *varpd_svp_props[] = {
"svp/host",
"svp/port",
"svp/underlay_ip",
- "svp/underlay_port"
+ "svp/underlay_port",
+ "svp/dcid",
+ "svp/router_oui"
};
static const uint8_t svp_bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
@@ -429,7 +442,8 @@ static void
svp_vl3_lookup_cb(svp_t *svp, svp_status_t status, const uint8_t *vl2mac,
const struct in6_addr *uip, const uint16_t uport, void *arg)
{
- overlay_target_point_t point;
+ /* Initialize address-holders to 0 for comparisons-to-zeroes later. */
+ overlay_target_point_t point = { 0 };
svp_lookup_t *svl = arg;
assert(svp != NULL);
@@ -486,17 +500,67 @@ svp_shootdown_cb(svp_t *svp, const uint8_t *vl2mac, const struct in6_addr *uip,
const uint16_t uport)
{
/*
- * We should probably do a conditional invlaidation here.
+ * We should probably do a conditional invalidation here.
*/
libvarpd_inject_varp(svp->svp_hdl, vl2mac, NULL);
}
+static void
+svp_route_lookup_cb(svp_t *svp, svp_status_t status, uint32_t dcid,
+ uint32_t vnetid, uint16_t vlan, uint8_t *srcmac, uint8_t *dstmac,
+ uint16_t ul3_port, uint8_t *ul3_addr, uint8_t srcpfx, uint8_t dstpfx,
+ void *arg)
+{
+ svp_lookup_t *svl = arg;
+ overlay_target_point_t *otp;
+ overlay_target_route_t *otr;
+ overlay_target_mac_t *otm;
+
+ if (status != SVP_S_OK) {
+ libvarpd_plugin_query_reply(svl->svl_u.svl_route.svl_handle,
+ VARPD_LOOKUP_DROP);
+ umem_cache_free(svp_lookup_cache, svl);
+ return;
+ }
+
+ otp = svl->svl_u.svl_route.svl_point;
+ bcopy(ul3_addr, &otp->otp_ip, sizeof (struct in6_addr));
+ otp->otp_port = ul3_port;
+
+ otr = svl->svl_u.svl_route.svl_route;
+ otr->otr_vnet = vnetid;
+ otr->otr_vlan = vlan;
+ bcopy(srcmac, otr->otr_srcmac, ETHERADDRL);
+
+ otm = svl->svl_u.svl_route.svl_mac;
+ otm->otm_dcid = dcid;
+ bcopy(dstmac, otm->otm_mac, ETHERADDRL);
+
+ libvarpd_plugin_query_reply(svl->svl_u.svl_route.svl_handle,
+ VARPD_LOOKUP_OK);
+ umem_cache_free(svp_lookup_cache, svl);
+}
+
+/*
+ * Tell the overlay instance to flush out entries matching this route.
+ * See libvarpd_route_flush() for more.
+ */
+static void
+svp_route_shootdown_cb(svp_t *svp, uint8_t *srcip, uint8_t *dstip,
+ uint8_t src_prefixlen, uint8_t dst_prefixlen, uint16_t vlan_id)
+{
+ libvarpd_route_flush(svp->svp_hdl, srcip, dstip, src_prefixlen,
+ dst_prefixlen, vlan_id);
+}
+
static svp_cb_t svp_defops = {
svp_vl2_lookup_cb,
svp_vl3_lookup_cb,
svp_vl2_invalidate_cb,
svp_vl3_inject_cb,
- svp_shootdown_cb
+ svp_shootdown_cb,
+ svp_route_lookup_cb,
+ svp_route_shootdown_cb
};
static boolean_t
@@ -587,23 +651,89 @@ varpd_svp_destroy(void *arg)
}
static void
+varpd_svp_lookup_l3(svp_t *svp, varpd_query_handle_t *vqh,
+ const overlay_targ_lookup_t *otl, overlay_target_point_t *otp,
+ overlay_target_route_t *otr, overlay_target_mac_t *otm)
+{
+ svp_lookup_t *slp;
+ /* uint32_t type; */
+ const struct in6_addr *src = &otl->otl_addru.otlu_l3.otl3_srcip,
+ *dst = &otl->otl_addru.otlu_l3.otl3_dstip;
+
+ /*
+ * otl is an L3 request, so we have src/dst IPs for the inner packet.
+ * We also have the vlan.
+ *
+ * Assume kernel's overlay module is caching well, so we are directly
+ * going to query (i.e. no caching up here of actual destinations).
+ *
+ * We use our existing remote server (svp_remote), but with the new message
+ * SVP_R_ROUTE_REQ.
+ */
+
+ if (IN6_IS_ADDR_V4MAPPED(src)) {
+ if (!IN6_IS_ADDR_V4MAPPED(dst)) {
+ libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+ return;
+ }
+ /* type = SVP_VL3_IP; */
+ } else {
+ if (IN6_IS_ADDR_V4MAPPED(dst)) {
+ libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+ return;
+ }
+ /* type = SVP_VL3_IPV6; */
+ }
+
+ slp = umem_cache_alloc(svp_lookup_cache, UMEM_DEFAULT);
+ if (slp == NULL) {
+ libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ slp->svl_type = SVP_L_ROUTE;
+ slp->svl_u.svl_route.svl_handle = vqh;
+ slp->svl_u.svl_route.svl_point = otp;
+ slp->svl_u.svl_route.svl_route = otr;
+ slp->svl_u.svl_route.svl_mac = otm;
+
+ svp_remote_route_lookup(svp, &slp->svl_query, src, dst,
+ otl->otl_vnetid, (uint16_t)otl->otl_vlan, slp);
+}
+
+static void
varpd_svp_lookup(void *arg, varpd_query_handle_t *vqh,
- const overlay_targ_lookup_t *otl, overlay_target_point_t *otp)
+ const overlay_targ_lookup_t *otl, overlay_target_point_t *otp,
+ overlay_target_route_t *otr, overlay_target_mac_t *otm)
{
svp_lookup_t *slp;
svp_t *svp = arg;
/*
+ * Shuffle off L3 lookups to their own codepath.
+ */
+ if (otl->otl_l3req) {
+ varpd_svp_lookup_l3(svp, vqh, otl, otp, otr, otm);
+ return;
+ }
+ /*
+ * At this point, the traditional overlay_target_point_t is all that
+ * needs filling in. Zero-out the otr for safety.
+ */
+ bzero(otr, sizeof (*otr));
+
+
+ /*
* Check if this is something that we need to proxy, eg. arp or ndp.
*/
- if (otl->otl_sap == ETHERTYPE_ARP) {
+ if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_ARP) {
libvarpd_plugin_proxy_arp(svp->svp_hdl, vqh, otl);
return;
}
- if (otl->otl_dstaddr[0] == 0x33 &&
- otl->otl_dstaddr[1] == 0x33) {
- if (otl->otl_sap == ETHERTYPE_IPV6) {
+ if (otl->otl_addru.otlu_l2.otl2_dstaddr[0] == 0x33 &&
+ otl->otl_addru.otlu_l2.otl2_dstaddr[1] == 0x33) {
+ if (otl->otl_addru.otlu_l2.otl2_sap == ETHERTYPE_IPV6) {
libvarpd_plugin_proxy_ndp(svp->svp_hdl, vqh, otl);
} else {
libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
@@ -617,8 +747,9 @@ varpd_svp_lookup(void *arg, varpd_query_handle_t *vqh,
* handle broadcast and if the multicast bit is set, lowest bit of the
* first octet of the MAC, then we drop it now.
*/
- if (bcmp(otl->otl_dstaddr, svp_bcast, ETHERADDRL) == 0 ||
- (otl->otl_dstaddr[0] & 0x01) == 0x01) {
+ if (bcmp(otl->otl_addru.otlu_l2.otl2_dstaddr, svp_bcast,
+ ETHERADDRL) == 0 ||
+ (otl->otl_addru.otlu_l2.otl2_dstaddr[0] & 0x01) == 0x01) {
libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
return;
}
@@ -639,7 +770,8 @@ varpd_svp_lookup(void *arg, varpd_query_handle_t *vqh,
slp->svl_u.svl_vl2.svl_handle = vqh;
slp->svl_u.svl_vl2.svl_point = otp;
- svp_remote_vl2_lookup(svp, &slp->svl_query, otl->otl_dstaddr, slp);
+ svp_remote_vl2_lookup(svp, &slp->svl_query,
+ otl->otl_addru.otlu_l2.otl2_dstaddr, slp);
}
/* ARGSUSED */
@@ -687,6 +819,21 @@ varpd_svp_propinfo(void *arg, uint_t propid, varpd_prop_handle_t *vph)
sizeof (svp_defuport));
libvarpd_prop_set_range_uint32(vph, 1, UINT16_MAX);
break;
+ case 4:
+ /* svp/dcid */
+ libvarpd_prop_set_name(vph, varpd_svp_props[4]);
+ libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW);
+ libvarpd_prop_set_type(vph, OVERLAY_PROP_T_UINT);
+ libvarpd_prop_set_nodefault(vph);
+ libvarpd_prop_set_range_uint32(vph, 1, UINT32_MAX - 1);
+ break;
+ case 5:
+ /* svp/router_oui */
+ libvarpd_prop_set_name(vph, varpd_svp_props[5]);
+ libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW);
+ libvarpd_prop_set_type(vph, OVERLAY_PROP_T_ETHER);
+ libvarpd_prop_set_nodefault(vph);
+ break;
default:
return (EINVAL);
}
@@ -733,14 +880,13 @@ varpd_svp_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep)
bcopy(&val, buf, sizeof (uint64_t));
*sizep = sizeof (uint64_t);
}
-
mutex_exit(&svp->svp_lock);
return (0);
}
/* svp/underlay_ip */
if (strcmp(pname, varpd_svp_props[2]) == 0) {
- if (*sizep > sizeof (struct in6_addr))
+ if (*sizep < sizeof (struct in6_addr))
return (EOVERFLOW);
mutex_enter(&svp->svp_lock);
if (svp->svp_huip == B_FALSE) {
@@ -749,6 +895,7 @@ varpd_svp_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep)
bcopy(&svp->svp_uip, buf, sizeof (struct in6_addr));
*sizep = sizeof (struct in6_addr);
}
+ mutex_exit(&svp->svp_lock);
return (0);
}
@@ -772,6 +919,42 @@ varpd_svp_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep)
return (0);
}
+ /* svp/dcid; a DCID of 0 means it has not been set. */
+ if (strcmp(pname, varpd_svp_props[4]) == 0) {
+ uint64_t val;
+
+ if (*sizep < sizeof (uint64_t))
+ return (EOVERFLOW);
+
+ mutex_enter(&svp->svp_lock);
+ if (svp->svp_dcid == 0) {
+ *sizep = 0;
+ } else {
+ val = svp->svp_dcid;
+ bcopy(&val, buf, sizeof (uint64_t));
+ *sizep = sizeof (uint64_t);
+ }
+
+ mutex_exit(&svp->svp_lock);
+ return (0);
+ }
+
+ /* svp/router_oui */
+ if (strcmp(pname, varpd_svp_props[5]) == 0) {
+ if (*sizep < ETHERADDRL)
+ return (EOVERFLOW);
+ mutex_enter(&svp->svp_lock);
+
+ if (ether_is_zero(&svp->svp_router_oui)) {
+ *sizep = 0;
+ } else {
+ bcopy(&svp->svp_router_oui, buf, ETHERADDRL);
+ *sizep = ETHERADDRL;
+ }
+
+ mutex_exit(&svp->svp_lock);
+ return (0);
+ }
return (EINVAL);
}
@@ -857,6 +1040,36 @@ varpd_svp_setprop(void *arg, const char *pname, const void *buf,
return (0);
}
+ /* svp/dcid */
+ if (strcmp(pname, varpd_svp_props[4]) == 0) {
+ const uint64_t *valp = buf;
+ if (size < sizeof (uint64_t))
+ return (EOVERFLOW);
+
+ if (*valp == 0 || *valp > UINT32_MAX - 1)
+ return (EINVAL);
+
+ mutex_enter(&svp->svp_lock);
+ svp->svp_dcid = (uint32_t)*valp;
+ mutex_exit(&svp->svp_lock);
+
+ return (0);
+ }
+
+ /* svp/router_oui */
+ if (strcmp(pname, varpd_svp_props[5]) == 0) {
+ if (size < ETHERADDRL)
+ return (EOVERFLOW);
+ mutex_enter(&svp->svp_lock);
+ bcopy(buf, &svp->svp_router_oui, ETHERADDRL);
+ /* Zero-out the low three bytes. */
+ svp->svp_router_oui[3] = 0;
+ svp->svp_router_oui[4] = 0;
+ svp->svp_router_oui[5] = 0;
+ mutex_exit(&svp->svp_lock);
+ return (0);
+ }
+
return (EINVAL);
}
@@ -867,6 +1080,7 @@ varpd_svp_save(void *arg, nvlist_t *nvp)
svp_t *svp = arg;
mutex_enter(&svp->svp_lock);
+ /* svp/host */
if (svp->svp_host != NULL) {
if ((ret = nvlist_add_string(nvp, varpd_svp_props[0],
svp->svp_host)) != 0) {
@@ -875,6 +1089,7 @@ varpd_svp_save(void *arg, nvlist_t *nvp)
}
}
+ /* svp/port */
if (svp->svp_port != 0) {
if ((ret = nvlist_add_uint16(nvp, varpd_svp_props[1],
svp->svp_port)) != 0) {
@@ -883,6 +1098,7 @@ varpd_svp_save(void *arg, nvlist_t *nvp)
}
}
+ /* svp/underlay_ip */
if (svp->svp_huip == B_TRUE) {
char buf[INET6_ADDRSTRLEN];
@@ -898,6 +1114,7 @@ varpd_svp_save(void *arg, nvlist_t *nvp)
}
}
+ /* svp/underlay_port */
if (svp->svp_uport != 0) {
if ((ret = nvlist_add_uint16(nvp, varpd_svp_props[3],
svp->svp_uport)) != 0) {
@@ -906,6 +1123,32 @@ varpd_svp_save(void *arg, nvlist_t *nvp)
}
}
+ /* svp/dcid */
+ if (svp->svp_dcid != 0) {
+ if ((ret = nvlist_add_uint32(nvp, varpd_svp_props[4],
+ svp->svp_dcid)) != 0) {
+ mutex_exit(&svp->svp_lock);
+ return (ret);
+ }
+ }
+
+ /* svp/router_oui */
+ if (!ether_is_zero(&svp->svp_router_oui)) {
+ char buf[ETHERADDRSTRL];
+
+ if (ether_ntoa_r((struct ether_addr *)&svp->svp_router_oui,
+ buf) == NULL) {
+ libvarpd_panic("unexpected ether_ntoa_r failure: %d",
+ errno);
+ }
+
+ if ((ret = nvlist_add_string(nvp, varpd_svp_props[5],
+ buf)) != 0) {
+ mutex_exit(&svp->svp_lock);
+ return (ret);
+ }
+ }
+
mutex_exit(&svp->svp_lock);
return (0);
}
@@ -916,7 +1159,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
{
int ret;
svp_t *svp;
- char *ipstr, *hstr;
+ char *ipstr, *hstr, *etherstr;
if (varpd_svp_valid_dest(dest) == B_FALSE)
return (ENOTSUP);
@@ -924,6 +1167,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
if ((ret = varpd_svp_create(hdl, (void **)&svp, dest)) != 0)
return (ret);
+ /* svp/host */
if ((ret = nvlist_lookup_string(nvp, varpd_svp_props[0],
&hstr)) != 0) {
if (ret != ENOENT) {
@@ -937,6 +1181,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
(void) strlcpy(svp->svp_host, hstr, blen);
}
+ /* svp/port */
if ((ret = nvlist_lookup_uint16(nvp, varpd_svp_props[1],
&svp->svp_port)) != 0) {
if (ret != ENOENT) {
@@ -946,6 +1191,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
svp->svp_port = 0;
}
+ /* svp/underlay_ip */
if ((ret = nvlist_lookup_string(nvp, varpd_svp_props[2],
&ipstr)) != 0) {
if (ret != ENOENT) {
@@ -968,6 +1214,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
svp->svp_huip = B_TRUE;
}
+ /* svp/underlay_port */
if ((ret = nvlist_lookup_uint16(nvp, varpd_svp_props[3],
&svp->svp_uport)) != 0) {
if (ret != ENOENT) {
@@ -977,6 +1224,29 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
svp->svp_uport = 0;
}
+ /* svp/dcid */
+ if ((ret = nvlist_lookup_uint32(nvp, varpd_svp_props[4],
+ &svp->svp_dcid)) != 0) {
+ if (ret != ENOENT) {
+ varpd_svp_destroy(svp);
+ return (ret);
+ }
+ svp->svp_dcid = 0;
+ }
+
+ /* svp/router_oui */
+ if ((ret = nvlist_lookup_string(nvp, varpd_svp_props[5],
+ &etherstr)) != 0) {
+ if (ret != ENOENT) {
+ varpd_svp_destroy(svp);
+ return (ret);
+ }
+ bzero(&svp->svp_router_oui, ETHERADDRL);
+ } else if (ether_aton_r(etherstr,
+ (struct ether_addr *)&svp->svp_router_oui) == NULL) {
+ libvarpd_panic("unexpected ether_aton_r failure: %d", errno);
+ }
+
svp->svp_hdl = hdl;
*outp = svp;
return (0);
@@ -984,7 +1254,7 @@ varpd_svp_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
static void
varpd_svp_arp(void *arg, varpd_arp_handle_t *vah, int type,
- const struct sockaddr *sock, uint8_t *out)
+ const struct sockaddr *sock, uint16_t vlan __unused, uint8_t *out)
{
svp_t *svp = arg;
svp_lookup_t *svl;
diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp.h b/usr/src/lib/varpd/svp/common/libvarpd_svp.h
index 8192b842ce..348996898e 100644
--- a/usr/src/lib/varpd/svp/common/libvarpd_svp.h
+++ b/usr/src/lib/varpd/svp/common/libvarpd_svp.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _LIBVARPD_SVP_H
@@ -74,6 +74,8 @@ typedef union svp_query_data {
svp_vl3_ack_t sdq_vl3a;
svp_log_req_t sdq_logr;
svp_lrm_ack_t sdq_lrma;
+ svp_route_req_t sqd_rr;
+ svp_route_ack_t sqd_ra;
} svp_query_data_t;
typedef void (*svp_query_f)(svp_query_t *, void *);
@@ -116,14 +118,16 @@ typedef enum svp_conn_state {
SVP_CS_CONNECTING = 0x02,
SVP_CS_BACKOFF = 0x03,
SVP_CS_ACTIVE = 0x04,
- SVP_CS_WINDDOWN = 0x05
+ SVP_CS_WINDDOWN = 0x05,
+ SVP_CS_VERSIONING = 0x06
} svp_conn_state_t;
typedef enum svp_conn_error {
SVP_CE_NONE = 0x00,
SVP_CE_ASSOCIATE = 0x01,
SVP_CE_NOPOLLOUT = 0x02,
- SVP_CE_SOCKET = 0x03
+ SVP_CE_SOCKET = 0x03,
+ SVP_CE_VERSION_PONG = 0x04
} svp_conn_error_t;
typedef enum svp_conn_flags {
@@ -164,6 +168,7 @@ struct svp_conn {
list_t sc_queries;
svp_conn_out_t sc_output;
svp_conn_in_t sc_input;
+ uint_t sc_version;
};
typedef enum svp_remote_state {
@@ -245,6 +250,11 @@ typedef void (*svp_vl3_inject_f)(svp_t *, const uint16_t,
const struct in6_addr *, const uint8_t *, const uint8_t *);
typedef void (*svp_shootdown_f)(svp_t *, const uint8_t *,
const struct in6_addr *, const uint16_t uport);
+typedef void (*svp_route_lookup_f)(svp_t *, svp_status_t, uint32_t, uint32_t,
+ uint16_t, uint8_t *, uint8_t *, uint16_t, uint8_t *, uint8_t, uint8_t,
+ void *);
+typedef void (*svp_route_shootdown_f)(svp_t *, uint8_t *, uint8_t *, uint8_t,
+ uint8_t, uint16_t);
typedef struct svp_cb {
svp_vl2_lookup_f scb_vl2_lookup;
@@ -252,6 +262,8 @@ typedef struct svp_cb {
svp_vl2_invalidation_f scb_vl2_invalidate;
svp_vl3_inject_f scb_vl3_inject;
svp_shootdown_f scb_shootdown;
+ svp_route_lookup_f scb_route_lookup;
+ svp_route_shootdown_f scb_route_shootdown;
} svp_cb_t;
/*
@@ -268,8 +280,11 @@ struct svp {
char *svp_host; /* svp_lock */
uint16_t svp_port; /* svp_lock */
uint16_t svp_uport; /* svp_lock */
+ uint32_t svp_dcid; /* svp_lock (but write-once?) */
boolean_t svp_huip; /* svp_lock */
struct in6_addr svp_uip; /* svp_lock */
+ /* NOTE: lower-3 bytes are 0s. */
+ uint8_t svp_router_oui[6]; /* svp_lock (but write-once?) */
};
extern bunyan_logger_t *svp_bunyan;
@@ -283,6 +298,10 @@ extern void svp_remote_vl3_lookup(svp_t *, svp_query_t *,
const struct sockaddr *, void *);
extern void svp_remote_vl2_lookup(svp_t *, svp_query_t *, const uint8_t *,
void *);
+extern void svp_remote_route_lookup(svp_t *, svp_query_t *,
+ const struct in6_addr *, const struct in6_addr *, uint32_t,
+ uint16_t, void *);
+
/*
* Init functions
@@ -332,6 +351,7 @@ extern void svp_remote_resolved(svp_remote_t *, struct addrinfo *);
extern void svp_host_queue(svp_remote_t *);
extern void svp_query_release(svp_query_t *);
extern void svp_query_crc32(svp_req_t *, void *, size_t);
+extern id_t svp_id_alloc(void);
/*
* Shootdown related
@@ -339,11 +359,13 @@ extern void svp_query_crc32(svp_req_t *, void *, size_t);
extern void svp_remote_shootdown_vl3(svp_remote_t *, svp_log_vl3_t *,
svp_sdlog_t *);
extern void svp_remote_shootdown_vl2(svp_remote_t *, svp_log_vl2_t *);
+extern void svp_remote_shootdown_route(svp_remote_t *, svp_log_route_t *);
extern void svp_remote_log_request(svp_remote_t *, svp_query_t *, void *,
size_t);
extern void svp_remote_lrm_request(svp_remote_t *, svp_query_t *, void *,
size_t);
-extern void svp_shootdown_logr_cb(svp_remote_t *, svp_status_t, void *, size_t);
+extern void svp_shootdown_logr_cb(svp_remote_t *, svp_status_t, void *, size_t,
+ uint16_t);
extern void svp_shootdown_lrm_cb(svp_remote_t *, svp_status_t);
extern void svp_shootdown_vl3_cb(svp_status_t, svp_log_vl3_t *, svp_sdlog_t *);
extern int svp_shootdown_init(svp_remote_t *);
diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c b/usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c
index 4d10d1dba4..af0fe07e52 100644
--- a/usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c
+++ b/usr/src/lib/varpd/svp/common/libvarpd_svp_conn.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -40,9 +40,12 @@ typedef enum svp_conn_act {
SVP_RA_DEGRADE = 0x01,
SVP_RA_RESTORE = 0x02,
SVP_RA_ERROR = 0x03,
- SVP_RA_CLEANUP = 0x04
+ SVP_RA_CLEANUP = 0x04,
+ SVP_RA_FIND_VERSION = 0x05
} svp_conn_act_t;
+static svp_conn_act_t svp_conn_poll_connect(port_event_t *, svp_conn_t *);
+
static void
svp_conn_inject(svp_conn_t *scp)
{
@@ -90,6 +93,75 @@ svp_conn_restore(svp_conn_t *scp)
srp->sr_ndconns--;
}
+static svp_conn_act_t
+svp_conn_pong_handler(svp_conn_t *scp, svp_query_t *sqp)
+{
+ uint16_t remote_version = ntohs(scp->sc_input.sci_req.svp_ver);
+
+ if (scp->sc_cstate == SVP_CS_VERSIONING) {
+ /* Transition VERSIONING -> ACTIVE. */
+ assert(scp->sc_version == 0);
+ if (remote_version == 0 || remote_version > SVP_CURRENT_VERSION)
+ return (SVP_RA_ERROR);
+ scp->sc_version = remote_version;
+ scp->sc_cstate = SVP_CS_ACTIVE;
+ }
+
+ return (SVP_RA_NONE);
+}
+
+static void
+svp_conn_ping_cb(svp_query_t *sqp, void *arg)
+{
+ size_t len = (size_t)arg;
+
+ assert(len == sizeof (svp_query_t));
+ umem_free(sqp, len);
+}
+
+static svp_conn_act_t
+svp_conn_ping_version(svp_conn_t *scp)
+{
+ svp_remote_t *srp = scp->sc_remote;
+ svp_query_t *sqp = umem_zalloc(sizeof (svp_query_t), UMEM_DEFAULT);
+ int ret;
+
+ assert(MUTEX_HELD(&srp->sr_lock));
+ assert(MUTEX_HELD(&scp->sc_lock));
+ assert(scp->sc_cstate == SVP_CS_CONNECTING);
+
+ if (sqp == NULL)
+ return (SVP_RA_ERROR);
+
+ /* Only set things that need to be non-0/non-NULL. */
+ sqp->sq_state = SVP_QUERY_INIT;
+ sqp->sq_func = svp_conn_ping_cb;
+ sqp->sq_arg = (void *)sizeof (svp_query_t);
+ sqp->sq_header.svp_op = htons(SVP_R_PING);
+ sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION);
+ sqp->sq_header.svp_id = svp_id_alloc();
+ if (sqp->sq_header.svp_id == -1) {
+ umem_free(sqp, sizeof (svp_query_t));
+ return (SVP_RA_ERROR);
+ }
+
+ scp->sc_cstate = SVP_CS_VERSIONING;
+ /* Set the event flags now... */
+ scp->sc_event.se_events = POLLIN | POLLRDNORM | POLLHUP | POLLOUT;
+ /* ...so I can just queue it up directly... */
+ svp_conn_queue(scp, sqp);
+ /* ... and then associate the event port myself. */
+ ret = svp_event_associate(&scp->sc_event, scp->sc_socket);
+ if (ret == 0)
+ return (SVP_RA_RESTORE);
+ scp->sc_error = SVP_CE_ASSOCIATE;
+ scp->sc_errno = ret;
+ scp->sc_cstate = SVP_CS_ERROR;
+ list_remove(&scp->sc_queries, sqp);
+ umem_free(sqp, sizeof (svp_query_t));
+ return (SVP_RA_DEGRADE);
+}
+
static void
svp_conn_add(svp_conn_t *scp)
{
@@ -180,6 +252,9 @@ svp_conn_connect(svp_conn_t *scp)
if (scp->sc_cstate == SVP_CS_INITIAL)
scp->sc_nbackoff = 0;
+ /* New connect means we need to know the version. */
+ scp->sc_version = 0;
+
scp->sc_socket = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0);
if (scp->sc_socket == -1) {
scp->sc_error = SVP_CE_SOCKET;
@@ -252,57 +327,53 @@ svp_conn_connect(svp_conn_t *scp)
}
}
- /*
- * We've connected. Successfully move ourselves to the bound
- * state and start polling.
- */
- scp->sc_cstate = SVP_CS_ACTIVE;
- scp->sc_event.se_events = POLLIN | POLLRDNORM | POLLHUP;
- ret = svp_event_associate(&scp->sc_event, scp->sc_socket);
- if (ret == 0)
- return (SVP_RA_RESTORE);
- scp->sc_error = SVP_CE_ASSOCIATE;
- scp->sc_cstate = SVP_CS_ERROR;
-
- return (SVP_RA_DEGRADE);
+ /* Immediately successful connection, move to SVP_CS_VERSIONING. */
+ return (svp_conn_poll_connect(NULL, scp));
}
/*
- * This should be the first call we get after a connect. If we have successfully
- * connected, we should see a writeable event. We may also see an error or a
- * hang up. In either of these cases, we transition to error mode. If there is
- * also a readable event, we ignore it at the moment and just let a
- * reassociation pick it up so we can simplify the set of state transitions that
- * we have.
+ * This should be the first call we get after a successful synchronous
+ * connect, or a completed (failed or successful) asynchronous connect. A
+ * non-NULL port-event indicates asynchronous completion, a NULL port-event
+ * indicates a successful synchronous connect.
+ *
+ * If we have successfully connected, we should see a writeable event. In the
+ * asynchronous case, we may also see an error or a hang up. For either hang
+ * up or error, we transition to error mode. If there is also a readable event
+ * (i.e. incoming data), we ignore it at the moment and just let a
+ * reassociation pick it up so we can simplify the set of state transitions
+ * that we have.
*/
static svp_conn_act_t
svp_conn_poll_connect(port_event_t *pe, svp_conn_t *scp)
{
- int ret, err;
- socklen_t sl = sizeof (err);
- if (!(pe->portev_events & POLLOUT)) {
- scp->sc_errno = 0;
- scp->sc_error = SVP_CE_NOPOLLOUT;
- scp->sc_cstate = SVP_CS_ERROR;
- return (SVP_RA_DEGRADE);
- }
+ int ret;
- ret = getsockopt(scp->sc_socket, SOL_SOCKET, SO_ERROR, &err, &sl);
- if (ret != 0)
- libvarpd_panic("unanticipated getsockopt error");
- if (err != 0) {
- return (svp_conn_backoff(scp));
+ if (pe != NULL) {
+ int err;
+ socklen_t sl = sizeof (err);
+
+ /*
+ * These bits only matter if we're notified of an
+ * asynchronous connection completion.
+ */
+ if (!(pe->portev_events & POLLOUT)) {
+ scp->sc_errno = 0;
+ scp->sc_error = SVP_CE_NOPOLLOUT;
+ scp->sc_cstate = SVP_CS_ERROR;
+ return (SVP_RA_DEGRADE);
+ }
+
+ ret = getsockopt(scp->sc_socket, SOL_SOCKET, SO_ERROR, &err,
+ &sl);
+ if (ret != 0)
+ libvarpd_panic("unanticipated getsockopt error");
+ if (err != 0) {
+ return (svp_conn_backoff(scp));
+ }
}
- scp->sc_cstate = SVP_CS_ACTIVE;
- scp->sc_event.se_events = POLLIN | POLLRDNORM | POLLHUP;
- ret = svp_event_associate(&scp->sc_event, scp->sc_socket);
- if (ret == 0)
- return (SVP_RA_RESTORE);
- scp->sc_error = SVP_CE_ASSOCIATE;
- scp->sc_errno = ret;
- scp->sc_cstate = SVP_CS_ERROR;
- return (SVP_RA_DEGRADE);
+ return (SVP_RA_FIND_VERSION);
}
static svp_conn_act_t
@@ -357,7 +428,7 @@ svp_conn_pollout(svp_conn_t *scp)
do {
ret = writev(scp->sc_socket, iov, nvecs);
- } while (ret == -1 && errno == EAGAIN);
+ } while (ret == -1 && errno == EINTR);
if (ret == -1) {
switch (errno) {
case EAGAIN:
@@ -387,7 +458,7 @@ static boolean_t
svp_conn_pollin_validate(svp_conn_t *scp)
{
svp_query_t *sqp;
- uint32_t nsize;
+ uint32_t nsize, expected_size = 0;
uint16_t nvers, nop;
svp_req_t *resp = &scp->sc_input.sci_req;
@@ -397,19 +468,40 @@ svp_conn_pollin_validate(svp_conn_t *scp)
nop = ntohs(resp->svp_op);
nsize = ntohl(resp->svp_size);
- if (nvers != SVP_CURRENT_VERSION) {
- (void) bunyan_warn(svp_bunyan, "unsupported version",
+ /*
+ * A peer that's messing with post-connection version changes is
+ * likely a broken peer.
+ */
+ if (scp->sc_cstate != SVP_CS_VERSIONING && nvers != scp->sc_version) {
+ (void) bunyan_warn(svp_bunyan, "version mismatch",
BUNYAN_T_IP, "remote_ip", &scp->sc_addr,
BUNYAN_T_INT32, "remote_port", scp->sc_remote->sr_rport,
- BUNYAN_T_INT32, "version", nvers,
+ BUNYAN_T_INT32, "peer version", nvers,
+ BUNYAN_T_INT32, "our version", scp->sc_version,
BUNYAN_T_INT32, "operation", nop,
BUNYAN_T_INT32, "response_id", resp->svp_id,
BUNYAN_T_END);
return (B_FALSE);
}
- if (nop != SVP_R_VL2_ACK && nop != SVP_R_VL3_ACK &&
- nop != SVP_R_LOG_ACK && nop != SVP_R_LOG_RM_ACK) {
+ switch (nop) {
+ case SVP_R_VL2_ACK:
+ expected_size = sizeof (svp_vl2_ack_t);
+ break;
+ case SVP_R_VL3_ACK:
+ expected_size = sizeof (svp_vl3_ack_t);
+ break;
+ case SVP_R_LOG_RM_ACK:
+ expected_size = sizeof (svp_lrm_ack_t);
+ break;
+ case SVP_R_ROUTE_ACK:
+ expected_size = sizeof (svp_route_ack_t);
+ break;
+ case SVP_R_LOG_ACK:
+ case SVP_R_PONG:
+ /* No expected size (LOG_ACK) or size is 0 (PONG). */
+ break;
+ default:
(void) bunyan_warn(svp_bunyan, "unsupported operation",
BUNYAN_T_IP, "remote_ip", &scp->sc_addr,
BUNYAN_T_INT32, "remote_port", scp->sc_remote->sr_rport,
@@ -445,26 +537,9 @@ svp_conn_pollin_validate(svp_conn_t *scp)
return (B_FALSE);
}
- if ((nop == SVP_R_VL2_ACK && nsize != sizeof (svp_vl2_ack_t)) ||
- (nop == SVP_R_VL3_ACK && nsize != sizeof (svp_vl3_ack_t)) ||
- (nop == SVP_R_LOG_RM_ACK && nsize != sizeof (svp_lrm_ack_t))) {
- (void) bunyan_warn(svp_bunyan, "response size too large",
- BUNYAN_T_IP, "remote_ip", &scp->sc_addr,
- BUNYAN_T_INT32, "remote_port", scp->sc_remote->sr_rport,
- BUNYAN_T_INT32, "version", nvers,
- BUNYAN_T_INT32, "operation", nop,
- BUNYAN_T_INT32, "response_id", resp->svp_id,
- BUNYAN_T_INT32, "response_size", nsize,
- BUNYAN_T_INT32, "expected_size", nop == SVP_R_VL2_ACK ?
- sizeof (svp_vl2_ack_t) : sizeof (svp_vl3_ack_t),
- BUNYAN_T_INT32, "query_state", sqp->sq_state,
- BUNYAN_T_END);
- return (B_FALSE);
- }
-
/*
- * The valid size is anything <= to what the user requested, but at
- * least svp_log_ack_t bytes large.
+ * For LOG_ACK, the valid size is anything <= to what the user
+ * requested, but at least svp_log_ack_t bytes large.
*/
if (nop == SVP_R_LOG_ACK) {
const char *msg = NULL;
@@ -487,12 +562,26 @@ svp_conn_pollin_validate(svp_conn_t *scp)
BUNYAN_T_END);
return (B_FALSE);
}
+ } else if (nsize != expected_size) {
+ /* For other ops, we know the expected size. */
+ (void) bunyan_warn(svp_bunyan, "response size too large",
+ BUNYAN_T_IP, "remote_ip", &scp->sc_addr,
+ BUNYAN_T_INT32, "remote_port", scp->sc_remote->sr_rport,
+ BUNYAN_T_INT32, "version", nvers,
+ BUNYAN_T_INT32, "operation", nop,
+ BUNYAN_T_INT32, "response_id", resp->svp_id,
+ BUNYAN_T_INT32, "response_size", nsize,
+ BUNYAN_T_INT32, "expected_size", expected_size,
+ BUNYAN_T_INT32, "query_state", sqp->sq_state,
+ BUNYAN_T_END);
+ return (B_FALSE);
}
sqp->sq_size = nsize;
scp->sc_input.sci_query = sqp;
if (nop == SVP_R_VL2_ACK || nop == SVP_R_VL3_ACK ||
- nop == SVP_R_LOG_RM_ACK) {
+ nop == SVP_R_LOG_RM_ACK || nop == SVP_R_ROUTE_ACK ||
+ nop == SVP_R_PONG) {
sqp->sq_wdata = &sqp->sq_wdun;
sqp->sq_wsize = sizeof (svp_query_data_t);
} else {
@@ -582,7 +671,7 @@ svp_conn_pollin(svp_conn_t *scp)
default:
libvarpd_panic("unexpeted read errno: %d", errno);
}
- } else if (ret == 0) {
+ } else if (ret == 0 && total - off > 0) {
/* Try to reconnect to the remote host */
return (SVP_RA_ERROR);
}
@@ -626,6 +715,20 @@ svp_conn_pollin(svp_conn_t *scp)
} else if (nop == SVP_R_LOG_RM_ACK) {
svp_lrm_ack_t *svra = sqp->sq_wdata;
sqp->sq_status = ntohl(svra->svra_status);
+ } else if (nop == SVP_R_ROUTE_ACK) {
+ svp_route_ack_t *sra = sqp->sq_wdata;
+ sqp->sq_status = ntohl(sra->sra_status);
+ } else if (nop == SVP_R_PONG) {
+ /*
+ * Handle the PONG versioning-capture here, as we need
+ * the version number, scp->sc_lock held, and the ability
+ * to error out.
+ */
+ svp_conn_act_t cbret;
+
+ cbret = svp_conn_pong_handler(scp, sqp);
+ if (cbret != SVP_RA_NONE)
+ return (cbret);
} else {
libvarpd_panic("unhandled nop: %d", nop);
}
@@ -737,6 +840,7 @@ svp_conn_handler(port_event_t *pe, void *arg)
assert(pe != NULL);
ret = svp_conn_poll_connect(pe, scp);
break;
+ case SVP_CS_VERSIONING:
case SVP_CS_ACTIVE:
case SVP_CS_WINDDOWN:
assert(pe != NULL);
@@ -774,6 +878,9 @@ out:
mutex_enter(&srp->sr_lock);
mutex_enter(&scp->sc_lock);
+ if (ret == SVP_RA_FIND_VERSION)
+ ret = svp_conn_ping_version(scp);
+
if (ret == SVP_RA_ERROR)
ret = svp_conn_reset(scp);
@@ -1014,7 +1121,8 @@ void
svp_conn_queue(svp_conn_t *scp, svp_query_t *sqp)
{
assert(MUTEX_HELD(&scp->sc_lock));
- assert(scp->sc_cstate == SVP_CS_ACTIVE);
+ assert(scp->sc_cstate == SVP_CS_ACTIVE ||
+ scp->sc_cstate == SVP_CS_VERSIONING);
sqp->sq_acttime = -1;
list_insert_tail(&scp->sc_queries, sqp);
diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h b/usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h
index 16dbdbec05..25a626afd1 100644
--- a/usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h
+++ b/usr/src/lib/varpd/svp/common/libvarpd_svp_prot.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _LIBVARPD_SVP_PROT_H
@@ -34,7 +34,13 @@ extern "C" {
*/
#define SVP_VERSION_ONE 1
-#define SVP_CURRENT_VERSION SVP_VERSION_ONE
+#define SVP_VERSION_TWO 2
+/*
+ * Bump this to 2. Version 1 SVP is a subset of version 2, and can be
+ * determined using an SVP_R_PING as part of connection establishment.
+ * Version-2 specific changes will be highlighted (look for "v2").
+ */
+#define SVP_CURRENT_VERSION SVP_VERSION_TWO
typedef struct svp_req {
uint16_t svp_ver;
@@ -44,6 +50,10 @@ typedef struct svp_req {
uint32_t svp_crc32;
} svp_req_t;
+/*
+ * Unless specified, all message types function identically between v1 and v2
+ * of SVP.
+ */
typedef enum svp_op {
SVP_R_UNKNOWN = 0x00,
SVP_R_PING = 0x01,
@@ -54,11 +64,13 @@ typedef enum svp_op {
SVP_R_VL3_ACK = 0x06,
SVP_R_BULK_REQ = 0x07,
SVP_R_BULK_ACK = 0x08,
- SVP_R_LOG_REQ = 0x09,
- SVP_R_LOG_ACK = 0x0A,
+ SVP_R_LOG_REQ = 0x09, /* v2 introduces new log type */
+ SVP_R_LOG_ACK = 0x0A, /* See svp_log_route_t */
SVP_R_LOG_RM = 0x0B,
SVP_R_LOG_RM_ACK = 0x0C,
- SVP_R_SHOOTDOWN = 0x0D
+ SVP_R_SHOOTDOWN = 0x0D,
+ SVP_R_ROUTE_REQ = 0x0E, /* v2 only */
+ SVP_R_ROUTE_ACK = 0x0F /* v2 only */
} svp_op_t;
typedef enum svp_status {
@@ -70,7 +82,7 @@ typedef enum svp_status {
} svp_status_t;
/*
- * A client issues the SVP_R_VL2_REQ whenever it needs to perform a VLS->UL3
+ * A client issues the SVP_R_VL2_REQ whenever it needs to perform a VL2->UL3
* lookup. Requests have the following structure:
*/
typedef struct svp_vl2_req {
@@ -169,7 +181,8 @@ typedef struct svp_log_req {
*/
typedef enum svp_log_type {
SVP_LOG_VL2 = 0x01,
- SVP_LOG_VL3 = 0x02
+ SVP_LOG_VL3 = 0x02,
+ SVP_LOG_ROUTE = 0x03 /* v2 only */
} svp_log_type_t;
typedef struct svp_log_vl2 {
@@ -189,6 +202,24 @@ typedef struct svp_log_vl3 {
uint32_t svl3_vnetid;
} svp_log_vl3_t;
+/*
+ * This log entry only appears on v2 connections.
+ */
+typedef struct svp_log_route {
+ uint32_t svlr_type; /* Should be SVP_LOG_ROUTE */
+ uint8_t svlr_id[16]; /* 16-byte UUID */
+ uint32_t svlr_src_vnetid; /* Source VXLAN vnetid. */
+ uint32_t svlr_dst_vnetid; /* Dest. VXLAN vnetid. */
+ uint32_t svlr_dcid; /* Remote/dest Data Center ID. */
+ uint8_t svlr_srcip[16]; /* Source IP address base. */
+ uint8_t svlr_dstip[16]; /* Destination IP address base. */
+ uint16_t svlr_src_vlan; /* Source VLAN id. */
+ uint16_t svlr_dst_vlan; /* Destination VLAN id. */
+ uint8_t svlr_src_prefixlen; /* Source IP prefix length. */
+ uint8_t svlr_dst_prefixlen; /* Dest. IP prefix length. */
+ uint16_t svlr_pad; /* So we can be aligned... */
+} svp_log_route_t;
+
typedef struct svp_log_ack {
uint32_t svla_status;
uint8_t svla_data[];
@@ -229,6 +260,41 @@ typedef struct svp_shootdown {
uint32_t svsd_vnetid;
} svp_shootdown_t;
+/*
+ * A route-request (SVP_R_ROUTE_REQ) queries the local SVP server to get a
+ * far-remote (i.e. another Triton Data Center, nee. SDC) SVP server for
+ * far-remote networks. Modern overlay modules will request IP destinations
+ * for remote-Triton networks, but they must know how to reach the
+ * remote-Triton SVP server.
+ *
+ * NOTE: SVP_R_ROUTE_{REQ,ACK} are only present in SVP v2.
+ */
+typedef struct svp_route_req {
+ uint32_t srr_vnetid; /* Requester's vnet ID. */
+ uint16_t srr_vlan; /* Requester's VLAN ID. */
+ uint16_t srr_pad; /* Zero on xmit, ignore on receipt. */
+ uint8_t srr_srcip[16]; /* VL3 Source IP. */
+ uint8_t srr_dstip[16]; /* VL3 Destination IP. */
+} svp_route_req_t;
+
+/*
+ * The far-remote Triton Data Center will answer with the requisite information
+ * to send overlay packets to the appropriate far-remote CNs.
+ */
+typedef struct svp_route_ack {
+ uint32_t sra_status; /* Status. */
+ uint32_t sra_dcid; /* Far-remote Data Center ID. */
+ uint32_t sra_vnetid; /* Far-remote vnet ID. */
+ uint16_t sra_vlan; /* Far-remote VLAN ID. */
+ uint16_t sra_port; /* Destination UL3 port. */
+ uint8_t sra_ip[16]; /* Destination UL3 address. */
+ uint8_t sra_srcmac[ETHERADDRL]; /* Far-remote VL2 source. */
+ uint8_t sra_dstmac[ETHERADDRL]; /* Far-remote VL2 dest. */
+ uint8_t sra_src_pfx; /* Far-remote VL3 source prefix */
+ uint8_t sra_dst_pfx; /* Far-remote VL3 dest. prefix */
+ uint16_t sra_pad; /* Must be explicit to 4-bytes. */
+} svp_route_ack_t;
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c b/usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c
index 99775f93c0..cbb5572265 100644
--- a/usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c
+++ b/usr/src/lib/varpd/svp/common/libvarpd_svp_remote.c
@@ -48,6 +48,12 @@ static svp_timer_t svp_dns_timer;
static id_space_t *svp_idspace;
static int svp_dns_timer_rate = 30; /* seconds */
+/*
+ * Hand out a fresh identifier from the file-local svp_idspace so that code
+ * outside this file can obtain one. Whatever id_alloc() returns is passed
+ * back unchanged.
+ */
+id_t
+svp_id_alloc(void)
+{
+	id_t id = id_alloc(svp_idspace);
+
+	return (id);
+}
+
static void
svp_remote_mkfmamsg(svp_remote_t *srp, svp_degrade_state_t state, char *buf,
size_t buflen)
@@ -245,6 +251,8 @@ svp_remote_attach(svp_remote_t *srp, svp_t *svp)
libvarpd_panic("missing callback scb_vl2_invalidate");
if (svp->svp_cb.scb_vl3_inject == NULL)
libvarpd_panic("missing callback scb_vl3_inject");
+ if (svp->svp_cb.scb_route_lookup == NULL)
+ libvarpd_panic("missing callback scb_route_lookup");
check.svp_vid = svp->svp_vid;
if (avl_find(&srp->sr_tree, &check, &where) != NULL)
@@ -277,8 +285,41 @@ svp_remote_detach(svp_t *svp)
}
/*
- * Walk the list of connections and find the first one that's available, the
- * move it to the back of the list so it's less likely to be used again.
+ * See if the request can be sent over the connection's supported version.
+ * Scribble the version in the request itself. NOTE that we do not check the
+ * version that already exists in sqp->sq_header.svp_ver, as we may be called
+ * from svp_remote_reassign() (and change versions when arriving at a new
+ * connection).
+ */
+static boolean_t
+svp_outbound_version_check(int version, svp_query_t *sqp)
+{
+	/*
+	 * The header is kept in network byte order (svp_op is stored with
+	 * htons() when a query is built), so convert it back with ntohs()
+	 * before comparing against the host-order opcode constants.
+	 */
+	uint16_t op = ntohs(sqp->sq_header.svp_op);
+
+	/*
+	 * As of v1 -> v2, we really only need to restrict SVP_R_ROUTE_REQ
+	 * as v2-only. Reflect that here.
+	 *
+	 * NOTE that if any message semantics change between versions,
+	 * (e.g. "in v3 SVP_R_VL2_REQ takes on additional work"), we'll
+	 * need to more-deeply inspect the query. It's possible that the
+	 * svp_op space is big enough to just continue op-only inspections.
+	 */
+
+	assert(version > 0 && version <= SVP_CURRENT_VERSION);
+
+	/*
+	 * On success the connection's version is stamped into the request
+	 * header (network byte order) and B_TRUE is returned; a route request
+	 * aimed at a pre-v2 connection is refused with B_FALSE and the header
+	 * is left untouched.
+	 */
+	if (op != SVP_R_ROUTE_REQ || version >= SVP_VERSION_TWO) {
+		sqp->sq_header.svp_ver = htons(version);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Walk the list of connections and find the first one that's available AND
+ * version-appropriate for the message, then move the matched connection to
+ * the back of the list so it's less likely to be used again.
*/
static boolean_t
svp_remote_conn_queue(svp_remote_t *srp, svp_query_t *sqp)
@@ -289,7 +330,8 @@ svp_remote_conn_queue(svp_remote_t *srp, svp_query_t *sqp)
for (scp = list_head(&srp->sr_conns); scp != NULL;
scp = list_next(&srp->sr_conns, scp)) {
mutex_enter(&scp->sc_lock);
- if (scp->sc_cstate != SVP_CS_ACTIVE) {
+ if (scp->sc_cstate != SVP_CS_ACTIVE ||
+ !svp_outbound_version_check(scp->sc_version, sqp)) {
mutex_exit(&scp->sc_lock);
continue;
}
@@ -329,14 +371,13 @@ svp_remote_vl2_lookup(svp_t *svp, svp_query_t *sqp, const uint8_t *mac,
sqp->sq_arg = arg;
sqp->sq_svp = svp;
sqp->sq_state = SVP_QUERY_INIT;
- sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION);
sqp->sq_header.svp_op = htons(SVP_R_VL2_REQ);
sqp->sq_header.svp_size = htonl(sizeof (svp_vl2_req_t));
sqp->sq_header.svp_id = id_alloc(svp_idspace);
if (sqp->sq_header.svp_id == (id_t)-1)
libvarpd_panic("failed to allcoate from svp_idspace: %d",
errno);
- sqp->sq_header.svp_crc32 = htonl(0);
+ sqp->sq_header.svp_crc32 = 0;
sqp->sq_rdata = vl2r;
sqp->sq_rsize = sizeof (svp_vl2_req_t);
sqp->sq_wdata = NULL;
@@ -352,6 +393,67 @@ svp_remote_vl2_lookup(svp_t *svp, svp_query_t *sqp, const uint8_t *mac,
}
static void
+svp_remote_route_lookup_cb(svp_query_t *sqp, void *arg)
+{
+	svp_t *svp = sqp->sq_svp;
+	svp_route_ack_t *sra = (svp_route_ack_t *)sqp->sq_wdata;
+
+	/*
+	 * Completion handler for a route request: report the outcome to the
+	 * backend's scb_route_lookup callback.
+	 *
+	 * sq_status is used here in host byte order (it is compared directly
+	 * against SVP_S_OK and handed over unconverted on the failure path),
+	 * so it is passed through as-is. Only the fields of the wire-format
+	 * ack need the ntoh*() treatment.
+	 */
+	if (sqp->sq_status == SVP_S_OK) {
+		svp->svp_cb.scb_route_lookup(svp, sqp->sq_status,
+		    ntohl(sra->sra_dcid), ntohl(sra->sra_vnetid),
+		    ntohs(sra->sra_vlan), sra->sra_srcmac, sra->sra_dstmac,
+		    ntohs(sra->sra_port), sra->sra_ip, sra->sra_src_pfx,
+		    sra->sra_dst_pfx, arg);
+	} else {
+		/* No usable ack: report the status with empty results. */
+		svp->svp_cb.scb_route_lookup(svp, sqp->sq_status,
+		    0, 0, 0, NULL, NULL, 0, NULL, 0, 0, arg);
+	}
+}
+
+/*
+ * Build an SVP_R_ROUTE_REQ in the caller-supplied query and try to queue it
+ * on one of the remote's connections. If no connection accepts it, the
+ * query's callback is invoked immediately with SVP_S_FATAL.
+ */
+void
+svp_remote_route_lookup(svp_t *svp, svp_query_t *sqp,
+    const struct in6_addr *src, const struct in6_addr *dst, uint32_t vnetid,
+    uint16_t vlan, void *arg)
+{
+	svp_remote_t *remote = svp->svp_remote;
+	svp_route_req_t *req = &sqp->sq_rdun.sqd_rr;
+
+	/*
+	 * Request payload, in wire (network) byte order. The caller is
+	 * expected to have checked that both addresses are the same type.
+	 */
+	req->srr_vnetid = htonl(vnetid);
+	req->srr_vlan = htons(vlan);
+	req->srr_pad = 0;
+	bcopy(src, req->srr_srcip, sizeof (struct in6_addr));
+	bcopy(dst, req->srr_dstip, sizeof (struct in6_addr));
+
+	/* Query bookkeeping: completion callback, owner and initial state. */
+	sqp->sq_svp = svp;
+	sqp->sq_arg = arg;
+	sqp->sq_func = svp_remote_route_lookup_cb;
+	sqp->sq_state = SVP_QUERY_INIT;
+
+	/* The request is the data to send; no reply buffer is set up here. */
+	sqp->sq_rdata = req;
+	sqp->sq_rsize = sizeof (svp_route_req_t);
+	sqp->sq_wdata = NULL;
+	sqp->sq_wsize = 0;
+
+	/* Header; svp_ver is filled in when a connection is chosen. */
+	sqp->sq_header.svp_op = htons(SVP_R_ROUTE_REQ);
+	sqp->sq_header.svp_size = htonl(sizeof (svp_route_req_t));
+	sqp->sq_header.svp_crc32 = 0;
+	sqp->sq_header.svp_id = id_alloc(svp_idspace);
+	if (sqp->sq_header.svp_id == (id_t)-1) {
+		libvarpd_panic("failed to allcoate from svp_idspace: %d",
+		    errno);
+	}
+
+	mutex_enter(&remote->sr_lock);
+	if (svp_remote_conn_queue(remote, sqp) == B_FALSE) {
+		sqp->sq_status = SVP_S_FATAL;
+		sqp->sq_func(sqp, arg);
+	}
+	mutex_exit(&remote->sr_lock);
+}
+
+static void
svp_remote_vl3_lookup_cb(svp_query_t *sqp, void *arg)
{
svp_t *svp = sqp->sq_svp;
@@ -378,14 +480,13 @@ svp_remote_vl3_common(svp_remote_t *srp, svp_query_t *sqp,
sqp->sq_func = func;
sqp->sq_arg = arg;
sqp->sq_state = SVP_QUERY_INIT;
- sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION);
sqp->sq_header.svp_op = htons(SVP_R_VL3_REQ);
sqp->sq_header.svp_size = htonl(sizeof (svp_vl3_req_t));
sqp->sq_header.svp_id = id_alloc(svp_idspace);
if (sqp->sq_header.svp_id == (id_t)-1)
libvarpd_panic("failed to allcoate from svp_idspace: %d",
errno);
- sqp->sq_header.svp_crc32 = htonl(0);
+ sqp->sq_header.svp_crc32 = 0;
sqp->sq_rdata = vl3r;
sqp->sq_rsize = sizeof (svp_vl3_req_t);
sqp->sq_wdata = NULL;
@@ -441,13 +542,22 @@ static void
svp_remote_log_request_cb(svp_query_t *sqp, void *arg)
{
svp_remote_t *srp = sqp->sq_arg;
+ uint16_t version;
+
+ /*
+ * Version in request is set in this sqp's read-data/sq_header by
+ * now.
+ */
+ assert(sqp->sq_header.svp_op == htons(SVP_R_LOG_REQ));
+ assert(sqp->sq_header.svp_ver != 0);
+ version = htons(sqp->sq_header.svp_ver);
assert(sqp->sq_wdata != NULL);
if (sqp->sq_status == SVP_S_OK)
svp_shootdown_logr_cb(srp, sqp->sq_status, sqp->sq_wdata,
- sqp->sq_size);
+ sqp->sq_size, version);
else
- svp_shootdown_logr_cb(srp, sqp->sq_status, NULL, 0);
+ svp_shootdown_logr_cb(srp, sqp->sq_status, NULL, 0, 0);
}
void
@@ -460,14 +570,13 @@ svp_remote_log_request(svp_remote_t *srp, svp_query_t *sqp, void *buf,
sqp->sq_func = svp_remote_log_request_cb;
sqp->sq_state = SVP_QUERY_INIT;
sqp->sq_arg = srp;
- sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION);
sqp->sq_header.svp_op = htons(SVP_R_LOG_REQ);
sqp->sq_header.svp_size = htonl(sizeof (svp_log_req_t));
sqp->sq_header.svp_id = id_alloc(svp_idspace);
if (sqp->sq_header.svp_id == (id_t)-1)
libvarpd_panic("failed to allcoate from svp_idspace: %d",
errno);
- sqp->sq_header.svp_crc32 = htonl(0);
+ sqp->sq_header.svp_crc32 = 0;
sqp->sq_rdata = logr;
sqp->sq_rsize = sizeof (svp_log_req_t);
sqp->sq_wdata = buf;
@@ -485,7 +594,7 @@ svp_remote_log_request(svp_remote_t *srp, svp_query_t *sqp, void *buf,
mutex_exit(&srp->sr_lock);
if (queued == B_FALSE)
- svp_shootdown_logr_cb(srp, SVP_S_FATAL, NULL, 0);
+ svp_shootdown_logr_cb(srp, SVP_S_FATAL, NULL, 0, 0);
}
static void
@@ -506,14 +615,13 @@ svp_remote_lrm_request(svp_remote_t *srp, svp_query_t *sqp, void *buf,
sqp->sq_func = svp_remote_lrm_request_cb;
sqp->sq_state = SVP_QUERY_INIT;
sqp->sq_arg = srp;
- sqp->sq_header.svp_ver = htons(SVP_CURRENT_VERSION);
sqp->sq_header.svp_op = htons(SVP_R_LOG_RM);
sqp->sq_header.svp_size = htonl(buflen);
sqp->sq_header.svp_id = id_alloc(svp_idspace);
if (sqp->sq_header.svp_id == (id_t)-1)
libvarpd_panic("failed to allcoate from svp_idspace: %d",
errno);
- sqp->sq_header.svp_crc32 = htonl(0);
+ sqp->sq_header.svp_crc32 = 0;
sqp->sq_rdata = buf;
sqp->sq_rsize = buflen;
sqp->sq_wdata = NULL;
@@ -533,7 +641,7 @@ svp_remote_lrm_request(svp_remote_t *srp, svp_query_t *sqp, void *buf,
mutex_exit(&srp->sr_lock);
if (queued == B_FALSE)
- svp_shootdown_logr_cb(srp, SVP_S_FATAL, NULL, 0);
+ svp_shootdown_logr_cb(srp, SVP_S_FATAL, NULL, 0, 0);
}
/* ARGSUSED */
@@ -795,6 +903,21 @@ svp_remote_shootdown_vl2(svp_remote_t *srp, svp_log_vl2_t *svl2)
mutex_exit(&srp->sr_lock);
}
+/*
+ * Deliver a route shootdown log entry to the svp_t registered for the
+ * entry's source vnet id, if there is one. Entries for vnets this remote
+ * does not know about are silently ignored.
+ */
+void
+svp_remote_shootdown_route(svp_remote_t *srp, svp_log_route_t *svlr)
+{
+	svp_t *svp, lookup;
+
+	/* Only svp_vid is set in the lookup key before searching the tree. */
+	lookup.svp_vid = ntohl(svlr->svlr_src_vnetid);
+	mutex_enter(&srp->sr_lock);
+	if ((svp = avl_find(&srp->sr_tree, &lookup, NULL)) != NULL) {
+		/*
+		 * The log entry is in wire format, so the VLAN id goes from
+		 * network to host order: ntohs(), matching the ntohl() used
+		 * on the vnet id above.
+		 */
+		svp->svp_cb.scb_route_shootdown(svp, svlr->svlr_srcip,
+		    svlr->svlr_dstip, svlr->svlr_src_prefixlen,
+		    svlr->svlr_dst_prefixlen, ntohs(svlr->svlr_src_vlan));
+	}
+	mutex_exit(&srp->sr_lock);
+}
+
int
svp_remote_init(void)
{
diff --git a/usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c b/usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c
index 76afb2519f..eacc927b4f 100644
--- a/usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c
+++ b/usr/src/lib/varpd/svp/common/libvarpd_svp_shootdown.c
@@ -154,7 +154,7 @@ svp_shootdown_logr_shoot(void *data, svp_log_type_t type, void *arg)
svp_remote_t *srp = sdl->sdl_remote;
svp_lrm_req_t *svrr = sdl->sdl_logrm;
- if (type != SVP_LOG_VL2 && type != SVP_LOG_VL3)
+ if (type != SVP_LOG_VL2 && type != SVP_LOG_VL3 && type != SVP_LOG_ROUTE)
libvarpd_panic("encountered unknown type: %d\n", type);
if (type == SVP_LOG_VL2) {
@@ -165,12 +165,21 @@ svp_shootdown_logr_shoot(void *data, svp_log_type_t type, void *arg)
UUID_LEN);
svrr->svrr_count++;
mutex_exit(&sdl->sdl_lock);
- } else {
+ } else if (type == SVP_LOG_VL3) {
svp_log_vl3_t *svl3 = data;
/* Take a hold for the duration of this request */
svp_shootdown_ref(sdl);
svp_remote_shootdown_vl3(srp, svl3, sdl);
+ } else {
+ svp_log_route_t *svlr = data;
+
+ svp_remote_shootdown_route(srp, svlr);
+ mutex_enter(&sdl->sdl_lock);
+ bcopy(svlr->svlr_id, &svrr->svrr_ids[svrr->svrr_count * 16],
+ UUID_LEN);
+ svrr->svrr_count++;
+ mutex_exit(&sdl->sdl_lock);
}
return (0);
@@ -187,13 +196,11 @@ svp_shootdown_logr_count(void *data, svp_log_type_t type, void *arg)
static int
svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len,
- int (*cb)(void *, svp_log_type_t, void *), void *arg)
+ int (*cb)(void *, svp_log_type_t, void *), void *arg, uint16_t version)
{
int ret;
off_t cboff = 0;
uint32_t *typep, type;
- svp_log_vl2_t *svl2;
- svp_log_vl3_t *svl3;
/* Adjust for initial status word */
assert(len >= sizeof (uint32_t));
@@ -202,6 +209,7 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len,
while (len > 0) {
size_t opsz;
+ char *typestring;
if (len < sizeof (uint32_t)) {
(void) bunyan_warn(svp_bunyan,
@@ -216,30 +224,20 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len,
typep = buf + cboff;
type = ntohl(*typep);
- if (type == SVP_LOG_VL2) {
+ switch (type) {
+ case SVP_LOG_VL2:
opsz = sizeof (svp_log_vl2_t);
- if (len < opsz) {
- (void) bunyan_warn(svp_bunyan,
- "not enough data for svp_log_vl2_t",
- BUNYAN_T_STRING, "remote_host",
- srp->sr_hostname,
- BUNYAN_T_INT32, "remote_port",
- srp->sr_rport,
- BUNYAN_T_INT32, "response_size",
- cboff + len,
- BUNYAN_T_INT32, "response_offset", cboff,
- BUNYAN_T_END);
- return (-1);
- }
- svl2 = (void *)typep;
- if ((ret = cb(svl2, type, arg)) != 0)
- return (ret);
- } else if (type == SVP_LOG_VL3) {
-
+ typestring = "svp_log_vl2_t";
+ break;
+ case SVP_LOG_VL3:
opsz = sizeof (svp_log_vl3_t);
- if (len < opsz) {
+ typestring = "svp_log_vl3_t";
+ break;
+ case SVP_LOG_ROUTE:
+ if (version < SVP_VERSION_TWO) {
(void) bunyan_warn(svp_bunyan,
- "not enough data for svp_log_vl3_t",
+ "insufficient version for SVP_LOG_ROUTE",
+ BUNYAN_T_UINT32, "version", version,
BUNYAN_T_STRING, "remote_host",
srp->sr_hostname,
BUNYAN_T_INT32, "remote_port",
@@ -250,10 +248,10 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len,
BUNYAN_T_END);
return (-1);
}
- svl3 = (void *)typep;
- if ((ret = cb(svl3, type, arg)) != 0)
- return (ret);
- } else {
+ opsz = sizeof (svp_log_route_t);
+ typestring = "svp_log_route_t";
+ break;
+ default:
(void) bunyan_warn(svp_bunyan,
"unknown log structure type",
BUNYAN_T_STRING, "remote_host",
@@ -265,6 +263,20 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len,
BUNYAN_T_END);
return (-1);
}
+ if (len < opsz) {
+ (void) bunyan_warn(svp_bunyan,
+ "not enough data for message type",
+ BUNYAN_T_STRING, "msg_type", typestring,
+ BUNYAN_T_STRING, "remote_host", srp->sr_hostname,
+ BUNYAN_T_INT32, "remote_port", srp->sr_rport,
+ BUNYAN_T_INT32, "response_size", cboff + len,
+ BUNYAN_T_INT32, "response_offset", cboff,
+ BUNYAN_T_END);
+ return (-1);
+ }
+ if ((ret = cb((void *)typep, type, arg)) != 0)
+ return (ret);
+
len -= opsz;
cboff += opsz;
}
@@ -274,7 +286,7 @@ svp_shootdown_logr_iter(svp_remote_t *srp, void *buf, size_t len,
void
svp_shootdown_logr_cb(svp_remote_t *srp, svp_status_t status, void *cbdata,
- size_t cbsize)
+ size_t cbsize, uint16_t version)
{
uint_t count;
svp_sdlog_t *sdl = &srp->sr_shoot;
@@ -301,7 +313,7 @@ svp_shootdown_logr_cb(svp_remote_t *srp, svp_status_t status, void *cbdata,
*/
count = 0;
if ((svp_shootdown_logr_iter(srp, cbdata, cbsize,
- svp_shootdown_logr_count, &count)) != 0) {
+ svp_shootdown_logr_count, &count, version)) != 0) {
mutex_enter(&sdl->sdl_lock);
sdl->sdl_flags &= ~SVP_SD_RUNNING;
svp_shootdown_schedule(sdl, B_FALSE);
@@ -337,7 +349,7 @@ svp_shootdown_logr_cb(svp_remote_t *srp, svp_status_t status, void *cbdata,
* is how many entries we have to remove.
*/
(void) svp_shootdown_logr_iter(srp, cbdata, cbsize,
- svp_shootdown_logr_shoot, sdl);
+ svp_shootdown_logr_shoot, sdl, version);
/*
* Now that we're done with our work, release the hold. If we don't have
diff --git a/usr/src/man/man1m/dladm.1m b/usr/src/man/man1m/dladm.1m
index 77bf045e08..9912269dfb 100644
--- a/usr/src/man/man1m/dladm.1m
+++ b/usr/src/man/man1m/dladm.1m
@@ -176,7 +176,7 @@ dladm \- administer data links
.LP
.nf
-\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR \fB-v\fR \fIvnetid\fR [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR
+\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR \fB-v\fR \fIvnetid\fR [\fB-d\fR \fIdcid\fR] [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR
\fBdladm delete-overlay\fR \fIoverlay\fR
\fBdladm modify-overlay\fR \fB-d\fR \fImac\fR | \fB-f\fR | \fB-s\fR \fImac=ip:port\fR \fIoverlay\fR
\fBdladm show-overlay\fR [ \fB-f\fR | \fB-t\fR ] [[\fB-p\fR] \fB-o\fR \fIfield\fR[,...]] [\fIoverlay\fR]
@@ -4443,8 +4443,8 @@ The tunnel destination address.
.sp
.ne 2
.na
-\fBdladm create-overlay\fR \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR
-\fB-v\fR \fIvnetid\fR [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR
+\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR
+\fB-v\fR \fIvnetid\fR [\fB-d\fR \fIdcid\fR] [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR
.ad
.sp .6
.RS 4n
@@ -4501,6 +4501,16 @@ determines how non-local targets are found and where packets are directed to.
.sp
.ne 2
.na
+\fB\fB-d\fR \fIdcid\fR
+.ad
+.sp .6
+.RS 4n
+Set the datacenter id to \fIdcid\fR.
+.RE
+
+.sp
+.ne 2
+.na
\fB\fB-p\fR \fIprop\fR=\fIvalue\fR,..., \fB--prop\fR
\fIprop\fR=\fIvalue\fR,...\fR
.ad
diff --git a/usr/src/test/os-tests/runfiles/default.run b/usr/src/test/os-tests/runfiles/default.run
index 72158c8bc2..927c24ad7f 100644
--- a/usr/src/test/os-tests/runfiles/default.run
+++ b/usr/src/test/os-tests/runfiles/default.run
@@ -83,6 +83,7 @@ tests = ['acquire-compare', 'kmc-update']
[/opt/os-tests/tests/OS-6097.32]
[/opt/os-tests/tests/OS-6097.64]
+[/opt/os-tests/tests/qqcache]
[/opt/os-tests/tests/ddi_ufm]
user = root
diff --git a/usr/src/test/os-tests/tests/Makefile b/usr/src/test/os-tests/tests/Makefile
index 7396e135c9..3ec39ef440 100644
--- a/usr/src/test/os-tests/tests/Makefile
+++ b/usr/src/test/os-tests/tests/Makefile
@@ -23,6 +23,7 @@ SUBDIRS = \
libtopo \
pf_key \
poll \
+ qqcache \
sdevfs \
secflags \
sigqueue \
diff --git a/usr/src/test/os-tests/tests/qqcache/Makefile b/usr/src/test/os-tests/tests/qqcache/Makefile
new file mode 100644
index 0000000000..43843b36fb
--- /dev/null
+++ b/usr/src/test/os-tests/tests/qqcache/Makefile
@@ -0,0 +1,60 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+include $(SRC)/cmd/Makefile.cmd
+include $(SRC)/test/Makefile.com
+
+PROG = qqcache
+
+LOCAL_OBJS = qqcache-test.o
+COMMON_OBJS = qqcache.o
+COMMON_DIR = $(SRC)/uts/common/qqcache
+OBJS = $(LOCAL_OBJS) $(COMMON_OBJS)
+
+ROOTOPTPKG = $(ROOT)/opt/os-tests
+TESTDIR = $(ROOTOPTPKG)/tests/qqcache
+
+CMDS = $(PROG:%=$(TESTDIR)/%)
+$(CMDS) := FILEMODE = 0555
+
+CSTD = $(CSTD_GNU99)
+LDLIBS += -lumem -lcmdutils
+
+all: $(PROG)
+
+install: all $(CMDS)
+
+lint:
+
+clobber: clean
+ -$(RM) $(PROG) $(OBJS)
+
+clean:
+ -$(RM) $(CLEANFILES)
+
+$(CMDS): $(TESTDIR) $(PROG)
+
+$(TESTDIR):
+ $(INS.dir)
+
+$(TESTDIR)/%: %
+ $(INS.file)
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+
+%.o: $(COMMON_DIR)/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
diff --git a/usr/src/test/os-tests/tests/qqcache/qqcache-test.c b/usr/src/test/os-tests/tests/qqcache/qqcache-test.c
new file mode 100644
index 0000000000..2606ffffb8
--- /dev/null
+++ b/usr/src/test/os-tests/tests/qqcache/qqcache-test.c
@@ -0,0 +1,380 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+#include <err.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/debug.h>
+#include <sys/list.h>
+#include <sys/types.h>
+#include <sys/qqcache.h>
+#include <sys/qqcache_impl.h>
+#include <umem.h>
+
+/* Some arbitrary sizes */
+#define INITIAL_CACHE_SIZE 12
+#define INITIAL_CACHE_A 25
+#define CACHE_HSIZE 11
+
+#define OUTPUT_WIDTH 80
+
+/*
+ * If we extend the implementation to use more lists, the test code will need
+ * to be updated accordingly
+ */
+CTASSERT(QQCACHE_NUM_LISTS == 2);
+
+typedef struct entry {
+ uint_t e_val;
+ qqcache_link_t e_link;
+} entry_t;
+
+enum {
+ ITER_ERROR = -1,
+ ITER_OK = 0,
+ ITER_STOP = 1
+};
+
+static uint64_t entry_hash(const void *);
+static int entry_cmp(const void *, const void *);
+static void entry_dtor(void *);
+static entry_t *entry_new(uint_t val);
+
+static void expect(qqcache_t *, uint_t *, size_t, uint_t *, size_t, int);
+static void expect_val(qqcache_t *, const entry_t *, uint_t);
+static void dump_cache(qqcache_t *);
+static int iter_list(qqcache_t *, size_t, int (*)(void *, void *), void *);
+static int xprintf(FILE *, const char *, ...);
+
+/*
+ * Drive a single qqcache through inserts, lookups and resizes, checking the
+ * exact contents and order of both lists after every step. In each expect()
+ * call the first array is the expected content of list 0 (the MFU list) and
+ * the second that of list 1 (the MRU list), each followed by its length.
+ * Any mismatch is reported by the helpers; returns 0 when every step matched.
+ */
+int
+main(void)
+{
+	qqcache_t *qc;
+	uint_t val;
+
+	VERIFY0(qqcache_create(&qc, INITIAL_CACHE_SIZE, INITIAL_CACHE_A,
+	    CACHE_HSIZE, entry_hash, entry_cmp, entry_dtor, sizeof (entry_t),
+	    offsetof(entry_t, e_link), offsetof(entry_t, e_val), UMEM_DEFAULT));
+
+	/* Create a few entries */
+	VERIFY0(qqcache_insert(qc, entry_new(5)));
+	VERIFY0(qqcache_insert(qc, entry_new(4)));
+	VERIFY0(qqcache_insert(qc, entry_new(3)));
+	VERIFY0(qqcache_insert(qc, entry_new(2)));
+	VERIFY0(qqcache_insert(qc, entry_new(1)));
+	expect(qc, NULL, 0, (uint_t[]){1, 2, 3, 4, 5}, 5, __LINE__);
+
+	/* Adding a duplicate should fail */
+	{
+		entry_t *e = entry_new(3);
+		VERIFY3S(qqcache_insert(qc, e), ==, EEXIST);
+		/* The rejected entry is still ours to free. */
+		entry_dtor(e);
+	}
+
+	VERIFY0(qqcache_insert(qc, entry_new(10)));
+	VERIFY0(qqcache_insert(qc, entry_new(9)));
+	VERIFY0(qqcache_insert(qc, entry_new(8)));
+	VERIFY0(qqcache_insert(qc, entry_new(7)));
+	/* This should bump the LRU entry (5) from the list */
+	VERIFY0(qqcache_insert(qc, entry_new(6)));
+	expect(qc, NULL, 0,
+	    (uint_t[]){6, 7, 8, 9, 10, 1, 2, 3, 4}, 9, __LINE__);
+
+	/* Lookup a few entries to move them to the MFU list */
+	val = 3;
+	expect_val(qc, qqcache_lookup(qc, &val), 3);
+	expect(qc, (uint_t[]){3}, 1,
+	    (uint_t[]){6, 7, 8, 9, 10, 1, 2, 4}, 8, __LINE__);
+
+	val = 8;
+	expect_val(qc, qqcache_lookup(qc, &val), 8);
+	expect(qc, (uint_t[]){8, 3}, 2,
+	    (uint_t[]){6, 7, 9, 10, 1, 2, 4}, 7, __LINE__);
+
+	/* Now move 3 back to the head of list 0 */
+	val = 3;
+	expect_val(qc, qqcache_lookup(qc, &val), 3);
+	expect(qc, (uint_t[]){3, 8}, 2,
+	    (uint_t[]){6, 7, 9, 10, 1, 2, 4}, 7, __LINE__);
+
+	val = 7;
+	expect_val(qc, qqcache_lookup(qc, &val), 7);
+	expect(qc, (uint_t[]){7, 3, 8}, 3,
+	    (uint_t[]){6, 9, 10, 1, 2, 4}, 6, __LINE__);
+
+	/* This should push 8 from the MFU back onto the MRU */
+	val = 10;
+	expect_val(qc, qqcache_lookup(qc, &val), 10);
+	expect(qc, (uint_t[]){10, 7, 3}, 3,
+	    (uint_t[]){8, 6, 9, 1, 2, 4}, 6, __LINE__);
+
+	/*
+	 * Add some more values. List 1 is expected to hold nine entries
+	 * afterwards, so exactly nine values are listed (the array length
+	 * must agree with the count passed alongside it).
+	 */
+	VERIFY0(qqcache_insert(qc, entry_new(11)));
+	VERIFY0(qqcache_insert(qc, entry_new(12)));
+	VERIFY0(qqcache_insert(qc, entry_new(13)));
+	VERIFY0(qqcache_insert(qc, entry_new(14)));
+	VERIFY0(qqcache_insert(qc, entry_new(15)));
+	expect(qc, (uint_t[]){10, 7, 3}, 3,
+	    (uint_t[]){15, 14, 13, 12, 11, 8, 6, 9, 1}, 9, __LINE__);
+
+	/* Growing the cache must not disturb the current contents. */
+	VERIFY0(qqcache_adjust_size(qc, INITIAL_CACHE_SIZE + 4));
+	expect(qc, (uint_t[]){10, 7, 3}, 3,
+	    (uint_t[]){15, 14, 13, 12, 11, 8, 6, 9, 1}, 9, __LINE__);
+
+	VERIFY0(qqcache_insert(qc, entry_new(16)));
+	VERIFY0(qqcache_insert(qc, entry_new(17)));
+	VERIFY0(qqcache_insert(qc, entry_new(18)));
+	VERIFY0(qqcache_insert(qc, entry_new(19)));
+	expect(qc, (uint_t[]){10, 7, 3}, 3,
+	    (uint_t[]){19, 18, 17, 16, 15, 14, 13, 12, 11, 8, 6, 9}, 12,
+	    __LINE__);
+
+	/* Shrinking trims both lists. */
+	VERIFY0(qqcache_adjust_size(qc, INITIAL_CACHE_SIZE - 2));
+	expect(qc, (uint_t[]){10, 7}, 2,
+	    (uint_t[]){3, 19, 18, 17, 16, 15, 14, 13}, 8, __LINE__);
+
+	/* A size of 2 is expected to be rejected. */
+	VERIFY3S(qqcache_adjust_size(qc, 2), ==, EINVAL);
+
+	/* Changing the split between the lists trims list 1 further. */
+	VERIFY0(qqcache_adjust_a(qc, 50));
+	expect(qc, (uint_t[]){10, 7}, 2,
+	    (uint_t[]){3, 19, 18, 17, 16}, 5, __LINE__);
+
+	qqcache_destroy(qc);
+	return (0);
+}
+
+/* State threaded through iter_list() while comparing one list. */
+struct cmp_arg {
+	qqcache_t *qc;
+	uint_t *vals;		/* expected values, head of list first */
+	size_t n;		/* number of entries in vals */
+	size_t i;		/* index of the next expected value */
+	size_t listnum;
+	int linenum;		/* caller's __LINE__, for diagnostics */
+};
+
+/*
+ * iter_list() callback: compare one cache entry against the next expected
+ * value. Returns ITER_OK on a match and ITER_ERROR (after printing a
+ * diagnostic) on a mismatch or when the list holds more entries than were
+ * expected.
+ */
+static int
+cmp_cb(void *op, void *arg)
+{
+	entry_t *e = op;
+	struct cmp_arg *ca = arg;
+	uint_t val;
+
+	/* Never read past the end of (or through a NULL) expected array. */
+	if (ca->i >= ca->n) {
+		(void) xprintf(stderr,
+		    "Line %d: More entries than expected in list %zu.\n",
+		    ca->linenum, ca->listnum);
+		return (ITER_ERROR);
+	}
+
+	val = ca->vals[ca->i++];
+	if (e->e_val == val)
+		return (ITER_OK);
+
+	(void) xprintf(stderr, "Line %d: Unexpected value in list %zu.\n",
+	    ca->linenum, ca->listnum);
+	(void) xprintf(stderr, " Expected: %u\n Actual: %u\n", val,
+	    e->e_val);
+
+	return (ITER_ERROR);
+}
+
+/*
+ * Check that list 'listnum' of the cache holds exactly the 'n' values in
+ * 'vals', in order. On any discrepancy the cache is dumped to stderr and the
+ * test exits with status 1.
+ */
+static void
+cmp_list(qqcache_t *qc, size_t listnum, uint_t *vals, size_t n, int linenum)
+{
+	qqcache_list_t *l = &qc->qqc_lists[listnum];
+	struct cmp_arg arg = {
+		.qc = qc,
+		.vals = vals,
+		.n = n,
+		.i = 0,
+		.listnum = listnum,
+		.linenum = linenum
+	};
+
+	if (l->qqcl_len != n) {
+		(void) xprintf(stderr,
+		    "Line %d: Unexpected length for list %zu.\n"
+		    " Length: %zu\n"
+		    " Expected: %zu\n\n", linenum, listnum, l->qqcl_len, n);
+		dump_cache(qc);
+		/*
+		 * A wrong length is a failure in its own right; carrying on
+		 * would let a too-short list pass unnoticed.
+		 */
+		exit(1);
+	}
+
+	if (iter_list(qc, listnum, cmp_cb, &arg) != ITER_OK) {
+		dump_cache(qc);
+		exit(1);
+	}
+}
+
+/*
+ * Verify both cache lists in one call: list 0 against l0 (l0sz values) and
+ * then list 1 against l1 (l1sz values). 'linenum' is only used to label
+ * diagnostics.
+ */
+static void
+expect(qqcache_t *qc, uint_t *l0, size_t l0sz, uint_t *l1, size_t l1sz,
+    int linenum)
+{
+	uint_t *want[2] = { l0, l1 };
+	size_t count[2] = { l0sz, l1sz };
+
+	for (size_t which = 0; which < 2; which++)
+		cmp_list(qc, which, want[which], count[which], linenum);
+}
+
+/*
+ * Check the result of a lookup. 'e' is the entry that was returned (possibly
+ * NULL) and 'val' the value it should carry; passing UINT_MAX as 'val' means
+ * that no entry was expected. On a mismatch, print what was found, dump the
+ * cache and exit with status 1.
+ */
+static void
+expect_val(qqcache_t *qc, const entry_t *e, uint_t val)
+{
+	char found[64];
+	char wanted[64];
+	boolean_t have_entry = (e != NULL) ? B_TRUE : B_FALSE;
+	boolean_t want_entry = (val != UINT_MAX) ? B_TRUE : B_FALSE;
+
+	/* Nothing expected and nothing found. */
+	if (!have_entry && !want_entry)
+		return;
+
+	/* Found an entry carrying the expected value. */
+	if (have_entry && e->e_val == val)
+		return;
+
+	if (have_entry)
+		(void) snprintf(found, sizeof (found), "%u", e->e_val);
+	else
+		(void) strlcpy(found, "<NULL>", sizeof (found));
+
+	if (want_entry)
+		(void) snprintf(wanted, sizeof (wanted), "%u", val);
+	else
+		(void) strlcpy(wanted, "<NONE>", sizeof (wanted));
+
+	(void) xprintf(stderr, "Unexpected value in list:\n");
+	(void) xprintf(stderr, " Found: %s\n Expected: %s\n",
+	    found, wanted);
+	dump_cache(qc);
+	exit(1);
+}
+
+/* Output state carried between dump_entry() calls for one list. */
+struct dump_args {
+	int prefixlen;		/* width of the "List ..." prefix */
+	int col;		/* current output column */
+	boolean_t nl;		/* B_TRUE until the first value is printed */
+};
+
+/*
+ * iter_list() callback: print one entry's value to stderr. Values are
+ * separated by ", "; when the next value would run past OUTPUT_WIDTH the
+ * output continues on a new line indented by the prefix width. Always
+ * returns ITER_OK.
+ */
+static int
+dump_entry(void *ep, void *arg)
+{
+	struct dump_args *state = arg;
+	entry_t *ent = ep;
+	char text[64] = { 0 };
+	int len = snprintf(text, sizeof (text), "%u", ent->e_val);
+
+	/* text should be large enough to hold an unsigned val */
+	VERIFY3S(len, >, 0);
+	VERIFY3S(len, <, sizeof (text));
+
+	if (state->col + len + 2 > OUTPUT_WIDTH) {
+		/* Wrap; the leading newline does not occupy a column. */
+		state->col =
+		    xprintf(stderr, "\n%*s", state->prefixlen, "") - 1;
+	} else if (!state->nl) {
+		state->col += xprintf(stderr, ", ");
+	}
+
+	state->col += xprintf(stderr, "%s", text);
+	state->nl = B_FALSE;
+
+	return (ITER_OK);
+}
+
+/*
+ * Print the contents of every list in the cache to stderr, one list per
+ * paragraph, each prefixed with its number, current length and maximum
+ * length. Used when an expectation fails.
+ */
+static void
+dump_cache(qqcache_t *qc)
+{
+	(void) xprintf(stderr, "Cache contents:\n");
+
+	for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) {
+		qqcache_list_t *l = &qc->qqc_lists[i];
+		struct dump_args args = {
+			.nl = B_TRUE
+		};
+
+		/* Values start right after the prefix that was printed. */
+		args.col = args.prefixlen =
+		    xprintf(stderr, "List %zu (%zu/%zu): ", i, l->qqcl_len,
+		    qc->qqc_max[i]);
+
+		(void) iter_list(qc, i, dump_entry, &args);
+		/*
+		 * fputc() returns the character written on success and EOF
+		 * on failure; both are non-zero, so the result has to be
+		 * compared against EOF for the check to mean anything.
+		 */
+		VERIFY3S(fputc('\n', stderr), !=, EOF);
+	}
+}
+
+/*
+ * Walk list 'listnum' of the cache from its head, invoking cb(object, arg)
+ * on every element. The walk stops at the first callback result other than
+ * ITER_OK and returns that result; otherwise ITER_OK is returned once the
+ * end of the list is reached.
+ */
+static int
+iter_list(qqcache_t *qc, size_t listnum, int (*cb)(void *, void *),
+    void *arg)
+{
+	qqcache_list_t *ql = &qc->qqc_lists[listnum];
+	void *node = list_head(&ql->qqcl_list);
+
+	while (node != NULL) {
+		int rc = cb(link_to_obj(qc, node), arg);
+
+		if (rc != ITER_OK)
+			return (rc);
+		node = list_next(&ql->qqcl_list, node);
+	}
+
+	return (ITER_OK);
+}
+
+/*
+ * fprintf(3C)-style helper that never reports failure to its caller: if the
+ * write fails or the stream is in an error state, the program is terminated
+ * through err(3C). Otherwise the number of characters written is returned,
+ * so callers may use the result directly for column accounting.
+ */
+static int
+xprintf(FILE *f, const char *fmt, ...)
+{
+	va_list args;
+	int written;
+
+	va_start(args, fmt);
+	written = vfprintf(f, fmt, args);
+	va_end(args);
+
+	if (written < 0 || ferror(f) != 0)
+		err(EXIT_FAILURE, "\nUnable to write output");
+
+	return (written);
+}
+
+/*
+ * Allocate a zero-filled entry carrying 'val'. Allocation failure trips a
+ * VERIFY, so the result is never NULL.
+ */
+static entry_t *
+entry_new(uint_t val)
+{
+	entry_t *ent;
+
+	ent = calloc(1, sizeof (entry_t));
+	VERIFY3P(ent, !=, NULL);
+
+	ent->e_val = val;
+	return (ent);
+}
+
+/*
+ * Hash callback handed to qqcache_create(): 'p' points at a uint_t key and
+ * the key's value, widened to 64 bits, is used as the hash.
+ */
+static uint64_t
+entry_hash(const void *p)
+{
+	return ((uint64_t)*(const uint_t *)p);
+}
+
+/*
+ * Comparison callback handed to qqcache_create(): both arguments point at
+ * uint_t keys. Returns 0 when the keys are equal and 1 otherwise (no
+ * ordering is implied).
+ */
+static int
+entry_cmp(const void *a, const void *b)
+{
+	uint_t lhs = *(const uint_t *)a;
+	uint_t rhs = *(const uint_t *)b;
+
+	if (lhs == rhs)
+		return (0);
+	return (1);
+}
+
+/*
+ * Destructor callback handed to qqcache_create(); also called directly by
+ * main() to dispose of an entry whose insertion was rejected. Entries come
+ * from calloc() in entry_new(), so releasing one is a plain free().
+ */
+static void
+entry_dtor(void *arg)
+{
+ free(arg);
+}
+
+/*
+ * Returns the fixed option string "default,verbose". Nothing in this file
+ * calls it; presumably it is the libumem hook that supplies default
+ * UMEM_DEBUG settings for the process (see umem_debug(3MALLOC)) -- confirm.
+ */
+const char *
+_umem_debug_init(void)
+{
+ return ("default,verbose");
+}
+
+/*
+ * Returns the fixed option string "fail,contents". Nothing in this file
+ * calls it; presumably it is the libumem hook that supplies default
+ * UMEM_LOGGING settings for the process (see umem_debug(3MALLOC)) -- confirm.
+ */
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents");
+}
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 752fe56100..e973cf58ad 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -299,6 +299,7 @@ GENUNIX_OBJS += \
resolvepath.o \
retire_store.o \
process.o \
+ qqcache.o \
rlimit.o \
rmap.o \
rw.o \
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index a32c094f3b..ba8945b6fb 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -26,6 +26,7 @@
# Copyright 2020 Joyent, Inc.
# Copyright 2018 Nexenta Systems, Inc.
# Copyright (c) 2017 by Delphix. All rights reserved.
+# Copyright 2018 Joyent, Inc.
# Copyright 2020 Oxide Computer Company
#
@@ -1627,6 +1628,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/pcmcia/pcs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/qqcache/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/refhash/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -2778,6 +2787,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/nvpair/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/os/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/qqcache/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/rpc/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c
index 2ad3f4f591..c54a6e0d9c 100644
--- a/usr/src/uts/common/io/overlay/overlay.c
+++ b/usr/src/uts/common/io/overlay/overlay.c
@@ -134,6 +134,40 @@
* be sent to. In addition, they handle questions related to how to handle
* things like broadcast and multicast traffic, etc.
*
+ * ROUTING
+ *
+ * Supporting routing of packets between VLANs that exist on an overlay
+ * network requires two major additions. First, packets destined for off-VLAN
+ * destinations need to be identified. Second, we must obtain the
+ * additional information necessary to deliver the packet to its off-VLAN
+ * destination.
+ *
+ * To solve the first issue, we utilize the existing IP routing functionality.
+ * Off-vlan destinations are given routes with next hops in the originating
+ * netstack's routing table--just like in physical networks. The system will
+ * then attempt to generate an ARP query, which will be sent out to varpd in
+ * the exact same manner as is described above for other on-VLAN destinations.
+ * The response for this will include a MAC address that is both used for the
+ * ARP reply, and is added to our VL2 MAC->UL3 hash table, but is added
+ * with the OVERLAY_ENTRY_F_ROUTER flag set. Once this is done, the originating
+ * netstack will send off-VLAN packets to this router MAC, allowing the
+ * overlay device to identify these packets as requiring routing.
+ *
+ * Once packets with an off-VLAN destination are identified, we must determine
+ * what the destination vid, VL2 VLAN, and VL2 MAC values are for the given
+ * packet. For reasons similar to the VL2 MAC->UL3 lookup described above,
+ * we utilize the flexibility of user land to perform these lookups (also
+ * using varpd). In this instance we are attempting to find a destination VL3
+ * IP to a UL3 IP mapping (a few extra bits of information are necessary to
+ * allow for disambiguation of the destination VL3 IP for situations such as
+ * mirroring a production environment including VL3 IPs in an isolated set of
+ * VLANs). We then store these results in a VL3->UL3 hash table for future
+ * lookups.
+ *
+ * To prevent the size of both the VL2->UL3 and VL3->UL3 hash tables from
+ * growing without bound, we cap the number of entries in each hash table and
+ * utilize the ARC algorithm to manage their contents.
+ *
* ----------
* Properties
* ----------
@@ -205,6 +239,10 @@
* UTF-8. Note that the size of the string includes the null
* terminator.
*
+ * OVERLAY_PROP_T_ETHER
+ *
+ * An ether_addr_t, which has a fixed size.
+ *
* The next thing that we apply to a property is its permission. The permissions
* are put together by the bitwise or of the following flags and values.
*
@@ -461,7 +499,7 @@
* On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
* are much more interesting and as a result, more complicated. We primarily
* store lists of overlay_target_entry_t's which are stored in both an avl tree
- * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
+ * and a qqcache_t. The primary look up path uses the qqcache_t and the avl tree
* is only used for a few of the target ioctls used to dump data such that we
* can get a consistent iteration order for things like dladm show-overlay -t.
* The key that we use for the reference hashtable is based on the mac address
@@ -486,6 +524,28 @@
* any outstanding data to that place. For the full story on how we look that up
* will be discussed in the section on the Target Cache Lifecycle.
*
+ * For routing, everything works largely the same as it does in the non-routing
+ * situations. The major differences are that the target cache is always
+ * an OVERLAY_TARGET_DYNAMIC cache, and that an additional hash table lookup
+ * occurs. When a routed packet is sent down stack, the
+ * overlay_target_entry_t in the VL2 cache will have its
+ * OVERLAY_ENTRY_F_ROUTER flag set, which will prompt a lookup in the VL3->UL3
+ * cache (using the source VL3, source VL2 VLAN, and destination VL3 values
+ * from the packet as the lookup key). The entry returned from the cache is
+ * used to modify the source and destination VL2 MAC addresses as well as
+ * the VL2 VLAN ID, and then is encapsulated and sent to its UL3 destination.
+ * On reception, decapsulation happens exactly the same as in the non-routed
+ * case, and the packet appears just as if it was sent out the VL2 network
+ * from a router connected to it. This is done both to maintain the illusion
+ * of a physical network when sniffing packets at the instance level, and
+ * so that the mac layer sitting above the destination's overlay device
+ * (for the vnic created over the overlay) does not discard the packet because
+ * its VLAN tag does not match the VLAN tag of the destination VNIC. While
+ * some of these modifications could be split between the source and
+ * destination hosts, by doing the work on the source, it maximizes any
+ * potential parallelism that might be present from multiple flows to a given
+ * destination.
+ *
* ------------------------
* FMA and Degraded Devices
* ------------------------
@@ -830,21 +890,62 @@ typedef enum overlay_dev_prop {
OVERLAY_DEV_P_MTU = 0,
OVERLAY_DEV_P_VNETID,
OVERLAY_DEV_P_ENCAP,
- OVERLAY_DEV_P_VARPDID
+ OVERLAY_DEV_P_VARPDID,
+ OVERLAY_DEV_P_DCID,
+ OVERLAY_DEV_P_VL2_CACHE_SIZE,
+ OVERLAY_DEV_P_VL2_CACHE_A,
+ OVERLAY_DEV_P_ROUTE_CACHE_SIZE,
+ OVERLAY_DEV_P_ROUTE_CACHE_A
} overlay_dev_prop_t;
-#define OVERLAY_DEV_NPROPS 4
+#define OVERLAY_DEV_NPROPS 9
static const char *overlay_dev_props[] = {
"mtu",
"vnetid",
"encap",
- "varpd/id"
+ "varpd/id",
+ "dcid",
+ "vl2_cache_size",
+ "_vl2_cache_a",
+ "route_cache_size",
+ "_route_cache_a"
};
+/* properties that can be changed live */
+static boolean_t overlay_dev_liveprop[] = {
+ B_FALSE, /* mtu */
+ B_FALSE, /* vnetid */
+ B_FALSE, /* encap */
+ B_FALSE, /* varpd/id */
+ B_FALSE, /* dcid */
+ B_TRUE, /* vl2_cache_size */
+ B_TRUE, /* _vl2_cache_a */
+ B_TRUE, /* route_cache_size */
+ B_TRUE /* _route_cache_a */
+};
+
+CTASSERT(ARRAY_SIZE(overlay_dev_props) == OVERLAY_DEV_NPROPS);
+CTASSERT(ARRAY_SIZE(overlay_dev_liveprop) == OVERLAY_DEV_NPROPS);
+
#define OVERLAY_MTU_MIN 576
#define OVERLAY_MTU_DEF 1400
#define OVERLAY_MTU_MAX 8900
+/* The 2Q parameter 'a' is a percentage */
+#define OVERLAY_CACHE_MAX_A 100
+/* A somewhat arbitrary default, biasing towards storing more MFU entries */
+#define OVERLAY_CACHE_A_DEF 75
+
+/* Somewhat arbitrary min and max values */
+#define OVERLAY_VL2_CACHE_MIN 256
+#define OVERLAY_VL2_CACHE_MAX 10240
+#define OVERLAY_VL2_CACHE_DEF OVERLAY_VL2_CACHE_MIN
+
+/* Somewhat arbitrary min and max values */
+#define OVERLAY_ROUTE_CACHE_MIN 256
+#define OVERLAY_ROUTE_CACHE_MAX 10240
+#define OVERLAY_ROUTE_CACHE_DEF OVERLAY_ROUTE_CACHE_MIN
+
overlay_dev_t *
overlay_hold_by_dlid(datalink_id_t id)
{
@@ -1066,7 +1167,6 @@ overlay_m_tx(void *arg, mblk_t *mp_chain)
bzero(&hdr, sizeof (struct msghdr));
bzero(&einfo, sizeof (ovep_encap_info_t));
- einfo.ovdi_id = odd->odd_vid;
mp = mp_chain;
while (mp != NULL) {
socklen_t slen;
@@ -1077,7 +1177,7 @@ overlay_m_tx(void *arg, mblk_t *mp_chain)
ep = NULL;
ret = overlay_target_lookup(odd, mp,
- (struct sockaddr *)&storage, &slen);
+ (struct sockaddr *)&storage, &slen, &einfo.ovdi_id);
if (ret != OVERLAY_TARGET_OK) {
if (ret == OVERLAY_TARGET_DROP)
freemsg(mp);
@@ -1260,6 +1360,19 @@ overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
}
odd->odd_vid = oicp->oic_vnetid;
+ if (oicp->oic_dcid > UINT32_MAX) {
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EINVAL);
+ }
+ odd->odd_dcid = oicp->oic_dcid;
+
+ odd->odd_vl2sz = OVERLAY_VL2_CACHE_DEF;
+ odd->odd_vl2a = OVERLAY_CACHE_A_DEF;
+ odd->odd_routesz = OVERLAY_ROUTE_CACHE_DEF;
+ odd->odd_routea = OVERLAY_CACHE_A_DEF;
+
mac = mac_alloc(MAC_VERSION);
if (mac == NULL) {
mutex_exit(&overlay_dev_lock);
@@ -1613,6 +1726,7 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
int ret;
mac_perim_handle_t mph;
uint_t propid = UINT_MAX;
+ uint32_t def;
overlay_ioc_propinfo_t *oip = karg;
overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
@@ -1695,6 +1809,42 @@ overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
overlay_prop_set_nodefault(phdl);
break;
+ case OVERLAY_DEV_P_DCID:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_nodefault(phdl);
+ overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ def = OVERLAY_VL2_CACHE_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, OVERLAY_VL2_CACHE_MIN,
+ OVERLAY_VL2_CACHE_MAX);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ def = OVERLAY_CACHE_A_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+ def = OVERLAY_ROUTE_CACHE_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, OVERLAY_ROUTE_CACHE_MIN,
+ OVERLAY_ROUTE_CACHE_MAX);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ def = OVERLAY_CACHE_A_DEF;
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, 0, OVERLAY_CACHE_MAX_A);
+ break;
default:
overlay_hold_rele(odd);
mac_perim_exit(mph);
@@ -1804,6 +1954,41 @@ overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
}
mutex_exit(&odd->odd_lock);
break;
+ case OVERLAY_DEV_P_DCID:
+ /*
+ * While it's read-only while inside of a mux, we're not in a
+ * context that can guarantee that. Therefore we always grab the
+ * overlay_dev_t's odd_lock.
+ */
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_vl2sz, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_vl2a, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_routesz, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_routea, oip->oip_value, sizeof (uint32_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint32_t);
+ break;
default:
ret = ENOENT;
}
@@ -1845,6 +2030,146 @@ overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
mutex_exit(&odd->odd_lock);
}
+/*
+ * Set the data center id (odd_dcid) of an overlay device.
+ *
+ * When the device is not in a mux the new value is simply stored under
+ * odd_lock. Otherwise the drop flag is raised and outstanding I/O is waited
+ * for, the device is taken out of its mux, updated, and put back.
+ */
+static void
+overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid)
+{
+	mutex_enter(&odd->odd_lock);
+
+	/* Simple case, not active */
+	if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+		odd->odd_dcid = dcid;
+		mutex_exit(&odd->odd_lock);
+		return;
+	}
+
+	/*
+	 * In the hard case, we need to set the drop flag, quiesce I/O and then
+	 * we can go ahead and do everything.
+	 */
+	odd->odd_flags |= OVERLAY_F_MDDROP;
+	overlay_io_wait(odd, OVERLAY_F_IOMASK);
+	mutex_exit(&odd->odd_lock);
+
+	overlay_mux_remove_dev(odd->odd_mux, odd);
+	mutex_enter(&odd->odd_lock);
+	odd->odd_dcid = dcid;
+	mutex_exit(&odd->odd_lock);
+	overlay_mux_add_dev(odd->odd_mux, odd);
+
+	/*
+	 * The device has just been re-added to the mux, so OVERLAY_F_IN_MUX
+	 * must remain set. What needs undoing is the drop flag raised above;
+	 * leaving it set would keep the device in its drop state after the
+	 * update has completed.
+	 */
+	mutex_enter(&odd->odd_lock);
+	ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+	odd->odd_flags &= ~OVERLAY_F_MDDROP;
+	mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * Change the size of the VL2 target cache. A size of 0 selects
+ * OVERLAY_VL2_CACHE_DEF. If the device has a dynamic target its cache is
+ * resized via qqcache_adjust_size(); odd_vl2sz is updated only when that
+ * succeeds or when there is no dynamic target to resize. Returns 0, or the
+ * value returned by qqcache_adjust_size().
+ */
+static int
+overlay_setprop_vl2_cachesz(overlay_dev_t *odd, uint32_t sz)
+{
+	overlay_target_t *tgt;
+	int err;
+
+	if (sz == 0)
+		sz = OVERLAY_VL2_CACHE_DEF;
+
+	/* Range checking is the caller's job; only verify it here. */
+	ASSERT3U(sz, >=, OVERLAY_VL2_CACHE_MIN);
+	ASSERT3U(sz, <=, OVERLAY_VL2_CACHE_MAX);
+
+	mutex_enter(&odd->odd_lock);
+	tgt = odd->odd_target;
+
+	/* ott_mode is RO if the target exists */
+	if (tgt == NULL || tgt->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		/* No live cache to resize; just remember the setting. */
+		odd->odd_vl2sz = sz;
+		mutex_exit(&odd->odd_lock);
+		return (0);
+	}
+
+	mutex_enter(&tgt->ott_lock);
+	err = qqcache_adjust_size(tgt->ott_u.ott_dyn.ott_dhash, sz);
+	mutex_exit(&tgt->ott_lock);
+
+	if (err == 0)
+		odd->odd_vl2sz = sz;
+	mutex_exit(&odd->odd_lock);
+
+	return (err);
+}
+
+/*
+ * Change the 2Q 'a' parameter (a percentage, at most OVERLAY_CACHE_MAX_A) of
+ * the VL2 target cache. If the device has a dynamic target its cache is
+ * adjusted via qqcache_adjust_a(); odd_vl2a is updated only when that
+ * succeeds or when there is no dynamic target to adjust. Returns 0, or the
+ * value returned by qqcache_adjust_a().
+ */
+static int
+overlay_setprop_vl2_cachea(overlay_dev_t *odd, uint32_t a)
+{
+	overlay_target_t *ott = NULL;
+	int ret = 0;
+
+	/* Caller should have validated this */
+	ASSERT3U(a, <=, OVERLAY_CACHE_MAX_A);
+
+	mutex_enter(&odd->odd_lock);
+	ott = odd->odd_target;
+
+	/* ott_mode is RO if the target exists */
+	if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+		mutex_enter(&ott->ott_lock);
+		ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_dhash, a);
+		mutex_exit(&ott->ott_lock);
+	}
+	if (ret == 0)
+		odd->odd_vl2a = a;
+	mutex_exit(&odd->odd_lock);
+
+	return (ret);
+}
+
+/*
+ * Change the size of the route (VL3) target cache. A size of 0 selects
+ * OVERLAY_ROUTE_CACHE_DEF. If the device has a dynamic target its cache is
+ * resized via qqcache_adjust_size(); odd_routesz is updated only when that
+ * succeeds or when there is no dynamic target to resize. Returns 0, or the
+ * value returned by qqcache_adjust_size().
+ */
+static int
+overlay_setprop_route_cachesz(overlay_dev_t *odd, uint32_t sz)
+{
+	overlay_target_t *tgt;
+	int err;
+
+	if (sz == 0)
+		sz = OVERLAY_ROUTE_CACHE_DEF;
+
+	ASSERT3U(sz, >=, OVERLAY_ROUTE_CACHE_MIN);
+	ASSERT3U(sz, <=, OVERLAY_ROUTE_CACHE_MAX);
+
+	mutex_enter(&odd->odd_lock);
+	tgt = odd->odd_target;
+
+	/* ott_mode is RO if the target exists */
+	if (tgt == NULL || tgt->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		/* No live cache to resize; just remember the setting. */
+		odd->odd_routesz = sz;
+		mutex_exit(&odd->odd_lock);
+		return (0);
+	}
+
+	mutex_enter(&tgt->ott_lock);
+	err = qqcache_adjust_size(tgt->ott_u.ott_dyn.ott_l3dhash, sz);
+	mutex_exit(&tgt->ott_lock);
+
+	if (err == 0)
+		odd->odd_routesz = sz;
+	mutex_exit(&odd->odd_lock);
+
+	return (err);
+}
+
+/*
+ * Change the 2Q 'a' parameter (a percentage, at most OVERLAY_CACHE_MAX_A) of
+ * the route (VL3) target cache. If the device has a dynamic target its cache
+ * is adjusted via qqcache_adjust_a(); odd_routea is updated only when that
+ * succeeds or when there is no dynamic target to adjust. Returns 0, or the
+ * value returned by qqcache_adjust_a().
+ */
+static int
+overlay_setprop_route_cachea(overlay_dev_t *odd, uint32_t a)
+{
+	overlay_target_t *ott = NULL;
+	int ret = 0;
+
+	/* Caller should have validated this */
+	ASSERT3U(a, <=, OVERLAY_CACHE_MAX_A);
+
+	mutex_enter(&odd->odd_lock);
+	ott = odd->odd_target;
+
+	/* ott_mode is RO if the target exists */
+	if (ott != NULL && ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+		mutex_enter(&ott->ott_lock);
+		ret = qqcache_adjust_a(ott->ott_u.ott_dyn.ott_l3dhash, a);
+		mutex_exit(&ott->ott_lock);
+	}
+	if (ret == 0)
+		odd->odd_routea = a;
+	mutex_exit(&odd->odd_lock);
+
+	return (ret);
+}
+
/* ARGSUSED */
static int
overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
@@ -1855,7 +2180,7 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
overlay_ioc_prop_t *oip = karg;
uint_t propid = UINT_MAX;
mac_perim_handle_t mph;
- uint64_t maxid, *vidp;
+ uint64_t maxid, *vidp, *dcidp, *vl2szp, *vl2ap, *routeszp, *routeap;
if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
return (EINVAL);
@@ -1865,31 +2190,48 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
return (ENOENT);
oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
- mac_perim_enter_by_mh(odd->odd_mh, &mph);
- mutex_enter(&odd->odd_lock);
- if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
- mac_perim_exit(mph);
- mutex_exit(&odd->odd_lock);
- return (ENOTSUP);
- }
- mutex_exit(&odd->odd_lock);
+
+ /*
+ * Currently, only certain overlay properties (and no encapsulation
+ * properties) can be changed while the overlay device is active.
+ */
if (oip->oip_id == -1) {
int i;
for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
break;
- if (i == OVERLAY_DEV_NPROPS) {
- ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
- odd->odd_pvoid, oip->oip_name,
- oip->oip_value, oip->oip_size);
- overlay_hold_rele(odd);
- mac_perim_exit(mph);
- return (ret);
- }
}
- propid = i;
+ if (i < OVERLAY_DEV_NPROPS)
+ propid = i;
+ } else if (oip->oip_id < OVERLAY_DEV_NPROPS) {
+ propid = oip->oip_id;
+ }
+
+ /*
+ * A bit tricky, but propid is initialized to UINT_MAX, so we know we
+ * have an overlay property whenever propid < OVERLAY_DEV_NPROPS,
+ * otherwise we have a plugin property.
+ */
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_ACTIVATED) &&
+ ((propid >= OVERLAY_DEV_NPROPS || !overlay_dev_liveprop[propid]))) {
+ mutex_exit(&odd->odd_lock);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_exit(&odd->odd_lock);
+
+ if (oip->oip_id == -1 && propid >= OVERLAY_DEV_NPROPS) {
+ ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+ odd->odd_pvoid, oip->oip_name,
+ oip->oip_value, oip->oip_size);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ret);
} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
@@ -1941,6 +2283,68 @@ overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
case OVERLAY_DEV_P_VARPDID:
ret = EPERM;
break;
+ case OVERLAY_DEV_P_DCID:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ dcidp = (uint64_t *)oip->oip_value;
+ if (*dcidp > UINT32_MAX) {
+ ret = EINVAL;
+ break;
+ }
+ overlay_setprop_dcid(odd, *dcidp);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_SIZE:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ vl2szp = (uint64_t *)oip->oip_value;
+ if (*vl2szp != 0 && (*vl2szp < OVERLAY_VL2_CACHE_MIN ||
+ *vl2szp > OVERLAY_VL2_CACHE_MAX)) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_vl2_cachesz(odd, *vl2szp);
+ break;
+ case OVERLAY_DEV_P_VL2_CACHE_A:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ vl2ap = (uint64_t *)oip->oip_value;
+ if (*vl2ap > OVERLAY_CACHE_MAX_A) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_vl2_cachea(odd, *vl2ap);
+ break;
+	case OVERLAY_DEV_P_ROUTE_CACHE_SIZE:
+		if (oip->oip_size != sizeof (uint64_t)) {
+			ret = EINVAL;
+			break;
+		}
+		routeszp = (uint64_t *)oip->oip_value;
+		/*
+		 * 0 is passed through (overlay_setprop_route_cachesz() maps
+		 * it to the default); any other value must lie within
+		 * [OVERLAY_ROUTE_CACHE_MIN, OVERLAY_ROUTE_CACHE_MAX].
+		 */
+		if (*routeszp != 0 && (*routeszp < OVERLAY_ROUTE_CACHE_MIN ||
+		    *routeszp > OVERLAY_ROUTE_CACHE_MAX)) {
+			ret = EINVAL;
+			break;
+		}
+		ret = overlay_setprop_route_cachesz(odd, *routeszp);
+		break;
+ case OVERLAY_DEV_P_ROUTE_CACHE_A:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ routeap = (uint64_t *)oip->oip_value;
+ if (*routeap > OVERLAY_CACHE_MAX_A) {
+ ret = EINVAL;
+ break;
+ }
+ ret = overlay_setprop_route_cachea(odd, *routeap);
+ break;
default:
ret = ENOENT;
}
diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c
index 0c21bb8689..2688d2791c 100644
--- a/usr/src/uts/common/io/overlay/overlay_mux.c
+++ b/usr/src/uts/common/io/overlay/overlay_mux.c
@@ -127,6 +127,18 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
freeb(fmp);
/*
+ * In cases of looped-back vxlan, that tends to have a
+ * prepended IP+UDP-only mblk, followed by the data. Parsing
+ * would've made that mblk a zero-length one (rptr == wptr).
+ */
+ if (mp->b_rptr == mp->b_wptr && mp->b_cont != NULL) {
+ /* Ended up with zero-length mblk, lose it! */
+ fmp = mp;
+ mp = fmp->b_cont;
+ freeb(fmp);
+ }
+
+ /*
* Until we have VXLAN-or-other-decap HW acceleration support
* (e.g. we support NICs that reach into VXLAN-encapsulated
* packets and check the inside-VXLAN IP packets' checksums,
@@ -161,10 +173,9 @@ overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
if (rem == blkl) {
fmp = mp;
mp = fmp->b_cont;
- fmp->b_cont = NULL;
OVERLAY_FREEMSG(mp,
"freed a fmp block");
- freemsg(fmp);
+ freeb(fmp);
}
}
if (mp == NULL) {
diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c
index f4147b56d1..171acd034f 100644
--- a/usr/src/uts/common/io/overlay/overlay_target.c
+++ b/usr/src/uts/common/io/overlay/overlay_target.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -20,6 +20,8 @@
* uts/common/io/overlay/overlay.c
*/
+#include <inet/ip.h>
+#include <inet/ip6.h>
#include <sys/types.h>
#include <sys/ethernet.h>
#include <sys/kmem.h>
@@ -42,6 +44,9 @@
#include <sys/overlay_impl.h>
#include <sys/sdt.h>
+#define OVERLAY_DROP(mp, reason) \
+ DTRACE_PROBE2(overlay__drop, mblk_t *, mp, char *, reason)
+
/*
* This is total straw man, but at least it's a prime number. Here we're
* going to have to go through and do a lot of evaluation and understanding as
@@ -52,6 +57,19 @@
#define OVERLAY_HSIZE 823
/*
+ * The default size of each target cache. This is also a complete strawman
+ * whose value could change as we gain better operational experience with
+ * overlay routing.
+ */
+#define OVERLAY_CACHE_SIZE 512
+
+/*
+ * A somewhat arbitrary value. The percentage of the target cache dedicated
+ * to MFU entries (i.e. entries that have been looked up more than once).
+ */
+#define OVERLAY_CACHE_A 60
+
+/*
* We use this data structure to keep track of what requests have been actively
* allocated to a given instance so we know what to put back on the pending
* list.
@@ -69,7 +87,7 @@ typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
-typedef struct overaly_target_ioctl {
+typedef struct overlay_target_ioctl {
int oti_cmd; /* ioctl id */
boolean_t oti_write; /* ioctl requires FWRITE */
boolean_t oti_ncopyout; /* copyout data? */
@@ -144,25 +162,60 @@ overlay_entry_cache_destructor(void *buf, void *arg)
static uint64_t
overlay_mac_hash(const void *v)
{
+ const overlay_target_mac_t *m = v;
+
uint32_t crc;
- CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
+ CRC32(crc, m->otm_mac, ETHERADDRL, -1U, crc32_table);
+ CRC32(crc, &m->otm_dcid, sizeof (uint32_t), crc, crc32_table);
return (crc);
}
static int
overlay_mac_cmp(const void *a, const void *b)
{
- return (bcmp(a, b, ETHERADDRL));
+ const overlay_target_mac_t *l = a;
+ const overlay_target_mac_t *r = b;
+
+ if (l->otm_dcid != r->otm_dcid)
+ return (1);
+ return (bcmp(l->otm_mac, r->otm_mac, ETHERADDRL) != 0);
+}
+
+/*
+ * Hash an overlay_target_vl3_t key: a CRC32 accumulated over the source
+ * address, the destination address, and the source VLAN, in that order.
+ */
+static uint64_t
+overlay_ip_hash(const void *v)
+{
+	const overlay_target_vl3_t *key = v;
+	uint32_t sum;
+
+	CRC32(sum, &key->otvl3_src, sizeof (key->otvl3_src), -1U, crc32_table);
+	CRC32(sum, &key->otvl3_dst, sizeof (key->otvl3_dst), sum, crc32_table);
+	CRC32(sum, &key->otvl3_src_vlan, sizeof (key->otvl3_src_vlan), sum,
+	    crc32_table);
+
+	return (sum);
+}
+
+/*
+ * Equality test for two overlay_target_vl3_t keys. Returns 0 when the source
+ * VLAN, source address, and destination address all match, and 1 otherwise.
+ */
+static int
+overlay_ip_cmp(const void *a, const void *b)
+{
+	const overlay_target_vl3_t *lhs = a;
+	const overlay_target_vl3_t *rhs = b;
+
+	if (lhs->otvl3_src_vlan == rhs->otvl3_src_vlan &&
+	    IN6_ARE_ADDR_EQUAL(&lhs->otvl3_src, &rhs->otvl3_src) &&
+	    IN6_ARE_ADDR_EQUAL(&lhs->otvl3_dst, &rhs->otvl3_dst))
+		return (0);
+
+	return (1);
}
-/* ARGSUSED */
static void
overlay_target_entry_dtor(void *arg)
{
overlay_target_entry_t *ote = arg;
ote->ote_flags = 0;
- bzero(ote->ote_addr, ETHERADDRL);
+ bzero(&ote->ote_u, sizeof (ote->ote_u));
ote->ote_ott = NULL;
ote->ote_odd = NULL;
freemsgchain(ote->ote_chead);
@@ -172,21 +225,76 @@ overlay_target_entry_dtor(void *arg)
kmem_cache_free(overlay_entry_cache, ote);
}
+/*
+ * Destructor for VL2 entries: unlink the entry from the target's VL2 AVL
+ * tree (ott_tree), then hand it to overlay_target_entry_dtor(). The target's
+ * ott_lock must be held, and the entry must not carry OVERLAY_ENTRY_F_VL3.
+ */
+static void
+overlay_target_entry_l2qq_dtor(void *arg)
+{
+	overlay_target_entry_t *entry = arg;
+	/* Fetch the target now; the common dtor clears ote_ott. */
+	overlay_target_t *target = entry->ote_ott;
+
+	ASSERT(MUTEX_HELD(&target->ott_lock));
+	ASSERT3S((entry->ote_flags & OVERLAY_ENTRY_F_VL3), ==, 0);
+
+	avl_remove(&target->ott_u.ott_dyn.ott_tree, entry);
+	overlay_target_entry_dtor(entry);
+}
+
+/*
+ * Destructor for VL3 entries: unlink the entry from the target's VL3 AVL
+ * tree (ott_l3tree), then hand it to overlay_target_entry_dtor(). The
+ * target's ott_lock must be held, and the entry must carry
+ * OVERLAY_ENTRY_F_VL3.
+ */
+static void
+overlay_target_entry_l3qq_dtor(void *arg)
+{
+	overlay_target_entry_t *entry = arg;
+	/* Fetch the target now; the common dtor clears ote_ott. */
+	overlay_target_t *target = entry->ote_ott;
+
+	ASSERT(MUTEX_HELD(&target->ott_lock));
+	ASSERT3S((entry->ote_flags & OVERLAY_ENTRY_F_VL3), ==,
+	    OVERLAY_ENTRY_F_VL3);
+
+	avl_remove(&target->ott_u.ott_dyn.ott_l3tree, entry);
+	overlay_target_entry_dtor(entry);
+}
+
static int
overlay_mac_avl(const void *a, const void *b)
{
+ const overlay_target_entry_t *le = a;
+ const overlay_target_entry_t *re = b;
+ const overlay_target_mac_t *lm = &le->ote_u.ote_vl2.otvl2_mac;
+ const overlay_target_mac_t *rm = &re->ote_u.ote_vl2.otvl2_mac;
int i;
- const overlay_target_entry_t *l, *r;
- l = a;
- r = b;
+
+ /* Order by DCID, then MAC */
+ if (lm->otm_dcid < rm->otm_dcid)
+ return (-1);
+ if (lm->otm_dcid > rm->otm_dcid)
+ return (1);
for (i = 0; i < ETHERADDRL; i++) {
- if (l->ote_addr[i] > r->ote_addr[i])
+ if (lm->otm_mac[i] > rm->otm_mac[i])
return (1);
- else if (l->ote_addr[i] < r->ote_addr[i])
+ else if (lm->otm_mac[i] < rm->otm_mac[i])
return (-1);
}
+ return (0);
+}
+/*
+ * AVL comparator for VL3 entries. Orders by source address, then destination
+ * address (both compared bytewise with memcmp()), then source VLAN. Returns
+ * -1, 0, or 1.
+ */
+static int
+overlay_ip_avl(const void *a, const void *b)
+{
+	const overlay_target_entry_t *lent = a;
+	const overlay_target_entry_t *rent = b;
+	const overlay_target_vl3_t *lk = &lent->ote_u.ote_vl3;
+	const overlay_target_vl3_t *rk = &rent->ote_u.ote_vl3;
+	int rc;
+
+	rc = memcmp(&lk->otvl3_src, &rk->otvl3_src, sizeof (lk->otvl3_src));
+	if (rc == 0) {
+		rc = memcmp(&lk->otvl3_dst, &rk->otvl3_dst,
+		    sizeof (lk->otvl3_dst));
+	}
+	/* memcmp() only promises a sign; normalize it to -1/1. */
+	if (rc != 0)
+		return (rc < 0 ? -1 : 1);
+
+	if (lk->otvl3_src_vlan != rk->otvl3_src_vlan)
+		return (lk->otvl3_src_vlan < rk->otvl3_src_vlan ? -1 : 1);
+
return (0);
}
@@ -233,25 +341,20 @@ overlay_target_free(overlay_dev_t *odd)
return;
if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
- refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
- avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
- overlay_target_entry_t *ote;
-
+ mutex_enter(&odd->odd_target->ott_lock);
/*
- * Our AVL tree and hashtable contain the same elements,
- * therefore we should just remove it from the tree, but then
- * delete the entries when we remove them from the hash table
- * (which happens through the refhash dtor).
+ * Our VL3 AVL tree and hashtable contain the same elements.
+ * Additionally, when an entry is removed from the 2Q cache,
+ * the entry is removed from the corresponding AVL tree.
+ * Deleting the 2Q cache will destroy any remaining entries,
+ * so all we need to do is destroy the 2Q caches.
*/
- while ((ote = avl_first(ap)) != NULL)
- avl_remove(ap, ote);
-
- avl_destroy(ap);
- for (ote = refhash_first(rp); ote != NULL;
- ote = refhash_next(rp, ote)) {
- refhash_remove(rp, ote);
- }
- refhash_destroy(rp);
+ qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_dhash);
+ qqcache_destroy(odd->odd_target->ott_u.ott_dyn.ott_l3dhash);
+ ASSERT(avl_is_empty(&odd->odd_target->ott_u.ott_dyn.ott_tree));
+ ASSERT(avl_is_empty(
+ &odd->odd_target->ott_u.ott_dyn.ott_l3tree));
+ mutex_exit(&odd->odd_target->ott_lock);
}
ASSERT(odd->odd_target->ott_ocount == 0);
@@ -270,18 +373,42 @@ overlay_target_busy()
return (ret);
}
+/*
+ * Queue the target entry on the list of varpd requests. entry should be
+ * refheld for the duration of this call (this call takes its own additional
+ * hold that is released when we receive a response).
+ */
static void
overlay_target_queue(overlay_target_entry_t *entry)
{
+ overlay_target_t *ott = entry->ote_ott;
+ boolean_t is_vl3 = B_FALSE;
+
+ /*
+ * ote_ott is read-only and set at entry creation, so it can be
+ * read without ote_lock held
+ */
+ ASSERT(!MUTEX_HELD(&entry->ote_lock));
+
+ mutex_enter(&entry->ote_lock);
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
+ mutex_exit(&entry->ote_lock);
+
mutex_enter(&overlay_target_lock);
- mutex_enter(&entry->ote_ott->ott_lock);
- if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ if (ott->ott_flags & OVERLAY_T_TEARDOWN) {
+ mutex_exit(&ott->ott_lock);
mutex_exit(&overlay_target_lock);
return;
}
- entry->ote_ott->ott_ocount++;
- mutex_exit(&entry->ote_ott->ott_lock);
+ ott->ott_ocount++;
+ if (is_vl3)
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ mutex_exit(&ott->ott_lock);
list_insert_tail(&overlay_target_list, entry);
cv_signal(&overlay_target_condvar);
mutex_exit(&overlay_target_lock);
@@ -300,22 +427,446 @@ overlay_target_quiesce(overlay_target_t *ott)
}
/*
- * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
+ * Write the VL3 src/dst IP from the packet in mp into src and dst. If the
+ * addresses are IPv4 addresses, they are written as mapped addresses.
+ *
+ * In the compiled (#if 1) branch only VLAN tagged frames are accepted. The
+ * IP header is read from directly after the ethernet header in the first
+ * mblk or, when the first mblk holds nothing beyond the ethernet header,
+ * from the start of the next mblk.
+ *
+ * Returns 0 on success. Returns EINVAL when the frame is not VLAN tagged, is
+ * neither IPv4 nor IPv6, or the mblk being examined is too short to hold the
+ * ethernet or IP header.
+ */
+static int
+overlay_get_vl3_ips(mblk_t *mp, struct in6_addr *src, struct in6_addr *dst)
+{
+	uint16_t sap;
+
+#if 1
+	/* Temporary until mblk helpers are integrated */
+	struct ether_vlan_header *eth = (struct ether_vlan_header *)mp->b_rptr;
+	ipha_t *iphp = (ipha_t *)(eth + 1);
+	ip6_t *ip6hp = (ip6_t *)(eth + 1);
+	size_t mlen = MBLKL(mp);
+
+	if (mlen < sizeof (struct ether_vlan_header))
+		return (EINVAL);
+	mlen -= sizeof (struct ether_vlan_header);
+
+	/* We currently don't support routing on untagged vlans */
+	if ((sap = ntohs(eth->ether_tpid)) != ETHERTYPE_VLAN)
+		return (EINVAL);
+
+	sap = ntohs(eth->ether_type);
+	if (mlen == 0) {
+		/* Nothing after the ethernet header; use the next mblk. */
+		if ((mp = mp->b_cont) == NULL)
+			return (EINVAL);
+		mlen = MBLKL(mp);
+		iphp = (ipha_t *)mp->b_rptr;
+		ip6hp = (ip6_t *)mp->b_rptr;
+	}
+
+	switch (sap) {
+	case ETHERTYPE_IP:
+		if (mlen < sizeof (ipha_t))
+			return (EINVAL);
+		ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV4_VERSION);
+		IN6_IPADDR_TO_V4MAPPED(iphp->ipha_src, src);
+		IN6_IPADDR_TO_V4MAPPED(iphp->ipha_dst, dst);
+		break;
+	case ETHERTYPE_IPV6:
+		if (mlen < sizeof (ip6_t))
+			return (EINVAL);
+		ASSERT3U(IPH_HDR_VERSION(iphp), ==, IPV6_VERSION);
+		bcopy(&ip6hp->ip6_src, src, sizeof (*src));
+		bcopy(&ip6hp->ip6_dst, dst, sizeof (*dst));
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	return (0);
+#else
+	size_t soff, doff;
+	uint32_t v4s, v4d;
+	int i;
+
+	if (!mblk_read_uint16(mp, offsetof(struct ether_header, ether_type),
+	    &sap))
+		return (EINVAL);
+
+	if (sap == ETHERTYPE_VLAN) {
+		if (!mblk_read_uint16(mp,
+		    offsetof(struct ether_vlan_header, ether_type), &sap))
+			return (EINVAL);
+		soff = doff = sizeof (struct ether_vlan_header);
+	} else {
+		soff = doff = sizeof (struct ether_header);
+	}
+
+	switch (sap) {
+	case ETHERTYPE_IP:
+		soff += offsetof(ipha_t, ipha_src);
+		doff += offsetof(ipha_t, ipha_dst);
+
+		if (!mblk_read_uint32(mp, soff, &v4s) ||
+		    !mblk_read_uint32(mp, doff, &v4d))
+			return (EINVAL);
+		/* Pass the address by value, as the #if 1 branch does. */
+		IN6_IPADDR_TO_V4MAPPED(v4s, src);
+		IN6_IPADDR_TO_V4MAPPED(v4d, dst);
+		break;
+	case ETHERTYPE_IPV6:
+		soff += offsetof(ip6_t, ip6_src);
+		doff += offsetof(ip6_t, ip6_dst);
+
+		for (i = 0; i < 4; i++) {
+			if (!mblk_read_uint32(mp, soff, &src->s6_addr32[i]) ||
+			    !mblk_read_uint32(mp, doff, &dst->s6_addr32[i]))
+				return (EINVAL);
+			soff += sizeof (uint32_t);
+			doff += sizeof (uint32_t);
+		}
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	return (0);
+#endif
+}
+
+/*
+ * Rewrite the ethernet header of mp for a routed packet: the VLAN id bits of
+ * the TCI are replaced with route->otr_vlan, the destination MAC is set to
+ * dst_mac->otm_mac and the source MAC to route->otr_srcmac.
+ *
+ * In the compiled (#if 1) branch, returns OVERLAY_TARGET_OK on success and
+ * OVERLAY_TARGET_DROP when the frame is not VLAN tagged or when the first
+ * mblk is shorter than a struct ether_vlan_header. odd is not used by that
+ * branch.
+ */
+static int
+overlay_route(overlay_dev_t *odd, mblk_t *mp,
+    const overlay_target_route_t *route, const overlay_target_mac_t *dst_mac)
+{
+	uint16_t tci;
+
+	if (MBLKL(mp) >= sizeof (struct ether_vlan_header)) {
+		struct ether_vlan_header *evh;
+
+		evh = (struct ether_vlan_header *)mp->b_rptr;
+		tci = ntohs(evh->ether_tci);
+
+		/*
+		 * Today we require all encapsulated frames to be vlan tagged.
+		 * If this is relaxed in the future, we will need to allow for
+		 * insertion and removal of the vlan tag as appropriate here.
+		 */
+		if (ntohs(evh->ether_tpid) != ETHERTYPE_VLAN) {
+			OVERLAY_DROP(mp, "not vlan tagged");
+			return (OVERLAY_TARGET_DROP);
+		}
+
+		tci &= ~(VLAN_ID_MASK);
+		tci |= route->otr_vlan;
+		evh->ether_tci = htons(tci);
+		bcopy(dst_mac->otm_mac, &evh->ether_dhost, ETHERADDRL);
+		bcopy(route->otr_srcmac, &evh->ether_shost, ETHERADDRL);
+		return (OVERLAY_TARGET_OK);
+	}
+
+#if 1
+	/* Temporary until mblk helpers are integrated */
+	OVERLAY_DROP(mp, "ethernet header split between mblks");
+	return (OVERLAY_TARGET_DROP);
+#else
+	/*
+	 * NOTE(review): this branch is not compiled and needs more work
+	 * before it can be enabled: evh is not in scope here, and the offset
+	 * used for the read and write below is that of ether_tpid while the
+	 * value being edited is the TCI. Confirm against the mblk helper API
+	 * once it is integrated.
+	 */
+	size_t off;
+
+	off = offsetof(struct ether_vlan_header, ether_tpid);
+	if (!mblk_read_uint16(mp, off, &tci)) {
+		OVERLAY_DROP(mp, "cannot read tpid");
+		return (OVERLAY_TARGET_DROP);
+	}
+
+	tci = ntohs(evh->ether_tci);
+	tci &= ~(VLAN_ID_MASK);
+	tci |= route->otr_vlan;
+
+	if (!mblk_write_uint16(mp, off, tci)) {
+		OVERLAY_DROP(mp, "cannot set routed destination vlan");
+		return (OVERLAY_TARGET_DROP);
+	}
+
+	for (int i = 0; i < ETHERADDRL; i++) {
+		if (!mblk_write_uint8(mp, i, dst_mac->otm_mac[i]) ||
+		    !mblk_write_uint8(mp, i + ETHERADDRL,
+		    route->otr_srcmac[i])) {
+			OVERLAY_DROP(mp, "cannot set routed macs");
+			return (OVERLAY_TARGET_DROP);
+		}
+	}
+
+	return (OVERLAY_TARGET_OK);
+#endif
+}
+
+/*
+ * Append mp to the packet queue of a target entry and, if needed, request a
+ * lookup for the entry.
+ *
+ * If adding mp would push the queued byte count past overlay_ent_size,
+ * nothing is queued and OVERLAY_TARGET_DROP is returned. Otherwise mp is
+ * linked onto the tail of the entry's chain and OVERLAY_TARGET_ASYNC is
+ * returned. When the entry is not already marked OVERLAY_ENTRY_F_PENDING, it
+ * is marked so and passed to overlay_target_queue().
+ *
+ * The caller must hold entry->ote_lock and a reference on entry. This is
+ * intended to be the final step in handling the packet for this entry, so
+ * ote_lock is always released before returning.
+ */
+static int
+overlay_target_try_queue(overlay_target_entry_t *entry, mblk_t *mp)
+{
+	size_t len = msgsize(mp);
+	boolean_t need_lookup;
+
+	ASSERT(MUTEX_HELD(&entry->ote_lock));
+
+	if (len + entry->ote_mbsize > overlay_ent_size) {
+		OVERLAY_DROP(mp, "target queue full");
+		mutex_exit(&entry->ote_lock);
+		return (OVERLAY_TARGET_DROP);
+	}
+
+	/* Link mp onto the end of the chain; it becomes the new tail. */
+	if (entry->ote_ctail == NULL) {
+		entry->ote_chead = mp;
+	} else {
+		ASSERT(entry->ote_ctail->b_next == NULL);
+		entry->ote_ctail->b_next = mp;
+	}
+	entry->ote_ctail = mp;
+	entry->ote_mbsize += len;
+
+	/* Only the first packet queued on an idle entry starts a lookup. */
+	if ((entry->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0) {
+		need_lookup = B_TRUE;
+		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+	} else {
+		need_lookup = B_FALSE;
+	}
+	mutex_exit(&entry->ote_lock);
+
+	/* overlay_target_queue() asserts that ote_lock is not held. */
+	if (need_lookup)
+		overlay_target_queue(entry);
+
+	return (OVERLAY_TARGET_ASYNC);
+}
+
+/*
+ * Given the VL3 IP->VL2 mac entry (vl3e), and the corresponding VL2 MAC->UL3
+ * entry (vl2e), if both entries are valid, sets *vidp, *v6, *slenp to the
+ * correct UL3 destination and return OVERLAY_TARGET_OK. If either of the
+ * entries are still pending lookups (or vl2e is NULL because the entry is
+ * missing), mp is queued (if there is space) on the appropriate entry and
+ * OVERLAY_TARGET_ASYNC is returned. If the VL2 entry is flagged to drop all
+ * packets, OVERLAY_TARGET_DROP is returned.
+ *
+ * In all cases, the caller should acquire vl3e->ote_lock prior to calling
+ * overlay_route_lookup_vl2(). Because vl2e can be missing (NULL), the caller
+ * should not acquire vl2e->ote_lock prior to calling
+ * overlay_route_lookup_vl2(). vl3e->ote_lock is always dropped prior to
+ * returning.
+ */
+static int
+overlay_route_lookup_vl2(overlay_target_entry_t *vl3e,
+ overlay_target_entry_t *vl2e, uint64_t *vidp, struct sockaddr_in6 *v6,
+ socklen_t *slenp, mblk_t *mp)
+{
+ overlay_target_vl2_t *vl2p;
+ int ret;
+
+ ASSERT(MUTEX_HELD(&vl3e->ote_lock));
+
+ if (vl2e == NULL) {
+ vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID;
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ mutex_enter(&vl2e->ote_lock);
+ if (vl2e->ote_flags & (OVERLAY_ENTRY_F_DROP | OVERLAY_ENTRY_F_ROUTER)) {
+ overlay_target_entry_flags_t flags = vl2e->ote_flags;
+
+ mutex_exit(&vl2e->ote_lock);
+ mutex_exit(&vl3e->ote_lock);
+
+ if (flags & OVERLAY_ENTRY_F_DROP) {
+ OVERLAY_DROP(mp, "VL2 target marked drop");
+ } else {
+ OVERLAY_DROP(mp, "VL2 target is overlay router");
+ }
+
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ /*
+ * If the route is missing queue on the VL3 entry so a VL3->UL3
+ * lookup is done (to get the route data).
+ */
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_HAS_ROUTE) == 0) {
+ mutex_exit(&vl2e->ote_lock);
+
+ vl3e->ote_flags &= ~OVERLAY_ENTRY_F_VALID;
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ /*
+ * If the VL2 target point is missing, we try to be a bit (though
+ * hopefully not too) clever. We can always queue on the VL3 entry
+ * which will trigger a VL3->UL3 lookup request (as it is effectively
+ * a superset of the VL2->UL3 lookup). However, if we know we already
+ * have an outstanding VL3->UL3 request, we queue on the VL2 entry and
+ * avoid doing another redundant lookup. We can also queue on the VL2
+ * entry when it is a local (same vnet, same DC) destination -- we
+ * currently cannot generate VL2->UL3 lookups for remote destinations,
+ * only same vnet, same DC. Queueing on the VL2 entry also allows
+ * instances on the same vlan as the queued VL2 entry to piggy back on
+ * the lookup request and avoid a redundant lookup. However if the
+ * VL2 entry is remote, we have to do a VL3->UL3 lookup.
+ */
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) {
+ if ((vl2e->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0 &&
+ vl2e->ote_u.ote_vl2.otvl2_mac.otm_dcid !=
+ vl2e->ote_odd->odd_dcid) {
+ mutex_exit(&vl2e->ote_lock);
+ /* This drops vl3e->ote_lock */
+ return (overlay_target_try_queue(vl3e, mp));
+ }
+
+ mutex_exit(&vl3e->ote_lock);
+ /* This drops vl2e->ote_lock */
+ return (overlay_target_try_queue(vl2e, mp));
+ }
+
+ ASSERT(vl2e->ote_flags & OVERLAY_ENTRY_F_VALID);
+
+ vl2p = &vl2e->ote_u.ote_vl2;
+
+ *vidp = vl2p->otvl2_route.otr_vnet;
+ bcopy(&vl2p->otvl2_dest.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(vl2p->otvl2_dest.otp_port);
+ *slenp = sizeof (struct sockaddr_in6);
+
+ ret = overlay_route(vl2e->ote_odd, mp, &vl2p->otvl2_route,
+ &vl2p->otvl2_mac);
+ mutex_exit(&vl2e->ote_lock);
+ mutex_exit(&vl3e->ote_lock);
+ return (ret);
+}
+
+static int
+overlay_route_lookup(overlay_dev_t *odd, mblk_t *mp, uint16_t vlan,
+ struct sockaddr *sock, socklen_t *slenp, uint64_t *vidp)
+{
+ overlay_target_t *ott = odd->odd_target;
+ overlay_target_entry_t *entry, *vl2_entry = NULL;
+ struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)sock;
+ overlay_target_vl3_t vl3 = { 0 };
+ int ret = OVERLAY_TARGET_DROP;
+
+ /*
+ * This function resolves the destination for mp using the VL3 cache
+ * (ott_l3dhash), keyed on the source vlan and the VL3 source and
+ * destination IPs read from mp. On a cache miss a new pending VL3
+ * entry holding mp is inserted and handed to overlay_target_queue(),
+ * and OVERLAY_TARGET_ASYNC is returned.
+ *
+ * overlay_target_lookup() should have set this
+ */
+ ASSERT3U(v6->sin6_family, ==, AF_INET6);
+
+ /* We should only be called for dynamic endpoints */
+ ASSERT3U(ott->ott_mode, ==, OVERLAY_TARGET_DYNAMIC);
+
+ vl3.otvl3_src_vlan = vlan;
+ if ((ret = overlay_get_vl3_ips(mp, &vl3.otvl3_src, &vl3.otvl3_dst))
+ != OVERLAY_TARGET_OK) {
+ OVERLAY_DROP(mp, "could not read VL3 src/dst IPs");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ mutex_enter(&ott->ott_lock);
+ entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_l3dhash, &vl3);
+ if (entry == NULL) {
+ if ((entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
+ mutex_exit(&ott->ott_lock);
+ OVERLAY_DROP(mp, "failed VL3 target entry allocation");
+ return (OVERLAY_TARGET_DROP);
+ }
+
+ bcopy(&vl3, &entry->ote_u.ote_vl3, sizeof (vl3));
+ entry->ote_flags = OVERLAY_ENTRY_F_VL3;
+
+ entry->ote_chead = entry->ote_ctail = mp;
+ entry->ote_mbsize = msgsize(mp);
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+
+ entry->ote_ott = ott;
+ entry->ote_odd = odd;
+
+ qqcache_insert(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_l3tree, entry);
+ mutex_exit(&ott->ott_lock);
+
+ overlay_target_queue(entry);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ mutex_exit(&ott->ott_lock);
+ return (OVERLAY_TARGET_ASYNC);
+ }
+ qqcache_hold(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ mutex_enter(&entry->ote_lock);
+
+ /*
+ * A bit ugly, but if we need the VL2 entry, we want to look it up
+ * while we still hold ott_lock. The VL2 entry is only needed when
+ * the VL3 entry is valid and is neither marked drop nor a router.
+ * A hold is taken on any VL2 entry found here; it is released at
+ * the end of this function.
+ */
+ if ((entry->ote_flags &
+ (OVERLAY_ENTRY_F_DROP|OVERLAY_ENTRY_F_ROUTER|
+ OVERLAY_ENTRY_F_VALID)) == OVERLAY_ENTRY_F_VALID) {
+ vl2_entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &entry->ote_u.ote_vl3.otvl3_vl2);
+ if (vl2_entry != NULL)
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, vl2_entry);
+ }
+ mutex_exit(&ott->ott_lock);
+
+ ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3);
+
+ if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL3 target entry marked drop");
+ ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ /*
+ * XXX: A packet with a dst IP of an overlay router.
+ * Maybe generate an ICMP reply? For now, we drop.
+ */
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL3 target entry is router");
+ ret = OVERLAY_TARGET_DROP;
+ } else if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) == 0) {
+ /* This drops entry->ote_lock */
+ ret = overlay_target_try_queue(entry, mp);
+ } else {
+ /* This drops entry->ote_lock */
+ ret = overlay_route_lookup_vl2(entry, vl2_entry, vidp, v6,
+ slenp, mp);
+ }
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ if (vl2_entry != NULL)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, vl2_entry);
+ mutex_exit(&ott->ott_lock);
+ return (ret);
+}
+
+/*
+ * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
* OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
- * this time, say for NVGRE, we drop all packets that mcuh this.
+ * this time, say for NVGRE, we drop all packets that match this.
*/
int
overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
- socklen_t *slenp)
+ socklen_t *slenp, uint64_t *vidp)
{
int ret;
struct sockaddr_in6 *v6;
overlay_target_t *ott;
- mac_header_info_t mhi;
overlay_target_entry_t *entry;
+ mac_header_info_t mhi;
+ overlay_target_mac_t omac;
ASSERT(odd->odd_target != NULL);
+ /* Default to our local vid, routing may change this if necessary */
+ *vidp = odd->odd_vid;
+
/*
* At this point, the overlay device is in a mux which means that it's
* been activated. At this point, parts of the target, such as the mode
@@ -323,8 +874,10 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
* about synchronization for them.
*/
ott = odd->odd_target;
- if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+ if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) {
+ OVERLAY_DROP(mp, "plugin doesn't support IP or port");
return (OVERLAY_TARGET_DROP);
+ }
v6 = (struct sockaddr_in6 *)sock;
bzero(v6, sizeof (struct sockaddr_in6));
@@ -343,76 +896,89 @@ overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
- /*
- * Note we only want the MAC address here, therefore we won't bother
- * using mac_vlan_header_info(). If any caller needs the vlan info at
- * this point, this should change to a call to mac_vlan_header_info().
- */
- if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+ if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0) {
+ OVERLAY_DROP(mp, "could not read vlan header");
return (OVERLAY_TARGET_DROP);
+ }
+
+ omac.otm_dcid = odd->odd_dcid;
+ bcopy(mhi.mhi_daddr, omac.otm_mac, ETHERADDRL);
+
mutex_enter(&ott->ott_lock);
- entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- mhi.mhi_daddr);
+ entry = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash, &omac);
if (entry == NULL) {
+ overlay_target_vl2_t *vl2p;
+
entry = kmem_cache_alloc(overlay_entry_cache,
KM_NOSLEEP | KM_NORMALPRI);
if (entry == NULL) {
mutex_exit(&ott->ott_lock);
+ OVERLAY_DROP(mp, "VL2 target entry allocation failed");
return (OVERLAY_TARGET_DROP);
}
- bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
+
+ vl2p = &entry->ote_u.ote_vl2;
+ bcopy(mhi.mhi_daddr, vl2p->otvl2_mac.otm_mac, ETHERADDRL);
+ vl2p->otvl2_mac.otm_dcid = odd->odd_dcid;
+ vl2p->otvl2_route.otr_vnet = odd->odd_vid;
+ vl2p->otvl2_route.otr_vlan = VLAN_ID(mhi.mhi_tci);
+
entry->ote_chead = entry->ote_ctail = mp;
entry->ote_mbsize = msgsize(mp);
entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+
entry->ote_ott = ott;
entry->ote_odd = odd;
- refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
mutex_exit(&ott->ott_lock);
+
overlay_target_queue(entry);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
return (OVERLAY_TARGET_ASYNC);
}
- refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
mutex_exit(&ott->ott_lock);
mutex_enter(&entry->ote_lock);
if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ mutex_exit(&entry->ote_lock);
+ OVERLAY_DROP(mp, "VL2 target marked drop");
ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ if (mhi.mhi_bindsap == ETHERTYPE_ARP) {
+ /*
+ * Send unicast ARP requests to varpd for processing.
+ * We will eventually need something similar for IPv6.
+ * This drops entry->ote_lock.
+ */
+ ret = overlay_target_try_queue(entry, mp);
+ } else {
+ mutex_exit(&entry->ote_lock);
+ ret = overlay_route_lookup(odd, mp,
+ VLAN_ID(mhi.mhi_tci), sock, slenp, vidp);
+ }
} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
- bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
- sizeof (struct in6_addr));
- v6->sin6_port = htons(entry->ote_dest.otp_port);
+ overlay_target_point_t *otp = &entry->ote_u.ote_vl2.otvl2_dest;
+
+ bcopy(&otp->otp_ip, &v6->sin6_addr, sizeof (struct in6_addr));
+ v6->sin6_port = htons(otp->otp_port);
+ mutex_exit(&entry->ote_lock);
+
*slenp = sizeof (struct sockaddr_in6);
ret = OVERLAY_TARGET_OK;
} else {
- size_t mlen = msgsize(mp);
-
- if (mlen + entry->ote_mbsize > overlay_ent_size) {
- ret = OVERLAY_TARGET_DROP;
- } else {
- if (entry->ote_ctail != NULL) {
- ASSERT(entry->ote_ctail->b_next ==
- NULL);
- entry->ote_ctail->b_next = mp;
- entry->ote_ctail = mp;
- } else {
- entry->ote_chead = mp;
- entry->ote_ctail = mp;
- }
- entry->ote_mbsize += mlen;
- if ((entry->ote_flags &
- OVERLAY_ENTRY_F_PENDING) == 0) {
- entry->ote_flags |=
- OVERLAY_ENTRY_F_PENDING;
- overlay_target_queue(entry);
- }
- ret = OVERLAY_TARGET_ASYNC;
- }
+ /* This drops entry->ote_lock */
+ ret = overlay_target_try_queue(entry, mp);
}
- mutex_exit(&entry->ote_lock);
mutex_enter(&ott->ott_lock);
- refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
mutex_exit(&ott->ott_lock);
return (ret);
@@ -437,6 +1003,7 @@ overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
if (odd->odd_flags & OVERLAY_F_ACTIVATED)
oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
oti->oti_vnetid = odd->odd_vid;
+ oti->oti_dcid = odd->odd_dcid;
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
return (0);
@@ -488,6 +1055,13 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
}
}
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_VARPD) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (EEXIST);
+ }
+
ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
ott->ott_flags = 0;
ott->ott_ocount = 0;
@@ -499,21 +1073,44 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
bcopy(&ota->ota_point, &ott->ott_u.ott_point,
sizeof (overlay_target_point_t));
} else {
- ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
+ int ret;
+
+ ret = qqcache_create(&ott->ott_u.ott_dyn.ott_dhash,
+ odd->odd_vl2sz, odd->odd_vl2a, OVERLAY_HSIZE,
overlay_mac_hash, overlay_mac_cmp,
- overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+ overlay_target_entry_l2qq_dtor,
+ sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_reflink),
- offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+ offsetof(overlay_target_entry_t, ote_u.ote_vl2.otvl2_mac),
+ KM_SLEEP);
+ if (ret != 0) {
+ mutex_exit(&odd->odd_lock);
+ kmem_cache_free(overlay_target_cache, ott);
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
+ ret = qqcache_create(&ott->ott_u.ott_dyn.ott_l3dhash,
+ odd->odd_routesz, odd->odd_routea, OVERLAY_HSIZE,
+ overlay_ip_hash, overlay_ip_cmp,
+ overlay_target_entry_l3qq_dtor,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_reflink),
+ offsetof(overlay_target_entry_t, ote_u.ote_vl3), KM_SLEEP);
+ if (ret != 0) {
+ mutex_exit(&odd->odd_lock);
+ qqcache_destroy(ott->ott_u.ott_dyn.ott_l3dhash);
+ kmem_cache_free(overlay_target_cache, ott);
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_avllink));
- }
- mutex_enter(&odd->odd_lock);
- if (odd->odd_flags & OVERLAY_F_VARPD) {
- mutex_exit(&odd->odd_lock);
- kmem_cache_free(overlay_target_cache, ott);
- overlay_hold_rele(odd);
- return (EEXIST);
+ avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_ip_avl,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_avllink));
}
odd->odd_flags |= OVERLAY_F_VARPD;
@@ -521,8 +1118,6 @@ overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
-
-
return (0);
}
@@ -601,7 +1196,16 @@ again:
entry = list_remove_head(&overlay_target_list);
mutex_exit(&overlay_target_lock);
mutex_enter(&entry->ote_lock);
- if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ /*
+ * Router entries may send lookups to varpd even when valid. For
+ * example, illumos systems will send unicast ARP queries to cached
+ * entries (including the router mac address). To answer those, we
+ * need to forward on the query to varpd. IPv6 will eventually
+ * need something similar for ND requests.
+ */
+ if ((entry->ote_flags &
+ (OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_ROUTER)) ==
+ OVERLAY_ENTRY_F_VALID) {
ASSERT(entry->ote_chead == NULL);
mutex_exit(&entry->ote_lock);
goto again;
@@ -637,10 +1241,23 @@ again:
otl->otl_hdrsize = mhi.mhi_hdrsize;
otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
- bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
- bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
- otl->otl_dsttype = mhi.mhi_dsttype;
- otl->otl_sap = mhi.mhi_bindsap;
+ if (entry->ote_flags & OVERLAY_ENTRY_F_VL3) {
+ overlay_targ_l3_t *l3p = &otl->otl_addru.otlu_l3;
+
+ otl->otl_l3req = B_TRUE;
+ bcopy(&entry->ote_u.ote_vl3.otvl3_src, &l3p->otl3_srcip,
+ sizeof (struct in6_addr));
+ bcopy(&entry->ote_u.ote_vl3.otvl3_dst, &l3p->otl3_dstip,
+ sizeof (struct in6_addr));
+ } else {
+ overlay_targ_l2_t *l2p = &otl->otl_addru.otlu_l2;
+
+ otl->otl_l3req = B_FALSE;
+ bcopy(mhi.mhi_daddr, l2p->otl2_dstaddr, ETHERADDRL);
+ bcopy(mhi.mhi_saddr, l2p->otl2_srcaddr, ETHERADDRL);
+ l2p->otl2_dsttype = mhi.mhi_dsttype;
+ l2p->otl2_sap = mhi.mhi_bindsap;
+ }
otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
mutex_exit(&entry->ote_lock);
@@ -651,12 +1268,128 @@ again:
return (0);
}
+static void
+overlay_target_lookup_respond_vl3(const overlay_targ_resp_t *otr,
+ overlay_target_entry_t *entry)
+{
+ overlay_target_entry_t *shared = NULL;
+ overlay_target_entry_t *vl2_entry;
+ overlay_target_t *ott = entry->ote_ott;
+ qqcache_t *mhash = ott->ott_u.ott_dyn.ott_dhash;
+ hrtime_t now = gethrtime();
+
+ ASSERT(MUTEX_HELD(&entry->ote_lock));
+ ASSERT(entry->ote_flags & OVERLAY_ENTRY_F_VL3);
+
+ /*
+ * A cross-{vlan,dc,vnet} packet with a destination VL3 of an overlay
+ * router IP. For now we drop these.
+ */
+ if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ entry->ote_flags |= OVERLAY_ENTRY_F_DROP;
+ return;
+ }
+
+ bcopy(&otr->otr_mac, &entry->ote_u.ote_vl3.otvl3_vl2,
+ sizeof (overlay_target_mac_t));
+
+ mutex_enter(&ott->ott_lock);
+ if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL)
+ qqcache_hold(mhash, shared);
+ mutex_exit(&ott->ott_lock);
+
+ /*
+ * Once we have the VL2 destination, we need to see if we already
+ * have an existing VL2 entry we can reuse. If not, we create a
+ * fully-formed (i.e. valid) VL2 entry that we add to the cache.
+ * Because ott_lock is dropped between the lookup above and the
+ * insert below, the cache is checked a second time; if a matching
+ * entry has appeared in the meantime, the newly allocated entry is
+ * freed and the existing one is used instead.
+ */
+ if (shared == NULL) {
+ vl2_entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI);
+ if (vl2_entry == NULL) {
+ /*
+ * If we can't allocate a VL2 entry for the VL3
+ * destination, we just give up for now and drain
+ * any queued packets. New packets will retry this
+ * allocation, so if the memory pressure lets up, we
+ * should recover.
+ */
+ freemsgchain(entry->ote_chead);
+ entry->ote_chead = entry->ote_ctail = NULL;
+ return;
+ }
+
+ vl2_entry->ote_ott = ott;
+ vl2_entry->ote_odd = entry->ote_odd;
+
+ bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ bcopy(&otr->otr_mac, &vl2_entry->ote_u.ote_vl2.otvl2_mac,
+ sizeof (overlay_target_mac_t));
+ bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route,
+ sizeof (overlay_target_route_t));
+ vl2_entry->ote_flags =
+ OVERLAY_ENTRY_F_HAS_ROUTE | OVERLAY_ENTRY_F_VALID;
+ vl2_entry->ote_vtime = entry->ote_vtime = now;
+
+ mutex_enter(&ott->ott_lock);
+ if ((shared = qqcache_lookup(mhash, &otr->otr_mac)) != NULL) {
+ overlay_target_entry_dtor(vl2_entry);
+ kmem_cache_free(overlay_entry_cache, vl2_entry);
+ qqcache_hold(mhash, shared);
+
+ vl2_entry = shared;
+ } else {
+ qqcache_insert(mhash, vl2_entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, vl2_entry);
+ qqcache_hold(mhash, vl2_entry);
+ }
+ mutex_exit(&ott->ott_lock);
+ } else {
+ vl2_entry = shared;
+ }
+
+ mutex_enter(&vl2_entry->ote_lock);
+ if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_HAS_ROUTE)) == 0) {
+ bcopy(&otr->otr_route, &vl2_entry->ote_u.ote_vl2.otvl2_route,
+ sizeof (overlay_target_route_t));
+ vl2_entry->ote_flags |= OVERLAY_ENTRY_F_HAS_ROUTE;
+ }
+
+ /*
+ * Update the VL2 entry if it doesn't have a valid destination, hasn't
+ * been marked as dropping all packets, and doesn't have an existing
+ * outstanding request. If a route and VL2 request involving the
+ * same VL2 destination are pending and the route response is processed
+ * prior to the VL2 request, we will continue to queue (on the VL2
+ * entry) until the VL2 response is received, even though we have
+ * an answer from the route response. If we set the valid flag
+ * while there's still outstanding requests, it will cause problems
+ * with the outstanding requests.
+ */
+ if ((vl2_entry->ote_flags & (OVERLAY_ENTRY_F_PENDING|
+ OVERLAY_ENTRY_F_VALID|OVERLAY_ENTRY_F_DROP)) == 0) {
+ bcopy(&otr->otr_answer, &vl2_entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ vl2_entry->ote_vtime = gethrtime();
+ vl2_entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ }
+ mutex_exit(&vl2_entry->ote_lock);
+
+ mutex_enter(&ott->ott_lock);
+ qqcache_rele(mhash, vl2_entry);
+ mutex_exit(&ott->ott_lock);
+}
+
static int
overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
{
const overlay_targ_resp_t *otr = arg;
+ overlay_target_t *ott;
overlay_target_entry_t *entry;
mblk_t *mp;
+ boolean_t is_vl3 = B_FALSE;
mutex_enter(&thdl->oth_lock);
for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
@@ -673,38 +1406,88 @@ overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&thdl->oth_lock);
mutex_enter(&entry->ote_lock);
- bcopy(&otr->otr_answer, &entry->ote_dest,
- sizeof (overlay_target_point_t));
+ ott = entry->ote_ott;
+
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
+
+ /*
+ * If we ever support a protocol that uses MAC addresses as the UL
+ * destination address, this check should probably include checking
+ * that otp_mac is also all zeros.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) &&
+ otr->otr_answer.otp_port == 0)
+ entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
+
+ if (!is_vl3) {
+ bcopy(&otr->otr_answer, &entry->ote_u.ote_vl2.otvl2_dest,
+ sizeof (overlay_target_point_t));
+ entry->ote_vtime = gethrtime();
+ } else {
+ overlay_target_lookup_respond_vl3(otr, entry);
+ }
+
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+
mp = entry->ote_chead;
entry->ote_chead = NULL;
entry->ote_ctail = NULL;
entry->ote_mbsize = 0;
- entry->ote_vtime = gethrtime();
mutex_exit(&entry->ote_lock);
/*
- * For now do an in-situ drain.
+ * For now do an in-situ drain. For VL3 entries, if we re-use
+ * an existing VL2 entry, it is possible the VL2 lookup is still
+ * pending (though should be rare). In such instances, the packets
+ * queued on the VL3 entry will get queued on the VL2 entry until
+ * the VL2 entry is resolved.
*/
mp = overlay_m_tx(entry->ote_odd, mp);
freemsgchain(mp);
- mutex_enter(&entry->ote_ott->ott_lock);
- entry->ote_ott->ott_ocount--;
- cv_signal(&entry->ote_ott->ott_cond);
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ ott->ott_ocount--;
+ if (is_vl3)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ cv_signal(&ott->ott_cond);
+ mutex_exit(&ott->ott_lock);
return (0);
}
+static boolean_t
+overlay_target_for_varpd(overlay_dev_t *odd, mblk_t *mp)
+{
+ mac_header_info_t mhi;
+
+ /*
+ * Reports whether mp is a packet meant for varpd itself; currently
+ * that is any packet whose bind SAP is ETHERTYPE_ARP.
+ *
+ * We should have dropped runts prior to ever queueing, so a failure
+ * to parse the header here is not expected (hence VERIFY0).
+ */
+ VERIFY0(mac_vlan_header_info(odd->odd_mh, mp, &mhi));
+ if (mhi.mhi_bindsap == ETHERTYPE_ARP)
+ return (B_TRUE);
+
+ /* TODO: NDP packets; until then everything but ARP returns B_FALSE */
+ return (B_FALSE);
+}
+
+typedef enum overlay_target_lookup_drop_act {
+ OTLDA_NONE, /* initial value in overlay_target_lookup_drop() */
+ OTLDA_QUEUE, /* packets remain; call overlay_target_queue() again */
+ OTLDA_DELETE /* VL3 entry: remove it from the VL3 qqcache */
+} overlay_target_lookup_drop_act_t;
+
static int
overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
{
const overlay_targ_resp_t *otr = arg;
+ overlay_target_t *ott;
overlay_target_entry_t *entry;
mblk_t *mp;
- boolean_t queue = B_FALSE;
+ overlay_target_lookup_drop_act_t action = OTLDA_NONE;
+ boolean_t is_vl3 = B_FALSE;
mutex_enter(&thdl->oth_lock);
for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
@@ -721,9 +1504,19 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&thdl->oth_lock);
mutex_enter(&entry->ote_lock);
+ ott = entry->ote_ott;
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VL3) != 0)
+ is_vl3 = B_TRUE;
- /* Safeguard against a confused varpd */
- if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ mp = entry->ote_chead;
+
+ /*
+ * Safeguard against a confused varpd. Packets specifically for
+ * varpd may receive replies (e.g. ARP replies) that require us to
+ * drop, even when the entry is valid.
+ */
+ if ((entry->ote_flags & OVERLAY_ENTRY_F_VALID) &&
+ !overlay_target_for_varpd(entry->ote_odd, mp)) {
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
DTRACE_PROBE1(overlay__target__valid__drop,
overlay_target_entry_t *, entry);
@@ -731,7 +1524,21 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
goto done;
}
- mp = entry->ote_chead;
+ /*
+ * If varpd is instructing us to drop the head mblk in a VL2 entry,
+ * this could be because it's already provided a response (e.g. an
+ * ARP reply), and the entry itself might still be used for other
+ * purposes. VL3 entries, on the other hand, have no such uses. If
+ * we are told to drop the packet, there is no reason to retain
+ * the VL3 entry and we can delete it.
+ */
+ if (is_vl3) {
+ action = OTLDA_DELETE;
+ mutex_exit(&entry->ote_lock);
+ goto done;
+ }
+
+ /* Drop the first packet in entry */
if (mp != NULL) {
entry->ote_chead = mp->b_next;
mp->b_next = NULL;
@@ -739,23 +1546,34 @@ overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
entry->ote_ctail = entry->ote_chead;
entry->ote_mbsize -= msgsize(mp);
}
+
if (entry->ote_chead != NULL) {
- queue = B_TRUE;
+ action = OTLDA_QUEUE;
entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
} else {
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
}
mutex_exit(&entry->ote_lock);
- if (queue == B_TRUE)
+ if (action == OTLDA_QUEUE)
overlay_target_queue(entry);
freemsg(mp);
done:
- mutex_enter(&entry->ote_ott->ott_lock);
- entry->ote_ott->ott_ocount--;
- cv_signal(&entry->ote_ott->ott_cond);
- mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_enter(&ott->ott_lock);
+ ott->ott_ocount--;
+ if (action == OTLDA_DELETE) {
+ /* overlay_target_entry_dtor() will free the mblk chain */
+ qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ }
+
+ if (is_vl3)
+ qqcache_rele(ott->ott_u.ott_dyn.ott_l3dhash, entry);
+ else
+ qqcache_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ cv_signal(&ott->ott_cond);
+ mutex_exit(&ott->ott_lock);
return (0);
}
@@ -1083,31 +1901,35 @@ overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
sizeof (overlay_target_point_t));
} else {
overlay_target_entry_t *ote;
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
- if (ote != NULL) {
- mutex_enter(&ote->ote_lock);
- if ((ote->ote_flags &
- OVERLAY_ENTRY_F_VALID_MASK) != 0) {
- if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
- otc->otc_entry.otce_flags =
- OVERLAY_TARGET_CACHE_DROP;
- } else {
- otc->otc_entry.otce_flags = 0;
- bcopy(&ote->ote_dest,
- &otc->otc_entry.otce_dest,
- sizeof (overlay_target_point_t));
- }
- ret = 0;
+
+ if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac)) == NULL) {
+ ret = ENOENT;
+ goto done;
+ }
+
+ mutex_enter(&ote->ote_lock);
+ if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+ if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_DROP;
+ } else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_ROUTER;
} else {
- ret = ENOENT;
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ote->ote_u.ote_vl2.otvl2_dest,
+ &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
}
- mutex_exit(&ote->ote_lock);
+ ret = 0;
} else {
ret = ENOENT;
}
+ mutex_exit(&ote->ote_lock);
}
+done:
mutex_exit(&ott->ott_lock);
overlay_hold_rele(odd);
@@ -1120,53 +1942,64 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
{
overlay_dev_t *odd;
overlay_target_t *ott;
- overlay_target_entry_t *ote;
+ overlay_target_entry_t *ote, *new = NULL;
overlay_targ_cache_t *otc = arg;
mblk_t *mp = NULL;
- if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+ if (otc->otc_entry.otce_flags &
+ ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
+ return (EINVAL);
+
+ if (otc->otc_entry.otce_flags ==
+ (OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
return (EINVAL);
odd = overlay_hold_by_dlid(otc->otc_linkid);
if (odd == NULL)
return (ENOENT);
+ /*
+ * Optimistically create the new entry. If not needed, we'll free it.
+ * We shouldn't be calling this ioctl rapidly enough that any potential
+ * alloc/free churn should cause a problem.
+ */
+ new = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
+ bcopy(&otc->otc_entry.otce_mac, &new->ote_u.ote_vl2.otvl2_mac,
+ sizeof (overlay_target_mac_t));
+
mutex_enter(&odd->odd_lock);
if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
+ overlay_target_entry_dtor(new);
return (ENXIO);
}
ott = odd->odd_target;
if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
+ overlay_target_entry_dtor(new);
return (ENOTSUP);
}
+
+ new->ote_ott = ott;
+ new->ote_odd = odd;
+
mutex_enter(&ott->ott_lock);
mutex_exit(&odd->odd_lock);
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
- if (ote == NULL) {
- ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
- bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
- ote->ote_chead = ote->ote_ctail = NULL;
- ote->ote_mbsize = 0;
- ote->ote_ott = ott;
- ote->ote_odd = odd;
- mutex_enter(&ote->ote_lock);
- refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
- avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
- } else {
- mutex_enter(&ote->ote_lock);
- }
+ if ((ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac)) == NULL)
+ ote = new;
+ mutex_enter(&ote->ote_lock);
if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
} else {
ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
- bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
+ if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER)
+ ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
+ bcopy(&otc->otc_entry.otce_dest, &ote->ote_u.ote_vl2.otvl2_dest,
sizeof (overlay_target_point_t));
mp = ote->ote_chead;
ote->ote_chead = NULL;
@@ -1175,6 +2008,10 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
ote->ote_vtime = gethrtime();
}
+ if (ote == new) {
+ qqcache_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
+ }
mutex_exit(&ote->ote_lock);
mutex_exit(&ott->ott_lock);
@@ -1185,6 +2022,9 @@ overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
overlay_hold_rele(odd);
+ if (ote != new)
+ overlay_target_entry_dtor(new);
+
return (0);
}
@@ -1217,8 +2057,11 @@ overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
mutex_enter(&ott->ott_lock);
mutex_exit(&odd->odd_lock);
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
+ if (otc->otc_entry.otce_mac.otm_dcid == 0)
+ otc->otc_entry.otce_mac.otm_dcid = odd->odd_dcid;
+
+ ote = qqcache_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ &otc->otc_entry.otce_mac);
if (ote != NULL) {
mutex_enter(&ote->ote_lock);
ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
@@ -1269,8 +2112,13 @@ overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
mutex_exit(&ote->ote_lock);
}
- ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
- otc->otc_entry.otce_mac);
+
+ avl = &ott->ott_u.ott_dyn.ott_l3tree;
+ for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
+ mutex_enter(&ote->ote_lock);
+ ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+ mutex_exit(&ote->ote_lock);
+ }
mutex_exit(&ott->ott_lock);
overlay_hold_rele(odd);
@@ -1304,9 +2152,10 @@ overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
}
typedef struct overlay_targ_cache_marker {
- uint8_t otcm_mac[ETHERADDRL];
+ overlay_target_mac_t otcm_mac;
uint16_t otcm_done;
-} overlay_targ_cache_marker_t;
+} overlay_targ_cache_marker_t __aligned(8);
+CTASSERT(sizeof (overlay_targ_cache_marker_t) == 2 * sizeof (uint64_t));
/* ARGSUSED */
static int
@@ -1356,7 +2205,7 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
if (ott->ott_mode == OVERLAY_TARGET_POINT) {
overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
- bzero(out->otce_mac, ETHERADDRL);
+ bzero(&out->otce_mac, sizeof (out->otce_mac));
out->otce_flags = 0;
bcopy(&ott->ott_u.ott_point, &out->otce_dest,
sizeof (overlay_target_point_t));
@@ -1365,7 +2214,9 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
}
avl = &ott->ott_u.ott_dyn.ott_tree;
- bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
+ lookup.ote_u.ote_vl2.otvl2_mac.otm_dcid = odd->odd_dcid;
+ bcopy(&mark->otcm_mac, &lookup.ote_u.ote_vl2.otvl2_mac,
+ sizeof (mark->otcm_mac));
ent = avl_find(avl, &lookup, &where);
/*
@@ -1390,19 +2241,21 @@ overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
mutex_exit(&ent->ote_lock);
continue;
}
- bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
+ bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &out->otce_mac,
+ sizeof (out->otce_mac));
out->otce_flags = 0;
if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
- bcopy(&ent->ote_dest, &out->otce_dest,
+ bcopy(&ent->ote_u.ote_vl2.otvl2_dest, &out->otce_dest,
sizeof (overlay_target_point_t));
written++;
mutex_exit(&ent->ote_lock);
}
if (ent != NULL) {
- bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
+ bcopy(&ent->ote_u.ote_vl2.otvl2_mac, &mark->otcm_mac,
+ sizeof (mark->otcm_mac));
} else {
mark->otcm_done = 1;
}
@@ -1432,6 +2285,122 @@ overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
return (0);
}
+/*
+ * Compute the network address of src/prefixlen and store it in dst. Each of
+ * the four 32-bit words of src is converted to host order, masked with the
+ * matching word of the prefix mask, and written back in network order.
+ * E.g. ::ffff:192.168.51.50/120 -> ::ffff:192.168.51.0
+ */
+static void
+overlay_in6_to_subnet(const struct in6_addr *src, struct in6_addr *dst,
+    uint8_t prefixlen)
+{
+	for (size_t word = 0; word < 4; word++) {
+		uint32_t host = ntohl(src->_S6_un._S6_u32[word]);
+
+		host &= IN6_MASK_FROM_PREFIX(word, prefixlen);
+		dst->_S6_un._S6_u32[word] = htonl(host);
+	}
+}
+
+/*
+ * Return the first entry in the VL3 AVL tree whose source IP lies within the
+ * source subnet described by otcne, or NULL if no entry matches. The caller
+ * must hold ott->ott_lock.
+ */
+static overlay_target_entry_t *
+overlay_target_cache_first_net(overlay_target_t *ott,
+    const overlay_targ_cache_net_entry_t *otcne)
+{
+	avl_tree_t *l3tree = &ott->ott_u.ott_dyn.ott_l3tree;
+	overlay_target_entry_t key = { 0 };
+	overlay_target_entry_t *found;
+	avl_index_t where = { 0 };
+
+	ASSERT(MUTEX_HELD(&ott->ott_lock));
+
+	/*
+	 * The lowest possible source address in a subnet is its network
+	 * address (e.g. 192.160.10.0 for a /24), so that is used as the
+	 * search key. It normally should not appear as an entry; when it is
+	 * absent we take the entry immediately after the spot where it would
+	 * have been. That entry is the first candidate for the subnet: if it
+	 * is not inside the subnet, nothing in the tree is.
+	 */
+	overlay_in6_to_subnet(&otcne->otcne_src, &key.ote_u.ote_vl3.otvl3_src,
+	    otcne->otcne_src_prefixlen);
+
+	found = avl_find(l3tree, &key, &where);
+	if (found == NULL)
+		found = avl_nearest(l3tree, where, AVL_AFTER);
+
+	if (found == NULL)
+		return (NULL);
+
+	if (IN6_ARE_PREFIXEDADDR_EQUAL(&otcne->otcne_src,
+	    &found->ote_u.ote_vl3.otvl3_src, otcne->otcne_src_prefixlen))
+		return (found);
+
+	return (NULL);
+}
+
+/*
+ * OVERLAY_TARG_CACHE_REMOVE_NET handler: remove every VL3 cache entry whose
+ * source address is in the requested source subnet, whose source VLAN equals
+ * the requested VLAN, and whose destination is in the requested destination
+ * subnet.
+ *
+ * Returns 0 on success, ENOENT if the link cannot be held, ENXIO if the
+ * device does not have OVERLAY_F_VARPD set, and ENOTSUP if the target is not
+ * in OVERLAY_TARGET_DYNAMIC mode.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_remove_net(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_cache_net_t *otcn = arg;
+	overlay_targ_cache_net_entry_t *net = &otcn->otcn_entry;
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_target_entry_t *cur, *next;
+	avl_tree_t *l3tree;
+	int err;
+
+	if ((odd = overlay_hold_by_dlid(otcn->otcn_linkid)) == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
+		err = ENXIO;
+		goto fail;
+	}
+
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		err = ENOTSUP;
+		goto fail;
+	}
+
+	/* The target lock is taken before the device lock is dropped. */
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+
+	l3tree = &ott->ott_u.ott_dyn.ott_l3tree;
+	cur = overlay_target_cache_first_net(ott, net);
+
+	while (cur != NULL && IN6_ARE_PREFIXEDADDR_EQUAL(&net->otcne_src,
+	    &cur->ote_u.ote_vl3.otvl3_src, net->otcne_src_prefixlen)) {
+		/*
+		 * Fetch the successor before cur is (possibly) handed to
+		 * qqcache_remove(), which may invoke the entry's dtor.
+		 */
+		next = AVL_NEXT(l3tree, cur);
+
+		/*
+		 * Entries are sorted by src ip, dst ip, src vlan. An entry
+		 * from this source subnet may therefore be on a different
+		 * VLAN, or go to a destination outside the subnet being
+		 * removed; such entries are left in place.
+		 */
+		if (cur->ote_u.ote_vl3.otvl3_src_vlan == net->otcne_vlan &&
+		    IN6_ARE_PREFIXEDADDR_EQUAL(&net->otcne_dst,
+		    &cur->ote_u.ote_vl3.otvl3_dst, net->otcne_dst_prefixlen)) {
+			qqcache_remove(ott->ott_u.ott_dyn.ott_l3dhash, cur);
+		}
+
+		cur = next;
+	}
+
+	mutex_exit(&ott->ott_lock);
+	overlay_hold_rele(odd);
+	return (0);
+
+fail:
+	mutex_exit(&odd->odd_lock);
+	overlay_hold_rele(odd);
+	return (err);
+}
+
+
static overlay_target_ioctl_t overlay_target_ioctab[] = {
{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
NULL, overlay_target_info,
@@ -1492,6 +2461,9 @@ static overlay_target_ioctl_t overlay_target_ioctab[] = {
overlay_target_cache_iter,
overlay_target_cache_iter_copyout,
sizeof (overlay_targ_cache_iter_t) },
+ { OVERLAY_TARG_CACHE_REMOVE_NET, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_remove_net,
+ NULL, sizeof (overlay_targ_cache_net_t) },
{ 0 }
};
diff --git a/usr/src/uts/common/qqcache/qqcache.c b/usr/src/uts/common/qqcache/qqcache.c
new file mode 100644
index 0000000000..ccd90c3814
--- /dev/null
+++ b/usr/src/uts/common/qqcache/qqcache.c
@@ -0,0 +1,444 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/null.h>
+#include <sys/types.h>
+#include <sys/qqcache.h>
+#include <sys/qqcache_impl.h>
+#include <sys/stddef.h>
+
+/*
+ * Currently, the non _KERNEL pieces are to support testing in usr/src/test.
+ */
+#ifdef _KERNEL
+#include <sys/kmem.h>
+#define ZALLOC kmem_zalloc
+#define FREE kmem_free
+#else
+#include <umem.h>
+#define ZALLOC umem_zalloc
+#define FREE umem_free
+#endif
+
+
+/*
+ * The *_overflow functions mimic the gcc/clang intrinsic functions. Once
+ * we are using a newer compiler version that includes these as intrinsics,
+ * these can be replaced with those versions.
+ */
+/*
+ * Store a + b (with unsigned wrap-around) in *sump. Returns 1 if the
+ * addition wrapped, 0 otherwise.
+ */
+static int
+uadd_overflow(const size_t a, const size_t b, size_t *sump)
+{
+	const size_t sum = a + b;
+
+	*sump = sum;
+	return ((sum < a || sum < b) ? 1 : 0);
+}
+
+/* Two operands that are both below this value cannot overflow a size_t. */
+#define	MUL_NO_OVERFLOW	((size_t)1 << (sizeof (size_t) * 4))
+
+/*
+ * Store a * b (with unsigned wrap-around) in *cp. Returns 1 if the
+ * multiplication overflowed, 0 otherwise.
+ */
+static int
+umul_overflow(const size_t a, const size_t b, size_t *cp)
+{
+	const int maybe_big = (a >= MUL_NO_OVERFLOW || b >= MUL_NO_OVERFLOW);
+
+	*cp = a * b;
+
+	/* The division is only needed (and only safe) past these guards. */
+	if (!maybe_big || a == 0 || b == 0)
+		return (0);
+
+	return ((SIZE_MAX / a < b) ? 1 : 0);
+}
+
+/*
+ * Calculate the capacity of each cache list from the total cache size sz and
+ * the percentage a of the cache dedicated to list 0. The QQCACHE_NUM_LISTS
+ * results are written to maxp[]. Callers in this file only pass a <= 100.
+ */
+static void
+qqcache_size_lists(size_t sz, size_t a, size_t *maxp)
+{
+	VERIFY3U(sz, >=, QQCACHE_NUM_LISTS);
+
+	/*
+	 * The general approach is to start with list 0 being sized as a% of
+	 * sz. However every other list must be able to hold at least one
+	 * entry unless a == 100 (i.e. 100%). If the straight percentage
+	 * leaves any of the remaining lists with zero entries, we give them
+	 * a size of 1, and then adjust list0's size accordingly so that the
+	 * sum of all list sizes == sz (this is mostly only a concern where
+	 * sz is small enough such that (100 - a)% of sz < QQCACHE_NUM_LISTS).
+	 *
+	 * a% of sz is computed as (sz / 100) * a + ((sz % 100) * a) / 100.
+	 * This yields exactly the same value as sz * a / 100, but the
+	 * intermediate products cannot wrap for a <= 100, however large sz
+	 * is.
+	 *
+	 * NOTE(review): with a == 100 the remaining lists are given a maximum
+	 * of 0, while qqcache_ripple() asserts that every list maximum is
+	 * > 0 -- confirm whether a == 100 is meant to be supported.
+	 */
+	size_t list0sz = (sz / 100) * a + ((sz % 100) * a) / 100;
+	size_t othersz = (sz - list0sz) / (QQCACHE_NUM_LISTS - 1);
+
+	if (list0sz == 0)
+		list0sz = 1;
+
+	if (othersz == 0 && a != 100)
+		othersz = 1;
+
+	/* Shrink list 0 if the minimums above pushed the total past sz. */
+	if (list0sz + othersz * (QQCACHE_NUM_LISTS - 1) > sz)
+		list0sz = sz - othersz * (QQCACHE_NUM_LISTS - 1);
+
+	maxp[0] = list0sz;
+	for (size_t i = 1; i < QQCACHE_NUM_LISTS; i++)
+		maxp[i] = othersz;
+}
+
+/*
+ * Allocate and initialize a cache and return it through *qp.
+ *
+ *	sz		total number of entries the cache lists may hold
+ *	a		percentage (0-100) of sz dedicated to list 0
+ *	buckets		number of hash buckets; must be non-zero
+ *	hash_fn,
+ *	cmp_fn		callbacks used to hash and compare entry tags
+ *	dtor_fn		callback invoked on an entry when it is deleted
+ *	elsize		element size handed to list_create()
+ *	link_off,
+ *	tag_off		offsets recorded in the cache as qqc_link_off and
+ *			qqc_tag_off
+ *	kmflags		flags handed to the allocator
+ *
+ * Returns 0 on success, EINVAL if a parameter is out of range or the
+ * allocation size would overflow, and ENOMEM if the allocation fails.
+ */
+int
+qqcache_create(qqcache_t **qp, size_t sz, size_t a, size_t buckets,
+    qqcache_hash_fn_t hash_fn, qqcache_cmp_fn_t cmp_fn,
+    qqcache_dtor_fn_t dtor_fn, size_t elsize, size_t link_off, size_t tag_off,
+    int kmflags)
+{
+	qqcache_t *qc;
+	size_t len = 0;
+
+	if (sz < QQCACHE_MIN_SIZE)
+		return (EINVAL);
+	if (a > 100)
+		return (EINVAL);
+	/*
+	 * Bucket indices are computed as hash % qqc_nbuckets, so a cache
+	 * with no buckets would divide by zero on its first lookup or
+	 * insert.
+	 */
+	if (buckets == 0)
+		return (EINVAL);
+
+	/* len = sizeof (*qc) + buckets * sizeof (qqcache_list_t), checked */
+	if (umul_overflow(sizeof (qqcache_list_t), buckets, &len))
+		return (EINVAL);
+	if (uadd_overflow(sizeof (*qc), len, &len))
+		return (EINVAL);
+
+	if ((qc = ZALLOC(len, kmflags)) == NULL)
+		return (ENOMEM);
+
+	qc->qqc_hash_fn = hash_fn;
+	qc->qqc_cmp_fn = cmp_fn;
+	qc->qqc_dtor_fn = dtor_fn;
+	qc->qqc_link_off = link_off;
+	qc->qqc_tag_off = tag_off;
+	qc->qqc_nbuckets = buckets;
+	qc->qqc_size = sz;
+	qc->qqc_a = a;
+
+	qqcache_size_lists(sz, a, qc->qqc_max);
+
+	for (size_t i = 0; i < buckets; i++) {
+		list_create(&qc->qqc_buckets[i].qqcl_list, elsize,
+		    offsetof(qqcache_link_t, qqln_hash_link));
+	}
+
+	for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) {
+		list_create(&qc->qqc_lists[i].qqcl_list, elsize,
+		    offsetof(qqcache_link_t, qqln_list_link));
+	}
+
+	*qp = qc;
+	return (0);
+}
+
+/*
+ * Tear down a cache. Every link is taken off the cache lists, then every
+ * link is taken off its hash bucket and the dtor callback is run on the
+ * owning object; finally the cache itself is freed. A NULL qc is a no-op.
+ */
+void
+qqcache_destroy(qqcache_t *qc)
+{
+	if (qc == NULL)
+		return;
+
+	/* Empty the cache lists; the links stay on their hash buckets. */
+	for (size_t i = 0; i < QQCACHE_NUM_LISTS; i++) {
+		list_t *lst = &qc->qqc_lists[i].qqcl_list;
+
+		while (list_remove_head(lst) != NULL)
+			continue;
+	}
+
+	for (size_t i = 0; i < qc->qqc_nbuckets; i++) {
+		list_t *lst = &qc->qqc_buckets[i].qqcl_list;
+		qqcache_link_t *lnk = list_remove_head(lst);
+
+		while (lnk != NULL) {
+			ASSERT0(lnk->qqln_refcnt);
+			qc->qqc_dtor_fn(link_to_obj(qc, lnk));
+			lnk = list_remove_head(lst);
+		}
+	}
+
+	/* If creation succeeded, this calculation cannot overflow */
+	FREE(qc, sizeof (*qc) + qc->qqc_nbuckets * sizeof (qqcache_list_t));
+}
+
+/*
+ * Removal of an entry is a two step process. qqcache_remove() removes the
+ * entry from the cache lists, and if a reference is held, sets the
+ * QQCACHE_F_DEAD flag. When there are no more references held on an entry,
+ * (either none are held at the time qqcache_remove() is called, or the last
+ * reference is removed via qqcache_rele()), qqcache_delete() is called, which
+ * removes the entry from its hash bucket and calls the entry's dtor function.
+ *
+ * The main reason for the two step process is largely simplicity. If the
+ * entry remains in the cache lists w/ the QQCACHE_F_DEAD flag set, it
+ * complicates keeping each cache within its size limits -- either the
+ * list size must reflect the number of non-dead entries (which could be
+ * confusing during troubleshooting), or as we push things down the list, we
+ * would need to skip/ignore dead entries. The hash buckets however don't
+ * have any size limits (to impose limits would require the hash function
+ * provided by the consumer to produce perfectly equal distribution of entries
+ * across all the hash buckets at all times). The only time we care about
+ * the QQCACHE_F_DEAD flag in the hash buckets is when trying to lookup a
+ * 'dead' value, so leaving the entries in there does not present the same
+ * issues as leaving them in the cache lists (while still providing a way to
+ * find refheld entries).
+ */
+/*
+ * Second step of removal: unlink lp from its hash bucket and run the dtor
+ * callback on the owning object. The link must already be off the cache
+ * lists.
+ */
+static void
+qqcache_delete(qqcache_t *qc, qqcache_link_t *lp)
+{
+	void *obj = link_to_obj(qc, lp);
+	void *tag = obj_to_tag(qc, obj);
+	uint_t idx = qc->qqc_hash_fn(tag) % qc->qqc_nbuckets;
+	qqcache_list_t *bucket = &qc->qqc_buckets[idx];
+
+	ASSERT3U(bucket->qqcl_len, >, 0);
+	ASSERT(!list_is_empty(&bucket->qqcl_list));
+	ASSERT(!list_link_active(&lp->qqln_list_link));
+	ASSERT(list_link_active(&lp->qqln_hash_link));
+
+	list_remove(&bucket->qqcl_list, lp);
+	bucket->qqcl_len--;
+	qc->qqc_dtor_fn(obj);
+}
+
+/*
+ * First step of removal: take op off its cache list. If references are
+ * still held the entry is only flagged QQCACHE_F_DEAD; otherwise it is
+ * deleted right away.
+ */
+void
+qqcache_remove(qqcache_t *qc, void *op)
+{
+	qqcache_link_t *link = obj_to_link(qc, op);
+	qqcache_list_t *owner = QQCACHE_LIST(qc, link);
+
+	ASSERT(!list_is_empty(&owner->qqcl_list));
+	ASSERT3U(owner->qqcl_len, >, 0);
+
+	list_remove(&owner->qqcl_list, link);
+	owner->qqcl_len--;
+
+	if (link->qqln_refcnt > 0) {
+		link->qqln_flags |= QQCACHE_F_DEAD;
+		return;
+	}
+
+	qqcache_delete(qc, link);
+}
+
+/* Take a reference on the cache entry op. */
+void
+qqcache_hold(qqcache_t *qc, void *op)
+{
+	qqcache_link_t *link = obj_to_link(qc, op);
+
+	link->qqln_refcnt++;
+}
+
+/*
+ * Drop a reference on the cache entry op. When the last reference goes away
+ * and the entry was flagged QQCACHE_F_DEAD, the entry is deleted.
+ */
+void
+qqcache_rele(qqcache_t *qc, void *op)
+{
+	qqcache_link_t *link = obj_to_link(qc, op);
+
+	VERIFY3U(link->qqln_refcnt, >, 0);
+
+	link->qqln_refcnt--;
+	if (link->qqln_refcnt != 0)
+		return;
+
+	if ((link->qqln_flags & QQCACHE_F_DEAD) != 0)
+		qqcache_delete(qc, link);
+}
+
+/*
+ * Search the hash bucket selected by tp for a live (not QQCACHE_F_DEAD)
+ * entry whose tag compares equal to tp. If lpp is not NULL, the bucket that
+ * was searched is returned through it whether or not a match is found.
+ * Returns the matching link, or NULL.
+ */
+static qqcache_link_t *
+qqcache_hash_lookup(qqcache_t *qc, const void *tp, qqcache_list_t **lpp)
+{
+	uint_t idx = qc->qqc_hash_fn(tp) % qc->qqc_nbuckets;
+	qqcache_list_t *bucket = &qc->qqc_buckets[idx];
+	list_t *chain = &bucket->qqcl_list;
+	qqcache_link_t *cur;
+
+	if (lpp != NULL)
+		*lpp = bucket;
+
+	for (cur = list_head(chain); cur != NULL;
+	    cur = list_next(chain, cur)) {
+		void *tag = obj_to_tag(qc, link_to_obj(qc, cur));
+
+		if (qc->qqc_cmp_fn(tag, tp) != 0)
+			continue;
+		if (cur->qqln_flags & QQCACHE_F_DEAD)
+			continue;
+
+		return (cur);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Starting at list 'listnum', move entries from the tail of cache list 'n'
+ * to the head of list 'n + 1' until each list is within its size limit.
+ * Excess entries on the tail of the last list are removed from the cache.
+ * If 'for_insert' is B_TRUE, list 'listnum' is additionally trimmed to at
+ * most 'max - 1' entries, so the caller has room to insert one entry on it.
+ */
+static void
+qqcache_ripple(qqcache_t *qc, uint_t listnum, boolean_t for_insert)
+{
+	VERIFY3U(listnum, <, QQCACHE_NUM_LISTS);
+
+	for (uint_t i = listnum; i < QQCACHE_NUM_LISTS; i++) {
+		qqcache_list_t *cur = &qc->qqc_lists[i];
+		size_t limit = qc->qqc_max[i];
+
+		ASSERT3U(limit, >, 0);
+
+		/* Reserve one slot on the list the caller will insert on. */
+		if (i == listnum && for_insert)
+			limit--;
+
+		while (cur->qqcl_len > limit) {
+			qqcache_link_t *victim = list_tail(&cur->qqcl_list);
+			qqcache_list_t *below;
+
+			if (i + 1 >= QQCACHE_NUM_LISTS) {
+				/* Last list: qqcache_remove() adjusts its length. */
+				qqcache_remove(qc, link_to_obj(qc, victim));
+				continue;
+			}
+
+			list_remove(&cur->qqcl_list, victim);
+			cur->qqcl_len--;
+
+			ASSERT3U(victim->qqln_listnum, ==, i);
+			victim->qqln_listnum++;
+
+			below = &qc->qqc_lists[i + 1];
+			list_insert_head(&below->qqcl_list, victim);
+			below->qqcl_len++;
+		}
+	}
+}
+
+/*
+ * Add obj to the cache: its link is initialized, room is made on list
+ * QQCACHE_INSERT_LIST, and the link is put on the tail of its hash bucket
+ * and the head of the insert list. Returns EEXIST (leaving obj untouched)
+ * if a live entry with the same tag is already present, 0 otherwise.
+ */
+int
+qqcache_insert(qqcache_t *qc, void *obj)
+{
+	qqcache_link_t *link = obj_to_link(qc, obj);
+	qqcache_list_t *ilist = &qc->qqc_lists[QQCACHE_INSERT_LIST];
+	qqcache_list_t *bucket;
+
+	if (qqcache_hash_lookup(qc, obj_to_tag(qc, obj), &bucket) != NULL)
+		return (EEXIST);
+
+	list_link_init(&link->qqln_hash_link);
+	list_link_init(&link->qqln_list_link);
+	link->qqln_refcnt = 0;
+	link->qqln_flags = 0;
+	link->qqln_listnum = QQCACHE_INSERT_LIST;
+
+	/* Make room first so the insert list stays within its limit. */
+	qqcache_ripple(qc, QQCACHE_INSERT_LIST, B_TRUE);
+
+	list_insert_tail(&bucket->qqcl_list, link);
+	bucket->qqcl_len++;
+
+	list_insert_head(&ilist->qqcl_list, link);
+	ilist->qqcl_len++;
+
+	return (0);
+}
+
+/*
+ * Find the live entry whose tag matches tp. A hit is moved to the head of
+ * the list preceding the one it was on (or to the head of list 0 if it was
+ * already there). Returns the entry, or NULL if there is no match.
+ */
+void *
+qqcache_lookup(qqcache_t *qc, const void *tp)
+{
+	qqcache_link_t *hit = qqcache_hash_lookup(qc, tp, NULL);
+	qqcache_list_t *from, *to;
+	uint_t tgtnum;
+
+	if (hit == NULL)
+		return (NULL);
+
+	from = QQCACHE_LIST(qc, hit);
+	list_remove(&from->qqcl_list, hit);
+	from->qqcl_len--;
+
+	if (hit->qqln_listnum > 0)
+		tgtnum = hit->qqln_listnum - 1;
+	else
+		tgtnum = 0;
+
+	/* Only a move to a different list can push that list over its limit. */
+	if (tgtnum != hit->qqln_listnum)
+		qqcache_ripple(qc, tgtnum, B_TRUE);
+
+	hit->qqln_listnum = tgtnum;
+	to = &qc->qqc_lists[tgtnum];
+	list_insert_head(&to->qqcl_list, hit);
+	to->qqcl_len++;
+
+	return (link_to_obj(qc, hit));
+}
+
+/*
+ * Change the total size of the cache to sz, recompute the per-list limits,
+ * and push out whatever no longer fits. Returns EINVAL if sz is below
+ * QQCACHE_MIN_SIZE, 0 otherwise.
+ */
+int
+qqcache_adjust_size(qqcache_t *qc, size_t sz)
+{
+	if (sz >= QQCACHE_MIN_SIZE) {
+		qc->qqc_size = sz;
+		qqcache_size_lists(sz, qc->qqc_a, qc->qqc_max);
+		qqcache_ripple(qc, 0, B_FALSE);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Change the percentage of the cache dedicated to list 0 to a, recompute the
+ * per-list limits, and push out whatever no longer fits. Returns EINVAL if a
+ * is greater than 100, 0 otherwise.
+ */
+int
+qqcache_adjust_a(qqcache_t *qc, size_t a)
+{
+	if (a <= 100) {
+		qc->qqc_a = a;
+		qqcache_size_lists(qc->qqc_size, a, qc->qqc_max);
+		qqcache_ripple(qc, 0, B_FALSE);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/* Return the total size currently configured for the cache. */
+size_t
+qqcache_size(const qqcache_t *qc)
+{
+	const size_t total = qc->qqc_size;
+
+	return (total);
+}
+
+/* Return the percentage of the cache currently dedicated to list 0. */
+size_t
+qqcache_a(const qqcache_t *qc)
+{
+	const size_t pct = qc->qqc_a;
+
+	return (pct);
+}
+
+/*
+ * Return the entry at the head of the lowest-numbered non-empty cache list,
+ * or NULL if every list is empty.
+ */
+void *
+qqcache_first(qqcache_t *qc)
+{
+	size_t i = 0;
+
+	/* Skip over the leading empty lists. */
+	while (i < QQCACHE_NUM_LISTS && qc->qqc_lists[i].qqcl_len == 0)
+		i++;
+
+	if (i == QQCACHE_NUM_LISTS)
+		return (NULL);
+
+	return (link_to_obj(qc, list_head(&qc->qqc_lists[i].qqcl_list)));
+}
+
+/*
+ * Return the entry that follows obj in cache-list order: the next entry on
+ * obj's own list, or else the head of the next non-empty higher-numbered
+ * list. Returns NULL when obj is the last entry.
+ */
+void *
+qqcache_next(qqcache_t *qc, void *obj)
+{
+	qqcache_link_t *link = obj_to_link(qc, obj);
+	qqcache_list_t *own = QQCACHE_LIST(qc, link);
+	qqcache_link_t *succ;
+	size_t i;
+
+	ASSERT3U(link->qqln_listnum, <, QQCACHE_NUM_LISTS);
+
+	succ = list_next(&own->qqcl_list, link);
+	if (succ != NULL)
+		return (link_to_obj(qc, succ));
+
+	/* obj was the tail of its list; look at the lists after it. */
+	i = link->qqln_listnum + 1;
+	while (i < QQCACHE_NUM_LISTS && qc->qqc_lists[i].qqcl_len == 0)
+		i++;
+
+	if (i >= QQCACHE_NUM_LISTS)
+		return (NULL);
+
+	return (link_to_obj(qc, list_head(&qc->qqc_lists[i].qqcl_list)));
+}
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 24fdd94c11..eaf06f476c 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -28,7 +28,7 @@
# Copyright 2017 Nexenta Systems, Inc.
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
# Copyright 2019 Peter Tribble.
-# Copyright 2015, Joyent, Inc. All rights reserved.
+# Copyright 2018 Joyent, Inc.
#
include $(SRC)/uts/Makefile.uts
@@ -491,6 +491,8 @@ CHKHDRS= \
ptem.h \
ptms.h \
ptyvar.h \
+ qqcache.h \
+ qqcache_impl.h \
raidioctl.h \
ramdisk.h \
random.h \
diff --git a/usr/src/uts/common/sys/ethernet.h b/usr/src/uts/common/sys/ethernet.h
index 5b9de2f2bf..f19912bfc3 100644
--- a/usr/src/uts/common/sys/ethernet.h
+++ b/usr/src/uts/common/sys/ethernet.h
@@ -23,6 +23,8 @@
*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018, Joyent, Inc.
*/
/*
@@ -139,6 +141,18 @@ struct ether_vlan_extinfo {
#define ether_copy(a, b) (bcopy((caddr_t)a, (caddr_t)b, 6))
#endif
+/*
+ * Ethernet is-zero check: evaluates to non-zero when all six bytes starting
+ * at 'a' are zero. The argument is parenthesized before it is cast, so
+ * pointer expressions such as (p + 2) are handled correctly. Note that 'a'
+ * is evaluated more than once, so it must not have side effects.
+ *
+ * On sparc/i386/amd64 the address is read as three shorts.
+ * NOTE(review): that presumably requires 'a' to be 2-byte aligned --
+ * confirm against callers.
+ */
+#if defined(__sparc) || defined(__i386) || defined(__amd64)
+#define	ether_is_zero(a) \
+	(((short *)(a))[0] == 0 && ((short *)(a))[1] == 0 && \
+	((short *)(a))[2] == 0)
+#else
+#define	ether_is_zero(a) \
+	(((uint8_t *)(a))[0] == 0 && ((uint8_t *)(a))[1] == 0 && \
+	((uint8_t *)(a))[2] == 0 && ((uint8_t *)(a))[3] == 0 && \
+	((uint8_t *)(a))[4] == 0 && ((uint8_t *)(a))[5] == 0)
+#endif
+
+
#ifdef _KERNEL
extern int localetheraddr(struct ether_addr *, struct ether_addr *);
extern char *ether_sprintf(struct ether_addr *);
diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h
index 12d0dbca51..90f1843282 100644
--- a/usr/src/uts/common/sys/overlay.h
+++ b/usr/src/uts/common/sys/overlay.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2018, Joyent, Inc.
*/
#ifndef _SYS_OVERLAY_H
@@ -40,7 +40,7 @@ extern "C" {
typedef struct overlay_ioc_create {
datalink_id_t oic_linkid;
- uint32_t oic_filler;
+ uint32_t oic_dcid;
uint64_t oic_vnetid;
char oic_encap[MAXLINKNAMELEN];
} overlay_ioc_create_t;
diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h
index d638096006..de682a0397 100644
--- a/usr/src/uts/common/sys/overlay_common.h
+++ b/usr/src/uts/common/sys/overlay_common.h
@@ -42,7 +42,8 @@ typedef enum overlay_prop_type {
OVERLAY_PROP_T_INT = 0x1, /* signed int */
OVERLAY_PROP_T_UINT, /* unsigned int */
OVERLAY_PROP_T_IP, /* sinaddr6 */
- OVERLAY_PROP_T_STRING /* OVERLAY_PROPS_SIZEMAX */
+ OVERLAY_PROP_T_STRING, /* OVERLAY_PROPS_SIZEMAX */
+ OVERLAY_PROP_T_ETHER /* 6-byte MAC address */
} overlay_prop_type_t;
typedef enum overlay_prop_prot {
diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h
index 7fb8b8da1d..28e80d6d58 100644
--- a/usr/src/uts/common/sys/overlay_impl.h
+++ b/usr/src/uts/common/sys/overlay_impl.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_OVERLAY_IMPL_H
@@ -29,9 +29,9 @@
#include <sys/avl.h>
#include <sys/ksocket.h>
#include <sys/socket.h>
-#include <sys/refhash.h>
#include <sys/ethernet.h>
#include <sys/list.h>
+#include <sys/qqcache.h>
#ifdef __cplusplus
extern "C" {
@@ -59,7 +59,7 @@ typedef struct overlay_mux {
int omux_domain; /* RO: socket domain */
int omux_family; /* RO: socket family */
int omux_protocol; /* RO: socket protocol */
- struct sockaddr *omux_addr; /* RO: socket address */
+ struct sockaddr *omux_addr; /* RO: socket address */
socklen_t omux_alen; /* RO: sockaddr len */
kmutex_t omux_lock; /* Protects everything below */
uint_t omux_count; /* Active instances */
@@ -81,8 +81,10 @@ typedef struct overlay_target {
union { /* ott_lock */
overlay_target_point_t ott_point;
struct overlay_target_dyn {
- refhash_t *ott_dhash;
+ qqcache_t *ott_dhash;
+ qqcache_t *ott_l3dhash;
avl_tree_t ott_tree;
+ avl_tree_t ott_l3tree;
} ott_dyn;
} ott_u;
} overlay_target_t;
@@ -117,6 +119,12 @@ typedef struct overlay_dev {
uint64_t odd_vid; /* RO if active else odd_lock */
avl_node_t odd_muxnode; /* managed by mux */
overlay_target_t *odd_target; /* See big theory statement */
+ uint32_t odd_dcid; /* RO if active else odd_lock */
+ uint32_t odd_vl2sz; /* protected by odd_lock */
+ uint32_t odd_vl2a; /* protected by odd_lock */
+ uint32_t odd_routesz; /* protected by odd_lock */
+ uint32_t odd_routea; /* protected by odd_lock */
+ uint8_t odd_macaddr[ETHERADDRL]; /* RO same as odd_dcid */
char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */
} overlay_dev_t;
@@ -124,25 +132,50 @@ typedef enum overlay_target_entry_flags {
OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */
OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */
OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */
- OVERLAY_ENTRY_F_VALID_MASK = 0x06
+ OVERLAY_ENTRY_F_ROUTER = 0x08, /* VL2 router entry */
+ OVERLAY_ENTRY_F_HAS_ROUTE = 0x10,
+ OVERLAY_ENTRY_F_VALID_MASK = 0x1e,
+ OVERLAY_ENTRY_F_VL3 = 0x20, /* Is VL3 entry */
} overlay_target_entry_flags_t;
-typedef struct overlay_target_entry {
+struct overlay_target_entry;
+typedef struct overlay_target_entry overlay_target_entry_t;
+
+/*
+ * For VL3 target entries, if we need to lock both the VL3 entry and the
+ * (possibly shared with multiple VL3 entries) VL2 entry, we must always
+ * take the VL3 lock prior to the VL2 entry lock.
+ */
+typedef struct overlay_target_vl3 {
+ struct in6_addr otvl3_src; /* source IP; matched against otcne_src */
+ struct in6_addr otvl3_dst; /* destination IP; matched against otcne_dst */
+ uint16_t otvl3_src_vlan; /* source VLAN; compared with otcne_vlan */
+ overlay_target_mac_t otvl3_vl2; /* presumably the VL2 entry's key -- TODO confirm */
+} overlay_target_vl3_t;
+
+typedef struct overlay_target_vl2 {
+ overlay_target_route_t otvl2_route; /* see overlay_target_route_t */
+ overlay_target_mac_t otvl2_mac; /* dcid + MAC; reported as otce_mac by the cache iterator */
+ overlay_target_point_t otvl2_dest; /* reported as otce_dest when the entry is valid */
+} overlay_target_vl2_t;
+
+struct overlay_target_entry {
kmutex_t ote_lock;
- refhash_link_t ote_reflink; /* hashtable link */
+ qqcache_link_t ote_reflink; /* hashtable link */
avl_node_t ote_avllink; /* iteration link */
list_node_t ote_qlink;
overlay_target_entry_flags_t ote_flags; /* RW: state flags */
- uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */
overlay_target_t *ote_ott; /* RO */
overlay_dev_t *ote_odd; /* RO */
- overlay_target_point_t ote_dest; /* RW: destination */
mblk_t *ote_chead; /* RW: blocked mb chain head */
mblk_t *ote_ctail; /* RW: blocked mb chain tail */
size_t ote_mbsize; /* RW: outstanding mblk size */
hrtime_t ote_vtime; /* RW: valid timestamp */
-} overlay_target_entry_t;
-
+ union {
+ overlay_target_vl2_t ote_vl2;
+ overlay_target_vl3_t ote_vl3;
+ } ote_u;
+};
#define OVERLAY_CTL "overlay"
@@ -186,7 +219,7 @@ extern void overlay_target_free(overlay_dev_t *);
#define OVERLAY_TARGET_DROP 1
#define OVERLAY_TARGET_ASYNC 2
extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *,
- socklen_t *);
+ socklen_t *, uint64_t *);
extern void overlay_target_quiesce(overlay_target_t *);
extern void overlay_target_fini(void);
diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h
index ae92ef3532..28c0559acb 100644
--- a/usr/src/uts/common/sys/overlay_target.h
+++ b/usr/src/uts/common/sys/overlay_target.h
@@ -10,7 +10,7 @@
*/
/*
- * Copyright (c) 2015 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
*/
#ifndef _OVERLAY_TARGET_H
@@ -29,11 +29,43 @@
extern "C" {
#endif
+/*
+ * The overlay_target_point_t structure represents the destination where
+ * encapsulated frames are sent. Currently supported virtualization protocols
+ * (i.e. vxlan) only use otp_ip and otp_port, but other methods might use
+ * a L2 address instead of an L3 address to represent a destination.
+ */
typedef struct overlay_target_point {
- uint8_t otp_mac[ETHERADDRL];
struct in6_addr otp_ip;
uint16_t otp_port;
-} overlay_target_point_t;
+ uint8_t otp_mac[ETHERADDRL];
+} overlay_target_point_t __aligned(8);
+
+/*
+ * An overlay_target_mac_t represents the overlay representation of a VL2 MAC
+ * address. With the advent of cross-DC routing, it is possible to have
+ * duplicate MAC addresses in different data centers, so the data center id
+ * is necessary to uniquely identify a MAC address.
+ *
+ * XXX: In hindsight, using a uint16_t for the DCID might have been nicer.
+ */
+typedef struct overlay_target_mac {
+ uint32_t otm_dcid; /* data center id */
+ uint8_t otm_mac[ETHERADDRL]; /* MAC address within that data center */
+} overlay_target_mac_t;
+
+/*
+ * The overlay_target_route_t represents the fields of the packet that
+ * have to be modified to deliver a packet to remote (routed) destinations.
+ * All three values are always populated when a packet is routed, even if
+ * some of the overlay_target_route_t values end up being the same as the
+ * original values in the packet being routed.
+ */
+typedef struct overlay_target_route {
+ uint64_t otr_vnet; /* presumably the virtual network id to use -- TODO confirm */
+ uint8_t otr_srcmac[ETHERADDRL]; /* presumably the source MAC to use -- TODO confirm */
+ uint16_t otr_vlan; /* presumably the VLAN to use -- TODO confirm */
+} overlay_target_route_t;
#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8))
@@ -52,6 +84,7 @@ typedef struct overlay_targ_info {
uint32_t oti_needs;
uint64_t oti_flags;
uint64_t oti_vnetid;
+ uint32_t oti_dcid;
} overlay_targ_info_t;
/*
@@ -134,7 +167,7 @@ typedef struct overlay_targ_id {
*
* This ioctl can be used to copy data from a given request into a
* user buffer. This can be used in combination with
- * OVERLAY_TARG_INJECT to implemnt services such as a proxy-arp.
+ * OVERLAY_TARG_INJECT to implement services such as a proxy-arp.
*
*
* OVERLAY_TARG_RESEND - overlay_targ_pkt_t
@@ -152,6 +185,18 @@ typedef struct overlay_targ_id {
#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14)
#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15)
+/* L2 addressing of a lookup; member otlu_l2 of overlay_targ_lookup_t. */
+typedef struct overlay_targ_l2 {
+ uint8_t otl2_srcaddr[ETHERADDRL]; /* presumably the source MAC -- TODO confirm */
+ uint8_t otl2_dstaddr[ETHERADDRL]; /* presumably the destination MAC -- TODO confirm */
+ uint32_t otl2_dsttype;
+ uint32_t otl2_sap;
+} overlay_targ_l2_t;
+
+/* L3 addressing of a lookup; member otlu_l3 of overlay_targ_lookup_t. */
+typedef struct overlay_targ_l3 {
+ struct in6_addr otl3_srcip; /* presumably the source IP -- TODO confirm */
+ struct in6_addr otl3_dstip; /* presumably the destination IP -- TODO confirm */
+} overlay_targ_l3_t;
+
typedef struct overlay_targ_lookup {
uint64_t otl_dlid;
uint64_t otl_reqid;
@@ -159,16 +204,20 @@ typedef struct overlay_targ_lookup {
uint64_t otl_vnetid;
uint64_t otl_hdrsize;
uint64_t otl_pktsize;
- uint8_t otl_srcaddr[ETHERADDRL];
- uint8_t otl_dstaddr[ETHERADDRL];
- uint32_t otl_dsttype;
- uint32_t otl_sap;
+ union {
+ overlay_targ_l2_t otlu_l2;
+ overlay_targ_l3_t otlu_l3;
+ } otl_addru;
int32_t otl_vlan;
+ boolean_t otl_l3req;
} overlay_targ_lookup_t;
+
typedef struct overlay_targ_resp {
- uint64_t otr_reqid;
- overlay_target_point_t otr_answer;
+ uint64_t otr_reqid;
+ overlay_target_route_t otr_route; /* Ignored for VL2->UL3 requests */
+ overlay_target_mac_t otr_mac; /* Ignored for VL2->UL3 requests */
+ overlay_target_point_t otr_answer;
} overlay_targ_resp_t;
typedef struct overlay_targ_pkt {
@@ -255,6 +304,7 @@ typedef struct overlay_targ_list {
#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32)
#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33)
#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34)
+#define OVERLAY_TARG_CACHE_REMOVE_NET (OVERLAY_TARG_IOCTL | 0x35)
/*
* This is a pretty arbitrary number that we're constraining ourselves to
@@ -265,22 +315,36 @@ typedef struct overlay_targ_list {
#define OVERLAY_TARGET_ITER_MAX 500
#define OVERLAY_TARGET_CACHE_DROP 0x01
+#define OVERLAY_TARGET_CACHE_ROUTER 0x02
typedef struct overlay_targ_cache_entry {
- uint8_t otce_mac[ETHERADDRL];
+ overlay_target_mac_t otce_mac;
uint16_t otce_flags;
overlay_target_point_t otce_dest;
} overlay_targ_cache_entry_t;
+/*
+ * Describes the VL3 cache entries acted on by OVERLAY_TARG_CACHE_REMOVE_NET:
+ * a source subnet, a destination subnet and a source VLAN.
+ */
+typedef struct overlay_targ_cache_net_entry {
+ struct in6_addr otcne_src; /* address within the source subnet */
+ struct in6_addr otcne_dst; /* address within the destination subnet */
+ uint16_t otcne_vlan; /* src vlan */
+ uint8_t otcne_src_prefixlen; /* prefix length applied to otcne_src */
+ uint8_t otcne_dst_prefixlen; /* prefix length applied to otcne_dst */
+} overlay_targ_cache_net_entry_t;
+
typedef struct overlay_targ_cache {
datalink_id_t otc_linkid;
overlay_targ_cache_entry_t otc_entry;
} overlay_targ_cache_t;
+/* Argument of the OVERLAY_TARG_CACHE_REMOVE_NET ioctl. */
+typedef struct overlay_targ_cache_net {
+ datalink_id_t otcn_linkid; /* datalink id of the overlay device to operate on */
+ overlay_targ_cache_net_entry_t otcn_entry; /* subnets/VLAN to remove */
+} overlay_targ_cache_net_t;
+
typedef struct overlay_targ_cache_iter {
datalink_id_t otci_linkid;
uint32_t otci_pad;
- uint64_t otci_marker;
+ uint64_t otci_marker[2];
uint16_t otci_count;
uint8_t otci_pad2[3];
overlay_targ_cache_entry_t otci_ents[];
diff --git a/usr/src/uts/common/sys/qqcache.h b/usr/src/uts/common/sys/qqcache.h
new file mode 100644
index 0000000000..a2244338dd
--- /dev/null
+++ b/usr/src/uts/common/sys/qqcache.h
@@ -0,0 +1,176 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+#ifndef _QQCACHE_H
+#define _QQCACHE_H
+
+#include <sys/list.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This implements a fixed-size hash table that uses the 2Q algorithm
+ * from Johnson and Shasha to manage the contents of the entries.
+ *
+ * Briefly, there are two fixed-size lists (0 and 1). New entries are
+ * added to the head of list 1, and upon subsequent access (lookup), are
+ * moved to the head of list 0. Entries that fall off the end of list 0
+ * are pushed onto the head of list 1, and entries that fall off the end
+ * of list 1 are deleted. The percentage of the total size of the cache
+ * for each list is determined by the parameter 'a', which is a percentage
+ * (0-100) of the cache size that is dedicated to list 0.
+ *
+ * This implementation does generalize this algorithm somewhat to an
+ * arbitrary number of lists (instead of just 2) via the QQCACHE_NUM_LISTS
+ * and QQCACHE_INSERT_LIST preprocessor symbols (defined in
+ * sys/qqcache_impl.h). New entries are added to list QQCACHE_INSERT_LIST
+ * and as each list gets full, the oldest entry in each list is pushed to
+ * the head of the succeeding list, and the oldest entries of the last
+ * list are removed from the cache (so no list ever has more entries than
+ * its maximum size).
+ *
+ * The API itself is very similar to that of refhash. A qqcache_link_t struct
+ * is embedded within the definition of the entries that are being stored in
+ * a given qqcache_t. Functions to hash/compare the tag (key) value of an
+ * entry, as well as to destroy an entry, are supplied when the cache is
+ * created. Lookups then occur by passing a pointer to the key value
+ * being looked up.
+ *
+ * NOTE: As one can take references to entries in the cache via the
+ * qqcache_hold() function, refheld entries that are marked for deletion are
+ * not counted when tracking the cache size, and their dtor function is not
+ * called until the last reference has been released (by calling the
+ * qqcache_rele() function).
+ */
+
+typedef enum qqcache_flag {
+ QQCACHE_F_DEAD = 0x01,
+} qqcache_flag_t;
+
+typedef struct qqcache_link {
+ list_node_t qqln_hash_link; /* Hash chain bucket */
+ list_node_t qqln_list_link; /* Cache list link */
+ uint_t qqln_listnum;
+ uint_t qqln_refcnt;
+ qqcache_flag_t qqln_flags;
+} qqcache_link_t;
+
+struct qqcache;
+typedef struct qqcache qqcache_t;
+
+typedef uint64_t (*qqcache_hash_fn_t)(const void *);
+typedef int (*qqcache_cmp_fn_t)(const void *, const void *);
+typedef void (*qqcache_dtor_fn_t)(void *);
+
+/*
+ * qqcache_create(qcp, sz, a, buckets, hash_fn, cmp_fn, dtor_fn,
+ * elsize, link_off, tag_off, flags);
+ *
+ * Creates a new 2Q cache:
+ *
+ * qqcache_t **qcp A pointer to the pointer that will hold the new
+ * cache.
+ *
+ * size_t sz The size of the cache (in entries).
+ *
+ * size_t a The percentage (0-100) of the cache dedicated to
+ * MRU entries (list 0).
+ *
+ * size_t buckets The number of hash buckets in the cache.
+ *
+ * qqcache_hash_fn_t hash_fn The function used to create a
+ * hash value for a given entry's tag
+ * value.
+ *
+ * qqcache_cmp_fn_t cmp_fn The function used to compare the two
+ * tag values of two entries. The function
+ * should return '0' if the two entries
+ * are equal, '1' if they are not equal.
+ *
+ * qqcache_dtor_fn_t dtor_fn The function used to destroy/free
+ * entries.
+ *
+ * size_t elsize The size of each entry.
+ *
+ * size_t link_off The offset of the qqcache_link_t struct in the entry.
+ *
+ * size_t tag_off The offset in the entry of the tag value (used for
+ * hashing and comparison).
+ *
+ * int flags The flags passed to kmem_zalloc/umem_zalloc.
+ *
+ * Returns:
+ * 0 Success
+ * EINVAL A parameter was not valid
+ * ENOMEM The memory allocation failed (only possible when
+ * KM_NOSLEEP/UMEM_DEFAULT is passed to flags).
+ */
+extern int qqcache_create(qqcache_t **, size_t, size_t, size_t,
+ qqcache_hash_fn_t, qqcache_cmp_fn_t, qqcache_dtor_fn_t,
+ size_t, size_t, size_t, int);
+
+/* Destroy the given qqcache_t */
+extern void qqcache_destroy(qqcache_t *);
+
+/*
+ * qqcache_insert(qc, obj)
+ *
+ * qqcache_t *qc The cache to insert the item into.
+ *
+ * void *obj The object to add.
+ *
+ * Returns:
+ * 0 Success
+ * EEXIST The same entry (as determined by the cache cmp function) already
+ * exists in the cache.
+ */
+extern int qqcache_insert(qqcache_t *, void *);
+
+/* Lookup an entry with the given tag/key, or return NULL if not found */
+extern void *qqcache_lookup(qqcache_t *, const void *);
+
+/* Remove the given entry from the cache */
+extern void qqcache_remove(qqcache_t *, void *);
+
+/* Add a hold on the entry in the cache */
+extern void qqcache_hold(qqcache_t *, void *);
+
+/* Release the hold on the entry in the cache */
+extern void qqcache_rele(qqcache_t *, void *);
+
+/*
+ * Adjust the size of the cache, or the percentage of it dedicated to list 0.
+ * If new values are smaller than current values, entries may be evicted as
+ * necessary to reduce the size of the cache to the given size.
+ */
+extern int qqcache_adjust_size(qqcache_t *, size_t);
+extern int qqcache_adjust_a(qqcache_t *, size_t);
+
+/* Return the current values of size or a. */
+extern size_t qqcache_size(const qqcache_t *);
+extern size_t qqcache_a(const qqcache_t *);
+
+/* Iterate through entries. */
+extern void *qqcache_first(qqcache_t *);
+extern void *qqcache_next(qqcache_t *, void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _QQCACHE_H */
diff --git a/usr/src/uts/common/sys/qqcache_impl.h b/usr/src/uts/common/sys/qqcache_impl.h
new file mode 100644
index 0000000000..f709b74d6c
--- /dev/null
+++ b/usr/src/uts/common/sys/qqcache_impl.h
@@ -0,0 +1,72 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018, Joyent, Inc.
+ */
+
+#ifndef _QQCACHE_IMPL_H
+#define _QQCACHE_IMPL_H
+
+#include <sys/debug.h>
+#include <sys/qqcache.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QQCACHE_NUM_LISTS 2
+#define QQCACHE_INSERT_LIST 1
+#define QQCACHE_MIN_SIZE 10
+
+CTASSERT(QQCACHE_INSERT_LIST < QQCACHE_NUM_LISTS);
+CTASSERT(QQCACHE_NUM_LISTS >= 2);
+
+typedef struct qqcache_list {
+ list_t qqcl_list;
+ size_t qqcl_len;
+} qqcache_list_t;
+
+struct qqcache {
+ qqcache_hash_fn_t qqc_hash_fn;
+ qqcache_cmp_fn_t qqc_cmp_fn;
+ qqcache_dtor_fn_t qqc_dtor_fn;
+ size_t qqc_link_off;
+ size_t qqc_tag_off;
+ size_t qqc_nbuckets;
+ size_t qqc_size;
+ size_t qqc_a;
+ size_t qqc_max[QQCACHE_NUM_LISTS];
+ qqcache_list_t qqc_lists[QQCACHE_NUM_LISTS];
+ qqcache_list_t qqc_buckets[];
+};
+
+#define QQCACHE_LIST(qqc, lnk) \
+ (&(qqc)->qqc_lists[(lnk)->qqln_listnum])
+
+#ifdef lint
+extern qqcache_link_t *obj_to_link(qqcache_t *, void *);
+extern void *link_to_obj(qqcache_t *, qqcache_link_t *);
+extern void *obj_to_tag(qqcache_t *, void *);
+#else
+#define obj_to_link(_q, _o) \
+ ((qqcache_link_t *)(((char *)(_o)) + (_q)->qqc_link_off))
+#define link_to_obj(_q, _l) \
+ ((void *)(((char *)(_l)) - (_q)->qqc_link_off))
+#define obj_to_tag(_q, _o) \
+ ((void *)(((char *)(_o)) + (_q)->qqc_tag_off))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _QQCACHE_IMPL_H */