summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--exception_lists/packaging12
-rw-r--r--usr/src/Targetdirs4
-rw-r--r--usr/src/cmd/Makefile1
-rw-r--r--usr/src/cmd/cmd-inet/etc/services6
-rw-r--r--usr/src/cmd/devfsadm/misc_link.c3
-rw-r--r--usr/src/cmd/dladm/Makefile3
-rw-r--r--usr/src/cmd/dladm/dladm.c831
-rw-r--r--usr/src/cmd/varpd/Makefile64
-rw-r--r--usr/src/cmd/varpd/varpd.c526
-rw-r--r--usr/src/cmd/varpd/varpd.xml67
-rw-r--r--usr/src/lib/Makefile6
-rw-r--r--usr/src/lib/libdladm/Makefile11
-rw-r--r--usr/src/lib/libdladm/Makefile.com8
-rw-r--r--usr/src/lib/libdladm/common/libdladm.c85
-rw-r--r--usr/src/lib/libdladm/common/libdladm.h17
-rw-r--r--usr/src/lib/libdladm/common/libdladm_impl.h7
-rw-r--r--usr/src/lib/libdladm/common/libdloverlay.c885
-rw-r--r--usr/src/lib/libdladm/common/libdloverlay.h107
-rw-r--r--usr/src/lib/libdladm/common/libdlvlan.c2
-rw-r--r--usr/src/lib/libdladm/common/libdlvnic.c11
-rw-r--r--usr/src/lib/libdladm/common/libdlvnic.h3
-rw-r--r--usr/src/lib/libdladm/common/mapfile-vers18
-rw-r--r--usr/src/lib/varpd/Makefile33
-rw-r--r--usr/src/lib/varpd/Makefile.plugin19
-rw-r--r--usr/src/lib/varpd/direct/Makefile39
-rw-r--r--usr/src/lib/varpd/direct/Makefile.com35
-rw-r--r--usr/src/lib/varpd/direct/amd64/Makefile19
-rw-r--r--usr/src/lib/varpd/direct/common/libvarpd_direct.c411
-rw-r--r--usr/src/lib/varpd/direct/common/mapfile-vers35
-rw-r--r--usr/src/lib/varpd/direct/i386/Makefile18
-rw-r--r--usr/src/lib/varpd/files/Makefile39
-rw-r--r--usr/src/lib/varpd/files/Makefile.com36
-rw-r--r--usr/src/lib/varpd/files/amd64/Makefile19
-rw-r--r--usr/src/lib/varpd/files/common/libvarpd_files.c605
-rw-r--r--usr/src/lib/varpd/files/common/libvarpd_files_json.c936
-rw-r--r--usr/src/lib/varpd/files/common/libvarpd_files_json.h52
-rw-r--r--usr/src/lib/varpd/files/common/mapfile-vers35
-rw-r--r--usr/src/lib/varpd/files/i386/Makefile18
-rw-r--r--usr/src/lib/varpd/libvarpd/Makefile54
-rw-r--r--usr/src/lib/varpd/libvarpd/Makefile.com48
-rw-r--r--usr/src/lib/varpd/libvarpd/amd64/Makefile19
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd.c345
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd.h77
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c649
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_client.c626
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_client.h92
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_door.c469
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h248
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c588
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_panic.c53
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c586
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_plugin.c256
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_prop.c300
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h417
-rw-r--r--usr/src/lib/varpd/libvarpd/common/libvarpd_util.c91
-rw-r--r--usr/src/lib/varpd/libvarpd/common/mapfile-plugin57
-rw-r--r--usr/src/lib/varpd/libvarpd/common/mapfile-vers113
-rw-r--r--usr/src/lib/varpd/libvarpd/i386/Makefile18
-rw-r--r--usr/src/man/man1m/dladm.1m372
-rw-r--r--usr/src/man/man4/Makefile1
-rw-r--r--usr/src/man/man4/overlay_files.4187
-rw-r--r--usr/src/man/man5/Makefile1
-rw-r--r--usr/src/man/man5/overlay.5521
-rw-r--r--usr/src/man/man7p/Makefile58
-rw-r--r--usr/src/man/man7p/vxlan.7p130
-rw-r--r--usr/src/pkg/manifests/system-network-overlay.p5m62
-rw-r--r--usr/src/test/util-tests/tests/dladm/Makefile2
-rw-r--r--usr/src/uts/Makefile.targ3
-rw-r--r--usr/src/uts/Makefile.uts6
-rw-r--r--usr/src/uts/common/Makefile.files5
-rw-r--r--usr/src/uts/common/Makefile.rules8
-rw-r--r--usr/src/uts/common/inet/udp/udp.c16
-rw-r--r--usr/src/uts/common/io/dld/dld_drv.c14
-rw-r--r--usr/src/uts/common/io/mac/mac_client.c8
-rw-r--r--usr/src/uts/common/io/mac/mac_datapath_setup.c11
-rw-r--r--usr/src/uts/common/io/mac/mac_provider.c3
-rw-r--r--usr/src/uts/common/io/overlay/overlay.c2184
-rw-r--r--usr/src/uts/common/io/overlay/overlay.conf16
-rw-r--r--usr/src/uts/common/io/overlay/overlay.mapfile46
-rw-r--r--usr/src/uts/common/io/overlay/overlay_fm.c82
-rw-r--r--usr/src/uts/common/io/overlay/overlay_mux.c363
-rw-r--r--usr/src/uts/common/io/overlay/overlay_plugin.c281
-rw-r--r--usr/src/uts/common/io/overlay/overlay_prop.c122
-rw-r--r--usr/src/uts/common/io/overlay/overlay_target.c1651
-rw-r--r--usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c394
-rw-r--r--usr/src/uts/common/netinet/in.h1
-rw-r--r--usr/src/uts/common/sys/Makefile3
-rw-r--r--usr/src/uts/common/sys/dld_ioc.h3
-rw-r--r--usr/src/uts/common/sys/dls_mgmt.h7
-rw-r--r--usr/src/uts/common/sys/mac_client_priv.h1
-rw-r--r--usr/src/uts/common/sys/mac_impl.h1
-rw-r--r--usr/src/uts/common/sys/mac_provider.h3
-rw-r--r--usr/src/uts/common/sys/overlay.h96
-rw-r--r--usr/src/uts/common/sys/overlay_common.h65
-rw-r--r--usr/src/uts/common/sys/overlay_impl.h205
-rw-r--r--usr/src/uts/common/sys/overlay_plugin.h324
-rw-r--r--usr/src/uts/common/sys/overlay_target.h295
-rw-r--r--usr/src/uts/intel/Makefile.intel6
-rw-r--r--usr/src/uts/intel/overlay/Makefile46
-rw-r--r--usr/src/uts/intel/vxlan/Makefile41
100 files changed, 17706 insertions, 70 deletions
diff --git a/exception_lists/packaging b/exception_lists/packaging
index f5cdbbf189..dc74d6d3bf 100644
--- a/exception_lists/packaging
+++ b/exception_lists/packaging
@@ -400,6 +400,18 @@ usr/include/libidspace.h
# VXLAN
#
usr/include/sys/vxlan.h
+lib/libvarpd.so
+lib/amd64/libvarpd.so
+#
+# Overlay
+#
+usr/include/libdloverlay.h
+usr/include/libvarpd.h
+usr/include/libvarpd_client.h
+usr/include/libvarpd_provider.h
+usr/include/sys/overlay.h
+usr/include/sys/overlay_common.h
+usr/include/sys/overlay_target.h
#
# Private interfaces in libsec
#
diff --git a/usr/src/Targetdirs b/usr/src/Targetdirs
index b78dc9df7f..94ec907b77 100644
--- a/usr/src/Targetdirs
+++ b/usr/src/Targetdirs
@@ -309,6 +309,7 @@ DIRS= \
/usr/lib/mdb/kvm \
/usr/lib/mdb/proc \
/usr/lib/nfs \
+ /usr/lib/varpd \
/usr/net \
/usr/net/servers \
/usr/lib/pool \
@@ -490,6 +491,7 @@ DIRS64= \
/usr/lib/security/$(MACH64) \
/usr/lib/smbsrv/$(MACH64) \
/usr/lib/abi/$(MACH64) \
+ /usr/lib/varpd/$(MACH64) \
/usr/sbin/$(MACH64) \
/usr/ucb/$(MACH64) \
/usr/ucblib/$(MACH64) \
@@ -544,6 +546,7 @@ SYM.DIRS64= \
/usr/lib/lwp/64 \
/usr/lib/secure/64 \
/usr/lib/security/64 \
+ /usr/lib/varpd/64 \
/usr/xpg4/lib/64 \
/var/ld/64 \
/usr/ucblib/64
@@ -647,6 +650,7 @@ $(BUILD64) $(ROOT)/usr/lib/lwp/64:= LINKDEST=$(MACH64)
$(BUILD64) $(ROOT)/usr/lib/link_audit/64:= LINKDEST=$(MACH64)
$(BUILD64) $(ROOT)/usr/lib/secure/64:= LINKDEST=$(MACH64)
$(BUILD64) $(ROOT)/usr/lib/security/64:= LINKDEST=$(MACH64)
+$(BUILD64) $(ROOT)/usr/lib/varpd/64:= LINKDEST=$(MACH64)
$(BUILD64) $(ROOT)/usr/xpg4/lib/64:= LINKDEST=$(MACH64)
$(BUILD64) $(ROOT)/var/ld/64:= LINKDEST=$(MACH64)
$(BUILD64) $(ROOT)/usr/ucblib/64:= LINKDEST=$(MACH64)
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index 8159ad677b..4f496112e8 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -446,6 +446,7 @@ COMMON_SUBDIRS= \
utmpd \
uuidgen \
valtools \
+ varpd \
vgrind \
vi \
volcheck \
diff --git a/usr/src/cmd/cmd-inet/etc/services b/usr/src/cmd/cmd-inet/etc/services
index 37514ac0a7..4562baff66 100644
--- a/usr/src/cmd/cmd-inet/etc/services
+++ b/usr/src/cmd/cmd-inet/etc/services
@@ -1,6 +1,7 @@
#
# Copyright 2010 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
+# Copyright 2015 Joyent, Inc.
#
# CDDL HEADER START
#
@@ -33,7 +34,7 @@ systat 11/tcp users
daytime 13/tcp
daytime 13/udp
netstat 15/tcp
-qotd 17/tcp # Quote of the Day
+qotd 17/tcp # Quote of the Day
chargen 19/tcp ttytst source
chargen 19/udp ttytst source
ftp-data 20/tcp
@@ -80,7 +81,7 @@ imap3 220/tcp
imap3 220/udp
clearcase 371/tcp
clearcase 371/udp
-ldap 389/tcp # Lightweight Directory Access Protocol
+ldap 389/tcp # Lightweight Directory Access Protocol
ldap 389/udp # Lightweight Directory Access Protocol
https 443/tcp
https 443/udp
@@ -227,6 +228,7 @@ eklogin 2105/tcp # Kerberos encrypted rlogin
lockd 4045/udp # NFS lock daemon/manager
lockd 4045/tcp
ipsec-nat-t 4500/udp # IPsec NAT-Traversal
+vxlan 4789/udp # Virtual eXtensible Local Area Network (VXLAN)
mdns 5353/udp # Multicast DNS
mdns 5353/tcp
vnc-server 5900/tcp # VNC Server
diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c
index 30e3e1863e..7397fcdb40 100644
--- a/usr/src/cmd/devfsadm/misc_link.c
+++ b/usr/src/cmd/devfsadm/misc_link.c
@@ -204,6 +204,9 @@ static devfsadm_create_t misc_cbt[] = {
{ "pseudo", "ddi_pseudo", "tpm",
TYPE_EXACT | DRV_EXACT, ILEVEL_0, minor_name
},
+ { "pseudo", "ddi_pseudo", "overlay",
+ TYPE_EXACT | DRV_EXACT, ILEVEL_0, minor_name
+ }
};
DEVFSADM_CREATE_INIT_V0(misc_cbt);
diff --git a/usr/src/cmd/dladm/Makefile b/usr/src/cmd/dladm/Makefile
index 6171822797..bba8a8cede 100644
--- a/usr/src/cmd/dladm/Makefile
+++ b/usr/src/cmd/dladm/Makefile
@@ -20,6 +20,7 @@
#
# Copyright 2010 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
+# Copyright 2015 Joyent, Inc.
#
# Copyright (c) 2018, Joyent, Inc.
@@ -38,7 +39,7 @@ XGETFLAGS += -a -x $(PROG).xcl
LDLIBS += -L$(ROOT)/lib -lsocket
LDLIBS += -ldladm -ldlpi -lkstat -lsecdb -lbsm -lofmt -linetutil -ldevinfo
-LDLIBS += $(ZLAZYLOAD) -lrstp $(ZNOLAZYLOAD)
+LDLIBS += $(ZLAZYLOAD) -lrstp $(ZNOLAZYLOAD) -lnsl -lumem -lcustr
CERRWARN += -_gcc=-Wno-switch
CERRWARN += -_gcc=-Wno-unused-label
diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c
index 04a520f537..9d4d345bca 100644
--- a/usr/src/cmd/dladm/dladm.c
+++ b/usr/src/cmd/dladm/dladm.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc.
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
* Copyright 2020 Peter Tribble.
* Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -63,6 +64,7 @@
#include <libdliptun.h>
#include <libdlsim.h>
#include <libdlbridge.h>
+#include <libdloverlay.h>
#include <libinetutil.h>
#include <libvrrpadm.h>
#include <bsm/adt.h>
@@ -78,6 +80,7 @@
#include <stddef.h>
#include <stp_in.h>
#include <ofmt.h>
+#include <libcustr.h>
#define MAXPORT 256
#define MAXVNIC 256
@@ -196,6 +199,7 @@ static ofmt_cb_t print_lacp_cb, print_phys_one_mac_cb;
static ofmt_cb_t print_xaggr_cb, print_aggr_stats_cb;
static ofmt_cb_t print_phys_one_hwgrp_cb, print_wlan_attr_cb;
static ofmt_cb_t print_wifi_status_cb, print_link_attr_cb;
+static ofmt_cb_t print_overlay_cb, print_overlay_fma_cb, print_overlay_targ_cb;
typedef void cmdfunc_t(int, char **, const char *);
@@ -223,6 +227,8 @@ static cmdfunc_t do_create_bridge, do_modify_bridge, do_delete_bridge;
static cmdfunc_t do_add_bridge, do_remove_bridge, do_show_bridge;
static cmdfunc_t do_create_iptun, do_modify_iptun, do_delete_iptun;
static cmdfunc_t do_show_iptun, do_up_iptun, do_down_iptun;
+static cmdfunc_t do_create_overlay, do_delete_overlay, do_modify_overlay;
+static cmdfunc_t do_show_overlay;
static void do_up_vnic_common(int, char **, const char *, boolean_t);
@@ -258,8 +264,11 @@ static void die(const char *, ...);
static void die_optdup(int);
static void die_opterr(int, int, const char *);
static void die_dlerr(dladm_status_t, const char *, ...);
+static void die_dlerrlist(dladm_status_t, dladm_errlist_t *,
+ const char *, ...);
static void warn(const char *, ...);
static void warn_dlerr(dladm_status_t, const char *, ...);
+static void warn_dlerrlist(dladm_errlist_t *);
typedef struct cmd {
char *c_name;
@@ -406,6 +415,17 @@ static cmd_t cmds[] = {
" <bridge>\n"
" show-bridge -t [-p] [-o <field>,...] [-s [-i <interval>]]"
" <bridge>\n" },
+ { "create-overlay", do_create_overlay,
+ " create-overlay [-t] -e <encap> -s <search> -v <vnetid>\n"
+ "\t\t [ -p <prop>=<value>[,...]] <overlay>" },
+ { "delete-overlay", do_delete_overlay,
+ " delete-overlay <overlay>" },
+ { "modify-overlay", do_modify_overlay,
+ " modify-overlay -d mac | -f | -s mac=ip:port "
+ "<overlay>" },
+ { "show-overlay", do_show_overlay,
+ " show-overlay [-f | -t] [[-p] -o <field>,...] "
+ "[<overlay>]\n" },
{ "show-usage", do_show_usage,
" show-usage [-a] [-d | -F <format>] "
"[-s <DD/MM/YYYY,HH:MM:SS>]\n"
@@ -1430,6 +1450,82 @@ static ofmt_field_t bridge_trill_fields[] = {
offsetof(bridge_trill_fields_buf_t, bridget_nexthop), print_default_cb },
{ NULL, 0, 0, NULL}};
+/*
+ * Long-option table for "dladm create-overlay". It is handed to getopt_long()
+ * together with the short option string ":te:v:p:s:".
+ */
+static const struct option overlay_create_lopts[] = {
+ { "encap", required_argument, NULL, 'e' },
+ { "prop", required_argument, NULL, 'p' },
+ { "search", required_argument, NULL, 's' },
+ { "temporary", no_argument, NULL, 't' },
+ { "vnetid", required_argument, NULL, 'v' },
+ { NULL, 0, NULL, 0 }
+};
+
+/*
+ * Long-option table for "dladm modify-overlay" (short options ":fd:s:").
+ */
+static const struct option overlay_modify_lopts[] = {
+ { "delete-entry", required_argument, NULL, 'd' },
+ { "flush-table", no_argument, NULL, 'f' },
+ { "set-entry", required_argument, NULL, 's' },
+ { NULL, 0, NULL, 0 }
+};
+
+/*
+ * Long-option table for "dladm show-overlay" (short options ":o:pft"). Both
+ * spellings "parsable" and "parseable" map to -p.
+ */
+static const struct option overlay_show_lopts[] = {
+ { "fma", no_argument, NULL, 'f' },
+ { "target", no_argument, NULL, 't' },
+ { "parsable", no_argument, NULL, 'p' },
+ { "parseable", no_argument, NULL, 'p' },
+ { "output", required_argument, NULL, 'o' },
+ { NULL, 0, NULL, 0 }
+};
+
+/*
+ * Structures for dladm show-overlay
+ */
+
+/* Column identifiers for the default (property) view of show-overlay. */
+typedef enum {
+ OVERLAY_LINK,
+ OVERLAY_PROPERTY,
+ OVERLAY_PERM,
+ OVERLAY_REQ,
+ OVERLAY_VALUE,
+ OVERLAY_DEFAULT,
+ OVERLAY_POSSIBLE
+} overlay_field_index_t;
+
+static const ofmt_field_t overlay_fields[] = {
+/* name, field width, index, callback */
+{ "LINK", 19, OVERLAY_LINK, print_overlay_cb },
+{ "PROPERTY", 19, OVERLAY_PROPERTY, print_overlay_cb },
+{ "PERM", 5, OVERLAY_PERM, print_overlay_cb },
+{ "REQ", 4, OVERLAY_REQ, print_overlay_cb },
+{ "VALUE", 11, OVERLAY_VALUE, print_overlay_cb },
+{ "DEFAULT", 10, OVERLAY_DEFAULT, print_overlay_cb },
+{ "POSSIBLE", 10, OVERLAY_POSSIBLE, print_overlay_cb },
+{ NULL, 0, 0, NULL }
+};
+
+/* Column identifiers for the FMA status view (show-overlay -f). */
+typedef enum {
+ OVERLAY_FMA_LINK,
+ OVERLAY_FMA_STATUS,
+ OVERLAY_FMA_DETAILS
+} overlay_fma_field_index_t;
+
+/* name, field width, index, callback */
+static const ofmt_field_t overlay_fma_fields[] = {
+{ "LINK", 20, OVERLAY_FMA_LINK, print_overlay_fma_cb },
+{ "STATUS", 8, OVERLAY_FMA_STATUS, print_overlay_fma_cb },
+{ "DETAILS", 52, OVERLAY_FMA_DETAILS, print_overlay_fma_cb },
+{ NULL, 0, 0, NULL }
+};
+
+/* Column identifiers for the target table view (show-overlay -t). */
+typedef enum {
+ OVERLAY_TARG_LINK,
+ OVERLAY_TARG_TARGET,
+ OVERLAY_TARG_DEST
+} overlay_targ_field_index_t;
+
+/* name, field width, index, callback */
+static const ofmt_field_t overlay_targ_fields[] = {
+{ "LINK", 20, OVERLAY_TARG_LINK, print_overlay_targ_cb },
+{ "TARGET", 18, OVERLAY_TARG_TARGET, print_overlay_targ_cb },
+{ "DESTINATION", 42, OVERLAY_TARG_DEST, print_overlay_targ_cb },
+{ NULL, 0, 0, NULL }
+};
+
static char *progname;
static sig_atomic_t signalled;
@@ -1439,6 +1535,12 @@ static sig_atomic_t signalled;
*/
static dladm_handle_t handle = NULL;
+/*
+ * Global error list that all routines can use. It's initialized by the main
+ * code.
+ */
+static dladm_errlist_t errlist;
+
#define DLADM_ETHERSTUB_NAME "etherstub"
#define DLADM_IS_ETHERSTUB(id) (id == DATALINK_INVALID_LINKID)
@@ -1506,6 +1608,8 @@ main(int argc, char *argv[])
"could not open /dev/dld");
}
+ dladm_errlist_init(&errlist);
+
cmdp->c_fn(argc - 1, &argv[1], cmdp->c_usage);
dladm_close(handle);
@@ -4801,7 +4905,7 @@ do_create_vnic(int argc, char *argv[], const char *use)
status = dladm_vnic_create(handle, name, dev_linkid, mac_addr_type,
mac_addr, maclen, &mac_slot, mac_prefix_len, vid, vrid, af,
- &linkid, proplist, flags);
+ &linkid, proplist, &errlist, flags);
switch (status) {
case DLADM_STATUS_OK:
break;
@@ -4812,7 +4916,8 @@ do_create_vnic(int argc, char *argv[], const char *use)
break;
default:
- die_dlerr(status, "vnic creation over %s failed", devname);
+ die_dlerrlist(status, &errlist, "vnic creation over %s failed",
+ devname);
}
dladm_free_props(proplist);
@@ -5311,7 +5416,7 @@ do_create_etherstub(int argc, char *argv[], const char *use)
status = dladm_vnic_create(handle, name, DATALINK_INVALID_LINKID,
VNIC_MAC_ADDR_TYPE_AUTO, mac_addr, ETHERADDRL, NULL, 0, 0,
- VRRP_VRID_NONE, AF_UNSPEC, NULL, NULL, flags);
+ VRRP_VRID_NONE, AF_UNSPEC, NULL, NULL, &errlist, flags);
if (status != DLADM_STATUS_OK)
die_dlerr(status, "etherstub creation failed");
}
@@ -8953,6 +9058,21 @@ warn_dlerr(dladm_status_t err, const char *format, ...)
(void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg));
}
+/*
+ * Emit every message held in the given error list on stderr, one per line,
+ * each introduced by "<progname>: warning: ". A NULL or empty list produces
+ * no output.
+ */
+static void
+warn_dlerrlist(dladm_errlist_t *errlist)
+{
+	int idx;
+
+	if (errlist == NULL)
+		return;
+
+	/* An empty list simply falls straight through the loop. */
+	for (idx = 0; idx < errlist->el_count; idx++) {
+		(void) fprintf(stderr, gettext("%s: warning: "), progname);
+		(void) fprintf(stderr, "%s\n",
+		    gettext(errlist->el_errs[idx]));
+	}
+}
+
/*
* Also closes the dladm handle if it is not NULL.
*/
@@ -8978,6 +9098,34 @@ die_dlerr(dladm_status_t err, const char *format, ...)
exit(EXIT_FAILURE);
}
+/*
+ * Fatal-error variant of die_dlerr() that first prints the accumulated
+ * entries of the error list as warnings, then the formatted message followed
+ * by the text for the dladm status, closes the global dladm handle if it is
+ * open and exits with EXIT_FAILURE.
+ */
+/* PRINTFLIKE3 */
+static void
+die_dlerrlist(dladm_status_t err, dladm_errlist_t *errlist,
+    const char *format, ...)
+{
+	char statusbuf[DLADM_STRSIZE];
+	va_list ap;
+
+	warn_dlerrlist(errlist);
+
+	format = gettext(format);
+	(void) fprintf(stderr, "%s: ", progname);
+	va_start(ap, format);
+	(void) vfprintf(stderr, format, ap);
+	va_end(ap);
+	(void) fprintf(stderr, ": %s\n", dladm_status2str(err, statusbuf));
+
+	/* Release the dladm handle if main() managed to open it. */
+	if (handle != NULL)
+		dladm_close(handle);
+
+	exit(EXIT_FAILURE);
+}
+
/* PRINTFLIKE1 */
static void
die(const char *format, ...)
@@ -9685,3 +9833,680 @@ do_up_part(int argc, char *argv[], const char *use)
(void) dladm_part_up(handle, partid, 0);
}
+
+/*
+ * dladm create-overlay [-t] -e <encap> -s <search> -v <vnetid>
+ *     [-p <prop>=<value>[,...]] <overlay>
+ *
+ * Parses and validates the command line, then creates the overlay device
+ * through dladm_overlay_create(). Every validation or creation failure is
+ * fatal and reported through die() or die_dlerrlist().
+ *
+ * argc/argv are the sub-command arguments; use is the usage string handed to
+ * die_opterr() on an option error.
+ */
+static void
+do_create_overlay(int argc, char *argv[], const char *use)
+{
+	int opt;
+	char *encap = NULL, *endp, *search = NULL;
+	char name[MAXLINKNAMELEN];
+	dladm_status_t status;
+	uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST;
+	uint64_t vid = 0;
+	boolean_t havevid = B_FALSE;
+	char propstr[DLADM_STRSIZE];
+	dladm_arg_list_t *proplist = NULL;
+
+	bzero(propstr, sizeof (propstr));
+	while ((opt = getopt_long(argc, argv, ":te:v:p:s:",
+	    overlay_create_lopts, NULL)) != -1) {
+		switch (opt) {
+		case 'e':
+			encap = optarg;
+			break;
+		case 's':
+			search = optarg;
+			break;
+		case 't':
+			flags &= ~DLADM_OPT_PERSIST;
+			break;
+		case 'p':
+			/* Accumulate repeated -p options as one list. */
+			(void) strlcat(propstr, optarg, DLADM_STRSIZE);
+			if (strlcat(propstr, ",", DLADM_STRSIZE) >=
+			    DLADM_STRSIZE)
+				die("property list too long '%s'", propstr);
+			break;
+		case 'v':
+			/*
+			 * strtoul() reports failure only through errno, so it
+			 * must be cleared first or a stale value from an
+			 * earlier call could be mistaken for a parse error.
+			 */
+			errno = 0;
+			vid = strtoul(optarg, &endp, 10);
+			if (endp == optarg || *endp != '\0' ||
+			    (vid == 0 && errno == EINVAL))
+				die("couldn't parse virtual network id: %s",
+				    optarg);
+			if (vid == ULONG_MAX && errno == ERANGE)
+				die("virtual network id too large: %s",
+				    optarg);
+			havevid = B_TRUE;
+			break;
+		default:
+			die_opterr(optopt, opt, use);
+		}
+	}
+
+	/*
+	 * Overlays do not currently support persistence.
+	 * This will be addressed by https://www.illumos.org/issues/14434
+	 */
+	if ((flags & DLADM_OPT_PERSIST) != 0)
+		die("overlays do not (yet) support persistence, use -t");
+
+	if (havevid == B_FALSE)
+		die("missing required virtual network id");
+
+	if (encap == NULL)
+		die("missing required encapsulation plugin");
+
+	if (search == NULL)
+		die("missing required search plugin");
+
+	if (optind != (argc - 1))
+		die("missing device name");
+
+	if (strlcpy(name, argv[optind], MAXLINKNAMELEN) >= MAXLINKNAMELEN)
+		die("link name too long '%s'", argv[optind]);
+
+	if (!dladm_valid_linkname(name))
+		die("invalid link name '%s'", argv[optind]);
+
+	if (strlen(encap) + 1 > MAXLINKNAMELEN)
+		die("encapsulation plugin name too long '%s'", encap);
+
+	/* Report the search plugin name here, not the encap plugin name. */
+	if (strlen(search) + 1 > MAXLINKNAMELEN)
+		die("search plugin name too long '%s'", search);
+
+	if (dladm_parse_link_props(propstr, &proplist, B_FALSE)
+	    != DLADM_STATUS_OK)
+		die("invalid overlay property");
+
+	status = dladm_overlay_create(handle, name, encap, search, vid,
+	    proplist, &errlist, flags);
+	dladm_free_props(proplist);
+	if (status != DLADM_STATUS_OK) {
+		die_dlerrlist(status, &errlist, "overlay creation failed");
+	}
+}
+
+/*
+ * dladm delete-overlay <overlay>
+ *
+ * Resolves the named link to its link id and deletes the overlay. Exactly one
+ * operand is expected; anything else results in a call to usage().
+ */
+/* ARGSUSED */
+static void
+do_delete_overlay(int argc, char *argv[], const char *use)
+{
+	dladm_status_t rc;
+	datalink_id_t id = DATALINK_ALL_LINKID;
+
+	if (argc != 2)
+		usage();
+
+	rc = dladm_name2info(handle, argv[1], &id, NULL, NULL, NULL);
+	if (rc != DLADM_STATUS_OK) {
+		die_dlerr(rc, "failed to delete %s", argv[1]);
+	}
+
+	rc = dladm_overlay_delete(handle, id);
+	if (rc != DLADM_STATUS_OK) {
+		die_dlerr(rc, "failed to delete %s", argv[1]);
+	}
+}
+
+/*
+ * Per-link state handed to print_overlay_cb() while walking the properties
+ * of one overlay (default show-overlay view).
+ */
+typedef struct showoverlay_state {
+ /* ofmt handle used to print each row */
+ ofmt_handle_t sho_ofmt;
+ /* name of the link being displayed */
+ const char *sho_linkname;
+ /* handle of the property currently being printed */
+ dladm_overlay_propinfo_handle_t sho_info;
+ /* buffer filled in by dladm_overlay_get_prop() */
+ uint8_t sho_value[DLADM_OVERLAY_PROP_SIZEMAX];
+ /* size of sho_value on input to, and as updated by, the get call */
+ uint32_t sho_size;
+} showoverlay_state_t;
+
+/*
+ * State handed to print_overlay_fma_cb() (show-overlay -f).
+ */
+typedef struct showoverlay_fma_state {
+ ofmt_handle_t shof_ofmt;
+ const char *shof_linkname;
+ /* status supplied by the dladm_overlay_status() callback */
+ dladm_overlay_status_t *shof_status;
+} showoverlay_fma_state_t;
+
+/*
+ * State handed to print_overlay_targ_cb() (show-overlay -t); the key and
+ * point are replaced for every target cache entry that is walked.
+ */
+typedef struct showoverlay_targ_state {
+ ofmt_handle_t shot_ofmt;
+ const char *shot_linkname;
+ const struct ether_addr *shot_key;
+ const dladm_overlay_point_t *shot_point;
+} showoverlay_targ_state_t;
+
+/*
+ * Render a single overlay property value as text.
+ *
+ *   outbuf, bufsize	destination buffer and its size
+ *   type		one of the OVERLAY_PROP_T_* property types
+ *   pbuf, psize	raw property value and its size in bytes
+ *
+ * Integer properties must be 1, 2, 4 or 8 bytes wide, otherwise "?" is
+ * printed. IP properties must be a struct in6_addr; v4-mapped addresses are
+ * printed in dotted-quad form. A malformed IP value is printed as "--" after
+ * a warning. An unknown type aborts the program.
+ */
+static void
+print_overlay_value(char *outbuf, uint_t bufsize, uint_t type, const void *pbuf,
+    const size_t psize)
+{
+	const struct in6_addr *ipv6;
+	struct in_addr ip;
+
+	switch (type) {
+	case OVERLAY_PROP_T_INT:
+		switch (psize) {
+		case 1:
+			(void) snprintf(outbuf, bufsize, "%d",
+			    *(const int8_t *)pbuf);
+			break;
+		case 2:
+			(void) snprintf(outbuf, bufsize, "%d",
+			    *(const int16_t *)pbuf);
+			break;
+		case 4:
+			(void) snprintf(outbuf, bufsize, "%d",
+			    *(const int32_t *)pbuf);
+			break;
+		case 8:
+			/* A 64-bit value must not be printed with %d. */
+			(void) snprintf(outbuf, bufsize, "%lld",
+			    (long long)*(const int64_t *)pbuf);
+			break;
+		default:
+			(void) snprintf(outbuf, bufsize, "?");
+			break;
+		}
+		break;
+	case OVERLAY_PROP_T_UINT:
+		/* Unsigned values use unsigned conversions throughout. */
+		switch (psize) {
+		case 1:
+			(void) snprintf(outbuf, bufsize, "%u",
+			    *(const uint8_t *)pbuf);
+			break;
+		case 2:
+			(void) snprintf(outbuf, bufsize, "%u",
+			    *(const uint16_t *)pbuf);
+			break;
+		case 4:
+			(void) snprintf(outbuf, bufsize, "%u",
+			    *(const uint32_t *)pbuf);
+			break;
+		case 8:
+			(void) snprintf(outbuf, bufsize, "%llu",
+			    (unsigned long long)*(const uint64_t *)pbuf);
+			break;
+		default:
+			(void) snprintf(outbuf, bufsize, "?");
+			break;
+		}
+		break;
+	case OVERLAY_PROP_T_IP:
+		if (psize != sizeof (struct in6_addr)) {
+			/* psize is a size_t, so it needs %zu rather than %d */
+			warn("malformed overlay IP property: %zu bytes\n",
+			    psize);
+			(void) snprintf(outbuf, bufsize, "--");
+			break;
+		}
+
+		ipv6 = pbuf;
+		if (IN6_IS_ADDR_V4MAPPED(ipv6)) {
+			IN6_V4MAPPED_TO_INADDR(ipv6, &ip);
+			if (inet_ntop(AF_INET, &ip, outbuf, bufsize) == NULL) {
+				warn("malformed overlay IP property\n");
+				(void) snprintf(outbuf, bufsize, "--");
+				break;
+			}
+		} else {
+			if (inet_ntop(AF_INET6, ipv6, outbuf, bufsize) ==
+			    NULL) {
+				warn("malformed overlay IP property\n");
+				(void) snprintf(outbuf, bufsize, "--");
+				break;
+			}
+		}
+
+		break;
+	case OVERLAY_PROP_T_STRING:
+		(void) snprintf(outbuf, bufsize, "%s", (const char *)pbuf);
+		break;
+	default:
+		abort();
+	}
+}
+
+/*
+ * ofmt callback for the default show-overlay view: fills buf with the text
+ * for the column identified by ofarg->ofmt_id, using the property handle and
+ * current value stored in the showoverlay_state_t callback argument.
+ *
+ * Always returns B_TRUE. If the property information cannot be obtained, a
+ * warning is printed and buf is left untouched. An unknown column id aborts.
+ */
+static boolean_t
+print_overlay_cb(ofmt_arg_t *ofarg, char *buf, uint_t bufsize)
+{
+	dladm_status_t status;
+	showoverlay_state_t *sp = ofarg->ofmt_cbarg;
+	dladm_overlay_propinfo_handle_t infop = sp->sho_info;
+	const char *pname;
+	uint_t type, prot;
+	const void *def;
+	uint32_t defsize;
+	const mac_propval_range_t *rangep;
+
+	if ((status = dladm_overlay_prop_info(infop, &pname, &type, &prot, &def,
+	    &defsize, &rangep)) != DLADM_STATUS_OK) {
+		warn_dlerr(status, "failed to get property info");
+		return (B_TRUE);
+	}
+
+	switch (ofarg->ofmt_id) {
+	case OVERLAY_LINK:
+		(void) snprintf(buf, bufsize, "%s", sp->sho_linkname);
+		break;
+	case OVERLAY_PROPERTY:
+		(void) snprintf(buf, bufsize, "%s", pname);
+		break;
+	case OVERLAY_PERM:
+		if ((prot & OVERLAY_PROP_PERM_RW) == OVERLAY_PROP_PERM_RW) {
+			(void) snprintf(buf, bufsize, "%s", "rw");
+		} else if ((prot & OVERLAY_PROP_PERM_RW) ==
+		    OVERLAY_PROP_PERM_READ) {
+			(void) snprintf(buf, bufsize, "%s", "r-");
+		} else {
+			(void) snprintf(buf, bufsize, "%s", "--");
+		}
+		break;
+	case OVERLAY_REQ:
+		(void) snprintf(buf, bufsize, "%s",
+		    prot & OVERLAY_PROP_PERM_REQ ? "y" : "-");
+		break;
+	case OVERLAY_VALUE:
+		if (sp->sho_size == 0) {
+			(void) snprintf(buf, bufsize, "%s", "--");
+		} else {
+			print_overlay_value(buf, bufsize, type, sp->sho_value,
+			    sp->sho_size);
+		}
+		break;
+	case OVERLAY_DEFAULT:
+		if (defsize == 0) {
+			(void) snprintf(buf, bufsize, "%s", "--");
+		} else {
+			print_overlay_value(buf, bufsize, type, def, defsize);
+		}
+		break;
+	case OVERLAY_POSSIBLE: {
+		int i;
+		size_t len;
+		char **vals, *ptr, *lim;
+
+		if (rangep->mpr_count == 0) {
+			(void) snprintf(buf, bufsize, "%s", "--");
+			break;
+		}
+
+		/*
+		 * One allocation holds the pointer array followed by
+		 * mpr_count value buffers of DLADM_PROP_VAL_MAX bytes each.
+		 */
+		vals = malloc((sizeof (char *) + DLADM_PROP_VAL_MAX) *
+		    rangep->mpr_count);
+		if (vals == NULL)
+			die("insufficient memory");
+		for (i = 0; i < rangep->mpr_count; i++) {
+			/*
+			 * The stride must match the per-value size used in
+			 * the allocation above; any other stride makes the
+			 * value buffers overlap.
+			 */
+			vals[i] = (char *)vals + sizeof (char *) *
+			    rangep->mpr_count + i * DLADM_PROP_VAL_MAX;
+		}
+
+		if (dladm_range2strs(rangep, vals) != 0) {
+			free(vals);
+			(void) snprintf(buf, bufsize, "%s", "?");
+			break;
+		}
+
+		/* Join the values with commas, stopping once buf is full. */
+		ptr = buf;
+		lim = buf + bufsize;
+		for (i = 0; i < rangep->mpr_count; i++) {
+			ptr += snprintf(ptr, lim - ptr, "%s,", vals[i]);
+			if (ptr >= lim)
+				break;
+		}
+
+		/* Drop the final character, normally the trailing comma. */
+		len = strlen(buf);
+		if (len > 0)
+			buf[len - 1] = '\0';
+		free(vals);
+		break;
+	}
+	default:
+		abort();
+	}
+	return (B_TRUE);
+}
+
+/*
+ * Property walker callback: fetches the current value of one overlay
+ * property into the show state and prints a row for it. Properties whose
+ * value cannot be retrieved are silently skipped. The walk always continues.
+ */
+static int
+dladm_overlay_show_one(dladm_handle_t handle, datalink_id_t linkid,
+    dladm_overlay_propinfo_handle_t phdl, void *arg)
+{
+	showoverlay_state_t *state = arg;
+
+	state->sho_info = phdl;
+	state->sho_size = sizeof (state->sho_value);
+
+	if (dladm_overlay_get_prop(handle, linkid, phdl, &state->sho_value,
+	    &state->sho_size) == DLADM_STATUS_OK) {
+		ofmt_print(state->sho_ofmt, state);
+	}
+
+	return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * Datalink walker callback for the default show-overlay view. Links that
+ * cannot be looked up or that are not overlays are skipped; for an overlay,
+ * every property is printed via dladm_overlay_show_one() and any messages
+ * collected in the global error list are emitted as warnings.
+ *
+ * hdl is the dladm handle, linkid the link to show, and arg the ofmt handle.
+ * Always returns DLADM_WALK_CONTINUE.
+ */
+static int
+show_one_overlay(dladm_handle_t hdl, datalink_id_t linkid, void *arg)
+{
+	char buf[MAXLINKNAMELEN];
+	showoverlay_state_t state;
+	datalink_class_t class;
+
+	if (dladm_datalink_id2info(hdl, linkid, NULL, &class, NULL, buf,
+	    MAXLINKNAMELEN) != DLADM_STATUS_OK ||
+	    class != DATALINK_CLASS_OVERLAY)
+		return (DLADM_WALK_CONTINUE);
+
+	state.sho_linkname = buf;
+	state.sho_ofmt = arg;
+
+	dladm_errlist_reset(&errlist);
+	/* Use the handle we were given rather than the global one. */
+	(void) dladm_overlay_walk_prop(hdl, linkid, dladm_overlay_show_one,
+	    &state, &errlist);
+	warn_dlerrlist(&errlist);
+
+	return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * ofmt callback for the target table view (show-overlay -t): fills buf with
+ * the text for the column identified by ofarg->ofmt_id for one target cache
+ * entry (key and destination point) held in the callback argument.
+ *
+ * The destination is rendered as "[mac][,ip][:port]" depending on which of
+ * the OVERLAY_PLUGIN_D_* bits are set in the point. Returns B_TRUE on
+ * success and B_FALSE when a MAC address cannot be formatted. An unknown
+ * column id aborts.
+ */
+static boolean_t
+print_overlay_targ_cb(ofmt_arg_t *ofarg, char *buf, uint_t bufsize)
+{
+	char keybuf[ETHERADDRSTRL];
+	const showoverlay_targ_state_t *shot = ofarg->ofmt_cbarg;
+	const dladm_overlay_point_t *point = shot->shot_point;
+	char macbuf[ETHERADDRSTRL];
+	char ipbuf[INET6_ADDRSTRLEN];
+	custr_t *cus;
+
+	switch (ofarg->ofmt_id) {
+	case OVERLAY_TARG_LINK:
+		/* Never use the link name itself as the format string. */
+		(void) snprintf(buf, bufsize, "%s", shot->shot_linkname);
+		break;
+	case OVERLAY_TARG_TARGET:
+		if ((point->dop_flags & DLADM_OVERLAY_F_DEFAULT) != 0) {
+			(void) snprintf(buf, bufsize, "*:*:*:*:*:*");
+		} else {
+			if (ether_ntoa_r(shot->shot_key, keybuf) == NULL) {
+				warn("encountered malformed mac address key\n");
+				return (B_FALSE);
+			}
+			(void) snprintf(buf, bufsize, "%s", keybuf);
+		}
+		break;
+	case OVERLAY_TARG_DEST:
+		if (custr_alloc_buf(&cus, buf, bufsize) != 0) {
+			die("ran out of memory for printing the overlay "
+			    "target destination");
+		}
+
+		if (point->dop_dest & OVERLAY_PLUGIN_D_ETHERNET) {
+			if (ether_ntoa_r(&point->dop_mac, macbuf) == NULL) {
+				/*
+				 * keybuf is only filled in by the TARGET
+				 * column, so format the key here before
+				 * mentioning it in the warning.
+				 */
+				if ((point->dop_flags &
+				    DLADM_OVERLAY_F_DEFAULT) != 0) {
+					(void) strlcpy(keybuf, "*:*:*:*:*:*",
+					    sizeof (keybuf));
+				} else if (ether_ntoa_r(shot->shot_key,
+				    keybuf) == NULL) {
+					(void) strlcpy(keybuf, "?",
+					    sizeof (keybuf));
+				}
+				warn("encountered malformed mac address target "
+				    "for key %s\n", keybuf);
+				/* Don't leak the custr on the error path. */
+				custr_free(cus);
+				return (B_FALSE);
+			}
+			(void) custr_append(cus, macbuf);
+		}
+
+		if (point->dop_dest & OVERLAY_PLUGIN_D_IP) {
+			if (IN6_IS_ADDR_V4MAPPED(&point->dop_ip)) {
+				struct in_addr v4;
+				IN6_V4MAPPED_TO_INADDR(&point->dop_ip, &v4);
+				if (inet_ntop(AF_INET, &v4, ipbuf,
+				    sizeof (ipbuf)) == NULL)
+					abort();
+			} else if (inet_ntop(AF_INET6, &point->dop_ip, ipbuf,
+			    sizeof (ipbuf)) == NULL) {
+				/*
+				 * The only failures we should get are
+				 * EAFNOSUPPORT and ENOSPC because of buffer
+				 * exhaustion. In either of these cases, that
+				 * means something has gone horribly wrong.
+				 */
+				abort();
+			}
+			if (point->dop_dest & OVERLAY_PLUGIN_D_ETHERNET)
+				(void) custr_appendc(cus, ',');
+			(void) custr_append(cus, ipbuf);
+		}
+
+		if (point->dop_dest & OVERLAY_PLUGIN_D_PORT) {
+			if (point->dop_dest & OVERLAY_PLUGIN_D_IP)
+				(void) custr_appendc(cus, ':');
+			else if (point->dop_dest & OVERLAY_PLUGIN_D_ETHERNET)
+				(void) custr_appendc(cus, ',');
+			(void) custr_append_printf(cus, "%u", point->dop_port);
+		}
+
+		custr_free(cus);
+
+		break;
+	default:
+		/* Matches the other show-overlay ofmt callbacks. */
+		abort();
+	}
+	return (B_TRUE);
+}
+
+/*
+ * Target cache walker callback: records the entry's key and destination in
+ * the show state and prints one row for it. The walk always continues.
+ */
+/* ARGSUSED */
+static int
+show_one_overlay_table_entry(dladm_handle_t handle, datalink_id_t linkid,
+    const struct ether_addr *key, const dladm_overlay_point_t *point, void *arg)
+{
+	showoverlay_targ_state_t *state = arg;
+
+	state->shot_point = point;
+	state->shot_key = key;
+
+	ofmt_print(state->shot_ofmt, state);
+	return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * Datalink walker callback for show-overlay -t. Links that cannot be looked
+ * up or that are not overlays are skipped; for an overlay, each entry of its
+ * target cache is printed. arg is the ofmt handle. The walk always continues.
+ */
+/* ARGSUSED */
+static int
+show_one_overlay_table(dladm_handle_t handle, datalink_id_t linkid, void *arg)
+{
+	showoverlay_targ_state_t state;
+	datalink_class_t linkclass;
+	char name[MAXLINKNAMELEN];
+
+	if (dladm_datalink_id2info(handle, linkid, NULL, &linkclass, NULL,
+	    name, MAXLINKNAMELEN) != DLADM_STATUS_OK) {
+		return (DLADM_WALK_CONTINUE);
+	}
+
+	if (linkclass != DATALINK_CLASS_OVERLAY) {
+		return (DLADM_WALK_CONTINUE);
+	}
+
+	state.shot_linkname = name;
+	state.shot_ofmt = arg;
+
+	(void) dladm_overlay_walk_cache(handle, linkid,
+	    show_one_overlay_table_entry, &state);
+
+	return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * ofmt callback for the FMA view (show-overlay -f): prints the link name,
+ * "DEGRADED" or "ONLINE", or the FMA message ("-" when the link is not
+ * degraded) depending on the requested column. An unknown column id aborts.
+ */
+static boolean_t
+print_overlay_fma_cb(ofmt_arg_t *ofarg, char *buf, uint_t bufsize)
+{
+	showoverlay_fma_state_t *state = ofarg->ofmt_cbarg;
+	dladm_overlay_status_t *dos = state->shof_status;
+
+	if (ofarg->ofmt_id == OVERLAY_FMA_LINK) {
+		(void) snprintf(buf, bufsize, "%s", state->shof_linkname);
+	} else if (ofarg->ofmt_id == OVERLAY_FMA_STATUS) {
+		if (dos->dos_degraded == B_TRUE)
+			(void) snprintf(buf, bufsize, "DEGRADED");
+		else
+			(void) snprintf(buf, bufsize, "ONLINE");
+	} else if (ofarg->ofmt_id == OVERLAY_FMA_DETAILS) {
+		if (dos->dos_degraded == B_TRUE)
+			(void) snprintf(buf, bufsize, "%s", dos->dos_fmamsg);
+		else
+			(void) snprintf(buf, bufsize, "%s", "-");
+	} else {
+		abort();
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Callback passed to dladm_overlay_status(): stores the reported status in
+ * the FMA show state and prints one row for it.
+ */
+/* ARGSUSED */
+static void
+show_one_overlay_fma_cb(dladm_handle_t handle, datalink_id_t linkid,
+    dladm_overlay_status_t *stat, void *arg)
+{
+	showoverlay_fma_state_t *state = arg;
+
+	state->shof_status = stat;
+	ofmt_print(state->shof_ofmt, state);
+}
+
+
+/*
+ * Datalink walker callback for show-overlay -f: prints the FMA status of one
+ * overlay. Unlike the other views, a failure here is fatal: the program
+ * exits if the link cannot be looked up, is not an overlay, or its status
+ * cannot be obtained. arg is the ofmt handle.
+ */
+static int
+show_one_overlay_fma(dladm_handle_t handle, datalink_id_t linkid, void *arg)
+{
+	dladm_status_t status;
+	char linkbuf[MAXLINKNAMELEN];
+	datalink_class_t class;
+	showoverlay_fma_state_t shof;
+
+	/*
+	 * Check the lookup separately from the class: when the lookup fails
+	 * linkbuf has not been filled in and must not be printed.
+	 */
+	status = dladm_datalink_id2info(handle, linkid, NULL, &class, NULL,
+	    linkbuf, MAXLINKNAMELEN);
+	if (status != DLADM_STATUS_OK) {
+		die_dlerr(status, "failed to look up datalink id %u",
+		    (uint_t)linkid);
+	}
+
+	if (class != DATALINK_CLASS_OVERLAY) {
+		die("datalink %s is not an overlay device\n", linkbuf);
+	}
+
+	shof.shof_ofmt = arg;
+	shof.shof_linkname = linkbuf;
+
+	status = dladm_overlay_status(handle, linkid,
+	    show_one_overlay_fma_cb, &shof);
+	if (status != DLADM_STATUS_OK)
+		die_dlerr(status, "failed to obtain device status for %s",
+		    linkbuf);
+
+	return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * dladm show-overlay [-f | -t] [[-p] -o <field>,...] [<overlay>]
+ *
+ * Shows overlay properties by default, FMA status with -f, or the target
+ * table with -t. When overlay names are given each one is shown in turn and
+ * a name that cannot be resolved produces a warning; otherwise all active
+ * overlay datalinks are walked. Exits with 1 if any name failed to resolve
+ * and 0 otherwise.
+ */
+static void
+do_show_overlay(int argc, char *argv[], const char *use)
+{
+	int (*showfn)(dladm_handle_t, datalink_id_t, void *);
+	const ofmt_field_t *fields = overlay_fields;
+	datalink_id_t linkid = DATALINK_ALL_LINKID;
+	char *fieldstr = NULL;
+	boolean_t parsable = B_FALSE;
+	uint_t oflags = OFMT_WRAP;
+	ofmt_handle_t ofmt;
+	ofmt_status_t oferr;
+	dladm_status_t status;
+	int c, idx;
+	int rc = 0;
+
+	showfn = show_one_overlay;
+	while ((c = getopt_long(argc, argv, ":o:pft", overlay_show_lopts,
+	    NULL)) != -1) {
+		switch (c) {
+		case 'o':
+			fieldstr = optarg;
+			break;
+		case 'p':
+			oflags = OFMT_PARSABLE;
+			parsable = B_TRUE;
+			break;
+		case 'f':
+			fields = overlay_fma_fields;
+			showfn = show_one_overlay_fma;
+			break;
+		case 't':
+			fields = overlay_targ_fields;
+			showfn = show_one_overlay_table;
+			break;
+		default:
+			die_opterr(optopt, c, use);
+		}
+	}
+
+	/* "-o all" is the same as not restricting the fields at all. */
+	if (fieldstr != NULL && strcasecmp(fieldstr, "all") == 0)
+		fieldstr = NULL;
+
+	oferr = ofmt_open(fieldstr, fields, oflags, 0, &ofmt);
+	ofmt_check(oferr, parsable, ofmt, die, warn);
+
+	if (optind >= argc) {
+		/* No operands: show every active overlay. */
+		(void) dladm_walk_datalink_id(showfn, handle, ofmt,
+		    DATALINK_CLASS_OVERLAY, DATALINK_ANY_MEDIATYPE,
+		    DLADM_OPT_ACTIVE);
+	} else {
+		for (idx = optind; idx < argc; idx++) {
+			status = dladm_name2info(handle, argv[idx], &linkid,
+			    NULL, NULL, NULL);
+			if (status == DLADM_STATUS_OK) {
+				(void) showfn(handle, linkid, ofmt);
+			} else {
+				warn_dlerr(status, "failed to find %s",
+				    argv[idx]);
+				rc = 1;
+			}
+		}
+	}
+
+	ofmt_close(ofmt);
+	exit(rc);
+}
+
+/*
+ * dladm modify-overlay -d mac | -f | -s mac=ip:port <overlay>
+ *
+ * Manipulates the target cache of one overlay: -d deletes the entry for a
+ * MAC address, -f flushes the whole cache and -s sets the destination for a
+ * MAC address. Exactly one of the three options and exactly one overlay must
+ * be given; any error is fatal.
+ */
+static void
+do_modify_overlay(int argc, char *argv[], const char *use)
+{
+	int opt, ocnt = 0;
+	boolean_t flush, set, delete;
+	struct ether_addr e;
+	char *dest = NULL;
+	const char *macstr = NULL;
+	datalink_id_t linkid = DATALINK_ALL_LINKID;
+	dladm_status_t status;
+
+	flush = set = delete = B_FALSE;
+	while ((opt = getopt_long(argc, argv, ":fd:s:", overlay_modify_lopts,
+	    NULL)) != -1) {
+		switch (opt) {
+		case 'd':
+			if (delete == B_TRUE)
+				die_optdup('d');
+			delete = B_TRUE;
+			ocnt++;
+			if (ether_aton_r(optarg, &e) == NULL)
+				die("invalid mac address: %s\n", optarg);
+			/* Remember the MAC text for later error messages. */
+			macstr = optarg;
+			break;
+		case 'f':
+			if (flush == B_TRUE)
+				die_optdup('f');
+			flush = B_TRUE;
+			ocnt++;
+			break;
+		case 's':
+			if (set == B_TRUE)
+				die_optdup('s');
+			set = B_TRUE;
+			ocnt++;
+			/*
+			 * Check for the '=' separator before writing through
+			 * the pointer; without it strchr() returns NULL.
+			 */
+			dest = strchr(optarg, '=');
+			if (dest == NULL)
+				die("malformed value, expected mac=dest, "
+				    "got: %s\n", optarg);
+			/* Split "mac=dest" in place into two strings. */
+			*dest = '\0';
+			dest++;
+			if (ether_aton_r(optarg, &e) == NULL)
+				die("invalid mac address: %s\n", optarg);
+			macstr = optarg;
+			break;
+		default:
+			die_opterr(optopt, opt, use);
+		}
+	}
+
+	if (ocnt == 0)
+		die("need to specify one of -d, -f, or -s");
+	if (ocnt > 1)
+		die("only one of -d, -f, or -s may be used");
+
+	if (argv[optind] == NULL)
+		die("missing required overlay device\n");
+	if (argc > optind + 1)
+		die("only one overlay device may be specified\n");
+
+	status = dladm_name2info(handle, argv[optind], &linkid, NULL, NULL,
+	    NULL);
+	if (status != DLADM_STATUS_OK) {
+		die_dlerr(status, "failed to find overlay %s", argv[optind]);
+	}
+
+	if (flush == B_TRUE) {
+		status = dladm_overlay_cache_flush(handle, linkid);
+		if (status != DLADM_STATUS_OK)
+			die_dlerr(status, "failed to flush target cache for "
+			    "overlay %s", argv[optind]);
+	}
+
+	/*
+	 * optarg is not meaningful once option parsing has finished, so the
+	 * messages below use the MAC string saved while parsing.
+	 */
+	if (delete == B_TRUE) {
+		status = dladm_overlay_cache_delete(handle, linkid, &e);
+		if (status != DLADM_STATUS_OK)
+			die_dlerr(status, "failed to flush target %s from "
+			    "overlay target cache %s", macstr, argv[optind]);
+	}
+
+	if (set == B_TRUE) {
+		status = dladm_overlay_cache_set(handle, linkid, &e, dest);
+		if (status != DLADM_STATUS_OK)
+			die_dlerr(status, "failed to set target %s for overlay "
+			    "target cache %s", macstr, argv[optind]);
+	}
+}
diff --git a/usr/src/cmd/varpd/Makefile b/usr/src/cmd/varpd/Makefile
new file mode 100644
index 0000000000..4d9e29cd26
--- /dev/null
+++ b/usr/src/cmd/varpd/Makefile
@@ -0,0 +1,64 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+PROG=		varpd
+OBJS =		varpd.o
+#
+# varpd.c lives in this directory, right next to this Makefile, so the
+# sources are found alongside the objects rather than one level up.
+#
+SRCS =		$(OBJS:%.o=%.c)
+MANIFEST =	varpd.xml
+# varpd is delivered into its own subdirectory of $(ROOTLIB).
+ROOTLIBVARPD =	$(ROOTLIB)/varpd
+ROOTLIBVARPDPROG= $(PROG:%=$(ROOTLIBVARPD)/%)
+
+
+include ../Makefile.cmd
+include ../Makefile.ctf
+
+# Must follow the include of Makefile.cmd so that this definition wins.
+ROOTMANIFESTDIR=	$(ROOTSVCNETWORK)
+
+CLEANFILES +=	$(OBJS)
+CPPFLAGS +=	-D_REENTRANT
+CFLAGS +=	$(CCVERBOSE)
+LDLIBS +=	-lvarpd -lumem -lscf
+# DEBUG is what enables the umem debugging hooks in varpd.c.
+$(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG
+
+CSTD=		$(CSTD_GNU99)
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+	$(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+	$(POST_PROCESS)
+
+clean:
+	-$(RM) $(CLEANFILES)
+
+#
+# Build objects from the sources in this directory. The stem has to match
+# where varpd.c actually is or this rule is never selected.
+#
+%.o: %.c
+	$(COMPILE.c) $<
+	$(POST_PROCESS_O)
+
+check: $(CHKMANIFEST)
+
+install: $(PROG) $(ROOTLIBVARPDPROG) $(ROOTMANIFEST)
+
+$(ROOTLIBVARPD):
+	$(INS.dir)
+
+$(ROOTLIBVARPD)/%: % $(ROOTLIBVARPD)
+	$(INS.file)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/varpd/varpd.c b/usr/src/cmd/varpd/varpd.c
new file mode 100644
index 0000000000..1b013417f8
--- /dev/null
+++ b/usr/src/cmd/varpd/varpd.c
@@ -0,0 +1,526 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2021 Joyent, Inc.
+ */
+
+/*
+ * virtual arp daemon -- varpd
+ *
+ * The virtual arp daemon is the user land counterpart to the overlay driver. To
+ * truly understand its purpose and how it fits into things, you should read the
+ * overlay big theory statement in uts/common/io/overlay/overlay.c.
+ *
+ * varpd's purpose is to provide a means for looking up the destination on the
+ * underlay network for a host on an overlay network and to also be a door
+ * server such that dladm(1M) via libdladm can configure and get useful status
+ * information. The heavy lifting is all done by libvarpd and the various lookup
+ * plugins.
+ *
+ * When varpd first starts up, we take care of chdiring into /var/run/varpd,
+ * which is also where we create /var/run/varpd/varpd.door, our door server.
+ * After that we daemonize and only after we daemonize do we go ahead and load
+ * plugins. The reason that we don't load plugins before daemonizing is that
+ * they could very well be creating threads and thus lose them all. In general,
+ * we want to make things easier on our children and not require them to be
+ * fork safe.
+ *
+ * Once it's spun up, the main varpd thread sits in sigsuspend and really just
+ * hangs out waiting for something, libvarpd handles everything else.
+ */
+
+#include <libvarpd.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <libgen.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <paths.h>
+#include <limits.h>
+#include <sys/corectl.h>
+#include <signal.h>
+#include <strings.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <thread.h>
+#include <priv.h>
+#include <libscf.h>
+
+/*
+ * Process exit statuses, expressed in terms of the SMF method exit codes from
+ * <libscf.h>. VARPD_EXIT_USAGE is defined for completeness but is not
+ * referenced elsewhere in this file.
+ */
+#define VARPD_EXIT_REQUESTED SMF_EXIT_OK
+#define VARPD_EXIT_FATAL SMF_EXIT_ERR_FATAL
+#define VARPD_EXIT_USAGE SMF_EXIT_ERR_CONFIG
+
+/*
+ * Runtime directory (created, chown'd and chdir'd into at startup) and the
+ * door path used when -d is not given.
+ */
+#define VARPD_RUNDIR "/var/run/varpd"
+#define VARPD_DEFAULT_DOOR "/var/run/varpd/varpd.door"
+
+/*
+ * SMF property group and property that varpd_load_smf() reads plugin
+ * directories from.
+ */
+#define VARPD_PG "varpd"
+#define VARPD_PROP_INC "include_path"
+
+/* The single libvarpd handle used for the life of the process. */
+static varpd_handle_t *varpd_handle;
+/* basename of argv[0]; prefixed to every diagnostic by varpd_vwarn(). */
+static const char *varpd_pname;
+/* Set by the signal handler, polled by main()'s sigsuspend() loop. */
+static volatile boolean_t varpd_exit = B_FALSE;
+
+/*
+ * Debug builds are automatically wired up for umem debugging. Neither of these
+ * functions is called from within this file; they are deliberately non-static
+ * so that they can be found from outside of it (presumably by libumem, which
+ * the Makefile links varpd against -- confirm against umem_debug(3MALLOC)).
+ * Each simply returns a fixed option string.
+ *
+ * NOTE(review): _umem_debug_init() is defined with an empty parameter list
+ * while its sibling uses (void); consider making the two consistent.
+ */
+#ifdef DEBUG
+const char *
+_umem_debug_init()
+{
+ return ("default,verbose");
+}
+
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents");
+}
+#endif /* DEBUG */
+
+/*
+ * Common worker for varpd's diagnostic routines. Writes "<progname>: "
+ * followed by the formatted message to 'out'. If the format string does not
+ * end in a newline, ": <strerror>\n" is appended, using the value that errno
+ * had when this function was entered.
+ *
+ *	out	stream to write the message to
+ *	fmt	printf-style format string
+ *	ap	arguments for fmt; the caller owns va_start()/va_end()
+ */
+static void
+varpd_vwarn(FILE *out, const char *fmt, va_list ap)
+{
+	/* Snapshot errno before stdio gets a chance to change it. */
+	int error = errno;
+	size_t len = strlen(fmt);
+
+	(void) fprintf(out, "%s: ", varpd_pname);
+	(void) vfprintf(out, fmt, ap);
+
+	/*
+	 * An empty format has no last character to inspect. Treat it like any
+	 * other message that lacks a trailing newline rather than reading the
+	 * byte in front of the string.
+	 */
+	if (len == 0 || fmt[len - 1] != '\n')
+		(void) fprintf(out, ": %s\n", strerror(error));
+}
+
+/*
+ * Report a fatal error on stderr by way of varpd_vwarn() and exit with
+ * VARPD_EXIT_FATAL; this function does not return. When the format string
+ * does not end in a newline, varpd_vwarn() appends strerror(errno), so such
+ * callers should not also format the error string themselves.
+ */
+static void
+varpd_fatal(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ varpd_vwarn(stderr, fmt, ap);
+ va_end(ap);
+
+ exit(VARPD_EXIT_FATAL);
+}
+
+/*
+ * Fatal error path for use once we have daemonized. The message is formatted
+ * by varpd_vwarn() (note: onto stdout, not stderr), after which
+ * VARPD_EXIT_FATAL is written to dfd and the process exits with that same
+ * status. In main(), dfd is the descriptor returned by varpd_daemonize(), that
+ * is, the write side of the pipe whose read side the waiting parent turns into
+ * its own exit status. This function does not return.
+ */
+static void
+varpd_dfatal(int dfd, const char *fmt, ...)
+{
+ int status = VARPD_EXIT_FATAL;
+ va_list ap;
+
+ va_start(ap, fmt);
+ varpd_vwarn(stdout, fmt, ap);
+ va_end(ap);
+
+ /* Take a single shot at this */
+ (void) write(dfd, &status, sizeof (status));
+ exit(status);
+}
+
+/*
+ * Callback handed to libvarpd_plugin_walk() by main(): prints the name of each
+ * plugin it is called with to stdout. The handle and the private argument are
+ * unused. Always returns 0.
+ */
+/* ARGSUSED */
+static int
+varpd_plugin_walk_cb(varpd_handle_t *vph, const char *name, void *unused)
+{
+ (void) printf("loaded %s!\n", name);
+ return (0);
+}
+
+/*
+ * Make sure that VARPD_RUNDIR exists (creating it with mode 0700 if need be),
+ * open it, and hand ownership of it to netadm:netadm. Any failure is fatal.
+ *
+ * Returns an open, read-only file descriptor for the directory; main() passes
+ * it on to varpd_daemonize(), which uses it with fchdir().
+ */
+static int
+varpd_dir_setup(void)
+{
+	int fd;
+
+	/*
+	 * varpd_fatal() appends strerror(errno) on its own whenever the format
+	 * does not end in a newline. These messages already spell the error
+	 * out, so each is newline terminated to keep the error text from being
+	 * printed twice.
+	 */
+	if (mkdir(VARPD_RUNDIR, 0700) != 0 && errno != EEXIST) {
+		varpd_fatal("failed to create %s: %s\n", VARPD_RUNDIR,
+		    strerror(errno));
+	}
+
+	fd = open(VARPD_RUNDIR, O_RDONLY);
+	if (fd < 0) {
+		varpd_fatal("failed to open %s: %s\n", VARPD_RUNDIR,
+		    strerror(errno));
+	}
+
+	if (fchown(fd, UID_NETADM, GID_NETADM) != 0) {
+		varpd_fatal("failed to chown %s: %s\n", VARPD_RUNDIR,
+		    strerror(errno));
+	}
+
+	return (fd);
+}
+
+/*
+ * Because varpd is generally run under SMF, we opt to keep its stdout and
+ * stderr to be whatever our parent set them up to be. Every descriptor above
+ * stderr is closed and stdin is pointed at /dev/null. Any failure is fatal.
+ */
+static void
+varpd_fd_setup(void)
+{
+	int dupfd;
+
+	closefrom(STDERR_FILENO + 1);
+
+	/*
+	 * varpd_fatal() appends strerror(errno) itself when the format lacks a
+	 * trailing newline; since we format the error here, terminate the
+	 * messages so that it only shows up once.
+	 */
+	dupfd = open(_PATH_DEVNULL, O_RDONLY);
+	if (dupfd < 0) {
+		varpd_fatal("failed to open %s: %s\n", _PATH_DEVNULL,
+		    strerror(errno));
+	}
+	if (dup2(dupfd, STDIN_FILENO) == -1)
+		varpd_fatal("failed to dup out stdin: %s\n", strerror(errno));
+
+	/*
+	 * stdin now refers to /dev/null, so the descriptor we opened is no
+	 * longer needed. If stdin was closed when we started, open() handed us
+	 * STDIN_FILENO itself, in which case there is nothing extra to close.
+	 */
+	if (dupfd != STDIN_FILENO)
+		(void) close(dupfd);
+}
+
+/*
+ * We borrow fmd's daemonization style. Basically, the parent waits for the
+ * child to successfully set up a door and recover all of the old configurations
+ * before we say that we're good to go.
+ *
+ * dirfd is an open descriptor for the run directory, which becomes our working
+ * directory. This only ever returns in the child, where the return value is
+ * the write side of a pipe to the parent: the child (see main() and
+ * varpd_dfatal()) writes a single int to it, which the parent uses as its exit
+ * status. The parent never returns from here; it always calls _exit().
+ */
+static int
+varpd_daemonize(int dirfd)
+{
+ char path[PATH_MAX];
+ struct rlimit rlim;
+ sigset_t set, oset;
+ int estatus, pfds[2];
+ pid_t child;
+ priv_set_t *pset;
+
+ /*
+ * Set a per-process core path to be inside of /var/run/varpd. Make sure
+ * that we aren't limited in our dump size.
+ */
+ (void) snprintf(path, sizeof (path),
+ "/var/run/varpd/core.%s.%%p", varpd_pname);
+ (void) core_set_process_path(path, strlen(path) + 1, getpid());
+
+ rlim.rlim_cur = RLIM_INFINITY;
+ rlim.rlim_max = RLIM_INFINITY;
+ (void) setrlimit(RLIMIT_CORE, &rlim);
+
+ /*
+ * Claim as many file descriptors as the system will let us.
+ */
+ if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) {
+ rlim.rlim_cur = rlim.rlim_max;
+ (void) setrlimit(RLIMIT_NOFILE, &rlim);
+ }
+
+ /*
+ * chdir /var/run/varpd
+ */
+ if (fchdir(dirfd) != 0)
+ varpd_fatal("failed to chdir to %s", VARPD_RUNDIR);
+
+
+ /*
+ * At this point block all signals going in so we don't have the parent
+ * mistakenly exit when the child is running, but never block SIGABRT.
+ */
+ if (sigfillset(&set) != 0)
+ abort();
+ if (sigdelset(&set, SIGABRT) != 0)
+ abort();
+ if (sigprocmask(SIG_BLOCK, &set, &oset) != 0)
+ abort();
+
+ /*
+ * Do the fork+setsid dance.
+ */
+ if (pipe(pfds) != 0)
+ varpd_fatal("failed to create pipe for daemonizing");
+
+ if ((child = fork()) == -1)
+ varpd_fatal("failed to fork for daemonizing");
+
+ if (child != 0) {
+ /*
+ * Parent. Prefer the status that the child sends over the pipe;
+ * if the pipe closes without a full int arriving, fall back to
+ * the child's own exit status, and failing that, report a fatal
+ * error.
+ */
+ /* We'll be exiting shortly, so allow for silent failure */
+ (void) close(pfds[1]);
+ if (read(pfds[0], &estatus, sizeof (estatus)) ==
+ sizeof (estatus))
+ _exit(estatus);
+
+ if (waitpid(child, &estatus, 0) == child && WIFEXITED(estatus))
+ _exit(WEXITSTATUS(estatus));
+
+ _exit(VARPD_EXIT_FATAL);
+ }
+
+ /*
+ * Drop privileges here.
+ *
+ * We should make sure we keep around PRIV_NET_PRIVADDR and
+ * PRIV_SYS_DLCONFIG, but drop everything else; however, keep basic
+ * privs and have our child drop them.
+ *
+ * We should also run as netadm:netadm and drop all of our groups.
+ *
+ * NOTE(review): only the effective uid is changed below (seteuid());
+ * confirm that leaving the real and saved uids alone is intended.
+ */
+ if (setgroups(0, NULL) != 0)
+ abort();
+ if (setgid(GID_NETADM) == -1 || seteuid(UID_NETADM) == -1)
+ abort();
+ if ((pset = priv_allocset()) == NULL)
+ abort();
+ /* Start from the basic set, trim it, then add the two privs we need. */
+ priv_basicset(pset);
+ if (priv_delset(pset, PRIV_PROC_EXEC) == -1 ||
+ priv_delset(pset, PRIV_PROC_INFO) == -1 ||
+ priv_delset(pset, PRIV_PROC_FORK) == -1 ||
+ priv_delset(pset, PRIV_PROC_SESSION) == -1 ||
+ priv_delset(pset, PRIV_FILE_LINK_ANY) == -1 ||
+ priv_addset(pset, PRIV_SYS_DL_CONFIG) == -1 ||
+ priv_addset(pset, PRIV_NET_PRIVADDR) == -1) {
+ abort();
+ }
+ /*
+ * Remove privs from the permitted set. That will cause them to be
+ * removed from the effective set. We want to make sure that in the case
+ * of a vulnerability, something can't get back in here and wreak more
+ * havoc. But if we want non-basic privs in the effective set, we have
+ * to request them explicitly.
+ */
+ if (setppriv(PRIV_SET, PRIV_PERMITTED, pset) == -1)
+ abort();
+ if (setppriv(PRIV_SET, PRIV_EFFECTIVE, pset) == -1)
+ abort();
+
+ priv_freeset(pset);
+
+ /* The child only ever writes to the pipe; drop the read side. */
+ if (close(pfds[0]) != 0)
+ abort();
+ if (setsid() == -1)
+ abort();
+ /* Put back the signal mask that was in place before we forked. */
+ if (sigprocmask(SIG_SETMASK, &oset, NULL) != 0)
+ abort();
+ (void) umask(0022);
+
+ return (pfds[1]);
+}
+
+/*
+ * Spin up the pool of threads that service overlay lookups: two per online
+ * CPU, plus one. Every thread is created detached and as a daemon thread and
+ * runs libvarpd_overlay_lookup_run() against the global handle.
+ *
+ * Returns 0 once all of the threads exist, otherwise the error from the first
+ * thr_create() that failed. Aborts if the computed thread count isn't positive.
+ */
+static int
+varpd_setup_lookup_threads(void)
+{
+	long nthreads = sysconf(_SC_NPROCESSORS_ONLN) * 2 + 1;
+	long created = 0;
+
+	if (nthreads <= 0)
+		abort();
+
+	while (created < nthreads) {
+		thread_t tid;
+		int err;
+
+		err = thr_create(NULL, 0, libvarpd_overlay_lookup_run,
+		    varpd_handle, THR_DETACHED | THR_DAEMON, &tid);
+		if (err != 0)
+			return (err);
+		created++;
+	}
+
+	return (0);
+}
+
+/*
+ * Signal handler that main() installs for SIGHUP, SIGQUIT, SIGINT and SIGTERM.
+ * All it does is set varpd_exit; main()'s sigsuspend() loop notices the flag
+ * and performs the actual teardown.
+ */
+static void
+varpd_cleanup(void)
+{
+ varpd_exit = B_TRUE;
+}
+
+/*
+ * Load default information from SMF and apply any of it if necessary. We
+ * recognize the following properties:
+ *
+ * varpd/include_path	Treat these as a series of -i options.
+ *
+ * If we're not under SMF (SMF_FMRI is not in the environment), or the property
+ * can't be retrieved, just move on. Failing to load plugins from one of the
+ * listed paths is fatal and is reported through dfd.
+ */
+static void
+varpd_load_smf(int dfd)
+{
+	char *fmri = getenv("SMF_FMRI");
+	scf_simple_prop_t *incprop;
+	char *path;
+
+	if (fmri == NULL)
+		return;
+
+	incprop = scf_simple_prop_get(NULL, fmri, VARPD_PG, VARPD_PROP_INC);
+	if (incprop == NULL)
+		return;
+
+	/* Walk each value of the property, in order, until we run out. */
+	for (path = scf_simple_prop_next_astring(incprop); path != NULL;
+	    path = scf_simple_prop_next_astring(incprop)) {
+		int err = libvarpd_plugin_load(varpd_handle, path);
+
+		if (err == 0)
+			continue;
+
+		varpd_dfatal(dfd, "failed to load from %s: %s\n",
+		    path, strerror(err));
+	}
+
+	scf_simple_prop_free(incprop);
+}
+
+/*
+ * There are a bunch of things we need to do to be a proper daemon here.
+ *
+ * o Ensure that /var/run/varpd exists or create it
+ * o make stdin /dev/null (stdout?)
+ * o Ensure any other fds that we somehow inherited are closed, eg.
+ *   closefrom()
+ * o Properly daemonize
+ * o Mask all signals except sigabrt before creating our first door -- all
+ *   other doors will inherit from that.
+ * o Have the main thread sigsuspend looking for most things that are
+ *   actionable...
+ *
+ * Options:
+ *
+ *	-i path		load plugins from path; may be given multiple times
+ *	-d door		create the door server at door rather than at
+ *			VARPD_DEFAULT_DOOR
+ *
+ * Errors before we daemonize are reported directly and end the process;
+ * errors afterwards go through varpd_dfatal() so that the waiting parent
+ * exits with the failure status. On an orderly, signal-driven shutdown we
+ * return VARPD_EXIT_REQUESTED.
+ */
+int
+main(int argc, char *argv[])
+{
+	int err, c, dirfd, dfd, i;
+	const char *doorpath = VARPD_DEFAULT_DOOR;
+	sigset_t set;
+	struct sigaction act;
+	int nincpath = 0, nextincpath = 0;
+	char **incpath = NULL;
+
+	varpd_pname = basename(argv[0]);
+
+	/*
+	 * We want to clean up our file descriptors before we do anything else
+	 * as we can't assume that libvarpd won't open file descriptors, etc.
+	 */
+	varpd_fd_setup();
+
+	/*
+	 * As with the other libvarpd calls below, the error comes back as the
+	 * return value. Format it ourselves (and newline terminate the
+	 * message) so that varpd_fatal() doesn't append whatever happens to be
+	 * in errno.
+	 */
+	if ((err = libvarpd_create(&varpd_handle)) != 0) {
+		varpd_fatal("failed to open a libvarpd handle: %s\n",
+		    strerror(err));
+		return (1);
+	}
+
+	while ((c = getopt(argc, argv, ":i:d:")) != -1) {
+		switch (c) {
+		case 'i':
+			/* Grow the array geometrically, starting at 16. */
+			if (nextincpath == nincpath) {
+				char **grown;
+
+				if (nincpath == 0)
+					nincpath = 16;
+				else
+					nincpath *= 2;
+				grown = realloc(incpath, sizeof (char *) *
+				    nincpath);
+				/*
+				 * We can't go on without the array; storing
+				 * through a NULL pointer below is not an
+				 * option.
+				 */
+				if (grown == NULL) {
+					varpd_fatal("failed to allocate "
+					    "memory for the %dth -i option: "
+					    "%s\n", nextincpath + 1,
+					    strerror(errno));
+				}
+				incpath = grown;
+			}
+			incpath[nextincpath] = optarg;
+			nextincpath++;
+			break;
+		case 'd':
+			doorpath = optarg;
+			break;
+		case ':':
+			/*
+			 * The leading ':' in the option string makes getopt()
+			 * return ':' for a missing argument, with the option
+			 * in question left in optopt.
+			 */
+			(void) fprintf(stderr, "option -%c requires an "
+			    "argument\n", optopt);
+			return (1);
+		default:
+			/* c is '?' here; the offending letter is in optopt. */
+			(void) fprintf(stderr, "unknown option: %c\n", optopt);
+			return (1);
+		}
+	}
+
+	dirfd = varpd_dir_setup();
+
+	(void) libvarpd_plugin_walk(varpd_handle, varpd_plugin_walk_cb, NULL);
+
+	dfd = varpd_daemonize(dirfd);
+
+	/*
+	 * Now that we're in the child, go ahead and load all of our plug-ins.
+	 * We do this, in part, because these plug-ins may need threads of their
+	 * own and fork won't preserve those and we'd rather the plug-ins don't
+	 * have to learn about fork-handlers.
+	 */
+	for (i = 0; i < nextincpath; i++) {
+		err = libvarpd_plugin_load(varpd_handle, incpath[i]);
+		if (err != 0) {
+			varpd_dfatal(dfd, "failed to load from %s: %s\n",
+			    incpath[i], strerror(err));
+		}
+	}
+
+	varpd_load_smf(dfd);
+
+	if ((err = libvarpd_persist_enable(varpd_handle, VARPD_RUNDIR)) != 0)
+		varpd_dfatal(dfd, "failed to enable varpd persistence: %s\n",
+		    strerror(err));
+
+	if ((err = libvarpd_persist_restore(varpd_handle)) != 0)
+		varpd_dfatal(dfd, "failed to restore varpd persistence: %s\n",
+		    strerror(err));
+
+	/*
+	 * The ur-door thread will inherit from this signal mask. So set it to
+	 * what we want before doing anything else. In addition, so will our
+	 * threads that handle varpd lookups.
+	 */
+	if (sigfillset(&set) != 0)
+		varpd_dfatal(dfd, "failed to fill a signal set...");
+
+	if (sigdelset(&set, SIGABRT) != 0)
+		varpd_dfatal(dfd, "failed to unmask SIGABRT");
+
+	if (sigprocmask(SIG_BLOCK, &set, NULL) != 0)
+		varpd_dfatal(dfd, "failed to set our door signal mask");
+
+	if ((err = varpd_setup_lookup_threads()) != 0)
+		varpd_dfatal(dfd, "failed to create lookup threads: %s\n",
+		    strerror(err));
+
+	if ((err = libvarpd_door_server_create(varpd_handle, doorpath)) != 0)
+		varpd_dfatal(dfd, "failed to create door server at %s: %s\n",
+		    doorpath, strerror(err));
+
+	/*
+	 * At this point, finish up signal initialization and finally go ahead,
+	 * notify the parent that we're okay, and enter the sigsuspend loop.
+	 * Each signal that we handle is also removed from 'set', which is the
+	 * mask that sigsuspend() waits with below.
+	 */
+	bzero(&act, sizeof (struct sigaction));
+	act.sa_handler = varpd_cleanup;
+	if (sigfillset(&act.sa_mask) != 0)
+		varpd_dfatal(dfd, "failed to fill sigaction mask");
+	act.sa_flags = 0;
+	if (sigaction(SIGHUP, &act, NULL) != 0)
+		varpd_dfatal(dfd, "failed to register HUP handler");
+	if (sigdelset(&set, SIGHUP) != 0)
+		varpd_dfatal(dfd, "failed to remove HUP from mask");
+	if (sigaction(SIGQUIT, &act, NULL) != 0)
+		varpd_dfatal(dfd, "failed to register QUIT handler");
+	if (sigdelset(&set, SIGQUIT) != 0)
+		varpd_dfatal(dfd, "failed to remove QUIT from mask");
+	if (sigaction(SIGINT, &act, NULL) != 0)
+		varpd_dfatal(dfd, "failed to register INT handler");
+	if (sigdelset(&set, SIGINT) != 0)
+		varpd_dfatal(dfd, "failed to remove INT from mask");
+	if (sigaction(SIGTERM, &act, NULL) != 0)
+		varpd_dfatal(dfd, "failed to register TERM handler");
+	if (sigdelset(&set, SIGTERM) != 0)
+		varpd_dfatal(dfd, "failed to remove TERM from mask");
+
+	/* Tell the parent that we made it: a status of zero. */
+	err = 0;
+	(void) write(dfd, &err, sizeof (err));
+	(void) close(dfd);
+
+	for (;;) {
+		if (sigsuspend(&set) == -1)
+			if (errno == EFAULT)
+				abort();
+		if (varpd_exit == B_TRUE)
+			break;
+	}
+
+	libvarpd_door_server_destroy(varpd_handle);
+	libvarpd_destroy(varpd_handle);
+
+	return (VARPD_EXIT_REQUESTED);
+}
diff --git a/usr/src/cmd/varpd/varpd.xml b/usr/src/cmd/varpd/varpd.xml
new file mode 100644
index 0000000000..df7015a3d6
--- /dev/null
+++ b/usr/src/cmd/varpd/varpd.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0"?>
+<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1">
+<!--
+This file and its contents are supplied under the terms of the
+Common Development and Distribution License ("CDDL"), version 1.0.
+You may only use this file in accordance with the terms of version
+1.0 of the CDDL.
+
+A full copy of the text of the CDDL should have accompanied this
+source. A copy of the CDDL is also available via the Internet at
+http://www.illumos.org/license/CDDL.
+
+Copyright 2018, Joyent, Inc.
+-->
+
+<!--
+SMF manifest for the virtual ARP daemon, svc:/network/varpd. A single default
+instance is created and enabled.
+-->
+<service_bundle type="manifest" name="illumos:varpd" >
+
+ <service name="network/varpd" type="service" version="1" >
+
+ <create_default_instance enabled="true" />
+
+ <single_instance/>
+
+ <dependency name="varpd-network-physical"
+ grouping="require_all"
+ restart_on="none"
+ type="service">
+ <service_fmri value="svc:/network/physical:default" />
+ </dependency>
+
+ <dependency name="varpd-device-local"
+ grouping="require_all"
+ restart_on="none"
+ type="service">
+ <service_fmri value="svc:/system/device/local:default" />
+ </dependency>
+
+ <!--
+ The daemon is started with no arguments, so it uses its built-in
+ default door path and takes its plugin directories from the
+ varpd/include_path property below.
+ -->
+ <exec_method
+ type="method"
+ name="start"
+ exec="/usr/lib/varpd/varpd"
+ timeout_seconds="60" />
+
+ <!--
+ Stopping relies on signal delivery; varpd installs handlers for
+ SIGHUP, SIGQUIT, SIGINT and SIGTERM that make it shut down cleanly.
+ -->
+ <exec_method
+ type="method"
+ name="stop"
+ exec=":kill"
+ timeout_seconds="10" />
+
+ <!--
+ Read by varpd at startup: each value of include_path is treated as
+ a directory to load lookup plugins from, as if it had been passed
+ with -i.
+ -->
+ <property_group name='varpd' type='application'>
+ <property name='include_path' type='astring'>
+ <astring_list>
+ <value_node value='/usr/lib/varpd'/>
+ </astring_list>
+ </property>
+ </property_group>
+
+ <stability value='Unstable' />
+
+ <template>
+ <common_name>
+ <loctext xml:lang="C">virtual ARP daemon
+ </loctext>
+ </common_name>
+ </template>
+ </service>
+</service_bundle>
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index aa163cc3bf..d1a33d262d 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -272,6 +272,7 @@ SUBDIRS += \
sun_fc \
sun_sas \
udapl \
+ varpd \
watchmalloc \
$($(MACH)_SUBDIRS)
@@ -491,6 +492,7 @@ HDRSUBDIRS= \
smbsrv \
smhba \
udapl \
+ varpd \
$($(MACH)_HDRSUBDIRS)
i386_HDRSUBDIRS= \
@@ -609,7 +611,7 @@ libdhcputil: libgen libinetutil libdlpi
libdiskmgt: libdevid libdevinfo libadm libefi libkstat libsysevent
$(INTEL_BLD)libdiskmgt: libfdisk
libdladm: libdevinfo libinetutil libscf librcm libexacct libkstat \
- libpool
+ libpool varpd
libdlpi: libinetutil libdladm
libds: libsysevent
libdtrace: libproc libgen libctf libmapmalloc
@@ -721,6 +723,8 @@ storage: libdevice libdevinfo libdevid
sun_fc: libdevinfo libsysevent
sun_sas: libdevinfo libsysevent libkstat libdevid
udapl: libdevinfo libdladm
+varpd: libavl libidspace libumem libnsl libnvpair libmd5 librename \
+ libcustr
#
# The reason this rule checks for the existence of the
diff --git a/usr/src/lib/libdladm/Makefile b/usr/src/lib/libdladm/Makefile
index 5202579b6c..e4825d91da 100644
--- a/usr/src/lib/libdladm/Makefile
+++ b/usr/src/lib/libdladm/Makefile
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright 2015, Joyent, Inc.
#
#
@@ -29,7 +30,7 @@ HDRS = libdladm.h libdladm_impl.h libdllink.h libdlaggr.h \
libdlwlan.h libdlwlan_impl.h libdlvnic.h libdlvlan.h \
libdlmgmt.h libdlflow.h libdlflow_impl.h libdlstat.h \
libdlether.h libdlsim.h libdlbridge.h libdliptun.h \
- libdlib.h
+ libdlib.h libdloverlay.h
HDRDIR = common
@@ -71,7 +72,13 @@ TYPELIST = \
dlmgmt_getconfsnapshot_retval_t \
dlmgmt_door_zoneboot_t \
dlmgmt_remapid_retval_t \
- dlmgmt_createid_retval_t
+ dlmgmt_createid_retval_t \
+ overlay_ioc_create_t \
+ overlay_ioc_activate_t \
+ overlay_ioc_delete_t \
+ overlay_ioc_nprops_t \
+ overlay_ioc_propinfo_t \
+ overlay_ioc_prop_t
all := TARGET = all
clean := TARGET = clean
diff --git a/usr/src/lib/libdladm/Makefile.com b/usr/src/lib/libdladm/Makefile.com
index d170a97998..13a5e8384a 100644
--- a/usr/src/lib/libdladm/Makefile.com
+++ b/usr/src/lib/libdladm/Makefile.com
@@ -28,7 +28,8 @@ VERS = .1
OBJECTS = libdladm.o secobj.o linkprop.o libdllink.o libdlaggr.o \
libdlwlan.o libdlvnic.o libdlmgmt.o libdlvlan.o libdlib.o\
flowattr.o flowprop.o propfuncs.o libdlflow.o libdlstat.o \
- usage.o libdlether.o libdlsim.o libdlbridge.o libdliptun.o
+ usage.o libdlether.o libdlsim.o libdlbridge.o libdliptun.o \
+ libdloverlay.o
include ../../Makefile.lib
@@ -37,7 +38,7 @@ include ../../Makefile.rootfs
LIBS = $(DYNLIB)
LDLIBS += -ldevinfo -lc -linetutil -lsocket -lscf -lrcm -lnvpair \
- -lexacct -lkstat -lpool
+ -lexacct -lkstat -lpool -lvarpd
SRCDIR = ../common
@@ -51,9 +52,10 @@ CPPFLAGS += -I$(SRCDIR) -D_REENTRANT
# not linted
SMATCH=off
+CSTD= $(CSTD_GNU99)
+
.KEEP_STATE:
all: $(LIBS)
-
include $(SRC)/lib/Makefile.targ
diff --git a/usr/src/lib/libdladm/common/libdladm.c b/usr/src/lib/libdladm/common/libdladm.c
index eb099376a4..55e6d3e1e0 100644
--- a/usr/src/lib/libdladm/common/libdladm.c
+++ b/usr/src/lib/libdladm/common/libdladm.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
*/
/*
@@ -37,6 +38,9 @@
#include <strings.h>
#include <dirent.h>
#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdarg.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/param.h>
@@ -440,6 +444,9 @@ dladm_status2str(dladm_status_t status, char *buf)
case DLADM_STATUS_PERSIST_ON_TEMP:
s = "can't create persistent object on top of temporary object";
break;
+ case DLADM_STATUS_BAD_ENCAP:
+ s = "invalid encapsulation protocol";
+ break;
default:
s = "<unknown error>";
break;
@@ -672,6 +679,9 @@ dladm_class2str(datalink_class_t class, char *buf)
case DATALINK_CLASS_PART:
s = "part";
break;
+ case DATALINK_CLASS_OVERLAY:
+ s = "overlay";
+ break;
default:
s = "unknown";
break;
@@ -1157,15 +1167,15 @@ dladm_strs2range(char **prop_val, uint_t val_cnt, mac_propval_type_t type,
* Convert a mac_propval_range_t structure into an array of elements.
*/
dladm_status_t
-dladm_range2list(mac_propval_range_t *rangep, void *elem, uint_t *nelem)
+dladm_range2list(const mac_propval_range_t *rangep, void *elem, uint_t *nelem)
{
int i, j, k;
dladm_status_t status = DLADM_STATUS_OK;
switch (rangep->mpr_type) {
case MAC_PROPVAL_UINT32: {
- mac_propval_uint32_range_t *ur;
- uint32_t *elem32 = elem;
+ const mac_propval_uint32_range_t *ur;
+ uint32_t *elem32 = elem;
k = 0;
ur = &rangep->mpr_range_uint32[0];
@@ -1193,13 +1203,13 @@ dladm_range2list(mac_propval_range_t *rangep, void *elem, uint_t *nelem)
* of single elements or ranges.
*/
int
-dladm_range2strs(mac_propval_range_t *rangep, char **prop_val)
+dladm_range2strs(const mac_propval_range_t *rangep, char **prop_val)
{
int i;
switch (rangep->mpr_type) {
case MAC_PROPVAL_UINT32: {
- mac_propval_uint32_range_t *ur;
+ const mac_propval_uint32_range_t *ur;
/* Write ranges and individual elements */
ur = &rangep->mpr_range_uint32[0];
@@ -1216,6 +1226,20 @@ dladm_range2strs(mac_propval_range_t *rangep, char **prop_val)
}
return (0);
}
+ case MAC_PROPVAL_STR: {
+ const mac_propval_str_range_t *str;
+ size_t coff, len;
+
+ coff = 0;
+ str = &rangep->u.mpr_str;
+ for (i = 0; i < rangep->mpr_count; i++) {
+ len = strlen(&str->mpur_data[coff]);
+ (void) strlcpy(prop_val[i], &str->mpur_data[coff],
+ DLADM_PROP_VAL_MAX);
+ coff += len + 1;
+ }
+ return (0);
+ }
default:
break;
}
@@ -1293,3 +1317,54 @@ dladm_list2range(void *elem, uint_t nelem, mac_propval_type_t type,
return (status);
}
+
+/*
+ * Put an error list into its empty state by zeroing it: no entries, no
+ * allocated slots, and a NULL el_errs array. This does not free anything, so
+ * a list that already holds messages should go through dladm_errlist_reset()
+ * instead.
+ */
+void
+dladm_errlist_init(dladm_errlist_t *erl)
+{
+ bzero(erl, sizeof (dladm_errlist_t));
+}
+
+/*
+ * Release everything that an error list holds -- each of its el_count message
+ * strings and then the el_errs array itself -- and return the list to the
+ * empty state so that it may be reused.
+ */
+void
+dladm_errlist_reset(dladm_errlist_t *erl)
+{
+ uint_t i;
+
+ for (i = 0; i < erl->el_count; i++)
+ free(erl->el_errs[i]);
+ free(erl->el_errs);
+ dladm_errlist_init(erl);
+}
+
+/*
+ * Format a message, printf-style, and add it to the end of an error list.
+ *
+ * The array of message pointers grows 32 entries at a time whenever it is
+ * full. Returns DLADM_STATUS_OK on success, DLADM_STATUS_NOMEM if the array
+ * could not be grown, or the translation of errno if formatting the message
+ * failed. The list is left with its existing entries intact on failure.
+ */
+dladm_status_t
+dladm_errlist_append(dladm_errlist_t *erl, const char *fmt, ...)
+{
+	va_list args;
+	char *msg = NULL;
+	int len;
+
+	if (erl->el_count == erl->el_alloc) {
+		int nalloc = 32;
+		void *grown;
+
+		/* A list with no slots must not have an array either. */
+		if (erl->el_alloc != 0)
+			nalloc = erl->el_alloc + 32;
+		else
+			assert(erl->el_errs == NULL);
+
+		grown = realloc(erl->el_errs, sizeof (char *) * nalloc);
+		if (grown == NULL)
+			return (DLADM_STATUS_NOMEM);
+
+		erl->el_alloc = nalloc;
+		erl->el_errs = grown;
+	}
+
+	va_start(args, fmt);
+	len = vasprintf(&msg, fmt, args);
+	va_end(args);
+	if (len == -1)
+		return (dladm_errno2status(errno));
+
+	erl->el_errs[erl->el_count] = msg;
+	erl->el_count++;
+	return (DLADM_STATUS_OK);
+}
diff --git a/usr/src/lib/libdladm/common/libdladm.h b/usr/src/lib/libdladm/common/libdladm.h
index 350c9c50f3..5a97bacaa0 100644
--- a/usr/src/lib/libdladm/common/libdladm.h
+++ b/usr/src/lib/libdladm/common/libdladm.h
@@ -23,6 +23,7 @@
*/
/*
+ * Copyright 2015, Joyent, Inc.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association
*/
@@ -179,7 +180,8 @@ typedef enum {
DLADM_STATUS_INVALID_PKEY_TBL_SIZE,
DLADM_STATUS_PORT_NOPROTO,
DLADM_STATUS_INVALID_MTU,
- DLADM_STATUS_PERSIST_ON_TEMP
+ DLADM_STATUS_PERSIST_ON_TEMP,
+ DLADM_STATUS_BAD_ENCAP
} dladm_status_t;
typedef enum {
@@ -233,6 +235,12 @@ typedef struct dladm_arg_list {
char *al_buf;
} dladm_arg_list_t;
+typedef struct dladm_errlist {
+ uint_t el_count;
+ uint_t el_alloc;
+ char **el_errs;
+} dladm_errlist_t;
+
typedef enum {
DLADM_LOGTYPE_LINK = 1,
DLADM_LOGTYPE_FLOW
@@ -294,12 +302,15 @@ extern dladm_status_t dladm_zone_halt(dladm_handle_t, zoneid_t);
extern dladm_status_t dladm_strs2range(char **, uint_t, mac_propval_type_t,
mac_propval_range_t **);
-extern dladm_status_t dladm_range2list(mac_propval_range_t *, void*,
+extern dladm_status_t dladm_range2list(const mac_propval_range_t *, void *,
uint_t *);
-extern int dladm_range2strs(mac_propval_range_t *, char **);
+extern int dladm_range2strs(const mac_propval_range_t *, char **);
extern dladm_status_t dladm_list2range(void *, uint_t, mac_propval_type_t,
mac_propval_range_t **);
+extern void dladm_errlist_init(dladm_errlist_t *);
+extern void dladm_errlist_reset(dladm_errlist_t *);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/libdladm/common/libdladm_impl.h b/usr/src/lib/libdladm/common/libdladm_impl.h
index 20db1cb1d7..9cd91d56c1 100644
--- a/usr/src/lib/libdladm/common/libdladm_impl.h
+++ b/usr/src/lib/libdladm/common/libdladm_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/*
@@ -173,6 +174,12 @@ typedef struct resource_prop_s {
*/
#define FBRIDGE "bridge" /* string */
+/*
+ * For error lists
+ */
+extern dladm_status_t dladm_errlist_append(dladm_errlist_t *,
+ const char *, ...);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/libdladm/common/libdloverlay.c b/usr/src/lib/libdladm/common/libdloverlay.c
new file mode 100644
index 0000000000..a25be3d201
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdloverlay.c
@@ -0,0 +1,885 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#include <libdladm_impl.h>
+#include <libdllink.h>
+#include <libdloverlay.h>
+#include <sys/dld.h>
+#include <sys/overlay.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <limits.h>
+#include <libvarpd_client.h>
+
+#define VARPD_PROPERTY_NAME "varpd/id"
+
+static const char *dladm_overlay_doorpath = "/var/run/varpd/varpd.door";
+
+/*
+ * The concrete type behind the opaque dladm_overlay_propinfo_handle_t. A
+ * property is described either by the overlay driver or by varpd;
+ * dop_isvarpd selects which member of dop_un is valid: dop_overlay when it
+ * is B_FALSE, dop_varpd otherwise.
+ */
+typedef struct dladm_overlay_propinfo {
+ boolean_t dop_isvarpd;
+ union {
+ overlay_ioc_propinfo_t *dop_overlay;
+ varpd_client_prop_handle_t *dop_varpd;
+ } dop_un;
+} dladm_overlay_propinfo_t;
+
+/*
+ * Retrieve the metadata that describes a property: its name, type,
+ * protection bits, default value (and its size) and the range of possible
+ * values. Every output pointer is optional; pass NULL for anything that is
+ * not of interest.
+ *
+ * Properties that belong to varpd are answered by libvarpd; everything else
+ * is read out of the overlay driver's propinfo structure.
+ */
+dladm_status_t
+dladm_overlay_prop_info(dladm_overlay_propinfo_handle_t phdl,
+    const char **namep, uint_t *typep, uint_t *protp, const void **defp,
+    uint32_t *sizep, const mac_propval_range_t **possp)
+{
+	dladm_overlay_propinfo_t *infop = (dladm_overlay_propinfo_t *)phdl;
+	overlay_ioc_propinfo_t *oinfop = infop->dop_un.dop_overlay;
+
+	if (infop->dop_isvarpd != B_FALSE) {
+		int err;
+
+		err = libvarpd_c_prop_info(infop->dop_un.dop_varpd, namep,
+		    typep, protp, defp, sizep, possp);
+		if (err != 0)
+			return (dladm_errno2status(err));
+		return (DLADM_STATUS_OK);
+	}
+
+	/* Overlay driver property: copy out only what was asked for. */
+	if (namep != NULL)
+		*namep = oinfop->oipi_name;
+	if (typep != NULL)
+		*typep = oinfop->oipi_type;
+	if (protp != NULL)
+		*protp = oinfop->oipi_prot;
+	if (defp != NULL)
+		*defp = oinfop->oipi_default;
+	if (sizep != NULL)
+		*sizep = oinfop->oipi_defsize;
+	if (possp != NULL)
+		*possp = (const mac_propval_range_t *)oinfop->oipi_poss;
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert the string form of a property value into the binary form that the
+ * overlay driver and varpd expect.
+ *
+ *	type	the property's type, which selects the parser
+ *	buf	output buffer; must hold at least OVERLAY_PROP_SIZEMAX bytes
+ *	sizep	set to the number of bytes written to buf
+ *	val	NUL-terminated string to parse
+ *
+ * Returns DLADM_STATUS_OK on success and DLADM_STATUS_BADARG when val is not
+ * a valid value for the type. An unknown type aborts the process.
+ *
+ * Integers are always parsed with the 64-bit conversion routines so that the
+ * result does not depend on the width of long, and the entire string must be
+ * consumed: empty strings and trailing garbage (e.g. "12abc") are rejected.
+ */
+static dladm_status_t
+dladm_overlay_parse_prop(overlay_prop_type_t type, void *buf, uint32_t *sizep,
+    const char *val)
+{
+	size_t len;
+	int64_t ival;
+	uint64_t uval;
+	char *eptr;
+	struct in6_addr ipv6;
+	struct in_addr ip;
+
+	switch (type) {
+	case OVERLAY_PROP_T_INT:
+		errno = 0;
+		ival = strtoll(val, &eptr, 10);
+		if (errno != 0 || eptr == val || *eptr != '\0')
+			return (DLADM_STATUS_BADARG);
+		bcopy(&ival, buf, sizeof (int64_t));
+		*sizep = sizeof (int64_t);
+		break;
+	case OVERLAY_PROP_T_UINT:
+		/* Use the unsigned parser so the full uint64_t range works. */
+		errno = 0;
+		uval = strtoull(val, &eptr, 10);
+		if (errno != 0 || eptr == val || *eptr != '\0')
+			return (DLADM_STATUS_BADARG);
+		bcopy(&uval, buf, sizeof (uint64_t));
+		*sizep = sizeof (uint64_t);
+		break;
+	case OVERLAY_PROP_T_STRING:
+		len = strlcpy((char *)buf, val, OVERLAY_PROP_SIZEMAX);
+		if (len >= OVERLAY_PROP_SIZEMAX)
+			return (DLADM_STATUS_BADARG);
+		/* The size includes the terminating NUL. */
+		*sizep = len + 1;
+		break;
+	case OVERLAY_PROP_T_IP:
+		/*
+		 * Always try to parse the IP as an IPv6 address. If that fails,
+		 * try to interpret it as an IPv4 address and transform it into
+		 * an IPv6 mapped IPv4 address.
+		 */
+		if (inet_pton(AF_INET6, val, &ipv6) != 1) {
+			if (inet_pton(AF_INET, val, &ip) != 1)
+				return (DLADM_STATUS_BADARG);
+
+			IN6_INADDR_TO_V4MAPPED(&ip, &ipv6);
+		}
+		bcopy(&ipv6, buf, sizeof (struct in6_addr));
+		*sizep = sizeof (struct in6_addr);
+		break;
+	default:
+		abort();
+	}
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Set the property called name on varpd instance inst. The value is taken
+ * from valp[0], converted according to the type that varpd reports for the
+ * property, and handed to varpd. Neither handle nor cnt is consulted.
+ */
+/* ARGSUSED */
+static dladm_status_t
+dladm_overlay_varpd_setprop(dladm_handle_t handle, varpd_client_handle_t *chdl,
+    uint64_t inst, const char *name, char *const *valp, uint_t cnt)
+{
+	varpd_client_prop_handle_t *prop;
+	uint8_t valbuf[LIBVARPD_PROP_SIZEMAX];
+	uint32_t vallen;
+	uint_t proptype;
+	dladm_status_t status;
+	int err;
+
+	err = libvarpd_c_prop_handle_alloc(chdl, inst, &prop);
+	if (err != 0)
+		return (dladm_errno2status(err));
+
+	/* Look the property up by name and learn its type. */
+	err = libvarpd_c_prop_info_fill_by_name(prop, name);
+	if (err == 0) {
+		err = libvarpd_c_prop_info(prop, NULL, &proptype, NULL, NULL,
+		    NULL, NULL);
+	}
+	if (err != 0) {
+		status = dladm_errno2status(err);
+		goto out;
+	}
+
+	status = dladm_overlay_parse_prop(proptype, valbuf, &vallen, valp[0]);
+	if (status != DLADM_STATUS_OK)
+		goto out;
+
+	err = libvarpd_c_prop_set(prop, valbuf, vallen);
+	status = dladm_errno2status(err);
+out:
+	libvarpd_c_prop_handle_free(prop);
+	return (status);
+}
+
+/*
+ * Set a single property on an overlay device.
+ *
+ *	handle	libdladm handle
+ *	linkid	datalink id of the overlay device
+ *	name	name of the property to set
+ *	valp	array of value strings; exactly one value is supported
+ *	cnt	number of entries in valp; must be 1
+ *
+ * The property is looked up by name to learn its id and type, the string
+ * value is converted to its binary form, and the result is handed to the
+ * overlay driver.
+ *
+ * Returns DLADM_STATUS_OK on success, DLADM_STATUS_BADARG for invalid
+ * arguments or an unparsable value, and otherwise the status that corresponds
+ * to the errno of the failed ioctl.
+ */
+dladm_status_t
+dladm_overlay_setprop(dladm_handle_t handle, datalink_id_t linkid,
+    const char *name, char *const *valp, uint_t cnt)
+{
+	dladm_status_t status;
+	overlay_ioc_propinfo_t info;
+	overlay_ioc_prop_t prop;
+
+	if (linkid == DATALINK_INVALID_LINKID ||
+	    name == NULL || valp == NULL || cnt != 1)
+		return (DLADM_STATUS_BADARG);
+
+	/* An id of -1 asks the driver to look the property up by name. */
+	bzero(&info, sizeof (overlay_ioc_propinfo_t));
+	info.oipi_linkid = linkid;
+	info.oipi_id = -1;
+	if (strlcpy(info.oipi_name, name, OVERLAY_PROP_NAMELEN) >=
+	    OVERLAY_PROP_NAMELEN)
+		return (DLADM_STATUS_BADARG);
+
+	if (ioctl(dladm_dld_fd(handle), OVERLAY_IOC_PROPINFO, &info) != 0)
+		return (dladm_errno2status(errno));
+
+	/* Don't hand uninitialized stack contents to the kernel. */
+	bzero(&prop, sizeof (overlay_ioc_prop_t));
+	prop.oip_linkid = linkid;
+	prop.oip_id = info.oipi_id;
+	prop.oip_name[0] = '\0';
+	status = dladm_overlay_parse_prop(info.oipi_type, prop.oip_value,
+	    &prop.oip_size, valp[0]);
+	if (status != DLADM_STATUS_OK)
+		return (status);
+
+	/*
+	 * Report the translated errno, not the raw ioctl return value, which
+	 * is not a dladm_status_t.
+	 */
+	if (ioctl(dladm_dld_fd(handle), OVERLAY_IOC_SETPROP, &prop) != 0)
+		return (dladm_errno2status(errno));
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Property walker callback used when activation fails: for each property
+ * that is marked as required but currently has no value, add a message to
+ * the error list passed in through arg so the user learns what is missing.
+ */
+static int
+dladm_overlay_activate_cb(dladm_handle_t handle, datalink_id_t linkid,
+    dladm_overlay_propinfo_handle_t phdl, void *arg)
+{
+	dladm_errlist_t *errs = arg;
+	const char *name;
+	uint_t prot;
+	uint8_t valbuf[DLADM_OVERLAY_PROP_SIZEMAX];
+	size_t vallen = sizeof (valbuf);
+	dladm_status_t status;
+
+	status = dladm_overlay_prop_info(phdl, &name, NULL, &prot, NULL,
+	    NULL, NULL);
+	if (status != DLADM_STATUS_OK)
+		return (status);
+
+	/*
+	 * Only required properties whose value can be fetched and turns out
+	 * to be empty are worth reporting; everything else is skipped.
+	 */
+	if ((prot & OVERLAY_PROP_PERM_REQ) != 0 &&
+	    dladm_overlay_get_prop(handle, linkid, phdl, valbuf, &vallen) ==
+	    DLADM_STATUS_OK && vallen == 0) {
+		(void) dladm_errlist_append(errs, "unset required property: %s",
+		    name);
+	}
+
+	return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * Delete an overlay device and everything associated with it.
+ *
+ * We need to clean up the world here. The problem is that we may or may not
+ * actually have everything created. While in the normal case, we'd always have
+ * an overlay device, assigned datalink id, and a varpd instance, we might not
+ * have any of those, except for the datalink instance. Therefore, as long as
+ * the id refers to a valid overlay, we should try to clean up as much of the
+ * state as possible and most importantly, we need to make sure we delete the
+ * datalink id. If we fail to do that, then that name will become lost to time.
+ *
+ * A missing overlay device or a missing varpd instance (ENOENT) is therefore
+ * not an error: there is simply nothing to tear down for that piece.
+ *
+ * Returns DLADM_STATUS_BADARG if linkid does not name an overlay, the
+ * translated errno if the device cannot be deleted or varpd cannot be
+ * consulted, and otherwise the translated result of destroying the varpd
+ * instance.
+ */
+dladm_status_t
+dladm_overlay_delete(dladm_handle_t handle, datalink_id_t linkid)
+{
+	datalink_class_t class;
+	overlay_ioc_delete_t oid;
+	varpd_client_handle_t *chdl;
+	int ret;
+	uint32_t flags;
+	uint64_t varpdid;
+
+	if (dladm_datalink_id2info(handle, linkid, &flags, &class, NULL,
+	    NULL, 0) != DLADM_STATUS_OK)
+		return (DLADM_STATUS_BADARG);
+
+	if (class != DATALINK_CLASS_OVERLAY)
+		return (DLADM_STATUS_BADARG);
+
+	bzero(&oid, sizeof (oid));
+	oid.oid_linkid = linkid;
+	ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_DELETE, &oid);
+	if (ret != 0 && errno != ENOENT) {
+		return (dladm_errno2status(errno));
+	}
+
+	if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0) {
+		return (dladm_errno2status(ret));
+	}
+
+	ret = libvarpd_c_instance_lookup(chdl, linkid, &varpdid);
+	if (ret == 0) {
+		ret = libvarpd_c_instance_destroy(chdl, varpdid);
+	} else if (ret == ENOENT) {
+		/*
+		 * There is no varpd instance to destroy. That must not be
+		 * reported as a failure of the delete as a whole.
+		 */
+		ret = 0;
+	} else {
+		(void) libvarpd_c_destroy(chdl);
+		return (dladm_errno2status(ret));
+	}
+
+	(void) libvarpd_c_destroy(chdl);
+	(void) dladm_destroy_datalink_id(handle, linkid, flags);
+
+	return (dladm_errno2status(ret));
+}
+
+/*
+ * Fetch the current value of a property.
+ *
+ *	handle	libdladm handle
+ *	linkid	datalink id of the overlay device
+ *	infohdl	property handle, as passed to a property walker callback
+ *	buf	output buffer for the value
+ *	sizep	on entry the size of buf, which must be at least
+ *		DLADM_OVERLAY_PROP_SIZEMAX; on success the size of the value
+ *
+ * Returns DLADM_STATUS_OK on success or the status that corresponds to the
+ * error number of the failed operation.
+ */
+dladm_status_t
+dladm_overlay_get_prop(dladm_handle_t handle, datalink_id_t linkid,
+    dladm_overlay_propinfo_handle_t infohdl, void *buf, size_t *sizep)
+{
+	int ret;
+	overlay_ioc_prop_t oip;
+	dladm_overlay_propinfo_t *infop = (dladm_overlay_propinfo_t *)infohdl;
+
+	/*
+	 * It'd be nice if we had a better or more specific error for this. If
+	 * this kind of error becomes common place, let's get a better dladm
+	 * error.
+	 */
+	if (*sizep < DLADM_OVERLAY_PROP_SIZEMAX)
+		return (dladm_errno2status(ERANGE));
+
+	if (infop->dop_isvarpd == B_FALSE) {
+		bzero(&oip, sizeof (overlay_ioc_prop_t));
+		oip.oip_linkid = linkid;
+		oip.oip_id = infop->dop_un.dop_overlay->oipi_id;
+		ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_GETPROP, &oip);
+		if (ret != 0)
+			return (dladm_errno2status(errno));
+		bcopy(oip.oip_value, buf, DLADM_OVERLAY_PROP_SIZEMAX);
+		*sizep = oip.oip_size;
+	} else {
+		uint32_t size = *sizep;
+
+		/*
+		 * Unlike ioctl(2), the libvarpd client routines hand back the
+		 * error number as their return value, so that is what has to
+		 * be translated here rather than errno.
+		 */
+		ret = libvarpd_c_prop_get(infop->dop_un.dop_varpd, buf, &size);
+		if (ret != 0)
+			return (dladm_errno2status(ret));
+		*sizep = size;
+	}
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Invoke func once for every property of varpd instance varpdid. The walk
+ * ends early when func returns DLADM_WALK_TERMINATE or when the information
+ * for a property cannot be loaded; only the latter is reported as an error.
+ */
+static dladm_status_t
+dladm_overlay_walk_varpd_prop(dladm_handle_t handle, datalink_id_t linkid,
+    uint64_t varpdid, dladm_overlay_prop_f func, void *arg)
+{
+	varpd_client_handle_t *client;
+	varpd_client_prop_handle_t *prop;
+	dladm_status_t status = DLADM_STATUS_OK;
+	uint_t nprops;
+	int err, idx;
+
+	err = libvarpd_c_create(&client, dladm_overlay_doorpath);
+	if (err != 0)
+		return (dladm_errno2status(err));
+
+	err = libvarpd_c_prop_handle_alloc(client, varpdid, &prop);
+	if (err != 0) {
+		(void) libvarpd_c_destroy(client);
+		return (dladm_errno2status(err));
+	}
+
+	err = libvarpd_c_prop_nprops(client, varpdid, &nprops);
+	if (err != 0) {
+		libvarpd_c_prop_handle_free(prop);
+		(void) libvarpd_c_destroy(client);
+		return (dladm_errno2status(err));
+	}
+
+	for (idx = 0; idx < nprops; idx++) {
+		dladm_overlay_propinfo_t dop;
+
+		/* Wrap the varpd handle the way the callback expects it. */
+		bzero(&dop, sizeof (dop));
+		dop.dop_isvarpd = B_TRUE;
+		dop.dop_un.dop_varpd = prop;
+
+		err = libvarpd_c_prop_info_fill(prop, idx);
+		if (err != 0) {
+			status = dladm_errno2status(err);
+			break;
+		}
+
+		if (func(handle, linkid,
+		    (dladm_overlay_propinfo_handle_t)&dop, arg) ==
+		    DLADM_WALK_TERMINATE)
+			break;
+	}
+
+	libvarpd_c_prop_handle_free(prop);
+	libvarpd_c_destroy(client);
+
+	return (status);
+}
+
+/*
+ * Walk every property of an overlay device: first the properties of the
+ * overlay driver itself and then, if the device is associated with a varpd
+ * instance, the properties of that instance.
+ *
+ *	handle	libdladm handle
+ *	linkid	datalink id of the overlay device
+ *	func	callback invoked once per property; returning
+ *		DLADM_WALK_TERMINATE ends the entire walk
+ *	arg	opaque argument passed through to func
+ *	errs	error list that receives a message if property information
+ *		cannot be obtained from the driver
+ *
+ * Returns DLADM_STATUS_BADARG if linkid does not name an overlay device,
+ * the translated errno of a failed ioctl, or the result of walking the varpd
+ * properties.
+ */
+dladm_status_t
+dladm_overlay_walk_prop(dladm_handle_t handle, datalink_id_t linkid,
+    dladm_overlay_prop_f func, void *arg, dladm_errlist_t *errs)
+{
+	int i, ret;
+	datalink_class_t class;
+	overlay_ioc_nprops_t oin;
+	overlay_ioc_propinfo_t oipi;
+	dladm_overlay_propinfo_t dop;
+	uint64_t varpdid = UINT64_MAX;
+
+	if (dladm_datalink_id2info(handle, linkid, NULL, &class, NULL,
+	    NULL, 0) != DLADM_STATUS_OK)
+		return (DLADM_STATUS_BADARG);
+
+	if (class != DATALINK_CLASS_OVERLAY)
+		return (DLADM_STATUS_BADARG);
+
+	bzero(&oin, sizeof (overlay_ioc_nprops_t));
+	oin.oipn_linkid = linkid;
+	ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_NPROPS, &oin);
+	if (ret != 0)
+		return (dladm_errno2status(errno));
+
+	for (i = 0; i < oin.oipn_nprops; i++) {
+		bzero(&dop, sizeof (dladm_overlay_propinfo_t));
+		bzero(&oipi, sizeof (overlay_ioc_propinfo_t));
+		oipi.oipi_linkid = linkid;
+		oipi.oipi_id = i;
+		ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_PROPINFO, &oipi);
+		if (ret != 0) {
+			/*
+			 * Save errno: appending to the error list may itself
+			 * change it before we get to translate it.
+			 */
+			int err = errno;
+
+			(void) dladm_errlist_append(errs, "failed to get "
+			    "propinfo for property %d: %s", i, strerror(err));
+			return (dladm_errno2status(err));
+		}
+
+		dop.dop_isvarpd = B_FALSE;
+		dop.dop_un.dop_overlay = &oipi;
+		ret = func(handle, linkid,
+		    (dladm_overlay_propinfo_handle_t)&dop, arg);
+		/*
+		 * The callback asked for the walk to stop, so don't go on to
+		 * call it again for the varpd properties.
+		 */
+		if (ret == DLADM_WALK_TERMINATE)
+			return (DLADM_STATUS_OK);
+
+		if (strcmp(oipi.oipi_name, VARPD_PROPERTY_NAME) == 0) {
+			uint8_t buf[DLADM_OVERLAY_PROP_SIZEMAX];
+			size_t bufsize = sizeof (buf);
+
+			if (dladm_overlay_get_prop(handle, linkid,
+			    (dladm_overlay_propinfo_handle_t)&dop, buf,
+			    &bufsize) != DLADM_STATUS_OK)
+				continue;
+
+			/*
+			 * buf is only byte aligned; copy the id out rather
+			 * than dereferencing it as a uint64_t.
+			 */
+			bcopy(buf, &varpdid, sizeof (varpdid));
+		}
+	}
+
+	/* Should this really be possible? */
+	if (varpdid == UINT64_MAX)
+		return (DLADM_STATUS_OK);
+
+	return (dladm_overlay_walk_varpd_prop(handle, linkid, varpdid, func,
+	    arg));
+}
+
+/*
+ * Create and activate an overlay device.
+ *
+ *	handle	libdladm handle
+ *	name	name of the new datalink
+ *	encap	name of the encapsulation plugin
+ *	search	name of the varpd search plugin
+ *	vid	virtual network id
+ *	props	optional list of properties to set. Properties whose name
+ *		starts with "<search>/" are set on the varpd instance; all
+ *		others are set on the overlay device.
+ *	errs	error list that receives a description of what went wrong
+ *	flags	flags for the creation of the datalink id
+ *
+ * The steps are: allocate a datalink id, create the device, set the device
+ * properties, create the varpd instance, set its properties, activate the
+ * varpd instance and finally activate the device. If any step fails,
+ * everything that had been set up until then is torn down again.
+ *
+ * Returns DLADM_STATUS_OK on success or the status of the step that failed.
+ */
+dladm_status_t
+dladm_overlay_create(dladm_handle_t handle, const char *name,
+    const char *encap, const char *search, uint64_t vid,
+    dladm_arg_list_t *props, dladm_errlist_t *errs, uint32_t flags)
+{
+	int ret, i;
+	dladm_status_t status;
+	datalink_id_t linkid;
+	overlay_ioc_create_t oic;
+	overlay_ioc_activate_t oia;
+	size_t slen;
+	varpd_client_handle_t *vch;
+	uint64_t id;
+
+	status = dladm_create_datalink_id(handle, name, DATALINK_CLASS_OVERLAY,
+	    DL_ETHER, flags, &linkid);
+	if (status != DLADM_STATUS_OK)
+		return (status);
+
+	bzero(&oic, sizeof (oic));
+	oic.oic_linkid = linkid;
+	oic.oic_vnetid = vid;
+	(void) strlcpy(oic.oic_encap, encap, MAXLINKNAMELEN);
+
+	status = DLADM_STATUS_OK;
+	ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_CREATE, &oic);
+	if (ret != 0) {
+		/*
+		 * It'd be nice if we had private errors so we could better
+		 * distinguish between different classes of errors.
+		 */
+		status = dladm_errno2status(errno);
+	}
+
+	if (status != DLADM_STATUS_OK) {
+		(void) dladm_destroy_datalink_id(handle, linkid, flags);
+		return (status);
+	}
+
+	slen = strlen(search);
+	for (i = 0; props != NULL && i < props->al_count; i++) {
+		dladm_arg_info_t *aip = &props->al_info[i];
+
+		/*
+		 * If it's a property for the search plugin, eg. it has the
+		 * prefix '<search>/', then we don't set the property on the
+		 * overlay device and instead set it on the varpd instance.
+		 */
+		if (strncmp(aip->ai_name, search, slen) == 0 &&
+		    aip->ai_name[slen] == '/')
+			continue;
+		status = dladm_overlay_setprop(handle, linkid, aip->ai_name,
+		    aip->ai_val, aip->ai_count);
+		if (status != DLADM_STATUS_OK) {
+			(void) dladm_errlist_append(errs,
+			    "failed to set property %s",
+			    aip->ai_name);
+			(void) dladm_overlay_delete(handle, linkid);
+			return (status);
+		}
+	}
+
+	if ((ret = libvarpd_c_create(&vch, dladm_overlay_doorpath)) != 0) {
+		(void) dladm_errlist_append(errs,
+		    "failed to create libvarpd handle: %s", strerror(ret));
+		(void) dladm_overlay_delete(handle, linkid);
+		return (dladm_errno2status(ret));
+	}
+
+	if ((ret = libvarpd_c_instance_create(vch, linkid, search,
+	    &id)) != 0) {
+		(void) dladm_errlist_append(errs,
+		    "failed to create varpd instance: %s", strerror(ret));
+		libvarpd_c_destroy(vch);
+		(void) dladm_overlay_delete(handle, linkid);
+		return (dladm_errno2status(ret));
+	}
+
+	for (i = 0; props != NULL && i < props->al_count; i++) {
+		dladm_arg_info_t *aip = &props->al_info[i];
+
+		/*
+		 * Skip arguments we've processed already.
+		 */
+		if (strncmp(aip->ai_name, search, slen) != 0)
+			continue;
+
+		if (aip->ai_name[slen] != '/')
+			continue;
+
+		/*
+		 * This hands back a dladm_status_t, not an errno, so it must
+		 * be returned as is rather than translated a second time.
+		 */
+		status = dladm_overlay_varpd_setprop(handle, vch, id,
+		    aip->ai_name, aip->ai_val, aip->ai_count);
+		if (status != DLADM_STATUS_OK) {
+			(void) dladm_errlist_append(errs,
+			    "failed to set varpd prop: %s\n",
+			    aip->ai_name);
+			(void) libvarpd_c_instance_destroy(vch, id);
+			libvarpd_c_destroy(vch);
+			(void) dladm_overlay_delete(handle, linkid);
+			return (status);
+		}
+	}
+
+	if ((ret = libvarpd_c_instance_activate(vch, id)) != 0) {
+		(void) dladm_errlist_append(errs,
+		    "failed to activate varpd instance: %s", strerror(ret));
+		/* Tell the user about required properties that are unset. */
+		(void) dladm_overlay_walk_varpd_prop(handle, linkid, id,
+		    dladm_overlay_activate_cb, errs);
+		(void) libvarpd_c_instance_destroy(vch, id);
+		libvarpd_c_destroy(vch);
+		(void) dladm_overlay_delete(handle, linkid);
+		return (dladm_errno2status(ret));
+
+	}
+
+	bzero(&oia, sizeof (oia));
+	oia.oia_linkid = linkid;
+	status = DLADM_STATUS_OK;
+	ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_ACTIVATE, &oia);
+	if (ret != 0) {
+		ret = errno;
+		(void) dladm_errlist_append(errs, "failed to activate "
+		    "device: %s", strerror(ret));
+		/*
+		 * Report unset required properties while the varpd instance
+		 * still exists, then destroy that instance exactly once.
+		 */
+		(void) dladm_overlay_walk_prop(handle, linkid,
+		    dladm_overlay_activate_cb, errs, errs);
+		(void) libvarpd_c_instance_destroy(vch, id);
+		status = dladm_errno2status(ret);
+	}
+
+	libvarpd_c_destroy(vch);
+	if (status != DLADM_STATUS_OK)
+		(void) dladm_overlay_delete(handle, linkid);
+
+	return (status);
+}
+
+
+
+/*
+ * State that dladm_overlay_walk_cache() passes to its libvarpd cache walker
+ * callback. owc_handle, owc_linkid and owc_arg are forwarded unchanged to the
+ * caller's function, owc_func. owc_dest and owc_mode hold the two values
+ * reported by libvarpd_c_instance_target_mode() for the instance being
+ * walked; they are used to fill in each dladm_overlay_point_t.
+ */
+typedef struct overlay_walk_cb {
+ dladm_handle_t owc_handle;
+ datalink_id_t owc_linkid;
+ void *owc_arg;
+ dladm_overlay_cache_f owc_func;
+ uint_t owc_mode;
+ uint_t owc_dest;
+} overlay_walk_cb_t;
+
+/*
+ * libvarpd cache walker callback. Translate the varpd cache entry into a
+ * dladm_overlay_point_t and pass it on to the function supplied by the
+ * caller of dladm_overlay_walk_cache(). Returns 1 if that function asked for
+ * the walk to be terminated and 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+dladm_overlay_walk_cache_cb(varpd_client_handle_t *chdl, uint64_t varpdid,
+    const struct ether_addr *key, const varpd_client_cache_entry_t *entry,
+    void *arg)
+{
+	overlay_walk_cb_t *state = arg;
+	dladm_overlay_point_t pt;
+	int rv;
+
+	bzero(&pt, sizeof (dladm_overlay_point_t));
+	pt.dop_dest = state->owc_dest;
+	pt.dop_mac = entry->vcp_mac;
+	pt.dop_ip = entry->vcp_ip;
+	pt.dop_port = entry->vcp_port;
+	pt.dop_flags = entry->vcp_flags;
+	if (state->owc_mode == OVERLAY_TARGET_POINT)
+		pt.dop_flags |= DLADM_OVERLAY_F_DEFAULT;
+
+	rv = state->owc_func(state->owc_handle, state->owc_linkid, key, &pt,
+	    state->owc_arg);
+
+	return (rv == DLADM_WALK_TERMINATE ? 1 : 0);
+}
+
+/*
+ * Call func for every entry in the varpd target cache of the overlay device
+ * linkid. Returns the translated error number of the first libvarpd
+ * operation that fails, or of the walk itself.
+ */
+dladm_status_t
+dladm_overlay_walk_cache(dladm_handle_t handle, datalink_id_t linkid,
+    dladm_overlay_cache_f func, void *arg)
+{
+	varpd_client_handle_t *client;
+	overlay_walk_cb_t state;
+	uint64_t inst;
+	uint_t dest, mode;
+	int err;
+
+	err = libvarpd_c_create(&client, dladm_overlay_doorpath);
+	if (err != 0)
+		return (dladm_errno2status(err));
+
+	/* Find the varpd instance for this link and how it is addressed. */
+	err = libvarpd_c_instance_lookup(client, linkid, &inst);
+	if (err == 0) {
+		err = libvarpd_c_instance_target_mode(client, inst, &dest,
+		    &mode);
+	}
+	if (err != 0) {
+		libvarpd_c_destroy(client);
+		return (dladm_errno2status(err));
+	}
+
+	state.owc_handle = handle;
+	state.owc_linkid = linkid;
+	state.owc_func = func;
+	state.owc_arg = arg;
+	state.owc_mode = mode;
+	state.owc_dest = dest;
+
+	err = libvarpd_c_instance_cache_walk(client, inst,
+	    dladm_overlay_walk_cache_cb, &state);
+	libvarpd_c_destroy(client);
+
+	return (dladm_errno2status(err));
+}
+
+/*
+ * Flush the varpd target cache of the overlay device linkid. The libdladm
+ * handle is not used.
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_overlay_cache_flush(dladm_handle_t handle, datalink_id_t linkid)
+{
+	varpd_client_handle_t *client;
+	uint64_t inst;
+	int err;
+
+	err = libvarpd_c_create(&client, dladm_overlay_doorpath);
+	if (err != 0)
+		return (dladm_errno2status(err));
+
+	err = libvarpd_c_instance_lookup(client, linkid, &inst);
+	if (err == 0)
+		err = libvarpd_c_instance_cache_flush(client, inst);
+
+	/* Whether we succeeded or not, we're done with the varpd handle. */
+	libvarpd_c_destroy(client);
+
+	return (dladm_errno2status(err));
+}
+
+/*
+ * Remove the entry for the MAC address key from the varpd target cache of
+ * the overlay device linkid. The libdladm handle is not used.
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_overlay_cache_delete(dladm_handle_t handle, datalink_id_t linkid,
+    const struct ether_addr *key)
+{
+	varpd_client_handle_t *client;
+	uint64_t inst;
+	int err;
+
+	err = libvarpd_c_create(&client, dladm_overlay_doorpath);
+	if (err != 0)
+		return (dladm_errno2status(err));
+
+	err = libvarpd_c_instance_lookup(client, linkid, &inst);
+	if (err == 0)
+		err = libvarpd_c_instance_cache_delete(client, inst, key);
+
+	/* Whether we succeeded or not, we're done with the varpd handle. */
+	libvarpd_c_destroy(client);
+
+	return (dladm_errno2status(err));
+}
+
+/*
+ * Insert or replace the entry for the MAC address key in the varpd target
+ * cache of the overlay device linkid.
+ *
+ *	handle	libdladm handle (unused)
+ *	linkid	datalink id of the overlay device
+ *	key	MAC address that the entry is for
+ *	val	the destination, or "drop" to create an entry that drops
+ *		traffic. Note that val is modified while it is parsed.
+ *
+ * What val has to contain depends on the destination type of the varpd
+ * instance. When it consists of more than one component, the canonical
+ * format is mac,ip:port.
+ *
+ * Returns the status that corresponds to EINVAL when val cannot be parsed,
+ * and otherwise the translated error number of the libvarpd operation.
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_overlay_cache_set(dladm_handle_t handle, datalink_id_t linkid,
+    const struct ether_addr *key, char *val)
+{
+	int ret;
+	uint_t dest;
+	uint64_t varpdid;
+	char *ip, *port = NULL;
+	varpd_client_handle_t *chdl;
+	varpd_client_cache_entry_t vcp;
+
+	if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0)
+		return (dladm_errno2status(ret));
+
+	if ((ret = libvarpd_c_instance_lookup(chdl, linkid, &varpdid)) != 0)
+		goto done;
+
+	if ((ret = libvarpd_c_instance_target_mode(chdl, varpdid,
+	    &dest, NULL)) != 0)
+		goto done;
+
+	/*
+	 * Mode tells us what we should expect in val. If we have more than one
+	 * thing listed, the canonical format of it right now is mac,ip:port.
+	 */
+	bzero(&vcp, sizeof (varpd_client_cache_entry_t));
+
+	if (strcasecmp(val, "drop") == 0) {
+		vcp.vcp_flags = OVERLAY_TARGET_CACHE_DROP;
+		goto send;
+	}
+
+	if (dest & OVERLAY_PLUGIN_D_ETHERNET) {
+		if (ether_aton_r(val, &vcp.vcp_mac) == NULL) {
+			ret = EINVAL;
+			goto done;
+		}
+	}
+
+	if (dest & OVERLAY_PLUGIN_D_IP) {
+		if (dest & OVERLAY_PLUGIN_D_ETHERNET) {
+			if ((ip = strchr(val, ',')) == NULL) {
+				ret = EINVAL;
+				goto done;
+			}
+			ip++;
+		} else {
+			ip = val;
+		}
+
+		if (dest & OVERLAY_PLUGIN_D_PORT) {
+			/*
+			 * The port follows the last colon of the IP portion.
+			 * Searching from the start of val would find the
+			 * colons of the MAC address, and searching for the
+			 * first colon would split an IPv6 address.
+			 */
+			if ((port = strrchr(ip, ':')) == NULL) {
+				ret = EINVAL;
+				goto done;
+			}
+			*port = '\0';
+			port++;
+		}
+
+		/* Try v6, then fall back to v4 */
+		ret = inet_pton(AF_INET6, ip, &vcp.vcp_ip);
+		if (ret == -1)
+			abort();
+		if (ret == 0) {
+			struct in_addr v4;
+
+			ret = inet_pton(AF_INET, ip, &v4);
+			if (ret == -1)
+				abort();
+			if (ret == 0) {
+				/* Neither a v6 nor a v4 address. */
+				ret = EINVAL;
+				goto done;
+			}
+			IN6_INADDR_TO_V4MAPPED(&v4, &vcp.vcp_ip);
+		}
+	}
+
+	if (dest & OVERLAY_PLUGIN_D_PORT) {
+		char *eptr;
+		unsigned long l;
+
+		if (port == NULL) {
+			if (dest & OVERLAY_PLUGIN_D_ETHERNET) {
+				if ((port = strchr(val, ',')) == NULL) {
+					ret = EINVAL;
+					goto done;
+				}
+				/* The port starts after the separator. */
+				port++;
+			} else {
+				port = val;
+			}
+		}
+
+		errno = 0;
+		l = strtoul(port, &eptr, 10);
+		if (errno != 0 || *eptr != '\0') {
+			ret = EINVAL;
+			goto done;
+		}
+		if (l == 0 || l > UINT16_MAX) {
+			ret = EINVAL;
+			goto done;
+		}
+		vcp.vcp_port = l;
+	}
+
+send:
+	ret = libvarpd_c_instance_cache_set(chdl, varpdid, key, &vcp);
+done:
+	libvarpd_c_destroy(chdl);
+	return (dladm_errno2status(ret));
+}
+
+/*
+ * Look up the entry for the MAC address key in the varpd target cache of the
+ * overlay device linkid. On success the entry is stored in point, which is
+ * left untouched otherwise. The libdladm handle is not used.
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_overlay_cache_get(dladm_handle_t handle, datalink_id_t linkid,
+    const struct ether_addr *key, dladm_overlay_point_t *point)
+{
+	varpd_client_handle_t *client;
+	varpd_client_cache_entry_t ent;
+	uint64_t inst;
+	uint_t dest, mode;
+	int err;
+
+	err = libvarpd_c_create(&client, dladm_overlay_doorpath);
+	if (err != 0)
+		return (dladm_errno2status(err));
+
+	err = libvarpd_c_instance_lookup(client, linkid, &inst);
+	if (err == 0) {
+		err = libvarpd_c_instance_target_mode(client, inst, &dest,
+		    &mode);
+	}
+	if (err == 0)
+		err = libvarpd_c_instance_cache_get(client, inst, key, &ent);
+
+	if (err == 0) {
+		point->dop_dest = dest;
+		point->dop_mac = ent.vcp_mac;
+		point->dop_ip = ent.vcp_ip;
+		point->dop_port = ent.vcp_port;
+		point->dop_flags = ent.vcp_flags;
+		if (mode == OVERLAY_TARGET_POINT)
+			point->dop_flags |= DLADM_OVERLAY_F_DEFAULT;
+	}
+
+	libvarpd_c_destroy(client);
+	return (dladm_errno2status(err));
+}
+
+/*
+ * Ask the overlay driver for the status of the device linkid and report it
+ * to func as a dladm_overlay_status_t. func is only called if the status
+ * could be obtained.
+ */
+dladm_status_t
+dladm_overlay_status(dladm_handle_t handle, datalink_id_t linkid,
+    dladm_overlay_status_f func, void *arg)
+{
+	overlay_ioc_status_t ois;
+	dladm_overlay_status_t dos;
+
+	ois.ois_linkid = linkid;
+	if (ioctl(dladm_dld_fd(handle), OVERLAY_IOC_STATUS, &ois) != 0) {
+		dladm_status_t status = dladm_errno2status(errno);
+
+		if (status != DLADM_STATUS_OK)
+			return (status);
+	}
+
+	if (ois.ois_status == OVERLAY_I_DEGRADED)
+		dos.dos_degraded = B_TRUE;
+	else
+		dos.dos_degraded = B_FALSE;
+	(void) strlcpy(dos.dos_fmamsg, ois.ois_message,
+	    sizeof (dos.dos_fmamsg));
+
+	func(handle, linkid, &dos, arg);
+	return (DLADM_STATUS_OK);
+}
diff --git a/usr/src/lib/libdladm/common/libdloverlay.h b/usr/src/lib/libdladm/common/libdloverlay.h
new file mode 100644
index 0000000000..39b01ccae3
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdloverlay.h
@@ -0,0 +1,107 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#ifndef _LIBDLOVERLAY_H
+#define _LIBDLOVERLAY_H
+
+/*
+ * libdladm Overlay device routines
+ */
+
+#include <libdladm.h>
+#include <libdladm_impl.h>
+#include <sys/overlay.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DLADM_OVERLAY_F_DROP 0x0001
+#define DLADM_OVERLAY_F_DEFAULT 0xf000
+
+/*
+ * A destination in the varpd target cache, as handed to
+ * dladm_overlay_cache_f callbacks and returned by dladm_overlay_cache_get().
+ * dop_dest is the destination value that varpd reports for the instance.
+ * dop_mac, dop_ip, dop_port and dop_flags are copied from the varpd cache
+ * entry, and DLADM_OVERLAY_F_DEFAULT is additionally set in dop_flags when
+ * the instance's target mode is OVERLAY_TARGET_POINT.
+ */
+typedef struct dladm_overlay_point {
+ uint_t dop_dest;
+ struct ether_addr dop_mac;
+ uint16_t dop_flags;
+ struct in6_addr dop_ip;
+ uint16_t dop_port;
+} dladm_overlay_point_t;
+
+/*
+ * The status of an overlay device as passed to a dladm_overlay_status_f
+ * callback. dos_degraded is B_TRUE when the overlay driver reports the
+ * device as degraded; dos_fmamsg is a copy of the message that the driver
+ * supplied along with the status.
+ */
+typedef struct dladm_overlay_status {
+ boolean_t dos_degraded;
+ char dos_fmamsg[256];
+} dladm_overlay_status_t;
+
+extern dladm_status_t dladm_overlay_create(dladm_handle_t, const char *,
+ const char *, const char *, uint64_t, dladm_arg_list_t *, dladm_errlist_t *,
+ uint32_t);
+extern dladm_status_t dladm_overlay_delete(dladm_handle_t, datalink_id_t);
+
+typedef void (*dladm_overlay_status_f)(dladm_handle_t, datalink_id_t,
+ dladm_overlay_status_t *, void *);
+extern dladm_status_t dladm_overlay_status(dladm_handle_t, datalink_id_t,
+ dladm_overlay_status_f, void *);
+
+extern dladm_status_t dladm_overlay_cache_flush(dladm_handle_t, datalink_id_t);
+extern dladm_status_t dladm_overlay_cache_delete(dladm_handle_t, datalink_id_t,
+ const struct ether_addr *);
+extern dladm_status_t dladm_overlay_cache_set(dladm_handle_t, datalink_id_t,
+ const struct ether_addr *, char *);
+extern dladm_status_t dladm_overlay_cache_get(dladm_handle_t, datalink_id_t,
+ const struct ether_addr *, dladm_overlay_point_t *);
+
+#define DLADM_OVERLAY_PROP_SIZEMAX 256
+#define DLADM_OVERLAY_PROP_NAMELEN 32
+
+typedef struct __dladm_overlay_propinfo *dladm_overlay_propinfo_handle_t;
+
+extern dladm_status_t dladm_overlay_prop_info(dladm_overlay_propinfo_handle_t,
+ const char **, uint_t *, uint_t *, const void **, uint32_t *,
+ const mac_propval_range_t **);
+extern dladm_status_t dladm_overlay_get_prop(dladm_handle_t, datalink_id_t,
+ dladm_overlay_propinfo_handle_t, void *buf, size_t *bufsize);
+
+typedef int (*dladm_overlay_prop_f)(dladm_handle_t, datalink_id_t,
+ dladm_overlay_propinfo_handle_t, void *);
+extern dladm_status_t dladm_overlay_walk_prop(dladm_handle_t, datalink_id_t,
+ dladm_overlay_prop_f, void *arg, dladm_errlist_t *);
+
+typedef int (*dladm_overlay_cache_f)(dladm_handle_t, datalink_id_t,
+ const struct ether_addr *, const dladm_overlay_point_t *, void *);
+extern dladm_status_t dladm_overlay_walk_cache(dladm_handle_t, datalink_id_t,
+ dladm_overlay_cache_f, void *);
+
+/*
+ * Some day we'll want to support being able to set properties after creation.
+ * If we do, the following strawman API might serve us well.
+ *
+ * extern dladm_status_t dladm_overlay_prop_lookup(dladm_handle_t,
+ * datalink_id_t, const char *, dladm_overlay_propinfo_handle_t *);
+ * extern void dladm_overlay_prop_handle_free(dladm_handle_t, datalink_id_t,
+ * dladm_overlay_propinfo_handle_t *);
+ * extern dladm_status_t dladm_overlay_set_prop(dladm_handle_t, datalink_id_t,
+ * dladm_propinfo_handle_t, void *buf, size_t *bufsize);
+ * extern dladm_status_t dladm_overlay_str_to_buf(dladm_handle_t, datalink_id_t,
+ * dladm_overlay_propinfo_handle_t *, const char *, void *, size_t *);
+ * extern dladm_status_t dladm_overlay_buf_to_str(dladm_handle_t, datalink_id_t,
+ * dladm_overlay_propinfo_handle_t *, const void *, const size_t, char *,
+ * size_t *);
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBDLOVERLAY_H */
diff --git a/usr/src/lib/libdladm/common/libdlvlan.c b/usr/src/lib/libdladm/common/libdlvlan.c
index 943728dc03..34c1e6682d 100644
--- a/usr/src/lib/libdladm/common/libdlvlan.c
+++ b/usr/src/lib/libdladm/common/libdlvlan.c
@@ -64,7 +64,7 @@ dladm_vlan_create(dladm_handle_t handle, const char *vlan, datalink_id_t linkid,
{
return (dladm_vnic_create(handle, vlan, linkid,
VNIC_MAC_ADDR_TYPE_PRIMARY, NULL, 0, NULL, 0, vid, VRRP_VRID_NONE,
- AF_UNSPEC, vlan_id_out, proplist, flags | DLADM_OPT_VLAN));
+ AF_UNSPEC, vlan_id_out, proplist, NULL, flags | DLADM_OPT_VLAN));
}
/*
diff --git a/usr/src/lib/libdladm/common/libdlvnic.c b/usr/src/lib/libdladm/common/libdlvnic.c
index bad25e69ed..73c001b744 100644
--- a/usr/src/lib/libdladm/common/libdlvnic.c
+++ b/usr/src/lib/libdladm/common/libdlvnic.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent Inc.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -400,7 +401,7 @@ dladm_vnic_create(dladm_handle_t handle, const char *vnic, datalink_id_t linkid,
vnic_mac_addr_type_t mac_addr_type, uchar_t *mac_addr, uint_t mac_len,
int *mac_slot, uint_t mac_prefix_len, uint16_t vid, vrid_t vrid,
int af, datalink_id_t *vnic_id_out, dladm_arg_list_t *proplist,
- uint32_t flags)
+ dladm_errlist_t *errs, uint32_t flags)
{
dladm_vnic_attr_t attr;
datalink_id_t vnic_id;
@@ -567,8 +568,14 @@ dladm_vnic_create(dladm_handle_t handle, const char *vnic, datalink_id_t linkid,
status = dladm_set_linkprop(handle, vnic_id,
aip->ai_name, aip->ai_val, aip->ai_count,
DLADM_OPT_PERSIST);
- if (status != DLADM_STATUS_OK)
+ if (status != DLADM_STATUS_OK) {
+ char errmsg[DLADM_STRSIZE];
+ (void) dladm_errlist_append(errs,
+ "failed to set property %s: %s",
+ aip->ai_name,
+ dladm_status2str(status, errmsg));
break;
+ }
}
}
diff --git a/usr/src/lib/libdladm/common/libdlvnic.h b/usr/src/lib/libdladm/common/libdlvnic.h
index 94b656aadf..839b2de9f2 100644
--- a/usr/src/lib/libdladm/common/libdlvnic.h
+++ b/usr/src/lib/libdladm/common/libdlvnic.h
@@ -55,7 +55,8 @@ typedef struct dladm_vnic_attr {
extern dladm_status_t dladm_vnic_create(dladm_handle_t, const char *,
datalink_id_t, vnic_mac_addr_type_t, uchar_t *,
uint_t, int *, uint_t, uint16_t, vrid_t, int,
- datalink_id_t *, dladm_arg_list_t *, uint32_t);
+ datalink_id_t *, dladm_arg_list_t *,
+ dladm_errlist_t *, uint32_t);
extern dladm_status_t dladm_vnic_delete(dladm_handle_t, datalink_id_t,
uint32_t);
diff --git a/usr/src/lib/libdladm/common/mapfile-vers b/usr/src/lib/libdladm/common/mapfile-vers
index 63a86529fc..eba6118ace 100644
--- a/usr/src/lib/libdladm/common/mapfile-vers
+++ b/usr/src/lib/libdladm/common/mapfile-vers
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright 2015 Joyent, Inc.
#
#
@@ -269,6 +270,23 @@ SYMBOL_VERSION SUNWprivate_1.1 {
dladm_strs2range;
dladm_range2list;
dladm_list2range;
+
+ dladm_errlist_init;
+ dladm_errlist_reset;
+ dladm_errlist_append;
+
+ dladm_overlay_create;
+ dladm_overlay_delete;
+ dladm_overlay_status;
+ dladm_overlay_prop_info;
+ dladm_overlay_get_prop;
+ dladm_overlay_walk_prop;
+
+ dladm_overlay_cache_set;
+ dladm_overlay_cache_get;
+ dladm_overlay_cache_delete;
+ dladm_overlay_cache_flush;
+ dladm_overlay_walk_cache;
local:
*;
};
diff --git a/usr/src/lib/varpd/Makefile b/usr/src/lib/varpd/Makefile
new file mode 100644
index 0000000000..0962119d1c
--- /dev/null
+++ b/usr/src/lib/varpd/Makefile
@@ -0,0 +1,33 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+# Subdirectories that this Makefile descends into.
+SUBDIRS = libvarpd direct files
+
+# Each top-level target sets TARGET to its own name; the $(SUBDIRS) rule below
+# hands $(TARGET) to the make that runs in each subdirectory.
+all := TARGET = all
+clean := TARGET = clean
+clobber := TARGET = clobber
+check := TARGET = check
+install := TARGET = install
+install_h := TARGET = install_h
+
+.KEEP_STATE:
+
+# Every top-level target depends on all of the subdirectories.
+all clean clobber install install_h check: $(SUBDIRS)
+# The plugin directories depend on libvarpd, so it is made first.
+# NOTE(review): svp is named here but is not listed in SUBDIRS above --
+# confirm whether that is intentional.
+direct files svp: libvarpd
+
+# Change into the subdirectory, print its path and run make for $(TARGET).
+# FRC has no prerequisites and no commands; presumably it exists to force this
+# rule to run on every invocation.
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
diff --git a/usr/src/lib/varpd/Makefile.plugin b/usr/src/lib/varpd/Makefile.plugin
new file mode 100644
index 0000000000..48f188500c
--- /dev/null
+++ b/usr/src/lib/varpd/Makefile.plugin
@@ -0,0 +1,19 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Directories under $(ROOT) used for the varpd plugins; the 64-bit variant
+# lives in a $(MACH64) subdirectory of the 32-bit one.
+ROOTLIBDIR = $(ROOT)/usr/lib/varpd
+ROOTLIBDIR64 = $(ROOT)/usr/lib/varpd/$(MACH64)
+
+# Append the mapfile shared by the plugins. The path is relative, so it is
+# presumably resolved from a plugin's per-architecture build directory.
+MAPFILES += ../../libvarpd/common/mapfile-plugin
diff --git a/usr/src/lib/varpd/direct/Makefile b/usr/src/lib/varpd/direct/Makefile
new file mode 100644
index 0000000000..511ea1f94d
--- /dev/null
+++ b/usr/src/lib/varpd/direct/Makefile
@@ -0,0 +1,39 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Common library build definitions.
+include ../../Makefile.lib
+
+# $(MACH) is always built. $(MACH64) is appended on a line prefixed with
+# $(BUILD64); presumably that macro expands to nothing when 64-bit builds are
+# enabled and to a comment character otherwise -- TODO confirm.
+SUBDIRS = $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+# Each target sets TARGET to its own name for use by the $(SUBDIRS) rule.
+all := TARGET = all
+clean := TARGET = clean
+clobber := TARGET = clobber
+install := TARGET = install
+
+.KEEP_STATE:
+
+all clean clobber install: $(SUBDIRS)
+
+# install_h and check have no prerequisites or commands in this directory.
+install_h:
+
+check:
+
+# Change into the architecture directory, print its path and run make for
+# $(TARGET) there.
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../../Makefile.targ
diff --git a/usr/src/lib/varpd/direct/Makefile.com b/usr/src/lib/varpd/direct/Makefile.com
new file mode 100644
index 0000000000..4e8564bae0
--- /dev/null
+++ b/usr/src/lib/varpd/direct/Makefile.com
@@ -0,0 +1,35 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Library name, version suffix and the single object that makes up the
+# direct plugin.
+LIBRARY = libvarpd_direct.a
+VERS = .1
+OBJECTS = libvarpd_direct.o
+
+# Generic library rules first, then the varpd plugin specific settings.
+include ../../../Makefile.lib
+include ../../Makefile.plugin
+
+# Only $(DYNLIB) is listed in LIBS, so "all" below builds just that.
+LIBS = $(DYNLIB)
+LDLIBS += -lc -lumem -lnvpair
+CPPFLAGS += -I../common
+
+CSTD= $(CSTD_GNU99)
+
+# The sources live in the common directory next to the per-architecture
+# build directories.
+SRCDIR = ../common
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+include ../../../Makefile.targ
diff --git a/usr/src/lib/varpd/direct/amd64/Makefile b/usr/src/lib/varpd/direct/amd64/Makefile
new file mode 100644
index 0000000000..1881990d79
--- /dev/null
+++ b/usr/src/lib/varpd/direct/amd64/Makefile
@@ -0,0 +1,19 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Shared plugin definitions followed by the 64-bit library overrides.
+include ../Makefile.com
+include ../../../Makefile.lib.64
+
+# Build everything, then make the 64-bit install targets.
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/varpd/direct/common/libvarpd_direct.c b/usr/src/lib/varpd/direct/common/libvarpd_direct.c
new file mode 100644
index 0000000000..ed9f79fc7f
--- /dev/null
+++ b/usr/src/lib/varpd/direct/common/libvarpd_direct.c
@@ -0,0 +1,411 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Point to point plug-in for varpd.
+ *
+ * This plugin implements a simple point to point tunnel: every packet is sent
+ * to one fixed destination. It represents the traditional tunnel, just in
+ * overlay form. As such, the only properties it needs are those that determine
+ * where to send everything. At this time, we don't allow a multicast address;
+ * however, there's no reason that the direct plugin couldn't support multicast
+ * in theory, and the best way to do so should become clear when it is
+ * implemented.
+ *
+ * In general this module has been designed to make it easy to support a
+ * destination of either an IP address alone or an IP address and a port;
+ * however, for now we restrict it to an IP address and a port, as we don't
+ * currently have an implementation that would allow us to test the IP-only
+ * case.
+ */
+
+#include <libvarpd_provider.h>
+#include <umem.h>
+#include <errno.h>
+#include <thread.h>
+#include <synch.h>
+#include <strings.h>
+#include <assert.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <libnvpair.h>
+
+/*
+ * Per-instance state of the direct plugin. vad_hip and vad_hport record
+ * whether vad_ip and vad_port have been given a value; they are set to B_TRUE
+ * by varpd_direct_setprop() when the matching property is written.
+ */
+typedef struct varpd_direct {
+ overlay_plugin_dest_t vad_dest; /* RO */
+ mutex_t vad_lock; /* Protects the rest */
+ boolean_t vad_hip; /* vad_ip has been set */
+ boolean_t vad_hport; /* vad_port has been set */
+ struct in6_addr vad_ip; /* destination address */
+ uint16_t vad_port; /* destination port */
+} varpd_direct_t;
+
+/*
+ * Property names. Index 0 is the destination IP and index 1 the destination
+ * port; the rest of this file refers to them by index, both when matching
+ * property names and as the nvlist keys used by save and restore.
+ */
+static const char *varpd_direct_props[] = {
+ "direct/dest_ip",
+ "direct/dest_port"
+};
+
+/*
+ * Decide whether this plugin can serve the requested destination type.
+ *
+ * Returns B_TRUE when dest consists of an IP address, optionally combined
+ * with a port, and B_FALSE for anything else.
+ */
+static boolean_t
+varpd_direct_valid_dest(overlay_plugin_dest_t dest)
+{
+	/* Nothing other than an IP address and a port is understood. */
+	if (dest & ~(OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+		return (B_FALSE);
+
+	/*
+	 * An IP address is mandatory. A port without an IP address is not
+	 * supported and must be rejected here, otherwise it would only be
+	 * caught later by the assertion in varpd_direct_propinfo().
+	 */
+	if (!(dest & OVERLAY_PLUGIN_D_IP))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Create a new, unconfigured direct instance for the given destination type
+ * and hand it back through outp.
+ *
+ * Returns 0 on success, ENOTSUP if the destination type is not supported,
+ * ENOMEM if the allocation fails, or the error from mutex_init().
+ */
+/* ARGSUSED */
+static int
+varpd_direct_create(varpd_provider_handle_t *hdl, void **outp,
+    overlay_plugin_dest_t dest)
+{
+	varpd_direct_t *inst;
+	int err;
+
+	if (!varpd_direct_valid_dest(dest))
+		return (ENOTSUP);
+
+	if ((inst = umem_alloc(sizeof (*inst), UMEM_DEFAULT)) == NULL)
+		return (ENOMEM);
+
+	err = mutex_init(&inst->vad_lock, USYNC_THREAD | LOCK_ERRORCHECK,
+	    NULL);
+	if (err != 0) {
+		umem_free(inst, sizeof (*inst));
+		return (err);
+	}
+
+	/* Neither the address nor the port has been configured yet. */
+	inst->vad_hport = B_FALSE;
+	inst->vad_hip = B_FALSE;
+	inst->vad_dest = dest;
+
+	*outp = inst;
+	return (0);
+}
+
+/*
+ * Check that the instance has everything it needs before it is started.
+ *
+ * Returns 0 when the destination is fully configured and EAGAIN when a
+ * required property is still missing. The IP address is always required; the
+ * port is only required when the destination type includes a port.
+ */
+static int
+varpd_direct_start(void *arg)
+{
+	varpd_direct_t *vdp = arg;
+	boolean_t ready;
+
+	mutex_enter(&vdp->vad_lock);
+	ready = vdp->vad_hip;
+	/*
+	 * The port requirement has to be keyed off OVERLAY_PLUGIN_D_PORT.
+	 * Keying it off OVERLAY_PLUGIN_D_IP would demand a port from an
+	 * IP-only instance, which never exposes a port property and so could
+	 * never be started.
+	 */
+	if ((vdp->vad_dest & OVERLAY_PLUGIN_D_PORT) &&
+	    vdp->vad_hport == B_FALSE)
+		ready = B_FALSE;
+	mutex_exit(&vdp->vad_lock);
+
+	return (ready == B_TRUE ? 0 : EAGAIN);
+}
+
+/*
+ * Stop entry point. The body is intentionally empty: varpd_direct_start()
+ * only checks configuration and acquires nothing that would need to be
+ * released here.
+ */
+/* ARGSUSED */
+static void
+varpd_direct_stop(void *arg)
+{
+}
+
+/*
+ * Tear down an instance: destroy its lock and release its memory. A failure
+ * to destroy the lock is treated as fatal.
+ */
+static void
+varpd_direct_destroy(void *arg)
+{
+	varpd_direct_t *inst = arg;
+	int err;
+
+	err = mutex_destroy(&inst->vad_lock);
+	if (err != 0)
+		abort();
+
+	umem_free(inst, sizeof (*inst));
+}
+
+/*
+ * Fill otp with the configured destination address and port. The copy is
+ * made under vad_lock so that the pair is read consistently. Always returns
+ * VARPD_LOOKUP_OK.
+ */
+static int
+varpd_direct_default(void *arg, overlay_target_point_t *otp)
+{
+	varpd_direct_t *inst = arg;
+
+	mutex_enter(&inst->vad_lock);
+	bcopy(&inst->vad_ip, &otp->otp_ip, sizeof (struct in6_addr));
+	otp->otp_port = inst->vad_port;
+	mutex_exit(&inst->vad_lock);
+
+	return (VARPD_LOOKUP_OK);
+}
+
+/*
+ * Report how many properties this instance exposes: one for each of the
+ * Ethernet, IP and port bits present in the destination type. The total is
+ * asserted to be either one or two. Always returns 0.
+ */
+static int
+varpd_direct_nprops(void *arg, uint_t *nprops)
+{
+	const varpd_direct_t *inst = arg;
+	const overlay_plugin_dest_t dest = inst->vad_dest;
+	uint_t count = 0;
+
+	if (dest & OVERLAY_PLUGIN_D_ETHERNET)
+		count++;
+	if (dest & OVERLAY_PLUGIN_D_IP)
+		count++;
+	if (dest & OVERLAY_PLUGIN_D_PORT)
+		count++;
+
+	*nprops = count;
+	assert(*nprops == 1 || *nprops == 2);
+
+	return (0);
+}
+
+/*
+ * Describe the property with the given index through vph.
+ *
+ * Index 0 is always the destination IP. Index 1 is the destination port and
+ * only exists when the destination type includes a port. Returns 0 on
+ * success and EINVAL for any other index.
+ */
+static int
+varpd_direct_propinfo(void *arg, uint_t propid, varpd_prop_handle_t *vph)
+{
+	varpd_direct_t *inst = arg;
+
+	/*
+	 * Only IP + port combinations are supported right now, so an IP must
+	 * always be part of the destination; a port alone is not supported.
+	 */
+	assert(inst->vad_dest & OVERLAY_PLUGIN_D_IP);
+
+	switch (propid) {
+	case 0:
+		libvarpd_prop_set_name(vph, varpd_direct_props[0]);
+		libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW);
+		libvarpd_prop_set_type(vph, OVERLAY_PROP_T_IP);
+		libvarpd_prop_set_nodefault(vph);
+		return (0);
+	case 1:
+		if (!(inst->vad_dest & OVERLAY_PLUGIN_D_PORT))
+			break;
+		libvarpd_prop_set_name(vph, varpd_direct_props[1]);
+		libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW);
+		libvarpd_prop_set_type(vph, OVERLAY_PROP_T_UINT);
+		libvarpd_prop_set_nodefault(vph);
+		libvarpd_prop_set_range_uint32(vph, 1, UINT16_MAX);
+		return (0);
+	default:
+		break;
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Read a property into buf.
+ *
+ * On entry *sizep is the size of buf; on return it is the number of bytes
+ * written, or 0 when the property has not been set. The IP is returned as a
+ * struct in6_addr and the port as a uint64_t. Returns EOVERFLOW if buf is too
+ * small and EINVAL for an unknown property name.
+ */
+static int
+varpd_direct_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep)
+{
+	varpd_direct_t *inst = arg;
+
+	/* direct/dest_ip */
+	if (strcmp(pname, varpd_direct_props[0]) == 0) {
+		const uint32_t need = sizeof (struct in6_addr);
+
+		if (*sizep < need)
+			return (EOVERFLOW);
+
+		mutex_enter(&inst->vad_lock);
+		if (inst->vad_hip != B_FALSE) {
+			bcopy(&inst->vad_ip, buf, need);
+			*sizep = need;
+		} else {
+			*sizep = 0;
+		}
+		mutex_exit(&inst->vad_lock);
+		return (0);
+	}
+
+	/* direct/dest_port */
+	if (strcmp(pname, varpd_direct_props[1]) == 0) {
+		const uint32_t need = sizeof (uint64_t);
+		uint64_t port;
+
+		if (*sizep < need)
+			return (EOVERFLOW);
+
+		mutex_enter(&inst->vad_lock);
+		if (inst->vad_hport != B_FALSE) {
+			port = inst->vad_port;
+			bcopy(&port, buf, need);
+			*sizep = need;
+		} else {
+			*sizep = 0;
+		}
+		mutex_exit(&inst->vad_lock);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Set a property from buf, which holds size bytes.
+ *
+ * The IP is taken as a struct in6_addr; IPv4-compatible and 6to4 addresses
+ * are refused with EINVAL. The port is taken as a uint64_t and must lie in
+ * the range 1 to UINT16_MAX. Returns EOVERFLOW when buf is smaller than the
+ * value's type and EINVAL for an unknown property name or a bad value.
+ */
+static int
+varpd_direct_setprop(void *arg, const char *pname, const void *buf,
+    const uint32_t size)
+{
+	varpd_direct_t *inst = arg;
+
+	/* direct/dest_ip */
+	if (strcmp(pname, varpd_direct_props[0]) == 0) {
+		const struct in6_addr *addr = buf;
+
+		if (size < sizeof (struct in6_addr))
+			return (EOVERFLOW);
+
+		if (IN6_IS_ADDR_V4COMPAT(addr))
+			return (EINVAL);
+		if (IN6_IS_ADDR_6TO4(addr))
+			return (EINVAL);
+
+		mutex_enter(&inst->vad_lock);
+		bcopy(buf, &inst->vad_ip, sizeof (struct in6_addr));
+		inst->vad_hip = B_TRUE;
+		mutex_exit(&inst->vad_lock);
+		return (0);
+	}
+
+	/* direct/dest_port */
+	if (strcmp(pname, varpd_direct_props[1]) == 0) {
+		const uint64_t *portp = buf;
+
+		if (size < sizeof (uint64_t))
+			return (EOVERFLOW);
+
+		if (*portp == 0 || *portp > UINT16_MAX)
+			return (EINVAL);
+
+		mutex_enter(&inst->vad_lock);
+		inst->vad_port = (uint16_t)*portp;
+		inst->vad_hport = B_TRUE;
+		mutex_exit(&inst->vad_lock);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Persist the instance into nvp. The port is stored as a uint16 and the
+ * address as its textual IPv6 form; a value that has not been set is simply
+ * left out. Returns 0 on success or the error from the nvlist routines.
+ */
+static int
+varpd_direct_save(void *arg, nvlist_t *nvp)
+{
+	varpd_direct_t *inst = arg;
+	int err = 0;
+
+	mutex_enter(&inst->vad_lock);
+
+	if (inst->vad_hport == B_TRUE) {
+		err = nvlist_add_uint16(nvp, varpd_direct_props[1],
+		    inst->vad_port);
+	}
+
+	if (err == 0 && inst->vad_hip == B_TRUE) {
+		char addrstr[INET6_ADDRSTRLEN];
+
+		/* Formatting an in6_addr into a large enough buffer. */
+		if (inet_ntop(AF_INET6, &inst->vad_ip, addrstr,
+		    sizeof (addrstr)) == NULL)
+			abort();
+
+		err = nvlist_add_string(nvp, varpd_direct_props[0], addrstr);
+	}
+
+	mutex_exit(&inst->vad_lock);
+
+	return (err);
+}
+
+/*
+ * Recreate an instance from the state written by varpd_direct_save().
+ *
+ * nvp may contain the port (uint16) and the address (string); either may be
+ * absent, in which case the corresponding value is left unset. On success the
+ * new instance is returned through outp.
+ *
+ * Returns 0 on success, ENOTSUP for an unsupported destination type, ENOMEM
+ * if the allocation fails, EINVAL if the stored address cannot be parsed, or
+ * the error from mutex_init() or the nvlist lookups.
+ */
+/* ARGSUSED */
+static int
+varpd_direct_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
+    overlay_plugin_dest_t dest, void **outp)
+{
+	int ret;
+	char *ipstr;
+	varpd_direct_t *vdp;
+
+	if (varpd_direct_valid_dest(dest) == B_FALSE)
+		return (ENOTSUP);
+
+	vdp = umem_alloc(sizeof (varpd_direct_t), UMEM_DEFAULT);
+	if (vdp == NULL)
+		return (ENOMEM);
+
+	if ((ret = mutex_init(&vdp->vad_lock, USYNC_THREAD | LOCK_ERRORCHECK,
+	    NULL)) != 0) {
+		umem_free(vdp, sizeof (varpd_direct_t));
+		return (ret);
+	}
+
+	/*
+	 * umem_alloc() does not zero the memory it returns, so every field
+	 * that is read later has to be given a value here. In particular the
+	 * destination type must be recorded, just as varpd_direct_create()
+	 * does.
+	 */
+	vdp->vad_dest = dest;
+	vdp->vad_hip = B_FALSE;
+	vdp->vad_hport = B_FALSE;
+
+	ret = nvlist_lookup_uint16(nvp, varpd_direct_props[1], &vdp->vad_port);
+	if (ret == 0) {
+		vdp->vad_hport = B_TRUE;
+	} else if (ret != ENOENT) {
+		goto fail;
+	}
+
+	ret = nvlist_lookup_string(nvp, varpd_direct_props[0], &ipstr);
+	if (ret == 0) {
+		ret = inet_pton(AF_INET6, ipstr, &vdp->vad_ip);
+		/*
+		 * inet_pton is only defined to return -1 with errno set to
+		 * EAFNOSUPPORT, which really, shouldn't happen.
+		 */
+		if (ret == -1) {
+			assert(errno == EAFNOSUPPORT);
+			abort();
+		}
+		if (ret == 0) {
+			ret = EINVAL;
+			goto fail;
+		}
+		/* The address parsed; mark it as present. */
+		vdp->vad_hip = B_TRUE;
+	} else if (ret != ENOENT) {
+		goto fail;
+	}
+
+	*outp = vdp;
+	return (0);
+
+fail:
+	if (mutex_destroy(&vdp->vad_lock) != 0)
+		abort();
+	umem_free(vdp, sizeof (varpd_direct_t));
+	return (ret);
+}
+
+/*
+ * Operations vector registered by varpd_direct_init(). The initializer is
+ * positional, so the order of entries matters. NOTE(review): what each slot
+ * means is determined by the definition of varpd_plugin_ops_t, which is not
+ * part of this file; the leading 0 and the NULL entry should be checked
+ * against it.
+ */
+static const varpd_plugin_ops_t varpd_direct_ops = {
+ 0,
+ varpd_direct_create,
+ varpd_direct_start,
+ varpd_direct_stop,
+ varpd_direct_destroy,
+ varpd_direct_default,
+ NULL,
+ varpd_direct_nprops,
+ varpd_direct_propinfo,
+ varpd_direct_getprop,
+ varpd_direct_setprop,
+ varpd_direct_save,
+ varpd_direct_restore
+};
+
+/*
+ * Initialization routine, named in the init pragma below. It registers the
+ * "direct" plugin with libvarpd. If the registration structure cannot be
+ * allocated the plugin is silently left unregistered; the result of the
+ * registration call itself is ignored.
+ */
+#pragma init(varpd_direct_init)
+static void
+varpd_direct_init(void)
+{
+	varpd_plugin_register_t *reg;
+	int err;
+
+	if ((reg = libvarpd_plugin_alloc(VARPD_CURRENT_VERSION, &err)) == NULL)
+		return;
+
+	reg->vpr_ops = &varpd_direct_ops;
+	reg->vpr_name = "direct";
+	reg->vpr_mode = OVERLAY_TARGET_POINT;
+
+	(void) libvarpd_plugin_register(reg);
+	libvarpd_plugin_free(reg);
+}
diff --git a/usr/src/lib/varpd/direct/common/mapfile-vers b/usr/src/lib/varpd/direct/common/mapfile-vers
new file mode 100644
index 0000000000..6b7c5a5067
--- /dev/null
+++ b/usr/src/lib/varpd/direct/common/mapfile-vers
@@ -0,0 +1,35 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+#
+# Every symbol is reduced to local scope; this mapfile exports nothing.
+#
+SYMBOL_VERSION SUNWprivate {
+ local:
+ *;
+};
diff --git a/usr/src/lib/varpd/direct/i386/Makefile b/usr/src/lib/varpd/direct/i386/Makefile
new file mode 100644
index 0000000000..4398507523
--- /dev/null
+++ b/usr/src/lib/varpd/direct/i386/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Shared plugin definitions; nothing extra is included for the 32-bit build.
+include ../Makefile.com
+
+# Build everything, then make the 32-bit install targets.
+install: all $(ROOTLIBS) $(ROOTLINKS)
diff --git a/usr/src/lib/varpd/files/Makefile b/usr/src/lib/varpd/files/Makefile
new file mode 100644
index 0000000000..511ea1f94d
--- /dev/null
+++ b/usr/src/lib/varpd/files/Makefile
@@ -0,0 +1,39 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Common library build definitions.
+include ../../Makefile.lib
+
+# $(MACH) is always built. $(MACH64) is appended on a line prefixed with
+# $(BUILD64); presumably that macro expands to nothing when 64-bit builds are
+# enabled and to a comment character otherwise -- TODO confirm.
+SUBDIRS = $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+# Each target sets TARGET to its own name for use by the $(SUBDIRS) rule.
+all := TARGET = all
+clean := TARGET = clean
+clobber := TARGET = clobber
+install := TARGET = install
+
+.KEEP_STATE:
+
+all clean clobber install: $(SUBDIRS)
+
+# install_h and check have no prerequisites or commands in this directory.
+install_h:
+
+check:
+
+# Change into the architecture directory, print its path and run make for
+# $(TARGET) there.
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../../Makefile.targ
diff --git a/usr/src/lib/varpd/files/Makefile.com b/usr/src/lib/varpd/files/Makefile.com
new file mode 100644
index 0000000000..13ff2149ce
--- /dev/null
+++ b/usr/src/lib/varpd/files/Makefile.com
@@ -0,0 +1,36 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+# Library name, version suffix and the objects that make up the files
+# plugin: the plugin itself and its JSON parser.
+LIBRARY = libvarpd_files.a
+VERS = .1
+OBJECTS = libvarpd_files.o \
+ libvarpd_files_json.o
+
+# Generic library rules first, then the varpd plugin specific settings.
+include ../../../Makefile.lib
+include ../../Makefile.plugin
+
+# Only $(DYNLIB) is listed in LIBS, so "all" below builds just that.
+LIBS = $(DYNLIB)
+LDLIBS += -lc -lumem -lnvpair -lsocket -lcustr
+CPPFLAGS += -I../common
+
+CSTD= $(CSTD_GNU99)
+
+# The sources live in the common directory next to the per-architecture
+# build directories.
+SRCDIR = ../common
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+include ../../../Makefile.targ
diff --git a/usr/src/lib/varpd/files/amd64/Makefile b/usr/src/lib/varpd/files/amd64/Makefile
new file mode 100644
index 0000000000..1881990d79
--- /dev/null
+++ b/usr/src/lib/varpd/files/amd64/Makefile
@@ -0,0 +1,19 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Shared plugin definitions followed by the 64-bit library overrides.
+include ../Makefile.com
+include ../../../Makefile.lib.64
+
+# Build everything, then make the 64-bit install targets.
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/varpd/files/common/libvarpd_files.c b/usr/src/lib/varpd/files/common/libvarpd_files.c
new file mode 100644
index 0000000000..84cb27f9e8
--- /dev/null
+++ b/usr/src/lib/varpd/files/common/libvarpd_files.c
@@ -0,0 +1,605 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+/*
+ * Files based plug-in for varpd
+ *
+ * This is a dynamic varpd plug-in that has a static backing store. It's really
+ * nothing more than a glorified version of /etc/ethers, though it facilitates
+ * a bit more. The files module allows for the full set of mappings to be fixed
+ * at creation time. In addition, it also provides support for proxying ARP,
+ * NDP, and DHCP.
+ *
+ * At this time, the plugin requires that the destination type involve both an
+ * IP address and a port; however, there's no reason that this cannot be made
+ * more flexible as we have additional encapsulation algorithms that support it.
+ * The plug-in only has a single property, which is the location of the JSON
+ * file. The JSON file itself looks something like:
+ *
+ * {
+ * "aa:bb:cc:dd:ee:ff": {
+ * "arp": "10.23.69.1",
+ * "ndp": "2600:3c00::f03c:91ff:fe96:a264",
+ * "ip": "192.168.1.1",
+ * "port": 8080
+ * },
+ * ...
+ * }
+ */
+
+#include <libvarpd_provider.h>
+#include <umem.h>
+#include <errno.h>
+#include <thread.h>
+#include <synch.h>
+#include <strings.h>
+#include <assert.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/ethernet.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <libvarpd_files_json.h>
+
+/*
+ * Per-instance state of the files plugin. vaf_path is the configured
+ * location of the JSON file and vaf_nvl is the table built from it by
+ * varpd_files_start() and released by varpd_files_stop().
+ */
+typedef struct varpd_files {
+ overlay_plugin_dest_t vaf_dest; /* RO */
+ varpd_provider_handle_t *vaf_hdl; /* RO */
+ char *vaf_path; /* WO */
+ nvlist_t *vaf_nvl; /* WO */
+ uint64_t vaf_nmisses; /* Atomic */
+ uint64_t vaf_narp; /* Atomic */
+} varpd_files_t;
+
+/*
+ * Property names. The plugin has a single property, the path of the JSON
+ * file; it is also the nvlist key under which the path is saved.
+ */
+static const char *varpd_files_props[] = {
+ "files/config"
+};
+
+/*
+ * Decide whether this plugin can serve the requested destination type.
+ * Returns B_TRUE when dest contains an IP address, a port or both, and
+ * nothing else; returns B_FALSE otherwise.
+ */
+static boolean_t
+varpd_files_valid_dest(overlay_plugin_dest_t dest)
+{
+	/* Anything outside of IP and port is not understood. */
+	if ((dest & ~(OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) != 0)
+		return (B_FALSE);
+
+	/* At least one of the two has to be requested. */
+	if ((dest & (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) == 0)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Create a new files instance with no path configured and no table loaded,
+ * and hand it back through outp.
+ *
+ * Returns 0 on success, ENOTSUP if the destination type is not supported and
+ * ENOMEM if the allocation fails.
+ */
+static int
+varpd_files_create(varpd_provider_handle_t *hdl, void **outp,
+    overlay_plugin_dest_t dest)
+{
+	varpd_files_t *inst;
+
+	if (!varpd_files_valid_dest(dest))
+		return (ENOTSUP);
+
+	if ((inst = umem_alloc(sizeof (*inst), UMEM_DEFAULT)) == NULL)
+		return (ENOMEM);
+
+	/* Start from all zeroes, then fill in what is known. */
+	bzero(inst, sizeof (*inst));
+	inst->vaf_hdl = hdl;
+	inst->vaf_dest = dest;
+	inst->vaf_nvl = NULL;
+	inst->vaf_path = NULL;
+
+	*outp = inst;
+	return (0);
+}
+
+/*
+ * Build the lookup table for an instance from a parsed JSON document.
+ *
+ * Every pair in nvl must be a nested nvlist whose name parses as an Ethernet
+ * address. Each name is converted to binary and back to text, so that all
+ * keys in the resulting table share the format produced by ether_ntoa_r().
+ * On success the new table is stored in vaf->vaf_nvl.
+ *
+ * Returns 0 on success, the nvlist_alloc() error, EINVAL for a malformed
+ * entry, or ENOMEM if an address cannot be formatted.
+ */
+static int
+varpd_files_normalize_nvlist(varpd_files_t *vaf, nvlist_t *nvl)
+{
+	nvlist_t *table;
+	nvpair_t *cur;
+	int err;
+
+	if ((err = nvlist_alloc(&table, NV_UNIQUE_NAME, 0)) != 0)
+		return (err);
+
+	cur = nvlist_next_nvpair(nvl, NULL);
+	while (cur != NULL) {
+		char canon[ETHERADDRSTRL];
+		struct ether_addr mac;
+		nvlist_t *entry;
+		char *key;
+
+		err = EINVAL;
+		if (nvpair_type(cur) != DATA_TYPE_NVLIST)
+			goto fail;
+
+		key = nvpair_name(cur);
+		if (nvpair_value_nvlist(cur, &entry) != 0)
+			goto fail;
+
+		if (ether_aton_r(key, &mac) == NULL)
+			goto fail;
+
+		if (ether_ntoa_r(&mac, canon) == NULL) {
+			err = ENOMEM;
+			goto fail;
+		}
+
+		if (nvlist_add_nvlist(table, canon, entry) != 0)
+			goto fail;
+
+		cur = nvlist_next_nvpair(nvl, cur);
+	}
+
+	vaf->vaf_nvl = table;
+	return (0);
+
+fail:
+	nvlist_free(table);
+	return (err);
+}
+
+/*
+ * Start an instance: read the configured JSON file and build the lookup
+ * table from it.
+ *
+ * Returns EAGAIN if no path has been configured yet, the errno from open(),
+ * fstat() or mmap() if the file cannot be read, or the error from parsing or
+ * normalizing its contents. Returns 0 on success.
+ */
+static int
+varpd_files_start(void *arg)
+{
+	int fd, ret;
+	void *maddr;
+	struct stat st;
+	nvlist_t *nvl;
+	varpd_files_t *vaf = arg;
+
+	if (vaf->vaf_path == NULL)
+		return (EAGAIN);
+
+	if ((fd = open(vaf->vaf_path, O_RDONLY)) < 0)
+		return (errno);
+
+	if (fstat(fd, &st) != 0) {
+		ret = errno;
+		if (close(fd) != 0)
+			abort();
+		return (ret);
+	}
+
+	maddr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+	    fd, 0);
+	/*
+	 * mmap() reports failure with MAP_FAILED, not NULL. Testing for NULL
+	 * would let a failed mapping through to the parser below.
+	 */
+	if (maddr == MAP_FAILED) {
+		ret = errno;
+		if (close(fd) != 0)
+			abort();
+		return (ret);
+	}
+
+	ret = nvlist_parse_json(maddr, st.st_size, &nvl,
+	    NVJSON_FORCE_INTEGER, NULL);
+	if (ret == 0) {
+		/* The table keeps its own copy; the parsed list can go. */
+		ret = varpd_files_normalize_nvlist(vaf, nvl);
+		nvlist_free(nvl);
+	}
+	if (munmap(maddr, st.st_size) != 0)
+		abort();
+	if (close(fd) != 0)
+		abort();
+
+	return (ret);
+}
+
+/*
+ * Stop an instance by releasing the lookup table built when it was started.
+ */
+static void
+varpd_files_stop(void *arg)
+{
+	varpd_files_t *inst = arg;
+
+	nvlist_free(inst->vaf_nvl);
+	inst->vaf_nvl = NULL;
+}
+
+/*
+ * Release an instance. The lookup table must already have been freed, which
+ * is asserted; the stored path, if any, is freed along with the instance.
+ */
+static void
+varpd_files_destroy(void *arg)
+{
+	varpd_files_t *inst = arg;
+	char *path = inst->vaf_path;
+
+	assert(inst->vaf_nvl == NULL);
+
+	if (path != NULL) {
+		/* The path buffer is sized as the string plus terminator. */
+		umem_free(path, strlen(path) + 1);
+		inst->vaf_path = NULL;
+	}
+
+	umem_free(inst, sizeof (*inst));
+}
+
+/*
+ * Answer a lookup request from the table loaded from the JSON file.
+ *
+ * ARP, IPv6 frames whose destination begins 33:33, and broadcast IPv4 frames
+ * are handed to the corresponding libvarpd proxy routine. Everything else is
+ * resolved by looking up the destination MAC address and filling otp with
+ * that entry's "ip" and "port". Every path finishes by replying on qh or by
+ * passing qh to a proxy routine; a failure at any step results in a
+ * VARPD_LOOKUP_DROP reply.
+ */
+static void
+varpd_files_lookup(void *arg, varpd_query_handle_t *qh,
+ const overlay_targ_lookup_t *otl, overlay_target_point_t *otp)
+{
+ char macstr[ETHERADDRSTRL], *ipstr;
+ nvlist_t *nvl;
+ varpd_files_t *vaf = arg;
+ int32_t port;
+ static const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+ /* We don't support a default */
+ if (otl == NULL) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ if (otl->otl_sap == ETHERTYPE_ARP) {
+ libvarpd_plugin_proxy_arp(vaf->vaf_hdl, qh, otl);
+ return;
+ }
+
+ /*
+ * IPv6 sent to a MAC address starting 33:33 is passed to the NDP proxy;
+ * presumably this prefix identifies IPv6 multicast frames.
+ */
+ if (otl->otl_sap == ETHERTYPE_IPV6 &&
+ otl->otl_dstaddr[0] == 0x33 &&
+ otl->otl_dstaddr[1] == 0x33) {
+ libvarpd_plugin_proxy_ndp(vaf->vaf_hdl, qh, otl);
+ return;
+ }
+
+ /*
+ * Broadcast IPv4 is only proxied as DHCP when the entry for the source
+ * MAC address has a "dhcp-proxy" value that parses as an Ethernet
+ * address. The parsed address is not used here; it is only validated.
+ */
+ if (otl->otl_sap == ETHERTYPE_IP &&
+ bcmp(otl->otl_dstaddr, bcast, ETHERADDRL) == 0) {
+ char *mac;
+ struct ether_addr a, *addr;
+
+ addr = &a;
+ if (ether_ntoa_r((struct ether_addr *)otl->otl_srcaddr,
+ macstr) == NULL) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ if (nvlist_lookup_string(nvl, "dhcp-proxy", &mac) != 0) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ if (ether_aton_r(mac, addr) == NULL) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ libvarpd_plugin_proxy_dhcp(vaf->vaf_hdl, qh, otl);
+ return;
+ }
+
+ /* Ordinary traffic: find the entry for the destination MAC address. */
+ if (ether_ntoa_r((struct ether_addr *)otl->otl_dstaddr,
+ macstr) == NULL) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ if (nvlist_lookup_int32(nvl, "port", &port) != 0) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ /* The port has to be non-zero and fit into 16 bits. */
+ if (port <= 0 || port > UINT16_MAX) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+ otp->otp_port = port;
+
+ if (nvlist_lookup_string(nvl, "ip", &ipstr) != 0) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ /*
+ * Try to parse it as a v6 address and then if it's not, try to
+ * transform it into a v4 address which we'll then wrap it into a v4
+ * mapped address.
+ */
+ if (inet_pton(AF_INET6, ipstr, &otp->otp_ip) != 1) {
+ uint32_t v4;
+ if (inet_pton(AF_INET, ipstr, &v4) != 1) {
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP);
+ return;
+ }
+ IN6_IPADDR_TO_V4MAPPED(v4, &otp->otp_ip);
+ }
+
+ libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_OK);
+}
+
+/*
+ * Report the number of properties. The plugin always has exactly one, the
+ * path of the JSON file. Always returns 0.
+ */
+/* ARGSUSED */
+static int
+varpd_files_nprops(void *arg, uint_t *nprops)
+{
+ *nprops = 1;
+ return (0);
+}
+
+/*
+ * Describe the property with the given index through vph. Only index 0, the
+ * path of the JSON file, exists. Returns 0 on success and EINVAL for any
+ * other index.
+ */
+/* ARGSUSED */
+static int
+varpd_files_propinfo(void *arg, uint_t propid, varpd_prop_handle_t *vph)
+{
+	if (propid == 0) {
+		libvarpd_prop_set_name(vph, varpd_files_props[0]);
+		libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW);
+		libvarpd_prop_set_type(vph, OVERLAY_PROP_T_STRING);
+		libvarpd_prop_set_nodefault(vph);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Read the path property into buf.
+ *
+ * On entry *sizep is the size of buf; on return it is the length of the path
+ * including its terminator, or 0 when no path has been set. Returns
+ * EOVERFLOW if buf is too small and EINVAL for an unknown property name.
+ */
+static int
+varpd_files_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep)
+{
+	varpd_files_t *inst = arg;
+	const char *path = inst->vaf_path;
+	size_t need;
+
+	if (strcmp(pname, varpd_files_props[0]) != 0)
+		return (EINVAL);
+
+	if (path == NULL) {
+		*sizep = 0;
+		return (0);
+	}
+
+	need = strlen(path) + 1;
+	if (*sizep < need)
+		return (EOVERFLOW);
+
+	*sizep = need;
+	(void) strlcpy(buf, path, *sizep);
+	return (0);
+}
+
+/*
+ * Set the path property from buf, which holds size bytes.
+ *
+ * The stored copy is always allocated as exactly the string length plus its
+ * terminator, because that is the size every umem_free() of vaf_path in this
+ * file computes with strlen(). If buf has no terminator within size bytes the
+ * value is truncated to size - 1 characters. The previous path is only
+ * replaced once the new copy has been made, so a failure leaves it intact.
+ *
+ * Returns 0 on success, EINVAL for an unknown property name or an empty
+ * buffer, and ENOMEM if the allocation fails.
+ */
+static int
+varpd_files_setprop(void *arg, const char *pname, const void *buf,
+    const uint32_t size)
+{
+	varpd_files_t *vaf = arg;
+	const char *src = buf;
+	char *path;
+	size_t len;
+
+	if (strcmp(pname, varpd_files_props[0]) != 0)
+		return (EINVAL);
+
+	if (size == 0)
+		return (EINVAL);
+
+	/* Never look beyond the size bytes the caller handed over. */
+	len = strnlen(src, size);
+	if (len == size)
+		len = size - 1;
+
+	path = umem_alloc(len + 1, UMEM_DEFAULT);
+	if (path == NULL)
+		return (ENOMEM);
+	bcopy(src, path, len);
+	path[len] = '\0';
+
+	if (vaf->vaf_path != NULL)
+		umem_free(vaf->vaf_path, strlen(vaf->vaf_path) + 1);
+	vaf->vaf_path = path;
+	return (0);
+}
+
+/*
+ * Persist the instance into nvp: the configured path and the two counters.
+ * Nothing at all is written when no path has been configured. Returns 0 on
+ * success or the error from the nvlist routines.
+ */
+static int
+varpd_files_save(void *arg, nvlist_t *nvp)
+{
+	varpd_files_t *inst = arg;
+	int err;
+
+	if (inst->vaf_path == NULL)
+		return (0);
+
+	err = nvlist_add_string(nvp, varpd_files_props[0], inst->vaf_path);
+	if (err != 0)
+		return (err);
+
+	err = nvlist_add_uint64(nvp, "files/vaf_nmisses", inst->vaf_nmisses);
+	if (err != 0)
+		return (err);
+
+	err = nvlist_add_uint64(nvp, "files/vaf_narp", inst->vaf_narp);
+	return (err);
+}
+
+/*
+ * Recreate an instance from the state written by varpd_files_save().
+ *
+ * The path is optional in nvp, while both counters are required. On success
+ * the new instance, carrying the restored path and counters, is returned
+ * through outp.
+ *
+ * Returns 0 on success, EINVAL for an unsupported destination type or a
+ * missing counter, ENOMEM if an allocation fails, or the error from looking
+ * up the path.
+ */
+static int
+varpd_files_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
+    overlay_plugin_dest_t dest, void **outp)
+{
+	varpd_files_t *vaf;
+	char *str;
+	int ret;
+	uint64_t nmisses, narp;
+
+	if (varpd_files_valid_dest(dest) == B_FALSE)
+		return (EINVAL);
+
+	ret = nvlist_lookup_string(nvp, varpd_files_props[0], &str);
+	if (ret == ENOENT)
+		str = NULL;
+	else if (ret != 0)
+		return (ret);
+
+	if (nvlist_lookup_uint64(nvp, "files/vaf_nmisses", &nmisses) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_uint64(nvp, "files/vaf_narp", &narp) != 0)
+		return (EINVAL);
+
+	vaf = umem_alloc(sizeof (varpd_files_t), UMEM_DEFAULT);
+	if (vaf == NULL)
+		return (ENOMEM);
+
+	bzero(vaf, sizeof (varpd_files_t));
+	vaf->vaf_dest = dest;
+	vaf->vaf_hdl = hdl;
+	/* Carry the saved counters over instead of restarting them at 0. */
+	vaf->vaf_nmisses = nmisses;
+	vaf->vaf_narp = narp;
+
+	if (str != NULL) {
+		size_t len = strlen(str) + 1;
+		vaf->vaf_path = umem_alloc(len, UMEM_DEFAULT);
+		if (vaf->vaf_path == NULL) {
+			umem_free(vaf, sizeof (varpd_files_t));
+			return (ENOMEM);
+		}
+		(void) strlcpy(vaf->vaf_path, str, len);
+	}
+
+	*outp = vaf;
+	return (0);
+}
+
+/*
+ * Resolve an IPv4 or IPv6 address to a MAC address for the ARP/NDP proxy.
+ *
+ * The table is walked in order, comparing each entry's "arp" value (for
+ * AF_INET) or "ndp" value (for AF_INET6) with the address in sock. For the
+ * first match the entry's name is parsed as an Ethernet address, copied into
+ * out, and VARPD_LOOKUP_OK is replied. Requests that are not of kind
+ * VARPD_QTYPE_ETHERNET, other address families, a match whose name does not
+ * parse, and the absence of any match all result in VARPD_LOOKUP_DROP.
+ */
+static void
+varpd_files_proxy_arp(void *arg, varpd_arp_handle_t *vah, int kind,
+ const struct sockaddr *sock, uint8_t *out)
+{
+ varpd_files_t *vaf = arg;
+ const struct sockaddr_in *ip;
+ const struct sockaddr_in6 *ip6;
+ nvpair_t *pair;
+
+ if (kind != VARPD_QTYPE_ETHERNET) {
+ libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ if (sock->sa_family != AF_INET && sock->sa_family != AF_INET6) {
+ libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ /* Both views alias sock; only the one matching sa_family is read. */
+ ip = (const struct sockaddr_in *)sock;
+ ip6 = (const struct sockaddr_in6 *)sock;
+ for (pair = nvlist_next_nvpair(vaf->vaf_nvl, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(vaf->vaf_nvl, pair)) {
+ char *mac, *ipstr;
+ nvlist_t *data;
+ struct in_addr ia;
+ struct in6_addr ia6;
+ struct ether_addr ether, *e;
+ e = &ether;
+
+ /* Entries that are not nested nvlists are skipped. */
+ if (nvpair_type(pair) != DATA_TYPE_NVLIST)
+ continue;
+
+ mac = nvpair_name(pair);
+ if (nvpair_value_nvlist(pair, &data) != 0)
+ continue;
+
+
+ /* Skip entries without a parseable, matching address. */
+ if (sock->sa_family == AF_INET) {
+ if (nvlist_lookup_string(data, "arp", &ipstr) != 0)
+ continue;
+
+ if (inet_pton(AF_INET, ipstr, &ia) != 1)
+ continue;
+
+ if (bcmp(&ia, &ip->sin_addr,
+ sizeof (struct in_addr)) != 0)
+ continue;
+ } else {
+ if (nvlist_lookup_string(data, "ndp", &ipstr) != 0)
+ continue;
+
+ if (inet_pton(AF_INET6, ipstr, &ia6) != 1)
+ continue;
+
+ if (bcmp(&ia6, &ip6->sin6_addr,
+ sizeof (struct in6_addr)) != 0)
+ continue;
+ }
+
+ /* First match wins; its name is the MAC address to return. */
+ if (ether_aton_r(mac, e) == NULL) {
+ libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
+ return;
+ }
+
+ bcopy(e, out, ETHERADDRL);
+ libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_OK);
+ return;
+ }
+
+ libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
+}
+
+/*
+ * Resolve the DHCP proxy target for a request.
+ *
+ * The entry for the source MAC address of otl is looked up and its
+ * "dhcp-proxy" value parsed as an Ethernet address, which is copied into out
+ * before VARPD_LOOKUP_OK is replied. A request that is not of type
+ * VARPD_QTYPE_ETHERNET, or a failure at any step, results in a
+ * VARPD_LOOKUP_DROP reply.
+ */
+static void
+varpd_files_proxy_dhcp(void *arg, varpd_dhcp_handle_t *vdh, int type,
+    const overlay_targ_lookup_t *otl, uint8_t *out)
+{
+	varpd_files_t *inst = arg;
+	char srcmac[ETHERADDRSTRL];
+	struct ether_addr target;
+	nvlist_t *entry;
+	char *proxy;
+
+	if (type != VARPD_QTYPE_ETHERNET)
+		goto drop;
+
+	if (ether_ntoa_r((struct ether_addr *)otl->otl_srcaddr,
+	    srcmac) == NULL)
+		goto drop;
+
+	if (nvlist_lookup_nvlist(inst->vaf_nvl, srcmac, &entry) != 0)
+		goto drop;
+
+	if (nvlist_lookup_string(entry, "dhcp-proxy", &proxy) != 0)
+		goto drop;
+
+	if (ether_aton_r(proxy, &target) == NULL)
+		goto drop;
+
+	bcopy(&target, out, ETHERADDRL);
+	libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_OK);
+	return;
+
+drop:
+	libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP);
+}
+
+/*
+ * Operations vector registered by varpd_files_init(). The initializer is
+ * positional, so the order of entries matters. NOTE(review): what each slot
+ * means is determined by the definition of varpd_plugin_ops_t, which is not
+ * part of this file; the leading 0 and the NULL entry should be checked
+ * against it.
+ */
+static const varpd_plugin_ops_t varpd_files_ops = {
+ 0,
+ varpd_files_create,
+ varpd_files_start,
+ varpd_files_stop,
+ varpd_files_destroy,
+ NULL,
+ varpd_files_lookup,
+ varpd_files_nprops,
+ varpd_files_propinfo,
+ varpd_files_getprop,
+ varpd_files_setprop,
+ varpd_files_save,
+ varpd_files_restore,
+ varpd_files_proxy_arp,
+ varpd_files_proxy_dhcp
+};
+
+/*
+ * Initialization routine, named in the init pragma below. It registers the
+ * "files" plugin with libvarpd. If the registration structure cannot be
+ * allocated the plugin is silently left unregistered; the result of the
+ * registration call itself is ignored.
+ */
+#pragma init(varpd_files_init)
+static void
+varpd_files_init(void)
+{
+	varpd_plugin_register_t *reg;
+	int err;
+
+	if ((reg = libvarpd_plugin_alloc(VARPD_CURRENT_VERSION, &err)) == NULL)
+		return;
+
+	reg->vpr_ops = &varpd_files_ops;
+	reg->vpr_name = "files";
+	reg->vpr_mode = OVERLAY_TARGET_DYNAMIC;
+
+	(void) libvarpd_plugin_register(reg);
+	libvarpd_plugin_free(reg);
+}
diff --git a/usr/src/lib/varpd/files/common/libvarpd_files_json.c b/usr/src/lib/varpd/files/common/libvarpd_files_json.c
new file mode 100644
index 0000000000..53e63c6244
--- /dev/null
+++ b/usr/src/lib/varpd/files/common/libvarpd_files_json.c
@@ -0,0 +1,936 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <strings.h>
+#include <errno.h>
+#include <libnvpair.h>
+#include <sys/ccompile.h>
+
+#include "libvarpd_files_json.h"
+
+/*
+ * Kind of value a parse frame holds (parse_frame_t.pf_value_type).
+ * pushstate() starts every new frame at JSON_TYPE_NOTHING. store_value()
+ * switches on this to pick the nvlist_add_*() call; it has no case for
+ * JSON_TYPE_DOUBLE, and collect_number() currently always reports an integer.
+ */
+typedef enum json_type {
+ JSON_TYPE_NOTHING = 0,
+ JSON_TYPE_STRING = 1,
+ JSON_TYPE_INTEGER,
+ JSON_TYPE_DOUBLE,
+ JSON_TYPE_BOOLEAN,
+ JSON_TYPE_NULL,
+ JSON_TYPE_OBJECT,
+ JSON_TYPE_ARRAY
+} json_type_t;
+
+/*
+ * States of the per-frame parser state machine. Every non-negative value is
+ * used directly as an index into hdlrs[], so the order here must stay in step
+ * with that table. PARSE_ERROR is the only negative value; the main loop in
+ * nvlist_parse_json() stops as soon as the top frame's state is below zero.
+ * PARSE_DONE has no handler: it marks a frame whose value is complete.
+ */
+typedef enum parse_state {
+ PARSE_ERROR = -1,
+ PARSE_DONE = 0,
+ PARSE_REST,
+ PARSE_OBJECT,
+ PARSE_KEY_STRING,
+ PARSE_COLON,
+ PARSE_STRING,
+ PARSE_OBJECT_COMMA,
+ PARSE_ARRAY,
+ PARSE_BAREWORD,
+ PARSE_NUMBER,
+ PARSE_ARRAY_VALUE,
+ PARSE_ARRAY_COMMA
+} parse_state_t;
+
+#define JSON_MARKER ".__json_"
+#define JSON_MARKER_ARRAY JSON_MARKER "array"
+
+/*
+ * One entry of the parser stack. A frame is pushed for every value being
+ * parsed (see pushstate()) and popped once it reaches PARSE_DONE.
+ */
+typedef struct parse_frame {
+ /* Current state of this frame; indexes hdlrs[] when non-negative. */
+ parse_state_t pf_ps;
+ /* nvlist backing this frame when it is an object or an array. */
+ nvlist_t *pf_nvl;
+
+ /* Name under which the next child value is stored in pf_nvl. */
+ char *pf_key;
+ /* Scalar value collected by this frame, interpreted per pf_value_type. */
+ void *pf_value;
+ json_type_t pf_value_type;
+ /* Index the next array element will get; also the final "length". */
+ int pf_array_index;
+
+ /* Frame beneath this one on the stack; NULL for the bottom frame. */
+ struct parse_frame *pf_next;
+} parse_frame_t;
+
+/*
+ * Whole-parse state shared by all handlers: the input buffer and cursor, the
+ * frame stack, the caller's flags, and the error that was posted, if any.
+ */
+typedef struct state {
+ /* Input buffer, read cursor, and length as passed by the caller. */
+ const char *s_in;
+ unsigned long s_pos;
+ unsigned long s_len;
+
+ /* Top of the parse frame stack. */
+ parse_frame_t *s_top;
+
+ nvlist_parse_json_flags_t s_flags;
+
+ /*
+ * This string buffer is used for temporary storage by the
+ * "collect_*()" family of functions.
+ */
+ custr_t *s_collect;
+
+ /* Error number and message recorded by posterror(). */
+ int s_errno;
+ custr_t *s_errstr;
+} state_t;
+
+typedef void (*parse_handler_t)(state_t *);
+
+/*
+ * Switch the top frame to state 'ps', tracing the transition to stderr when
+ * NVJSON_DEBUG is set.
+ */
+static void
+movestate(state_t *s, parse_state_t ps)
+{
+	parse_frame_t *top = s->s_top;
+
+	if ((s->s_flags & NVJSON_DEBUG) != 0) {
+		(void) fprintf(stderr, "nvjson: move state %d -> %d\n",
+		    top->pf_ps, ps);
+	}
+
+	top->pf_ps = ps;
+}
+
+/*
+ * Record a parse failure: remember 'erno', append 'error' to the caller's
+ * message buffer when there is one, and put the top frame into PARSE_ERROR.
+ */
+static void
+posterror(state_t *s, int erno, const char *error)
+{
+	/* Echo to stderr first, if the caller asked for that. */
+	if ((s->s_flags & NVJSON_ERRORS_TO_STDERR) != 0) {
+		(void) fprintf(stderr, "nvjson error (pos %ld, errno %d): %s\n",
+		    s->s_pos, erno, error);
+	}
+
+	s->s_errno = erno;
+
+	/*
+	 * Storing the message is best effort; the append may itself fail if
+	 * the error was caused by memory pressure that still persists.
+	 */
+	if (s->s_errstr != NULL)
+		(void) custr_append(s->s_errstr, error);
+
+	movestate(s, PARSE_ERROR);
+}
+
+/*
+ * Push a new frame that starts in state 'ps'. The frame currently on top is
+ * set to 'retps', the state it resumes in once the new frame is popped.
+ * Returns 0 on success; on allocation failure an error is posted and -1 is
+ * returned.
+ */
+static int
+pushstate(state_t *s, parse_state_t ps, parse_state_t retps)
+{
+	parse_frame_t *parent = s->s_top;
+	parse_frame_t *frame;
+
+	if ((s->s_flags & NVJSON_DEBUG) != 0) {
+		(void) fprintf(stderr, "nvjson: push state %d -> %d (ret %d)\n",
+		    parent->pf_ps, ps, retps);
+	}
+
+	frame = calloc(1, sizeof (parse_frame_t));
+	if (frame == NULL) {
+		posterror(s, errno, "pushstate calloc failure");
+		return (-1);
+	}
+
+	/* Set up the new frame and link it above the current top. */
+	frame->pf_ps = ps;
+	frame->pf_value_type = JSON_TYPE_NOTHING;
+	frame->pf_next = parent;
+
+	/* The old top resumes in 'retps' after the new frame is popped. */
+	parent->pf_ps = retps;
+	s->s_top = frame;
+
+	return (0);
+}
+
+/*
+ * Consume and return the next input character, or 0 once the input is
+ * exhausted. Valid indices are 0 .. s_len - 1, so the cursor must be checked
+ * with ">=": testing with ">" would read s_in[s_len], one byte past the
+ * buffer the caller described.
+ */
+static char
+popchar(state_t *s)
+{
+	if (s->s_pos >= s->s_len) {
+		return (0);
+	}
+	return (s->s_in[s->s_pos++]);
+}
+
+/*
+ * Return the next input character without consuming it, or 0 once the input
+ * is exhausted. Valid indices are 0 .. s_len - 1, so the cursor must be
+ * checked with ">=": testing with ">" would read s_in[s_len], one byte past
+ * the buffer the caller described.
+ */
+static char
+peekchar(state_t *s)
+{
+	if (s->s_pos >= s->s_len) {
+		return (0);
+	}
+	return (s->s_in[s->s_pos]);
+}
+
+/*
+ * Skip over any whitespace at the current input position.
+ */
+static void
+discard_whitespace(state_t *s)
+{
+	/*
+	 * The <ctype.h> classifiers are only defined for EOF and values
+	 * representable as unsigned char; plain char may be negative for
+	 * bytes >= 0x80, so convert before classifying.
+	 */
+	while (isspace((unsigned char)peekchar(s))) {
+		(void) popchar(s);
+	}
+}
+
+/*
+ * Single-character escapes: the first byte of each entry is the character
+ * following the backslash, the second is the byte it stands for.
+ */
+static char *escape_pairs[] = {
+	"\"\"", "\\\\", "//", "b\b", "f\f", "n\n", "r\r", "t\t", NULL
+};
+
+/*
+ * Decode the escape sequence that follows a backslash inside a string and
+ * append the resulting byte to s->s_collect.
+ *
+ * Returns 0 on success. On failure an error is posted and a non-zero value
+ * is returned.
+ */
+static char
+collect_string_escape(state_t *s)
+{
+	int i;
+	char c = popchar(s);
+
+	if (c == '\0') {
+		posterror(s, EPROTO, "EOF mid-escape sequence");
+		return (-1);
+	}
+
+	/*
+	 * Handle four-digit Unicode escapes up to and including \u007f.
+	 * Strings that cannot be represented as 7-bit clean ASCII are not
+	 * currently supported.
+	 */
+	if (c == 'u') {
+		long res;
+		int ndigs;
+		char digs[5];
+
+		for (ndigs = 0; ndigs < 4; ndigs++) {
+			if ((digs[ndigs] = popchar(s)) == '\0') {
+				posterror(s, EPROTO, "EOF mid-escape "
+				    "sequence");
+				return (-1);
+			}
+			/*
+			 * JSON requires exactly four hexadecimal digits;
+			 * reject signs, whitespace and other characters
+			 * that a lenient numeric conversion would accept.
+			 */
+			if (!isxdigit((unsigned char)digs[ndigs])) {
+				posterror(s, EPROTO, "malformed unicode "
+				    "escape");
+				return (-1);
+			}
+		}
+		digs[4] = '\0';
+
+		/*
+		 * The digits are hexadecimal, so they must be converted in
+		 * base 16 (a decimal conversion would turn \u001f into 1).
+		 */
+		if ((res = strtol(digs, NULL, 16)) > 127) {
+			posterror(s, EPROTO, "unicode escape above 0x7f");
+			return (-1);
+		}
+
+		if (custr_appendc(s->s_collect, (char)res) != 0) {
+			posterror(s, errno, "custr_appendc failure");
+			return (-1);
+		}
+		return (0);
+	}
+
+	/*
+	 * See if this is a C-style escape character we recognise.
+	 */
+	for (i = 0; escape_pairs[i] != NULL; i++) {
+		char *ep = escape_pairs[i];
+		if (ep[0] == c) {
+			if (custr_appendc(s->s_collect, ep[1]) != 0) {
+				posterror(s, errno, "custr_appendc failure");
+				return (-1);
+			}
+			return (0);
+		}
+	}
+
+	posterror(s, EPROTO, "unrecognised escape sequence");
+	return (-1);
+}
+
+/*
+ * Collect the body of a string into s->s_collect, up to and including the
+ * closing double quote (the opening quote has already been consumed).
+ * Returns 0 on success, or -1 after posting an error.
+ */
+static int
+collect_string(state_t *s)
+{
+	custr_reset(s->s_collect);
+
+	for (;;) {
+		char ch = popchar(s);
+
+		/* A closing quote legally ends the string. */
+		if (ch == '"')
+			return (0);
+
+		if (ch == '\0') {
+			posterror(s, EPROTO, "EOF mid-string");
+			return (-1);
+		}
+
+		/* Backslash introduces an escape character or sequence. */
+		if (ch == '\\') {
+			if (collect_string_escape(s) != 0)
+				return (-1);
+			continue;
+		}
+
+		if (custr_appendc(s->s_collect, ch) != 0) {
+			posterror(s, errno, "custr_appendc failure");
+			return (-1);
+		}
+	}
+}
+
+/*
+ * Collect a run of lower-case letters (a candidate for "true", "false" or
+ * "null") into s->s_collect. Returns 0 when the run ends, or -1 after
+ * posting an error if the buffer cannot be extended.
+ */
+static int
+collect_bareword(state_t *s)
+{
+	custr_reset(s->s_collect);
+
+	for (;;) {
+		/*
+		 * Convert to unsigned char first: islower() is undefined
+		 * for negative values other than EOF, and plain char may be
+		 * negative for bytes >= 0x80.
+		 */
+		if (!islower((unsigned char)peekchar(s))) {
+			return (0);
+		}
+
+		if (custr_appendc(s->s_collect, popchar(s)) != 0) {
+			posterror(s, errno, "custr_appendc failure");
+			return (-1);
+		}
+	}
+}
+
+/*
+ * PARSE_BAREWORD handler: accept exactly "true", "false" or "null" and
+ * record the corresponding value in the top frame.
+ */
+static void
+hdlr_bareword(state_t *s)
+{
+	parse_frame_t *top = s->s_top;
+	const char *word;
+
+	if (collect_bareword(s) != 0)
+		return;
+
+	word = custr_cstr(s->s_collect);
+
+	if (strcmp(word, "null") == 0) {
+		top->pf_value_type = JSON_TYPE_NULL;
+	} else if (strcmp(word, "true") == 0) {
+		top->pf_value_type = JSON_TYPE_BOOLEAN;
+		top->pf_value = (void *)B_TRUE;
+	} else if (strcmp(word, "false") == 0) {
+		top->pf_value_type = JSON_TYPE_BOOLEAN;
+		top->pf_value = (void *)B_FALSE;
+	} else {
+		posterror(s, EPROTO, "expected 'true', 'false' or 'null'");
+		return;
+	}
+
+	movestate(s, PARSE_DONE);
+}
+
+/*
+ * Parse an optionally negative decimal integer at the current position.
+ *
+ * On success returns 0, sets *isint to B_TRUE and stores the value in
+ * *result; 'fresult' is never written because fractions and exponents are
+ * not supported yet. On failure an error is posted and -1 is returned.
+ * Magnitudes that do not fit in a positive int32_t are rejected with ERANGE
+ * rather than being converted with undefined results.
+ */
+/* ARGSUSED */
+static int
+collect_number(state_t *s, boolean_t *isint, int32_t *result,
+    double *fresult __unused)
+{
+	boolean_t neg = B_FALSE;
+	long t;
+	char c;
+
+	custr_reset(s->s_collect);
+
+	if (peekchar(s) == '-') {
+		neg = B_TRUE;
+		(void) popchar(s);
+	}
+
+	/*
+	 * Read the 'int' portion. The ctype classifiers need an unsigned
+	 * char value: plain char may be negative for bytes >= 0x80.
+	 */
+	if (!isdigit((unsigned char)peekchar(s))) {
+		posterror(s, EPROTO, "malformed number: expected digit (0-9)");
+		return (-1);
+	}
+	while (isdigit((unsigned char)peekchar(s))) {
+		if (custr_appendc(s->s_collect, popchar(s)) != 0) {
+			posterror(s, errno, "custr_append failure");
+			return (-1);
+		}
+	}
+
+	c = peekchar(s);
+	if (c == '.' || c == 'e' || c == 'E') {
+		posterror(s, ENOTSUP, "do not yet support FRACs or EXPs");
+		return (-1);
+	}
+
+	/*
+	 * Only digits were collected, so the value is non-negative. Detect
+	 * overflow both of long (ERANGE from strtol) and of int32_t.
+	 */
+	errno = 0;
+	t = strtol(custr_cstr(s->s_collect), NULL, 10);
+	if (errno != 0 || t != (long)(int32_t)t) {
+		posterror(s, ERANGE, "number out of range for a 32-bit "
+		    "integer");
+		return (-1);
+	}
+
+	*isint = B_TRUE;
+	*result = (neg == B_TRUE) ? -(int32_t)t : (int32_t)t;
+	return (0);
+}
+
+/*
+ * PARSE_NUMBER handler: parse a number and record it in the top frame.
+ * Integers are stored directly in pf_value; a double would be stored in a
+ * heap copy.
+ */
+static void
+hdlr_number(state_t *s)
+{
+	boolean_t isint;
+	int32_t result;
+	double fresult = 0;
+
+	if (collect_number(s, &isint, &result, &fresult) != 0) {
+		return;
+	}
+
+	if (isint == B_TRUE) {
+		s->s_top->pf_value = (void *)(uintptr_t)result;
+		s->s_top->pf_value_type = JSON_TYPE_INTEGER;
+	} else {
+		/*
+		 * NOTE(review): collect_number() currently always reports
+		 * an integer and store_value() has no JSON_TYPE_DOUBLE case,
+		 * so this branch looks unreachable today. Still, never
+		 * write through an unchecked allocation.
+		 */
+		double *dp = malloc(sizeof (fresult));
+
+		if (dp == NULL) {
+			posterror(s, errno, "malloc failure");
+			return;
+		}
+		*dp = fresult;
+		s->s_top->pf_value = dp;
+		s->s_top->pf_value_type = JSON_TYPE_DOUBLE;
+	}
+
+	movestate(s, PARSE_DONE);
+}
+
+/*
+ * PARSE_REST handler, the initial state: skip leading whitespace and expect
+ * the start of an object or an array.
+ */
+static void
+hdlr_rest(state_t *s)
+{
+	char first;
+
+	discard_whitespace(s);
+	first = popchar(s);
+
+	if (first == '{') {
+		movestate(s, PARSE_OBJECT);
+	} else if (first == '[') {
+		movestate(s, PARSE_ARRAY);
+	} else {
+		posterror(s, EPROTO, "EOF before object or array");
+	}
+}
+
+/*
+ * Create the empty nvlist that represents the object or array being parsed
+ * by the top frame and store it in that frame's pf_nvl. When there is a frame
+ * below, the list is added to that frame's nvlist under its pending key, and
+ * pf_nvl is pointed at the copy that lives inside the parent.
+ *
+ * Returns 0 on success, or -1 after posting an error. libnvpair reports
+ * failure through its return value, so that value (not errno) is what gets
+ * posted.
+ */
+static int
+add_empty_child(state_t *s)
+{
+	nvlist_t *empty;
+	int err;
+
+	if ((err = nvlist_alloc(&empty, NV_UNIQUE_NAME, 0)) != 0) {
+		posterror(s, err, "nvlist_alloc failure");
+		return (-1);
+	}
+
+	if (s->s_top->pf_next != NULL) {
+		/*
+		 * If we're a child of the frame above, we store ourselves in
+		 * that frame's nvlist:
+		 */
+		nvlist_t *nvl = s->s_top->pf_next->pf_nvl;
+		char *key = s->s_top->pf_next->pf_key;
+
+		/* The parent keeps its own copy, so ours is always freed. */
+		err = nvlist_add_nvlist(nvl, key, empty);
+		nvlist_free(empty);
+		if (err != 0) {
+			posterror(s, err, "nvlist_add_nvlist failure");
+			return (-1);
+		}
+
+		if ((err = nvlist_lookup_nvlist(nvl, key, &empty)) != 0) {
+			posterror(s, err, "nvlist_lookup_nvlist failure");
+			return (-1);
+		}
+	}
+
+	s->s_top->pf_nvl = empty;
+	return (0);
+}
+
+/*
+ * Finish an array: store its "length" and an internal-use marker on the top
+ * frame's nvlist. Returns 0 on success, or -1 after posting the error code
+ * returned by libnvpair (which reports failure by return value, not errno).
+ */
+static int
+decorate_array(state_t *s)
+{
+	nvlist_t *nvl = s->s_top->pf_nvl;
+	int idx = s->s_top->pf_array_index;
+	int err;
+
+	if ((err = nvlist_add_boolean(nvl, JSON_MARKER_ARRAY)) != 0 ||
+	    (err = nvlist_add_uint32(nvl, "length", idx)) != 0) {
+		posterror(s, err, "nvlist_add failure");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * PARSE_ARRAY handler (the '[' has been consumed): create the backing nvlist
+ * and either close an empty array or move on to its first value.
+ */
+static void
+hdlr_array(state_t *s)
+{
+	s->s_top->pf_value_type = JSON_TYPE_ARRAY;
+
+	if (add_empty_child(s) != 0)
+		return;
+
+	discard_whitespace(s);
+
+	if (peekchar(s) != ']') {
+		movestate(s, PARSE_ARRAY_VALUE);
+		return;
+	}
+
+	/* Empty array: consume the bracket and finish up. */
+	(void) popchar(s);
+	if (decorate_array(s) == 0)
+		movestate(s, PARSE_DONE);
+}
+
+/*
+ * PARSE_ARRAY_COMMA handler: after an array element, expect either ',' and
+ * another value, or ']' to end the array.
+ */
+static void
+hdlr_array_comma(state_t *s)
+{
+	char sep;
+
+	discard_whitespace(s);
+	sep = popchar(s);
+
+	if (sep == ',') {
+		movestate(s, PARSE_ARRAY_VALUE);
+		return;
+	}
+
+	if (sep != ']') {
+		posterror(s, EPROTO, "expected ',' or ']'");
+		return;
+	}
+
+	if (decorate_array(s) == 0)
+		movestate(s, PARSE_DONE);
+}
+
+/*
+ * PARSE_ARRAY_VALUE handler: name the next element after its array index and
+ * push the handler for whatever kind of value comes next. The array frame
+ * resumes in PARSE_ARRAY_COMMA once that value is complete.
+ */
+static void
+hdlr_array_value(state_t *s)
+{
+	char c;
+
+	/*
+	 * Generate keyname from the next array index:
+	 */
+	if (s->s_top->pf_key != NULL) {
+		(void) fprintf(stderr, "pf_key not null! was %s\n",
+		    s->s_top->pf_key);
+		abort();
+	}
+
+	if (asprintf(&s->s_top->pf_key, "%d", s->s_top->pf_array_index++) < 0) {
+		/*
+		 * The pointer's contents are not guaranteed after a failed
+		 * asprintf(); clear it so frame teardown never frees garbage.
+		 */
+		s->s_top->pf_key = NULL;
+		posterror(s, errno, "asprintf failure");
+		return;
+	}
+
+	discard_whitespace(s);
+
+	/*
+	 * Select which type handler we need for the next value:
+	 */
+	switch (c = peekchar(s)) {
+	case '"':
+		(void) popchar(s);
+		(void) pushstate(s, PARSE_STRING, PARSE_ARRAY_COMMA);
+		return;
+
+	case '{':
+		(void) popchar(s);
+		(void) pushstate(s, PARSE_OBJECT, PARSE_ARRAY_COMMA);
+		return;
+
+	case '[':
+		(void) popchar(s);
+		(void) pushstate(s, PARSE_ARRAY, PARSE_ARRAY_COMMA);
+		return;
+
+	default:
+		/*
+		 * ctype classifiers require an unsigned char value; plain
+		 * char may be negative for bytes >= 0x80.
+		 */
+		if (islower((unsigned char)c)) {
+			(void) pushstate(s, PARSE_BAREWORD,
+			    PARSE_ARRAY_COMMA);
+		} else if (c == '-' || isdigit((unsigned char)c)) {
+			(void) pushstate(s, PARSE_NUMBER, PARSE_ARRAY_COMMA);
+		} else {
+			posterror(s, EPROTO, "unexpected character at start "
+			    "of value");
+		}
+		return;
+	}
+}
+
+/*
+ * PARSE_OBJECT handler (the '{' has been consumed): create the backing
+ * nvlist, then expect either '}' for an empty object or the opening quote of
+ * the first key.
+ */
+static void
+hdlr_object(state_t *s)
+{
+	char next;
+
+	s->s_top->pf_value_type = JSON_TYPE_OBJECT;
+
+	if (add_empty_child(s) != 0)
+		return;
+
+	discard_whitespace(s);
+	next = popchar(s);
+
+	if (next == '}') {
+		movestate(s, PARSE_DONE);
+	} else if (next == '"') {
+		movestate(s, PARSE_KEY_STRING);
+	} else {
+		posterror(s, EPROTO, "expected key or '}'");
+	}
+}
+
+/*
+ * PARSE_KEY_STRING handler: collect an object key and remember it as the
+ * name of the value that follows.
+ */
+static void
+hdlr_key_string(state_t *s)
+{
+	char *key;
+
+	if (collect_string(s) != 0)
+		return;
+
+	key = strdup(custr_cstr(s->s_collect));
+	s->s_top->pf_key = key;
+	if (key == NULL) {
+		posterror(s, errno, "strdup failure");
+		return;
+	}
+
+	movestate(s, PARSE_COLON);
+}
+
+/*
+ * PARSE_COLON handler: expect the ':' after an object key, then push the
+ * handler for whatever kind of value follows. The object frame resumes in
+ * PARSE_OBJECT_COMMA once that value is complete.
+ */
+static void
+hdlr_colon(state_t *s)
+{
+	char c;
+
+	discard_whitespace(s);
+
+	if ((c = popchar(s)) != ':') {
+		posterror(s, EPROTO, "expected ':'");
+		return;
+	}
+
+	discard_whitespace(s);
+
+	/*
+	 * Select which type handler we need for the value after the colon:
+	 */
+	switch (c = peekchar(s)) {
+	case '"':
+		(void) popchar(s);
+		(void) pushstate(s, PARSE_STRING, PARSE_OBJECT_COMMA);
+		return;
+
+	case '{':
+		(void) popchar(s);
+		(void) pushstate(s, PARSE_OBJECT, PARSE_OBJECT_COMMA);
+		return;
+
+	case '[':
+		(void) popchar(s);
+		(void) pushstate(s, PARSE_ARRAY, PARSE_OBJECT_COMMA);
+		return;
+
+	default:
+		/*
+		 * ctype classifiers require an unsigned char value; plain
+		 * char may be negative for bytes >= 0x80.
+		 */
+		if (islower((unsigned char)c)) {
+			(void) pushstate(s, PARSE_BAREWORD, PARSE_OBJECT_COMMA);
+		} else if (c == '-' || isdigit((unsigned char)c)) {
+			(void) pushstate(s, PARSE_NUMBER, PARSE_OBJECT_COMMA);
+		} else {
+			posterror(s, EPROTO, "unexpected character at "
+			    "start of value");
+		}
+		return;
+	}
+}
+
+/*
+ * PARSE_OBJECT_COMMA handler: after a member value, expect either '}' to end
+ * the object, or ',' followed by the opening quote of the next key.
+ */
+static void
+hdlr_object_comma(state_t *s)
+{
+	char sep;
+
+	discard_whitespace(s);
+	sep = popchar(s);
+
+	if (sep == '}') {
+		movestate(s, PARSE_DONE);
+		return;
+	}
+
+	if (sep != ',') {
+		posterror(s, EPROTO, "expected ',' or '}'");
+		return;
+	}
+
+	discard_whitespace(s);
+	if (popchar(s) != '"') {
+		posterror(s, EPROTO, "expected '\"'");
+		return;
+	}
+
+	movestate(s, PARSE_KEY_STRING);
+}
+
+/*
+ * PARSE_STRING handler: collect a string value and keep a heap copy of it in
+ * the top frame.
+ */
+static void
+hdlr_string(state_t *s)
+{
+	parse_frame_t *top = s->s_top;
+
+	if (collect_string(s) != 0)
+		return;
+
+	top->pf_value_type = JSON_TYPE_STRING;
+	top->pf_value = strdup(custr_cstr(s->s_collect));
+	if (top->pf_value == NULL) {
+		posterror(s, errno, "strdup failure");
+		return;
+	}
+
+	movestate(s, PARSE_DONE);
+}
+
+/*
+ * Copy the completed value of the top frame into the nvlist of the frame
+ * below it, under that frame's pending key, then clear both the value and
+ * the key. Must only be called when there is a frame below the top one.
+ *
+ * Returns 0 on success, or -1 after posting the error code returned by
+ * libnvpair (which reports failure by return value, not through errno).
+ * Aborts on a value type it does not know how to store.
+ */
+static int
+store_value(state_t *s)
+{
+	nvlist_t *targ = s->s_top->pf_next->pf_nvl;
+	char *key = s->s_top->pf_next->pf_key;
+	json_type_t type = s->s_top->pf_value_type;
+	int err;
+	int ret = 0;
+
+	switch (type) {
+	case JSON_TYPE_STRING:
+		err = nvlist_add_string(targ, key, s->s_top->pf_value);
+		if (err != 0) {
+			posterror(s, err, "nvlist_add_string failure");
+			ret = -1;
+		}
+		/* The nvlist holds its own copy; ours is no longer needed. */
+		free(s->s_top->pf_value);
+		break;
+
+	case JSON_TYPE_BOOLEAN:
+		err = nvlist_add_boolean_value(targ, key,
+		    (boolean_t)s->s_top->pf_value);
+		if (err != 0) {
+			posterror(s, err, "nvlist_add_boolean_value "
+			    "failure");
+			ret = -1;
+		}
+		break;
+
+	case JSON_TYPE_NULL:
+		if ((err = nvlist_add_boolean(targ, key)) != 0) {
+			posterror(s, err, "nvlist_add_boolean failure");
+			ret = -1;
+		}
+		break;
+
+	case JSON_TYPE_INTEGER:
+		err = nvlist_add_int32(targ, key,
+		    (int32_t)(uintptr_t)s->s_top->pf_value);
+		if (err != 0) {
+			posterror(s, err, "nvlist_add_int32 failure");
+			ret = -1;
+		}
+		break;
+
+	case JSON_TYPE_ARRAY:
+	case JSON_TYPE_OBJECT:
+		/*
+		 * Objects and arrays are already 'stored' in their target
+		 * nvlist on creation. See: hdlr_object, hdlr_array.
+		 */
+		break;
+
+	default:
+		(void) fprintf(stderr, "ERROR: could not store unknown "
+		    "type %d\n", type);
+		abort();
+	}
+
+	s->s_top->pf_value = NULL;
+	free(s->s_top->pf_next->pf_key);
+	s->s_top->pf_next->pf_key = NULL;
+	return (ret);
+}
+
+/*
+ * Release one parse frame and return the frame that was beneath it. The
+ * frame's nvlist is freed only when 'free_nvl' is set. A frame that still
+ * carries an unstored value is treated as a programming error and aborts.
+ */
+static parse_frame_t *
+parse_frame_free(parse_frame_t *pf, boolean_t free_nvl)
+{
+	parse_frame_t *below = pf->pf_next;
+
+	/* free(NULL) is a no-op, so the key needs no separate check. */
+	free(pf->pf_key);
+
+	if (pf->pf_value != NULL)
+		abort();
+
+	if (free_nvl && pf->pf_nvl != NULL)
+		nvlist_free(pf->pf_nvl);
+
+	free(pf);
+	return (below);
+}
+
+/*
+ * Handler dispatch table, indexed by parse_state_t. The entries must appear
+ * in the same order as the non-negative enum values. PARSE_DONE has no
+ * handler: nvlist_parse_json() deals with completed frames itself and aborts
+ * if it is ever asked to dispatch a state with a NULL or missing entry.
+ */
+static parse_handler_t hdlrs[] = {
+ NULL, /* PARSE_DONE */
+ hdlr_rest, /* PARSE_REST */
+ hdlr_object, /* PARSE_OBJECT */
+ hdlr_key_string, /* PARSE_KEY_STRING */
+ hdlr_colon, /* PARSE_COLON */
+ hdlr_string, /* PARSE_STRING */
+ hdlr_object_comma, /* PARSE_OBJECT_COMMA */
+ hdlr_array, /* PARSE_ARRAY */
+ hdlr_bareword, /* PARSE_BAREWORD */
+ hdlr_number, /* PARSE_NUMBER */
+ hdlr_array_value, /* PARSE_ARRAY_VALUE */
+ hdlr_array_comma /* PARSE_ARRAY_COMMA */
+};
+/* Number of entries in hdlrs[], used to bounds-check the state on dispatch. */
+#define NUM_PARSE_HANDLERS (int)(sizeof (hdlrs) / sizeof (hdlrs[0]))
+
+/*
+ * Parse the JSON text in buf[0 .. buflen - 1] into a newly allocated nvlist.
+ *
+ *   buf, buflen  input text and its length in bytes
+ *   nvlp         on success, receives the resulting nvlist; left untouched
+ *                on failure
+ *   flag         NVJSON_* flags; NVJSON_FORCE_INTEGER and NVJSON_FORCE_DOUBLE
+ *                are mutually exclusive and unknown bits are rejected
+ *   errout       optional; receives the error number, the input position
+ *                reached, and a message
+ *
+ * Returns 0 on success. On failure returns -1 with errno set (EINVAL for bad
+ * flags, otherwise the error recorded by the parser).
+ */
+int
+nvlist_parse_json(const char *buf, size_t buflen, nvlist_t **nvlp,
+    nvlist_parse_json_flags_t flag, nvlist_parse_json_error_t *errout)
+{
+	state_t s;
+
+	/*
+	 * Check for valid flags:
+	 */
+	if ((flag & NVJSON_FORCE_INTEGER) && (flag & NVJSON_FORCE_DOUBLE)) {
+		errno = EINVAL;
+		return (-1);
+	}
+	if ((flag & ~NVJSON_ALL) != 0) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	/*
+	 * Initialise parsing state structure:
+	 */
+	bzero(&s, sizeof (s));
+	s.s_in = buf;
+	s.s_pos = 0;
+	s.s_len = buflen;
+	s.s_flags = flag;
+
+	/*
+	 * Allocate the collect buffer string.
+	 */
+	if (custr_alloc(&s.s_collect) != 0) {
+		s.s_errno = errno;
+		if (errout != NULL) {
+			(void) snprintf(errout->nje_message,
+			    sizeof (errout->nje_message),
+			    "custr alloc failure: %s",
+			    strerror(errno));
+		}
+		goto out;
+	}
+
+	/*
+	 * If the caller has requested error information, allocate the error
+	 * string now.
+	 */
+	if (errout != NULL) {
+		if (custr_alloc_buf(&s.s_errstr, errout->nje_message,
+		    sizeof (errout->nje_message)) != 0) {
+			s.s_errno = errno;
+			(void) snprintf(errout->nje_message,
+			    sizeof (errout->nje_message),
+			    "custr alloc failure: %s",
+			    strerror(errno));
+			goto out;
+		}
+		custr_reset(s.s_errstr);
+	}
+
+	/*
+	 * Allocate top-most stack frame:
+	 */
+	if ((s.s_top = calloc(1, sizeof (*s.s_top))) == NULL) {
+		s.s_errno = errno;
+		/* Leave a message for the caller, as posterror() would. */
+		if (s.s_errstr != NULL) {
+			(void) custr_append(s.s_errstr,
+			    "parse frame calloc failure");
+		}
+		goto out;
+	}
+
+	s.s_top->pf_ps = PARSE_REST;
+	for (;;) {
+		if (s.s_top->pf_ps < 0) {
+			/*
+			 * The parser reported an error. A handler may have
+			 * posted it with an error number of zero; fall back
+			 * to a generic error so that a failed parse is never
+			 * reported as success with *nvlp left unset.
+			 */
+			if (s.s_errno == 0)
+				s.s_errno = EINVAL;
+			goto out;
+		}
+
+		if (s.s_top->pf_ps == PARSE_DONE) {
+			if (s.s_top->pf_next == NULL) {
+				/*
+				 * Last frame, so we're really
+				 * done.
+				 */
+				*nvlp = s.s_top->pf_nvl;
+				goto out;
+			} else {
+				/*
+				 * Otherwise, pop a frame and continue in
+				 * previous state. Copy out the value we
+				 * created in the old frame:
+				 */
+				if (store_value(&s) != 0) {
+					if (s.s_errno == 0)
+						s.s_errno = EINVAL;
+					goto out;
+				}
+
+				/*
+				 * Free old frame:
+				 */
+				s.s_top = parse_frame_free(s.s_top, B_FALSE);
+			}
+		}
+
+		/*
+		 * Dispatch to parser handler routine for this state:
+		 */
+		if (s.s_top->pf_ps >= NUM_PARSE_HANDLERS ||
+		    hdlrs[s.s_top->pf_ps] == NULL) {
+			(void) fprintf(stderr, "no handler for state %d\n",
+			    s.s_top->pf_ps);
+			abort();
+		}
+		hdlrs[s.s_top->pf_ps](&s);
+	}
+
+out:
+	if (errout != NULL) {
+		/*
+		 * Copy out error number and parse position. The custr_t for
+		 * the error message was backed by the buffer in the error
+		 * object, so no copying is required.
+		 */
+		errout->nje_errno = s.s_errno;
+		errout->nje_pos = s.s_pos;
+	}
+
+	/*
+	 * Free resources. The nvlists are released only on failure; on
+	 * success the root list now belongs to the caller.
+	 */
+	while (s.s_top != NULL) {
+		s.s_top = parse_frame_free(s.s_top, s.s_errno == 0 ? B_FALSE :
+		    B_TRUE);
+	}
+	custr_free(s.s_collect);
+	custr_free(s.s_errstr);
+
+	errno = s.s_errno;
+	return (s.s_errno == 0 ? 0 : -1);
+}
diff --git a/usr/src/lib/varpd/files/common/libvarpd_files_json.h b/usr/src/lib/varpd/files/common/libvarpd_files_json.h
new file mode 100644
index 0000000000..9fe765741b
--- /dev/null
+++ b/usr/src/lib/varpd/files/common/libvarpd_files_json.h
@@ -0,0 +1,52 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+#ifndef _LIBVARPD_FILES_JSON_H
+#define _LIBVARPD_FILES_JSON_H
+
+#include <libnvpair.h>
+#include <libcustr.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Flags accepted by nvlist_parse_json(). NVJSON_FORCE_INTEGER and
+ * NVJSON_FORCE_DOUBLE may not be combined (the call fails with EINVAL).
+ * NVJSON_ERRORS_TO_STDERR also prints parse errors to stderr, and
+ * NVJSON_DEBUG traces parser state changes to stderr.
+ * NOTE(review): in libvarpd_files_json.c the two FORCE flags are only
+ * validated, not otherwise consulted -- confirm whether that is intended.
+ */
+typedef enum nvlist_parse_json_flags {
+ NVJSON_FORCE_INTEGER = 0x01,
+ NVJSON_FORCE_DOUBLE = 0x02,
+ NVJSON_ERRORS_TO_STDERR = 0x04,
+ NVJSON_DEBUG = 0x08
+} nvlist_parse_json_flags_t;
+
+/*
+ * Optional error report filled in by nvlist_parse_json(): the error number,
+ * the input position the parser had reached, and a message.
+ */
+typedef struct nvlist_parse_json_error {
+ int nje_errno;
+ long nje_pos;
+ char nje_message[512];
+} nvlist_parse_json_error_t;
+
+/* Every valid flag bit; any other bit makes nvlist_parse_json() fail. */
+#define NVJSON_ALL \
+ (NVJSON_FORCE_INTEGER | \
+ NVJSON_FORCE_DOUBLE | \
+ NVJSON_ERRORS_TO_STDERR | \
+ NVJSON_DEBUG)
+
+/*
+ * Parse a JSON buffer of the given length into a new nvlist. Returns 0 on
+ * success, or -1 with errno set on failure.
+ */
+extern int nvlist_parse_json(const char *, size_t, nvlist_t **,
+ nvlist_parse_json_flags_t, nvlist_parse_json_error_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBVARPD_FILES_JSON_H */
diff --git a/usr/src/lib/varpd/files/common/mapfile-vers b/usr/src/lib/varpd/files/common/mapfile-vers
new file mode 100644
index 0000000000..6b7c5a5067
--- /dev/null
+++ b/usr/src/lib/varpd/files/common/mapfile-vers
@@ -0,0 +1,35 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+# Every symbol in this object is scoped local; nothing is exported by name.
+SYMBOL_VERSION SUNWprivate {
+ local:
+ *;
+};
diff --git a/usr/src/lib/varpd/files/i386/Makefile b/usr/src/lib/varpd/files/i386/Makefile
new file mode 100644
index 0000000000..4398507523
--- /dev/null
+++ b/usr/src/lib/varpd/files/i386/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Per-architecture makefile: all build rules come from ../Makefile.com.
+include ../Makefile.com
+
+# Installing requires a completed build plus the installed library and links.
+install: all $(ROOTLIBS) $(ROOTLINKS)
diff --git a/usr/src/lib/varpd/libvarpd/Makefile b/usr/src/lib/varpd/libvarpd/Makefile
new file mode 100644
index 0000000000..034ba30c1d
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/Makefile
@@ -0,0 +1,54 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+include ../../Makefile.lib
+
+# Public headers, found under $(HDRDIR), and the per-architecture build
+# directories to recurse into.
+# NOTE(review): the $(BUILD64) prefix presumably enables the 64-bit
+# subdirectory only for 64-bit builds -- confirm in the shared makefiles.
+HDRS = libvarpd.h libvarpd_client.h libvarpd_provider.h
+HDRDIR = common
+SUBDIRS = $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+# Library and type names handed to the $(TYPECHECK) step of 'check'.
+TYPECHECK_LIB = libvarpd.so.1
+TYPELIST = \
+ varpd_client_instance_arg_t \
+ varpd_client_nprops_arg_t \
+ varpd_client_propinfo_arg_t \
+ varpd_client_eresp_t \
+ varpd_persist_header_t \
+ overlay_targ_cache_entry_t \
+ overlay_targ_cache_t \
+ overlay_targ_cache_iter_t
+
+# Each goal records its own name in TARGET so that the $(SUBDIRS) rule below
+# can pass the same goal down to the sub-make.
+all := TARGET = all
+clean := TARGET = clean
+clobber := TARGET = clobber
+install := TARGET = install
+
+.KEEP_STATE:
+
+all clean clobber: $(SUBDIRS)
+
+# NOTE(review): VARPD_MAPFILES is not assigned in this file; confirm that an
+# included makefile defines it, otherwise it expands to nothing here.
+install: $(SUBDIRS) $(VARPD_MAPFILES) install_h
+
+install_h: $(ROOTHDRS)
+
+check: $(CHECKHDRS) $(TYPECHECK)
+
+# Recurse into a subdirectory and run the recorded goal there. FRC has no
+# recipe or prerequisites, so this rule runs on every invocation.
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../../Makefile.targ
diff --git a/usr/src/lib/varpd/libvarpd/Makefile.com b/usr/src/lib/varpd/libvarpd/Makefile.com
new file mode 100644
index 0000000000..73e8f17883
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/Makefile.com
@@ -0,0 +1,48 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Shared build definitions for libvarpd, included by each per-architecture
+# makefile. Paths are relative to those subdirectories.
+LIBRARY = libvarpd.a
+VERS = .1
+OBJECTS = libvarpd.o \
+ libvarpd_arp.o \
+ libvarpd_client.o \
+ libvarpd_door.o \
+ libvarpd_overlay.o \
+ libvarpd_panic.o \
+ libvarpd_persist.o \
+ libvarpd_prop.o \
+ libvarpd_plugin.o \
+ libvarpd_util.o
+
+include ../../../Makefile.lib
+
+# install this library in the root filesystem
+include ../../../Makefile.rootfs
+
+# Only $(DYNLIB) is built; the sources and private headers live in ../common.
+LIBS = $(DYNLIB)
+LDLIBS += -lc -lavl -lumem -lidspace -lnvpair -lmd5 -lrename
+CPPFLAGS += -I../common
+
+CERRWARN += -erroff=E_STRUCT_DERIVED_FROM_FLEX_MBR
+
+CSTD= $(CSTD_GNU99)
+
+SRCDIR = ../common
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+include ../../../Makefile.targ
diff --git a/usr/src/lib/varpd/libvarpd/amd64/Makefile b/usr/src/lib/varpd/libvarpd/amd64/Makefile
new file mode 100644
index 0000000000..1881990d79
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/amd64/Makefile
@@ -0,0 +1,19 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Per-architecture makefile: shared rules from ../Makefile.com, followed by
+# the 64-bit library settings in Makefile.lib.64.
+include ../Makefile.com
+include ../../../Makefile.lib.64
+
+# Installing requires a completed build plus the installed 64-bit library
+# and links.
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd.c b/usr/src/lib/varpd/libvarpd/common/libvarpd.c
new file mode 100644
index 0000000000..4e4c189a43
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd.c
@@ -0,0 +1,345 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * varpd library
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <umem.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/avl.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <strings.h>
+
+#include <libvarpd_impl.h>
+
+/*
+ * AVL comparator for the instance tree: orders instances by vri_id.
+ * Returns -1, 0 or 1.
+ */
+static int
+libvarpd_instance_comparator(const void *lp, const void *rp)
+{
+	const varpd_instance_t *left = lp;
+	const varpd_instance_t *right = rp;
+
+	if (left->vri_id < right->vri_id)
+		return (-1);
+
+	return (left->vri_id > right->vri_id ? 1 : 0);
+}
+
+/*
+ * AVL comparator for the by-link tree: orders instances by vri_linkid.
+ * Returns -1, 0 or 1.
+ */
+static int
+libvarpd_instance_lcomparator(const void *lp, const void *rp)
+{
+	const varpd_instance_t *left = lp;
+	const varpd_instance_t *right = rp;
+
+	if (left->vri_linkid < right->vri_linkid)
+		return (-1);
+
+	return (left->vri_linkid > right->vri_linkid ? 1 : 0);
+}
+
+/*
+ * Allocate and initialise a varpd handle.
+ *
+ * On success returns 0 and stores the new handle in *vphp. On failure
+ * returns an error number, releases everything that was set up so far and
+ * leaves *vphp NULL. Returns EINVAL if vphp itself is NULL. A failure to
+ * initialise the handle's mutex is fatal.
+ */
+int
+libvarpd_create(varpd_handle_t **vphp)
+{
+	int ret;
+	varpd_impl_t *vip;
+	char buf[32];
+
+	if (vphp == NULL)
+		return (EINVAL);
+
+	*vphp = NULL;
+	vip = umem_alloc(sizeof (varpd_impl_t), UMEM_DEFAULT);
+	if (vip == NULL)
+		return (errno);
+
+	bzero(vip, sizeof (varpd_impl_t));
+	(void) snprintf(buf, sizeof (buf), "varpd_%p", vip);
+	vip->vdi_idspace = id_space_create(buf, LIBVARPD_ID_MIN,
+	    LIBVARPD_ID_MAX);
+	if (vip->vdi_idspace == NULL) {
+		/* Save errno before the cleanup calls can change it. */
+		ret = errno;
+		umem_free(vip, sizeof (varpd_impl_t));
+		return (ret);
+	}
+
+	vip->vdi_qcache = umem_cache_create("query", sizeof (varpd_query_t), 0,
+	    NULL, NULL, NULL, NULL, NULL, 0);
+	if (vip->vdi_qcache == NULL) {
+		ret = errno;
+		id_space_destroy(vip->vdi_idspace);
+		umem_free(vip, sizeof (varpd_impl_t));
+		return (ret);
+	}
+
+	if ((ret = libvarpd_overlay_init(vip)) != 0) {
+		umem_cache_destroy(vip->vdi_qcache);
+		id_space_destroy(vip->vdi_idspace);
+		umem_free(vip, sizeof (varpd_impl_t));
+		return (ret);
+	}
+
+	libvarpd_persist_init(vip);
+
+	avl_create(&vip->vdi_plugins, libvarpd_plugin_comparator,
+	    sizeof (varpd_plugin_t), offsetof(varpd_plugin_t, vpp_node));
+
+	avl_create(&vip->vdi_instances, libvarpd_instance_comparator,
+	    sizeof (varpd_instance_t), offsetof(varpd_instance_t, vri_inode));
+	avl_create(&vip->vdi_linstances, libvarpd_instance_lcomparator,
+	    sizeof (varpd_instance_t), offsetof(varpd_instance_t, vri_lnode));
+
+	/* mutex_init() returns its error number; it does not set errno. */
+	if ((ret = mutex_init(&vip->vdi_lock, USYNC_THREAD | LOCK_ERRORCHECK,
+	    NULL)) != 0)
+		libvarpd_panic("failed to create mutex: %d", ret);
+
+	vip->vdi_doorfd = -1;
+	*vphp = (varpd_handle_t *)vip;
+	return (0);
+}
+
+/*
+ * Tear down a handle created by libvarpd_create(): quiesce overlay lookups,
+ * destroy the handle's mutex, then release the persistence state, overlay
+ * state, query cache, id space and the handle itself. A failure to destroy
+ * the mutex is fatal.
+ */
+void
+libvarpd_destroy(varpd_handle_t *vhp)
+{
+	varpd_impl_t *vip = (varpd_impl_t *)vhp;
+	int ret;
+
+	libvarpd_overlay_lookup_quiesce(vhp);
+
+	/* mutex_destroy() returns its error number; it does not set errno. */
+	if ((ret = mutex_destroy(&vip->vdi_lock)) != 0)
+		libvarpd_panic("failed to destroy mutex: %d", ret);
+	libvarpd_persist_fini(vip);
+	libvarpd_overlay_fini(vip);
+	umem_cache_destroy(vip->vdi_qcache);
+	id_space_destroy(vip->vdi_idspace);
+	umem_free(vip, sizeof (varpd_impl_t));
+}
+
+/*
+ * Create an instance of plugin 'pname' for datalink 'linkid' and insert it
+ * into both the by-id and the by-link trees of the handle.
+ *
+ * Returns 0 and stores the instance in *outp on success. Returns ENOENT if
+ * the plugin is unknown, ENOMEM if the instance cannot be allocated, or the
+ * error from the overlay query or from the plugin's create entry point.
+ * Running out of ids, a mutex initialisation failure, or a duplicate id or
+ * link id is fatal.
+ */
+int
+libvarpd_instance_create(varpd_handle_t *vhp, datalink_id_t linkid,
+    const char *pname, varpd_instance_handle_t **outp)
+{
+	int ret;
+	varpd_impl_t *vip = (varpd_impl_t *)vhp;
+	varpd_plugin_t *plugin;
+	varpd_instance_t *inst, lookup;
+	overlay_plugin_dest_t dest;
+	uint64_t vid;
+
+	/*
+	 * We should really have our own errnos.
+	 */
+	plugin = libvarpd_plugin_lookup(vip, pname);
+	if (plugin == NULL)
+		return (ENOENT);
+
+	if ((ret = libvarpd_overlay_info(vip, linkid, &dest, NULL, &vid)) != 0)
+		return (ret);
+
+	inst = umem_alloc(sizeof (varpd_instance_t), UMEM_DEFAULT);
+	if (inst == NULL)
+		return (ENOMEM);
+
+	inst->vri_id = id_alloc(vip->vdi_idspace);
+	if (inst->vri_id == -1)
+		libvarpd_panic("failed to allocate id from vdi_idspace: %d",
+		    errno);
+	inst->vri_linkid = linkid;
+	inst->vri_vnetid = vid;
+	inst->vri_mode = plugin->vpp_mode;
+	inst->vri_dest = dest;
+	inst->vri_plugin = plugin;
+	inst->vri_impl = vip;
+	inst->vri_flags = 0;
+	if ((ret = plugin->vpp_ops->vpo_create((varpd_provider_handle_t *)inst,
+	    &inst->vri_private, dest)) != 0) {
+		id_free(vip->vdi_idspace, inst->vri_id);
+		umem_free(inst, sizeof (varpd_instance_t));
+		return (ret);
+	}
+
+	/* mutex_init() returns its error number; it does not set errno. */
+	if ((ret = mutex_init(&inst->vri_lock, USYNC_THREAD | LOCK_ERRORCHECK,
+	    NULL)) != 0)
+		libvarpd_panic("failed to create mutex: %d", ret);
+
+	/* Make the instance visible; duplicates indicate corrupted state. */
+	mutex_enter(&vip->vdi_lock);
+	lookup.vri_id = inst->vri_id;
+	if (avl_find(&vip->vdi_instances, &lookup, NULL) != NULL)
+		libvarpd_panic("found duplicate instance with id %d",
+		    lookup.vri_id);
+	avl_add(&vip->vdi_instances, inst);
+	lookup.vri_linkid = inst->vri_linkid;
+	if (avl_find(&vip->vdi_linstances, &lookup, NULL) != NULL)
+		libvarpd_panic("found duplicate linstance with id %d",
+		    lookup.vri_linkid);
+	avl_add(&vip->vdi_linstances, inst);
+	mutex_exit(&vip->vdi_lock);
+	*outp = (varpd_instance_handle_t *)inst;
+	return (0);
+}
+
+/*
+ * Return the id (vri_id) of the given instance.
+ */
+uint64_t
+libvarpd_instance_id(varpd_instance_handle_t *ihp)
+{
+	return (((varpd_instance_t *)ihp)->vri_id);
+}
+
+/*
+ * Return the virtual network id (vri_vnetid) recorded for the instance
+ * behind the given provider handle.
+ */
+uint64_t
+libvarpd_plugin_vnetid(varpd_provider_handle_t *vhp)
+{
+	return (((varpd_instance_t *)vhp)->vri_vnetid);
+}
+
+/*
+ * Find the instance with the given id. Returns its handle, or NULL if there
+ * is no such instance. The tree is searched under vdi_lock.
+ */
+varpd_instance_handle_t *
+libvarpd_instance_lookup(varpd_handle_t *vhp, uint64_t id)
+{
+	varpd_impl_t *impl = (varpd_impl_t *)vhp;
+	varpd_instance_t key, *found;
+
+	key.vri_id = id;
+
+	mutex_enter(&impl->vdi_lock);
+	found = avl_find(&impl->vdi_instances, &key, NULL);
+	mutex_exit(&impl->vdi_lock);
+
+	return ((varpd_instance_handle_t *)found);
+}
+
+/*
+ * Find the instance bound to the given datalink id, or NULL if there is
+ * none. The tree is searched under vdi_lock.
+ *
+ * Should this function ever be exposed outside of varpd, it must return a
+ * varpd_instance_handle_t instead.
+ */
+varpd_instance_t *
+libvarpd_instance_lookup_by_dlid(varpd_impl_t *vip, datalink_id_t linkid)
+{
+	varpd_instance_t key, *found;
+
+	key.vri_linkid = linkid;
+
+	mutex_enter(&vip->vdi_lock);
+	found = avl_find(&vip->vdi_linstances, &key, NULL);
+	mutex_exit(&vip->vdi_lock);
+
+	return (found);
+}
+
+/*
+ * When an instance is being destroyed, that means we should deactivate it, as
+ * well as clean it up. That means here, the proper order is calling the plug-in
+ * stop and then the destroy function.
+ *
+ * NOTE(review): the plug-in's vpo_stop and vpo_destroy are only invoked when
+ * VARPD_INSTANCE_F_ACTIVATED is set. Confirm that an instance which was
+ * created but never (successfully) activated does not leak vri_private.
+ */
+void
+libvarpd_instance_destroy(varpd_instance_handle_t *ihp)
+{
+ varpd_instance_t *inst = (varpd_instance_t *)ihp;
+ varpd_impl_t *vip = inst->vri_impl;
+
+ /*
+ * First things first, remove it from global visibility.
+ */
+ mutex_enter(&vip->vdi_lock);
+ avl_remove(&vip->vdi_instances, inst);
+ avl_remove(&vip->vdi_linstances, inst);
+ mutex_exit(&vip->vdi_lock);
+
+ mutex_enter(&inst->vri_lock);
+
+ /*
+ * We need to clean up this instance, that means remove it from
+ * persistence and stopping it. Then finally we'll have to clean it up
+ * entirely.
+ */
+ if (inst->vri_flags & VARPD_INSTANCE_F_ACTIVATED) {
+ inst->vri_flags &= ~VARPD_INSTANCE_F_ACTIVATED;
+ libvarpd_torch_instance(vip, inst);
+ inst->vri_plugin->vpp_ops->vpo_stop(inst->vri_private);
+ inst->vri_plugin->vpp_ops->vpo_destroy(inst->vri_private);
+ inst->vri_private = NULL;
+ }
+ mutex_exit(&inst->vri_lock);
+
+ /* Do the full clean up of the instance */
+ if (mutex_destroy(&inst->vri_lock) != 0)
+ libvarpd_panic("failed to destroy instance vri_lock");
+ /* Return the id to the id space and release the instance itself. */
+ id_free(vip->vdi_idspace, inst->vri_id);
+ umem_free(inst, sizeof (varpd_instance_t));
+}
+
+/*
+ * Activate an instance: start the plug-in, persist the instance, associate
+ * it with its overlay, and only then mark it VARPD_INSTANCE_F_ACTIVATED.
+ * All of this happens under the instance's vri_lock.
+ *
+ * Returns 0 on success, EEXIST if the instance is already activated, or the
+ * error from whichever step failed.
+ */
+int
+libvarpd_instance_activate(varpd_instance_handle_t *ihp)
+{
+ int ret;
+ varpd_instance_t *inst = (varpd_instance_t *)ihp;
+
+ mutex_enter(&inst->vri_lock);
+
+ if (inst->vri_flags & VARPD_INSTANCE_F_ACTIVATED) {
+ ret = EEXIST;
+ goto out;
+ }
+
+ if ((ret = inst->vri_plugin->vpp_ops->vpo_start(inst->vri_private)) !=
+ 0)
+ goto out;
+
+ if ((ret = libvarpd_persist_instance(inst->vri_impl, inst)) != 0)
+ goto out;
+
+ /*
+ * If this fails, we don't need to call stop, as the caller should end
+ * up calling destroy on the instance, which takes care of calling stop
+ * and destroy.
+ *
+ * NOTE(review): libvarpd_instance_destroy() only calls the plug-in's
+ * stop and destroy entry points when VARPD_INSTANCE_F_ACTIVATED is set,
+ * and that flag is not set on this failure path -- confirm the plug-in
+ * really is stopped in that case.
+ */
+ if ((ret = libvarpd_overlay_associate(inst)) != 0)
+ goto out;
+
+ inst->vri_flags |= VARPD_INSTANCE_F_ACTIVATED;
+
+out:
+ mutex_exit(&inst->vri_lock);
+ return (ret);
+}
+
+/*
+ * fork() prepare handler registered by libvarpd_init(); hands off to the
+ * plugin code.
+ */
+static void
+libvarpd_prefork(void)
+{
+ libvarpd_plugin_prefork();
+}
+
+/*
+ * fork() completion handler; libvarpd_init() registers it for both the
+ * parent and the child. Hands off to the plugin code.
+ */
+static void
+libvarpd_postfork(void)
+{
+ libvarpd_plugin_postfork();
+}
+
+/*
+ * Library initializer: set up the plugin subsystem and register the fork
+ * handlers. Failing to register the handlers is fatal.
+ */
+#pragma init(libvarpd_init)
+static void
+libvarpd_init(void)
+{
+	int ret;
+
+	libvarpd_plugin_init();
+
+	/* pthread_atfork() returns its error number; it does not set errno. */
+	ret = pthread_atfork(libvarpd_prefork, libvarpd_postfork,
+	    libvarpd_postfork);
+	if (ret != 0)
+		libvarpd_panic("failed to create varpd atfork: %d", ret);
+}
+
+/*
+ * Library finalizer: tear down the plugin subsystem set up by
+ * libvarpd_init().
+ */
+#pragma fini(libvarpd_fini)
+static void
+libvarpd_fini(void)
+{
+ libvarpd_plugin_fini();
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd.h b/usr/src/lib/varpd/libvarpd/common/libvarpd.h
new file mode 100644
index 0000000000..106d4272d9
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd.h
@@ -0,0 +1,77 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LIBVARPD_H
+#define _LIBVARPD_H
+
+/*
+ * varpd interfaces
+ *
+ * The three handle types below are only forward-declared here, so they are
+ * opaque to consumers of this header. The prototypes that follow are grouped,
+ * in order, as: handle creation and destruction, persistence control,
+ * instance management, plugin loading and walking, property access, the door
+ * server, and the overlay lookup routines.
+ */
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <sys/mac.h>
+#include <libvarpd_client.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct __varpd_handle varpd_handle_t;
+typedef struct __varpd_prop_handle varpd_prop_handle_t;
+typedef struct __varpd_instance_handle varpd_instance_handle_t;
+
+extern int libvarpd_create(varpd_handle_t **);
+extern void libvarpd_destroy(varpd_handle_t *);
+
+extern int libvarpd_persist_enable(varpd_handle_t *, const char *);
+extern int libvarpd_persist_restore(varpd_handle_t *);
+extern int libvarpd_persist_disable(varpd_handle_t *);
+
+extern int libvarpd_instance_create(varpd_handle_t *, datalink_id_t,
+ const char *, varpd_instance_handle_t **);
+extern uint64_t libvarpd_instance_id(varpd_instance_handle_t *);
+extern varpd_instance_handle_t *libvarpd_instance_lookup(varpd_handle_t *,
+ uint64_t);
+extern void libvarpd_instance_destroy(varpd_instance_handle_t *);
+extern int libvarpd_instance_activate(varpd_instance_handle_t *);
+
+extern int libvarpd_plugin_load(varpd_handle_t *, const char *);
+typedef int (*libvarpd_plugin_walk_f)(varpd_handle_t *, const char *, void *);
+extern int libvarpd_plugin_walk(varpd_handle_t *, libvarpd_plugin_walk_f,
+ void *);
+
+extern int libvarpd_prop_handle_alloc(varpd_handle_t *,
+ varpd_instance_handle_t *, varpd_prop_handle_t **);
+extern void libvarpd_prop_handle_free(varpd_prop_handle_t *);
+extern int libvarpd_prop_nprops(varpd_instance_handle_t *, uint_t *);
+extern int libvarpd_prop_info_fill(varpd_prop_handle_t *, uint_t);
+extern int libvarpd_prop_info(varpd_prop_handle_t *, const char **, uint_t *,
+ uint_t *, const void **, uint32_t *, const mac_propval_range_t **);
+extern int libvarpd_prop_get(varpd_prop_handle_t *, void *, uint32_t *);
+extern int libvarpd_prop_set(varpd_prop_handle_t *, const void *, uint32_t);
+
+extern int libvarpd_door_server_create(varpd_handle_t *, const char *);
+extern void libvarpd_door_server_destroy(varpd_handle_t *);
+
+extern void *libvarpd_overlay_lookup_run(void *);
+extern void libvarpd_overlay_lookup_quiesce(varpd_handle_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBVARPD_H */
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c
new file mode 100644
index 0000000000..7180fcb2de
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_arp.c
@@ -0,0 +1,649 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Common routines for implementing proxy arp
+ */
+
+#include <sys/types.h>
+#include <net/if.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/udp.h>
+#include <netinet/dhcp.h>
+#include <libvarpd_impl.h>
+#include <sys/vlan.h>
+#include <strings.h>
+#include <assert.h>
+
+#define IPV6_VERSION 6
+
+typedef struct varpd_arp_query {	/* state for one proxied ARP/NDP query */
+ int vaq_type;	/* AF_INET (ARP) or AF_INET6 (NDP) */
+ char vaq_buf[ETHERMAX + VLAN_TAGSZ];	/* packet fetched from the overlay */
+ size_t vaq_bsize;	/* in: sizeof (vaq_buf); presumably out: bytes read */
+ uint8_t vaq_lookup[ETHERADDRL];	/* MAC answer buffer given to vpo_arp */
+ struct sockaddr_storage vaq_sock;	/* address the plugin must resolve */
+ varpd_instance_t *vaq_inst;	/* instance the query arrived on */
+ struct ether_arp *vaq_ea;	/* ARP payload inside vaq_buf; NULL for NDP */
+ varpd_query_handle_t *vaq_query;	/* handle answered when we finish */
+ const overlay_targ_lookup_t *vaq_otl;	/* lookup request from the caller */
+ ip6_t *vaq_ip6;	/* IPv6 header inside vaq_buf (set for NDP only) */
+ nd_neighbor_solicit_t *vaq_ns;	/* solicitation in vaq_buf (NDP only) */
+} varpd_arp_query_t;
+
+typedef struct varpd_dhcp_query {	/* state for one proxied DHCP query */
+ char vdq_buf[ETHERMAX + VLAN_TAGSZ];	/* packet fetched from the overlay */
+ size_t vdq_bsize;	/* in: sizeof (vdq_buf); compared to hdr + pkt size */
+ uint8_t vdq_lookup[ETHERADDRL];	/* MAC answer buffer given to vpo_dhcp */
+ const overlay_targ_lookup_t *vdq_otl;	/* lookup request from the caller */
+ varpd_instance_t *vdq_inst;	/* instance the query arrived on */
+ varpd_query_handle_t *vdq_query;	/* handle answered when we finish */
+ struct ether_header *vdq_ether;	/* ethernet header at start of vdq_buf */
+} varpd_dhcp_query_t;
+
+/* The all-ones Ethernet broadcast address. */
+static const uint8_t libvarpd_arp_bcast[6] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/*
+ * Proxy an ARP request on behalf of a plugin.
+ *
+ * The packet described by otl is fetched from the overlay device and
+ * validated. If it is an Ethernet/IPv4 ARP request that is not a gratuitous
+ * ARP, the plugin's vpo_arp entry point is asked to resolve the target
+ * protocol address; the plugin later completes the query through
+ * libvarpd_plugin_arp_reply(). Anything that fails validation is answered
+ * with VARPD_LOOKUP_DROP.
+ *
+ * hdl is the provider handle (the instance), vqh the query to answer and otl
+ * the lookup request that triggered the query.
+ */
+void
+libvarpd_plugin_proxy_arp(varpd_provider_handle_t *hdl,
+    varpd_query_handle_t *vqh, const overlay_targ_lookup_t *otl)
+{
+	varpd_arp_query_t *vaq;
+	varpd_instance_t *inst = (varpd_instance_t *)hdl;
+	struct ether_arp *ea;
+	struct sockaddr_in *ip;
+
+	vaq = umem_alloc(sizeof (varpd_arp_query_t), UMEM_DEFAULT);
+	if (vaq == NULL) {
+		libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+		return;
+	}
+	vaq->vaq_bsize = sizeof (vaq->vaq_buf);
+
+	if (otl->otl_sap != ETHERTYPE_ARP)
+		goto drop;
+
+	/*
+	 * An ARP packet should not be very large because it's defined to only
+	 * be allowed to have a single entry at a given time. But our data must
+	 * be at least as large as an ether_arp and our header must be at least
+	 * as large as a standard ethernet header.
+	 */
+	if (otl->otl_hdrsize + otl->otl_pktsize > vaq->vaq_bsize ||
+	    otl->otl_pktsize < sizeof (struct ether_arp) ||
+	    otl->otl_hdrsize < sizeof (struct ether_header))
+		goto drop;
+
+	if (libvarpd_overlay_packet(inst->vri_impl, otl, vaq->vaq_buf,
+	    &vaq->vaq_bsize) != 0)
+		goto drop;
+
+	/*
+	 * Everything below reads the packet at offsets derived from otl, so
+	 * insist that what we got back is exactly the size we were told about.
+	 * This matches the check made on the DHCP path.
+	 */
+	if (vaq->vaq_bsize != otl->otl_hdrsize + otl->otl_pktsize)
+		goto drop;
+
+	ea = (void *)((uintptr_t)vaq->vaq_buf + (uintptr_t)otl->otl_hdrsize);
+
+	/*
+	 * Make sure it matches something that we know about.
+	 */
+	if (ntohs(ea->ea_hdr.ar_hrd) != ARPHRD_ETHER ||
+	    ntohs(ea->ea_hdr.ar_pro) != ETHERTYPE_IP ||
+	    ea->ea_hdr.ar_hln != ETHERADDRL ||
+	    ea->ea_hdr.ar_pln != sizeof (ea->arp_spa) ||
+	    ntohs(ea->ea_hdr.ar_op) != ARPOP_REQUEST)
+		goto drop;
+
+	/*
+	 * Now that we've verified that our data is sane, see if we're doing a
+	 * gratuitous arp and if so, drop it. Otherwise, we may end up
+	 * triggering duplicate address detection.
+	 */
+	if (bcmp(ea->arp_spa, ea->arp_tpa, sizeof (ea->arp_spa)) == 0)
+		goto drop;
+
+	bzero(&vaq->vaq_sock, sizeof (struct sockaddr_storage));
+	ip = (struct sockaddr_in *)&vaq->vaq_sock;
+	ip->sin_family = AF_INET;
+	bcopy(ea->arp_tpa, &ip->sin_addr, sizeof (ea->arp_tpa));
+
+	vaq->vaq_type = AF_INET;
+	vaq->vaq_inst = inst;
+	vaq->vaq_ea = ea;
+	vaq->vaq_query = vqh;
+	vaq->vaq_otl = otl;
+
+	if (inst->vri_plugin->vpp_ops->vpo_arp == NULL)
+		libvarpd_panic("%s plugin asked to do arp, but has no method",
+		    inst->vri_plugin->vpp_name);
+
+	/* Ownership of vaq passes to the plugin until it replies. */
+	inst->vri_plugin->vpp_ops->vpo_arp(inst->vri_private,
+	    (varpd_arp_handle_t *)vaq, VARPD_QTYPE_ETHERNET,
+	    (struct sockaddr *)ip, vaq->vaq_lookup);
+	return;
+
+drop:
+	libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+	umem_free(vaq, sizeof (varpd_arp_query_t));
+}
+
+/*
+ * Finish a proxied ARP query for which the plugin produced an answer in
+ * vaq_lookup: rewrite the saved request into a reply, inject it back into the
+ * overlay, answer the original query with VARPD_LOOKUP_DROP and free vaq.
+ */
+static void
+libvarpd_proxy_arp_fini(varpd_arp_query_t *vaq)
+{
+	struct ether_arp *arp = vaq->vaq_ea;
+	struct ether_header *eh = (struct ether_header *)vaq->vaq_buf;
+	struct sockaddr_in *scratch = (struct sockaddr_in *)&vaq->vaq_sock;
+
+	/*
+	 * The request is edited in place. The requester's hardware address
+	 * becomes the target, and the looked-up address becomes the sender.
+	 */
+	arp->ea_hdr.ar_op = htons(ARPOP_REPLY);
+	bcopy(arp->arp_sha, arp->arp_tha, ETHERADDRL);
+	bcopy(vaq->vaq_lookup, arp->arp_sha, ETHERADDRL);
+
+	/*
+	 * Exchange the two protocol addresses, using the sockaddr that held
+	 * the lookup target as scratch space.
+	 */
+	bcopy(arp->arp_spa, &scratch->sin_addr, sizeof (arp->arp_spa));
+	bcopy(arp->arp_tpa, arp->arp_spa, sizeof (arp->arp_spa));
+	bcopy(&scratch->sin_addr, arp->arp_tpa, sizeof (arp->arp_spa));
+
+	/*
+	 * Address the frame straight back to the original sender, sourced
+	 * from the looked-up MAC address.
+	 */
+	bcopy(&eh->ether_shost, &eh->ether_dhost, ETHERADDRL);
+	bcopy(vaq->vaq_lookup, &eh->ether_shost, ETHERADDRL);
+
+	(void) libvarpd_overlay_inject(vaq->vaq_inst->vri_impl, vaq->vaq_otl,
+	    vaq->vaq_buf, vaq->vaq_bsize);
+
+	libvarpd_plugin_query_reply(vaq->vaq_query, VARPD_LOOKUP_DROP);
+	umem_free(vaq, sizeof (varpd_arp_query_t));
+}
+
+/*
+ * Compute the folded (not yet complemented) 16-bit one's complement sum for
+ * an ICMPv6 message: the pseudo-header terms (source, destination, length and
+ * next header) followed by mlen bytes of message at buf. Words are summed as
+ * they sit in memory, so the result is in network byte order. mlen must be
+ * even. The caller complements the result.
+ */
+static uint16_t
+libvarpd_icmpv6_checksum(const ip6_t *v6hdr, const uint16_t *buf, uint16_t mlen)
+{
+	const uint16_t *w;
+	size_t off;
+	uint32_t acc = 0;
+
+	assert(mlen % 2 == 0);
+
+	/* Pseudo-header: source and destination addresses. */
+	w = (const uint16_t *)&v6hdr->ip6_src;
+	for (off = 0; off < sizeof (struct in6_addr); off += 2)
+		acc += w[off / 2];
+	w = (const uint16_t *)&v6hdr->ip6_dst;
+	for (off = 0; off < sizeof (struct in6_addr); off += 2)
+		acc += w[off / 2];
+
+	/* Pseudo-header: upper-layer length and next header value. */
+	acc += htons(mlen);
+#ifdef _BIG_ENDIAN
+	acc += IPPROTO_ICMPV6;
+#else
+	acc += IPPROTO_ICMPV6 << 8;
+#endif /* _BIG_ENDIAN */
+
+	/* The ICMPv6 message itself. */
+	for (off = 0; off < mlen; off += 2)
+		acc += buf[off / 2];
+
+	/* Fold the carries back in until only 16 bits remain. */
+	while (acc > 0xffff)
+		acc = (acc & 0xffff) + (acc >> 16);
+
+	return ((uint16_t)acc);
+}
+
+/*
+ * Proxying NDP is much more involved than proxying ARP. For starters, NDP
+ * neighbor solicitations are implemented in terms of IPv6 ICMP as opposed to
+ * its own Ethertype. Therefore, we're going to have to grab a packet if it's a
+ * multicast packet and then determine if we actually want to do anything with
+ * it.
+ *
+ * hdl is the provider handle (the instance), vqh the query to answer and otl
+ * the lookup request that triggered the query. A packet that passes
+ * validation is handed to the plugin's vpo_arp entry point, which later
+ * completes the query through libvarpd_plugin_arp_reply(); everything else is
+ * answered with VARPD_LOOKUP_DROP.
+ */
+void
+libvarpd_plugin_proxy_ndp(varpd_provider_handle_t *hdl,
+    varpd_query_handle_t *vqh, const overlay_targ_lookup_t *otl)
+{
+	size_t bsize, plen, optlen;
+	varpd_arp_query_t *vaq;
+	ip6_t *v6hdr;
+	nd_neighbor_solicit_t *ns;
+	nd_opt_hdr_t *opt;
+	struct sockaddr_in6 *s6;
+
+	varpd_instance_t *inst = (varpd_instance_t *)hdl;
+	uint8_t *eth = NULL;
+
+	vaq = umem_alloc(sizeof (varpd_arp_query_t), UMEM_DEFAULT);
+	if (vaq == NULL) {
+		libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+		return;
+	}
+	vaq->vaq_bsize = sizeof (vaq->vaq_buf);
+
+	/* Only IPv6 multicast destinations (33:33:xx:xx:xx:xx) apply. */
+	if (otl->otl_dstaddr[0] != 0x33 ||
+	    otl->otl_dstaddr[1] != 0x33)
+		goto drop;
+
+	/*
+	 * If we have more than a standard frame size for the ICMP neighbor
+	 * solicitation, drop it. Similarly if there isn't enough data present
+	 * for us, drop it. The reply path rewrites the ethernet header, so
+	 * the header must be at least that large as well.
+	 */
+	if (otl->otl_hdrsize + otl->otl_pktsize > vaq->vaq_bsize ||
+	    otl->otl_pktsize < sizeof (ip6_t) +
+	    sizeof (nd_neighbor_solicit_t) ||
+	    otl->otl_hdrsize < sizeof (struct ether_header))
+		goto drop;
+
+	if (libvarpd_overlay_packet(inst->vri_impl, otl, vaq->vaq_buf,
+	    &vaq->vaq_bsize) != 0)
+		goto drop;
+
+	/*
+	 * The size is packet-derived data, so a short read is dropped rather
+	 * than asserted on; this also keeps the subtraction below from
+	 * wrapping.
+	 */
+	if (vaq->vaq_bsize < otl->otl_hdrsize + sizeof (ip6_t) +
+	    sizeof (nd_neighbor_solicit_t))
+		goto drop;
+
+	/* Bytes available after the link-layer and IPv6 headers. */
+	bsize = vaq->vaq_bsize - otl->otl_hdrsize - sizeof (ip6_t);
+
+	v6hdr = (ip6_t *)(vaq->vaq_buf + otl->otl_hdrsize);
+	if (((v6hdr->ip6_vfc & 0xf0) >> 4) != IPV6_VERSION)
+		goto drop;
+
+	if (v6hdr->ip6_nxt != IPPROTO_ICMPV6)
+		goto drop;
+
+	/*
+	 * In addition to getting these requests on the multicast address for
+	 * node solicitation, we may also end up getting them on a generic
+	 * multicast address due to timeouts or other choices by various OSes.
+	 * We should be fairly liberal and accept both, even though the
+	 * standard wants them sent to a solicitation address.
+	 */
+	if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&v6hdr->ip6_dst) &&
+	    !IN6_IS_ADDR_MC_LINKLOCAL(&v6hdr->ip6_dst))
+		goto drop;
+
+	/*
+	 * The advertised payload must fit in what we read and must be able to
+	 * hold a complete neighbor solicitation.
+	 */
+	plen = ntohs(v6hdr->ip6_plen);
+	if (plen > bsize || plen < sizeof (nd_neighbor_solicit_t))
+		goto drop;
+
+	/*
+	 * Now we know that this is an ICMPv6 request targeting the right
+	 * IPv6 multicast prefix. Let's go through and verify that ICMPv6
+	 * indicates that we have the real thing and ensure that per RFC 4861
+	 * the target address is not a multicast address. Further, because this
+	 * is a multicast on Ethernet, we must have a source link-layer address.
+	 *
+	 * We should probably enforce that we have a valid ICMP checksum at some
+	 * point.
+	 */
+	ns = (nd_neighbor_solicit_t *)(vaq->vaq_buf + otl->otl_hdrsize +
+	    sizeof (ip6_t));
+	if (ns->nd_ns_type != ND_NEIGHBOR_SOLICIT || ns->nd_ns_code != 0)
+		goto drop;
+
+	if (IN6_IS_ADDR_MULTICAST(&ns->nd_ns_target) ||
+	    IN6_IS_ADDR_V4MAPPED(&ns->nd_ns_target) ||
+	    IN6_IS_ADDR_LOOPBACK(&ns->nd_ns_target))
+		goto drop;
+
+	/*
+	 * Walk the options. Option lengths are in units of eight bytes. An
+	 * option with no length, or one that claims to extend past the end of
+	 * the payload, is clearly bogus.
+	 */
+	plen -= sizeof (nd_neighbor_solicit_t);
+	opt = (nd_opt_hdr_t *)(ns + 1);
+	while (plen >= sizeof (struct nd_opt_hdr)) {
+		optlen = (size_t)opt->nd_opt_len * 8;
+		if (optlen == 0 || optlen > plen)
+			goto drop;
+
+		if (opt->nd_opt_type == ND_OPT_SOURCE_LINKADDR) {
+			eth = (uint8_t *)((uintptr_t)opt +
+			    sizeof (nd_opt_hdr_t));
+		}
+		plen -= optlen;
+		opt = (nd_opt_hdr_t *)((uintptr_t)opt + optlen);
+	}
+
+	if (eth == NULL)
+		goto drop;
+
+	bzero(&vaq->vaq_sock, sizeof (struct sockaddr_storage));
+	s6 = (struct sockaddr_in6 *)&vaq->vaq_sock;
+	s6->sin6_family = AF_INET6;
+	bcopy(&ns->nd_ns_target, &s6->sin6_addr, sizeof (s6->sin6_addr));
+
+	if (inst->vri_plugin->vpp_ops->vpo_arp == NULL)
+		libvarpd_panic("%s plugin asked to do arp, but has no method",
+		    inst->vri_plugin->vpp_name);
+
+	vaq->vaq_type = AF_INET6;
+	vaq->vaq_inst = inst;
+	vaq->vaq_ea = NULL;
+	vaq->vaq_query = vqh;
+	vaq->vaq_otl = otl;
+	vaq->vaq_ns = ns;
+	vaq->vaq_ip6 = v6hdr;
+
+	/* Ownership of vaq passes to the plugin until it replies. */
+	inst->vri_plugin->vpp_ops->vpo_arp(inst->vri_private,
+	    (varpd_arp_handle_t *)vaq, VARPD_QTYPE_ETHERNET,
+	    (struct sockaddr *)s6, vaq->vaq_lookup);
+	return;
+
+drop:
+	libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+	umem_free(vaq, sizeof (varpd_arp_query_t));
+}
+
+/*
+ * Finish a proxied NDP query for which the plugin produced an answer in
+ * vaq_lookup: build a solicited neighbor advertisement carrying a target
+ * link-layer address option, inject it into the overlay, answer the original
+ * query with VARPD_LOOKUP_DROP and free vaq.
+ */
+static void
+libvarpd_proxy_ndp_fini(varpd_arp_query_t *vaq)
+{
+	char resp[ETHERMAX + VLAN_TAGSZ];
+	struct ether_header *ether;
+	nd_neighbor_advert_t *na;
+	nd_opt_hdr_t *opt;
+	ip6_t *v6hdr;
+	size_t roff = 0;
+	const size_t hdrsize = vaq->vaq_otl->otl_hdrsize;
+
+	/*
+	 * The response is the original link-layer header, an IPv6 header, the
+	 * advertisement and one eight byte option. Make sure all of that fits
+	 * in our buffer before writing any of it; if not, there is nothing
+	 * sensible to send.
+	 */
+	if (hdrsize + sizeof (ip6_t) + sizeof (nd_neighbor_advert_t) +
+	    sizeof (nd_opt_hdr_t) + ETHERADDRL > sizeof (resp)) {
+		libvarpd_plugin_query_reply(vaq->vaq_query, VARPD_LOOKUP_DROP);
+		umem_free(vaq, sizeof (varpd_arp_query_t));
+		return;
+	}
+
+	/*
+	 * Now we need to assemble a neighbor advertisement as a response.
+	 * Unlike with arp, we opt to use a new packet just to make things a
+	 * bit simpler here.
+	 */
+	v6hdr = vaq->vaq_ip6;
+	bcopy(vaq->vaq_buf, resp, hdrsize);
+	ether = (struct ether_header *)resp;
+	bcopy(&ether->ether_shost, &ether->ether_dhost, ETHERADDRL);
+	bcopy(vaq->vaq_lookup, &ether->ether_shost, ETHERADDRL);
+	roff += hdrsize;
+
+	/*
+	 * Start from the request's IPv6 header. The reply goes to the
+	 * solicitation's source and is sourced from the solicited target.
+	 */
+	bcopy(v6hdr, resp + roff, sizeof (ip6_t));
+	v6hdr = (ip6_t *)(resp + roff);
+	bcopy(&v6hdr->ip6_src, &v6hdr->ip6_dst, sizeof (struct in6_addr));
+	bcopy(&vaq->vaq_ns->nd_ns_target, &v6hdr->ip6_src,
+	    sizeof (struct in6_addr));
+	roff += sizeof (ip6_t);
+
+	na = (nd_neighbor_advert_t *)(resp + roff);
+	na->nd_na_type = ND_NEIGHBOR_ADVERT;
+	na->nd_na_code = 0;
+	/*
+	 * RFC 4443 defines that we should set the checksum to zero before we
+	 * calculate it.
+	 */
+	na->nd_na_cksum = 0;
+	/*
+	 * Nota bene, the header <netinet/icmp6.h> has already transformed this
+	 * into the appropriate host order. Don't use htonl.
+	 */
+	na->nd_na_flags_reserved = ND_NA_FLAG_SOLICITED | ND_NA_FLAG_OVERRIDE;
+	bcopy(&vaq->vaq_ns->nd_ns_target, &na->nd_na_target,
+	    sizeof (struct in6_addr));
+	roff += sizeof (nd_neighbor_advert_t);
+
+	/* Target link-layer address option: one unit of eight bytes. */
+	opt = (nd_opt_hdr_t *)(resp + roff);
+	opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
+	opt->nd_opt_len = 1;
+	roff += sizeof (nd_opt_hdr_t);
+	bcopy(vaq->vaq_lookup, resp + roff, ETHERADDRL);
+	roff += ETHERADDRL;
+
+	/*
+	 * Now that we've filled in the packet, go back and compute the checksum
+	 * and fill in the IPv6 payload size.
+	 */
+	v6hdr->ip6_plen = htons(roff - sizeof (ip6_t) - hdrsize);
+	na->nd_na_cksum = ~libvarpd_icmpv6_checksum(v6hdr, (uint16_t *)na,
+	    ntohs(v6hdr->ip6_plen)) & 0xffff;
+
+	(void) libvarpd_overlay_inject(vaq->vaq_inst->vri_impl, vaq->vaq_otl,
+	    resp, roff);
+
+	libvarpd_plugin_query_reply(vaq->vaq_query, VARPD_LOOKUP_DROP);
+	umem_free(vaq, sizeof (varpd_arp_query_t));
+}
+
+/*
+ * Called by a plugin to complete a query it was handed through vpo_arp.
+ * VARPD_LOOKUP_DROP abandons the query; VARPD_LOOKUP_OK means vaq_lookup holds
+ * the answer and the ARP or NDP response is generated according to vaq_type.
+ * Any other action, a NULL handle or an unknown vaq_type is a plugin bug and
+ * results in a panic.
+ */
+void
+libvarpd_plugin_arp_reply(varpd_arp_handle_t *vah, int action)
+{
+	varpd_arp_query_t *query = (varpd_arp_query_t *)vah;
+
+	if (query == NULL)
+		libvarpd_panic("unknown plugin passed invalid "
+		    "varpd_arp_handle_t");
+
+	switch (action) {
+	case VARPD_LOOKUP_OK:
+		break;
+	case VARPD_LOOKUP_DROP:
+		libvarpd_plugin_query_reply(query->vaq_query,
+		    VARPD_LOOKUP_DROP);
+		umem_free(query, sizeof (varpd_arp_query_t));
+		return;
+	default:
+		libvarpd_panic("%s plugin returned invalid action %d",
+		    query->vaq_inst->vri_plugin->vpp_name, action);
+	}
+
+	if (query->vaq_type == AF_INET) {
+		libvarpd_proxy_arp_fini(query);
+	} else if (query->vaq_type == AF_INET6) {
+		libvarpd_proxy_ndp_fini(query);
+	} else {
+		libvarpd_panic("encountered unknown vaq_type: %d",
+		    query->vaq_type);
+	}
+}
+
+/*
+ * Proxy a DHCP request on behalf of a plugin.
+ *
+ * The packet described by otl is fetched from the overlay device and must be
+ * an IPv4 UDP datagram sent to the Ethernet broadcast address from the BOOTP
+ * client port to the BOOTP server port. If so, the plugin's vpo_dhcp entry
+ * point is asked for the MAC address to send it to; the plugin later
+ * completes the query through libvarpd_plugin_dhcp_reply(). Anything that
+ * fails validation is answered with VARPD_LOOKUP_DROP.
+ *
+ * hdl is the provider handle (the instance), vqh the query to answer and otl
+ * the lookup request that triggered the query.
+ */
+void
+libvarpd_plugin_proxy_dhcp(varpd_provider_handle_t *hdl,
+    varpd_query_handle_t *vqh, const overlay_targ_lookup_t *otl)
+{
+	varpd_dhcp_query_t *vdq;
+	struct ether_header *ether;
+	struct ip *ip;
+	struct udphdr *udp;
+	varpd_instance_t *inst = (varpd_instance_t *)hdl;
+
+	vdq = umem_alloc(sizeof (varpd_dhcp_query_t), UMEM_DEFAULT);
+	if (vdq == NULL) {
+		libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+		return;
+	}
+	vdq->vdq_bsize = sizeof (vdq->vdq_buf);
+
+	if (otl->otl_sap != ETHERTYPE_IP)
+		goto drop;
+
+	if (bcmp(otl->otl_dstaddr, libvarpd_arp_bcast, ETHERADDRL) != 0)
+		goto drop;
+
+	/*
+	 * The packet has to fit in our buffer, be large enough to hold the
+	 * IP, UDP and DHCP headers, and carry at least a standard ethernet
+	 * header.
+	 */
+	if (otl->otl_hdrsize + otl->otl_pktsize > vdq->vdq_bsize ||
+	    otl->otl_pktsize < sizeof (struct ip) + sizeof (struct udphdr) +
+	    sizeof (struct dhcp) ||
+	    otl->otl_hdrsize < sizeof (struct ether_header))
+		goto drop;
+
+	if (libvarpd_overlay_packet(inst->vri_impl, otl, vdq->vdq_buf,
+	    &vdq->vdq_bsize) != 0)
+		goto drop;
+
+	if (vdq->vdq_bsize != otl->otl_hdrsize + otl->otl_pktsize)
+		goto drop;
+
+	ether = (struct ether_header *)vdq->vdq_buf;
+	ip = (struct ip *)(vdq->vdq_buf + otl->otl_hdrsize);
+
+	/* Both must hold: it has to be IPv4 and it has to be UDP. */
+	if (ip->ip_v != IPVERSION || ip->ip_p != IPPROTO_UDP)
+		goto drop;
+
+	/*
+	 * ip_hl comes from the packet and locates the UDP header. Reject a
+	 * length shorter than the fixed IP header or one that would put the
+	 * UDP header past the end of the data we read.
+	 */
+	if (ip->ip_hl * 4 < sizeof (struct ip) ||
+	    otl->otl_hdrsize + ip->ip_hl * 4 + sizeof (struct udphdr) >
+	    vdq->vdq_bsize)
+		goto drop;
+
+	udp = (struct udphdr *)(vdq->vdq_buf + otl->otl_hdrsize +
+	    ip->ip_hl * 4);
+
+	if (ntohs(udp->uh_sport) != IPPORT_BOOTPC ||
+	    ntohs(udp->uh_dport) != IPPORT_BOOTPS)
+		goto drop;
+
+	vdq->vdq_ether = ether;
+	vdq->vdq_inst = inst;
+	vdq->vdq_query = vqh;
+	vdq->vdq_otl = otl;
+
+	if (inst->vri_plugin->vpp_ops->vpo_dhcp == NULL)
+		libvarpd_panic("%s plugin asked to do dhcp, but has no method",
+		    inst->vri_plugin->vpp_name);
+
+	/* Ownership of vdq passes to the plugin until it replies. */
+	inst->vri_plugin->vpp_ops->vpo_dhcp(inst->vri_private,
+	    (varpd_dhcp_handle_t *)vdq, VARPD_QTYPE_ETHERNET, otl,
+	    vdq->vdq_lookup);
+	return;
+
+drop:
+	libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP);
+	umem_free(vdq, sizeof (varpd_dhcp_query_t));
+}
+
+/*
+ * Called by a plugin to complete a query it was handed through vpo_dhcp.
+ * VARPD_LOOKUP_DROP abandons the query. VARPD_LOOKUP_OK means vdq_lookup holds
+ * the destination MAC address: the saved frame is re-addressed to it and
+ * resent through the overlay. Either way the original query is answered with
+ * VARPD_LOOKUP_DROP and vdq is freed. A NULL handle or any other action is a
+ * plugin bug and results in a panic.
+ */
+void
+libvarpd_plugin_dhcp_reply(varpd_dhcp_handle_t *vdh, int action)
+{
+	varpd_dhcp_query_t *query = (varpd_dhcp_query_t *)vdh;
+
+	if (query == NULL)
+		libvarpd_panic("unknown plugin passed invalid "
+		    "varpd_dhcp_handle_t");
+
+	if (action != VARPD_LOOKUP_DROP) {
+		if (action != VARPD_LOOKUP_OK)
+			libvarpd_panic("%s plugin returned invalid action %d",
+			    query->vdq_inst->vri_plugin->vpp_name, action);
+
+		/* Point the frame at the answer and send it on its way. */
+		bcopy(query->vdq_lookup, &query->vdq_ether->ether_dhost,
+		    ETHERADDRL);
+		(void) libvarpd_overlay_resend(query->vdq_inst->vri_impl,
+		    query->vdq_otl, query->vdq_buf, query->vdq_bsize);
+	}
+
+	libvarpd_plugin_query_reply(query->vdq_query, VARPD_LOOKUP_DROP);
+	umem_free(query, sizeof (varpd_dhcp_query_t));
+}
+
+/*
+ * Build a gratuitous ARP for srcip/srcmac and inject it on the instance,
+ * addressed to dstmac. The sender and target protocol addresses are both
+ * srcip and the ARP target hardware address is the broadcast address. A
+ * non-zero vlan selects a VLAN-tagged ethernet header.
+ */
+void
+libvarpd_inject_arp(varpd_provider_handle_t *vph, const uint16_t vlan,
+    const uint8_t *srcmac, const struct in_addr *srcip, const uint8_t *dstmac)
+{
+	char pkt[500];
+	size_t len;
+	struct ether_arp *arp;
+	varpd_instance_t *instp = (varpd_instance_t *)vph;
+
+	/* Link-layer header: untagged unless a VLAN id was supplied. */
+	if (vlan == 0) {
+		struct ether_header *eh = (struct ether_header *)pkt;
+
+		len = sizeof (struct ether_header);
+		bcopy(dstmac, &eh->ether_dhost, ETHERADDRL);
+		bcopy(srcmac, &eh->ether_shost, ETHERADDRL);
+		eh->ether_type = htons(ETHERTYPE_ARP);
+	} else {
+		struct ether_vlan_header *evh =
+		    (struct ether_vlan_header *)pkt;
+
+		len = sizeof (struct ether_vlan_header);
+		bcopy(dstmac, &evh->ether_dhost, ETHERADDRL);
+		bcopy(srcmac, &evh->ether_shost, ETHERADDRL);
+		evh->ether_tpid = htons(ETHERTYPE_VLAN);
+		evh->ether_tci = htons(VLAN_TCI(0, ETHER_CFI, vlan));
+		evh->ether_type = htons(ETHERTYPE_ARP);
+	}
+
+	/* The ARP body follows the link-layer header directly. */
+	arp = (struct ether_arp *)(pkt + len);
+	len += sizeof (struct ether_arp);
+
+	arp->ea_hdr.ar_hrd = htons(ARPHRD_ETHER);
+	arp->ea_hdr.ar_pro = htons(ETHERTYPE_IP);
+	arp->ea_hdr.ar_hln = ETHERADDRL;
+	arp->ea_hdr.ar_pln = sizeof (struct in_addr);
+	arp->ea_hdr.ar_op = htons(ARPOP_REQUEST);
+
+	bcopy(srcmac, arp->arp_sha, ETHERADDRL);
+	bcopy(srcip, arp->arp_spa, sizeof (struct in_addr));
+	bcopy(libvarpd_arp_bcast, arp->arp_tha, ETHERADDRL);
+	bcopy(srcip, arp->arp_tpa, sizeof (struct in_addr));
+
+	(void) libvarpd_overlay_instance_inject(instp, pkt, len);
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c
new file mode 100644
index 0000000000..1254c14e19
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c
@@ -0,0 +1,626 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * varpd client interfaces
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <umem.h>
+#include <unistd.h>
+#include <string.h>
+#include <strings.h>
+#include <door.h>
+
+#include <libvarpd_impl.h>
+
+typedef struct varpd_client {	/* state behind a varpd_client_handle_t */
+ int vcl_doorfd;	/* fd from open() of the door path; used for door_call() */
+} varpd_client_t;
+
+typedef struct varpd_client_prop_info {	/* behind varpd_client_prop_handle_t */
+ varpd_client_t *vcprop_client;	/* client used for the door calls */
+ uint64_t vcprop_instance;	/* instance id the property belongs to */
+ uint_t vcprop_propid;	/* property id; UINT_MAX until info is filled */
+ uint_t vcprop_type;	/* copied from the door reply (vcfa_type) */
+ uint_t vcprop_prot;	/* copied from the door reply (vcfa_prot) */
+ uint32_t vcprop_defsize;	/* copied from the door reply (vcfa_defsize) */
+ uint32_t vcprop_psize;	/* copied from the door reply (vcfa_psize) */
+ char vcprop_name[LIBVARPD_PROP_NAMELEN];	/* property name */
+ uint8_t vcprop_default[LIBVARPD_PROP_SIZEMAX];	/* default value bytes */
+ uint8_t vcprop_poss[LIBVARPD_PROP_SIZEMAX];	/* read as mac_propval_range_t */
+} varpd_client_prop_info_t;
+
+/*
+ * Issue a door call to varpd using argp as both the request and the reply
+ * buffer. altsize, when non-zero, overrides the default transfer size of
+ * sizeof (varpd_client_arg_t). The call is retried while it is interrupted.
+ *
+ * Returns 0 on success, otherwise the errno from door_call(). Errors that
+ * can only come from a programming mistake cause a panic.
+ */
+static int
+libvarpd_c_door_call(varpd_client_t *client, varpd_client_arg_t *argp,
+    size_t altsize)
+{
+	int rv;
+	door_arg_t da;
+	size_t xfer = (altsize != 0) ? altsize : sizeof (varpd_client_arg_t);
+
+	da.data_ptr = (char *)argp;
+	da.data_size = xfer;
+	da.desc_ptr = NULL;
+	da.desc_num = 0;
+	da.rbuf = (char *)argp;
+	da.rsize = xfer;
+
+	while ((rv = door_call(client->vcl_doorfd, &da)) != 0 &&
+	    errno == EINTR)
+		continue;
+
+	if (rv == 0)
+		return (0);
+
+	switch (errno) {
+	case E2BIG:
+	case EFAULT:
+	case EINVAL:
+	case ENOTSUP:
+	case EOVERFLOW:
+	case ENFILE:
+		libvarpd_panic("unhandleable errno from door_call: %d",
+		    errno);
+	}
+
+	return (errno);
+}
+
+/*
+ * Create a client handle by opening the door at doorname. On success the new
+ * handle is stored in *chpp and 0 is returned. Otherwise ENOMEM or the errno
+ * from open() is returned and *chpp is left untouched.
+ */
+int
+libvarpd_c_create(varpd_client_handle_t **chpp, const char *doorname)
+{
+	int fd;
+	varpd_client_t *clnt;
+
+	if ((clnt = umem_alloc(sizeof (varpd_client_t), UMEM_DEFAULT)) == NULL)
+		return (ENOMEM);
+
+	if ((fd = open(doorname, O_RDWR)) < 0) {
+		/* Save errno before the free has a chance to disturb it. */
+		int err = errno;
+
+		clnt->vcl_doorfd = fd;
+		umem_free(clnt, sizeof (varpd_client_t));
+		return (err);
+	}
+
+	clnt->vcl_doorfd = fd;
+	*chpp = (varpd_client_handle_t *)clnt;
+	return (0);
+}
+
+/*
+ * Tear down a client handle: close its door descriptor and free it. A failed
+ * close() results in a panic.
+ */
+void
+libvarpd_c_destroy(varpd_client_handle_t *chp)
+{
+	varpd_client_t *clnt = (varpd_client_t *)chp;
+	int rv = close(clnt->vcl_doorfd);
+
+	if (rv != 0) {
+		libvarpd_panic("failed to close door fd %d: %d",
+		    clnt->vcl_doorfd, errno);
+	}
+
+	umem_free(clnt, sizeof (varpd_client_t));
+}
+
+/*
+ * Ask varpd to create an instance for datalink linkid using the plugin named
+ * by search. On success the new instance id is stored in *cidp.
+ *
+ * Returns 0 on success, EINVAL if search does not fit in
+ * LIBVARPD_PROP_NAMELEN bytes, an errno from the door call, or the error
+ * reported by varpd.
+ */
+int
+libvarpd_c_instance_create(varpd_client_handle_t *chp, datalink_id_t linkid,
+    const char *search, uint64_t *cidp)
+{
+	int ret;
+	varpd_client_t *client = (varpd_client_t *)chp;
+	varpd_client_arg_t carg;
+	varpd_client_create_arg_t *cap = &carg.vca_un.vca_create;
+
+	if (strlen(search) >= LIBVARPD_PROP_NAMELEN)
+		return (EINVAL);
+
+	/*
+	 * The whole structure is sent across the door, so zero it first
+	 * rather than handing uninitialized stack contents to the server.
+	 */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_CREATE;
+	carg.vca_errno = 0;
+	cap->vcca_linkid = linkid;
+	(void) strlcpy(cap->vcca_plugin, search, LIBVARPD_PROP_NAMELEN);
+
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	*cidp = cap->vcca_id;
+
+	return (0);
+}
+
+/*
+ * Ask varpd to activate the instance identified by cid.
+ *
+ * Returns 0 on success, an errno from the door call, or the error reported
+ * by varpd.
+ */
+int
+libvarpd_c_instance_activate(varpd_client_handle_t *chp, uint64_t cid)
+{
+	int ret;
+	varpd_client_t *client = (varpd_client_t *)chp;
+	varpd_client_arg_t carg;
+	varpd_client_instance_arg_t *vciap = &carg.vca_un.vca_instance;
+
+	/*
+	 * The whole structure is sent across the door, so zero it first
+	 * rather than handing uninitialized stack contents to the server.
+	 */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_ACTIVATE;
+	carg.vca_errno = 0;
+	vciap->vcia_id = cid;
+
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	return (0);
+}
+
+/*
+ * Ask varpd to destroy the instance identified by cid.
+ *
+ * Returns 0 on success, an errno from the door call, or the error reported
+ * by varpd.
+ */
+int
+libvarpd_c_instance_destroy(varpd_client_handle_t *chp, uint64_t cid)
+{
+	int ret;
+	varpd_client_t *client = (varpd_client_t *)chp;
+	varpd_client_arg_t carg;
+	varpd_client_instance_arg_t *vciap = &carg.vca_un.vca_instance;
+
+	/*
+	 * The whole structure is sent across the door, so zero it first
+	 * rather than handing uninitialized stack contents to the server.
+	 */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_DESTROY;
+	carg.vca_errno = 0;
+	vciap->vcia_id = cid;
+
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	return (0);
+}
+
+/*
+ * Ask varpd how many properties the instance identified by cid has and store
+ * the answer in *nprops.
+ *
+ * Returns 0 on success, an errno from the door call, or the error reported
+ * by varpd.
+ */
+int
+libvarpd_c_prop_nprops(varpd_client_handle_t *chp, uint64_t cid, uint_t *nprops)
+{
+	int ret;
+	varpd_client_t *client = (varpd_client_t *)chp;
+	varpd_client_arg_t carg;
+	varpd_client_nprops_arg_t *vcnap = &carg.vca_un.vca_nprops;
+
+	/*
+	 * The whole structure is sent across the door, so zero it first
+	 * rather than handing uninitialized stack contents to the server.
+	 */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_NPROPS;
+	carg.vca_errno = 0;
+	vcnap->vcna_id = cid;
+	vcnap->vcna_nprops = 0;
+
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+	*nprops = vcnap->vcna_nprops;
+	return (0);
+}
+
+/*
+ * Allocate a property handle bound to client chp and instance cid. The handle
+ * starts zeroed with its property id set to UINT_MAX, which the other
+ * property routines treat as "no property information loaded yet".
+ *
+ * Returns 0 and stores the handle in *phdlp, or ENOMEM.
+ */
+int
+libvarpd_c_prop_handle_alloc(varpd_client_handle_t *chp, uint64_t cid,
+    varpd_client_prop_handle_t **phdlp)
+{
+	varpd_client_prop_info_t *pinfo;
+
+	pinfo = umem_alloc(sizeof (varpd_client_prop_info_t), UMEM_DEFAULT);
+	if (pinfo == NULL)
+		return (ENOMEM);
+	bzero(pinfo, sizeof (varpd_client_prop_info_t));
+
+	pinfo->vcprop_propid = UINT_MAX;
+	pinfo->vcprop_instance = cid;
+	pinfo->vcprop_client = (varpd_client_t *)chp;
+
+	*phdlp = (varpd_client_prop_handle_t *)pinfo;
+	return (0);
+}
+
+/*
+ * Free a property handle obtained from libvarpd_c_prop_handle_alloc(). The
+ * handle is passed by value, so the caller's pointer is left dangling and
+ * must not be used again.
+ */
+void
+libvarpd_c_prop_handle_free(varpd_client_prop_handle_t *phdl)
+{
+	umem_free(phdl, sizeof (varpd_client_prop_info_t));
+}
+
+/*
+ * Copy the property description returned by varpd in a door reply into the
+ * client-side property handle.
+ */
+static void
+libvarpd_c_prop_info_from_door(varpd_client_prop_info_t *infop,
+    const varpd_client_propinfo_arg_t *vcfap)
+{
+	/* Fixed-size buffers are copied whole. */
+	bcopy(vcfap->vcfa_name, infop->vcprop_name, LIBVARPD_PROP_NAMELEN);
+	bcopy(vcfap->vcfa_default, infop->vcprop_default,
+	    LIBVARPD_PROP_SIZEMAX);
+	bcopy(vcfap->vcfa_poss, infop->vcprop_poss, LIBVARPD_PROP_SIZEMAX);
+
+	/* Scalar attributes. */
+	infop->vcprop_psize = vcfap->vcfa_psize;
+	infop->vcprop_defsize = vcfap->vcfa_defsize;
+	infop->vcprop_prot = vcfap->vcfa_prot;
+	infop->vcprop_type = vcfap->vcfa_type;
+	infop->vcprop_propid = vcfap->vcfa_propid;
+}
+
+/*
+ * Load the description of the property called name into phdl. The request
+ * carries a property id of UINT_MAX together with the name.
+ *
+ * Returns 0 on success, EINVAL if name does not fit in
+ * LIBVARPD_PROP_NAMELEN bytes, an errno from the door call, or the error
+ * reported by varpd.
+ */
+int
+libvarpd_c_prop_info_fill_by_name(varpd_client_prop_handle_t *phdl,
+    const char *name)
+{
+	int err;
+	varpd_client_arg_t req;
+	varpd_client_prop_info_t *pinfo = (varpd_client_prop_info_t *)phdl;
+	varpd_client_propinfo_arg_t *iarg = &req.vca_un.vca_info;
+
+	if (strlen(name) >= LIBVARPD_PROP_NAMELEN)
+		return (EINVAL);
+
+	bzero(&req, sizeof (varpd_client_arg_t));
+	req.vca_command = VARPD_CLIENT_PROPINFO;
+	req.vca_errno = 0;
+	iarg->vcfa_id = pinfo->vcprop_instance;
+	iarg->vcfa_propid = UINT_MAX;
+	(void) strlcpy(iarg->vcfa_name, name, LIBVARPD_PROP_NAMELEN);
+
+	if ((err = libvarpd_c_door_call(pinfo->vcprop_client, &req, 0)) != 0)
+		return (err);
+	if (req.vca_errno != 0)
+		return (req.vca_errno);
+
+	libvarpd_c_prop_info_from_door(pinfo, iarg);
+	return (0);
+}
+
+/*
+ * Load the description of the property numbered propid into phdl.
+ *
+ * Returns 0 on success, an errno from the door call, or the error reported
+ * by varpd.
+ */
+int
+libvarpd_c_prop_info_fill(varpd_client_prop_handle_t *phdl, uint_t propid)
+{
+	int err;
+	varpd_client_arg_t req;
+	varpd_client_prop_info_t *pinfo = (varpd_client_prop_info_t *)phdl;
+	varpd_client_propinfo_arg_t *iarg = &req.vca_un.vca_info;
+
+	bzero(&req, sizeof (varpd_client_arg_t));
+	req.vca_command = VARPD_CLIENT_PROPINFO;
+	req.vca_errno = 0;
+	iarg->vcfa_propid = propid;
+	iarg->vcfa_id = pinfo->vcprop_instance;
+
+	if ((err = libvarpd_c_door_call(pinfo->vcprop_client, &req, 0)) != 0)
+		return (err);
+	if (req.vca_errno != 0)
+		return (req.vca_errno);
+
+	libvarpd_c_prop_info_from_door(pinfo, iarg);
+	return (0);
+}
+
+/*
+ * Hand back the property description previously loaded into phdl. Every
+ * output pointer is optional; a NULL one is skipped. The name, default and
+ * possible-values outputs point into the handle itself.
+ *
+ * Returns 0, or EINVAL if no description has been loaded (the property id is
+ * still UINT_MAX).
+ */
+int
+libvarpd_c_prop_info(varpd_client_prop_handle_t *phdl, const char **namep,
+    uint_t *typep, uint_t *protp, const void **defp, uint32_t *defsizep,
+    const mac_propval_range_t **possp)
+{
+	varpd_client_prop_info_t *pinfo = (varpd_client_prop_info_t *)phdl;
+
+	if (pinfo->vcprop_propid == UINT_MAX)
+		return (EINVAL);
+
+	if (namep != NULL) {
+		*namep = pinfo->vcprop_name;
+	}
+	if (typep != NULL) {
+		*typep = pinfo->vcprop_type;
+	}
+	if (protp != NULL) {
+		*protp = pinfo->vcprop_prot;
+	}
+	if (defp != NULL) {
+		*defp = pinfo->vcprop_default;
+	}
+	if (defsizep != NULL) {
+		*defsizep = pinfo->vcprop_defsize;
+	}
+	if (possp != NULL) {
+		*possp = (const mac_propval_range_t *)pinfo->vcprop_poss;
+	}
+
+	return (0);
+}
+
+/*
+ * Fetch the current value of the property loaded in phdl into buf. On entry
+ * *len is the size of buf and must be at least LIBVARPD_PROP_SIZEMAX; on
+ * success it is set to the number of bytes stored.
+ *
+ * Returns 0 on success, EINVAL for a NULL argument or a handle with no
+ * property loaded, EOVERFLOW if buf is too small, E2BIG if varpd reported an
+ * impossible size, an errno from the door call, or the error reported by
+ * varpd.
+ */
+int
+libvarpd_c_prop_get(varpd_client_prop_handle_t *phdl, void *buf, uint32_t *len)
+{
+	int err;
+	varpd_client_arg_t req;
+	varpd_client_prop_info_t *pinfo = (varpd_client_prop_info_t *)phdl;
+	varpd_client_prop_arg_t *parg = &req.vca_un.vca_prop;
+
+	if (buf == NULL || len == NULL || pinfo->vcprop_propid == UINT_MAX)
+		return (EINVAL);
+	if (*len < LIBVARPD_PROP_SIZEMAX)
+		return (EOVERFLOW);
+
+	bzero(&req, sizeof (varpd_client_arg_t));
+	req.vca_command = VARPD_CLIENT_GETPROP;
+	req.vca_errno = 0;
+	parg->vcpa_propid = pinfo->vcprop_propid;
+	parg->vcpa_id = pinfo->vcprop_instance;
+
+	if ((err = libvarpd_c_door_call(pinfo->vcprop_client, &req, 0)) != 0)
+		return (err);
+	if (req.vca_errno != 0)
+		return (req.vca_errno);
+
+	/*
+	 * A reply larger than the maximum property size means varpd has gone
+	 * rogue. We don't expect this to happen in practice, and report it
+	 * with a deliberately unusual error.
+	 */
+	if (parg->vcpa_bufsize > LIBVARPD_PROP_SIZEMAX)
+		return (E2BIG);
+
+	*len = parg->vcpa_bufsize;
+	bcopy(parg->vcpa_buf, buf, parg->vcpa_bufsize);
+	return (0);
+}
+
+/*
+ * Set the property described by the handle to the len bytes found at buf.
+ *
+ * Returns EINVAL for a NULL or empty buffer or a handle whose property id is
+ * still UINT_MAX, EOVERFLOW if len exceeds LIBVARPD_PROP_SIZEMAX, the door
+ * call's error, or the error varpd reported in its reply.
+ */
+int
+libvarpd_c_prop_set(varpd_client_prop_handle_t *phdl, const void *buf,
+    uint32_t len)
+{
+	int ret;
+	varpd_client_arg_t carg;
+	varpd_client_prop_arg_t *vcpap = &carg.vca_un.vca_prop;
+	varpd_client_prop_info_t *infop = (varpd_client_prop_info_t *)phdl;
+
+	if (len == 0 || buf == NULL || infop->vcprop_propid == UINT_MAX)
+		return (EINVAL);
+	if (len > LIBVARPD_PROP_SIZEMAX)
+		return (EOVERFLOW);
+
+	/*
+	 * Zero the whole request, as libvarpd_c_prop_get() does, so that the
+	 * unused tail of vcpa_buf and any padding do not carry stale stack
+	 * contents across the door.
+	 */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_SETPROP;
+	carg.vca_errno = 0;
+	vcpap->vcpa_id = infop->vcprop_instance;
+	vcpap->vcpa_propid = infop->vcprop_propid;
+	vcpap->vcpa_bufsize = len;
+	bcopy(buf, vcpap->vcpa_buf, len);
+
+	ret = libvarpd_c_door_call(infop->vcprop_client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	return (0);
+}
+
+/*
+ * Look up the varpd instance id associated with the datalink linkid. When
+ * instp is not NULL it receives the id on success.
+ *
+ * Returns 0 on success, the door call's error, or the error varpd reported.
+ */
+int
+libvarpd_c_instance_lookup(varpd_client_handle_t *chp, datalink_id_t linkid,
+    uint64_t *instp)
+{
+	int ret;
+	varpd_client_arg_t carg;
+	varpd_client_lookup_arg_t *vclap = &carg.vca_un.vca_lookup;
+	varpd_client_t *client = (varpd_client_t *)chp;
+
+	/* Do not send uninitialized stack contents across the door. */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_LOOKUP;
+	carg.vca_errno = 0;
+	vclap->vcla_linkid = linkid;
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+	if (instp != NULL)
+		*instp = vclap->vcla_id;
+
+	return (0);
+}
+
+/*
+ * Retrieve the destination type and target mode of instance cid. Either
+ * output pointer may be NULL if the caller is not interested in that value.
+ *
+ * Returns 0 on success, the door call's error, or the error varpd reported.
+ */
+int
+libvarpd_c_instance_target_mode(varpd_client_handle_t *chp, uint64_t cid,
+    uint_t *dtype, uint_t *mtype)
+{
+	int ret;
+	varpd_client_arg_t carg;
+	varpd_client_target_mode_arg_t *vctmap = &carg.vca_un.vca_mode;
+	varpd_client_t *client = (varpd_client_t *)chp;
+
+	/* Do not send uninitialized stack contents across the door. */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_TARGET_MODE;
+	carg.vca_errno = 0;
+	vctmap->vtma_id = cid;
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	/* Both failure cases have returned above; the reply is valid here. */
+	if (mtype != NULL)
+		*mtype = vctmap->vtma_mode;
+	if (dtype != NULL)
+		*dtype = vctmap->vtma_dest;
+
+	return (0);
+}
+
+/*
+ * Ask varpd to flush the target cache of instance cid.
+ *
+ * Returns 0 on success, the door call's error, or the error varpd reported.
+ */
+int
+libvarpd_c_instance_cache_flush(varpd_client_handle_t *chp, uint64_t cid)
+{
+	int ret;
+	varpd_client_arg_t carg;
+	varpd_client_target_cache_arg_t *vctcap = &carg.vca_un.vca_cache;
+	varpd_client_t *client = (varpd_client_t *)chp;
+
+	/* Do not send uninitialized stack contents across the door. */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_CACHE_FLUSH;
+	carg.vca_errno = 0;
+
+	vctcap->vtca_id = cid;
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	return (0);
+}
+
+/*
+ * Ask varpd to remove the target cache entry keyed by the MAC address key
+ * from instance cid.
+ *
+ * Returns EINVAL if key is NULL, otherwise 0 on success, the door call's
+ * error, or the error varpd reported.
+ */
+int
+libvarpd_c_instance_cache_delete(varpd_client_handle_t *chp, uint64_t cid,
+    const struct ether_addr *key)
+{
+	int ret;
+	varpd_client_arg_t carg;
+	varpd_client_target_cache_arg_t *vctcap = &carg.vca_un.vca_cache;
+	varpd_client_t *client = (varpd_client_t *)chp;
+
+	if (key == NULL)
+		return (EINVAL);
+
+	/* Do not send uninitialized stack contents across the door. */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_CACHE_DELETE;
+	carg.vca_errno = 0;
+	vctcap->vtca_id = cid;
+	bcopy(key, vctcap->vtca_key, ETHERADDRL);
+
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	return (0);
+}
+
+/*
+ * Fetch the target cache entry keyed by the MAC address key from instance cid
+ * and copy it into *entry.
+ *
+ * Returns EINVAL if key or entry is NULL, otherwise 0 on success, the door
+ * call's error, or the error varpd reported.
+ */
+int
+libvarpd_c_instance_cache_get(varpd_client_handle_t *chp, uint64_t cid,
+    const struct ether_addr *key, varpd_client_cache_entry_t *entry)
+{
+	int ret;
+	varpd_client_arg_t carg;
+	varpd_client_target_cache_arg_t *vctcap = &carg.vca_un.vca_cache;
+	varpd_client_t *client = (varpd_client_t *)chp;
+
+	if (key == NULL || entry == NULL)
+		return (EINVAL);
+
+	/*
+	 * Zero the whole request rather than just vtca_entry so that padding
+	 * and the unused part of the union carry no stale stack contents.
+	 */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_CACHE_GET;
+	carg.vca_errno = 0;
+	vctcap->vtca_id = cid;
+	bcopy(key, vctcap->vtca_key, ETHERADDRL);
+
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	bcopy(&vctcap->vtca_entry, entry, sizeof (varpd_client_cache_entry_t));
+	return (0);
+}
+
+/*
+ * Install *entry as the target cache entry for the MAC address key in
+ * instance cid.
+ *
+ * Returns EINVAL if key or entry is NULL, otherwise 0 on success, the door
+ * call's error, or the error varpd reported.
+ */
+int
+libvarpd_c_instance_cache_set(varpd_client_handle_t *chp, uint64_t cid,
+    const struct ether_addr *key, const varpd_client_cache_entry_t *entry)
+{
+	int ret;
+	varpd_client_arg_t carg;
+	varpd_client_target_cache_arg_t *vctcap = &carg.vca_un.vca_cache;
+	varpd_client_t *client = (varpd_client_t *)chp;
+
+	if (key == NULL || entry == NULL)
+		return (EINVAL);
+
+	/* Do not send uninitialized stack contents across the door. */
+	bzero(&carg, sizeof (varpd_client_arg_t));
+	carg.vca_command = VARPD_CLIENT_CACHE_SET;
+	carg.vca_errno = 0;
+	vctcap->vtca_id = cid;
+	bcopy(key, vctcap->vtca_key, ETHERADDRL);
+	bcopy(entry, &vctcap->vtca_entry, sizeof (varpd_client_cache_entry_t));
+
+	ret = libvarpd_c_door_call(client, &carg, 0);
+	if (ret != 0)
+		return (ret);
+
+	if (carg.vca_errno != 0)
+		return (carg.vca_errno);
+
+	return (0);
+}
+
+/*
+ * Walk the target cache of instance cid, invoking func once per entry with
+ * the entry's MAC key, a varpd_client_cache_entry_t built from the reply, and
+ * the caller's arg.
+ *
+ * Entries are fetched from varpd in batches. The request buffer is reused
+ * between batches, so the vtcw_marker value varpd leaves in it is sent back on
+ * the next request. The walk ends when varpd returns an empty batch or when
+ * func returns non-zero; a non-zero callback return only stops the walk and is
+ * not reported as an error.
+ *
+ * Returns 0 on success, errno if the request buffer cannot be allocated, the
+ * door call's error, the error varpd reported, or E2BIG if varpd claims to
+ * have returned more entries than were asked for.
+ */
+int
+libvarpd_c_instance_cache_walk(varpd_client_handle_t *chp, uint64_t cid,
+    varpd_client_cache_f func, void *arg)
+{
+	int ret = 0;
+	/* Number of entries requested from varpd per door call. */
+	const size_t nents = 100;
+	/*
+	 * The trailing array holds overlay_targ_cache_entry_t elements, so
+	 * size the buffer with that type rather than the client entry type.
+	 */
+	size_t bufsize = sizeof (varpd_client_arg_t) +
+	    nents * sizeof (overlay_targ_cache_entry_t);
+	varpd_client_t *client = (varpd_client_t *)chp;
+	varpd_client_arg_t *cargp;
+	varpd_client_target_walk_arg_t *vctwap;
+
+	/*
+	 * Because the number of entries involved in a walk may be large, we
+	 * dynamically allocate a number of queries to make at a single time.
+	 * This also means that the average door request doesn't inflate by the
+	 * number of entries we want.
+	 */
+	cargp = umem_zalloc(bufsize, UMEM_DEFAULT);
+	if (cargp == NULL)
+		return (errno);
+	vctwap = &cargp->vca_un.vca_walk;
+	for (;;) {
+		uint64_t i;
+
+		cargp->vca_command = VARPD_CLIENT_CACHE_WALK;
+		cargp->vca_errno = 0;
+		vctwap->vtcw_id = cid;
+		vctwap->vtcw_count = nents;
+
+		ret = libvarpd_c_door_call(client, cargp, bufsize);
+		if (ret != 0)
+			break;
+
+		if (cargp->vca_errno != 0) {
+			ret = cargp->vca_errno;
+			break;
+		}
+
+		if (vctwap->vtcw_count == 0) {
+			ret = 0;
+			break;
+		}
+
+		/*
+		 * Never trust a count larger than what we made room for;
+		 * reading those entries would run off the end of the buffer.
+		 */
+		if (vctwap->vtcw_count > nents) {
+			ret = E2BIG;
+			break;
+		}
+
+		for (i = 0; i < vctwap->vtcw_count; i++) {
+			const overlay_targ_cache_entry_t *src =
+			    &vctwap->vtcw_ents[i];
+			varpd_client_cache_entry_t ent;
+
+			bzero(&ent, sizeof (ent));
+			ent.vcp_flags = src->otce_flags;
+			bcopy(src->otce_dest.otp_mac, &ent.vcp_mac,
+			    ETHERADDRL);
+			ent.vcp_ip = src->otce_dest.otp_ip;
+			ent.vcp_port = src->otce_dest.otp_port;
+			ret = func(chp, cid,
+			    (const struct ether_addr *)src->otce_mac,
+			    &ent, arg);
+			if (ret != 0) {
+				/* The callback asked us to stop early. */
+				ret = 0;
+				goto done;
+			}
+		}
+	}
+
+done:
+	umem_free(cargp, bufsize);
+	return (ret);
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h
new file mode 100644
index 0000000000..459711b385
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h
@@ -0,0 +1,92 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LIBVARPD_CLIENT_H
+#define _LIBVARPD_CLIENT_H
+
+/*
+ * varpd interfaces
+ */
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <sys/mac.h>
+#include <sys/overlay_target.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Handle types handed out by this library. The structure tags are not defined
+ * in this header, so consumers only ever hold pointers to them.
+ */
+typedef struct __varpd_client_handle varpd_client_handle_t;
+typedef struct __varpd_client_prop_handle varpd_client_prop_handle_t;
+
+/*
+ * A target cache entry as passed to and from the
+ * libvarpd_c_instance_cache_*() functions. The cache walk builds one of these
+ * from each overlay_targ_cache_entry_t that varpd returns.
+ */
+typedef struct varpd_client_cache_entry {
+ struct ether_addr vcp_mac; /* walk: copied from otce_dest.otp_mac */
+ uint16_t vcp_flags; /* walk: copied from otce_flags */
+ struct in6_addr vcp_ip; /* walk: copied from otce_dest.otp_ip */
+ uint16_t vcp_port; /* walk: copied from otce_dest.otp_port */
+} varpd_client_cache_entry_t;
+
+/*
+ * We just use the values from the kernel for now.
+ */
+#define LIBVARPD_PROP_SIZEMAX OVERLAY_PROP_SIZEMAX
+#define LIBVARPD_PROP_NAMELEN OVERLAY_PROP_NAMELEN
+
+/*
+ * Client handle and instance management.
+ */
+extern int libvarpd_c_create(varpd_client_handle_t **, const char *);
+extern void libvarpd_c_destroy(varpd_client_handle_t *);
+extern int libvarpd_c_instance_create(varpd_client_handle_t *, datalink_id_t,
+ const char *, uint64_t *);
+extern int libvarpd_c_instance_activate(varpd_client_handle_t *, uint64_t);
+extern int libvarpd_c_instance_destroy(varpd_client_handle_t *, uint64_t);
+
+/*
+ * Property handles. libvarpd_c_prop_info(), libvarpd_c_prop_get() and
+ * libvarpd_c_prop_set() fail with EINVAL while the handle's property id is
+ * still UINT_MAX. For libvarpd_c_prop_get() the uint32_t is in/out: it must be
+ * at least LIBVARPD_PROP_SIZEMAX on entry and holds the value's size on
+ * return.
+ */
+extern int libvarpd_c_prop_nprops(varpd_client_handle_t *, uint64_t, uint_t *);
+extern int libvarpd_c_prop_handle_alloc(varpd_client_handle_t *, uint64_t,
+ varpd_client_prop_handle_t **);
+extern void libvarpd_c_prop_handle_free(varpd_client_prop_handle_t *);
+extern int libvarpd_c_prop_info_fill(varpd_client_prop_handle_t *, uint_t);
+extern int libvarpd_c_prop_info_fill_by_name(varpd_client_prop_handle_t *,
+ const char *);
+extern int libvarpd_c_prop_info(varpd_client_prop_handle_t *, const char **,
+ uint_t *, uint_t *, const void **, uint32_t *,
+ const mac_propval_range_t **);
+extern int libvarpd_c_prop_get(varpd_client_prop_handle_t *, void *,
+ uint32_t *);
+extern int libvarpd_c_prop_set(varpd_client_prop_handle_t *, const void *,
+ uint32_t);
+
+/*
+ * Instance lookup and target cache manipulation. The cache functions take the
+ * instance id followed by the MAC address that keys the entry.
+ */
+extern int libvarpd_c_instance_lookup(varpd_client_handle_t *, datalink_id_t,
+ uint64_t *);
+extern int libvarpd_c_instance_target_mode(varpd_client_handle_t *, uint64_t,
+ uint_t *, uint_t *);
+extern int libvarpd_c_instance_cache_flush(varpd_client_handle_t *, uint64_t);
+extern int libvarpd_c_instance_cache_delete(varpd_client_handle_t *, uint64_t,
+ const struct ether_addr *);
+extern int libvarpd_c_instance_cache_get(varpd_client_handle_t *, uint64_t,
+ const struct ether_addr *, varpd_client_cache_entry_t *);
+extern int libvarpd_c_instance_cache_set(varpd_client_handle_t *, uint64_t,
+ const struct ether_addr *, const varpd_client_cache_entry_t *);
+
+/*
+ * Cache walk callback. It is called once per entry; returning non-zero stops
+ * the walk, and libvarpd_c_instance_cache_walk() still returns 0 in that case.
+ */
+typedef int (*varpd_client_cache_f)(varpd_client_handle_t *, uint64_t,
+ const struct ether_addr *, const varpd_client_cache_entry_t *, void *);
+extern int libvarpd_c_instance_cache_walk(varpd_client_handle_t *, uint64_t,
+ varpd_client_cache_f, void *);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBVARPD_CLIENT_H */
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c
new file mode 100644
index 0000000000..f684e031a8
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_door.c
@@ -0,0 +1,469 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * varpd door server logic
+ */
+
+#include <door.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <priv.h>
+#include <libvarpd_impl.h>
+
+/*
+ * Signature shared by every door command handler: the varpd handle, the
+ * client's request/response argument, and the caller's credentials. The
+ * handler's return value is stored in the reply's vca_errno when non-zero.
+ */
+typedef int (libvarpd_door_f)(varpd_impl_t *, varpd_client_arg_t *, ucred_t *);
+
+/*
+ * Decide whether the door caller may perform privileged operations: B_TRUE
+ * only if PRIV_SYS_NET_CONFIG is in the caller's effective privilege set.
+ * If the effective set cannot be obtained the caller is treated as
+ * unprivileged.
+ */
+static boolean_t
+libvarpd_door_privileged(ucred_t *credp)
+{
+	const priv_set_t *eset = ucred_getprivset(credp, PRIV_EFFECTIVE);
+
+	if (eset != NULL)
+		return (priv_ismember(eset, PRIV_SYS_NET_CONFIG));
+
+	return (B_FALSE);
+}
+
+/*
+ * VARPD_CLIENT_CREATE: create an instance for the requested datalink using
+ * the named plugin and, on success, return the new instance's id in vcca_id.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_create(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_create_arg_t *carg = &vcap->vca_un.vca_create;
+	varpd_instance_handle_t *inst;
+	int err;
+
+	/* The plugin name comes from the client; force NUL termination. */
+	carg->vcca_plugin[LIBVARPD_PROP_NAMELEN-1] = '\0';
+
+	err = libvarpd_instance_create((varpd_handle_t *)vip,
+	    carg->vcca_linkid, carg->vcca_plugin, &inst);
+	if (err != 0)
+		return (err);
+
+	carg->vcca_id = libvarpd_instance_id(inst);
+	return (0);
+}
+
+/*
+ * VARPD_CLIENT_ACTIVATE: activate the instance named by vcia_id. Returns
+ * ENOENT if no such instance exists, otherwise whatever
+ * libvarpd_instance_activate() returns.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_activate(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_instance_arg_t *iarg = &vcap->vca_un.vca_instance;
+	varpd_instance_handle_t *inst;
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip, iarg->vcia_id);
+	if (inst != NULL)
+		return (libvarpd_instance_activate(inst));
+
+	return (ENOENT);
+}
+
+/*
+ * VARPD_CLIENT_DESTROY: destroy the instance named by vcia_id. Returns ENOENT
+ * if no such instance exists and 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_destroy(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_instance_arg_t *iarg = &vcap->vca_un.vca_instance;
+	varpd_instance_handle_t *inst;
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip, iarg->vcia_id);
+	if (inst != NULL) {
+		libvarpd_instance_destroy(inst);
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * VARPD_CLIENT_NPROPS: store the number of properties of the instance named
+ * by vcna_id in vcna_nprops. Returns ENOENT if no such instance exists,
+ * otherwise whatever libvarpd_prop_nprops() returns.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_nprops(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_nprops_arg_t *narg = &vcap->vca_un.vca_nprops;
+	varpd_instance_handle_t *inst;
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip, narg->vcna_id);
+	if (inst != NULL)
+		return (libvarpd_prop_nprops(inst, &narg->vcna_nprops));
+
+	return (ENOENT);
+}
+
+/*
+ * VARPD_CLIENT_PROPINFO: describe one property of the instance named by
+ * vcfa_id.
+ *
+ * If vcfa_propid is not UINT_MAX it selects the property directly. Otherwise
+ * the property is located by comparing vcfa_name against the name of each of
+ * the instance's properties in turn, and vcfa_propid is updated to the index
+ * of the match. The description is written into the reply by
+ * libvarpd_prop_door_convert().
+ *
+ * Returns ENOENT if the instance or the named property does not exist, or the
+ * first error returned by the libvarpd_prop_*() calls.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_propinfo(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_propinfo_arg_t *infoarg = &vcap->vca_un.vca_info;
+	varpd_instance_handle_t *inst;
+	varpd_prop_handle_t *prop;
+	int err;
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip,
+	    infoarg->vcfa_id);
+	if (inst == NULL)
+		return (ENOENT);
+
+	err = libvarpd_prop_handle_alloc((varpd_handle_t *)vip, inst, &prop);
+	if (err != 0)
+		return (err);
+
+	if (infoarg->vcfa_propid != UINT_MAX) {
+		err = libvarpd_prop_info_fill(prop, infoarg->vcfa_propid);
+		if (err != 0)
+			goto out;
+	} else {
+		const char *curname;
+		uint_t idx, total;
+
+		/* The name comes from the client; force NUL termination. */
+		infoarg->vcfa_name[LIBVARPD_PROP_NAMELEN-1] = '\0';
+
+		err = libvarpd_prop_nprops(inst, &total);
+		if (err != 0)
+			goto out;
+
+		for (idx = 0; idx < total; idx++) {
+			err = libvarpd_prop_info_fill(prop, idx);
+			if (err != 0)
+				goto out;
+
+			err = libvarpd_prop_info(prop, &curname, NULL, NULL,
+			    NULL, NULL, NULL);
+			if (err != 0)
+				goto out;
+
+			if (strcmp(infoarg->vcfa_name, curname) == 0)
+				break;
+		}
+
+		/* Ran off the end without finding the requested name. */
+		if (idx == total) {
+			err = ENOENT;
+			goto out;
+		}
+		infoarg->vcfa_propid = idx;
+	}
+
+	/* err is necessarily zero on every path that reaches this point. */
+	libvarpd_prop_door_convert(prop, infoarg);
+
+out:
+	libvarpd_prop_handle_free(prop);
+	return (err);
+}
+
+/*
+ * VARPD_CLIENT_GETPROP: read the value of property vcpa_propid of the
+ * instance named by vcpa_id into vcpa_buf and record its length in
+ * vcpa_bufsize.
+ *
+ * Returns ENOENT if no such instance exists, otherwise the first error
+ * returned by the libvarpd_prop_*() calls, or 0 on success.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_getprop(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	int ret;
+	uint32_t size;
+	varpd_instance_handle_t *ihp;
+	varpd_prop_handle_t *phdl;
+	varpd_client_prop_arg_t *vcpap = &vcap->vca_un.vca_prop;
+
+	ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vcpap->vcpa_id);
+	if (ihp == NULL)
+		return (ENOENT);
+	ret = libvarpd_prop_handle_alloc((varpd_handle_t *)vip, ihp, &phdl);
+	if (ret != 0)
+		return (ret);
+
+	ret = libvarpd_prop_info_fill(phdl, vcpap->vcpa_propid);
+	if (ret != 0) {
+		libvarpd_prop_handle_free(phdl);
+		return (ret);
+	}
+
+	/*
+	 * The size is passed by reference, so give it a defined value: the
+	 * capacity of the buffer being handed over.
+	 * NOTE(review): this assumes libvarpd_prop_get() treats the size as
+	 * in/out, like the client-side libvarpd_c_prop_get() does -- confirm.
+	 */
+	size = sizeof (vcpap->vcpa_buf);
+	ret = libvarpd_prop_get(phdl, vcpap->vcpa_buf, &size);
+	if (ret == 0)
+		vcpap->vcpa_bufsize = size;
+	libvarpd_prop_handle_free(phdl);
+
+	/* Propagate a failed get instead of reporting success. */
+	return (ret);
+}
+
+/*
+ * VARPD_CLIENT_SETPROP: set property vcpa_propid of the instance named by
+ * vcpa_id to the first vcpa_bufsize bytes of vcpa_buf.
+ *
+ * Returns EOVERFLOW if the client-supplied size exceeds the buffer, ENOENT if
+ * no such instance exists, otherwise the first error returned by the
+ * libvarpd_prop_*() calls, or 0 on success.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_setprop(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	int ret;
+	varpd_instance_handle_t *ihp;
+	varpd_prop_handle_t *phdl;
+	varpd_client_prop_arg_t *vcpap = &vcap->vca_un.vca_prop;
+
+	/*
+	 * The size is filled in by the door client and cannot be trusted. It
+	 * must never describe more data than vcpa_buf can hold.
+	 */
+	if (vcpap->vcpa_bufsize > sizeof (vcpap->vcpa_buf))
+		return (EOVERFLOW);
+
+	ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vcpap->vcpa_id);
+	if (ihp == NULL)
+		return (ENOENT);
+	ret = libvarpd_prop_handle_alloc((varpd_handle_t *)vip, ihp, &phdl);
+	if (ret != 0)
+		return (ret);
+
+	ret = libvarpd_prop_info_fill(phdl, vcpap->vcpa_propid);
+	if (ret != 0) {
+		libvarpd_prop_handle_free(phdl);
+		return (ret);
+	}
+
+	ret = libvarpd_prop_set(phdl, vcpap->vcpa_buf, vcpap->vcpa_bufsize);
+	libvarpd_prop_handle_free(phdl);
+	return (ret);
+}
+
+/*
+ * VARPD_CLIENT_LOOKUP: find the instance associated with datalink
+ * vcla_linkid and return its id in vcla_id. Returns ENOENT if there is none.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_lookup(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_lookup_arg_t *larg = &vcap->vca_un.vca_lookup;
+	varpd_instance_t *found;
+
+	found = libvarpd_instance_lookup_by_dlid(vip, larg->vcla_linkid);
+	if (found != NULL) {
+		larg->vcla_id = found->vri_id;
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * VARPD_CLIENT_TARGET_MODE: report the destination type (vri_dest) and target
+ * mode (vri_mode) of the instance named by vtma_id. Returns ENOENT if no such
+ * instance exists.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_target(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_target_mode_arg_t *marg = &vcap->vca_un.vca_mode;
+	varpd_instance_handle_t *hdl;
+	const varpd_instance_t *inst;
+
+	hdl = libvarpd_instance_lookup((varpd_handle_t *)vip, marg->vtma_id);
+	if (hdl == NULL)
+		return (ENOENT);
+
+	/* The handle is the instance itself under an opaque type. */
+	inst = (const varpd_instance_t *)hdl;
+	marg->vtma_dest = inst->vri_dest;
+	marg->vtma_mode = inst->vri_mode;
+	return (0);
+}
+
+/*
+ * VARPD_CLIENT_CACHE_FLUSH: flush the target cache of the instance named by
+ * vtca_id. The caller must pass libvarpd_door_privileged(), otherwise EPERM
+ * is returned. Returns ENOENT if no such instance exists.
+ */
+static int
+libvarpd_door_f_flush(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_target_cache_arg_t *carg = &vcap->vca_un.vca_cache;
+	varpd_instance_handle_t *inst;
+
+	if (!libvarpd_door_privileged(credp))
+		return (EPERM);
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip, carg->vtca_id);
+	if (inst != NULL)
+		return (libvarpd_overlay_cache_flush((varpd_instance_t *)inst));
+
+	return (ENOENT);
+}
+
+/*
+ * VARPD_CLIENT_CACHE_DELETE: remove the entry keyed by vtca_key from the
+ * target cache of the instance named by vtca_id. The caller must pass
+ * libvarpd_door_privileged(), otherwise EPERM is returned. Returns ENOENT if
+ * no such instance exists.
+ */
+static int
+libvarpd_door_f_delete(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_target_cache_arg_t *carg = &vcap->vca_un.vca_cache;
+	varpd_instance_handle_t *inst;
+
+	if (!libvarpd_door_privileged(credp))
+		return (EPERM);
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip, carg->vtca_id);
+	if (inst != NULL) {
+		return (libvarpd_overlay_cache_delete((varpd_instance_t *)inst,
+		    carg->vtca_key));
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * VARPD_CLIENT_CACHE_GET: look up the entry keyed by vtca_key in the target
+ * cache of the instance named by vtca_id and return it in vtca_entry. Unlike
+ * the cache-modifying commands, no privilege check is made here. Returns
+ * ENOENT if no such instance exists.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_get(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_target_cache_arg_t *carg = &vcap->vca_un.vca_cache;
+	varpd_instance_handle_t *inst;
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip, carg->vtca_id);
+	if (inst != NULL) {
+		return (libvarpd_overlay_cache_get((varpd_instance_t *)inst,
+		    carg->vtca_key, &carg->vtca_entry));
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * VARPD_CLIENT_CACHE_SET: install vtca_entry under the key vtca_key in the
+ * target cache of the instance named by vtca_id. The caller must pass
+ * libvarpd_door_privileged(), otherwise EPERM is returned. Returns ENOENT if
+ * no such instance exists.
+ */
+static int
+libvarpd_door_f_set(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_target_cache_arg_t *carg = &vcap->vca_un.vca_cache;
+	varpd_instance_handle_t *inst;
+
+	if (!libvarpd_door_privileged(credp))
+		return (EPERM);
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip, carg->vtca_id);
+	if (inst != NULL) {
+		return (libvarpd_overlay_cache_set((varpd_instance_t *)inst,
+		    carg->vtca_key, &carg->vtca_entry));
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * VARPD_CLIENT_CACHE_WALK: fill vtcw_ents with target cache entries of the
+ * instance named by vtcw_id, passing the request's marker and count through
+ * to libvarpd_overlay_cache_walk_fill(). Returns ENOENT if no such instance
+ * exists.
+ *
+ * NOTE(review): vtcw_count is supplied by the client and this handler does
+ * not see the size of the door buffer, so nothing here checks that the
+ * requested number of entries actually fits in it -- confirm that this is
+ * validated elsewhere.
+ */
+/* ARGSUSED */
+static int
+libvarpd_door_f_walk(varpd_impl_t *vip, varpd_client_arg_t *vcap,
+    ucred_t *credp)
+{
+	varpd_client_target_walk_arg_t *warg = &vcap->vca_un.vca_walk;
+	varpd_instance_handle_t *inst;
+
+	inst = libvarpd_instance_lookup((varpd_handle_t *)vip, warg->vtcw_id);
+	if (inst != NULL) {
+		return (libvarpd_overlay_cache_walk_fill(
+		    (varpd_instance_t *)inst, &warg->vtcw_marker,
+		    &warg->vtcw_count, warg->vtcw_ents));
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Command dispatch table. libvarpd_door_server() indexes it with
+ * (vca_command - 1), so the entries must stay in the same order as the
+ * varpd_client_command_t values that follow VARPD_CLIENT_INVALID, and there
+ * must be one entry for every command below VARPD_CLIENT_MAX.
+ */
+static libvarpd_door_f *libvarpd_door_table[] = {
+ libvarpd_door_f_create,
+ libvarpd_door_f_activate,
+ libvarpd_door_f_destroy,
+ libvarpd_door_f_nprops,
+ libvarpd_door_f_propinfo,
+ libvarpd_door_f_getprop,
+ libvarpd_door_f_setprop,
+ libvarpd_door_f_lookup,
+ libvarpd_door_f_target,
+ libvarpd_door_f_flush,
+ libvarpd_door_f_delete,
+ libvarpd_door_f_get,
+ libvarpd_door_f_set,
+ libvarpd_door_f_walk
+};
+
+/*
+ * Door server procedure. The request in argp is validated, dispatched through
+ * libvarpd_door_table, and returned in place with vca_errno holding the
+ * handler's result.
+ *
+ * Requests that cannot be dispatched (too short, credentials unavailable,
+ * command out of range) are answered with a small varpd_client_eresp_t whose
+ * command is VARPD_CLIENT_INVALID and whose vce_errno describes the problem.
+ */
+/* ARGSUSED */
+static void
+libvarpd_door_server(void *cookie, char *argp, size_t argsz, door_desc_t *dp,
+    uint_t ndesc)
+{
+	int ret;
+	varpd_client_eresp_t err;
+	ucred_t *credp = NULL;
+	varpd_impl_t *vip = cookie;
+	varpd_client_arg_t *vcap = (varpd_client_arg_t *)argp;
+
+	err.vce_command = VARPD_CLIENT_INVALID;
+	if (argsz < sizeof (varpd_client_arg_t)) {
+		err.vce_errno = EINVAL;
+		goto errout;
+	}
+
+	/*
+	 * door_ucred() reports failure by returning -1 and setting errno, so
+	 * the error handed to the client has to come from errno rather than
+	 * from the return value.
+	 */
+	if (door_ucred(&credp) != 0) {
+		err.vce_errno = errno;
+		goto errout;
+	}
+
+	if (vcap->vca_command == VARPD_CLIENT_INVALID ||
+	    vcap->vca_command >= VARPD_CLIENT_MAX) {
+		err.vce_errno = EINVAL;
+		goto errout;
+	}
+
+	/* Commands start at 1; the table is zero-based. */
+	vcap->vca_errno = 0;
+	ret = libvarpd_door_table[vcap->vca_command - 1](vip, vcap, credp);
+	if (ret != 0)
+		vcap->vca_errno = ret;
+
+	ucred_free(credp);
+	(void) door_return(argp, argsz, NULL, 0);
+	return;
+
+errout:
+	ucred_free(credp);
+	(void) door_return((char *)&err, sizeof (err), NULL, 0);
+}
+
+/*
+ * Create the varpd door and attach it to the file system at path.
+ *
+ * The attachment point is created if necessary and its ownership is set to
+ * UID_NETADM/GID_NETADM. Anything previously attached at path is detached
+ * before the new door is attached.
+ *
+ * Returns EEXIST if this handle already has a door, otherwise 0 on success or
+ * the errno of the step that failed. On failure the door is revoked and the
+ * handle is put back in the "no door" state so that the call can be retried.
+ */
+int
+libvarpd_door_server_create(varpd_handle_t *vhp, const char *path)
+{
+	int fd, ret;
+	varpd_impl_t *vip = (varpd_impl_t *)vhp;
+
+	mutex_enter(&vip->vdi_lock);
+	if (vip->vdi_doorfd >= 0) {
+		mutex_exit(&vip->vdi_lock);
+		return (EEXIST);
+	}
+
+	vip->vdi_doorfd = door_create(libvarpd_door_server, vip,
+	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
+	if (vip->vdi_doorfd == -1) {
+		ret = errno;
+		mutex_exit(&vip->vdi_lock);
+		return (ret);
+	}
+
+	if ((fd = open(path, O_CREAT | O_RDWR, 0666)) == -1) {
+		ret = errno;
+		goto fail;
+	}
+
+	if (fchown(fd, UID_NETADM, GID_NETADM) != 0) {
+		ret = errno;
+		/* Do not leak the attachment point's descriptor. */
+		(void) close(fd);
+		goto fail;
+	}
+
+	if (close(fd) != 0)
+		libvarpd_panic("failed to close door fd %d: %d",
+		    fd, errno);
+	(void) fdetach(path);
+	if (fattach(vip->vdi_doorfd, path) != 0) {
+		ret = errno;
+		goto fail;
+	}
+
+	mutex_exit(&vip->vdi_lock);
+	return (0);
+
+fail:
+	/*
+	 * ret was captured before any cleanup, so it still names the step
+	 * that failed even if the calls below change errno.
+	 */
+	if (door_revoke(vip->vdi_doorfd) != 0)
+		libvarpd_panic("failed to revoke door: %d",
+		    errno);
+	/* A revoked door must not be mistaken for an existing one. */
+	vip->vdi_doorfd = -1;
+	mutex_exit(&vip->vdi_lock);
+	return (ret);
+}
+
+/*
+ * Revoke the varpd door, if one exists, and mark the handle as having none.
+ * A failure to revoke an existing door is fatal.
+ */
+void
+libvarpd_door_server_destroy(varpd_handle_t *vhp)
+{
+	varpd_impl_t *vip = (varpd_impl_t *)vhp;
+
+	mutex_enter(&vip->vdi_lock);
+	/*
+	 * -1 is the "no door" value (see libvarpd_door_server_create()), so
+	 * test for a valid descriptor rather than for non-zero; otherwise we
+	 * would try to revoke -1 and panic.
+	 */
+	if (vip->vdi_doorfd >= 0) {
+		if (door_revoke(vip->vdi_doorfd) != 0)
+			libvarpd_panic("failed to revoke door: %d",
+			    errno);
+		vip->vdi_doorfd = -1;
+	}
+	mutex_exit(&vip->vdi_lock);
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h
new file mode 100644
index 0000000000..f8530a7112
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h
@@ -0,0 +1,248 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LIBVARPD_IMPL_H
+#define _LIBVARPD_IMPL_H
+
+/*
+ * varpd internal interfaces
+ */
+
+#include <libvarpd.h>
+#include <libvarpd_provider.h>
+#include <sys/avl.h>
+#include <thread.h>
+#include <synch.h>
+#include <limits.h>
+#include <libidspace.h>
+#include <umem.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBVARPD_ID_MIN 1
+#define LIBVARPD_ID_MAX INT32_MAX
+
+/*
+ * Per-plugin state. vpp_node links the structure into an AVL tree.
+ * NOTE(review): presumably the tree is vdi_plugins, keyed by vpp_name via
+ * libvarpd_plugin_comparator() -- confirm against the plugin code.
+ */
+typedef struct varpd_plugin {
+ avl_node_t vpp_node;
+ const char *vpp_name;
+ overlay_target_mode_t vpp_mode;
+ const varpd_plugin_ops_t *vpp_ops;
+ mutex_t vpp_lock;
+ uint_t vpp_active;
+} varpd_plugin_t;
+
+/*
+ * The implementation behind a varpd_handle_t. The trailing comment on each
+ * member names the lock that covers it.
+ * NOTE(review): "RO" presumably means the member is not modified after
+ * initialization -- confirm.
+ */
+typedef struct varpd_impl {
+ mutex_t vdi_lock;
+ rwlock_t vdi_pfdlock;
+ avl_tree_t vdi_plugins; /* vdi_lock */
+ avl_tree_t vdi_instances; /* vdi_lock */
+ avl_tree_t vdi_linstances; /* vdi_lock */
+ id_space_t *vdi_idspace; /* RO */
+ umem_cache_t *vdi_qcache; /* RO */
+ int vdi_overlayfd; /* RO */
+ int vdi_doorfd; /* vdi_lock */
+ int vdi_persistfd; /* vdi_pfdlock */
+ cond_t vdi_lthr_cv; /* vdi_lock */
+ boolean_t vdi_lthr_quiesce; /* vdi_lock */
+ uint_t vdi_lthr_count; /* vdi_lock */
+} varpd_impl_t;
+
+/*
+ * Flag bits stored in vri_flags.
+ */
+typedef enum varpd_instance_flags {
+ VARPD_INSTANCE_F_ACTIVATED = 0x01
+} varpd_instance_flags_t;
+
+/*
+ * A single varpd instance. The door handlers cast a varpd_instance_handle_t
+ * pointer directly to this type. The structure carries two AVL nodes
+ * (vri_inode and vri_lnode), and the trailing comment on each member names
+ * the lock that covers it.
+ * NOTE(review): the two nodes presumably link the instance into
+ * vdi_instances (by id) and vdi_linstances (by datalink id) -- confirm.
+ */
+typedef struct varpd_instance {
+ avl_node_t vri_inode;
+ avl_node_t vri_lnode;
+ uint64_t vri_id; /* RO */
+ uint64_t vri_vnetid; /* RO */
+ datalink_id_t vri_linkid; /* RO */
+ overlay_target_mode_t vri_mode; /* RO */
+ overlay_plugin_dest_t vri_dest; /* RO */
+ varpd_impl_t *vri_impl; /* RO */
+ varpd_plugin_t *vri_plugin; /* RO */
+ void *vri_private; /* RO */
+ mutex_t vri_lock;
+ varpd_instance_flags_t vri_flags; /* vri_lock */
+} varpd_instance_t;
+
+/*
+ * Bundles an overlay target lookup with the response being built for it and
+ * the instance it belongs to.
+ * NOTE(review): presumably these are the objects allocated from vdi_qcache
+ * -- confirm.
+ */
+typedef struct varpd_query {
+ overlay_targ_lookup_t vq_lookup;
+ overlay_targ_resp_t vq_response;
+ varpd_instance_t *vq_instance;
+} varpd_query_t;
+
+/*
+ * Per-command door argument structures. Each one is a member of the union in
+ * varpd_client_arg_t and is both the request and the reply for its command:
+ * the client fills in the inputs and varpd overwrites the outputs in place.
+ * Their layout is shared between the client library and the door server, so
+ * members must not be reordered.
+ */
+
+/* VARPD_CLIENT_CREATE: in: datalink and plugin name; out: new instance id. */
+typedef struct varpd_client_create_arg {
+ datalink_id_t vcca_linkid;
+ uint64_t vcca_id;
+ char vcca_plugin[LIBVARPD_PROP_NAMELEN];
+} varpd_client_create_arg_t;
+
+/* VARPD_CLIENT_ACTIVATE and VARPD_CLIENT_DESTROY: in: instance id. */
+typedef struct varpd_client_instance_arg {
+ uint64_t vcia_id;
+} varpd_client_instance_arg_t;
+
+/* VARPD_CLIENT_NPROPS: in: instance id; out: number of properties. */
+typedef struct varpd_client_nprops_arg {
+ uint64_t vcna_id;
+ uint_t vcna_nprops;
+ uint8_t vcna_pad[4];
+} varpd_client_nprops_arg_t;
+
+/*
+ * VARPD_CLIENT_PROPINFO: in: instance id and either a property id or, when
+ * vcfa_propid is UINT_MAX, a property name; out: the property's description.
+ */
+typedef struct varpd_client_propinfo_arg {
+ uint64_t vcfa_id;
+ uint_t vcfa_propid;
+ uint_t vcfa_type;
+ uint_t vcfa_prot;
+ uint32_t vcfa_defsize;
+ uint32_t vcfa_psize;
+ uint8_t vcfa_pad[4];
+ char vcfa_name[LIBVARPD_PROP_NAMELEN];
+ uint8_t vcfa_default[LIBVARPD_PROP_SIZEMAX];
+ uint8_t vcfa_poss[LIBVARPD_PROP_SIZEMAX];
+} varpd_client_propinfo_arg_t;
+
+/*
+ * VARPD_CLIENT_GETPROP and VARPD_CLIENT_SETPROP: instance id, property id,
+ * and the value with its length.
+ * NOTE(review): vcpa_bufsize is a size_t, whose width differs between 32-bit
+ * and 64-bit processes -- confirm that client and server always match.
+ */
+typedef struct varpd_client_prop_arg {
+ uint64_t vcpa_id;
+ uint_t vcpa_propid;
+ uint8_t vcpa_buf[LIBVARPD_PROP_SIZEMAX];
+ size_t vcpa_bufsize;
+} varpd_client_prop_arg_t;
+
+/* VARPD_CLIENT_LOOKUP: in: datalink id; out: instance id. */
+typedef struct varpd_client_lookup_arg {
+ datalink_id_t vcla_linkid;
+ uint32_t vcla_pad;
+ uint64_t vcla_id;
+} varpd_client_lookup_arg_t;
+
+/* VARPD_CLIENT_TARGET_MODE: in: instance id; out: destination and mode. */
+typedef struct varpd_client_target_mode_arg {
+ uint64_t vtma_id;
+ uint32_t vtma_dest;
+ uint32_t vtma_mode;
+} varpd_client_target_mode_arg_t;
+
+/*
+ * VARPD_CLIENT_CACHE_FLUSH, _DELETE, _GET and _SET: instance id, the MAC
+ * address keying the entry, and the entry itself (output for _GET, input for
+ * _SET).
+ */
+typedef struct varpd_client_target_cache_arg {
+ uint64_t vtca_id;
+ uint8_t vtca_key[ETHERADDRL];
+ uint8_t vtca_pad[2];
+ varpd_client_cache_entry_t vtca_entry;
+} varpd_client_target_cache_arg_t;
+
+/*
+ * VARPD_CLIENT_CACHE_WALK: instance id, a marker carried from one request to
+ * the next, and the entry count (in: entries requested; out: entries
+ * returned). The entries follow in a flexible array, so the door buffer has
+ * to be allocated larger than varpd_client_arg_t.
+ */
+typedef struct varpd_client_target_walk_arg {
+ uint64_t vtcw_id;
+ uint64_t vtcw_marker;
+ uint64_t vtcw_count;
+ overlay_targ_cache_entry_t vtcw_ents[];
+} varpd_client_target_walk_arg_t;
+
+/*
+ * Door command numbers, carried in vca_command. Valid commands lie strictly
+ * between VARPD_CLIENT_INVALID and VARPD_CLIENT_MAX. The door server uses
+ * (command - 1) as an index into its handler table, so new commands must be
+ * added just before VARPD_CLIENT_MAX together with a matching table entry.
+ */
+typedef enum varpd_client_command {
+ VARPD_CLIENT_INVALID = 0x0,
+ VARPD_CLIENT_CREATE,
+ VARPD_CLIENT_ACTIVATE,
+ VARPD_CLIENT_DESTROY,
+ VARPD_CLIENT_NPROPS,
+ VARPD_CLIENT_PROPINFO,
+ VARPD_CLIENT_GETPROP,
+ VARPD_CLIENT_SETPROP,
+ VARPD_CLIENT_LOOKUP,
+ VARPD_CLIENT_TARGET_MODE,
+ VARPD_CLIENT_CACHE_FLUSH,
+ VARPD_CLIENT_CACHE_DELETE,
+ VARPD_CLIENT_CACHE_GET,
+ VARPD_CLIENT_CACHE_SET,
+ VARPD_CLIENT_CACHE_WALK,
+ VARPD_CLIENT_MAX
+} varpd_client_command_t;
+
+/*
+ * The door request and reply. vca_command selects the operation and thereby
+ * the union member in use; the server stores the handler's result in
+ * vca_errno (0 on success). The door server rejects any request shorter than
+ * this structure.
+ */
+typedef struct varpd_client_arg {
+ uint_t vca_command;
+ uint_t vca_errno;
+ union {
+ varpd_client_create_arg_t vca_create;
+ varpd_client_instance_arg_t vca_instance;
+ varpd_client_nprops_arg_t vca_nprops;
+ varpd_client_propinfo_arg_t vca_info;
+ varpd_client_prop_arg_t vca_prop;
+ varpd_client_lookup_arg_t vca_lookup;
+ varpd_client_target_mode_arg_t vca_mode;
+ varpd_client_target_cache_arg_t vca_cache;
+ varpd_client_target_walk_arg_t vca_walk;
+ } vca_un;
+} varpd_client_arg_t;
+
+/*
+ * Short error reply sent by the door server when a request cannot be
+ * dispatched at all. Its two members line up with the first two members of
+ * varpd_client_arg_t; the server sets vce_command to VARPD_CLIENT_INVALID.
+ */
+typedef struct varpd_client_eresp {
+ uint_t vce_command;
+ uint_t vce_errno;
+} varpd_client_eresp_t;
+
+/*
+ * Plugin management.
+ */
+extern void libvarpd_plugin_init(void);
+extern void libvarpd_plugin_prefork(void);
+extern void libvarpd_plugin_postfork(void);
+extern void libvarpd_plugin_fini(void);
+extern int libvarpd_plugin_comparator(const void *, const void *);
+extern varpd_plugin_t *libvarpd_plugin_lookup(varpd_impl_t *, const char *);
+
+/*
+ * Instance lookup by datalink id, used by the door server.
+ */
+extern varpd_instance_t *libvarpd_instance_lookup_by_dlid(varpd_impl_t *,
+    datalink_id_t);
+
+/*
+ * Converts a property handle into the door's property-info reply.
+ */
+extern void libvarpd_prop_door_convert(const varpd_prop_handle_t *,
+    varpd_client_propinfo_arg_t *);
+
+/*
+ * Miscellaneous helpers.
+ */
+extern const char *libvarpd_isaext(void);
+typedef int (*libvarpd_dirwalk_f)(varpd_impl_t *, const char *, void *);
+extern int libvarpd_dirwalk(varpd_impl_t *, const char *, const char *,
+    libvarpd_dirwalk_f, void *);
+
+/*
+ * Interactions with the overlay device (libvarpd_overlay.c).
+ */
+extern int libvarpd_overlay_init(varpd_impl_t *);
+extern void libvarpd_overlay_fini(varpd_impl_t *);
+extern int libvarpd_overlay_info(varpd_impl_t *, datalink_id_t,
+    overlay_plugin_dest_t *, uint64_t *, uint64_t *);
+extern int libvarpd_overlay_associate(varpd_instance_t *);
+extern int libvarpd_overlay_disassociate(varpd_instance_t *);
+extern int libvarpd_overlay_degrade(varpd_instance_t *, const char *);
+extern int libvarpd_overlay_degrade_datalink(varpd_impl_t *, datalink_id_t,
+    const char *);
+extern int libvarpd_overlay_restore(varpd_instance_t *);
+extern int libvarpd_overlay_packet(varpd_impl_t *,
+    const overlay_targ_lookup_t *, void *, size_t *);
+extern int libvarpd_overlay_inject(varpd_impl_t *,
+    const overlay_targ_lookup_t *, void *, size_t);
+extern int libvarpd_overlay_instance_inject(varpd_instance_t *, void *, size_t);
+extern int libvarpd_overlay_resend(varpd_impl_t *,
+    const overlay_targ_lookup_t *, void *, size_t);
+typedef int (*libvarpd_overlay_iter_f)(varpd_impl_t *, datalink_id_t, void *);
+extern int libvarpd_overlay_iter(varpd_impl_t *, libvarpd_overlay_iter_f,
+    void *);
+
+/*
+ * Target cache operations backing the door's cache commands.
+ */
+extern int libvarpd_overlay_cache_flush(varpd_instance_t *);
+extern int libvarpd_overlay_cache_delete(varpd_instance_t *, const uint8_t *);
+extern int libvarpd_overlay_cache_get(varpd_instance_t *, const uint8_t *,
+    varpd_client_cache_entry_t *);
+extern int libvarpd_overlay_cache_set(varpd_instance_t *, const uint8_t *,
+    const varpd_client_cache_entry_t *);
+extern int libvarpd_overlay_cache_walk_fill(varpd_instance_t *, uint64_t *,
+    uint64_t *, overlay_targ_cache_entry_t *);
+
+/*
+ * Instance persistence.
+ */
+extern void libvarpd_persist_init(varpd_impl_t *);
+extern void libvarpd_persist_fini(varpd_impl_t *);
+extern int libvarpd_persist_instance(varpd_impl_t *, varpd_instance_t *);
+extern void libvarpd_torch_instance(varpd_impl_t *, varpd_instance_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBVARPD_IMPL_H */
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c
new file mode 100644
index 0000000000..167c004a90
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c
@@ -0,0 +1,588 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Interactions with /dev/overlay
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stropts.h>
+#include <strings.h>
+#include <umem.h>
+
+#include <libvarpd_impl.h>
+#include <sys/overlay_target.h>
+
+#define OVERLAY_PATH "/dev/overlay"
+
+/*
+ * Open the overlay control device (OVERLAY_PATH) read/write with O_EXCL and
+ * remember the descriptor in vip->vdi_overlayfd.
+ *
+ * Returns 0 on success, otherwise the errno left behind by open(2).  Note that
+ * on failure vdi_overlayfd is left set to -1.
+ */
+int
+libvarpd_overlay_init(varpd_impl_t *vip)
+{
+	int fd = open(OVERLAY_PATH, O_RDWR | O_EXCL);
+
+	vip->vdi_overlayfd = fd;
+	return (fd == -1 ? errno : 0);
+}
+
+/*
+ * Close the overlay device descriptor held in vip->vdi_overlayfd.  A failure
+ * to close it is treated as fatal.
+ */
+void
+libvarpd_overlay_fini(varpd_impl_t *vip)
+{
+	int fd = vip->vdi_overlayfd;
+
+	assert(fd > 0);
+	if (close(fd) == 0)
+		return;
+
+	libvarpd_panic("failed to close /dev/overlay fd %d: %d",
+	    vip->vdi_overlayfd, errno);
+}
+
+/*
+ * Issue OVERLAY_TARG_INFO for the datalink `linkid`.  destp, flags and vnetid
+ * are each optional: every non-NULL pointer receives the matching field of the
+ * driver's answer (oti_needs, oti_flags and oti_vnetid respectively).
+ *
+ * Returns 0 on success or the errno from the ioctl; nothing is written through
+ * the output pointers on failure.
+ */
+int
+libvarpd_overlay_info(varpd_impl_t *vip, datalink_id_t linkid,
+    overlay_plugin_dest_t *destp, uint64_t *flags, uint64_t *vnetid)
+{
+	overlay_targ_info_t info;
+
+	info.oti_linkid = linkid;
+	if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_INFO, &info) != 0)
+		return (errno);
+
+	if (destp != NULL) {
+		*destp = info.oti_needs;
+	}
+	if (flags != NULL) {
+		*flags = info.oti_flags;
+	}
+	if (vnetid != NULL) {
+		*vnetid = info.oti_vnetid;
+	}
+
+	return (0);
+}
+
+/*
+ * Associate the instance with its overlay datalink via OVERLAY_TARG_ASSOCIATE.
+ *
+ * The request carries the instance's link id, mode, id and destination type.
+ * When the mode is OVERLAY_TARGET_POINT the plugin's vpo_default entry point is
+ * asked to fill in the default target first; if it answers anything other than
+ * VARPD_LOOKUP_OK that value is returned as-is and no ioctl is issued.
+ *
+ * Returns 0 on success, or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_associate(varpd_instance_t *inst)
+{
+	overlay_targ_associate_t assoc;
+
+	bzero(&assoc, sizeof (overlay_targ_associate_t));
+	assoc.ota_linkid = inst->vri_linkid;
+	assoc.ota_mode = inst->vri_mode;
+	assoc.ota_id = inst->vri_id;
+	assoc.ota_provides = inst->vri_dest;
+
+	if (assoc.ota_mode == OVERLAY_TARGET_POINT) {
+		int ret = inst->vri_plugin->vpp_ops->vpo_default(
+		    inst->vri_private, &assoc.ota_point);
+
+		if (ret != VARPD_LOOKUP_OK)
+			return (ret);
+	}
+
+	if (ioctl(inst->vri_impl->vdi_overlayfd, OVERLAY_TARG_ASSOCIATE,
+	    &assoc) != 0)
+		return (errno);
+
+	return (0);
+}
+
+/*
+ * Issue OVERLAY_TARG_DISASSOCIATE for the instance's datalink.  Returns 0 on
+ * success or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_disassociate(varpd_instance_t *inst)
+{
+	overlay_targ_id_t target;
+	int fd = inst->vri_impl->vdi_overlayfd;
+
+	target.otid_linkid = inst->vri_linkid;
+	return (ioctl(fd, OVERLAY_TARG_DISASSOCIATE, &target) != 0 ?
+	    errno : 0);
+}
+
+/*
+ * Mark the datalink `linkid` as degraded via OVERLAY_TARG_DEGRADE.  `msg` is
+ * copied (and truncated if needed) into the request's OVERLAY_STATUS_BUFLEN
+ * sized buffer.  Returns 0 on success or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_degrade_datalink(varpd_impl_t *vip, datalink_id_t linkid,
+    const char *msg)
+{
+	overlay_targ_degrade_t req;
+
+	req.otd_linkid = linkid;
+	(void) strlcpy(req.otd_buf, msg, OVERLAY_STATUS_BUFLEN);
+
+	if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_DEGRADE, &req) == 0)
+		return (0);
+
+	return (errno);
+}
+
+/*
+ * Instance-based convenience wrapper around
+ * libvarpd_overlay_degrade_datalink(): degrades the datalink that `inst` is
+ * bound to and returns that function's result.
+ */
+int
+libvarpd_overlay_degrade(varpd_instance_t *inst, const char *msg)
+{
+	varpd_impl_t *vip = inst->vri_impl;
+
+	return (libvarpd_overlay_degrade_datalink(vip, inst->vri_linkid, msg));
+}
+
+/*
+ * Issue OVERLAY_TARG_RESTORE for the instance's datalink.  Returns 0 on
+ * success or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_restore(varpd_instance_t *inst)
+{
+	overlay_targ_id_t target;
+	int fd = inst->vri_impl->vdi_overlayfd;
+
+	target.otid_linkid = inst->vri_linkid;
+	return (ioctl(fd, OVERLAY_TARG_RESTORE, &target) != 0 ? errno : 0);
+}
+
+/*
+ * Fetch the packet associated with the lookup request `otl` into `buf` using
+ * OVERLAY_TARG_PKT.  On entry *buflen is the capacity of `buf`; on success it
+ * is updated with the size reported back by the driver.
+ *
+ * The ioctl is retried on EINTR, EFAULT is treated as fatal, and any other
+ * failure is returned as an errno value.  Returns 0 on success.
+ */
+int
+libvarpd_overlay_packet(varpd_impl_t *vip, const overlay_targ_lookup_t *otl,
+    void *buf, size_t *buflen)
+{
+	overlay_targ_pkt_t pkt;
+	int rc;
+
+	pkt.otp_linkid = UINT64_MAX;
+	pkt.otp_reqid = otl->otl_reqid;
+	pkt.otp_size = *buflen;
+	pkt.otp_buf = buf;
+
+	while ((rc = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_PKT, &pkt)) != 0 &&
+	    errno == EINTR)
+		continue;
+
+	if (rc != 0) {
+		if (errno == EFAULT)
+			libvarpd_panic("OVERLAY_TARG_PKT ioctl efault");
+		rc = errno;
+	}
+
+	if (rc == 0)
+		*buflen = pkt.otp_size;
+
+	return (rc);
+}
+
+/*
+ * Shared implementation for the packet injection style ioctls.  `cmd` selects
+ * the ioctl to issue.  When `otl` is non-NULL the packet is tied to that
+ * lookup request (link id UINT64_MAX, request id taken from otl); otherwise it
+ * is addressed to the datalink of `inst` with a request id of zero, so `inst`
+ * must be valid whenever `otl` is NULL.
+ *
+ * The ioctl is retried on EINTR, EFAULT is treated as fatal, and any other
+ * failure is returned as an errno value.  Returns 0 on success.
+ */
+static int
+libvarpd_overlay_inject_common(varpd_impl_t *vip, varpd_instance_t *inst,
+    const overlay_targ_lookup_t *otl, void *buf, size_t buflen, int cmd)
+{
+	overlay_targ_pkt_t pkt;
+	int rc;
+
+	pkt.otp_buf = buf;
+	pkt.otp_size = buflen;
+	if (otl != NULL) {
+		pkt.otp_linkid = UINT64_MAX;
+		pkt.otp_reqid = otl->otl_reqid;
+	} else {
+		pkt.otp_linkid = inst->vri_linkid;
+		pkt.otp_reqid = 0;
+	}
+
+	while ((rc = ioctl(vip->vdi_overlayfd, cmd, &pkt)) != 0 &&
+	    errno == EINTR)
+		continue;
+
+	if (rc == 0)
+		return (0);
+
+	if (errno == EFAULT)
+		libvarpd_panic("overlay_inject_common ioctl EFAULT");
+
+	return (errno);
+}
+
+/*
+ * Inject `buf` in the context of the lookup request `otl` using
+ * OVERLAY_TARG_INJECT.  Returns the result of
+ * libvarpd_overlay_inject_common().
+ */
+int
+libvarpd_overlay_inject(varpd_impl_t *vip, const overlay_targ_lookup_t *otl,
+    void *buf, size_t buflen)
+{
+	const int cmd = OVERLAY_TARG_INJECT;
+
+	return (libvarpd_overlay_inject_common(vip, NULL, otl, buf, buflen,
+	    cmd));
+}
+
+/*
+ * Inject `buf` on the datalink belonging to `inst` (no lookup request is
+ * involved) using OVERLAY_TARG_INJECT.  Returns the result of
+ * libvarpd_overlay_inject_common().
+ */
+int
+libvarpd_overlay_instance_inject(varpd_instance_t *inst, void *buf,
+    size_t buflen)
+{
+	varpd_impl_t *vip = inst->vri_impl;
+
+	return (libvarpd_overlay_inject_common(vip, inst, NULL, buf, buflen,
+	    OVERLAY_TARG_INJECT));
+}
+
+/*
+ * Same as libvarpd_overlay_inject(), but issues OVERLAY_TARG_RESEND for the
+ * lookup request `otl`.  Returns the result of
+ * libvarpd_overlay_inject_common().
+ */
+int
+libvarpd_overlay_resend(varpd_impl_t *vip, const overlay_targ_lookup_t *otl,
+    void *buf, size_t buflen)
+{
+	const int cmd = OVERLAY_TARG_RESEND;
+
+	return (libvarpd_overlay_inject_common(vip, NULL, otl, buf, buflen,
+	    cmd));
+}
+
+/*
+ * Answer the lookup request `otl` by sending `otr` to the driver with the
+ * ioctl `cmd`.  The response's request id is copied from the lookup before
+ * sending, and the ioctl is retried for as long as it fails with EINTR.
+ */
+static void
+libvarpd_overlay_lookup_reply(varpd_impl_t *vip,
+    const overlay_targ_lookup_t *otl, overlay_targ_resp_t *otr, int cmd)
+{
+	int rc;
+
+	otr->otr_reqid = otl->otl_reqid;
+	while ((rc = ioctl(vip->vdi_overlayfd, cmd, otr)) != 0 &&
+	    errno == EINTR)
+		continue;
+
+	/*
+	 * Anything that fails here is a programmer error.  EINVAL arguably
+	 * also means something is off, but for now it is tolerated rather
+	 * than taking varpd down.
+	 */
+	if (rc == 0 || errno == EINVAL)
+		return;
+
+	libvarpd_panic("received bad errno from lookup_reply "
+	    "(cmd %d): %d\n", cmd, errno);
+}
+
+/*
+ * Service a single lookup request from the overlay driver.
+ *
+ * A query structure is taken from vip->vdi_qcache and handed to the
+ * OVERLAY_TARG_LOOKUP ioctl.  If the ioctl fails with ETIME or EINTR the query
+ * is released and we simply return; any other failure is fatal.  Otherwise the
+ * instance named by otl_varpdid is looked up: if there is none the request is
+ * answered with OVERLAY_TARG_DROP and the query is released; if there is one,
+ * the query is passed to the plugin's vpo_lookup entry point.  In that last
+ * case the query is not released here; libvarpd_plugin_query_reply() frees it.
+ */
+static void
+libvarpd_overlay_lookup_handle(varpd_impl_t *vip)
+{
+	int ret;
+	varpd_query_t *vqp;
+	overlay_targ_lookup_t *otl;
+	overlay_targ_resp_t *otr;
+	varpd_instance_t *inst;
+
+	vqp = umem_cache_alloc(vip->vdi_qcache, UMEM_DEFAULT);
+	/*
+	 * abort doesn't really help here that much, maybe we can instead try
+	 * and force a reap or something?
+	 */
+	if (vqp == NULL)
+		libvarpd_panic("failed to allocate memory for lookup "
+		    "handle..., we should not panic()");
+
+	/* Only form pointers into the query once we know it is non-NULL. */
+	otl = &vqp->vq_lookup;
+	otr = &vqp->vq_response;
+
+	ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_LOOKUP, otl);
+	if (ret != 0 && errno != ETIME && errno != EINTR)
+		libvarpd_panic("received bad errno from OVERLAY_TARG_LOOKUP: "
+		    "%d", errno);
+
+	if (ret != 0) {
+		umem_cache_free(vip->vdi_qcache, vqp);
+		return;
+	}
+
+	inst = (varpd_instance_t *)libvarpd_instance_lookup(
+	    (varpd_handle_t *)vip, otl->otl_varpdid);
+	if (inst == NULL) {
+		libvarpd_overlay_lookup_reply(vip, otl, otr,
+		    OVERLAY_TARG_DROP);
+		umem_cache_free(vip->vdi_qcache, vqp);
+		return;
+	}
+	vqp->vq_instance = inst;
+
+	inst->vri_plugin->vpp_ops->vpo_lookup(inst->vri_private,
+	    (varpd_query_handle_t *)vqp, otl, &otr->otr_answer);
+}
+
+/*
+ * Body of a lookup worker thread.  Unless a quiesce is already in progress,
+ * the thread counts itself in vdi_lthr_count and then services lookup
+ * requests one at a time until vdi_lthr_quiesce is set, at which point it
+ * removes itself from the count, signals vdi_lthr_cv and returns.  Always
+ * returns NULL.
+ */
+/* Use "void *" for vhp here to play nicely with thr_create(). */
+void *
+libvarpd_overlay_lookup_run(void *vhp)
+{
+ varpd_impl_t *vip = (varpd_impl_t *)vhp;
+
+ mutex_enter(&vip->vdi_lock);
+ /* Don't join in if a quiesce has already been requested. */
+ if (vip->vdi_lthr_quiesce == B_TRUE) {
+ mutex_exit(&vip->vdi_lock);
+ return (NULL);
+ }
+ vip->vdi_lthr_count++;
+
+ for (;;) {
+ /* vdi_lock is dropped while a request is being handled. */
+ mutex_exit(&vip->vdi_lock);
+ libvarpd_overlay_lookup_handle(vip);
+ mutex_enter(&vip->vdi_lock);
+ if (vip->vdi_lthr_quiesce == B_TRUE)
+ break;
+ }
+ /* vdi_lock is held here; let the quiescing thread know we are done. */
+ assert(vip->vdi_lthr_count > 0);
+ vip->vdi_lthr_count--;
+ (void) cond_signal(&vip->vdi_lthr_cv);
+ mutex_exit(&vip->vdi_lock);
+ return (NULL);
+}
+
+/*
+ * Stop all running lookup threads and wait for them to exit.  If no lookup
+ * threads are counted in vdi_lthr_count this is a no-op.  Otherwise the
+ * quiesce flag is raised, we wait on vdi_lthr_cv until the count drops to
+ * zero, and the flag is cleared again before returning.
+ */
+void
+libvarpd_overlay_lookup_quiesce(varpd_handle_t *vhp)
+{
+ varpd_impl_t *vip = (varpd_impl_t *)vhp;
+
+ mutex_enter(&vip->vdi_lock);
+ if (vip->vdi_lthr_count == 0) {
+ mutex_exit(&vip->vdi_lock);
+ return;
+ }
+ vip->vdi_lthr_quiesce = B_TRUE;
+ /* Each exiting lookup thread decrements the count and signals the cv. */
+ while (vip->vdi_lthr_count > 0)
+ (void) cond_wait(&vip->vdi_lthr_cv, &vip->vdi_lock);
+ vip->vdi_lthr_quiesce = B_FALSE;
+ mutex_exit(&vip->vdi_lock);
+}
+
+/*
+ * Call `func` once for every datalink reported by OVERLAY_TARG_LIST, passing
+ * it `vip`, the datalink id and `arg`.  Iteration stops early as soon as
+ * `func` returns non-zero.
+ *
+ * The list is fetched by repeatedly issuing the ioctl with a buffer sized for
+ * `curents` entries until the count the driver reports back matches the count
+ * we asked for.  An interrupted ioctl (EINTR) is retried, EFAULT is fatal.
+ *
+ * Returns 0 on success (including when `func` stopped the walk early), ENOMEM
+ * if the list buffer could not be allocated, or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_iter(varpd_impl_t *vip, libvarpd_overlay_iter_f func,
+    void *arg)
+{
+	uint32_t curents = 0, i;
+	size_t size;
+	overlay_targ_list_t *otl;
+
+	for (;;) {
+		size = sizeof (overlay_targ_list_t) +
+		    sizeof (uint32_t) * curents;
+		otl = umem_alloc(size, UMEM_DEFAULT);
+		if (otl == NULL)
+			return (ENOMEM);
+
+		otl->otl_nents = curents;
+		if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_LIST, otl) != 0) {
+			/*
+			 * Capture errno now: umem_free() below is allowed to
+			 * change it, and we still need to tell EINTR apart
+			 * from real failures afterwards.
+			 */
+			int err = errno;
+
+			if (err == EFAULT)
+				libvarpd_panic("OVERLAY_TARG_LIST ioctl "
+				    "efault");
+			umem_free(otl, size);
+			if (err == EINTR)
+				continue;
+			return (err);
+		}
+
+		/* The buffer was big enough for everything; we're done. */
+		if (otl->otl_nents == curents)
+			break;
+
+		/* Otherwise retry with room for the reported entry count. */
+		curents = otl->otl_nents;
+		umem_free(otl, size);
+	}
+
+	for (i = 0; i < otl->otl_nents; i++) {
+		if (func(vip, otl->otl_ents[i], arg) != 0)
+			break;
+	}
+	umem_free(otl, size);
+	return (0);
+}
+
+/*
+ * Issue OVERLAY_TARG_CACHE_FLUSH for the instance's datalink.  EFAULT is
+ * treated as fatal.  Returns 0 on success or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_cache_flush(varpd_instance_t *inst)
+{
+	overlay_targ_cache_t req;
+	int fd = inst->vri_impl->vdi_overlayfd;
+
+	bzero(&req, sizeof (overlay_targ_cache_t));
+	req.otc_linkid = inst->vri_linkid;
+
+	if (ioctl(fd, OVERLAY_TARG_CACHE_FLUSH, &req) == 0)
+		return (0);
+
+	if (errno == EFAULT)
+		libvarpd_panic("OVERLAY_TARG_CACHE_FLUSH ioctl efault");
+
+	return (errno);
+}
+
+/*
+ * Issue OVERLAY_TARG_CACHE_REMOVE for the entry keyed by the ETHERADDRL-byte
+ * MAC address `key` on the instance's datalink.  EFAULT is treated as fatal.
+ * Returns 0 on success or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_cache_delete(varpd_instance_t *inst, const uint8_t *key)
+{
+	overlay_targ_cache_t req;
+	int fd = inst->vri_impl->vdi_overlayfd;
+
+	bzero(&req, sizeof (overlay_targ_cache_t));
+	req.otc_linkid = inst->vri_linkid;
+	bcopy(key, req.otc_entry.otce_mac, ETHERADDRL);
+
+	if (ioctl(fd, OVERLAY_TARG_CACHE_REMOVE, &req) == 0)
+		return (0);
+
+	if (errno == EFAULT)
+		libvarpd_panic("OVERLAY_TARG_CACHE_REMOVE ioctl efault");
+
+	return (errno);
+}
+
+/*
+ * Fetch the cache entry keyed by the ETHERADDRL-byte MAC address `key` on the
+ * instance's datalink using OVERLAY_TARG_CACHE_GET, and copy its destination
+ * MAC, flags, IP and port into `entry`.  EFAULT is treated as fatal.
+ *
+ * Returns 0 on success or the errno from the ioctl, in which case `entry` is
+ * left untouched.
+ */
+int
+libvarpd_overlay_cache_get(varpd_instance_t *inst, const uint8_t *key,
+    varpd_client_cache_entry_t *entry)
+{
+	overlay_targ_cache_t req;
+	int fd = inst->vri_impl->vdi_overlayfd;
+
+	bzero(&req, sizeof (overlay_targ_cache_t));
+	req.otc_linkid = inst->vri_linkid;
+	bcopy(key, req.otc_entry.otce_mac, ETHERADDRL);
+
+	if (ioctl(fd, OVERLAY_TARG_CACHE_GET, &req) != 0) {
+		if (errno == EFAULT)
+			libvarpd_panic("OVERLAY_TARG_CACHE_GET ioctl efault");
+		return (errno);
+	}
+
+	bcopy(req.otc_entry.otce_dest.otp_mac, &entry->vcp_mac, ETHERADDRL);
+	entry->vcp_flags = req.otc_entry.otce_flags;
+	entry->vcp_ip = req.otc_entry.otce_dest.otp_ip;
+	entry->vcp_port = req.otc_entry.otce_dest.otp_port;
+
+	return (0);
+}
+
+/*
+ * Install or replace the cache entry keyed by the ETHERADDRL-byte MAC address
+ * `key` on the instance's datalink using OVERLAY_TARG_CACHE_SET.  The entry's
+ * destination MAC, flags, IP and port are taken from `entry`.  EFAULT is
+ * treated as fatal.  Returns 0 on success or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_cache_set(varpd_instance_t *inst, const uint8_t *key,
+    const varpd_client_cache_entry_t *entry)
+{
+	overlay_targ_cache_t req;
+	int fd = inst->vri_impl->vdi_overlayfd;
+
+	bzero(&req, sizeof (overlay_targ_cache_t));
+	req.otc_linkid = inst->vri_linkid;
+	req.otc_entry.otce_flags = entry->vcp_flags;
+	bcopy(key, req.otc_entry.otce_mac, ETHERADDRL);
+	bcopy(&entry->vcp_mac, req.otc_entry.otce_dest.otp_mac, ETHERADDRL);
+	req.otc_entry.otce_dest.otp_ip = entry->vcp_ip;
+	req.otc_entry.otce_dest.otp_port = entry->vcp_port;
+
+	if (ioctl(fd, OVERLAY_TARG_CACHE_SET, &req) == 0)
+		return (0);
+
+	if (errno == EFAULT)
+		libvarpd_panic("OVERLAY_TARG_CACHE_SET ioctl efault");
+
+	return (errno);
+}
+
+/*
+ * Fetch one batch of cache entries for the instance's datalink using
+ * OVERLAY_TARG_CACHE_ITER.
+ *
+ * On entry *markerp is the iteration marker to resume from and *countp is the
+ * number of entries `ents` has room for (at most 200, otherwise E2BIG is
+ * returned).  On success both are updated from the driver's reply and the
+ * returned entries are copied into `ents`.  EFAULT is treated as fatal.
+ *
+ * Returns 0 on success, E2BIG, ENOMEM if the request buffer could not be
+ * allocated, or the errno from the ioctl.
+ */
+int
+libvarpd_overlay_cache_walk_fill(varpd_instance_t *inst, uint64_t *markerp,
+    uint64_t *countp, overlay_targ_cache_entry_t *ents)
+{
+	overlay_targ_cache_iter_t *req;
+	size_t reqsz;
+	int rc;
+
+	if (*countp > 200)
+		return (E2BIG);
+
+	reqsz = sizeof (overlay_targ_cache_iter_t) +
+	    *countp * sizeof (overlay_targ_cache_entry_t);
+	if ((req = umem_alloc(reqsz, UMEM_DEFAULT)) == NULL)
+		return (ENOMEM);
+
+	req->otci_linkid = inst->vri_linkid;
+	req->otci_marker = *markerp;
+	req->otci_count = *countp;
+
+	rc = ioctl(inst->vri_impl->vdi_overlayfd, OVERLAY_TARG_CACHE_ITER,
+	    req);
+	if (rc != 0) {
+		if (errno == EFAULT)
+			libvarpd_panic("OVERLAY_TARG_CACHE_ITER ioctl efault");
+		rc = errno;
+	} else {
+		*markerp = req->otci_marker;
+		*countp = req->otci_count;
+		bcopy(req->otci_ents, ents,
+		    *countp * sizeof (overlay_targ_cache_entry_t));
+	}
+
+	umem_free(req, reqsz);
+	return (rc);
+}
+
+/*
+ * Called by a plugin to complete the lookup identified by `vqh`.
+ *
+ * VARPD_LOOKUP_DROP answers the driver with OVERLAY_TARG_DROP and
+ * VARPD_LOOKUP_OK answers it with OVERLAY_TARG_RESPOND, using the response
+ * stored in the query.  Any other action, or a NULL handle, is fatal.  The
+ * query is returned to the owning handle's query cache afterwards, so the
+ * plugin must not touch `vqh` again.
+ */
+void
+libvarpd_plugin_query_reply(varpd_query_handle_t *vqh, int action)
+{
+	varpd_query_t *query = (varpd_query_t *)vqh;
+	varpd_impl_t *vip;
+
+	if (query == NULL)
+		libvarpd_panic("unknown plugin passed invalid "
+		    "varpd_query_handle_t");
+
+	vip = query->vq_instance->vri_impl;
+
+	if (action == VARPD_LOOKUP_DROP) {
+		libvarpd_overlay_lookup_reply(vip, &query->vq_lookup,
+		    &query->vq_response, OVERLAY_TARG_DROP);
+	} else if (action == VARPD_LOOKUP_OK) {
+		libvarpd_overlay_lookup_reply(vip, &query->vq_lookup,
+		    &query->vq_response, OVERLAY_TARG_RESPOND);
+	} else {
+		libvarpd_panic("plugin %s passed in an invalid action: %d",
+		    query->vq_instance->vri_plugin->vpp_name, action);
+	}
+
+	umem_cache_free(vip->vdi_qcache, query);
+}
+
+/*
+ * Called by a plugin to update the driver's cache for the MAC address `mac`
+ * on the provider's datalink.  A NULL `otp` removes the entry (best effort);
+ * otherwise an entry with no flags set and `otp` as its destination is
+ * installed with OVERLAY_TARG_CACHE_SET.  EBADF, EFAULT and ENOTSUP from that
+ * ioctl are fatal; every other failure is silently ignored.
+ */
+void
+libvarpd_inject_varp(varpd_provider_handle_t *vph, const uint8_t *mac,
+    const overlay_target_point_t *otp)
+{
+	varpd_instance_t *inst = (varpd_instance_t *)vph;
+	overlay_targ_cache_t req;
+
+	if (otp == NULL) {
+		(void) libvarpd_overlay_cache_delete(inst, mac);
+		return;
+	}
+
+	req.otc_linkid = inst->vri_linkid;
+	req.otc_entry.otce_flags = 0;
+	bcopy(mac, req.otc_entry.otce_mac, ETHERADDRL);
+	bcopy(otp, &req.otc_entry.otce_dest, sizeof (overlay_target_point_t));
+
+	if (ioctl(inst->vri_impl->vdi_overlayfd, OVERLAY_TARG_CACHE_SET,
+	    &req) == 0)
+		return;
+
+	if (errno == EBADF || errno == EFAULT || errno == ENOTSUP) {
+		libvarpd_panic("received bad errno from "
+		    "OVERLAY_TARG_CACHE_SET: %d", errno);
+	}
+}
+
+/*
+ * Called by a plugin to mark its datalink as degraded with the reason `msg`.
+ * ENOENT and EFAULT from the underlying degrade request are fatal; any other
+ * outcome, including other errors, is ignored.
+ */
+void
+libvarpd_fma_degrade(varpd_provider_handle_t *vph, const char *msg)
+{
+	int ret;
+	varpd_instance_t *inst = (varpd_instance_t *)vph;
+
+	ret = libvarpd_overlay_degrade(inst, msg);
+	switch (ret) {
+	case ENOENT:
+	case EFAULT:
+		/*
+		 * Report the value we actually decided on rather than the
+		 * global errno, which is not guaranteed to still match it.
+		 */
+		libvarpd_panic("received bad errno from degrade ioctl: %d",
+		    ret);
+	default:
+		break;
+	}
+}
+
+/*
+ * Called by a plugin to clear the degraded state of its datalink.  ENOENT and
+ * EFAULT from the underlying restore request are fatal; any other outcome,
+ * including other errors, is ignored.
+ */
+void
+libvarpd_fma_restore(varpd_provider_handle_t *vph)
+{
+	int ret;
+	varpd_instance_t *inst = (varpd_instance_t *)vph;
+
+	ret = libvarpd_overlay_restore(inst);
+	switch (ret) {
+	case ENOENT:
+	case EFAULT:
+		/*
+		 * Report the value we actually decided on rather than the
+		 * global errno, which is not guaranteed to still match it.
+		 */
+		libvarpd_panic("received bad errno from restore ioctl: %d",
+		    ret);
+	default:
+		break;
+	}
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_panic.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_panic.c
new file mode 100644
index 0000000000..ba6bc26bf6
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_panic.c
@@ -0,0 +1,53 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+/*
+ * No, 'tis not so deep as a well, nor so wide as a church door; but 'tis
+ * enough, 'twill serve. Ask for me tomorrow, and you shall find me a grave man.
+ *
+ * This file maintains various routines for handling when we die.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <thread.h>
+#include <stdlib.h>
+
+/*
+ * Normally these would be static, but if they're static, that throws off lint
+ * because it thinks we never use them, which is kind of the point, because we
+ * only read them in the core...
+ */
+int varpd_panic_errno;
+char varpd_panic_buf[1024];
+thread_t varpd_panic_thread;
+
+/*
+ * Record the reason for dying and abort the process.
+ *
+ * The current errno and calling thread are saved in varpd_panic_errno and
+ * varpd_panic_thread, and the printf-style message (if `fmt` is non-NULL) is
+ * formatted into varpd_panic_buf, truncated to the size of that buffer.
+ * Nothing is printed.  This function does not return: it ends in abort().
+ */
+void
+libvarpd_panic(const char *fmt, ...)
+{
+	va_list ap;
+
+	/* Always save errno first! */
+	varpd_panic_errno = errno;
+	varpd_panic_thread = thr_self();
+
+	if (fmt != NULL) {
+		va_start(ap, fmt);
+		(void) vsnprintf(varpd_panic_buf, sizeof (varpd_panic_buf), fmt,
+		    ap);
+		/* Every va_start() must be paired with a va_end(). */
+		va_end(ap);
+	}
+	abort();
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c
new file mode 100644
index 0000000000..27cc802a9c
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c
@@ -0,0 +1,586 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * varpd persistence backend
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <strings.h>
+#include <librename.h>
+#include <md5.h>
+#include <sys/sysmacros.h>
+#include <dirent.h>
+#include <sys/mman.h>
+#include <umem.h>
+#include <sys/debug.h>
+
+#include <libvarpd_impl.h>
+
+static uint8_t varpd_persist_magic[4] = {
+ 'v',
+ 'a',
+ 'r',
+ 'p',
+};
+
+#define VARPD_PERSIST_MAXWRITE 4096
+#define VARPD_PERSIST_VERSION_ONE 1
+#define VARPD_PERSIST_SUFFIX ".varpd"
+
+typedef struct varpd_persist_header {
+ uint8_t vph_magic[4];
+ uint32_t vph_version;
+ uint8_t vph_md5[16];
+} varpd_persist_header_t;
+
+/*
+ * Set up the persistence state of a handle: no persistence directory is open
+ * (vdi_persistfd is -1) and vdi_pfdlock is initialised.  Failure to initialise
+ * the lock is fatal.
+ */
+void
+libvarpd_persist_init(varpd_impl_t *vip)
+{
+	int ret;
+
+	vip->vdi_persistfd = -1;
+	ret = rwlock_init(&vip->vdi_pfdlock, USYNC_THREAD, NULL);
+	if (ret != 0)
+		libvarpd_panic("failed to create rw vdi_pfdlock");
+}
+
+/*
+ * Tear down the persistence state of a handle.  If a persistence directory is
+ * still open it is closed first, then vdi_pfdlock is destroyed.  Failure of
+ * either step is fatal.
+ */
+void
+libvarpd_persist_fini(varpd_impl_t *vip)
+{
+	int fd = vip->vdi_persistfd;
+
+	/* Someone may have left persistence enabled; tidy up after them. */
+	if (fd != -1) {
+		if (close(fd) != 0) {
+			libvarpd_panic("failed to close persist fd %d: %d",
+			    vip->vdi_persistfd, errno);
+		}
+		vip->vdi_persistfd = -1;
+	}
+
+	if (rwlock_destroy(&vip->vdi_pfdlock) != 0)
+		libvarpd_panic("failed to destroy rw vdi_pfdlock");
+}
+
+/*
+ * Enable persistence for the handle, using the directory `rootdir`.
+ *
+ * Returns 0 on success, the errno from open(2)/fstat(2) if the path cannot be
+ * opened or examined, EINVAL if it is not a directory, or EEXIST if
+ * persistence is already enabled.  On any failure the directory descriptor
+ * opened here is closed again; failing to close it is fatal.
+ */
+int
+libvarpd_persist_enable(varpd_handle_t *vhp, const char *rootdir)
+{
+	varpd_impl_t *vip = (varpd_impl_t *)vhp;
+	struct stat st;
+	int fd, ret = 0;
+
+	if ((fd = open(rootdir, O_RDONLY)) < 0)
+		return (errno);
+
+	if (fstat(fd, &st) != 0) {
+		ret = errno;
+	} else if (!S_ISDIR(st.st_mode)) {
+		ret = EINVAL;
+	} else {
+		VERIFY0(rw_wrlock(&vip->vdi_pfdlock));
+		if (vip->vdi_persistfd != -1)
+			ret = EEXIST;
+		else
+			vip->vdi_persistfd = fd;
+		VERIFY0(rw_unlock(&vip->vdi_pfdlock));
+	}
+
+	/* The handle only takes ownership of fd on success. */
+	if (ret != 0 && close(fd) != 0) {
+		libvarpd_panic("failed to close rootdir fd (%s) %d: %d",
+		    rootdir, fd, errno);
+	}
+
+	return (ret);
+}
+
+/*
+ * Write all `buflen` bytes of `buf` to `fd`, in pieces of at most
+ * VARPD_PERSIST_MAXWRITE bytes.  Short writes are continued and writes that
+ * fail with EINTR are retried.  Returns 0 once everything has been written,
+ * or the errno of the first write(2) that failed for another reason.
+ */
+static int
+libvarpd_persist_write(int fd, const void *buf, size_t buflen)
+{
+	uintptr_t cur = (uintptr_t)buf;
+	ssize_t nwritten;
+
+	while (buflen > 0) {
+		nwritten = write(fd, (void *)cur,
+		    MIN(buflen, VARPD_PERSIST_MAXWRITE));
+		if (nwritten == -1) {
+			if (errno == EINTR)
+				continue;
+			return (errno);
+		}
+
+		cur += nwritten;
+		buflen -= nwritten;
+	}
+
+	return (0);
+}
+
+/*
+ * Atomically write the nvlist `nvl` to the file "<id>.varpd" inside the
+ * directory referred to by `dirfd`.
+ *
+ * The nvlist is packed with XDR encoding and written after a
+ * varpd_persist_header_t holding the magic, the format version and the MD5 of
+ * the packed data.  The file is created through librename so that it only
+ * replaces an existing file once it has been written in full.
+ *
+ * Returns 0 on success or an error number from whichever step failed.
+ */
+static int
+libvarpd_persist_nvlist(int dirfd, uint64_t id, nvlist_t *nvl)
+{
+	varpd_persist_header_t hdr;
+	librename_atomic_t *lrap;
+	char *packed = NULL, *fname;
+	size_t packedsz;
+	int err, fd;
+
+	err = nvlist_pack(nvl, &packed, &packedsz, NV_ENCODE_XDR, 0);
+	if (err != 0)
+		return (err);
+
+	if (asprintf(&fname, "%llu%s", (unsigned long long)id,
+	    VARPD_PERSIST_SUFFIX) == -1) {
+		err = errno;
+		free(packed);
+		return (err);
+	}
+
+	err = librename_atomic_fdinit(dirfd, fname, NULL, 0600, 0, &lrap);
+	if (err != 0)
+		goto done;
+
+	fd = librename_atomic_fd(lrap);
+
+	bzero(&hdr, sizeof (varpd_persist_header_t));
+	bcopy(varpd_persist_magic, hdr.vph_magic, sizeof (varpd_persist_magic));
+	hdr.vph_version = VARPD_PERSIST_VERSION_ONE;
+	md5_calc(hdr.vph_md5, packed, packedsz);
+
+	/* Header first, then payload; only commit if both made it out. */
+	err = libvarpd_persist_write(fd, &hdr, sizeof (varpd_persist_header_t));
+	if (err == 0)
+		err = libvarpd_persist_write(fd, packed, packedsz);
+	if (err == 0) {
+		do {
+			err = librename_atomic_commit(lrap);
+		} while (err == EINTR);
+	}
+
+	librename_atomic_fini(lrap);
+done:
+	free(fname);
+	free(packed);
+	return (err);
+}
+
+/*
+ * Write the state of `inst` to the persistence directory, if persistence is
+ * enabled.
+ *
+ * The instance's id, link id, destination type, mode and plugin name are
+ * stored in an nvlist under the keys "vri_id", "vri_linkid", "vri_dest",
+ * "vri_mode" and "vri_plugin".  The plugin's own state, produced by its
+ * vpo_save entry point, is nested under "vri_private".
+ *
+ * Returns 0 on success or when persistence is not enabled, otherwise the
+ * error from the step that failed.  vdi_pfdlock is held as a reader for the
+ * duration of the call.
+ */
+int
+libvarpd_persist_instance(varpd_impl_t *vip, varpd_instance_t *inst)
+{
+ int err = 0;
+ nvlist_t *nvl = NULL, *cvl = NULL;
+
+ VERIFY0(rw_rdlock(&vip->vdi_pfdlock));
+ /* Check if persistence exists */
+ if (vip->vdi_persistfd == -1)
+ goto out;
+
+ /* nvl is the top-level list, cvl collects the plugin's private state. */
+ if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0)) != 0)
+ goto out;
+
+ if ((err = nvlist_alloc(&cvl, NV_UNIQUE_NAME, 0)) != 0)
+ goto out;
+
+ if ((err = nvlist_add_uint64(nvl, "vri_id", inst->vri_id)) != 0)
+ goto out;
+
+ if ((err = nvlist_add_uint32(nvl, "vri_linkid", inst->vri_linkid)) != 0)
+ goto out;
+
+ if ((err = nvlist_add_uint32(nvl, "vri_dest",
+ (uint32_t)inst->vri_dest)) != 0)
+ goto out;
+ if ((err = nvlist_add_uint32(nvl, "vri_mode",
+ (uint32_t)inst->vri_mode)) != 0)
+ goto out;
+
+ if ((err = nvlist_add_string(nvl, "vri_plugin",
+ inst->vri_plugin->vpp_name)) != 0)
+ goto out;
+
+ err = inst->vri_plugin->vpp_ops->vpo_save(inst->vri_private, cvl);
+ if (err != 0)
+ goto out;
+
+ if ((err = nvlist_add_nvlist(nvl, "vri_private", cvl)) != 0)
+ goto out;
+
+ err = libvarpd_persist_nvlist(vip->vdi_persistfd, inst->vri_id, nvl);
+out:
+ /* Both lists may still be NULL here if we bailed out early. */
+ nvlist_free(nvl);
+ nvlist_free(cvl);
+ VERIFY0(rw_unlock(&vip->vdi_pfdlock));
+ return (err);
+}
+
+/*
+ * Remove the persisted state of `inst` from the persistence directory, if
+ * persistence is enabled.  A file that is already gone (ENOENT) is fine; any
+ * other unlinkat(2) failure is fatal.  vdi_pfdlock is held as a reader for
+ * the duration of the call.
+ */
+void
+libvarpd_torch_instance(varpd_impl_t *vip, varpd_instance_t *inst)
+{
+	char buf[32];
+	int ret, len;
+
+	VERIFY0(rw_rdlock(&vip->vdi_pfdlock));
+	if (vip->vdi_persistfd == -1) {
+		VERIFY0(rw_unlock(&vip->vdi_pfdlock));
+		return;
+	}
+
+	/*
+	 * Build the name exactly the way libvarpd_persist_nvlist() does, so
+	 * that we always remove the file that was actually written.
+	 */
+	len = snprintf(buf, sizeof (buf), "%llu%s",
+	    (unsigned long long)inst->vri_id, VARPD_PERSIST_SUFFIX);
+	if (len < 0 || (size_t)len >= sizeof (buf))
+		libvarpd_panic("somehow exceeded static value for "
+		    "libvarpd_torch_instance buffer");
+
+	do {
+		ret = unlinkat(vip->vdi_persistfd, buf, 0);
+	} while (ret == -1 && errno == EINTR);
+	if (ret != 0) {
+		switch (errno) {
+		case ENOENT:
+			break;
+		default:
+			libvarpd_panic("failed to unlinkat %d`%s: %s",
+			    vip->vdi_persistfd, buf, strerror(errno));
+		}
+	}
+
+	VERIFY0(rw_unlock(&vip->vdi_pfdlock));
+}
+
+/*
+ * Recreate a varpd instance from the unpacked persistence nvlist `nvl`.
+ *
+ * The list must carry the keys written by libvarpd_persist_instance().  The
+ * named plugin must be registered and its mode must match the saved one, and
+ * the datalink must still exist and want the saved destination type.  The
+ * instance is then allocated with its original id, handed to the plugin's
+ * vpo_restore entry point, inserted into both instance trees, started and
+ * associated with the overlay device.
+ *
+ * Returns 0 on success and EINVAL for every recoverable failure.  Several
+ * conditions are fatal instead; see the libvarpd_panic() calls below.
+ */
+static int
+libvarpd_persist_restore_instance(varpd_impl_t *vip, nvlist_t *nvl)
+{
+ int err;
+ nvlist_t *pvl;
+ uint64_t id, flags, vid;
+ uint32_t linkid, dest, mode;
+ char *pluginstr;
+ varpd_plugin_t *plugin;
+ overlay_plugin_dest_t adest;
+ varpd_instance_t *inst, lookup;
+
+ /* Pull out every saved property; a missing one invalidates the file. */
+ if (nvlist_lookup_uint64(nvl, "vri_id", &id) != 0)
+ return (EINVAL);
+
+ if (nvlist_lookup_uint32(nvl, "vri_linkid", &linkid) != 0)
+ return (EINVAL);
+
+ if (nvlist_lookup_uint32(nvl, "vri_dest", &dest) != 0)
+ return (EINVAL);
+
+ if (nvlist_lookup_uint32(nvl, "vri_mode", &mode) != 0)
+ return (EINVAL);
+
+ if (nvlist_lookup_string(nvl, "vri_plugin", &pluginstr) != 0)
+ return (EINVAL);
+
+ if (nvlist_lookup_nvlist(nvl, "vri_private", &pvl) != 0)
+ return (EINVAL);
+
+ plugin = libvarpd_plugin_lookup(vip, pluginstr);
+ if (plugin == NULL)
+ return (EINVAL);
+
+ if (plugin->vpp_mode != mode)
+ return (EINVAL);
+
+ /* Cross-check the saved state against what the driver reports now. */
+ if (libvarpd_overlay_info(vip, linkid, &adest, &flags, &vid) != 0)
+ return (EINVAL);
+
+ if (dest != adest)
+ return (EINVAL);
+
+ inst = umem_alloc(sizeof (varpd_instance_t), UMEM_DEFAULT);
+ if (inst == NULL)
+ libvarpd_panic("failed to allocate instance for restore");
+
+ /* The instance must come back with the very id it was saved under. */
+ inst->vri_id = id_alloc_specific(vip->vdi_idspace, id);
+ if (inst->vri_id != id) {
+ umem_free(inst, sizeof (varpd_instance_t));
+ return (EINVAL);
+ }
+
+ inst->vri_linkid = linkid;
+ inst->vri_vnetid = vid;
+ inst->vri_mode = plugin->vpp_mode;
+ inst->vri_dest = dest;
+ inst->vri_plugin = plugin;
+ inst->vri_impl = vip;
+ inst->vri_flags = 0;
+ if (plugin->vpp_ops->vpo_restore(pvl, (varpd_provider_handle_t *)inst,
+ dest, &inst->vri_private) != 0) {
+ id_free(vip->vdi_idspace, id);
+ umem_free(inst, sizeof (varpd_instance_t));
+ return (EINVAL);
+ }
+
+ if (mutex_init(&inst->vri_lock, USYNC_THREAD | LOCK_ERRORCHECK,
+ NULL) != 0)
+ libvarpd_panic("failed to create vri_lock mutex");
+
+ /* Insert into both trees; finding a duplicate in either is fatal. */
+ mutex_enter(&vip->vdi_lock);
+ lookup.vri_id = inst->vri_id;
+ if (avl_find(&vip->vdi_instances, &lookup, NULL) != NULL)
+ libvarpd_panic("found duplicate instance with id %d",
+ lookup.vri_id);
+ avl_add(&vip->vdi_instances, inst);
+ lookup.vri_linkid = inst->vri_linkid;
+ if (avl_find(&vip->vdi_linstances, &lookup, NULL) != NULL)
+ libvarpd_panic("found duplicate linstance with id %d",
+ lookup.vri_linkid);
+ avl_add(&vip->vdi_linstances, inst);
+ mutex_exit(&vip->vdi_lock);
+
+ if (plugin->vpp_ops->vpo_start(inst->vri_private) != 0) {
+ libvarpd_instance_destroy((varpd_instance_handle_t *)inst);
+ return (EINVAL);
+ }
+
+ /* Drop any association the driver still has before making a new one. */
+ if (flags & OVERLAY_TARG_INFO_F_ACTIVE)
+ (void) libvarpd_overlay_disassociate(inst);
+
+ if (libvarpd_overlay_associate(inst) != 0) {
+ libvarpd_instance_destroy((varpd_instance_handle_t *)inst);
+ return (EINVAL);
+ }
+
+ /* The link was marked degraded earlier; clear that now it has a target. */
+ if (flags & OVERLAY_TARG_INFO_F_DEGRADED) {
+ if ((err = libvarpd_overlay_restore(inst)) != 0) {
+ libvarpd_panic("failed to restore instance %p: %d\n",
+ inst, err);
+ }
+ }
+
+ mutex_enter(&inst->vri_lock);
+ inst->vri_flags |= VARPD_INSTANCE_F_ACTIVATED;
+ mutex_exit(&inst->vri_lock);
+
+ return (0);
+}
+
+/*
+ * Validate and restore a single persistence file that is already open as
+ * `fd`.
+ *
+ * The file must be larger than a varpd_persist_header_t, start with the
+ * expected magic and version, and the MD5 in the header must match the data
+ * that follows it.  The data is then unpacked into an nvlist and handed to
+ * libvarpd_persist_restore_instance().
+ *
+ * Returns 0 on success, the errno from fstat(2)/mmap(2), or EINVAL if the
+ * file is malformed or the instance could not be restored.  Failing to unmap
+ * the file is fatal.
+ */
+static int
+libvarpd_persist_restore_one(varpd_impl_t *vip, int fd)
+{
+	varpd_persist_header_t *hdr;
+	struct stat st;
+	uint8_t digest[16];
+	nvlist_t *nvl;
+	void *base, *payload;
+	size_t payloadsz;
+	int err = EINVAL;
+
+	if (fstat(fd, &st) != 0)
+		return (errno);
+
+	if (st.st_size <= sizeof (varpd_persist_header_t))
+		return (EINVAL);
+	payloadsz = st.st_size - sizeof (varpd_persist_header_t);
+
+	base = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (base == MAP_FAILED)
+		return (errno);
+
+	hdr = base;
+	payload = (void *)((uintptr_t)base + sizeof (varpd_persist_header_t));
+
+	/* err stays EINVAL unless every check passes and the unpack works. */
+	if (bcmp(varpd_persist_magic, hdr->vph_magic,
+	    sizeof (varpd_persist_magic)) == 0 &&
+	    hdr->vph_version == VARPD_PERSIST_VERSION_ONE) {
+		md5_calc(digest, payload, payloadsz);
+		if (bcmp(digest, hdr->vph_md5, sizeof (uint8_t) * 16) == 0)
+			err = nvlist_unpack(payload, payloadsz, &nvl, 0);
+	}
+
+	if (munmap(base, st.st_size) != 0)
+		libvarpd_panic("failed to munmap %p: %d", base, errno);
+
+	if (err != 0)
+		return (EINVAL);
+
+	err = libvarpd_persist_restore_instance(vip, nvl);
+	nvlist_free(nvl);
+	return (err);
+}
+
+/*
+ * libvarpd_overlay_iter() callback: if no instance in vdi_instances is bound
+ * to `linkid`, ask the driver to mark that datalink as degraded (best
+ * effort).  Always returns 0 so that the iteration carries on.
+ */
+/* ARGSUSED */
+static int
+libvarpd_check_degrade_cb(varpd_impl_t *vip, datalink_id_t linkid, void *arg)
+{
+	varpd_instance_t *cur;
+	int found = 0;
+
+	mutex_enter(&vip->vdi_lock);
+	cur = avl_first(&vip->vdi_instances);
+	while (cur != NULL) {
+		if (cur->vri_linkid == linkid) {
+			found = 1;
+			break;
+		}
+		cur = AVL_NEXT(&vip->vdi_instances, cur);
+	}
+	mutex_exit(&vip->vdi_lock);
+
+	if (!found) {
+		(void) libvarpd_overlay_degrade_datalink(vip, linkid,
+		    "no varpd instance exists");
+	}
+
+	return (0);
+}
+
+/*
+ * Run libvarpd_check_degrade_cb() over every datalink the overlay device
+ * reports.  The result of the iteration is deliberately ignored.
+ */
+static void
+libvarpd_check_degrade(varpd_impl_t *vip)
+{
+ (void) libvarpd_overlay_iter(vip, libvarpd_check_degrade_cb, NULL);
+}
+
+/*
+ * Restore every instance saved in the persistence directory.
+ *
+ * Each regular file named "<number>.varpd" is handed to
+ * libvarpd_persist_restore_one(); a file that cannot be restored is unlinked
+ * so that it does not get in the way again.  Everything else in the directory
+ * is left alone.  Once the directory has been walked, datalinks without an
+ * instance are marked degraded.
+ *
+ * Returns 0 when the walk reached the end of the directory, otherwise the
+ * errno of the operation that cut it short.  vdi_pfdlock is held as a reader
+ * for the duration of the call.
+ */
+int
+libvarpd_persist_restore(varpd_handle_t *vhp)
+{
+	int dirfd;
+	int ret = 0;
+	DIR *dirp = NULL;
+	struct dirent *dp;
+	varpd_impl_t *vip = (varpd_impl_t *)vhp;
+
+	VERIFY0(rw_rdlock(&vip->vdi_pfdlock));
+	/*
+	 * fdopendir() takes ownership of the descriptor it is given, so walk
+	 * a duplicate and keep vdi_persistfd itself intact.
+	 */
+	if ((dirfd = dup(vip->vdi_persistfd)) < 0) {
+		ret = errno;
+		goto out;
+	}
+
+	if ((dirp = fdopendir(dirfd)) == NULL) {
+		ret = errno;
+		if (close(dirfd) != 0)
+			libvarpd_panic("failed to close dirfd %d: %d",
+			    dirfd, errno);
+		goto out;
+	}
+
+	for (;;) {
+		int fd;
+		uint64_t id;
+		char *eptr;
+		struct stat st;
+
+		/* readdir() returns NULL for both EOF and error; use errno. */
+		errno = 0;
+		dp = readdir(dirp);
+		if (dp == NULL) {
+			ret = errno;
+			break;
+		}
+
+		if (strcmp(dp->d_name, ".") == 0 ||
+		    strcmp(dp->d_name, "..") == 0)
+			continue;
+
+		/*
+		 * Leave files that we don't recognize alone. A valid file has
+		 * the format `%llu.varpd`.
+		 */
+		errno = 0;
+		id = strtoull(dp->d_name, &eptr, 10);
+		if ((id == 0 && errno == EINVAL) ||
+		    (id == ULLONG_MAX && errno == ERANGE))
+			continue;
+
+		/* No digits were consumed at all, e.g. a bare ".varpd". */
+		if (eptr == dp->d_name)
+			continue;
+
+		if (strcmp(eptr, VARPD_PERSIST_SUFFIX) != 0)
+			continue;
+
+		fd = openat(vip->vdi_persistfd, dp->d_name, O_RDONLY);
+		if (fd < 0) {
+			ret = errno;
+			break;
+		}
+
+		if (fstat(fd, &st) != 0) {
+			ret = errno;
+			/* Don't leak the file descriptor on the way out. */
+			if (close(fd) != 0)
+				libvarpd_panic("failed to close fd (%s) %d: "
+				    "%d\n", dp->d_name, fd, errno);
+			break;
+		}
+
+		if (!S_ISREG(st.st_mode)) {
+			if (close(fd) != 0)
+				libvarpd_panic("failed to close fd (%s) %d: "
+				    "%d\n", dp->d_name, fd, errno);
+			continue;
+		}
+
+		ret = libvarpd_persist_restore_one(vip, fd);
+		if (close(fd) != 0)
+			libvarpd_panic("failed to close fd (%s) %d: "
+			    "%d\n", dp->d_name, fd, errno);
+		/*
+		 * This is an invalid file. We'll unlink it to save us this
+		 * trouble in the future.
+		 */
+		if (ret != 0) {
+			if (unlinkat(vip->vdi_persistfd, dp->d_name, 0) != 0) {
+				ret = errno;
+				break;
+			}
+		}
+	}
+
+	libvarpd_check_degrade(vip);
+
+out:
+	if (dirp != NULL)
+		(void) closedir(dirp);
+	VERIFY0(rw_unlock(&vip->vdi_pfdlock));
+	return (ret);
+}
+
+/*
+ * Disable persistence for the handle by closing the persistence directory.
+ *
+ * Returns 0 on success or ENOENT if persistence was not enabled.  Failing to
+ * close the directory descriptor is fatal.  Only vdi_pfdlock is taken here
+ * (as a writer); vdi_lock is never acquired by this function and so must not
+ * be released by it either.
+ */
+int
+libvarpd_persist_disable(varpd_handle_t *vhp)
+{
+	varpd_impl_t *vip = (varpd_impl_t *)vhp;
+
+	VERIFY0(rw_wrlock(&vip->vdi_pfdlock));
+	if (vip->vdi_persistfd == -1) {
+		VERIFY0(rw_unlock(&vip->vdi_pfdlock));
+		return (ENOENT);
+	}
+	if (close(vip->vdi_persistfd) != 0)
+		libvarpd_panic("failed to close persist fd %d: %d",
+		    vip->vdi_persistfd, errno);
+	vip->vdi_persistfd = -1;
+	VERIFY0(rw_unlock(&vip->vdi_pfdlock));
+	return (0);
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_plugin.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_plugin.c
new file mode 100644
index 0000000000..176306a3f7
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_plugin.c
@@ -0,0 +1,256 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * varpd plugin management
+ */
+
+#include <libvarpd_impl.h>
+#include <errno.h>
+#include <umem.h>
+#include <assert.h>
+#include <strings.h>
+#include <dlfcn.h>
+#include <link.h>
+#include <stdio.h>
+
+static varpd_impl_t *varpd_load_handle;
+static const char *varpd_load_path;
+static mutex_t varpd_load_lock;
+static cond_t varpd_load_cv;
+
+/*
+ * Order two varpd_plugin_t entries by their vpp_name. The strcmp() result is
+ * normalized so that only -1, 0, or 1 is ever returned.
+ */
+int
+libvarpd_plugin_comparator(const void *lp, const void *rp)
+{
+	const varpd_plugin_t *left = lp;
+	const varpd_plugin_t *right = rp;
+	int cmp = strcmp(left->vpp_name, right->vpp_name);
+
+	if (cmp == 0)
+		return (0);
+	return (cmp > 0 ? 1 : -1);
+}
+
+/*
+ * Allocate a registration structure for a plugin.
+ *
+ * version must be VARPD_VERSION_ONE. errp may be NULL; when it is not, it
+ * receives EINVAL for an unsupported version or ENOMEM when the allocation
+ * fails. Returns the new structure with vpr_version filled in, or NULL on
+ * failure. The remaining members are not initialized here.
+ */
+varpd_plugin_register_t *
+libvarpd_plugin_alloc(uint_t version, int *errp)
+{
+	int scratch;
+	varpd_plugin_register_t *regp;
+
+	/* Give ourselves somewhere to store the error if the caller did not. */
+	if (errp == NULL)
+		errp = &scratch;
+
+	if (version != VARPD_VERSION_ONE) {
+		(void) fprintf(stderr,
+		    "unsupported registration version %u - %s\n",
+		    version, varpd_load_path);
+		*errp = EINVAL;
+		return (NULL);
+	}
+
+	regp = umem_alloc(sizeof (varpd_plugin_register_t), UMEM_DEFAULT);
+	if (regp != NULL) {
+		regp->vpr_version = VARPD_VERSION_ONE;
+		return (regp);
+	}
+
+	(void) fprintf(stderr,
+	    "failed to allocate registration handle - %s\n",
+	    varpd_load_path);
+	*errp = ENOMEM;
+	return (NULL);
+}
+
+/*
+ * Release a registration structure obtained from libvarpd_plugin_alloc().
+ * A NULL pointer is accepted and ignored so that callers may free
+ * unconditionally, even after a failed allocation.
+ */
+void
+libvarpd_plugin_free(varpd_plugin_register_t *vprp)
+{
+	if (vprp == NULL)
+		return;
+	umem_free(vprp, sizeof (varpd_plugin_register_t));
+}
+
+/*
+ * Register the plugin described by vprp with the handle that is currently
+ * loading plugins (varpd_load_handle).
+ *
+ * Returns 0 on success, EINVAL if the registration version is unsupported,
+ * ENOMEM if memory cannot be allocated, and EEXIST if a plugin with the same
+ * name is already present. It is fatal for this to be called while no load
+ * is in progress.
+ */
+int
+libvarpd_plugin_register(varpd_plugin_register_t *vprp)
+{
+	varpd_plugin_t *vpp;
+	varpd_plugin_t lookup;
+
+	/*
+	 * Watch out for an evil plugin. This is checked before anything is
+	 * allocated so that a rejected registration has nothing to clean up.
+	 */
+	if (vprp->vpr_version != VARPD_VERSION_ONE) {
+		(void) fprintf(stderr,
+		    "unsupported registration version %u - %s\n",
+		    vprp->vpr_version, varpd_load_path);
+		return (EINVAL);
+	}
+
+	vpp = umem_alloc(sizeof (varpd_plugin_t), UMEM_DEFAULT);
+	if (vpp == NULL) {
+		(void) fprintf(stderr,
+		    "failed to allocate memory for the varpd_plugin_t - %s\n",
+		    varpd_load_path);
+		return (ENOMEM);
+	}
+
+	mutex_enter(&varpd_load_lock);
+	if (varpd_load_handle == NULL)
+		libvarpd_panic("varpd_load_handle was unexpectedly null");
+
+	/* Plugin names must be unique within a handle. */
+	mutex_enter(&varpd_load_handle->vdi_lock);
+	lookup.vpp_name = vprp->vpr_name;
+	if (avl_find(&varpd_load_handle->vdi_plugins, &lookup, NULL) != NULL) {
+		(void) fprintf(stderr,
+		    "module already exists with requested name '%s' - %s\n",
+		    vprp->vpr_name, varpd_load_path);
+		mutex_exit(&varpd_load_handle->vdi_lock);
+		mutex_exit(&varpd_load_lock);
+		umem_free(vpp, sizeof (varpd_plugin_t));
+		return (EEXIST);
+	}
+
+	/* Keep a private copy of the name rather than the plugin's pointer. */
+	vpp->vpp_name = strdup(vprp->vpr_name);
+	if (vpp->vpp_name == NULL) {
+		(void) fprintf(stderr,
+		    "failed to allocate memory to duplicate name - %s\n",
+		    varpd_load_path);
+		mutex_exit(&varpd_load_handle->vdi_lock);
+		mutex_exit(&varpd_load_lock);
+		umem_free(vpp, sizeof (varpd_plugin_t));
+		return (ENOMEM);
+	}
+
+	vpp->vpp_mode = vprp->vpr_mode;
+	vpp->vpp_ops = vprp->vpr_ops;
+	if (mutex_init(&vpp->vpp_lock, USYNC_THREAD | LOCK_ERRORCHECK,
+	    NULL) != 0)
+		libvarpd_panic("failed to create plugin's vpp_lock");
+	vpp->vpp_active = 0;
+	avl_add(&varpd_load_handle->vdi_plugins, vpp);
+	mutex_exit(&varpd_load_handle->vdi_lock);
+	mutex_exit(&varpd_load_lock);
+
+	return (0);
+}
+
+/*
+ * Find the plugin registered under name on the given handle. The search is
+ * done with vdi_lock held. Returns the plugin, or NULL if there is no match.
+ */
+varpd_plugin_t *
+libvarpd_plugin_lookup(varpd_impl_t *vip, const char *name)
+{
+	varpd_plugin_t key;
+	varpd_plugin_t *found;
+
+	/* Only the name needs to be filled in on the search key. */
+	key.vpp_name = name;
+
+	mutex_enter(&vip->vdi_lock);
+	found = avl_find(&vip->vdi_plugins, &key, NULL);
+	mutex_exit(&vip->vdi_lock);
+
+	return (found);
+}
+
+/*
+ * Directory walk callback: dlopen() a single candidate plugin. While the
+ * dlopen() is in progress, varpd_load_path names the object being loaded so
+ * that the registration routines can include it in their diagnostics.
+ *
+ * A failed dlopen() is reported on stderr but is not treated as an error;
+ * this always returns 0 so that the walk continues.
+ */
+/* ARGSUSED */
+static int
+libvarpd_plugin_load_cb(varpd_impl_t *vip, const char *path, void *unused)
+{
+	void *dlp;
+
+	varpd_load_path = path;
+	dlp = dlopen(path, RTLD_LOCAL | RTLD_NOW);
+	if (dlp == NULL)
+		(void) fprintf(stderr, "dlopen failed - %s\n", path);
+
+	/*
+	 * The path string belongs to our caller, so do not leave the global
+	 * pointing at it once we return.
+	 */
+	varpd_load_path = NULL;
+
+	return (0);
+}
+
+/*
+ * Load every ".so" object found by libvarpd_dirwalk() under path on behalf of
+ * the given handle. Only one load may be active at a time: callers wait on
+ * varpd_load_cv until varpd_load_handle is free, claim it for the duration of
+ * the walk, and then release it and signal a waiter.
+ *
+ * Returns EINVAL if either argument is NULL; otherwise the result of the
+ * directory walk.
+ */
+int
+libvarpd_plugin_load(varpd_handle_t *vph, const char *path)
+{
+	varpd_impl_t *vip = (varpd_impl_t *)vph;
+	int err;
+
+	if (vip == NULL)
+		return (EINVAL);
+	if (path == NULL)
+		return (EINVAL);
+
+	/* Wait for our turn and then mark this handle as the active loader. */
+	mutex_enter(&varpd_load_lock);
+	for (;;) {
+		if (varpd_load_handle == NULL)
+			break;
+		(void) cond_wait(&varpd_load_cv, &varpd_load_lock);
+	}
+	varpd_load_handle = vip;
+	mutex_exit(&varpd_load_lock);
+
+	err = libvarpd_dirwalk(vip, path, ".so", libvarpd_plugin_load_cb, NULL);
+
+	/* Give up the loader slot and let the next waiter proceed. */
+	mutex_enter(&varpd_load_lock);
+	varpd_load_handle = NULL;
+	(void) cond_signal(&varpd_load_cv);
+	mutex_exit(&varpd_load_lock);
+
+	return (err);
+}
+
+/*
+ * Invoke func once for each plugin registered on the handle, passing the
+ * handle, the plugin's name, and arg. The callback runs with vdi_lock held.
+ *
+ * The walk stops at the first callback that returns non-zero, in which case
+ * 1 is returned. If every callback returns zero, 0 is returned.
+ */
+int
+libvarpd_plugin_walk(varpd_handle_t *vph, libvarpd_plugin_walk_f func,
+    void *arg)
+{
+	varpd_impl_t *vip = (varpd_impl_t *)vph;
+	varpd_plugin_t *cur;
+	int stopped = 0;
+
+	mutex_enter(&vip->vdi_lock);
+	cur = avl_first(&vip->vdi_plugins);
+	while (cur != NULL) {
+		if (func(vph, cur->vpp_name, arg) != 0) {
+			stopped = 1;
+			break;
+		}
+		cur = AVL_NEXT(&vip->vdi_plugins, cur);
+	}
+	mutex_exit(&vip->vdi_lock);
+
+	return (stopped);
+}
+
+/*
+ * Set up the global plugin-loading state: the load lock, its condition
+ * variable, and an empty loader slot. Any initialization failure is fatal.
+ */
+void
+libvarpd_plugin_init(void)
+{
+	int lockret, cvret;
+
+	lockret = mutex_init(&varpd_load_lock, USYNC_THREAD | LOCK_RECURSIVE |
+	    LOCK_ERRORCHECK, NULL);
+	if (lockret != 0)
+		libvarpd_panic("failed to create varpd_load_lock");
+
+	cvret = cond_init(&varpd_load_cv, USYNC_THREAD, NULL);
+	if (cvret != 0)
+		libvarpd_panic("failed to create varpd_load_cv");
+
+	varpd_load_handle = NULL;
+}
+
+/*
+ * Tear down the global plugin-loading state. No load may be in progress when
+ * this is called. A failure to destroy either object is fatal.
+ */
+void
+libvarpd_plugin_fini(void)
+{
+	int lockret, cvret;
+
+	assert(varpd_load_handle == NULL);
+
+	lockret = mutex_destroy(&varpd_load_lock);
+	if (lockret != 0)
+		libvarpd_panic("failed to destroy varpd_load_lock");
+
+	cvret = cond_destroy(&varpd_load_cv);
+	if (cvret != 0)
+		libvarpd_panic("failed to destroy varpd_load_cv");
+}
+
+/*
+ * Take varpd_load_lock and wait until no plugin load is in progress (that is,
+ * until varpd_load_handle is NULL). Note that this returns with
+ * varpd_load_lock still held; nothing here releases it.
+ * NOTE(review): presumably paired with libvarpd_plugin_postfork() around a
+ * fork -- confirm against the callers.
+ */
+void
+libvarpd_plugin_prefork(void)
+{
+ mutex_enter(&varpd_load_lock);
+ while (varpd_load_handle != NULL)
+ (void) cond_wait(&varpd_load_cv, &varpd_load_lock);
+}
+
+/*
+ * Signal one waiter on varpd_load_cv and release varpd_load_lock. The lock is
+ * not acquired here, so the caller must already hold it.
+ * NOTE(review): presumably the lock was taken by libvarpd_plugin_prefork() --
+ * confirm against the callers.
+ */
+void
+libvarpd_plugin_postfork(void)
+{
+ (void) cond_signal(&varpd_load_cv);
+ mutex_exit(&varpd_load_lock);
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_prop.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_prop.c
new file mode 100644
index 0000000000..f3a1492408
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_prop.c
@@ -0,0 +1,300 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * varpd property management
+ */
+
+#include <libvarpd_impl.h>
+#include <errno.h>
+#include <strings.h>
+#include <sys/mac.h>
+#include <umem.h>
+
+/*
+ * Backing storage for a varpd_prop_handle_t. Property handles passed to the
+ * libvarpd_prop_* routines in this file are cast to this structure.
+ */
+typedef struct varpd_prop_info {
+ /* Handle and instance given to libvarpd_prop_handle_alloc() */
+ varpd_impl_t *vprop_vip;
+ varpd_instance_t *vprop_instance;
+ /* Property type, stored by libvarpd_prop_set_type() */
+ uint_t vprop_type;
+ /* Property protection, stored by libvarpd_prop_set_prot() */
+ uint_t vprop_prot;
+ /* Number of valid bytes in vprop_default */
+ uint32_t vprop_defsize;
+ /* Number of bytes of vprop_poss in use, including the range header */
+ uint32_t vprop_psize;
+ /* Property name, stored by libvarpd_prop_set_name() */
+ char vprop_name[LIBVARPD_PROP_NAMELEN];
+ /* Default value, stored by libvarpd_prop_set_default() */
+ uint8_t vprop_default[LIBVARPD_PROP_SIZEMAX];
+ /* Possible values; accessed as a mac_propval_range_t */
+ uint8_t vprop_poss[LIBVARPD_PROP_SIZEMAX];
+} varpd_prop_info_t;
+
+/*
+ * Internal Properties
+ *
+ * These are properties that libvarpd answers itself rather than handing to
+ * the plugin. They occupy property ids [0, varpd_nintprops); plugin property
+ * ids are offset by varpd_nintprops. varpd_nintprops must match the number of
+ * entries in varpd_intprops.
+ */
+static int varpd_nintprops = 1;
+static const char *varpd_intprops[] = {
+ "search"
+};
+
+/*
+ * Fetch the value of the internal "search" property, which is the name of the
+ * plugin backing the instance.
+ *
+ * On entry *sizep is the size of buf. If the name, including its terminating
+ * NUL, does not fit, EOVERFLOW is returned and nothing is written. Otherwise
+ * the name is copied into buf, *sizep is updated to the number of bytes used,
+ * and 0 is returned.
+ */
+static int
+libvarpd_prop_get_search(varpd_prop_info_t *infop, void *buf, uint32_t *sizep)
+{
+	varpd_plugin_t *plugin = infop->vprop_instance->vri_plugin;
+	size_t need = strlen(plugin->vpp_name) + 1;
+
+	if (need > *sizep)
+		return (EOVERFLOW);
+
+	*sizep = need;
+	(void) strlcpy(buf, plugin->vpp_name, *sizep);
+	return (0);
+}
+
+/*
+ * Record the name of the property described by phdl. The name is truncated,
+ * if necessary, to fit the handle's name buffer and is always NUL terminated.
+ */
+void
+libvarpd_prop_set_name(varpd_prop_handle_t *phdl, const char *name)
+{
+	varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl;
+
+	/*
+	 * Bound the copy by the destination itself rather than by a constant
+	 * that merely happens to be the same size.
+	 */
+	(void) strlcpy(infop->vprop_name, name, sizeof (infop->vprop_name));
+}
+
+/*
+ * Record the protection (permissions) of the property described by phdl.
+ */
+void
+libvarpd_prop_set_prot(varpd_prop_handle_t *phdl, overlay_prop_prot_t perm)
+{
+	((varpd_prop_info_t *)phdl)->vprop_prot = perm;
+}
+
+/*
+ * Record the type of the property described by phdl.
+ */
+void
+libvarpd_prop_set_type(varpd_prop_handle_t *phdl, overlay_prop_type_t type)
+{
+	((varpd_prop_info_t *)phdl)->vprop_type = type;
+}
+
+/*
+ * Record the default value of the property described by phdl by copying len
+ * bytes from buf.
+ *
+ * Returns E2BIG if len exceeds LIBVARPD_PROP_SIZEMAX, EOVERFLOW if len is
+ * negative, and 0 once the value and its size have been stored.
+ */
+int
+libvarpd_prop_set_default(varpd_prop_handle_t *phdl, void *buf, ssize_t len)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+
+	if (len > LIBVARPD_PROP_SIZEMAX) {
+		return (E2BIG);
+	} else if (len < 0) {
+		return (EOVERFLOW);
+	}
+
+	bcopy(buf, pip->vprop_default, len);
+	pip->vprop_defsize = len;
+
+	return (0);
+}
+
+/*
+ * Mark the property described by phdl as having no default value: the
+ * default size becomes zero and the first byte of the default is cleared.
+ */
+void
+libvarpd_prop_set_nodefault(varpd_prop_handle_t *phdl)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+
+	pip->vprop_defsize = 0;
+	pip->vprop_default[0] = '\0';
+}
+
+/*
+ * Append a [min, max] uint32 range to the set of possible values of the
+ * property described by phdl.
+ *
+ * The request is silently ignored if ranges of a different type have already
+ * been recorded, or if there is no room left for another entry.
+ */
+void
+libvarpd_prop_set_range_uint32(varpd_prop_handle_t *phdl, uint32_t min,
+    uint32_t max)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+	mac_propval_range_t *rp = (mac_propval_range_t *)pip->vprop_poss;
+	const size_t entsz = sizeof (mac_propval_uint32_range_t);
+	uint_t slot;
+
+	/* All entries in a single range set must share one type. */
+	if (rp->mpr_count != 0 && rp->mpr_type != MAC_PROPVAL_UINT32)
+		return;
+
+	if (pip->vprop_psize + entsz > sizeof (pip->vprop_poss))
+		return;
+
+	slot = rp->mpr_count;
+	rp->mpr_type = MAC_PROPVAL_UINT32;
+	rp->mpr_count++;
+	rp->u.mpr_uint32[slot].mpur_min = min;
+	rp->u.mpr_uint32[slot].mpur_max = max;
+	pip->vprop_psize += entsz;
+}
+
+/*
+ * Append str to the set of possible string values of the property described
+ * by phdl. Strings are packed back to back, each with its NUL terminator,
+ * starting at the string range's mpur_nextbyte offset.
+ *
+ * The request is silently ignored if ranges of a different type have already
+ * been recorded, or if the string does not fit in the remaining space.
+ */
+void
+libvarpd_prop_set_range_str(varpd_prop_handle_t *phdl, const char *str)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+	mac_propval_range_t *rp = (mac_propval_range_t *)pip->vprop_poss;
+	mac_propval_str_range_t *sp = &rp->u.mpr_str;
+	/* Bytes required, accounting for the null terminator */
+	size_t need = strlen(str) + 1;
+	char *dst;
+
+	if (rp->mpr_count != 0 && rp->mpr_type != MAC_PROPVAL_STR)
+		return;
+
+	if (pip->vprop_psize + need > sizeof (pip->vprop_poss))
+		return;
+
+	rp->mpr_count++;
+	rp->mpr_type = MAC_PROPVAL_STR;
+
+	dst = (char *)&sp->mpur_data[sp->mpur_nextbyte];
+	(void) strlcpy(dst, str, sizeof (pip->vprop_poss) - pip->vprop_psize);
+
+	sp->mpur_nextbyte += need;
+	pip->vprop_psize += need;
+}
+
+/*
+ * Allocate a zeroed property handle bound to the given varpd handle and
+ * instance, and hand it back through phdlp.
+ *
+ * Returns 0 on success, or ENOMEM if the allocation fails, in which case
+ * *phdlp is left untouched.
+ */
+int
+libvarpd_prop_handle_alloc(varpd_handle_t *vph, varpd_instance_handle_t *inst,
+    varpd_prop_handle_t **phdlp)
+{
+	varpd_prop_info_t *pip;
+
+	pip = umem_zalloc(sizeof (varpd_prop_info_t), UMEM_DEFAULT);
+	if (pip == NULL)
+		return (ENOMEM);
+
+	pip->vprop_instance = (varpd_instance_t *)inst;
+	pip->vprop_vip = (varpd_impl_t *)vph;
+	*phdlp = (varpd_prop_handle_t *)pip;
+
+	return (0);
+}
+
+/*
+ * Release a property handle obtained from libvarpd_prop_handle_alloc().
+ */
+void
+libvarpd_prop_handle_free(varpd_prop_handle_t *phdl)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+
+	umem_free(pip, sizeof (varpd_prop_info_t));
+}
+
+/*
+ * Report the total number of properties on an instance: the count reported by
+ * the plugin plus the number of internal properties.
+ *
+ * Returns 0 on success. If the plugin's vpo_nprops entry point fails, its
+ * error is returned and the internal properties are not added to *np.
+ */
+int
+libvarpd_prop_nprops(varpd_instance_handle_t *ihdl, uint_t *np)
+{
+	varpd_instance_t *instp = (varpd_instance_t *)ihdl;
+	int err;
+
+	err = instp->vri_plugin->vpp_ops->vpo_nprops(instp->vri_private, np);
+	if (err == 0)
+		*np += varpd_nintprops;
+
+	return (err);
+}
+
+/*
+ * Plugin walk callback: add the plugin's name to the possible string values
+ * of the property handle passed as arg. Always returns 0 so that the walk
+ * visits every plugin.
+ */
+/* ARGSUSED */
+static int
+libvarpd_prop_info_fill_int_cb(varpd_handle_t *handle, const char *name,
+    void *arg)
+{
+	libvarpd_prop_set_range_str((varpd_prop_handle_t *)arg, name);
+	return (0);
+}
+
+/*
+ * Fill in the property handle for an internal property. The handle is
+ * described as a read-only string property named after the first internal
+ * property, with no default, whose possible values are the names of all of
+ * the registered plugins.
+ *
+ * It is a programming error to pass a propid that is not an internal
+ * property; doing so aborts. Always returns 0.
+ */
+static int
+libvarpd_prop_info_fill_int(varpd_prop_handle_t *vph, uint_t propid)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)vph;
+	varpd_handle_t *owner;
+
+	if (propid >= varpd_nintprops)
+		abort();
+
+	libvarpd_prop_set_name(vph, varpd_intprops[0]);
+	libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_READ);
+	libvarpd_prop_set_type(vph, OVERLAY_PROP_T_STRING);
+	libvarpd_prop_set_nodefault(vph);
+
+	/* Every registered plugin name is a possible value. */
+	owner = (varpd_handle_t *)pip->vprop_instance->vri_impl;
+	(void) libvarpd_plugin_walk(owner, libvarpd_prop_info_fill_int_cb, vph);
+
+	return (0);
+}
+
+/*
+ * Populate the property handle with information about property propid.
+ *
+ * The possible-values area is reset to an empty range header first. Ids
+ * below varpd_nintprops are internal properties and are filled in here; any
+ * other id is passed to the plugin's vpo_propinfo entry point after being
+ * rebased by varpd_nintprops. Returns whatever the chosen routine returns.
+ */
+int
+libvarpd_prop_info_fill(varpd_prop_handle_t *phdl, uint_t propid)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+	varpd_instance_t *instp = pip->vprop_instance;
+	varpd_plugin_t *plugin;
+
+	pip->vprop_psize = sizeof (mac_propval_range_t);
+	bzero(pip->vprop_poss, sizeof (mac_propval_range_t));
+
+	if (propid < varpd_nintprops)
+		return (libvarpd_prop_info_fill_int(phdl, propid));
+
+	plugin = instp->vri_plugin;
+	return (plugin->vpp_ops->vpo_propinfo(instp->vri_private,
+	    propid - varpd_nintprops, phdl));
+}
+
+/*
+ * Read back the information stored in a property handle. Every output
+ * pointer is optional; a NULL pointer means the caller does not want that
+ * item. The returned pointers refer to storage inside the handle. Always
+ * returns 0.
+ *
+ * NOTE(review): sizep receives vprop_psize, the size of the possible-values
+ * data, even though it follows defp in the argument list. Confirm with the
+ * callers whether vprop_defsize was intended.
+ */
+int
+libvarpd_prop_info(varpd_prop_handle_t *phdl, const char **namep,
+    uint_t *typep, uint_t *protp, const void **defp, uint32_t *sizep,
+    const mac_propval_range_t **possp)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+
+	if (namep != NULL) {
+		*namep = pip->vprop_name;
+	}
+	if (typep != NULL) {
+		*typep = pip->vprop_type;
+	}
+	if (protp != NULL) {
+		*protp = pip->vprop_prot;
+	}
+	if (defp != NULL) {
+		*defp = pip->vprop_default;
+	}
+	if (sizep != NULL) {
+		*sizep = pip->vprop_psize;
+	}
+	if (possp != NULL) {
+		*possp = (mac_propval_range_t *)pip->vprop_poss;
+	}
+
+	return (0);
+}
+
+/*
+ * Get the current value of the property named in the handle.
+ *
+ * Returns EINVAL if the handle has no name. The internal "search" property
+ * is answered here; every other name is passed to the plugin's vpo_getprop
+ * entry point, whose result is returned.
+ */
+int
+libvarpd_prop_get(varpd_prop_handle_t *phdl, void *buf, uint32_t *sizep)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+	varpd_instance_t *instp = pip->vprop_instance;
+
+	if (pip->vprop_name[0] == '\0')
+		return (EINVAL);
+
+	if (strcmp(varpd_intprops[0], pip->vprop_name) != 0) {
+		/* Not one of ours, let the plugin answer. */
+		return (instp->vri_plugin->vpp_ops->vpo_getprop(
+		    instp->vri_private, pip->vprop_name, buf, sizep));
+	}
+
+	/* search property */
+	return (libvarpd_prop_get_search(pip, buf, sizep));
+}
+
+/*
+ * Set the value of the property named in the handle.
+ *
+ * Returns EINVAL if the handle has no name and EPERM if the name is one of
+ * the internal properties, which cannot be set. Otherwise the request is
+ * passed to the plugin's vpo_setprop entry point and its result is returned.
+ */
+int
+libvarpd_prop_set(varpd_prop_handle_t *phdl, const void *buf, uint32_t size)
+{
+	varpd_prop_info_t *pip = (varpd_prop_info_t *)phdl;
+	varpd_instance_t *instp = pip->vprop_instance;
+	int idx = 0;
+
+	if (pip->vprop_name[0] == '\0')
+		return (EINVAL);
+
+	while (idx < varpd_nintprops) {
+		if (strcmp(pip->vprop_name, varpd_intprops[idx]) == 0)
+			return (EPERM);
+		idx++;
+	}
+
+	return (instp->vri_plugin->vpp_ops->vpo_setprop(instp->vri_private,
+	    pip->vprop_name, buf, size));
+}
+
+/*
+ * Copy the contents of a property handle into a varpd_client_propinfo_arg_t.
+ * The name, default, and possible-value buffers are copied in full,
+ * regardless of how much of each is in use.
+ */
+void
+libvarpd_prop_door_convert(const varpd_prop_handle_t *phdl,
+    varpd_client_propinfo_arg_t *vcfap)
+{
+	const varpd_prop_info_t *pip = (const varpd_prop_info_t *)phdl;
+
+	/* Fixed-size buffers */
+	bcopy(pip->vprop_name, vcfap->vcfa_name, LIBVARPD_PROP_NAMELEN);
+	bcopy(pip->vprop_default, vcfap->vcfa_default, LIBVARPD_PROP_SIZEMAX);
+	bcopy(pip->vprop_poss, vcfap->vcfa_poss, LIBVARPD_PROP_SIZEMAX);
+
+	/* Scalar members */
+	vcfap->vcfa_type = pip->vprop_type;
+	vcfap->vcfa_prot = pip->vprop_prot;
+	vcfap->vcfa_defsize = pip->vprop_defsize;
+	vcfap->vcfa_psize = pip->vprop_psize;
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h
new file mode 100644
index 0000000000..c44a8f6941
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h
@@ -0,0 +1,417 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _LIBVARPD_PROVIDER_H
+#define _LIBVARPD_PROVIDER_H
+
+/*
+ * varpd provider interface for lookup modules
+ *
+ * This header file defines all the structures and functions that a given lookup
+ * module needs to implement and perform its purpose. At this time, all of these
+ * interfaces are considered private to illumos and therefore are subject to
+ * change. At some point we will move to more broadly stabilize these interfaces
+ * and commit to them. Until such time, expect breakage for out of gate
+ * consumers.
+ *
+ * A plugin is a dynamic shared object that is placed inside of varpd's default
+ * module directory.
+ *
+ * The shared object must define an initializer, such as with #pragma init. This
+ * function will be run when the module is dlopened by libvarpd. In that init
+ * function, the function must allocate a varpd_plugin_register by calling
+ * libvarpd_plugin_alloc() and specifying VARPD_CURRENT_VERSION. If that
+ * succeeds, then it should proceed to fill out the registration and then call
+ * libvarpd_plugin_register() with it. Regardless of whether it succeeds or
+ * fails, it should call libvarpd_plugin_free(). In the case of failure, there
+ * is not much that the module should do, other than log some message to
+ * stderr.
+ *
+ * Once libvarpd_plugin_register() returns, the module should assume that any
+ * of the operations it defined in the operation vector may be called and
+ * therefore it is recommended that any other required initialization should be
+ * performed at that time.
+ *
+ * At this time, once a plugin is loaded, it will not be unloaded. Therefore,
+ * there is no corresponding requirement to unregister, though that may come in
+ * a future version.
+ *
+ * -----------------------------
+ * Plugin Types and Destinations
+ * -----------------------------
+ *
+ * There are two different kinds of plugins in this world, there are point to
+ * point plugins and there are dynamic plugins. The key difference is in how
+ * packets are routed through the system. In a point to point plugin, a single
+ * destination is used when the instance is started. In dynamic plugins,
+ * destinations are looked up as they are required and an instance of a plugin
+ * is required to provide that.
+ *
+ * These point to point plugins define a type of OVERLAY_TARGET_POINT and the
+ * dynamic plugins instead define a type of OVERLAY_TARGET_DYNAMIC.
+ *
+ * Encapsulation plugins have multiple types of destinations. They may require
+ * an Ethernet address (OVERLAY_PLUGIN_D_ETHERNET), IP address
+ * (OVERLAY_PLUGIN_D_IP), and a port (OVERLAY_PLUGIN_D_PORT). For example,
+ * consider vxlan, it requires an IP and a port; while a hypothetical nvgre,
+ * would only require an IP.
+ *
+ * A plugin is allowed to describe which of these fields that it supports and
+ * given which encapsulation plugin it is paired with, it can support a varying
+ * degree of properties. For example, consider the example of the direct plugin.
+ * It has a notion of a destination port and a destination IP. If it is paired
+ * with a plugin that only requires an IP, then it wouldn't need to show a
+ * property that's related to a destination port.
+ *
+ * ------------------
+ * Plugin Definitions
+ * ------------------
+ *
+ * A plugin is required to fill in both an operations vector and a series of
+ * additional metadata that it passes in to libvarpd_plugin_register(). The
+ * following lists all of the routines and their purposes. The full signatures
+ * are available in the body of the header file.
+ *
+ * varpd_plugin_create_f
+ *
+ * Create a new instance of a plugin. Each instance refers to a different
+ * overlay device and thus a different overlay identifier. Each instance
+ * has its own property space and is unique. This function gives the chance
+ * for the plugin to create and provide any private data that it will
+ * require.
+ *
+ * In addition, the plugin is given the type of destination that is
+ * required and it is its job to determine whether or not it supports it.
+ *
+ * varpd_plugin_destroy_f
+ *
+ * This is the opposite of varpd_plugin_create_f. It is called to allow the
+ * plugin to reclaim any resources associated with the private argument that
+ * it passed out as part of the create function.
+ *
+ * varpd_plugin_start_f
+ *
+ * This routine is called to indicate that an instance should be started.
+ * This is a plugin's chance to verify that it has all of its required
+ * properties set and to take care of any action that needs to be handled
+ * to begin the plugin. After this point it will be legal to have the
+ * varpd_plugin_default_f, varpd_plugin_lookup_f, varpd_plugin_arp_f and
+ * varpd_plugin_dhcp_f endpoints called.
+ *
+ * varpd_plugin_stop_f
+ *
+ * This routine is called to indicate that an instance is stopping, it is
+ * the opposite of varpd_plugin_start_f. This is a chance to clean up
+ * resources that are a side effect of having started the instance.
+ *
+ * varpd_plugin_default_f
+ *
+ * This routine is defined by plugins of type OVERLAY_TARGET_POINT. It is
+ * used to answer the question of where should all traffic for this
+ * instance be destined. Plugins of type OVERLAY_TARGET_DYNAMIC should
+ * leave this entry set to NULL.
+ *
+ * On success, the default routine should return VARPD_LOOKUP_OK. On
+ * failure, it should return the macro VARPD_LOOKUP_DROP.
+ *
+ * varpd_plugin_lookup_f
+ *
+ * This routine must be defined by plugins of type OVERLAY_TARGET_DYNAMIC.
+ * It is used to look up the destination for a given request. Each request
+ * comes in with its own MAC address; this allows a plugin to direct it to
+ * any remote location.
+ *
+ * This is designed as an asynchronous API. Once a lookup is completed it
+ * should call libvarpd_plugin_query_reply() and pass as the second
+ * argument either VARPD_LOOKUP_OK to indicate that it went alright or it
+ * should reply VARPD_LOOKUP_DROP to indicate that the packet should be
+ * dropped.
+ *
+ * In addition, there are several utility routines that can take care of
+ * various kinds of traffic automatically. For example, if an ARP, NDP, or
+ * DHCP packet comes in, there are utilities such as
+ * libvarpd_plugin_proxy_arp(), libvarpd_plugin_proxy_ndp() and
+ * libvarpd_plugin_proxy_dhcp(), which allows the system to do the heavy
+ * lifting of validating the packet once it finds that it matches certain
+ * properties.
+ *
+ * varpd_plugin_arp_f
+ *
+ * This is an optional entry for plugins of type OVERLAY_TARGET_DYNAMIC.
+ * This is called after a plugin calls libvarpd_plugin_proxy_arp() and is
+ * used to ask the plugin to perform an ARP or NDP query. The type of query
+ * is passed in in the third argument, the only valid value for which will
+ * be VARPD_QTYPE_ETHERNET, to indicate we're doing an Ethernet lookup.
+ *
+ * The layer three IP address that is being looked up will be included in
+ * the struct sockaddr. The sockaddr(3SOCKET)'s sa_family will be set to
+ * indicate the type, eg. AF_INET or AF_INET6 and that will indicate the
+ * kind of sockaddr that will be used. For more information see
+ * sockaddr(3SOCKET). The implementation ensures that enough space for the
+ * link layer address will exist.
+ *
+ * This is an asynchronous lookup. Once the answer has been written, a
+ * plugin should call libvarpd_plugin_arp_reply and if it was successful,
+ * VARPD_LOOKUP_OK should be passed in and if it failed, VARPD_LOOKUP_DROP
+ * should be passed in instead.
+ *
+ * varpd_plugin_dhcp_f
+ *
+ * This is an optional entry for plugins of type OVERLAY_TARGET_DYNAMIC.
+ * This is called after a plugin calls the libvarpd_plugin_proxy_dhcp() and
+ * is used to ask the plugin to determine where is the DHCP server that
+ * this packet should actually be sent to. What is happening here is that
+ * rather than broadcast the initial DHCP request, we instead unicast it to
+ * a specified DHCP server that this operation vector indicates.
+ *
+ * The plugin is given a type, the same as the ARP plugin which indicates
+ * the kind of link layer address, the only valid type is
+ * VARPD_QTYPE_ETHERNET, other types should be rejected. Then, like the arp
+ * entry point, the dhcp entry point should determine the link layer
+ * address of the DHCP server and write that out in the appropriate memory
+ * and call libvarpd_plugin_dhcp_reply() when done. Similar to the arp
+ * entry point, it should use VARPD_LOOKUP_OK to indicate that it was
+ * filled in and VARPD_LOOKUP_DROP to indicate that it was not.
+ *
+ * varpd_plugin_nprops_f
+ *
+ * This is used by a plugin to indicate the number of properties that
+ * should exist for this instance. Recall from the section on Plugin
+ * Types and Destinations that the number of entries here may vary. As
+ * such, the plugin should return the number that is appropriate for the
+ * instance.
+ *
+ * This number will be used to obtain information about a property via the
+ * propinfo functions. However, the getprop and setprop interfaces will
+ * always use names to indicate the property it is getting and setting.
+ * This difference is structured this way to deal with property discovery
+ * and to make the getprop and setprop interfaces slightly easier for other
+ * parts of the broader varpd/dladm infrastructure.
+ *
+ * varpd_plugin_propinfo_f
+ *
+ * This interface is used to get information about a property, the property
+ * that information is being requested for is being passed in via the
+ * second argument. Here, callers should set properties such as the name,
+ * the protection, whether or not the property is required, set any default
+ * value, if it exists, and if relevant, set the valid range of values.
+ *
+ * varpd_plugin_getprop_f
+ *
+ * This is used to get the value of a property, if it is set. The passed in
+ * length indicates the length of the buffer that is used for updating
+ * properties. If it is not of sufficient size, the function should return
+ * an error and not update the buffer. Otherwise, it should update the size
+ * pointer with the valid size.
+ *
+ * varpd_plugin_setprop_f
+ *
+ * This is used to set the value of a property. An endpoint should validate
+ * that the property is valid before updating it. In addition, it should
+ * update its state as appropriate.
+ *
+ * varpd_plugin_save_f
+ *
+ * This is used to serialize the state of a given instance of a plugin such
+ * that if varpd crashes, it can be recovered. The plugin should write all
+ * state into the nvlist that it is passed in, it may use any keys and
+ * values that it wants. The only consumer of that nvlist will be the
+ * plugin itself when the restore endpoint is called.
+ *
+ * varpd_plugin_restore_f
+ *
+ * This is called by the server to restore an instance that used to exist,
+ * but was lost due to a crash. This is a combination of calling create and
+ * setting properties. The plugin should restore any private state that it
+ * can find recorded from the nvlist. The only items in the nvlist will be
+ * those that were written out during a previous call to
+ * varpd_plugin_save_f.
+ *
+ *
+ * Once all of these interfaces are implemented, the plugin should define the
+ * following members in the varpd_plugin_register_t.
+ *
+ * vpr_version
+ *
+ * This indicates the version of the plugin. Plugins should set this to the
+ * macro VARPD_CURRENT_VERSION.
+ *
+ * vpr_mode
+ *
+ * This indicates the mode of the plugin. The plugin's mode should be one
+ * of OVERLAY_TARGET_POINT and OVERLAY_TARGET_DYNAMIC. For more discussion
+ * of these types and the differences, see the section on Plugin Types and
+ * Destinations.
+ *
+ * vpr_name
+ *
+ * This is the name of the plugin. This is how users will refer to it in
+ * the context of running dladm(1M) commands. Note, this name must be
+ * unique across the different plugins, as it will cause others with the
+ * same name not to be registered.
+ *
+ * vpr_ops
+ *
+ * This is the operations vector as described above. Importantly, the
+ * member vpo_callbacks must be set to zero, this is being used for future
+ * expansion of the structure.
+ *
+ *
+ * --------------------------------------------------
+ * Downcalls, Upcalls, and Synchronization Guarantees
+ * --------------------------------------------------
+ *
+ * Every instance of a plugin is independent. Calls into a plugin may be made
+ * for different instances in parallel. Any necessary locking is left to the
+ * plugin module. Within an instance, various calls may come in parallel.
+ *
+ * The primary guarantees are that none of the varpd_plugin_save_f,
+ * varpd_plugin_lookup_f, varpd_plugin_default_f, varpd_plugin_arp_f, and
+ * varpd_plugin_dhcp_f will be called until after a call to varpd_plugin_start_f
+ * has been made. Similarly, they will not be called after a call to
+ * varpd_plugin_stop_f.
+ *
+ * The functions documented in this header may be called back into from any
+ * context, including from the operation vectors.
+ */
+
+#include <libvarpd.h>
+#include <libnvpair.h>
+#include <sys/socket.h>
+#include <sys/overlay_target.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Registration interface versions. Plugins should use VARPD_CURRENT_VERSION,
+ * which names the newest version defined here.
+ */
+#define VARPD_VERSION_ONE 1
+#define VARPD_CURRENT_VERSION VARPD_VERSION_ONE
+
+/*
+ * Opaque handles exchanged between libvarpd and plugins. The structures are
+ * not defined in this header.
+ * NOTE(review): the tag __varpd_provier_handle looks like a misspelling of
+ * "provider"; confirm that no other file depends on the tag before renaming.
+ */
+typedef struct __varpd_provier_handle varpd_provider_handle_t;
+typedef struct __varpd_query_handle varpd_query_handle_t;
+typedef struct __varpd_arp_handle varpd_arp_handle_t;
+typedef struct __varpd_dhcp_handle varpd_dhcp_handle_t;
+
+/*
+ * Instance lifecycle entry points. In each of the entry points below, the
+ * leading void * argument is the plugin's private data for the instance,
+ * which varpd_plugin_create_f passes out through its void ** argument.
+ */
+typedef int (*varpd_plugin_create_f)(varpd_provider_handle_t *, void **,
+ overlay_plugin_dest_t);
+typedef int (*varpd_plugin_start_f)(void *);
+typedef void (*varpd_plugin_stop_f)(void *);
+typedef void (*varpd_plugin_destroy_f)(void *);
+
+/*
+ * Destination lookup entry points and the result codes used to answer them.
+ */
+#define VARPD_LOOKUP_OK (0)
+#define VARPD_LOOKUP_DROP (-1)
+typedef int (*varpd_plugin_default_f)(void *, overlay_target_point_t *);
+typedef void (*varpd_plugin_lookup_f)(void *, varpd_query_handle_t *,
+ const overlay_targ_lookup_t *, overlay_target_point_t *);
+
+/*
+ * ARP/NDP and DHCP proxy entry points. The int argument is the query type;
+ * VARPD_QTYPE_ETHERNET is the only type defined.
+ */
+#define VARPD_QTYPE_ETHERNET 0x0
+typedef void (*varpd_plugin_arp_f)(void *, varpd_arp_handle_t *, int,
+ const struct sockaddr *, uint8_t *);
+typedef void (*varpd_plugin_dhcp_f)(void *, varpd_dhcp_handle_t *, int,
+ const overlay_targ_lookup_t *, uint8_t *);
+
+/*
+ * Property entry points. Property information is requested by numeric id,
+ * while getting and setting a property is done by name.
+ */
+typedef int (*varpd_plugin_nprops_f)(void *, uint_t *);
+typedef int (*varpd_plugin_propinfo_f)(void *, const uint_t,
+ varpd_prop_handle_t *);
+typedef int (*varpd_plugin_getprop_f)(void *, const char *, void *, uint32_t *);
+typedef int (*varpd_plugin_setprop_f)(void *, const char *, const void *,
+ const uint32_t);
+
+/*
+ * Persistence entry points: save an instance's state into an nvlist and
+ * restore an instance from one.
+ */
+typedef int (*varpd_plugin_save_f)(void *, nvlist_t *);
+typedef int (*varpd_plugin_restore_f)(nvlist_t *, varpd_provider_handle_t *,
+ overlay_plugin_dest_t, void **);
+
+/*
+ * The operations vector that a plugin supplies at registration time. Each
+ * member other than vpo_callbacks is one of the entry points described in
+ * the block comment at the top of this file.
+ */
+typedef struct varpd_plugin_ops {
+ /* Must be set to zero; reserved for future expansion of the structure */
+ uint_t vpo_callbacks;
+ varpd_plugin_create_f vpo_create;
+ varpd_plugin_start_f vpo_start;
+ varpd_plugin_stop_f vpo_stop;
+ varpd_plugin_destroy_f vpo_destroy;
+ /* Should be NULL for plugins of type OVERLAY_TARGET_DYNAMIC */
+ varpd_plugin_default_f vpo_default;
+ varpd_plugin_lookup_f vpo_lookup;
+ varpd_plugin_nprops_f vpo_nprops;
+ varpd_plugin_propinfo_f vpo_propinfo;
+ varpd_plugin_getprop_f vpo_getprop;
+ varpd_plugin_setprop_f vpo_setprop;
+ varpd_plugin_save_f vpo_save;
+ varpd_plugin_restore_f vpo_restore;
+ /* Optional entry points for OVERLAY_TARGET_DYNAMIC plugins */
+ varpd_plugin_arp_f vpo_arp;
+ varpd_plugin_dhcp_f vpo_dhcp;
+} varpd_plugin_ops_t;
+
+/*
+ * Registration information that a plugin fills in and passes to
+ * libvarpd_plugin_register().
+ */
+typedef struct varpd_plugin_register {
+ /* Interface version; plugins should use VARPD_CURRENT_VERSION */
+ uint_t vpr_version;
+ /* One of OVERLAY_TARGET_POINT and OVERLAY_TARGET_DYNAMIC */
+ uint_t vpr_mode;
+ /* Name of the plugin; must be unique across plugins */
+ const char *vpr_name;
+ /* The plugin's operations vector */
+ const varpd_plugin_ops_t *vpr_ops;
+} varpd_plugin_register_t;
+
+/*
+ * Plugin registration: allocate a registration structure for a given
+ * interface version, free it, and register it.
+ */
+extern varpd_plugin_register_t *libvarpd_plugin_alloc(uint_t, int *);
+extern void libvarpd_plugin_free(varpd_plugin_register_t *);
+extern int libvarpd_plugin_register(varpd_plugin_register_t *);
+
+/*
+ * Blowing up and logging
+ */
+extern void libvarpd_panic(const char *, ...) __NORETURN;
+
+/*
+ * Misc. Information APIs
+ */
+extern uint64_t libvarpd_plugin_vnetid(varpd_provider_handle_t *);
+
+/*
+ * Replying to lookup queries and proxying. The int argument of the reply
+ * functions is VARPD_LOOKUP_OK or VARPD_LOOKUP_DROP.
+ */
+extern void libvarpd_plugin_query_reply(varpd_query_handle_t *, int);
+
+extern void libvarpd_plugin_proxy_arp(varpd_provider_handle_t *,
+ varpd_query_handle_t *, const overlay_targ_lookup_t *);
+extern void libvarpd_plugin_proxy_ndp(varpd_provider_handle_t *,
+ varpd_query_handle_t *, const overlay_targ_lookup_t *);
+extern void libvarpd_plugin_arp_reply(varpd_arp_handle_t *, int);
+
+extern void libvarpd_plugin_proxy_dhcp(varpd_provider_handle_t *,
+ varpd_query_handle_t *, const overlay_targ_lookup_t *);
+extern void libvarpd_plugin_dhcp_reply(varpd_dhcp_handle_t *, int);
+
+
+/*
+ * Property information callbacks, used to describe a property through the
+ * handle given to a plugin's varpd_plugin_propinfo_f entry point.
+ */
+extern void libvarpd_prop_set_name(varpd_prop_handle_t *, const char *);
+extern void libvarpd_prop_set_prot(varpd_prop_handle_t *, overlay_prop_prot_t);
+extern void libvarpd_prop_set_type(varpd_prop_handle_t *, overlay_prop_type_t);
+extern int libvarpd_prop_set_default(varpd_prop_handle_t *, void *, ssize_t);
+extern void libvarpd_prop_set_nodefault(varpd_prop_handle_t *);
+extern void libvarpd_prop_set_range_uint32(varpd_prop_handle_t *, uint32_t,
+ uint32_t);
+extern void libvarpd_prop_set_range_str(varpd_prop_handle_t *, const char *);
+
+/*
+ * Various injecting and invalidation routines
+ */
+extern void libvarpd_inject_varp(varpd_provider_handle_t *, const uint8_t *,
+ const overlay_target_point_t *);
+extern void libvarpd_inject_arp(varpd_provider_handle_t *, const uint16_t,
+ const uint8_t *, const struct in_addr *, const uint8_t *);
+extern void libvarpd_fma_degrade(varpd_provider_handle_t *, const char *);
+extern void libvarpd_fma_restore(varpd_provider_handle_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBVARPD_PROVIDER_H */
diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_util.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_util.c
new file mode 100644
index 0000000000..92e50b5f1b
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_util.c
@@ -0,0 +1,91 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <libvarpd_impl.h>
+#include <assert.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Return the ISA-specific directory suffix for the instruction set this
+ * library was compiled for: "64" when built for amd64 and the empty string
+ * when built for i386. Any other ISA is a compile-time error.
+ */
+const char *
+libvarpd_isaext(void)
+{
+	const char *isa;
+
+#if defined(__amd64)
+	isa = "64";
+#elif defined(__i386)
+	isa = "";
+#else
+#error "unknown ISA"
+#endif
+
+	return (isa);
+}
+
+/*
+ * Walk the ISA-specific subdirectory of 'path' (path joined with the string
+ * returned by libvarpd_isaext()) and call 'func' for every directory entry
+ * whose name ends in 'suffix' and is strictly longer than it.
+ *
+ * func is called with vip, the full path of the matching entry, and arg. The
+ * path string is freed as soon as func returns, so func must copy it if it
+ * needs to hold on to it. A non-zero return from func stops the walk early;
+ * that is not reported as an error and 0 is still returned.
+ *
+ * Returns 0 on success (including an early stop), or an errno value if the
+ * directory path could not be built, the directory could not be opened,
+ * reading it failed, or an entry's path could not be built.
+ */
+int
+libvarpd_dirwalk(varpd_impl_t *vip, const char *path, const char *suffix,
+    libvarpd_dirwalk_f func, void *arg)
+{
+	int ret;
+	size_t slen;
+	char *dirpath, *filepath;
+	DIR *dirp;
+	struct dirent *dp;
+
+	assert(vip != NULL && path != NULL);
+
+	if (asprintf(&dirpath, "%s/%s", path, libvarpd_isaext()) == -1)
+		return (errno);
+
+	if ((dirp = opendir(dirpath)) == NULL) {
+		/* Save errno first: free() is allowed to change it. */
+		ret = errno;
+		/* Don't leak the path we just built when the open fails. */
+		free(dirpath);
+		return (ret);
+	}
+
+	slen = strlen(suffix);
+	for (;;) {
+		size_t len;
+
+		/*
+		 * readdir() returns NULL both at end-of-directory and on
+		 * error; clearing errno first lets us tell the two apart.
+		 */
+		errno = 0;
+		dp = readdir(dirp);
+		if (dp == NULL) {
+			ret = errno;
+			break;
+		}
+
+		/* Skip names that are not longer than the suffix itself. */
+		len = strlen(dp->d_name);
+		if (len <= slen)
+			continue;
+
+		if (strcmp(suffix, dp->d_name + (len - slen)) != 0)
+			continue;
+
+		if (asprintf(&filepath, "%s/%s", dirpath, dp->d_name) == -1) {
+			ret = errno;
+			break;
+		}
+
+		/* A non-zero return asks us to stop; it is not an error. */
+		if (func(vip, filepath, arg) != 0) {
+			free(filepath);
+			ret = 0;
+			break;
+		}
+
+		free(filepath);
+	}
+
+	(void) closedir(dirp);
+	free(dirpath);
+	return (ret);
+}
diff --git a/usr/src/lib/varpd/libvarpd/common/mapfile-plugin b/usr/src/lib/varpd/libvarpd/common/mapfile-plugin
new file mode 100644
index 0000000000..8cef7f669f
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/mapfile-plugin
@@ -0,0 +1,57 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+#
+# Symbols that plugins reference but do not define themselves. They are marked
+# EXTERN so that the link-editor accepts the references when building a
+# plugin. The names here must match the prototypes in libvarpd_provider.h and
+# the exports in libvarpd's mapfile-vers.
+#
+SYMBOL_SCOPE {
+    global:
+	libvarpd_fma_degrade		{ FLAGS = EXTERN };
+	libvarpd_inject_arp		{ FLAGS = EXTERN };
+	libvarpd_inject_ndp		{ FLAGS = EXTERN };
+	libvarpd_inject_varp		{ FLAGS = EXTERN };
+	libvarpd_fma_restore		{ FLAGS = EXTERN };
+	libvarpd_panic			{ FLAGS = EXTERN };
+	libvarpd_plugin_alloc		{ FLAGS = EXTERN };
+	libvarpd_plugin_arp_reply	{ FLAGS = EXTERN };
+	libvarpd_plugin_dhcp_reply	{ FLAGS = EXTERN };
+	libvarpd_plugin_free		{ FLAGS = EXTERN };
+	libvarpd_plugin_proxy_arp	{ FLAGS = EXTERN };
+	libvarpd_plugin_proxy_dhcp	{ FLAGS = EXTERN };
+	libvarpd_plugin_proxy_ndp	{ FLAGS = EXTERN };
+	libvarpd_plugin_query_reply	{ FLAGS = EXTERN };
+	libvarpd_plugin_register	{ FLAGS = EXTERN };
+	libvarpd_plugin_vnetid		{ FLAGS = EXTERN };
+	libvarpd_prop_set_name		{ FLAGS = EXTERN };
+	libvarpd_prop_set_prot		{ FLAGS = EXTERN };
+	libvarpd_prop_set_type		{ FLAGS = EXTERN };
+	libvarpd_prop_set_default	{ FLAGS = EXTERN };
+	libvarpd_prop_set_nodefault	{ FLAGS = EXTERN };
+	libvarpd_prop_set_range_uint32	{ FLAGS = EXTERN };
+	libvarpd_prop_set_range_str	{ FLAGS = EXTERN };
+};
diff --git a/usr/src/lib/varpd/libvarpd/common/mapfile-vers b/usr/src/lib/varpd/libvarpd/common/mapfile-vers
new file mode 100644
index 0000000000..7aa930cb54
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/common/mapfile-vers
@@ -0,0 +1,113 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+#
+# Everything libvarpd exports is versioned SUNWprivate. The entries below are
+# grouped by name prefix; anything not listed is kept local to the library.
+#
+SYMBOL_VERSION SUNWprivate {
+ global:
+# libvarpd_c_* routines
+ libvarpd_c_create;
+ libvarpd_c_destroy;
+ libvarpd_c_instance_activate;
+ libvarpd_c_instance_create;
+ libvarpd_c_instance_destroy;
+ libvarpd_c_prop_nprops;
+ libvarpd_c_prop_handle_alloc;
+ libvarpd_c_prop_handle_free;
+ libvarpd_c_prop_info_fill;
+ libvarpd_c_prop_info_fill_by_name;
+ libvarpd_c_prop_info;
+ libvarpd_c_prop_get;
+ libvarpd_c_prop_set;
+
+ libvarpd_c_instance_lookup;
+ libvarpd_c_instance_target_mode;
+ libvarpd_c_instance_cache_flush;
+ libvarpd_c_instance_cache_delete;
+ libvarpd_c_instance_cache_get;
+ libvarpd_c_instance_cache_set;
+ libvarpd_c_instance_cache_walk;
+
+# Library handle creation and teardown
+ libvarpd_create;
+ libvarpd_destroy;
+
+# Door server
+ libvarpd_door_server_create;
+ libvarpd_door_server_destroy;
+
+# FMA state and injection routines (prototyped in libvarpd_provider.h)
+ libvarpd_fma_degrade;
+ libvarpd_fma_restore;
+
+ libvarpd_inject_varp;
+ libvarpd_inject_arp;
+
+# Instance management
+ libvarpd_instance_activate;
+ libvarpd_instance_create;
+ libvarpd_instance_destroy;
+ libvarpd_instance_lookup;
+ libvarpd_instance_id;
+
+ libvarpd_panic;
+
+# Persistence
+ libvarpd_persist_disable;
+ libvarpd_persist_enable;
+ libvarpd_persist_restore;
+
+# Plugin routines
+ libvarpd_plugin_alloc;
+ libvarpd_plugin_load;
+ libvarpd_plugin_free;
+ libvarpd_plugin_arp_reply;
+ libvarpd_plugin_dhcp_reply;
+ libvarpd_plugin_query_reply;
+ libvarpd_plugin_proxy_arp;
+ libvarpd_plugin_proxy_dhcp;
+ libvarpd_plugin_proxy_ndp;
+ libvarpd_plugin_register;
+ libvarpd_plugin_walk;
+ libvarpd_plugin_vnetid;
+
+# Property information callbacks (prototyped in libvarpd_provider.h)
+ libvarpd_prop_set_default;
+ libvarpd_prop_set_nodefault;
+ libvarpd_prop_set_name;
+ libvarpd_prop_set_prot;
+ libvarpd_prop_set_range_uint32;
+ libvarpd_prop_set_range_str;
+ libvarpd_prop_set_type;
+
+# Property handle and value access
+ libvarpd_prop_handle_alloc;
+ libvarpd_prop_handle_free;
+ libvarpd_prop_nprops;
+ libvarpd_prop_info_fill;
+ libvarpd_prop_info;
+ libvarpd_prop_get;
+ libvarpd_prop_set;
+
+# Overlay lookup processing
+ libvarpd_overlay_lookup_quiesce;
+ libvarpd_overlay_lookup_run;
+# Every symbol not named above is reduced to local scope.
+ local:
+ *;
+};
diff --git a/usr/src/lib/varpd/libvarpd/i386/Makefile b/usr/src/lib/varpd/libvarpd/i386/Makefile
new file mode 100644
index 0000000000..4398507523
--- /dev/null
+++ b/usr/src/lib/varpd/libvarpd/i386/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+# Pull in the makefile shared with the other per-ISA directories.
+include ../Makefile.com
+
+# install depends on all plus the $(ROOTLIBS) and $(ROOTLINKS) targets.
+# NOTE(review): those variables are not defined in this file -- presumably
+# they come from the included makefiles; confirm.
+install: all $(ROOTLIBS) $(ROOTLINKS)
diff --git a/usr/src/man/man1m/dladm.1m b/usr/src/man/man1m/dladm.1m
index 6e03105132..e76b8998c7 100644
--- a/usr/src/man/man1m/dladm.1m
+++ b/usr/src/man/man1m/dladm.1m
@@ -178,6 +178,14 @@ dladm \- administer data links
.LP
.nf
+\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR \fB-v\fR \fIvnetid\fR [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR
+\fBdladm delete-overlay\fR \fIoverlay\fR
+\fBdladm modify-overlay\fR \fB-d\fR \fImac\fR | \fB-f\fR | \fB-s\fR \fImac=ip:port\fR \fIoverlay\fR
+\fBdladm show-overlay\fR [ \fB-f\fR | \fB-t\fR ] [[\fB-p\fR] \fB-o\fR \fIfield\fR[,...]] [\fIoverlay\fR]
+.fi
+
+.LP
+.nf
\fBdladm show-usage\fR [\fB-a\fR] \fB-f\fR \fIfilename\fR [\fB-p\fR \fIplotfile\fR \fB-F\fR \fIformat\fR] [\fB-s\fR \fItime\fR]
[\fB-e\fR \fItime\fR] [\fIlink\fR]
.fi
@@ -264,9 +272,9 @@ A WiFi datalink.
.ad
.sp .6
.RS 4n
-A virtual network interface created on a link or an \fBetherstub\fR. It is a
-pseudo device that can be treated as if it were an network interface card on a
-machine.
+A virtual network interface created on a link, an \fBetherstub\fR, or \fBan
+overlay\fR. It is a pseudo device that can be treated as if it were a network
+interface card on a machine.
.RE
.sp
@@ -334,6 +342,20 @@ use any alphanumeric characters, as well as underscore (\fB_\fR), period
characters.
.RE
+.sp
+.ne 2
+.na
+.B overlay
+.ad
+.sp .6
+.RS 4n
+An overlay instance, identified by an administratively-chosen name. An overlay
+can be used to create or join an existing software defined network.
+VNICs created on an overlay will appear to be connected by a local virtual
+switch and will also be connected to interfaces on matching overlays provided by
+other hosts. For more information on overlay devices, see \fBoverlay\fR(5).
+.RE
+
.SS "Options"
Each \fBdladm\fR subcommand has its own set of options. However, many of the
subcommands have the following as a common option:
@@ -4370,6 +4392,348 @@ The tunnel destination address.
.sp
.ne 2
.na
+\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR
+\fB-v\fR \fIvnetid\fR [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR
+.ad
+.sp .6
+.RS 4n
+Create an overlay device named \fIoverlay\fR.
+.sp
+Overlay devices are similar to etherstubs. VNICs can be created on top
+of them. However, unlike an etherstub which is local to the system, an
+overlay device can be configured to communicate to remote hosts,
+providing a means for network virtualization. The way in which it does
+this is described by the encapsulation module and the search plugin. For
+more information on these, see \fBoverlay\fR(5).
+.sp
+An overlay device has a series of required and optional properties. These
+properties vary based upon the search and encapsulation modules and are fully
+specified in \fBoverlay\fR(5). Not every property needs to be specified - some
+have default values which will be used if no value is specified. For
+example, the default port for VXLAN comes from its IANA standard. If a
+required property is missing, the command will fail and inform you of the
+missing properties.
+.sp
+.ne 2
+.na
+\fB\fB-t\fR, \fB--temporary\fR\fR
+.ad
+.sp .6
+.RS 4n
+Specifies that the overlay is temporary. Temporary overlays last until
+the next reboot.
+.RE
+
+.sp
+.ne 2
+.na
+\fB-e\fR \fIencap\fR, \fB--encap\fR=\fIencap\fR
+.ad
+.sp .6
+.RS 4n
+Use \fIencap\fR as the encapsulation plugin for the overlay device
+\fIoverlay\fR. The encapsulation plugin determines how packets are transformed
+before being put on the wire.
+.RE
+
+.sp
+.ne 2
+.na
+\fB-s\fR \fIsearch\fR, \fB--search\fR=\fIsearch\fR
+.ad
+.sp .6
+.RS 4n
+Use \fIsearch\fR as the search plugin for \fIoverlay\fR. The search plugin
+determines how non-local targets are found and where packets are directed to.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-p\fR \fIprop\fR=\fIvalue\fR,..., \fB--prop\fR
+\fIprop\fR=\fIvalue\fR,...\fR
+.ad
+.sp .6
+.RS 4n
+A comma-separated list of properties to set to the specified values.
+.RE
+
+.sp
+.ne 2
+.na
+\fB-v\fR \fIvnetid\fR, \fB--vnetid\fR=\fIvnetid\fR
+.ad
+.sp .6
+.RS 4n
+Sets the virtual networking identifier to \fIvnetid\fR. A virtual network
+identifier is similar to a VLAN identifier, in that it identifies a
+unique virtual network. All overlay devices on the system share the same space
+for the virtual network identifier. However, the valid range of identifiers is
+determined by the encapsulation plugin specified by \fB-e\fR.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.na
+\fBdladm delete-overlay\fR \fIoverlay\fR
+.ad
+.sp .6
+.RS 4n
+Delete the specified overlay. This will fail if there are VNICs on top of the
+device.
+.RE
+
+.sp
+.ne 2
+.na
+\fBdladm modify-overlay\fR \fB-d\fR \fImac\fR | \fB-f\fR | \fB-s\fR \fImac=ip:port\fR \fIoverlay\fR
+.ad
+.sp .6
+.RS 4n
+Modifies the target tables for the specified overlay.
+.sp
+The different options allow for different ways of modifying the target table.
+One of \fB-d\fR, \fB-f\fR, and \fB-s\fR is required. This is not applicable for
+all kinds of overlay devices. For more information, see \fBoverlay\fR(5).
+.sp
+.ne 2
+.na
+\fB-d\fR \fImac\fR, \fB--delete-entry\fR=\fImac\fR
+.ad
+.sp .6
+.RS 4n
+Deletes the entry for \fImac\fR from the target table for \fIoverlay\fR. Note,
+if a lookup is pending or outstanding, this does not cancel it or stop it from
+updating the value.
+.RE
+
+.sp
+.ne 2
+.na
+\fB-f\fR, \fB--flush-table\fR
+.ad
+.sp .6
+.RS 4n
+Flushes all values in the target table for \fIoverlay\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB-s\fR \fImac\fR=\fIvalue\fR, \fB--set-entry\fR=\fImac\fR=\fIvalue\fR
+.ad
+.sp .6
+.RS 4n
+Sets the value of \fIoverlay\fR's target table entry for \fImac\fR to the
+specified value. The specified value varies upon the encapsulation plugin. The
+value may be a combination of a MAC address, IP address, and port. Generally,
+this looks like [\fImac\fR,][\fIIP\fR:][\fIport\fR]. If a component is the last
+one, then there is no need for a separator. For example, if just the MAC
+address or IP is needed, it would look like \fImac\fR or \fIIP\fR, respectively.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.na
+\fBdladm show-overlay\fR [ \fB-f\fR | \fB-t\fR ] [[\fB-p\fR] \fB-o\fR \fIfield\fR[,...]] [\fIoverlay\fR]
+.ad
+.sp .6
+.RS 4n
+Shows overlay configuration (the default), internal target tables (\fB-t\fR), or
+the FMA state (\fB-f\fR), either for all overlays or the specified overlay.
+.sp
+By default (with neither \fB-f\fR nor \fB-t\fR specified), the following fields
+will be displayed:
+.sp
+.ne 2
+.na
+\fB\fBLINK\fR\fR
+.ad
+.sp .6
+.RS 4n
+The name of the overlay.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBPROPERTY\fR\fR
+.ad
+.sp .6
+.RS 4n
+The name of the property.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBPERM\fR\fR
+.ad
+.sp .6
+.RS 4n
+The read/write permissions of the property. The value shown is one of \fBr-\fR
+or \fBrw\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBVALUE\fR\fR
+.ad
+.sp .6
+.RS 4n
+The current property value. If the value is not set, it is shown as \fB--\fR.
+If it is unknown, the value is shown as \fB?\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBDEFAULT\fR\fR
+.ad
+.sp .6
+.RS 4n
+The default value of the property. If the property has no default value,
+\fB--\fR is shown.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBPOSSIBLE\fR\fR
+.ad
+.sp .6
+.RS 4n
+A comma-separated list of the values the property can have. If the values span
+a numeric range, \fImin\fR - \fImax\fR might be shown as shorthand. If the
+possible values are unknown or unbounded, \fB--\fR is shown.
+.RE
+
+.sp
+When the \fB-f\fR option is specified, the following fields will be displayed:
+.sp
+.ne 2
+.na
+\fB\fBLINK\fR\fR
+.ad
+.sp .6
+.RS 4n
+The name of the overlay.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBSTATUS\fR\fR
+.ad
+.sp .6
+.RS 4n
+Either \fBONLINE\fR or \fBDEGRADED\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBDETAILS\fR\fR
+.ad
+.sp .6
+.RS 4n
+When the \fBoverlay\fR's status is \fBONLINE\fR, then this has the value
+\fB--\fR. Otherwise, when it is \fBDEGRADED\fR, this field provides a more
+detailed explanation as to why it's degraded.
+.RE
+
+.sp
+When the \fB-t\fR option is specified, the following fields will be displayed:
+.sp
+.ne 2
+.na
+\fB\fBLINK\fR\fR
+.ad
+.sp .6
+.RS 4n
+The name of the overlay.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBTARGET\fR\fR
+.ad
+.sp .6
+.RS 4n
+The target MAC address of a table entry.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBDESTINATION\fR\fR
+.ad
+.sp .6
+.RS 4n
+The address that an encapsulated packet will be sent to when a packet has the
+address specified by \fBTARGET\fR.
+.RE
+
+The \fBshow-overlay\fR command supports the following options:
+
+.sp
+.ne 2
+.na
+\fB-f\fR, \fB--fma\fR
+.ad
+.sp .6
+.RS 4n
+Displays information about an overlay device's FMA state. For more
+information, see \fBoverlay\fR(5).
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-o\fR \fIfield\fR[,...], \fB--output\fR=\fIfield\fR\fR
+.ad
+.sp .6
+.RS 4n
+A case-insensitive, comma-separated list of output fields to display. The field
+name must be one of the fields listed above, or the special value \fBall\fR, to
+display all fields. The fields applicable to the \fB-o\fR option are limited to
+those listed under each output mode. For example, if using \fB-t\fR, only the
+fields listed under \fB-t\fR, above, can be used with \fB-o\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-p\fR, \fB--parsable\fR\fR
+.ad
+.sp .6
+.RS 4n
+Display using a stable machine-parsable format. The \fB-o\fR option is
+required with \fB-p\fR. See "Parsable Output Format", below.
+.RE
+
+.sp
+.ne 2
+.na
+\fB-t\fR, \fB--target\fR
+.ad
+.sp .6
+.RS 4n
+Displays information about an overlay device's target table. For more
+information on the target table, see \fBoverlay\fR(5).
+.RE
+
+.RE
+
+.sp
+.ne 2
+.na
\fB\fBdladm show-usage\fR [\fB-a\fR] \fB-f\fR \fIfilename\fR [\fB-p\fR
\fIplotfile\fR \fB-F\fR \fIformat\fR] [\fB-s\fR \fItime\fR] [\fB-e\fR
\fItime\fR] [\fIlink\fR]\fR
@@ -5606,7 +5970,7 @@ Interface Stability Committed
.SH SEE ALSO
\fBacctadm\fR(1M), \fBautopush\fR(1M), \fBifconfig\fR(1M), \fBipsecconf\fR(1M),
\fBndd\fR(1M), \fBpsrset\fR(1M), \fBwpad\fR(1M), \fBzonecfg\fR(1M),
-\fBattributes\fR(5), \fBieee802.3\fR(5), \fBdlpi\fR(7P)
+\fBattributes\fR(5), \fBieee802.3\fR(5), \fBoverlay\fR(5), \fBdlpi\fR(7P)
.SH NOTES
The preferred method of referring to an aggregation in the aggregation
subcommands is by its link name. Referring to an aggregation by its integer
diff --git a/usr/src/man/man4/Makefile b/usr/src/man/man4/Makefile
index 40c7f78d41..941757d4f3 100644
--- a/usr/src/man/man4/Makefile
+++ b/usr/src/man/man4/Makefile
@@ -133,6 +133,7 @@ _MANFILES= Intro.4 \
nsmbrc.4 \
nss.4 \
nsswitch.conf.4 \
+ overlay_files.4 \
packingrules.4 \
pam.conf.4 \
passwd.4 \
diff --git a/usr/src/man/man4/overlay_files.4 b/usr/src/man/man4/overlay_files.4
new file mode 100644
index 0000000000..b9e5387871
--- /dev/null
+++ b/usr/src/man/man4/overlay_files.4
@@ -0,0 +1,187 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2015, Joyent, Inc.
+.\"
+.Dd Apr 13, 2015
+.Dt OVERLAY_FILES 4
+.Os
+.Sh NAME
+.Nm overlay_files
+.Nd Overlay files plugin file format
+.Sh DESCRIPTION
+The
+.Sy files
+plugin provides a means for a dynamic overlay where the destinations are
+determined based on a static description contained in a
+.Sy JSON
+file.
+This manual describes the format of the file used by the
+.Sy files/config
+property.
+To create and manage overlays with the
+.Sy files
+plugin, use
+.Xr dladm 1M .
+For more information on overlays, see
+.Xr overlay 5 .
+.Pp
+Using the
+.Sy files
+module, a static and simple overlay network can be created.
+This network does not support the use of
+.Em broadcast
+or
+.Em multicast
+traffic.
+Both ARP and NDP traffic are proxied by the plugin itself.
+In addition, the plugin allows for DHCP.
+Instead of providing a traditional DHCP proxy, when an initial DHCP broadcast
+goes out to a broadcast address, it will get rewritten to target a specific MAC
+address.
+The
+.Sy files
+plugin is useful as proof of concept and for simple static networks
+where addresses do not need to be reconfigured.
+If more advanced topologies or more streamlined updates are required, consider
+a different plugin.
+.Pp
+The file format is encoded as a series of
+.Sy JSON
+objects.
+Each object has a key, which is a MAC address on the
+.Sy overlay
+network.
+It has multiple values, some required, some optional, which describe various
+properties.
+The valid properties are:
+.Bl -hang -width Ds
+.It Sy ip
+.Bd -filled -compact
+The
+.Sy ip
+key indicates the IP address on the
+.Sy underlay
+network that houses the MAC address in question.
+Packets directed for the MAC address will be encapsulated and sent to this
+address.
+This field is required.
+.Pp
+The value is a
+.Em JSON String .
+Both IPv4 and IPv6 addresses are supported and should be written out in their
+traditional forms.
+Follow the guidelines for writing addresses in
+.Xr inet_aton 3SOCKET .
+.Ed
+.It Sy port
+.Bd -filled -compact
+The
+.Sy port
+key indicates the port on the
+.Sy underlay
+network that houses the MAC address in question.
+This property is required if the encapsulation module requires a port for its
+destination.
+The value is a
+.Em JSON Number .
+.Ed
+.It Sy arp
+.Bd -filled -compact
+The
+.Sy arp
+key stores the IPv4 address that corresponds to this MAC address on the
+.Sy overlay
+network.
+This will be used to respond to ARP queries that would traditionally have been
+received by the OS kernel.
+If this address is not present, no IPv4 packets directed to this IP address will
+be received by the network interface that has this MAC address, regardless of
+what is configured on top of it.
+.Pp
+The value is a
+.Em JSON String
+and should be written out following the guidelines for IPv4 addresses in
+.Xr inet_aton 3SOCKET .
+.Ed
+.It Sy ndp
+.Bd -filled -compact
+The
+.Sy ndp
+key stores the IPv6 address that corresponds to this MAC address on the
+.Sy overlay
+network.
+This will be used to respond to NDP queries that would traditionally have been
+received by the OS kernel.
+If this address is not present, no IPv6 packets directed to this IP address will
+be received by the network interface that has this MAC address, regardless of
+what is configured on top of it.
+.Pp
+The value is a
+.Em JSON String
+and should be written out following the guidelines for IPv6 addresses in
+.Xr inet_pton 3SOCKET .
+.Ed
+.It Sy dhcp-proxy
+.Bd -filled -compact
+The
+.Sy dhcp-proxy
+key stores a MAC address that DHCP messages directed to a broadcast address get
+rewritten to be sent to.
+This can be viewed as a form of proxy DHCP, but is different in mechanism from a
+traditional proxy.
+The value is a
+.Em JSON String
+and should be written as a traditional MAC address string as described by
+.Xr ether_aton 3SOCKET .
+.Ed
+.El
+.Sh EXAMPLES
+.Sy Example 1
+Sample configuration file
+.Pp
+This configuration file provides information for three different MAC
+addresses.
+Each MAC address has an entry which describes what its IPv4
+and IPv6 addresses are, as well as the IP address and port of the host on
+the underlay network.
+Finally, one host has a DHCP proxy entry to demonstrate how one might
+configure DHCP.
+.Bd -literal -offset indent
+{
+ "de:ad:be:ef:00:00": {
+ "arp": "10.55.55.2",
+ "ip": "10.88.88.69",
+ "ndp": "fe80::3",
+ "port": 4789
+ },
+ "de:ad:be:ef:00:01": {
+ "arp": "10.55.55.3",
+ "dhcp-proxy": "de:ad:be:ef:00:00",
+ "ip": "10.88.88.70",
+ "ndp": "fe80::4",
+ "port": 4789
+ },
+ "de:ad:be:ef:00:02": {
+ "arp": "10.55.55.4",
+ "ip": "10.88.88.71",
+ "ndp": "fe80::5",
+ "port": 4789
+ }
+}
+.Ed
+.Sh STABILITY
+This file format is
+.Sy committed ;
+however, keys that are not listed here are reserved for future use.
+.Sh SEE ALSO
+.Xr dladm 1M ,
+.Xr overlay 5
diff --git a/usr/src/man/man5/Makefile b/usr/src/man/man5/Makefile
index 9eb12d0164..ea0520872a 100644
--- a/usr/src/man/man5/Makefile
+++ b/usr/src/man/man5/Makefile
@@ -83,6 +83,7 @@ _MANFILES= Intro.5 \
ms.5 \
mutex.5 \
nfssec.5 \
+ overlay.5 \
pam_allow.5 \
pam_authtok_check.5 \
pam_authtok_get.5 \
diff --git a/usr/src/man/man5/overlay.5 b/usr/src/man/man5/overlay.5
new file mode 100644
index 0000000000..41d1b18739
--- /dev/null
+++ b/usr/src/man/man5/overlay.5
@@ -0,0 +1,521 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2015 Joyent, Inc.
+.\"
+.Dd Apr 09, 2015
+.Dt OVERLAY 5
+.Os
+.Sh NAME
+.Nm overlay
+.Nd Overlay Devices
+.Sh DESCRIPTION
+Overlay devices are GLDv3 devices that allow users to create overlay
+networks that can be used to form the basis of network virtualization
+and software defined networking.
+Overlay networks allow a single physical network, often called an
+.Sy underlay
+network, to provide the means for creating multiple logical, isolated,
+and discrete layer two and layer three networks on top of it.
+.Pp
+Overlay devices are administered through
+.Xr dladm 1M .
+Overlay devices themselves cannot be plumbed up with
+.Sy IP ,
+.Sy vnd ,
+or any other protocol.
+Instead, like an
+.Sy etherstub ,
+they allow for VNICs to be created on top of them.
+Like an
+.Sy etherstub ,
+an overlay device acts as a local switch; however, when it encounters a
+non-local destination address, it instead looks up where it should send
+the packet, encapsulates it, and sends it out another interface in the
+system.
+.Pp
+A single overlay device encapsulates the logic to answer two different,
+but related, questions:
+.Pp
+.Bl -enum -offset indent -compact
+.It
+How should a packet be transformed and put on the wire?
+.It
+Where should a transformed packet be sent?
+.El
+.Pp
+Each of these questions is answered by a plugin.
+The first question is answered by what's called an
+.Em encapsulation plugin .
+The second question is answered by what's called a
+.Em search plugin .
+Packets are encapsulated and decapsulated using the encapsulation plugin
+by the kernel.
+The search plugins are all user land plugins that are consumed by the
+varpd service whose FMRI is
+.Em svc:/network/varpd:default .
+This separation allows for the kernel to be responsible for the data
+path, while having the search plugins in userland allows the system to
+provide a much more expressive interface.
+.Ss Overlay Types
+Overlay devices come in two different flavors, one where all packets are
+always sent to a single address, the other, where the destination of a
+packet varies based on the target MAC address of the packet.
+This information is maintained in a
+.Em target table ,
+which is independent and unique to each overlay device.
+We call the plugins that send traffic to a single location, for example
+a single unicast or multicast IP address, a
+.Sy point to point
+overlay and the overlay devices that can send traffic to different
+locations based on the MAC address of that packet a
+.Sy dynamic
+overlay.
+The plugin type is determined based on the type of the
+.Sy search plugin .
+These are all fully listed in the section
+.Sx PLUGINS AND PROPERTIES .
+.Ss Overlay Destination
+Both encapsulation and search plugins define the kinds of destinations
+that they know how to support.
+An encapsulation plugin always has a single destination type that's
+determined based on how the encapsulation is defined.
+A search plugin, on the other hand, can support multiple combinations of
+destinations.
+A search plugin must support the destination type of the encapsulation
+device.
+The destination may require any of the following three pieces of
+information, depending on the encapsulation plugin:
+.Bl -hang -width Ds
+.It Sy MAC Address
+.Bd -filled -compact
+An Ethernet MAC address is required to determine the destination.
+.Ed
+.It Sy IP Address
+.Bd -filled -compact
+An IP address is required.
+Both IPv4 and IPv6 addresses are supported.
+.Ed
+.It Sy Port
+.Bd -filled -compact
+An IP protocol level (TCP, UDP, SCTP, etc.) port is required.
+.Ed
+.El
+.Pp
+The list of destination types that are supported by both the search and
+encapsulation plugins is listed in the section
+.Sx PLUGINS AND PROPERTIES .
+.Ss varpd
+The varpd service, mentioned above, is responsible for providing the
+virtual ARP daemon.
+Its responsibility is conceptually similar to ARP.
+It runs all instances of search plugins in the system and is responsible
+for answering the kernel's ARP-like questions for where packets should
+be sent.
+.Pp
+The varpd service, svc:/network/varpd:default, must be enabled for
+overlay devices to function.
+If it is disabled while there are active devices, then most overlay
+devices will not function correctly and likely will end up dropping
+traffic.
+.Sh PLUGINS AND PROPERTIES
+Properties fall into three categories in the system:
+.Bl -enum -offset indent -compact
+.It
+Generic properties all overlay devices have
+.It
+Properties specific to the encapsulation plugin
+.It
+Properties specific to the search plugin
+.El
+.Pp
+Each property in the system has the following attributes, which mirror
+the traditional
+.Xr dladm 1M
+link properties:
+.Bl -hang -width Ds
+.It Sy Name
+.Bd -filled -compact
+The name of a property is namespaced by its module and always structured
+and referred to as module/property.
+This allows for both an encapsulation and search plugin to have a
+property with the same name.
+Properties that are valid for all overlay devices and not specific to a
+module do not generally use a module prefix.
+.Pp
+For example, the property
+.Sy vxlan/listen_ip
+is associated with the
+.Sy vxlan
+encapsulation module.
+.Ed
+.It Sy Type
+.Bd -filled -compact
+Each property in the system has a type.
+.Xr dladm 1M
+takes care of converting between the internal representation and a
+value, but the type influences the acceptable input range.
+The types are:
+.Bl -hang -width Ds
+.It Sy INT
+A signed integer that is up to eight bytes long
+.Pq Sy int64_t .
+.It Sy UINT
+An unsigned integer that is up to eight bytes long
+.Pq Sy uint64_t .
+.It Sy IP
+Either an IPv4 or IPv6 address in traditional string form.
+For example, 192.168.128.23 or 2001:470:8af4::1:1.
+IPv4 addresses may also be encoded as IPv4-mapped IPv6 addresses.
+.It Sy STRING
+A string of ASCII or UTF-8 encoded characters terminated with a
+.Sy NUL
+byte.
+The maximum string length, including the terminator, is currently
+256 bytes.
+.El
+.Ed
+.It Sy Permissions
+.Bd -filled -compact
+Each property has permissions associated with it, which indicate whether
+the system considers them read-only properties or read-write properties.
+A read-only property can never be updated once the device is created.
+This generally includes things like the overlay's encapsulation module.
+.Ed
+.It Sy Required
+.Bd -filled -compact
+This property indicates whether the property is required for the given
+plugin.
+If it is not specified during a call to
+.Sy dladm create-overlay ,
+then the overlay cannot be successfully created.
+Properties which have a
+.Sy default
+will use that value if one is not specified rather than cause the
+overlay creation to fail.
+.Ed
+.It Sy Current Value
+.Bd -filled -compact
+The current value of a property, if the property has a value set.
+Required properties always have a value set.
+.Ed
+.It Sy Default Value
+.Bd -filled -compact
+The default value is an optional part of a given property.
+If a property does define a default value, then it will be used when an
+overlay is created and no other value is given.
+.Ed
+.It Sy Value ranges
+.Bd -filled -compact
+Value ranges are an optional part of a given property.
+They indicate a range or set of values that are valid and may be set for
+a property.
+A property may not declare such a range as it may be impractical or
+unknown.
+For example, most properties based on IP addresses will not
+declare a range.
+.Ed
+.El
+.Pp
+The following sections describe both the modules and the properties that
+exist for each module, noting their name, type, permissions, whether or
+not they are required, and if there is a default value.
+In addition, the effects of each property will be described.
+.Ss Encapsulation Plugins
+.Bl -hang -width Ds
+.It Sy vxlan
+The
+.Sy vxlan
+module is a UDP based encapsulation method.
+It takes a frame that would be put on the wire, wraps it up in a VXLAN
+header and places it in a UDP packet that gets sent out on the
+underlying network.
+For more details about the specific format of the VXLAN header, see
+.Xr vxlan 7P .
+.Pp
+The
+.Sy vxlan
+module requires both an
+.Sy IP address
+and
+.Sy port
+to address it.
+It has a 24-bit virtual network ID space, allowing for
+virtual network identifiers that range from
+.Sy 0
+-
+.Sy 16777215 .
+.Pp
+The
+.Sy vxlan
+module has the following properties:
+.Bl -hang -width Ds
+.It Sy vxlan/listen_ip
+.Bd -filled -compact
+Type:
+.Sy IP |
+Permissions:
+.Sy Read/Write |
+.Sy Required
+.Ed
+.Bd -filled
+The
+.Sy vxlan/listen_ip
+property determines the IP address that the system will accept VXLAN
+encapsulated packets on for this overlay.
+.Ed
+.It Sy vxlan/listen_port
+.Bd -filled -compact
+Type:
+.Sy UINT |
+Permissions:
+.Sy Read/Write |
+.Sy Required
+.Ed
+.Bd -filled -compact
+Default Value:
+.Sy 4789 |
+Range:
+.Sy 0 - 65535
+.Ed
+.Bd -filled
+The
+.Sy vxlan/listen_port
+property determines the UDP port that the system will listen on for
+VXLAN traffic for this overlay.
+The default value is
+.Sy 4789 ,
+the IANA assigned port for VXLAN.
+.Ed
+.El
+.Pp
+The
+.Sy vxlan/listen_ip
+and
+.Sy vxlan/listen_port
+properties determine how the system will accept VXLAN encapsulated
+packets for this interface.
+It does not determine the interface that packets will be sent out over.
+Multiple overlays that all use VXLAN can share the same IP and port
+combination, as the virtual network identifier can be used to tell the
+different overlays apart.
+.El
+.Ss Search Plugins
+Because search plugins may support multiple destinations, they may have
+more properties listed than necessarily show up for a given overlay.
+For example, the
+.Sy direct
+plugin supports destinations that are identified by both an IP address
+and a port, or just an IP address.
+In cases where the device is created over an overlay that only uses an
+IP address for its destination, then it will not have the
+.Sy direct/dest_port
+property.
+.Bl -hang -width Ds
+.It Sy direct
+The
+.Sy direct
+plugin is a point to point module that can be used to create an overlay
+that forwards all non-local traffic to a single destination.
+It supports destinations that are a combination of an
+.Sy IP Address
+and a
+.Sy port .
+.Pp
+The
+.Sy direct
+plugin has the following properties:
+.Bl -hang -width Ds
+.It Sy direct/dest_ip
+.Bd -filled -compact
+Type:
+.Sy IP |
+Permissions:
+.Sy Read/Write |
+.Sy Required
+.Ed
+.Bd -filled
+The
+.Sy direct/dest_ip
+property indicates the IP address that all traffic will be sent to.
+Traffic will be sent out the corresponding interface based on
+traditional IP routing rules and the configuration of the networking
+stack of the global zone.
+.Ed
+.It Sy direct/dest_port
+.Bd -filled -compact
+Type:
+.Sy UINT |
+Permissions:
+.Sy Read/Write |
+.Sy Required
+.Ed
+.Bd -filled -compact
+Default Value:
+.Sy - |
+Range:
+.Sy 0 - 65535
+.Ed
+.Bd -filled
+The
+.Sy direct/dest_port
+property indicates the TCP or UDP port that all traffic will be directed
+to.
+.Ed
+.El
+.It Sy files
+The
+.Sy files
+plugin implements a
+.Sy dynamic
+plugin that specifies where traffic should be sent based on a file.
+It is a glorified version of /etc/ethers.
+The
+.Sy files
+plugin does not support broadcast or multicast traffic, but it has
+support for proxy ARP, NDP, and DHCPv4.
+For the full details of the file format, see
+.Xr overlay_files 4 .
+.Pp
+The
+.Sy files
+plugin has the following property:
+.Bl -hang -width Ds
+.It Sy files/config
+.Bd -filled -compact
+Type:
+.Sy String |
+Permissions:
+.Sy Read/Write |
+.Sy Required
+.Ed
+.Bd -filled
+The
+.Sy files/config
+property specifies an absolute path to a file to read.
+The file is a JSON file that is formatted according to
+.Xr overlay_files 4 .
+.Ed
+.El
+.El
+.Ss General Properties
+Each overlay has the following properties which are used to give
+additional information about the system.
+None of these properties may be specified as part of a
+.Sy dladm create-overlay ,
+instead they come from other arguments or from internal parts of the
+system.
+.Bl -hang -width Ds
+.It Sy encap
+.Bd -filled -compact
+Type:
+.Sy String |
+Permissions:
+.Sy Read Only
+.Ed
+.Bd -filled
+The
+.Sy encap
+property contains the name of the encapsulation module that's in use.
+.Ed
+.It Sy mtu
+.Bd -filled -compact
+Type:
+.Sy UINT |
+Permissions:
+.Sy Read/Write
+.Ed
+.Bd -filled -compact
+Default Value:
+.Sy 1400 |
+Range:
+.Sy 576 - 9000
+.Ed
+.Bd -filled
+The
+.Sy mtu
+property describes the maximum transmission unit of the overlay.
+The default value is
+.Sy 1400
+bytes, which ensures that in a traditional deployment with an MTU of
+1500 bytes, the overhead that is added from encapsulation is all
+accounted for.
+It is the administrator's responsibility to ensure that
+the device's MTU plus the encapsulation overhead does not exceed the MTU of
+the interfaces that the encapsulated traffic will be sent out of.
+.Pp
+To modify the
+.Sy mtu
+property, use
+.Sy dladm set-linkprop .
+.Ed
+.It Sy search
+.Bd -filled -compact
+Type:
+.Sy String |
+Permissions:
+.Sy Read Only
+.Ed
+.Bd -filled
+The
+.Sy search
+property contains the name of the search plugin that's in use.
+.Ed
+.It Sy varpd/id
+.Bd -filled -compact
+Type:
+.Sy String |
+Permissions:
+.Sy Read Only
+.Ed
+.Bd -filled
+The
+.Sy varpd/id
+property indicates the identifier which the
+.Sy varpd
+service uses for this overlay.
+.Ed
+.It Sy vnetid
+.Bd -filled -compact
+Type:
+.Sy UINT |
+Permissions:
+.Sy Read/Write
+.Ed
+.Bd -filled
+The
+.Sy vnetid
+property has the virtual network identifier that belongs to this overlay.
+The valid range for the virtual network identifier depends on the
+encapsulation engine.
+.Ed
+.El
+.Sh FMA INTEGRATION
+Overlay devices are wired into FMA, the illumos fault management
+architecture, and generate error reports depending on the
+.Sy search
+plugin in use.
+Due to limitations in FMA today, when a single overlay
+enters a degraded state, meaning that it cannot properly perform look
+ups or another error occurred, then it degrades the overall
+.Sy overlay
+pseudo-device driver.
+.Pp
+For more fine-grained information about which overlay is actually in a
+.Em degraded
+state, one should run
+.Sy dladm show-overlay -f .
+In addition, for each overlay in a degraded state a more useful
+diagnostic message is provided which describes the reason that caused
+this overlay to enter into a degraded state.
+.Pp
+The overlay driver is self-healing.
+If the problem corrects itself on its own, it will clear the fault on
+the corresponding device.
+.Sh SEE ALSO
+.Xr dladm 1M ,
+.Xr overlay_files 4 ,
+.Xr vxlan 7P
diff --git a/usr/src/man/man7p/Makefile b/usr/src/man/man7p/Makefile
index 13cb58770d..9186b1ac20 100644
--- a/usr/src/man/man7p/Makefile
+++ b/usr/src/man/man7p/Makefile
@@ -16,30 +16,31 @@
include $(SRC)/Makefile.master
-MANSECT= 7p
-
-MANFILES= arp.7p \
- dlpi.7p \
- icmp.7p \
- icmp6.7p \
- if_tcp.7p \
- inet.7p \
- inet6.7p \
- ip.7p \
- ip6.7p \
- ipsec.7p \
- ipsecah.7p \
- ipsecesp.7p \
- ndp.7p \
- pf_key.7p \
- rarp.7p \
- route.7p \
- routing.7p \
- sctp.7p \
- sip.7p \
- slp.7p \
- tcp.7p \
- udp.7p
+MANSECT= 7p
+
+MANFILES= arp.7p \
+ dlpi.7p \
+ icmp.7p \
+ icmp6.7p \
+ if_tcp.7p \
+ inet.7p \
+ inet6.7p \
+ ip.7p \
+ ip6.7p \
+ ipsec.7p \
+ ipsecah.7p \
+ ipsecesp.7p \
+ ndp.7p \
+ pf_key.7p \
+ rarp.7p \
+ route.7p \
+ routing.7p \
+ sctp.7p \
+ sip.7p \
+ slp.7p \
+ tcp.7p \
+ udp.7p \
+ vxlan.7p
MANLINKS= AH.7p \
ARP.7p \
@@ -51,7 +52,8 @@ MANLINKS= AH.7p \
SCTP.7p \
TCP.7p \
UDP.7p \
- if.7p
+ VXLAN.7p \
+ if.7p
ARP.7p := LINKSRC = arp.7p
@@ -67,14 +69,16 @@ ESP.7p := LINKSRC = ipsecesp.7p
NDP.7p := LINKSRC = ndp.7p
-RARP.7p := LINKSRC = rarp.7p
+RARP.7p := LINKSRC = rarp.7p
-SCTP.7p := LINKSRC = sctp.7p
+SCTP.7p := LINKSRC = sctp.7p
TCP.7p := LINKSRC = tcp.7p
UDP.7p := LINKSRC = udp.7p
+VXLAN.7p := LINKSRC = vxlan.7p
+
.KEEP_STATE:
include $(SRC)/man/Makefile.man
diff --git a/usr/src/man/man7p/vxlan.7p b/usr/src/man/man7p/vxlan.7p
new file mode 100644
index 0000000000..43c4756585
--- /dev/null
+++ b/usr/src/man/man7p/vxlan.7p
@@ -0,0 +1,130 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2015 Joyent, Inc.
+.\"
+.Dd Apr 10, 2015
+.Dt VXLAN 7P
+.Os
+.Sh NAME
+.Nm VXLAN ,
+.Nm vxlan
+.Nd Virtual eXtensible Local Area Network
+.Sh SYNOPSIS
+.In sys/vxlan.h
+.Sh DESCRIPTION
+.Nm
+(RFC 7348) is a network encapsulation protocol that is used by
+.Xr overlay 5
+devices.
+A payload, commonly an Ethernet frame, is placed inside of a
+UDP packet and prepended with an 8-byte
+.Nm
+header.
+.Pp
+The
+.Nm
+header contains two 32-bit words.
+The first word is an 8-bit flags field followed by 24 reserved bits.
+The second word is a 24-bit virtual network identifier followed by 8
+reserved bits.
+The virtual network identifier identifies a unique
+.Nm
+and
+is similar in concept to an IEEE 802.1Q VLAN identifier.
+.Pp
+The system provides access to
+.Nm
+through dladm overlays.
+See
+.Xr dladm 1M
+and
+.Xr overlay 5
+for more information.
+.Pp
+The
+.In sys/vxlan.h
+header provides information for working with the
+.Nm
+protocol.
+The contents of this header are
+.Sy uncommitted .
+The header defines a structure that may be used to encode and decode a VXLAN
+header.
+It defines a packed structure type
+.Sy vxlan_hdr_t
+which represents the
+.Nm
+frame header and has the following members:
+.Bd -literal
+ uint32_t vxlan_flags; /* flags in upper 8 bits */
+ uint32_t vxlan_id; /* VXLAN ID in upper 24 bits */
+.Ed
+.Sh EXAMPLES
+.Sy Example 1
+Decoding a
+.Nm
+header
+.Pp
+The following example shows how to validate a
+.Nm
+header.
+For more information on this process, see RFC 7348.
+.Bd -literal -offset indent
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/vxlan.h>
+
+\&...
+
+/*
+ * Validate the following bytes as a VXLAN header. If valid, return
+ * 0 and store the VXLAN identifier in *vidp. Otherwise, return an
+ * error.
+ */
+int
+validate_vxlan(void *buf, int len, uint32_t *vidp)
+{
+ vxlan_hdr_t *hdr;
+
+ if (len < sizeof (vxlan_hdr_t))
+ return (EINVAL);
+
+ hdr = buf;
+ if ((ntohl(hdr->vxlan_flags) & VXLAN_MAGIC) == 0)
+ return (EINVAL);
+
+ *vidp = ntohl(hdr->vxlan_id) >> VXLAN_ID_SHIFT;
+
+ return (0);
+}
+.Ed
+.Sh STABILITY
+The contents of
+.In sys/vxlan.h
+are
+.Sy Uncommitted .
+.Sh SEE ALSO
+.Xr dladm 1M ,
+.Xr overlay 5
+.Rs
+.%A Mahalingam, M.
+.%A Dutt, D.
+.%A Duda, K.
+.%A Agarwal, P.
+.%A Kreeger, L.
+.%A Sridhar, T.
+.%A Bursell, M.
+.%A Wright, C.
+.%T RFC 7348, Virtual eXtensible Local Area Network (VXLAN): A Framework
+.%T for Overlaying Virtualized Layer 2 Networks over Layer 3 Networks
+.%D August 2014
+.Re
diff --git a/usr/src/pkg/manifests/system-network-overlay.p5m b/usr/src/pkg/manifests/system-network-overlay.p5m
new file mode 100644
index 0000000000..8cdbd10775
--- /dev/null
+++ b/usr/src/pkg/manifests/system-network-overlay.p5m
@@ -0,0 +1,62 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
+#
+
+<include global_zone_only_component>
+set name=pkg.fmri value=pkg:/system/network/overlay@$(PKGVERS)
+set name=pkg.summary value="illumos overlay driver"
+set name=pkg.description value="Device driver implementing network overlays"
+set name=info.classification \
+ value=org.opensolaris.category.2008:Drivers/Networking
+set name=variant.arch value=$(ARCH)
+dir path=kernel group=sys
+dir path=kernel/drv group=sys
+dir path=kernel/drv/$(ARCH64) group=sys
+file path=kernel/drv/$(ARCH64)/overlay group=sys
+file path=kernel/drv/overlay.conf group=sys
+dir path=kernel/overlay
+dir path=kernel/overlay/$(ARCH64)
+file path=kernel/overlay/$(ARCH64)/vxlan group=sys mode=0755
+dir path=lib
+file path=lib/$(ARCH64)/libvarpd.so.1 mode=0755 \
+ variant.opensolaris.zone=__NODEFAULT
+file path=lib/libvarpd.so.1 mode=0755 variant.opensolaris.zone=__NODEFAULT
+dir path=lib/svc
+dir path=lib/svc/manifest group=sys
+dir path=lib/svc/manifest/network group=sys
+file path=lib/svc/manifest/network/varpd.xml mode=0444
+dir path=usr/lib
+dir path=usr/lib/$(ARCH64)
+dir path=usr/lib/varpd
+dir path=usr/lib/varpd/$(ARCH64)
+link path=usr/lib/varpd/$(ARCH64)/libvarpd_direct.so target=libvarpd_direct.so.1
+file path=usr/lib/varpd/$(ARCH64)/libvarpd_direct.so.1
+link path=usr/lib/varpd/$(ARCH64)/libvarpd_files.so target=libvarpd_files.so.1
+file path=usr/lib/varpd/$(ARCH64)/libvarpd_files.so.1
+link path=usr/lib/varpd/64 target=$(ARCH64)
+link path=usr/lib/varpd/libvarpd_direct.so target=libvarpd_direct.so.1
+file path=usr/lib/varpd/libvarpd_direct.so.1
+link path=usr/lib/varpd/libvarpd_files.so target=libvarpd_files.so.1
+file path=usr/lib/varpd/libvarpd_files.so.1
+file path=usr/lib/varpd/varpd mode=0555
+dir path=usr/share/man
+dir path=usr/share/man/man4
+file path=usr/share/man/man4/overlay_files.4
+dir path=usr/share/man/man5
+file path=usr/share/man/man5/overlay.5
+dir path=usr/share/man/man7p
+link path=usr/share/man/man7p/VXLAN.7p target=vxlan.7p
+file usr/share/man/man7p/vxlan.7p path=usr/share/man/man7p/vxlan.7p mode=0444
+driver name=overlay
+license lic_CDDL license=lic_CDDL
diff --git a/usr/src/test/util-tests/tests/dladm/Makefile b/usr/src/test/util-tests/tests/dladm/Makefile
index e37ae56072..53fc2ce092 100644
--- a/usr/src/test/util-tests/tests/dladm/Makefile
+++ b/usr/src/test/util-tests/tests/dladm/Makefile
@@ -25,8 +25,6 @@ all:
install: $(ROOTPROG)
-lint:
-
clobber: clean
clean:
diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ
index 1f686f2ca0..bb03cca4e2 100644
--- a/usr/src/uts/Makefile.targ
+++ b/usr/src/uts/Makefile.targ
@@ -177,6 +177,9 @@ $(ROOT_FONT_DIR)/%: $(OBJS_DIR)/% $(ROOT_MOD_DIR) $(ROOT_FONT_DIR) FRC
$(ROOT_MAC_DIR)/%: $(OBJS_DIR)/% $(ROOT_MOD_DIR) $(ROOT_MAC_DIR) FRC
$(INS.file)
+$(ROOT_OVERLAY_DIR)/%: $(OBJS_DIR)/% $(ROOT_MOD_DIR) $(ROOT_OVERLAY_DIR) FRC
+ $(INS.file)
+
$(USR_DRV_DIR)/%: $(OBJS_DIR)/% $(USR_DRV_DIR) FRC
$(INS.file)
diff --git a/usr/src/uts/Makefile.uts b/usr/src/uts/Makefile.uts
index a508c37287..841e84e40a 100644
--- a/usr/src/uts/Makefile.uts
+++ b/usr/src/uts/Makefile.uts
@@ -400,6 +400,7 @@ ROOT_DACF_DIR_32 = $(ROOT_MOD_DIR)/dacf
ROOT_CRYPTO_DIR_32 = $(ROOT_MOD_DIR)/crypto
ROOT_MAC_DIR_32 = $(ROOT_MOD_DIR)/mac
ROOT_CC_DIR_32 = $(ROOT_MOD_DIR)/cc
+ROOT_OVERLAY_DIR_32 = $(ROOT_MOD_DIR)/overlay
ROOT_KICONV_DIR_32 = $(ROOT_MOD_DIR)/kiconv
ROOT_KERN_DIR_64 = $(ROOT_MOD_DIR)/$(SUBDIR64)
@@ -428,6 +429,7 @@ ROOT_DACF_DIR_64 = $(ROOT_MOD_DIR)/dacf/$(SUBDIR64)
ROOT_CRYPTO_DIR_64 = $(ROOT_MOD_DIR)/crypto/$(SUBDIR64)
ROOT_MAC_DIR_64 = $(ROOT_MOD_DIR)/mac/$(SUBDIR64)
ROOT_CC_DIR_64 = $(ROOT_MOD_DIR)/cc/$(SUBDIR64)
+ROOT_OVERLAY_DIR_64 = $(ROOT_MOD_DIR)/overlay/$(SUBDIR64)
ROOT_KICONV_DIR_64 = $(ROOT_MOD_DIR)/kiconv/$(SUBDIR64)
ROOT_KERN_DIR = $(ROOT_KERN_DIR_$(CLASS))
@@ -456,6 +458,7 @@ ROOT_DACF_DIR = $(ROOT_DACF_DIR_$(CLASS))
ROOT_CRYPTO_DIR = $(ROOT_CRYPTO_DIR_$(CLASS))
ROOT_MAC_DIR = $(ROOT_MAC_DIR_$(CLASS))
ROOT_CC_DIR = $(ROOT_CC_DIR_$(CLASS))
+ROOT_OVERLAY_DIR = $(ROOT_OVERLAY_DIR_$(CLASS))
ROOT_KICONV_DIR = $(ROOT_KICONV_DIR_$(CLASS))
ROOT_FIRMWARE_DIR = $(ROOT_MOD_DIR)/firmware
@@ -475,6 +478,7 @@ ROOT_MOD_DIRS_32 += $(ROOT_CPU_DIR_32) $(ROOT_FONT_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_TOD_DIR_32) $(ROOT_DACF_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_CRYPTO_DIR_32) $(ROOT_MAC_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_CC_DIR_32)
+ROOT_MOD_DIRS_32 += $(ROOT_OVERLAY_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_KICONV_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_FIRMWARE_DIR)
@@ -568,7 +572,7 @@ PARALLEL_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \
$(MMU_KMODS) $(DACF_KMODS) $(EXPORT_KMODS) $(IPP_KMODS) \
$(CRYPTO_KMODS) $(PCBE_KMODS) \
$(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \
- $(BRAND_KMODS) $(KICONV_KMODS) $(CC_KMODS) \
+ $(BRAND_KMODS) $(KICONV_KMODS) $(CC_KMODS) $(OVERLAY_KMODS) \
$(SOCKET_KMODS)
KMODS = $(GENUNIX_KMODS) $(PARALLEL_KMODS)
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index b6729071bf..f87e659a2b 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -705,6 +705,11 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \
VNIC_OBJS += vnic_ctl.o vnic_dev.o
+OVERLAY_OBJS += overlay.o overlay_fm.o overlay_mux.o overlay_plugin.o \
+ overlay_prop.o overlay_target.o
+
+OVERLAY_VXLAN_OBJS += overlay_vxlan.o
+
SIMNET_OBJS += simnet.o
IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index c3ca37feb5..870758e866 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -1004,6 +1004,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/npi/%.c
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/%.s
$(COMPILE.s) -o $@ $<
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/plugins/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/pci-ide/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 58e6d95e99..5d42a69fa2 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -22,7 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2018, Joyent, Inc.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -388,7 +388,19 @@ udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max,
if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)))
return (def);
- if (MBLKL(mp) < VXLAN_HDR_LEN) {
+ /*
+ * The following logic is VXLAN specific to get at the header. If we
+ * have other formats, e.g. GENEVE, then we should ignore this.
+ *
+ * The kernel overlay device often puts a first mblk_t for the data
+ * which is just the encap. If so, then we're going to use that and try
+ * to avoid a pull up.
+ */
+ if (MBLKL(mp) == VXLAN_HDR_LEN) {
+ if (mp->b_cont == NULL)
+ return (def);
+ mp = mp->b_cont;
+ } else if (MBLKL(mp) < VXLAN_HDR_LEN) {
return (def);
} else {
szused = VXLAN_HDR_LEN;
diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c
index eca17349c3..dbcd9caea8 100644
--- a/usr/src/uts/common/io/dld/dld_drv.c
+++ b/usr/src/uts/common/io/dld/dld_drv.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent Inc.
* Copyright (c) 2017, Joyent, Inc.
*/
@@ -631,7 +632,7 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set,
cred_t *cred, int mode)
{
int err = EINVAL;
- dls_dl_handle_t dlh = NULL;
+ dls_dl_handle_t dlh = NULL;
dls_link_t *dlp = NULL;
mac_perim_handle_t mph = NULL;
dld_ioc_macprop_t *kprop;
@@ -1327,7 +1328,7 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred,
{
int ret = 0;
mac_perim_handle_t mph = NULL;
- dls_dl_handle_t dlh = NULL;
+ dls_dl_handle_t dlh = NULL;
dls_link_t *dlp = NULL;
dld_ioc_gettran_t *dgt = karg;
@@ -1371,7 +1372,7 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred,
{
int ret = 0;
mac_perim_handle_t mph = NULL;
- dls_dl_handle_t dlh = NULL;
+ dls_dl_handle_t dlh = NULL;
dls_link_t *dlp = NULL;
dld_ioc_tranio_t *dti = karg;
uint8_t buf[256];
@@ -1424,7 +1425,7 @@ drv_ioc_getled(void *karg, intptr_t arg, int mode, cred_t *cred,
{
int ret = 0;
mac_perim_handle_t mph = NULL;
- dls_dl_handle_t dlh = NULL;
+ dls_dl_handle_t dlh = NULL;
dls_link_t *dlp = NULL;
dld_ioc_led_t *dil = karg;
@@ -1470,7 +1471,7 @@ drv_ioc_setled(void *karg, intptr_t arg, int mode, cred_t *cred,
{
int ret = 0;
mac_perim_handle_t mph = NULL;
- dls_dl_handle_t dlh = NULL;
+ dls_dl_handle_t dlh = NULL;
dls_link_t *dlp = NULL;
dld_ioc_led_t *dil = karg;
@@ -1585,7 +1586,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = {
{SIMNET_IOC, "simnet", 0, NULL, 0},
{BRIDGE_IOC, "bridge", 0, NULL, 0},
{IPTUN_IOC, "iptun", 0, NULL, 0},
- {IBPART_IOC, "ibp", -1, NULL, 0}
+ {IBPART_IOC, "ibp", -1, NULL, 0},
+ {OVERLAY_IOC, "overlay", 0, NULL, 0}
};
#define DLDIOC_CNT \
(sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t))
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
index 605cb51bf7..167ed2b90f 100644
--- a/usr/src/uts/common/io/mac/mac_client.c
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -4524,7 +4524,13 @@ mac_addr_len(mac_handle_t mh)
boolean_t
mac_is_vnic(mac_handle_t mh)
{
- return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC);
+ return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0);
+}
+
+boolean_t
+mac_is_overlay(mac_handle_t mh)
+{
+ return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0);
}
mac_handle_t
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index bfb41afe5e..0da404853c 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -605,6 +605,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
*
* TODO: Cleanup and tighten some of the assumptions.
*/
+boolean_t mac_check_overlay = B_TRUE;
boolean_t mac_use_bw_heuristic = B_TRUE;
static int
mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
@@ -612,6 +613,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
uint64_t cpu_speed, bw = 0;
int srings = 0;
boolean_t bw_enabled = B_FALSE;
+ mac_client_impl_t *mcip = flent->fe_mcip;
ASSERT(!(flent->fe_type & FLOW_USER));
if (flent->fe_resource_props.mrp_mask & MRP_MAXBW &&
@@ -639,7 +641,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
*/
if (mac_soft_ring_enable)
srings = srings * 2;
+ } else if (mac_check_overlay == B_TRUE &&
+ (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) {
+ /* Is this a VNIC on an overlay? */
+ mac_handle_t mh = (mac_handle_t)mcip->mci_mip;
+ if (mac_is_overlay(mh) == B_TRUE) {
+ srings = mac_rx_soft_ring_10gig_count;
+ }
}
+
+
} else {
/*
* Soft ring computation using CPU speed and specified
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
index bfaf232d25..bcca602589 100644
--- a/usr/src/uts/common/io/mac/mac_provider.c
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -393,6 +393,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp)
if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
mip->mi_state_flags |= MIS_IS_AGGR;
+ if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
+ mip->mi_state_flags |= MIS_IS_OVERLAY;
+
mac_addr_factory_init(mip);
mac_transceiver_init(mip);
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c
new file mode 100644
index 0000000000..e43f3671b4
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.c
@@ -0,0 +1,2184 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay Devices
+ *
+ * Overlay devices provide a means for creating overlay networks, a means of
+ * multiplexing multiple logical, isolated, and discrete layer two and layer
+ * three networks on top of one physical network.
+ *
+ * In general, these overlay devices encapsulate the logic to answer two
+ * different questions:
+ *
+ * 1) How should I transform a packet to put it on the wire?
+ * 2) Where should I send a transformed packet?
+ *
+ * Each overlay device is presented to the user as a GLDv3 device. While the
+ * link itself cannot have an IP interface created on top of it, it allows for
+ * additional GLDv3 devices, such as a VNIC, to be created on top of it which
+ * can be plumbed up with IP interfaces.
+ *
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * The logical overlay device that a user sees in dladm(1M) is a combination of
+ * two different components that work together. The first component is this
+ * kernel module, which is responsible for answering question one -- how should
+ * I transform a packet to put it on the wire.
+ *
+ * The second component is what we call the virtual ARP daemon, or varpd. It is
+ * a userland component that is responsible for answering the second question --
+ * Where should I send a transformed packet. Instances of the kernel overlay
+ * GLDv3 device ask varpd the question of where should a packet go.
+ *
+ * The split was done for a few reasons. Importantly, we wanted to keep the act
+ * of generating encapsulated packets in the kernel so as to ensure that the
+ * general data path was fast and also kept simple. On the flip side, while the
+ * question of where should something go may be simple, it may often be
+ * complicated and need to interface with several different external or
+ * distributed systems. In those cases, it's simpler to allow for the full
+ * flexibility of userland to be brought to bear to solve that problem and in
+ * general, the path isn't very common.
+ *
+ * The following is what makes up the logical overlay device that a user would
+ * create with dladm(1M).
+ *
+ * Kernel Userland
+ * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
+ * . +--------+ +--------+ +--------+ . . .
+ * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . .
+ * . +--------+ +--------+ +--------+ . . .
+ * . | | | . . .
+ * . | | | . . .
+ * . +------------+-----------+ . . .
+ * . | . . /dev/overlay .
+ * . +--------------+ . . . +------------+ .
+ * . | | . . . | | .
+ * . | Overlay |======*=================| Virtual | .
+ * . | GLDv3 Device |========================| ARP Daemon | .
+ * . | | . . | | .
+ * . +--------------+ . . +------------+ .
+ * . | . . | .
+ * . | . . | .
+ * . +----------------+ . . +--------+ .
+ * . | Overlay | . . | varpd | .
+ * . | Encapsulation | . . | Lookup | .
+ * . | Plugin | . . | Plugin | .
+ * . +----------------+ . . +--------+ .
+ * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
+ *
+ *
+ * This image shows the two different components and where they live.
+ * Importantly, it also shows that both the kernel overlay device and the
+ * userland varpd both support plugins. The plugins actually implement the
+ * things that users care about and the APIs have been designed to try to
+ * minimize the amount of things that a module writer needs to worry about.
+ *
+ * IDENTIFIERS
+ *
+ * Every overlay device is defined by a unique identifier which is the overlay
+ * identifier. Its purpose is similar to that of a VLAN identifier, it's a
+ * unique number that is used to differentiate between different entries on the
+ * wire.
+ *
+ * ENCAPSULATION
+ *
+ * An overlay encapsulation plugin is a kernel miscellaneous module whose
+ * purpose is to contain knowledge about how to transform packets to put them
+ * onto the wire and to take them off. An example of an encapsulation plugin is
+ * vxlan. It's also how support for things like nvgre or geneve would be brought
+ * into the system.
+ *
+ * Each encapsulation plugin defines a series of operation vectors and
+ * properties. For the full details on everything they should provide, please
+ * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
+ * for telling the system what information is required to send a packet. For
+ * example, vxlan is defined to send everything over a UDP packet and therefore
+ * requires a port and an IP address, while nvgre on the other hand is its own
+ * IP type and therefore just requires an IP address. In addition, it also
+ * provides information about the kind of socket that should be created. This is
+ * used by the kernel multiplexor, more on that in the Kernel Components
+ * section.
+ *
+ * LOOKUPS
+ *
+ * The kernel communicates requests for lookups over the character device
+ * /dev/overlay. varpd is responsible for listening for requests on that device
+ * and answering them. The character device is specific to the target path and
+ * varpd.
+ *
+ * Much as the kernel overlay module handles the bulk of the scaffolding but
+ * leaves the important work to the encapsulation plugin, varpd provides a
+ * similar role and leaves the full brunt of lookups to a userland dynamic
+ * shared object which implements the logic of lookups.
+ *
+ * Each lookup plugin defines a series of operation vectors and properties. For
+ * the full details on everything that they should provide, please read
+ * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
+ * address and asked to give an address on the physical network that it should
+ * be sent to. In addition, they handle questions related to how to handle
+ * things like broadcast and multicast traffic, etc.
+ *
+ * ----------
+ * Properties
+ * ----------
+ *
+ * A device from a dladm perspective has a unique set of properties that are
+ * combined from three different sources:
+ *
+ * 1) Generic properties that every overlay device has
+ * 2) Properties that are specific to the encapsulation plugin
+ * 3) Properties that are specific to the lookup plugin
+ *
+ * All of these are exposed in a single set of properties in dladm. Note that
+ * these are not necessarily traditional link properties. However, if something
+ * is both a traditional GLDv3 link property, say the MTU of a device, and a
+ * specific property here, then the driver ensures that all existing GLDv3
+ * specific means of manipulating it are used and wraps up its private property
+ * interfaces to ensure that works.
+ *
+ * Properties in the second and third category are prefixed with the name of
+ * their module. For example, the vxlan encapsulation module has a property
+ * called the 'listen_ip'. This property would show up in dladm as
+ * 'vxlan/listen_ip'. This allows different plugins to both use similar names
+ * for similar properties and to also have independent name spaces so that
+ * overlapping names do not conflict with anything else.
+ *
+ * While the kernel combines both sets one and two into a single coherent view,
+ * it does not do anything with respect to the properties that are owned by the
+ * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
+ * charge of bridging these two worlds into one magical experience for the user.
+ * It carries the burden of knowing about both overlay specific and varpd
+ * specific properties. Importantly, we want to maintain this distinction. We
+ * don't want to treat the kernel as an arbitrary key/value store for varpd and
+ * we want the kernel to own its own data and not have to ask userland for
+ * information that it owns.
+ *
+ * Every property in the system has the following attributes:
+ *
+ * o A name
+ * o A type
+ * o A size
+ * o Permissions
+ * o Default value
+ * o Valid value ranges
+ * o A value
+ *
+ * Everything except for the value is obtained by callers through the propinfo
+ * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
+ * currently 256 bytes.
+ *
+ * The following are the supported types of properties:
+ *
+ * OVERLAY_PROP_T_INT
+ *
+ * A signed integer, its length is 8 bytes, corresponding to a
+ * int64_t.
+ *
+ * OVERLAY_PROP_T_UINT
+ *
+ * An unsigned integer, its length is 8 bytes, corresponding to a
+ * uint64_t.
+ *
+ * OVERLAY_PROP_T_IP
+ *
+ * A struct in6_addr, it has a fixed size.
+ *
+ * OVERLAY_PROP_T_STRING
+ *
+ * A null-terminated character string encoded in either ASCII or
+ * UTF-8. Note that the size of the string includes the null
+ * terminator.
+ *
+ * The next thing that we apply to a property is its permission. The permissions
+ * are put together by the bitwise or of the following flags and values.
+ *
+ * OVERLAY_PROP_PERM_REQ
+ *
+ * This indicates a required property. A property that is required
+ * must be set by a consumer before the device can be created. If a
+ * required property has a default value, this constraint is
+ * loosened because the default value defines the value.
+ *
+ * OVERLAY_PROP_PERM_READ
+ *
+ * This indicates that a property can be read. All properties will
+ * have this value set.
+ *
+ * OVERLAY_PROP_PERM_WRITE
+ *
+ * This indicates that a property can be written to and thus
+ * updated by userland. Properties that are only intended to
+ * display information, will not have OVERLAY_PROP_PERM_WRITE set.
+ *
+ * In addition, a few additional values are defined as a convenience to
+ * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
+ * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
+ * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
+ * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
+ * property should generally be a constant across its lifetime.
+ *
+ * A property may optionally have a default value. If it does have a default
+ * value, and that property is not set to be a different value, then the default
+ * value is inherited automatically. It also means that if the default value is
+ * acceptable, there is no need to set the value for a required property. For
+ * example, the vxlan module has the vxlan/listen_port property which is
+ * required, but has a default value of 4789 (the IANA assigned port). Because
+ * of that default value, there is no need for it to be set.
+ *
+ * Finally, a property may declare a list of valid values. These valid values
+ * are used for display purposes, they are not enforced by the broader system,
+ * but merely allow a means for the information to be communicated to the user
+ * through dladm(1M). Like a default value, this is optional.
+ *
+ * The general scaffolding does not do very much with respect to the getting and
+ * setting of properties. That is really owned by the individual plugins
+ * themselves.
+ *
+ * -----------------------------
+ * Destinations and Plugin Types
+ * -----------------------------
+ *
+ * Both encapsulation and lookup plugins define the kinds of destinations that
+ * they know how to support. There are three different pieces of information
+ * that can currently be used to address a destination, all of which are
+ * summarized in the type overlay_point_t. Any combination of these is
+ * supported.
+ *
+ * OVERLAY_PLUGIN_D_ETHERNET
+ *
+ * An Ethernet MAC address is required.
+ *
+ * OVERLAY_PLUGIN_D_IP
+ *
+ * An IP address is required. All IP addresses used by the overlay
+ * system are transmitted as IPv6 addresses. IPv4 addresses can be
+ * represented by using IPv4-mapped IPv6 addresses.
+ *
+ * OVERLAY_PLUGIN_D_PORT
+ *
+ * A TCP/UDP port is required.
+ *
+ * A kernel encapsulation plugin declares which of these that it requires, it's
+ * a static set. On the other hand, a userland lookup plugin can be built to
+ * support all of these or any combination thereof. It gets passed the required
+ * destination type, based on the kernel encapsulation method, and then it makes
+ * the determination as to whether or not it supports it. For example, the
+ * direct plugin can support either an IP or both an IP and a port, it simply
+ * doesn't display the direct/dest_port property in the cases where a port is
+ * not required to support this.
+ *
+ * The user lookup plugins have two different modes of operation which
+ * determines how they interact with the broader system and how look ups are
+ * performed. These types are:
+ *
+ * OVERLAY_TARGET_POINT
+ *
+ * A point to point plugin has a single static definition for where
+ * to send all traffic. Every packet in the system always gets sent
+ * to the exact same destination which is programmed into the
+ * kernel when the general device is activated.
+ *
+ * OVERLAY_TARGET_DYNAMIC
+ *
+ * A dynamic plugin does not have a single static definition.
+ * Instead, for each destination, the kernel makes an asynchronous
+ * request to varpd to determine where the packet should be routed,
+ * and if a specific destination is found, then that destination is
+ * cached in the overlay device's target cache.
+ *
+ * This distinction, while important for the general overlay device's operation,
+ * is not important to the encapsulation plugins. They don't need to know about
+ * any of these pieces. It's just a concern for varpd, the userland plugin, and
+ * the general overlay scaffolding.
+ *
+ * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
+ * maintain a target cache, and instead just keeps track of the destination and
+ * always sends encapsulated packets to that address. When the target type is of
+ * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
+ * destinations. These destinations are kept around in an instance of a
+ * reference hash that is specific to the given overlay device. Entries in the
+ * cache can be invalidated and replaced by varpd and its lookup plugins.
+ *
+ * ----------------------------------
+ * Kernel Components and Architecture
+ * ----------------------------------
+ *
+ * There are multiple pieces inside the kernel that work together, there is the
+ * general overlay_dev_t structure, which is the logical GLDv3 device, but it
+ * itself has references to things like an instance of an encapsulation plugin,
+ * a pointer to a mux and a target cache. It can roughly be summarized in the
+ * following image:
+ *
+ * +------------------+
+ * | global |
+ * | overlay list |
+ * | overlay_dev_list |
+ * +------------------+
+ * |
+ * | +-----------------------+ +---------------+
+ * +->| GLDv3 Device |----------->| GLDv3 Device | -> ...
+ * | overlay_dev_t | | overlay_dev_t |
+ * | | +---------------+
+ * | |
+ * | mac_handle_t -----+---> GLDv3 handle to MAC
+ * | datalink_id_t -----+---> Datalink ID used by DLS
+ * | overlay_dev_flag_t ---+---> Device state
+ * | uint_t -----+---> Current device MTU
+ * | uint_t -----+---> In-progress RX operations
+ * | uint_t -----+---> In-progress TX operations
+ * | char[] -----+---> FMA degraded message
+ * | void * -----+---> plugin private data
+ * | overlay_target_t * ---+---------------------+
+ * | overlay_plugin_t * ---+---------+ |
+ * +-----------------------+ | |
+ * ^ | |
+ * +--------------------+ | | |
+ * | Kernel Socket | | | |
+ * | Multiplexor | | | |
+ * | overlay_mux_t | | | |
+ * | | | | |
+ * | avl_tree_t -+--+ | |
+ * | uint_t -+--> socket family | |
+ * | uint_t -+--> socket type | |
+ * | uint_t -+--> socket protocol | |
+ * | ksocket_t -+--> I/O socket | |
+ * | struct sockaddr * -+--> ksocket address | |
+ * | overlay_plugin_t --+--------+ | |
+ * +--------------------+ | | |
+ * | | |
+ * +-------------------------+ | | |
+ * | Encap Plugin |<--+-----------+ |
+ * | overlay_plugin_t | |
+ * | | |
+ * | char * ---+--> plugin name |
+ * | overlay_plugin_ops_t * -+--> plugin downcalls |
+ * | char ** (props) ---+--> property list |
+ * | uint_t ---+--> id length |
+ * | overlay_plugin_flags_t -+--> plugin flags |
+ * | overlay_plugin_dest_t --+--> destination type v
+ * +-------------------------+ +-------------------------+
+ * | Target Cache |
+ * | overlay_target_t |
+ * | |
+ * cache mode <--+- overlay_target_mode_t |
+ * dest type <--+- overlay_plugin_dest_t |
+ * cache flags <--+- overlay_target_flag_t |
+ * varpd id <--+- uint64_t |
+ * outstanding varpd reqs. <--+- uint_t |
+ * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t |
+ * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t |
+ * | +-------------------------+
+ * +-----------------------+
+ * |
+ * v
+ * +-------------------------------+ +------------------------+
+ * | Target Entry |-->| Target Entry |--> ...
+ * | overlay_target_entry_t | | overlay_target_entry_t |
+ * | | +------------------------+
+ * | |
+ * | overlay_target_entry_flags_t -+--> Entry flags
+ * | uint8_t[ETHERADDRL] ---+--> Target MAC address
+ * | overlay_target_point_t ---+--> Target underlay address
+ * | mblk_t * ---+--> outstanding mblk head
+ * | mblk_t * ---+--> outstanding mblk tail
+ * | size_t ---+--> outstanding mblk size
+ * +-------------------------------+
+ *
+ * The primary entries that we care about are the overlay_dev_t, which
+ * correspond to each overlay device that is created with dladm(1M). Globally,
+ * these devices are maintained in a simple list_t which is protected with a
+ * lock. Hence, these include important information such as the mac_handle_t
+ * and a datalink_id_t which is used to interact with the broader MAC and DLS
+ * ecosystem. We also maintain additional information such as the current state,
+ * outstanding operations, the mtu, and importantly, the plugin's private data.
+ * This is the instance of an encapsulation plugin that gets created as part of
+ * creating an overlay device. Another aspect of this is that the overlay_dev_t
+ * also includes information with respect to FMA. For more information, see the
+ * FMA section.
+ *
+ * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
+ * is the encapsulation plugin. This allows the device to make downcalls into it
+ * based on doing things like getting and setting properties. Otherwise, the
+ * plugin itself is a fairly straightforward entity. They are maintained in an
+ * (not pictured above) list. The plugins themselves mostly maintain things like
+ * the static list of properties, what kind of destination they require, and the
+ * operations vector. A given module may contain more if necessary.
+ *
+ * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
+ * maintains a ksocket and it is through the mux that we send and receive
+ * message blocks. The mux represents a socket type and address, as well as a
+ * plugin. Multiple overlay_dev_t devices may then share the same mux. For
+ * example, consider the case where you have different instances of vxlan all on
+ * the same underlay network. These would all logically share the same IP
+ * address and port that packets are sent and received on; however, what differs
+ * is the decapsulation ID.
+ *
+ * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
+ * a socket, we enable a direct callback on the ksocket. This means that
+ * whenever a message block chain is received, rather than sitting there and
+ * getting a callback in a context and kicking that back out to a taskq, the
+ * data instead comes into the callback function overlay_mux_recv().
+ *
+ * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
+ * function) to transmit. It receives encapsulated packets, decapsulates them to
+ * determine the overlay identifier, looks up the given device that matches that
+ * identifier, and then causes the broader MAC world to receive the packet with
+ * a call to mac_rx().
+ *
+ * Today, we don't do too much that's special with the ksocket; however, as
+ * hardware is gaining understanding for these encapsulation protocols, we'll
+ * probably want to think of better ways to get those capabilities passed down
+ * and potentially better ways to program receive filters so they get directly
+ * to us. Though, that's all fantasy future land.
+ *
+ * The next part of the puzzle is the target cache. The purpose of the target
+ * cache is to cache where we should send a packet on the underlay network,
+ * given its mac address. The target cache operates in two modes depending on
+ * whether the lookup module was declared to be OVERLAY_TARGET_POINT or
+ * OVERLAY_TARGET_DYNAMIC.
+ *
+ * In the case where the target cache has been programmed to be
+ * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
+ * which has the destination that we send everything to, no matter the
+ * destination mac address.
+ *
+ * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
+ * are much more interesting and as a result, more complicated. We primarily
+ * store lists of overlay_target_entry_t's which are stored in both an avl tree
+ * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
+ * is only used for a few of the target ioctls used to dump data such that we
+ * can get a consistent iteration order for things like dladm show-overlay -t.
+ * The key that we use for the reference hashtable is based on the mac address
+ * in the cache and currently we just do a simple CRC32 to transform it into a
+ * hash.
+ *
+ * Each entry maintains a set of flags to indicate the current status of the
+ * request. The flags may indicate one of three states: that current cache entry
+ * is valid, that the current cache entry has been directed to drop all output,
+ * and that the current cache entry is invalid and may be being looked up. In
+ * the case where it's valid, we just take the destination address and run with
+ * it.
+ *
+ * If it's invalid and a lookup has not been made, then we start the process
+ * that prepares a query that will make its way up to varpd. The cache entry
+ * maintains a message block chain of outstanding message blocks and a
+ * size. These lists are populated only when we don't know the answer as to
+ * where should these be sent. The size entry is used to cap the amount of
+ * outstanding data that we don't know the answer to. If we exceed a cap on the
+ * amount of outstanding data (currently 1 Mb), then we'll drop any additional
+ * packets. Once we get an answer indicating a valid destination, we transmit
+ * any outstanding data to that place. The full story on how we look that up
+ * is discussed in the section on the Target Cache Life Cycle.
+ *
+ * ------------------------
+ * FMA and Degraded Devices
+ * ------------------------
+ *
+ * Every kernel overlay device keeps track of its FMA state. Today in FMA we
+ * cannot represent partitions between resources nor can we represent that a
+ * given minor node of a pseudo device has failed -- if we degrade the overlay
+ * device, then the entire dev_info_t is degraded. However, we still want to be
+ * able to indicate to administrators that things may go wrong.
+ *
+ * To this end, we've added a notion of a degraded state to every overlay
+ * device. This state is primarily dictated by userland and it can happen for
+ * various reasons. Generally, because a userland lookup plugin has been
+ * partitioned, or something has gone wrong such that there is no longer any
+ * userland lookup module for a device, then we'll mark it degraded.
+ *
+ * As long as any of our minor instances is degraded, then we'll fire off the
+ * FMA event to note that. Once the last degraded instance is no longer
+ * degraded, then we'll end up telling FMA that we're all clean.
+ *
+ * To help administrators get a better sense of which of the various minor
+ * devices is wrong, we store the odd_fmamsg[] character array. This character
+ * array can be fetched by doing a dladm show-overlay -f.
+ *
+ * Note, that it's important that we do not update the link status of the
+ * devices. We want to remain up as much as possible. By changing the link in a
+ * degraded state, this may end up making things worse. We may still actually
+ * have information in the target cache and if we mark the link down, that'll
+ * result in not being able to use it. The reason being that this'll mark all
+ * the downstream VNICs down which will go to IP and from there we end up
+ * dealing with sadness.
+ *
+ * -----------------------
+ * Target Cache Life Cycle
+ * -----------------------
+ *
+ * This section only applies when we have a lookup plugin of
+ * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
+ * OVERLAY_TARGET_POINT.
+ *
+ * While we got into the target cache in the general architecture section, it's
+ * worth going into more details as to how this actually works and showing some
+ * examples and state machines. Recall that a target cache entry basically has
+ * the following state transition diagram:
+ *
+ * Initial state
+ * . . . . . . first access . . . varpd lookup enqueued
+ * . . .
+ * . . .
+ * +-------+ . +----------+ .
+ * | No |------*---->| Invalid |-------*----+
+ * | Entry | | Entry | |
+ * +-------+ +----------+ |
+ * varpd ^ ^ varpd |
+ * invalidate | | drop |
+ * . . . * * . . v
+ * +-------+ | | +---------+
+ * | Entry |--->-----+ +----<----| Entry |
+ * | Valid |<----------*---------<----| Pending |->-+ varpd
+ * +-------+ . +---------+ * . . drop, but
+ * . varpd ^ | other queued
+ * . success | | entries
+ * +-----+
+ *
+ * When the table is first created, it is empty. As we attempt to lookup entries
+ * and we find there is no entry at all, we'll create a new table entry for it.
+ * At that point the entry is technically in an invalid state, that means that
+ * we have no valid data from varpd. In that case, we'll go ahead and queue the
+ * packet into the entry's pending chain, and queue a varpd lookup, setting the
+ * OVERLAY_ENTRY_F_PENDING flag in the process.
+ *
+ * If additional mblk_t's come in for this entry, we end up appending them to
+ * the tail of the chain, if and only if, we don't exceed the threshold for the
+ * amount of space they can take up. An entry remains pending until we get a
+ * varpd reply. If varpd replies with a valid result, we move to the valid
+ * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
+ * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
+ *
+ * Once an entry is valid, it stays valid until user land tells us to invalidate
+ * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
+ * OVERLAY_TARG_CACHE_SET respectively.
+ *
+ * If the lookup fails with a call to drop the packet, then the next state is
+ * determined by the state of the queue. If the set of outstanding entries is
+ * empty, then we just transition back to the invalid state. If instead, the
+ * set of outstanding entries is not empty, then we'll queue another entry and
+ * stay in the same state, repeating this until the number of requests is
+ * drained.
+ *
+ * The following images describe the flow of a given lookup and where the
+ * overlay_target_entry_t is at any given time.
+ *
+ * +-------------------+
+ * | Invalid Entry | An entry starts off as an invalid entry
+ * | de:ad:be:ef:00:00 | and only exists in the target cache.
+ * +-------------------+
+ *
+ * ~~~~
+ *
+ * +---------------------+
+ * | Global list_t | A mblk_t comes in for an entry. We
+ * | overlay_target_list | append it to the overlay_target_list.
+ * +---------------------+
+ * |
+ * v
+ * +-------------------+ +-------------------+
+ * | Pending Entry |----->| Pending Entry |--->...
+ * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 |
+ * +-------------------+ +-------------------+
+ *
+ * ~~~~
+ *
+ * +--------------------------+
+ * | /dev/overlay minor state | User land said that it would look up an
+ * | overlay_target_hdl_t | entry for us. We remove it from the
+ * +--------------------------+ global list and add it to the handle's
+ * | outstanding list.
+ * |
+ * v
+ * +-------------------+ +-------------------+
+ * | Pending Entry |----->| Pending Entry |
+ * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 |
+ * +-------------------+ +-------------------+
+ *
+ * ~~~~
+ *
+ * +-------------------+
+ * | Valid Entry | varpd returned an answer with
+ * | de:ad:be:ef:00:00 | OVERLAY_TARG_RESPOND and the target cache
+ * | 10.169.23.42:4789 | entry is now populated with a
+ * +-------------------+ destination and marked as valid
+ *
+ *
+ * The lookup mechanism is performed via a series of operations on the character
+ * pseudo-device /dev/overlay. The only thing that uses this device is the
+ * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
+ * granting a new minor number which maintains its own state. We maintain this
+ * state so that way if an outstanding lookup was queued to something that
+ * crashed or closed its handle without responding, we can know about this and
+ * thus handle it appropriately.
+ *
+ * When a lookup is first created it's added to our global list of outstanding
+ * lookups. To service requests, userland is required to perform an ioctl to ask
+ * for a request. We will block it in the kernel a set amount of time waiting
+ * for a request. When we give a request to a given minor instance of the
+ * device, we remove it from the global list and append the request to the
+ * device's list of outstanding entries, for the reasons we discussed above.
+ * When a lookup comes in, we give user land a smaller amount of information
+ * specific to that packet, the overlay_targ_lookup_t. It includes a request id
+ * to identify this, and then the overlay id, the varpd id, the header and
+ * packet size, the source and destination mac address, the SAP, and any
+ * potential VLAN header.
+ *
+ * At that point, it stays in that outstanding list until one of two ioctls are
+ * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
+ * userland may also perform other operations. For example, it may use
+ * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
+ * analysis of what to do beyond what we gave it initially. This is useful for
+ * providing proxy arp and the like. Finally, there are two other ioctls that
+ * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the
+ * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which
+ * causes us to encapsulate and send out the packet they've given us.
+ *
+ *
+ * Finally, through the target cache, several ioctls are provided to allow for
+ * interrogation and management of the cache. They allow for individual entries
+ * to be retrieved, set, or have the entire table flushed. For the full set of
+ * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
+ *
+ * ------------------
+ * Sample Packet Flow
+ * ------------------
+ *
+ * There are a lot of pieces here; hopefully an example of how this all fits
+ * together will help clarify and elucidate what's going on. We're going to
+ * first track an outgoing packet, e.g. one that is sent from an IP interface on
+ * a VNIC on top of an overlay device, and then we'll look at what it means to
+ * respond to that.
+ *
+ *
+ * +----------------+ +--------------+ +------------------+
+ * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches |
+ * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx |
+ * +----------------+ | VNIC device | | overlay_m_tx() |
+ * +--------------+ +------------------+
+ * |
+ * . lookup . cache |
+ * . drop . miss v
+ * +---------+ . +--------+ . +------------------+
+ * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk |
+ * | mblk_t | | lookup | | in the target |
+ * +---------+ | queued | | cache |
+ * ^ +--------+ +------------------+
+ * on send | | | cache
+ * error . . * *. . lookup * . . hit
+ * | | success v
+ * | | +------------------+
+ * +-----------------+ +--------------->| call plugin |
+ * | Send out | | ovpo_encap() to |
+ * | overlay_mux_t's |<----------------------------------| get encap mblk_t |
+ * | ksocket | +------------------+
+ * +-----------------+
+ *
+ * The receive end point looks a little different and looks more like:
+ *
+ * +------------------+ +----------------+ +-----------+
+ * | mblk_t comes off |---->| enter netstack |--->| delivered |---+
+ * | the physical | | IP stack | | to | * . . direct
+ * | device | +----------------+ | ksocket | | callback
+ * +------------------+ +-----------+ |
+ * . overlay id |
+ * . not found v
+ * +-----------+ . +-----------------+ +--------------------+
+ * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() |
+ * | mblk_t | | ovpo_decap() to | +--------------------+
+ * +-----------+ | decap mblk_t |
+ * +-----------------+
+ * |
+ * * . . overlay id
+ * v found
+ * +--------+ +----------------+
+ * | adjust |----->| call mac_rx |
+ * | mblk_t | | on original |
+ * +--------+ | decaped packet |
+ * +----------------+
+ *
+ * ------------------
+ * Netstack Awareness
+ * ------------------
+ *
+ * In the above image we note that this enters a netstack. Today the only
+ * netstack that can be is the global zone as the overlay driver itself is not
+ * exactly netstack aware. What this really means is that varpd cannot run in a
+ * non-global zone and an overlay device cannot belong to a non-global zone.
+ * Non-global zones can still have a VNIC assigned to them that's been created
+ * over the overlay device the same way they would if it had been created over
+ * an etherstub or a physical device.
+ *
+ * The majority of the work to make it netstack aware is straightforward and the
+ * biggest thing is to create a netstack module that allows us to hook into
+ * netstack (and thus zone) creation and destruction. From there, we need to
+ * amend the target cache lookup routines that we discussed earlier to not have
+ * a global outstanding list and a global list of handles, but rather, one per
+ * netstack.
+ *
+ * For the mux, we'll need to open the ksocket in the context of the zone, we
+ * can likely do this with a properly composed credential, but we'll need to do
+ * some more work on that path. Finally, we'll want to make sure the dld ioctls
+ * are aware of the zoneid of the caller and we use that appropriately and store
+ * it in the overlay_dev_t.
+ *
+ * -----------
+ * GLDv3 Notes
+ * -----------
+ *
+ * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
+ * relevant and other parts are much less relevant for us. For example, the
+ * GLDv3 is used to toggle the device being put into and out of promiscuous
+ * mode, to program MAC addresses for unicast and multicast hardware filters.
+ * Today, an overlay device doesn't have a notion of promiscuous mode nor does
+ * it have a notion of unicast and multicast addresses programmed into the
+ * device. Instead, for the purposes of the hardware filter, we don't do
+ * anything and just always accept new addresses being added and removed.
+ *
+ * If the GLDv3 start function has not been called, then we will not use this
+ * device for I/O purposes. Any calls to transmit or receive should be dropped,
+ * though the GLDv3 guarantees us that transmit will not be called without
+ * calling start. Similarly, once stop is called, then no packets can be dealt
+ * with.
+ *
+ * Today we don't support the stat interfaces, though there's no good reason
+ * that we shouldn't assemble some of the stats based on what we have in the
+ * future.
+ *
+ * When it comes to link properties, many of the traditional link properties do
+ * not apply and many others MAC handles for us. For example, we don't need to
+ * implement anything for overlay_m_getprop() to deal with returning the MTU, as
+ * MAC never calls into us for that. As such, there isn't much of anything to
+ * support in terms of properties.
+ *
+ * Today, we don't support any notion of hardware capabilities. However, if
+ * future NIC hardware or other changes to the system cause it to make sense for
+ * us to emulate logical groups, then we should do that. However, we still do
+ * implement a capab function so that we can identify ourselves as an overlay
+ * device to the broader MAC framework. This is done mostly so that a device
+ * created on top of us can have fanout rings as we don't try to lie about a
+ * speed for our device.
+ *
+ * The other question is what should be done for a device's MTU and margin. We
+ * set our minimum supported MTU to be the minimum value that an IP network may
+ * be set to, 576 -- which mimics what an etherstub does. On the flip side, we
+ * have our upper bound set to 8900. This value comes from the fact that a lot
+ * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
+ * bytes, which isn't exactly the most accurate number, but it'll be good enough
+ * for now. Because of that, our default MTU off of these devices is 1400, as
+ * the default MTU for everything is usually 1500 or whatever the underlying
+ * device is at; however, this is a bit simpler than asking the netstack what
+ * are all the IP interfaces at. It also calls into question how PMTU and PMTU
+ * discovery should work here. The challenge, especially for
+ * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's
+ * not clear that if you have a single bad entry that the overall MTU should be
+ * lowered. Instead, we should figure out a better way of determining these
+ * kinds of PMTU errors and appropriately alerting the administrator via FMA.
+ *
+ * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
+ * or not the underlying encapsulation device supports VLAN tags. If it does,
+ * then we'll set the margin to allow for it, otherwise, we will not.
+ */
+
+#include <sys/conf.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/policy.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/ddifm.h>
+
+#include <sys/dls.h>
+#include <sys/dld_ioc.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_ether.h>
+#include <sys/vlan.h>
+
+#include <sys/overlay_impl.h>
+
+/*
+ * Driver-wide state.
+ *
+ * overlay_dip is handed to MAC as m_dip when a device is registered.
+ * overlay_dev_lock is held whenever overlay_dev_list, the list of all
+ * overlay_dev_t structures, is walked or modified. overlay_macaddr is the
+ * all-zero address handed to MAC as the source address of every device.
+ */
+dev_info_t *overlay_dip;
+static kmutex_t overlay_dev_lock;
+static list_t overlay_dev_list;
+static uint8_t overlay_macaddr[ETHERADDRL] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+
+/*
+ * Identifiers of the generic (non-plugin) device properties. The values are
+ * used as indices into overlay_dev_props[] below, so the two must stay in the
+ * same order.
+ */
+typedef enum overlay_dev_prop {
+ OVERLAY_DEV_P_MTU = 0,
+ OVERLAY_DEV_P_VNETID,
+ OVERLAY_DEV_P_ENCAP,
+ OVERLAY_DEV_P_VARPDID
+} overlay_dev_prop_t;
+
+/*
+ * Number of generic properties; must match the number of entries in
+ * overlay_dev_props[]. Property ids at or above this value refer to plugin
+ * properties (plugin index + OVERLAY_DEV_NPROPS).
+ */
+#define OVERLAY_DEV_NPROPS 4
+static const char *overlay_dev_props[] = {
+ "mtu",
+ "vnetid",
+ "encap",
+ "varpd/id"
+};
+
+/*
+ * Bounds and default for the device MTU; see the MTU discussion in the big
+ * theory statement.
+ */
+#define OVERLAY_MTU_MIN 576
+#define OVERLAY_MTU_DEF 1400
+#define OVERLAY_MTU_MAX 8900
+
+/*
+ * Find the overlay device whose datalink id is 'id' and take a hold on it by
+ * bumping odd_ref. Returns the held device, or NULL if no device on the global
+ * list has that id. Holds are dropped with overlay_hold_rele().
+ */
+overlay_dev_t *
+overlay_hold_by_dlid(datalink_id_t id)
+{
+	overlay_dev_t *dev;
+
+	mutex_enter(&overlay_dev_lock);
+	dev = list_head(&overlay_dev_list);
+	while (dev != NULL && dev->odd_linkid != id)
+		dev = list_next(&overlay_dev_list, dev);
+
+	if (dev != NULL) {
+		/* The reference is taken before the list lock is dropped. */
+		mutex_enter(&dev->odd_lock);
+		dev->odd_ref++;
+		mutex_exit(&dev->odd_lock);
+	}
+	mutex_exit(&overlay_dev_lock);
+
+	return (dev);
+}
+
+/*
+ * Drop a hold previously obtained on 'odd'. Takes odd_lock itself, so the
+ * caller must not already hold it.
+ */
+void
+overlay_hold_rele(overlay_dev_t *odd)
+{
+	kmutex_t *lockp = &odd->odd_lock;
+
+	mutex_enter(lockp);
+	ASSERT(odd->odd_ref > 0);
+	odd->odd_ref -= 1;
+	mutex_exit(lockp);
+}
+
+/*
+ * Note that an I/O operation in the given direction has begun on 'odd'.
+ * 'flag' must be exactly one of OVERLAY_F_IN_RX or OVERLAY_F_IN_TX and the
+ * caller must hold odd_lock. The matching counter is bumped and the flag is
+ * set in odd_flags.
+ */
+void
+overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+	ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
+	ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+	odd->odd_flags |= flag;
+	if ((flag & OVERLAY_F_IN_RX) != 0)
+		odd->odd_rxcount++;
+	if ((flag & OVERLAY_F_IN_TX) != 0)
+		odd->odd_txcount++;
+}
+
+/*
+ * Note that an I/O operation in the given direction has finished on 'odd'.
+ * 'flag' must be exactly one of OVERLAY_F_IN_RX or OVERLAY_F_IN_TX and the
+ * caller must hold odd_lock. When the last outstanding operation for a
+ * direction completes, its flag is cleared and anyone blocked on odd_iowait
+ * is woken up.
+ */
+void
+overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+	boolean_t wake = B_FALSE;
+
+	ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
+	ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+	if ((flag & OVERLAY_F_IN_RX) != 0) {
+		ASSERT(odd->odd_rxcount > 0);
+		if (--odd->odd_rxcount == 0) {
+			odd->odd_flags &= ~OVERLAY_F_IN_RX;
+			wake = B_TRUE;
+		}
+	}
+
+	if ((flag & OVERLAY_F_IN_TX) != 0) {
+		ASSERT(odd->odd_txcount > 0);
+		if (--odd->odd_txcount == 0) {
+			odd->odd_flags &= ~OVERLAY_F_IN_TX;
+			wake = B_TRUE;
+		}
+	}
+
+	if (wake == B_TRUE)
+		cv_broadcast(&odd->odd_iowait);
+}
+
+/*
+ * Block until none of the I/O flags in 'flag' remain set on 'odd'. 'flag' may
+ * only contain bits from OVERLAY_F_IOMASK. The caller must hold odd_lock; it
+ * is dropped while sleeping on odd_iowait and held again on return.
+ */
+static void
+overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+	ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
+	ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+	while ((odd->odd_flags & flag) != 0)
+		cv_wait(&odd->odd_iowait, &odd->odd_lock);
+}
+
+/*
+ * Invoke 'func' on every overlay device with 'arg' as its second argument.
+ * The walk stops early as soon as 'func' returns non-zero. overlay_dev_lock
+ * is held across every callback.
+ */
+void
+overlay_dev_iter(overlay_dev_iter_f func, void *arg)
+{
+	overlay_dev_t *cur;
+
+	mutex_enter(&overlay_dev_lock);
+	cur = list_head(&overlay_dev_list);
+	while (cur != NULL) {
+		if (func(cur, arg) != 0)
+			break;
+		cur = list_next(&overlay_dev_list, cur);
+	}
+	mutex_exit(&overlay_dev_lock);
+}
+
+/*
+ * GLDv3 statistics entry point. No statistics are implemented: every request
+ * fails with ENOTSUP and 'val' is never written. See the GLDv3 notes in the
+ * big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
+{
+ return (ENOTSUP);
+}
+
+/*
+ * GLDv3 start entry point. 'arg' is the overlay_dev_t.
+ *
+ * A device that has not been activated yet cannot be started and fails with
+ * EAGAIN. Otherwise we ask the encapsulation plugin which socket it wants,
+ * open (or share) the corresponding mux, add this device to it and mark the
+ * device as being in the mux. Returns 0 on success or the error reported by
+ * the plugin or by overlay_mux_open().
+ */
+static int
+overlay_m_start(void *arg)
+{
+	overlay_dev_t *odd = arg;
+	overlay_mux_t *mux;
+	int ret, domain, family, prot;
+	struct sockaddr_storage storage;
+	struct sockaddr *sap = (struct sockaddr *)&storage;
+	socklen_t slen;
+	boolean_t activated;
+
+	mutex_enter(&odd->odd_lock);
+	activated = ((odd->odd_flags & OVERLAY_F_ACTIVATED) != 0) ?
+	    B_TRUE : B_FALSE;
+	mutex_exit(&odd->odd_lock);
+	if (activated == B_FALSE)
+		return (EAGAIN);
+
+	ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
+	    &family, &prot, sap, &slen);
+	if (ret != 0)
+		return (ret);
+
+	/* On failure overlay_mux_open() reports the reason through ret. */
+	mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, sap,
+	    slen, &ret);
+	if (mux == NULL)
+		return (ret);
+
+	overlay_mux_add_dev(mux, odd);
+	odd->odd_mux = mux;
+
+	mutex_enter(&odd->odd_lock);
+	ASSERT((odd->odd_flags & OVERLAY_F_IN_MUX) == 0);
+	odd->odd_flags |= OVERLAY_F_IN_MUX;
+	mutex_exit(&odd->odd_lock);
+
+	return (0);
+}
+
+/*
+ * GLDv3 stop entry point. 'arg' is the overlay_dev_t.
+ *
+ * Sets OVERLAY_F_MDDROP so that new transmits are refused, waits for all
+ * in-flight I/O to drain, then takes the device out of its mux and closes the
+ * mux. Finally both OVERLAY_F_IN_MUX and OVERLAY_F_MDDROP are cleared.
+ */
+static void
+overlay_m_stop(void *arg)
+{
+	overlay_dev_t *odd = arg;
+
+	/*
+	 * The MAC perimeter is held by our caller, which is what serializes
+	 * us against the metadata operations; no extra locking is needed for
+	 * that.
+	 */
+	mutex_enter(&odd->odd_lock);
+	VERIFY((odd->odd_flags & OVERLAY_F_IN_MUX) != 0);
+	VERIFY((odd->odd_flags & OVERLAY_F_MDDROP) == 0);
+	odd->odd_flags |= OVERLAY_F_MDDROP;
+	overlay_io_wait(odd, OVERLAY_F_IOMASK);
+	mutex_exit(&odd->odd_lock);
+
+	overlay_mux_remove_dev(odd->odd_mux, odd);
+	overlay_mux_close(odd->odd_mux);
+	odd->odd_mux = NULL;
+
+	mutex_enter(&odd->odd_lock);
+	odd->odd_flags &= ~(OVERLAY_F_IN_MUX | OVERLAY_F_MDDROP);
+	VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
+	mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * GLDv3 promiscuous-mode entry point. An overlay device has no notion of
+ * promiscuous mode, so the request is accepted and nothing is done; this
+ * always returns 0. For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_promisc(void *arg, boolean_t on)
+{
+ return (0);
+}
+
+/*
+ * GLDv3 multicast filter entry point. No multicast addresses are programmed
+ * into an overlay device, so both adds and removes are accepted and nothing
+ * is done; this always returns 0. For more info on this, see the big theory
+ * statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
+{
+ return (0);
+}
+
+/*
+ * GLDv3 unicast address entry point. No unicast address is programmed into an
+ * overlay device, so the request is accepted and nothing is done; this always
+ * returns 0. For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_unicast(void *arg, const uint8_t *macaddr)
+{
+ return (0);
+}
+
+/*
+ * GLDv3 transmit entry point. 'arg' is the overlay_dev_t and 'mp_chain' is a
+ * b_next-linked chain of messages to send.
+ *
+ * Each message is unlinked from the chain, looked up in the target cache to
+ * find its destination, encapsulated by the plugin and handed to the mux for
+ * transmission. The return value is whatever part of the chain had not yet
+ * been unlinked when processing stopped: NULL when every message was
+ * consumed, or the remainder when an encapsulation or send error ended the
+ * loop early.
+ */
+mblk_t *
+overlay_m_tx(void *arg, mblk_t *mp_chain)
+{
+ overlay_dev_t *odd = arg;
+ mblk_t *mp, *ep;
+ int ret;
+ ovep_encap_info_t einfo;
+ struct msghdr hdr;
+
+ /*
+ * If a metadata operation asked us to drop traffic, or we are not in a
+ * mux at all, free the entire chain and report it as consumed.
+ */
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+ !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ mutex_exit(&odd->odd_lock);
+ freemsgchain(mp_chain);
+ return (NULL);
+ }
+ /* Account for this transmit so overlay_io_wait() callers wait for us. */
+ overlay_io_start(odd, OVERLAY_F_IN_TX);
+ mutex_exit(&odd->odd_lock);
+
+ bzero(&hdr, sizeof (struct msghdr));
+
+ bzero(&einfo, sizeof (ovep_encap_info_t));
+ einfo.ovdi_id = odd->odd_vid;
+ mp = mp_chain;
+ while (mp != NULL) {
+ socklen_t slen;
+ struct sockaddr_storage storage;
+
+ /* Detach mp; mp_chain now names the rest of the chain. */
+ mp_chain = mp->b_next;
+ mp->b_next = NULL;
+ ep = NULL;
+
+ /*
+ * Only OVERLAY_TARGET_DROP is freed here. For any other
+ * non-OK result we leave the message alone and move on --
+ * presumably the target code has kept it (e.g. queued it for a
+ * varpd lookup); NOTE(review): confirm against
+ * overlay_target_lookup().
+ */
+ ret = overlay_target_lookup(odd, mp,
+ (struct sockaddr *)&storage, &slen);
+ if (ret != OVERLAY_TARGET_OK) {
+ if (ret == OVERLAY_TARGET_DROP)
+ freemsg(mp);
+ mp = mp_chain;
+ continue;
+ }
+
+ hdr.msg_name = &storage;
+ hdr.msg_namelen = slen;
+
+ ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
+ &einfo, &ep);
+ if (ret != 0 || ep == NULL) {
+ freemsg(mp);
+ goto out;
+ }
+
+ /* The plugin either prepended a header block or reused mp. */
+ ASSERT(ep->b_cont == mp || ep == mp);
+ ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
+ if (ret != 0)
+ goto out;
+
+ mp = mp_chain;
+ }
+
+out:
+ mutex_enter(&odd->odd_lock);
+ overlay_io_done(odd, OVERLAY_F_IN_TX);
+ mutex_exit(&odd->odd_lock);
+ return (mp_chain);
+}
+
+/*
+ * GLDv3 ioctl entry point. No ioctls are supported through this path; every
+ * request is negatively acknowledged with ENOTSUP.
+ */
+/* ARGSUSED */
+static void
+overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
+{
+ miocnak(q, mp, 0, ENOTSUP);
+}
+
+/*
+ * GLDv3 capability entry point. The only capability we claim is
+ * MAC_CAPAB_OVERLAY, which tells MAC that this is an overlay device; all
+ * other capabilities are reported as unsupported. 'cap_data' is never
+ * written.
+ */
+/* ARGSUSED */
+static boolean_t
+overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
+{
+	return (cap == MAC_CAPAB_OVERLAY ? B_TRUE : B_FALSE);
+}
+
+/*
+ * GLDv3 set-property entry point. Only MAC_PROP_MTU is supported; any other
+ * property fails with ENOTSUP. The new MTU is read as a uint32_t from
+ * 'pr_val' and must lie within [OVERLAY_MTU_MIN, OVERLAY_MTU_MAX], otherwise
+ * EINVAL is returned. The value is recorded in the device and pushed to MAC
+ * as the new max SDU; if MAC rejects it, the previous MTU is restored and
+ * MAC's error is returned.
+ */
+/* ARGSUSED */
+static int
+overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    uint_t pr_valsize, const void *pr_val)
+{
+	overlay_dev_t *odd = arg;
+	uint32_t newmtu, prevmtu;
+	int ret;
+
+	if (pr_num != MAC_PROP_MTU)
+		return (ENOTSUP);
+
+	bcopy(pr_val, &newmtu, sizeof (newmtu));
+	if (newmtu < OVERLAY_MTU_MIN || newmtu > OVERLAY_MTU_MAX)
+		return (EINVAL);
+
+	mutex_enter(&odd->odd_lock);
+	prevmtu = odd->odd_mtu;
+	/* Record the new value first; roll it back if MAC refuses. */
+	odd->odd_mtu = newmtu;
+	if ((ret = mac_maxsdu_update(odd->odd_mh, newmtu)) != 0)
+		odd->odd_mtu = prevmtu;
+	mutex_exit(&odd->odd_lock);
+
+	return (ret);
+}
+
+/*
+ * GLDv3 get-property entry point. Nothing is served from here: every request
+ * fails with ENOTSUP and 'pr_val' is never written. See the link property
+ * discussion in the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ uint_t pr_valsize, void *pr_val)
+{
+ return (ENOTSUP);
+}
+
+/*
+ * GLDv3 property-info entry point. For MAC_PROP_MTU we report the default MTU
+ * and the permitted range; nothing is reported for any other property.
+ */
+/* ARGSUSED */
+static void
+overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+    mac_prop_info_handle_t prh)
+{
+	if (pr_num == MAC_PROP_MTU) {
+		mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
+		mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN,
+		    OVERLAY_MTU_MAX);
+	}
+}
+
+/*
+ * The GLDv3 entry points handed to MAC for every overlay device (see
+ * overlay_i_create()). mc_callbacks flags the optional entry points that are
+ * filled in: ioctl, getcapab, setprop, getprop and propinfo.
+ */
+static mac_callbacks_t overlay_m_callbacks = {
+ .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
+ MC_PROPINFO),
+ .mc_getstat = overlay_m_stat,
+ .mc_start = overlay_m_start,
+ .mc_stop = overlay_m_stop,
+ .mc_setpromisc = overlay_m_promisc,
+ .mc_multicst = overlay_m_multicast,
+ .mc_unicst = overlay_m_unicast,
+ .mc_tx = overlay_m_tx,
+ .mc_ioctl = overlay_m_ioctl,
+ .mc_getcapab = overlay_m_getcapab,
+ .mc_getprop = overlay_m_getprop,
+ .mc_setprop = overlay_m_setprop,
+ .mc_propinfo = overlay_m_propinfo
+};
+
+/*
+ * Decide whether 'name', held in a buffer of 'buflen' bytes, is acceptable as
+ * a plugin name. To be valid it must be non-empty, NUL-terminated within the
+ * buffer, free of '/' characters and pass u8_validate(). Returns B_TRUE when
+ * all of that holds and B_FALSE otherwise.
+ */
+static boolean_t
+overlay_valid_name(const char *name, size_t buflen)
+{
+	size_t len = 0;
+	int u8err;
+
+	/* Bounded scan for the terminator; never reads past buflen. */
+	while (len < buflen && name[len] != '\0')
+		len++;
+
+	if (len == 0 || len == buflen)
+		return (B_FALSE);
+
+	if (strchr(name, '/') != NULL)
+		return (B_FALSE);
+
+	return (u8_validate((char *)name, len, NULL, U8_VALIDATE_ENTIRE,
+	    &u8err) < 0 ? B_FALSE : B_TRUE);
+}
+
+/*
+ * Common unwind for overlay_i_create() failures that happen after the plugin
+ * has been looked up: tear down the plugin's per-device state, drop our hold
+ * on the plugin and free the device.
+ */
+static void
+overlay_i_create_fail(overlay_dev_t *odd)
+{
+	odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+	overlay_plugin_rele(odd->odd_plugin);
+	kmem_free(odd, sizeof (overlay_dev_t));
+}
+
+/*
+ * ioctl handler that creates a new overlay device. 'karg' is an
+ * overlay_ioc_create_t naming the datalink id, the encapsulation plugin and
+ * the virtual network id.
+ *
+ * Returns 0 on success. Errors:
+ *	EINVAL	the plugin name is malformed, the plugin failed to initialize,
+ *		the virtual network id does not fit in the plugin's id size,
+ *		or mac_alloc() failed
+ *	ENOENT	no such encapsulation plugin
+ *	EEXIST	a device with this datalink id, or with this (plugin, virtual
+ *		network id) pair, already exists
+ *	other	whatever mac_register() or dls_devnet_create() returned
+ */
+/* ARGSUSED */
+static int
+overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	int err;
+	uint64_t maxid;
+	overlay_dev_t *odd, *o;
+	mac_register_t *mac;
+	overlay_ioc_create_t *oicp = karg;
+
+	if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
+		return (EINVAL);
+
+	odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
+	odd->odd_linkid = oicp->oic_linkid;
+	odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
+	if (odd->odd_plugin == NULL) {
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (ENOENT);
+	}
+	err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
+	    &odd->odd_pvoid);
+	if (err != 0) {
+		/*
+		 * NOTE(review): ovpo_fini() is called even though ovpo_init()
+		 * failed, with whatever init left in odd_pvoid (NULL from the
+		 * zalloc if it was never written). Confirm that every plugin
+		 * tolerates this.
+		 */
+		overlay_i_create_fail(odd);
+		return (EINVAL);
+	}
+
+	/*
+	 * Make sure that our virtual network id is valid for the given plugin
+	 * that we're working with.
+	 */
+	ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+	maxid = UINT64_MAX;
+	if (odd->odd_plugin->ovp_id_size != 8)
+		maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
+	if (oicp->oic_vnetid > maxid) {
+		overlay_i_create_fail(odd);
+		return (EINVAL);
+	}
+	odd->odd_vid = oicp->oic_vnetid;
+
+	mac = mac_alloc(MAC_VERSION);
+	if (mac == NULL) {
+		/*
+		 * overlay_dev_lock has not been taken yet, so there is
+		 * nothing to drop here.
+		 */
+		overlay_i_create_fail(odd);
+		return (EINVAL);
+	}
+
+	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+	mac->m_driver = odd;
+	mac->m_dip = overlay_dip;
+	mac->m_dst_addr = NULL;
+	mac->m_callbacks = &overlay_m_callbacks;
+	mac->m_pdata = NULL;
+	mac->m_pdata_size = 0;
+
+	mac->m_priv_props = NULL;
+
+	/* Let mac handle this itself. */
+	mac->m_instance = (uint_t)-1;
+
+	/*
+	 * There is no real source address that should be used here, but saying
+	 * that we're not ethernet is going to cause its own problems. At the
+	 * end of the day, this is fine.
+	 */
+	mac->m_src_addr = overlay_macaddr;
+
+	/*
+	 * Start with the default MTU as the max SDU. If the MTU is changed, the
+	 * SDU will be changed to reflect that.
+	 */
+	mac->m_min_sdu = 1;
+	mac->m_max_sdu = OVERLAY_MTU_DEF;
+	mac->m_multicast_sdu = 0;
+
+	/*
+	 * The underlying device doesn't matter, instead this comes from the
+	 * encapsulation protocol and whether or not they allow VLAN tags.
+	 */
+	if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
+		mac->m_margin = VLAN_TAGSZ;
+	} else {
+		mac->m_margin = 0;
+	}
+
+	/*
+	 * Today, we have no MAC virtualization, it may make sense in the future
+	 * to go ahead and emulate some subset of this, but it doesn't today.
+	 */
+	mac->m_v12n = MAC_VIRT_NONE;
+
+	/*
+	 * From here on overlay_dev_lock is held until the device is either on
+	 * the list or has been torn down, so that the uniqueness check and
+	 * the insertion are atomic with respect to other creates.
+	 */
+	mutex_enter(&overlay_dev_lock);
+	for (o = list_head(&overlay_dev_list); o != NULL;
+	    o = list_next(&overlay_dev_list, o)) {
+		if (o->odd_linkid == oicp->oic_linkid ||
+		    (o->odd_vid == oicp->oic_vnetid &&
+		    o->odd_plugin == odd->odd_plugin)) {
+			mutex_exit(&overlay_dev_lock);
+			/* The registration template is ours to free. */
+			mac_free(mac);
+			overlay_i_create_fail(odd);
+			return (EEXIST);
+		}
+	}
+
+	err = mac_register(mac, &odd->odd_mh);
+	mac_free(mac);
+	if (err != 0) {
+		mutex_exit(&overlay_dev_lock);
+		overlay_i_create_fail(odd);
+		return (err);
+	}
+
+	err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+	    crgetzoneid(cred));
+	if (err != 0) {
+		mutex_exit(&overlay_dev_lock);
+		(void) mac_unregister(odd->odd_mh);
+		overlay_i_create_fail(odd);
+		return (err);
+	}
+
+	mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
+	odd->odd_ref = 0;
+	odd->odd_flags = 0;
+	list_insert_tail(&overlay_dev_list, odd);
+	mutex_exit(&overlay_dev_lock);
+
+	return (0);
+}
+
+/*
+ * ioctl handler that activates an overlay device. 'karg' is an
+ * overlay_ioc_activate_t naming the datalink id.
+ *
+ * Every plugin property marked OVERLAY_PROP_PERM_REQ must have a non-empty
+ * value and the device must already have OVERLAY_F_VARPD set. On success
+ * the device is flagged OVERLAY_F_ACTIVATED and its link state is reported as
+ * up.
+ *
+ * Returns 0 on success. Errors:
+ *	ENOENT	no such device
+ *	EEXIST	the device is already activated
+ *	EINVAL	a required plugin property has no value
+ *	ENXIO	OVERLAY_F_VARPD is not set on the device
+ *	other	whatever the plugin's propinfo or getprop entry point returned
+ */
+/* ARGSUSED */
+static int
+overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	int i, ret;
+	overlay_dev_t *odd;
+	mac_perim_handle_t mph;
+	overlay_ioc_activate_t *oiap = karg;
+	overlay_ioc_propinfo_t *infop;
+	overlay_ioc_prop_t *oip;
+	overlay_prop_handle_t phdl;
+
+	odd = overlay_hold_by_dlid(oiap->oia_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	/* Scratch buffers for querying the plugin's properties. */
+	infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
+	oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
+	phdl = (overlay_prop_handle_t)infop;
+
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_ACTIVATED) != 0) {
+		mutex_exit(&odd->odd_lock);
+		ret = EEXIST;
+		goto out;
+	}
+	mutex_exit(&odd->odd_lock);
+
+	for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
+		const char *pname = odd->odd_plugin->ovp_props[i];
+
+		bzero(infop, sizeof (overlay_ioc_propinfo_t));
+		overlay_prop_init(phdl);
+		ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
+		if (ret != 0)
+			goto out;
+
+		/* Only required properties need to have been set. */
+		if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
+			continue;
+
+		bzero(oip, sizeof (overlay_ioc_prop_t));
+		oip->oip_size = sizeof (oip->oip_value);
+		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+		    pname, oip->oip_value, &oip->oip_size);
+		if (ret != 0)
+			goto out;
+
+		if (oip->oip_size == 0) {
+			ret = EINVAL;
+			goto out;
+		}
+	}
+
+	mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
+		mutex_exit(&odd->odd_lock);
+		ret = ENXIO;
+		goto out;
+	}
+
+	ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
+	odd->odd_flags |= OVERLAY_F_ACTIVATED;
+
+	/*
+	 * We are now activated, so tell the rest of the system that the link
+	 * is up. Lookups may not be possible yet, but being 'up' does not
+	 * depend on that.
+	 */
+	mac_link_update(odd->odd_mh, LINK_STATE_UP);
+	mutex_exit(&odd->odd_lock);
+	ret = 0;
+
+out:
+	mac_perim_exit(mph);
+	overlay_hold_rele(odd);
+	kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+	kmem_free(oip, sizeof (overlay_ioc_prop_t));
+
+	return (ret);
+}
+
+/*
+ * ioctl handler that destroys an overlay device. 'karg' is an
+ * overlay_ioc_delete_t naming the datalink id.
+ *
+ * Returns 0 on success. Errors:
+ *	ENOENT	no such device
+ *	EBUSY	someone else holds the device, or it is still in a mux
+ *	other	whatever dls_devnet_destroy() or mac_disable() returned
+ */
+/* ARGSUSED */
+static int
+overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	overlay_ioc_delete_t *oidp = karg;
+	overlay_dev_t *odd;
+	datalink_id_t tid;
+	int ret;
+
+	odd = overlay_hold_by_dlid(oidp->oid_linkid);
+	if (odd == NULL) {
+		return (ENOENT);
+	}
+
+	mutex_enter(&odd->odd_lock);
+	/* If we're not the only hold, or we're still in a mux, we're busy */
+	if (odd->odd_ref != 1 || (odd->odd_flags & OVERLAY_F_IN_MUX) != 0) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (EBUSY);
+	}
+	/*
+	 * Drop odd_lock before going any further: the error paths below call
+	 * overlay_hold_rele(), which takes odd_lock itself, and the success
+	 * path destroys the mutex, which must not be held at that point.
+	 */
+	mutex_exit(&odd->odd_lock);
+
+	/*
+	 * To remove this, we need to first remove it from dls and then disable
+	 * it in mac. The act of disabling it in mac will check if there are
+	 * devices on top of this, e.g. vnics. If there are, then that will
+	 * fail and we'll have to go through and recreate the dls entry. Only
+	 * after mac_disable has succeeded do we go through and actually free
+	 * everything.
+	 *
+	 * NOTE(review): nothing in this function calls mac_unregister() after
+	 * mac_disable(); confirm that the MAC handle is released elsewhere.
+	 */
+	ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
+	if (ret != 0) {
+		overlay_hold_rele(odd);
+		return (ret);
+	}
+
+	ASSERT(oidp->oid_linkid == tid);
+	ret = mac_disable(odd->odd_mh);
+	if (ret != 0) {
+		(void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+		    crgetzoneid(cred));
+		overlay_hold_rele(odd);
+		return (ret);
+	}
+
+	overlay_target_quiesce(odd->odd_target);
+
+	mutex_enter(&overlay_dev_lock);
+	list_remove(&overlay_dev_list, odd);
+	mutex_exit(&overlay_dev_lock);
+
+	/*
+	 * The device is off the global list, so no new holds can be taken.
+	 * Our own hold is not released; it goes away with the device.
+	 */
+	cv_destroy(&odd->odd_iowait);
+	mutex_destroy(&odd->odd_lock);
+	overlay_target_free(odd);
+	odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+	overlay_plugin_rele(odd->odd_plugin);
+	kmem_free(odd, sizeof (overlay_dev_t));
+
+	return (0);
+}
+
+/*
+ * ioctl handler that reports how many properties a device has: the generic
+ * device properties plus those of its encapsulation plugin. 'karg' is an
+ * overlay_ioc_nprops_t; the count is stored in oipn_nprops. Returns 0 on
+ * success or ENOENT if the device does not exist.
+ */
+/* ARGSUSED */
+static int
+overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_ioc_nprops_t *on = karg;
+	overlay_dev_t *odd = overlay_hold_by_dlid(on->oipn_linkid);
+
+	if (odd == NULL)
+		return (ENOENT);
+
+	on->oipn_nprops = OVERLAY_DEV_NPROPS + odd->odd_plugin->ovp_nprops;
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+/*
+ * overlay_plugin_walk() callback used when describing the "encap" property:
+ * adds the name of plugin 'opp' to the string range of the property handle
+ * passed as 'arg'. Always returns 0.
+ */
+static int
+overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
+{
+	overlay_prop_set_range_str((overlay_prop_handle_t)arg, opp->ovp_name);
+	return (0);
+}
+
+/*
+ * Translate a property name into a property id for device 'odd'. Generic
+ * device properties are checked first and map to their index in
+ * overlay_dev_props[]; plugin properties map to their index in the plugin's
+ * list plus OVERLAY_DEV_NPROPS. On success the id is stored in '*id' and 0 is
+ * returned; if the name is unknown, ENOENT is returned and '*id' is left
+ * untouched.
+ */
+static int
+overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
+{
+	int idx;
+
+	for (idx = 0; idx < OVERLAY_DEV_NPROPS; idx++) {
+		if (strcmp(overlay_dev_props[idx], name) != 0)
+			continue;
+		*id = idx;
+		return (0);
+	}
+
+	for (idx = 0; idx < odd->odd_plugin->ovp_nprops; idx++) {
+		if (strcmp(odd->odd_plugin->ovp_props[idx], name) != 0)
+			continue;
+		*id = idx + OVERLAY_DEV_NPROPS;
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Fill in the property handle 'phdl' with the description of the MTU
+ * property. The permission, default and range all come from MAC via
+ * mac_prop_info(); if that call fails the handle is left untouched. The
+ * caller must hold the MAC perimeter of 'odd'.
+ */
+static void
+overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
+{
+	uint32_t def;
+	mac_propval_range_t range;
+	uint_t perm;
+
+	ASSERT(MAC_PERIM_HELD(odd->odd_mh));
+
+	bzero(&range, sizeof (mac_propval_range_t));
+	range.mpr_count = 1;
+	if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
+	    sizeof (def), &range, &perm) != 0)
+		return;
+
+	/* Map MAC's permission onto ours; unknown values set nothing. */
+	switch (perm) {
+	case MAC_PROP_PERM_READ:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+		break;
+	case MAC_PROP_PERM_WRITE:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
+		break;
+	case MAC_PROP_PERM_RW:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+		break;
+	default:
+		break;
+	}
+
+	overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+	overlay_prop_set_default(phdl, &def, sizeof (def));
+	overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
+	    range.mpr_range_uint32[0].mpur_max);
+}
+
+/*
+ * ioctl handler that describes a single property of a device. 'karg' is an
+ * overlay_ioc_propinfo_t which doubles as the property handle that gets
+ * filled in.
+ *
+ * The property is selected either by name (oipi_id == -1, name in oipi_name)
+ * or by id. Ids below OVERLAY_DEV_NPROPS are the generic device properties
+ * and are described here; anything else is passed on to the encapsulation
+ * plugin.
+ *
+ * Returns 0 on success. Errors:
+ *	ENOENT	no such device, or no property with the given name
+ *	EINVAL	the id is out of range
+ *	other	whatever the plugin's propinfo entry point returned
+ */
+/* ARGSUSED */
+static int
+overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_dev_t *odd;
+	int ret;
+	mac_perim_handle_t mph;
+	uint_t propid = UINT_MAX;
+	overlay_ioc_propinfo_t *oip = karg;
+	overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
+
+	odd = overlay_hold_by_dlid(oip->oipi_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	overlay_prop_init(phdl);
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+
+	/*
+	 * An id of -1 means that the caller named the property in oipi_name
+	 * and wants us to fill in the id. Any other id was made up by us
+	 * during property discovery and has to be turned back into a name for
+	 * the plugin's benefit.
+	 */
+	if (oip->oipi_id == -1) {
+		/*
+		 * Work out whether the name is one of the generic properties
+		 * or one of the plugin's by checking the known names.
+		 */
+		oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+		ret = overlay_i_name_to_propid(odd, oip->oipi_name, &propid);
+		if (ret != 0)
+			goto out;
+
+		oip->oipi_id = propid;
+		if (propid >= OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
+			    oip->oipi_name, phdl);
+			goto out;
+		}
+	} else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
+		uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
+
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			ret = EINVAL;
+			goto out;
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
+		    odd->odd_plugin->ovp_props[id], phdl);
+		goto out;
+	} else if (oip->oipi_id < -1) {
+		ret = EINVAL;
+		goto out;
+	} else {
+		ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oipi_id >= 0);
+		propid = oip->oipi_id;
+		(void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
+		    sizeof (oip->oipi_name));
+	}
+
+	/* Only the generic device properties make it this far. */
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		overlay_i_propinfo_mtu(odd, phdl);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+		overlay_prop_set_nodefault(phdl);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+		overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
+		overlay_prop_set_nodefault(phdl);
+		overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
+		break;
+	case OVERLAY_DEV_P_VARPDID:
+		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+		overlay_prop_set_nodefault(phdl);
+		break;
+	default:
+		ret = ENOENT;
+		break;
+	}
+
+out:
+	overlay_hold_rele(odd);
+	mac_perim_exit(mph);
+	return (ret);
+}
+
+/*
+ * ioctl handler that retrieves the value of a single property. 'karg' is an
+ * overlay_ioc_prop_t; the value is stored in oip_value and its length in
+ * oip_size.
+ *
+ * The property is selected either by name (oip_id == -1, name in oip_name)
+ * or by id. Generic device properties are answered here; a name that is not
+ * a generic property, or an id at or above OVERLAY_DEV_NPROPS, is passed on
+ * to the encapsulation plugin.
+ *
+ * Returns 0 on success. Errors:
+ *	ENOENT	no such device
+ *	EINVAL	the id is out of range
+ *	other	whatever the plugin's getprop entry point returned
+ */
+/* ARGSUSED */
+static int
+overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	int ret;
+	overlay_dev_t *odd;
+	mac_perim_handle_t mph;
+	overlay_ioc_prop_t *oip = karg;
+	uint_t propid, mtu;
+
+	odd = overlay_hold_by_dlid(oip->oip_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	oip->oip_size = OVERLAY_PROP_SIZEMAX;
+	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+	if (oip->oip_id == -1) {
+		int i;
+
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		/*
+		 * Not one of the generic properties, so let the plugin decide
+		 * whether it knows the name. This check has to come after the
+		 * loop: inside it, i can never reach OVERLAY_DEV_NPROPS.
+		 */
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, &oip->oip_size);
+			goto out;
+		}
+
+		propid = i;
+	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		/* Valid plugin indices are 0 .. ovp_nprops - 1. */
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			ret = EINVAL;
+			goto out;
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+		    odd->odd_plugin->ovp_props[id], oip->oip_value,
+		    &oip->oip_size);
+		goto out;
+	} else if (oip->oip_id < -1) {
+		ret = EINVAL;
+		goto out;
+	} else {
+		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oip_id >= 0);
+		propid = oip->oip_id;
+	}
+
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		/*
+		 * The MTU is always set and retrieved through MAC, to allow for
+		 * MAC to do whatever it wants, as really that property belongs
+		 * to MAC. This is important for things where vnics have hold on
+		 * the MTU.
+		 */
+		mac_sdu_get(odd->odd_mh, NULL, &mtu);
+		bcopy(&mtu, oip->oip_value, sizeof (uint_t));
+		oip->oip_size = sizeof (uint_t);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		/*
+		 * While it's read-only while inside of a mux, we're not in a
+		 * context that can guarantee that. Therefore we always grab the
+		 * overlay_dev_t's odd_lock.
+		 */
+		mutex_enter(&odd->odd_lock);
+		bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
+		mutex_exit(&odd->odd_lock);
+		oip->oip_size = sizeof (uint64_t);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+		oip->oip_size = strlcpy((char *)oip->oip_value,
+		    odd->odd_plugin->ovp_name, oip->oip_size);
+		break;
+	case OVERLAY_DEV_P_VARPDID:
+		/* A size of zero tells the caller that no id is set yet. */
+		mutex_enter(&odd->odd_lock);
+		if (odd->odd_flags & OVERLAY_F_VARPD) {
+			const uint64_t val = odd->odd_target->ott_id;
+			bcopy(&val, oip->oip_value, sizeof (uint64_t));
+			oip->oip_size = sizeof (uint64_t);
+		} else {
+			oip->oip_size = 0;
+		}
+		mutex_exit(&odd->odd_lock);
+		break;
+	default:
+		ret = ENOENT;
+		break;
+	}
+
+out:
+	overlay_hold_rele(odd);
+	mac_perim_exit(mph);
+	return (ret);
+}
+
+/*
+ * Change the virtual network id (odd_vid) of an overlay device.
+ *
+ * If the device is not currently in a mux, the id is simply updated under
+ * odd_lock. Otherwise the device is keyed by odd_vid in the mux's AVL tree,
+ * so we must: mark the device as dropping traffic, wait for in-flight I/O to
+ * drain, remove it from the mux, update the id, and re-insert it.
+ */
+static void
+overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
+{
+	mutex_enter(&odd->odd_lock);
+
+	/* Simple case, not active */
+	if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+		odd->odd_vid = vnetid;
+		mutex_exit(&odd->odd_lock);
+		return;
+	}
+
+	/*
+	 * In the hard case, we need to set the drop flag, quiesce I/O and then
+	 * we can go ahead and do everything.
+	 */
+	odd->odd_flags |= OVERLAY_F_MDDROP;
+	overlay_io_wait(odd, OVERLAY_F_IOMASK);
+	mutex_exit(&odd->odd_lock);
+
+	/* The AVL key must not change while the node is in the tree. */
+	overlay_mux_remove_dev(odd->odd_mux, odd);
+	mutex_enter(&odd->odd_lock);
+	odd->odd_vid = vnetid;
+	mutex_exit(&odd->odd_lock);
+	overlay_mux_add_dev(odd->odd_mux, odd);
+
+	/*
+	 * The device is back in the mux under its new id, so it remains
+	 * OVERLAY_F_IN_MUX. Clear the drop flag we set above so that traffic
+	 * flows again; clearing OVERLAY_F_IN_MUX here instead would leave the
+	 * device permanently dropping packets.
+	 */
+	mutex_enter(&odd->odd_lock);
+	ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+	odd->odd_flags &= ~OVERLAY_F_MDDROP;
+	mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * OVERLAY_IOC_SETPROP handler: set a property on an overlay device.
+ *
+ * karg points at an overlay_ioc_prop_t. The property is selected either by
+ * name (oip_id == -1) or by id. Ids below OVERLAY_DEV_NPROPS are handled by
+ * the driver itself; ids at or above it, and names the driver does not
+ * recognise, are forwarded to the encapsulation plugin's ovpo_setprop entry
+ * point.
+ *
+ * Returns 0 on success, EINVAL for a bad size or id, ENOENT if the link or
+ * property does not exist, ENOTSUP if the device has already been activated,
+ * EPERM for read-only properties, or whatever the plugin / MAC returns.
+ */
+/* ARGSUSED */
+static int
+overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	int ret;
+	overlay_dev_t *odd;
+	overlay_ioc_prop_t *oip = karg;
+	uint_t propid = UINT_MAX;
+	mac_perim_handle_t mph;
+	uint64_t maxid, *vidp;
+
+	if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
+		return (EINVAL);
+
+	odd = overlay_hold_by_dlid(oip->oip_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	/* The name comes from userland; force termination before strcmp(). */
+	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+		mutex_exit(&odd->odd_lock);
+		mac_perim_exit(mph);
+		/* Drop the hold taken above; every exit path must do so. */
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_exit(&odd->odd_lock);
+	if (oip->oip_id == -1) {
+		int i;
+
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		/*
+		 * Not one of the driver's own properties: hand the name to
+		 * the plugin. This check has to live after the loop; inside
+		 * the loop body i can never equal OVERLAY_DEV_NPROPS.
+		 */
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, oip->oip_size);
+			mac_perim_exit(mph);
+			overlay_hold_rele(odd);
+			return (ret);
+		}
+
+		propid = i;
+	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		/*
+		 * ovp_nprops counts the non-NULL entries of ovp_props, so the
+		 * valid indices are [0, ovp_nprops). Index ovp_nprops is the
+		 * NULL terminator and must be rejected.
+		 */
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			mac_perim_exit(mph);
+			overlay_hold_rele(odd);
+			return (EINVAL);
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
+		    odd->odd_plugin->ovp_props[id], oip->oip_value,
+		    oip->oip_size);
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (ret);
+	} else if (oip->oip_id < -1) {
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	} else {
+		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oip_id >= 0);
+		propid = oip->oip_id;
+	}
+
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		/* The MTU belongs to MAC; let it validate and apply it. */
+		ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
+		    oip->oip_value, oip->oip_size);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		if (oip->oip_size != sizeof (uint64_t)) {
+			ret = EINVAL;
+			break;
+		}
+		vidp = (uint64_t *)oip->oip_value;
+		ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+		/* Shifting a 64-bit value by 64 is undefined; special-case 8 */
+		maxid = UINT64_MAX;
+		if (odd->odd_plugin->ovp_id_size != 8)
+			maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
+			    1ULL;
+		/*
+		 * NOTE(review): '>=' also rejects the largest id that fits in
+		 * ovp_id_size bytes. Confirm whether that value is meant to be
+		 * reserved before relaxing this to '>'.
+		 */
+		if (*vidp >= maxid) {
+			ret = EINVAL;
+			break;
+		}
+		overlay_setprop_vnetid(odd, *vidp);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+	case OVERLAY_DEV_P_VARPDID:
+		/* Read-only properties. */
+		ret = EPERM;
+		break;
+	default:
+		ret = ENOENT;
+	}
+
+	mac_perim_exit(mph);
+	overlay_hold_rele(odd);
+	return (ret);
+}
+
+/*
+ * OVERLAY_IOC_STATUS handler: report whether the overlay device named by
+ * ois_linkid is degraded and, if so, the associated FMA message.
+ *
+ * Returns ENOENT if no such device exists, otherwise 0 with ois_status and
+ * ois_message filled in.
+ */
+/* ARGSUSED */
+static int
+overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_ioc_status_t *statp = karg;
+	overlay_dev_t *dev = overlay_hold_by_dlid(statp->ois_linkid);
+
+	if (dev == NULL)
+		return (ENOENT);
+
+	/* odd_flags and odd_fmamsg are both read under odd_lock. */
+	mutex_enter(&dev->odd_lock);
+	if ((dev->odd_flags & OVERLAY_F_DEGRADED) == 0) {
+		statp->ois_status = OVERLAY_I_OK;
+		statp->ois_message[0] = '\0';
+	} else {
+		statp->ois_status = OVERLAY_I_DEGRADED;
+		if (dev->odd_fmamsg == NULL) {
+			statp->ois_message[0] = '\0';
+		} else {
+			(void) strlcpy(statp->ois_message, dev->odd_fmamsg,
+			    OVERLAY_STATUS_BUFLEN);
+		}
+	}
+	mutex_exit(&dev->odd_lock);
+
+	overlay_hold_rele(dev);
+	return (0);
+}
+
+/*
+ * Table of the overlay driver's ioctls, registered with dld through
+ * dld_ioc_register() in overlay_attach(). Each entry names the ioctl command,
+ * the copyin/copyout flags, the size of the argument structure, the handler
+ * and a security policy callback. Every command is gated by
+ * secpolicy_dl_config except OVERLAY_IOC_STATUS, which supplies NULL.
+ */
+static dld_ioc_info_t overlay_ioc_list[] = {
+ { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
+ overlay_i_create, secpolicy_dl_config },
+ { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
+ overlay_i_activate, secpolicy_dl_config },
+ { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
+ overlay_i_delete, secpolicy_dl_config },
+ { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_prop_t), overlay_i_getprop,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_SETPROP, DLDCOPYIN,
+ sizeof (overlay_ioc_prop_t), overlay_i_setprop,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_status_t), overlay_i_status,
+ NULL }
+};
+
+/*
+ * attach(9E) entry point. Only DDI_ATTACH of instance 0 is supported and
+ * only a single instance may be attached at a time (tracked by overlay_dip).
+ *
+ * Sets up FMA, creates the control minor node and registers the driver's
+ * ioctls with dld. Anything set up before a failure is torn down again so
+ * that a failed attach leaves no state behind.
+ */
+static int
+overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int fmcap = DDI_FM_EREPORT_CAPABLE;
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
+		return (DDI_FAILURE);
+
+	ddi_fm_init(dip, &fmcap, NULL);
+
+	if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
+	    ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) {
+		/* Undo ddi_fm_init(); detach will never run for us. */
+		ddi_fm_fini(dip);
+		return (DDI_FAILURE);
+	}
+
+	if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
+	    DLDIOCCNT(overlay_ioc_list)) != 0) {
+		ddi_remove_minor_node(dip, OVERLAY_CTL);
+		ddi_fm_fini(dip);
+		return (DDI_FAILURE);
+	}
+
+	overlay_dip = dip;
+	return (DDI_SUCCESS);
+}
+
+/*
+ * getinfo(9E) entry point. The driver has a single instance, so the
+ * dev_info pointer is always overlay_dip and the instance is always 0. Any
+ * other command fails.
+ */
+/* ARGSUSED */
+static int
+overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
+{
+	if (cmd == DDI_INFO_DEVT2DEVINFO) {
+		*resp = (void *)overlay_dip;
+		return (DDI_SUCCESS);
+	}
+
+	if (cmd == DDI_INFO_DEVT2INSTANCE) {
+		*resp = (void *)0;
+		return (DDI_SUCCESS);
+	}
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * detach(9E) entry point. Only DDI_DETACH is supported. The detach is
+ * refused while any overlay device still exists or while
+ * overlay_target_busy() reports the target subsystem as busy.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE; detach(9E) callers expect DDI return
+ * codes rather than errno values.
+ */
+static int
+overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	mutex_enter(&overlay_dev_lock);
+	if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
+		mutex_exit(&overlay_dev_lock);
+		return (DDI_FAILURE);
+	}
+	mutex_exit(&overlay_dev_lock);
+
+	/* Tear down in the reverse order of overlay_attach(). */
+	dld_ioc_unregister(OVERLAY_IOC);
+	ddi_remove_minor_node(dip, OVERLAY_CTL);
+	ddi_fm_fini(dip);
+	overlay_dip = NULL;
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Character device entry points for the overlay control node. Only open,
+ * close and ioctl are implemented, all by the overlay_target_* routines;
+ * every other entry point is nodev.
+ */
+static struct cb_ops overlay_cbops = {
+ overlay_target_open, /* cb_open */
+ overlay_target_close, /* cb_close */
+ nodev, /* cb_strategy */
+ nodev, /* cb_print */
+ nodev, /* cb_dump */
+ nodev, /* cb_read */
+ nodev, /* cb_write */
+ overlay_target_ioctl, /* cb_ioctl */
+ nodev, /* cb_devmap */
+ nodev, /* cb_mmap */
+ nodev, /* cb_segmap */
+ nochpoll, /* cb_chpoll */
+ ddi_prop_op, /* cb_prop_op */
+ NULL, /* cb_stream */
+ D_MP, /* cb_flag */
+ CB_REV, /* cb_rev */
+ nodev, /* cb_aread */
+ nodev, /* cb_awrite */
+};
+
+/*
+ * Device operations for the overlay driver: getinfo, attach and detach are
+ * the overlay_* routines in this file, the character entry points come from
+ * overlay_cbops, and quiesce is explicitly not supported.
+ */
+static struct dev_ops overlay_dev_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* devo_refcnt */
+ overlay_getinfo, /* devo_getinfo */
+ nulldev, /* devo_identify */
+ nulldev, /* devo_probe */
+ overlay_attach, /* devo_attach */
+ overlay_detach, /* devo_detach */
+ nulldev, /* devo_reset */
+ &overlay_cbops, /* devo_cb_ops */
+ NULL, /* devo_bus_ops */
+ NULL, /* devo_power */
+ ddi_quiesce_not_supported /* devo_quiesce */
+};
+
+/*
+ * Loadable module linkage: a single driver module described by
+ * overlay_modldrv, installed and removed through overlay_linkage in
+ * _init() and _fini().
+ */
+static struct modldrv overlay_modldrv = {
+ &mod_driverops,
+ "Overlay Network Driver",
+ &overlay_dev_ops
+};
+
+static struct modlinkage overlay_linkage = {
+ MODREV_1,
+ &overlay_modldrv
+};
+
+/*
+ * Module-wide initialisation: create the global device list and its lock,
+ * then initialise the mux, plugin and target subsystems. Always returns
+ * DDI_SUCCESS. overlay_fini() undoes this in the reverse order.
+ *
+ * NOTE(review): overlay_fm_init() (overlay_fm.c) is not called from here, so
+ * overlay_fm_lock is never explicitly initialised by this path -- confirm
+ * whether that is intentional.
+ */
+static int
+overlay_init(void)
+{
+ mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&overlay_dev_list, sizeof (overlay_dev_t),
+ offsetof(overlay_dev_t, odd_link));
+ overlay_mux_init();
+ overlay_plugin_init();
+ overlay_target_init();
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * Module-wide teardown: finalise the target, plugin and mux subsystems (the
+ * reverse of the order used by overlay_init()) and then destroy the global
+ * device lock and list.
+ */
+static void
+overlay_fini(void)
+{
+ overlay_target_fini();
+ overlay_plugin_fini();
+ overlay_mux_fini();
+ mutex_destroy(&overlay_dev_lock);
+ list_destroy(&overlay_dev_list);
+}
+
+/*
+ * Module load entry point: initialise the driver's global state, mark the
+ * driver as a MAC provider and install the module. If installation fails the
+ * global state is torn down again and the error is returned.
+ */
+int
+_init(void)
+{
+	int ret = overlay_init();
+
+	if (ret != DDI_SUCCESS)
+		return (ret);
+
+	mac_init_ops(NULL, "overlay");
+	ret = mod_install(&overlay_linkage);
+	if (ret == DDI_SUCCESS)
+		return (0);
+
+	overlay_fini();
+	return (ret);
+}
+
+/*
+ * Module information entry point: report on this module via mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&overlay_linkage, modinfop));
+}
+
+/*
+ * Module unload entry point. Global state is only torn down once
+ * mod_remove() has succeeded; otherwise its error is returned and the module
+ * stays fully initialised.
+ */
+int
+_fini(void)
+{
+	int ret = mod_remove(&overlay_linkage);
+
+	if (ret == 0)
+		overlay_fini();
+
+	return (ret);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf
new file mode 100644
index 0000000000..4b62fafd94
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015, Joyent, Inc.
+#
+
+name="overlay" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile
new file mode 100644
index 0000000000..800d72dc2b
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.mapfile
@@ -0,0 +1,46 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {
+ global:
+ # DDI Interfaces
+ _fini;
+ _init;
+ _info;
+
+ # Encapsulation Plugin interfaces
+ overlay_plugin_alloc;
+ overlay_plugin_free;
+ overlay_plugin_register;
+ overlay_plugin_unregister;
+ local:
+ *;
+};
diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c
new file mode 100644
index 0000000000..0701d08e8b
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_fm.c
@@ -0,0 +1,82 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay device FMA operations.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/ddifm.h>
+#include <sys/overlay_impl.h>
+
+kmutex_t overlay_fm_lock;
+uint_t overlay_fm_count;
+
+/*
+ * Initialise the global FMA state: zero the count of degraded devices and
+ * initialise the lock that protects it.
+ */
+void
+overlay_fm_init(void)
+{
+ overlay_fm_count = 0;
+ mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL);
+}
+
+/*
+ * Tear down the global FMA state. No device may still be marked degraded at
+ * this point; that is enforced with VERIFY.
+ */
+void
+overlay_fm_fini(void)
+{
+ VERIFY(overlay_fm_count == 0);
+ mutex_destroy(&overlay_fm_lock);
+}
+
+/*
+ * Mark an overlay device as degraded and optionally record a message
+ * explaining why. The message is updated even when the device is already
+ * degraded. When the first device becomes degraded, the driver instance as a
+ * whole is reported as DDI_SERVICE_DEGRADED.
+ *
+ * Lock order: overlay_fm_lock, then odd_lock.
+ */
+void
+overlay_fm_degrade(overlay_dev_t *odd, const char *msg)
+{
+	mutex_enter(&overlay_fm_lock);
+	mutex_enter(&odd->odd_lock);
+
+	if (msg != NULL)
+		(void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN);
+
+	if ((odd->odd_flags & OVERLAY_F_DEGRADED) == 0) {
+		odd->odd_flags |= OVERLAY_F_DEGRADED;
+		if (++overlay_fm_count == 1) {
+			ddi_fm_service_impact(overlay_dip,
+			    DDI_SERVICE_DEGRADED);
+		}
+	}
+
+	mutex_exit(&odd->odd_lock);
+	mutex_exit(&overlay_fm_lock);
+}
+
+/*
+ * Clear the degraded state of an overlay device, if set, along with its
+ * message. When the last degraded device is restored, the driver instance
+ * as a whole is reported as DDI_SERVICE_RESTORED.
+ *
+ * Lock order: overlay_fm_lock, then odd_lock.
+ */
+void
+overlay_fm_restore(overlay_dev_t *odd)
+{
+	mutex_enter(&overlay_fm_lock);
+	mutex_enter(&odd->odd_lock);
+
+	if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
+		odd->odd_fmamsg[0] = '\0';
+		odd->odd_flags &= ~OVERLAY_F_DEGRADED;
+		if (--overlay_fm_count == 0) {
+			ddi_fm_service_impact(overlay_dip,
+			    DDI_SERVICE_RESTORED);
+		}
+	}
+
+	mutex_exit(&odd->odd_lock);
+	mutex_exit(&overlay_fm_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c
new file mode 100644
index 0000000000..cd612763e1
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_mux.c
@@ -0,0 +1,363 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * Overlay device ksocket multiplexer.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ksynch.h>
+#include <sys/ksocket.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/pattr.h>
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+
+#include <sys/overlay_impl.h>
+
+#include <sys/sdt.h>
+
+/*
+ * Fire a DTrace probe recording a message and the reason it is being
+ * dropped. This only fires the probe; the caller still frees the message.
+ */
+#define OVERLAY_FREEMSG(mp, reason) \
+ DTRACE_PROBE2(overlay__freemsg, mblk_t *, mp, char *, reason)
+
+static list_t overlay_mux_list;
+static kmutex_t overlay_mux_lock;
+
+/*
+ * Initialise the global list of muxes and the lock that protects it.
+ */
+void
+overlay_mux_init(void)
+{
+ list_create(&overlay_mux_list, sizeof (overlay_mux_t),
+ offsetof(overlay_mux_t, omux_lnode));
+ mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
+}
+
+/*
+ * Destroy the global mux list and its lock.
+ */
+void
+overlay_mux_fini(void)
+{
+ mutex_destroy(&overlay_mux_lock);
+ list_destroy(&overlay_mux_list);
+}
+
+/*
+ * AVL comparator for a mux's device tree: orders overlay devices by their
+ * virtual network id (odd_vid). Returns -1, 0 or 1.
+ */
+static int
+overlay_mux_comparator(const void *a, const void *b)
+{
+	const overlay_dev_t *lhs = a;
+	const overlay_dev_t *rhs = b;
+
+	if (lhs->odd_vid == rhs->odd_vid)
+		return (0);
+
+	return (lhs->odd_vid > rhs->odd_vid ? 1 : -1);
+}
+
+/*
+ * This is the central receive data path. We need to decode the packet, if we
+ * can, and then deliver it to the appropriate overlay.
+ *
+ * This is installed as the ksocket receive callback by overlay_mux_open(),
+ * with the owning overlay_mux_t as 'arg'. Each message in the chain is either
+ * freed here or handed to mac_rx(); the function always returns B_TRUE.
+ */
+/* ARGSUSED */
+static boolean_t
+overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
+ void *arg)
+{
+ mblk_t *mp, *nmp, *fmp;
+ overlay_mux_t *mux = arg;
+
+ /*
+ * We may have a received a chain of messages. Each message in the
+ * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
+ * If we aren't getting that, we should probably drop that for the
+ * moment.
+ */
+ for (mp = mpchain; mp != NULL; mp = nmp) {
+ struct T_unitdata_ind *tudi;
+ ovep_encap_info_t infop;
+ overlay_dev_t od, *odd;
+ int ret;
+
+ /* Unlink this message so it can be freed or delivered alone. */
+ nmp = mp->b_next;
+ mp->b_next = NULL;
+
+ if (DB_TYPE(mp) != M_PROTO) {
+ OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
+ freemsg(mp);
+ continue;
+ }
+
+ if (mp->b_cont == NULL) {
+ OVERLAY_FREEMSG(mp, "missing a b_cont");
+ freemsg(mp);
+ continue;
+ }
+
+ tudi = (struct T_unitdata_ind *)mp->b_rptr;
+ if (tudi->PRIM_type != T_UNITDATA_IND) {
+ OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
+ freemsg(mp);
+ continue;
+ }
+
+ /*
+ * In the future, we'll care about the source information
+ * for purposes of telling varpd for oob invalidation. But for
+ * now, just drop that block.
+ */
+ fmp = mp;
+ mp = fmp->b_cont;
+ freeb(fmp);
+
+ /*
+ * Until we have VXLAN-or-other-decap HW acceleration support
+ * (e.g. we support NICs that reach into VXLAN-encapsulated
+ * packets and check the inside-VXLAN IP packets' checksums,
+ * or do LSO with VXLAN), we should clear any HW-accelerated-
+ * performed bits.
+ */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ /*
+ * Decap and deliver.
+ */
+ bzero(&infop, sizeof (ovep_encap_info_t));
+ ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
+ if (ret != 0) {
+ OVERLAY_FREEMSG(mp, "decap failed");
+ freemsg(mp);
+ continue;
+ }
+ /*
+ * Strip the encapsulation header. The fast path is when it fits
+ * within the first block with data left over; otherwise walk the
+ * b_cont chain, freeing each block that is consumed entirely.
+ */
+ if (MBLKL(mp) > infop.ovdi_hdr_size) {
+ mp->b_rptr += infop.ovdi_hdr_size;
+ } else {
+ while (infop.ovdi_hdr_size != 0) {
+ size_t rem, blkl;
+
+ if (mp == NULL)
+ break;
+
+ blkl = MBLKL(mp);
+ rem = MIN(infop.ovdi_hdr_size, blkl);
+ infop.ovdi_hdr_size -= rem;
+ mp->b_rptr += rem;
+ if (rem == blkl) {
+ fmp = mp;
+ mp = fmp->b_cont;
+ fmp->b_cont = NULL;
+ /*
+ * NOTE(review): the probe is passed mp (the
+ * next block) rather than fmp, the block being
+ * freed -- confirm which was intended.
+ */
+ OVERLAY_FREEMSG(mp,
+ "freed a fmp block");
+ freemsg(fmp);
+ }
+ }
+ if (mp == NULL) {
+ OVERLAY_FREEMSG(mp, "freed it all...");
+ continue;
+ }
+ }
+
+
+ /*
+ * Look up the destination device by the decoded id. Only odd_vid
+ * of the on-stack search key 'od' is set; it is the only field
+ * that overlay_mux_comparator() reads.
+ */
+ od.odd_vid = infop.ovdi_id;
+ mutex_enter(&mux->omux_lock);
+ odd = avl_find(&mux->omux_devices, &od, NULL);
+ if (odd == NULL) {
+ mutex_exit(&mux->omux_lock);
+ OVERLAY_FREEMSG(mp, "no matching vid");
+ freemsg(mp);
+ continue;
+ }
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+ !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&mux->omux_lock);
+ OVERLAY_FREEMSG(mp, "dev dropped");
+ freemsg(mp);
+ continue;
+ }
+ /* Mark RX in progress before dropping the locks for mac_rx(). */
+ overlay_io_start(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&mux->omux_lock);
+
+ mac_rx(odd->odd_mh, NULL, mp);
+
+ mutex_enter(&odd->odd_lock);
+ overlay_io_done(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Register a given device with a socket backend. If no such device socket
+ * exists, create a new one.
+ *
+ * An existing mux is reused when its domain, family, protocol and bound
+ * address all match; its reference count is bumped. A matching mux that
+ * belongs to a different plugin is an error (EEXIST). Otherwise a new kernel
+ * socket is created, bound and configured by the plugin, and a new mux is
+ * added to the global list.
+ *
+ * Returns the mux on success. On failure returns NULL and, if errp is not
+ * NULL, stores an errno value in *errp. overlay_mux_lock is held for the
+ * duration of the call and is released on every return path.
+ */
+overlay_mux_t *
+overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
+    struct sockaddr *addr, socklen_t len, int *errp)
+{
+	int err;
+	overlay_mux_t *mux;
+	ksocket_t ksock;
+
+	if (errp == NULL)
+		errp = &err;
+
+	mutex_enter(&overlay_mux_lock);
+	for (mux = list_head(&overlay_mux_list); mux != NULL;
+	    mux = list_next(&overlay_mux_list, mux)) {
+		if (domain == mux->omux_domain &&
+		    family == mux->omux_family &&
+		    protocol == mux->omux_protocol &&
+		    len == mux->omux_alen &&
+		    bcmp(addr, mux->omux_addr, len) == 0) {
+
+			/* A socket is owned by a single type of plugin. */
+			if (opp != mux->omux_plugin) {
+				mutex_exit(&overlay_mux_lock);
+				*errp = EEXIST;
+				return (NULL);
+			}
+
+			mutex_enter(&mux->omux_lock);
+			mux->omux_count++;
+			mutex_exit(&mux->omux_lock);
+			mutex_exit(&overlay_mux_lock);
+			*errp = 0;
+			return (mux);
+		}
+	}
+
+	/*
+	 * Today we aren't zone-aware and only exist in the global zone. When we
+	 * allow for things to exist in the non-global zone, we'll want to use a
+	 * credential that's actually specific to the zone.
+	 */
+	*errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
+	    kcred);
+	if (*errp != 0) {
+		mutex_exit(&overlay_mux_lock);
+		return (NULL);
+	}
+
+	*errp = ksocket_bind(ksock, addr, len, kcred);
+	if (*errp != 0) {
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		return (NULL);
+	}
+
+	/*
+	 * Ask our lower layer to optionally toggle anything they need on this
+	 * socket. Because a socket is owned by a single type of plugin, we can
+	 * then ask it to perform any additional socket set up it'd like to do.
+	 */
+	if (opp->ovp_ops->ovpo_sockopt != NULL &&
+	    (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		return (NULL);
+	}
+
+	mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
+	list_link_init(&mux->omux_lnode);
+	mux->omux_ksock = ksock;
+	mux->omux_plugin = opp;
+	mux->omux_domain = domain;
+	mux->omux_family = family;
+	mux->omux_protocol = protocol;
+	mux->omux_addr = kmem_alloc(len, KM_SLEEP);
+	bcopy(addr, mux->omux_addr, len);
+	mux->omux_alen = len;
+	mux->omux_count = 1;
+	avl_create(&mux->omux_devices, overlay_mux_comparator,
+	    sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
+	mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
+
+
+	/* Once this is called, we need to expect to rx data */
+	*errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
+	if (*errp != 0) {
+		/* The mux was never published; drop the lock and unwind. */
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		mutex_destroy(&mux->omux_lock);
+		avl_destroy(&mux->omux_devices);
+		kmem_free(mux->omux_addr, len);
+		kmem_free(mux, sizeof (overlay_mux_t));
+		return (NULL);
+	}
+
+	list_insert_tail(&overlay_mux_list, mux);
+	mutex_exit(&overlay_mux_lock);
+
+	*errp = 0;
+	return (mux);
+}
+
+/*
+ * Drop one reference on a mux. When the last reference goes away the mux is
+ * removed from the global list, its socket is closed and all of its
+ * resources -- including its lock -- are destroyed and freed.
+ */
+void
+overlay_mux_close(overlay_mux_t *mux)
+{
+	mutex_enter(&overlay_mux_lock);
+	mutex_enter(&mux->omux_lock);
+	mux->omux_count--;
+	if (mux->omux_count != 0) {
+		mutex_exit(&mux->omux_lock);
+		mutex_exit(&overlay_mux_lock);
+		return;
+	}
+	/* Unpublish the mux so overlay_mux_open() can no longer find it. */
+	list_remove(&overlay_mux_list, mux);
+	mutex_exit(&mux->omux_lock);
+	mutex_exit(&overlay_mux_lock);
+
+	ksocket_close(mux->omux_ksock, kcred);
+	/* Mirror the unwind in overlay_mux_open(): destroy what we init'd. */
+	mutex_destroy(&mux->omux_lock);
+	avl_destroy(&mux->omux_devices);
+	kmem_free(mux->omux_addr, mux->omux_alen);
+	kmem_free(mux, sizeof (overlay_mux_t));
+}
+
+/*
+ * Insert an overlay device into the mux's AVL tree of devices, under
+ * omux_lock. The tree is ordered by odd_vid (see overlay_mux_comparator()).
+ */
+void
+overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+ mutex_enter(&mux->omux_lock);
+ avl_add(&mux->omux_devices, odd);
+ mutex_exit(&mux->omux_lock);
+}
+
+/*
+ * Remove an overlay device from the mux's AVL tree of devices, under
+ * omux_lock.
+ */
+void
+overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+ mutex_enter(&mux->omux_lock);
+ avl_remove(&mux->omux_devices, odd);
+ mutex_exit(&mux->omux_lock);
+}
+
+/*
+ * Transmit a message on the mux's kernel socket, addressed by 'hdr'.
+ * Returns the result of ksocket_sendmblk(). On failure the message is freed
+ * here.
+ */
+int
+overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
+{
+ int ret;
+
+ /*
+ * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
+ * that isn't actually supported by UDP at this time.
+ */
+ ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
+ /*
+ * NOTE(review): mp is passed by reference, so ksocket_sendmblk() may
+ * have updated it (presumably to NULL if it already freed the message)
+ * -- confirm that this freemsg() cannot double free.
+ */
+ if (ret != 0)
+ freemsg(mp);
+
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c
new file mode 100644
index 0000000000..348ddb92a2
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_plugin.c
@@ -0,0 +1,281 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Overlay device encapsulation plugin management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/errno.h>
+#include <sys/sysmacros.h>
+#include <sys/modctl.h>
+
+#include <sys/overlay_impl.h>
+
+static kmem_cache_t *overlay_plugin_cache;
+static kmutex_t overlay_plugin_lock;
+static list_t overlay_plugin_list;
+
+#define OVERLAY_MODDIR "overlay"
+
+/*
+ * kmem cache constructor for overlay_plugin_t: initialise the per-plugin
+ * mutex and list linkage. Always returns 0.
+ */
+/* ARGSUSED */
+static int
+overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ overlay_plugin_t *opp = buf;
+
+ mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL);
+ list_link_init(&opp->ovp_link);
+
+ return (0);
+}
+
+/*
+ * kmem cache destructor for overlay_plugin_t: the plugin must no longer be
+ * on the plugin list; destroy its mutex.
+ */
+/* ARGSUSED */
+static void
+overlay_plugin_cache_destructor(void *buf, void *arg)
+{
+ overlay_plugin_t *opp = buf;
+ ASSERT(list_link_active(&opp->ovp_link) == 0);
+ mutex_destroy(&opp->ovp_mutex);
+}
+
+/*
+ * Initialise plugin management: the global plugin lock, the kmem cache that
+ * overlay_plugin_t structures are allocated from, and the list of
+ * registered plugins.
+ */
+void
+overlay_plugin_init(void)
+{
+ mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0);
+
+ /*
+ * In the future we may want to have a reaper to unload unused modules
+ * to help the kernel be able to reclaim memory.
+ */
+ overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache",
+ sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor,
+ overlay_plugin_cache_destructor, NULL, NULL, NULL, 0);
+ list_create(&overlay_plugin_list, sizeof (overlay_plugin_t),
+ offsetof(overlay_plugin_t, ovp_link));
+}
+
+/*
+ * Tear down plugin management. Every plugin must already have been
+ * unregistered (enforced with VERIFY); then the list, cache and lock are
+ * destroyed.
+ */
+void
+overlay_plugin_fini(void)
+{
+ mutex_enter(&overlay_plugin_lock);
+ VERIFY(list_is_empty(&overlay_plugin_list));
+ mutex_exit(&overlay_plugin_lock);
+
+ list_destroy(&overlay_plugin_list);
+ kmem_cache_destroy(overlay_plugin_cache);
+ mutex_destroy(&overlay_plugin_lock);
+}
+
+/*
+ * Allocate a zeroed plugin registration structure for the given interface
+ * version. Returns NULL if the version is not supported. The result is
+ * released with overlay_plugin_free().
+ */
+overlay_plugin_register_t *
+overlay_plugin_alloc(uint_t version)
+{
+	overlay_plugin_register_t *regp = NULL;
+
+	/* Version 1 is the only one that exists */
+	if (version == OVEP_VERSION_ONE) {
+		regp = kmem_zalloc(sizeof (overlay_plugin_register_t),
+		    KM_SLEEP);
+		regp->ovep_version = version;
+	}
+
+	return (regp);
+}
+
+/*
+ * Free a registration structure obtained from overlay_plugin_alloc().
+ */
+void
+overlay_plugin_free(overlay_plugin_register_t *ovrp)
+{
+ kmem_free(ovrp, sizeof (overlay_plugin_register_t));
+}
+
+/*
+ * Register an encapsulation plugin described by ovrp.
+ *
+ * The registration is validated first: supported version, a name and ops
+ * vector, only known flags, an id size between 1 and 8 bytes, a valid
+ * destination type, and all mandatory entry points present. Every property
+ * name must be shorter than OVERLAY_PROP_NAMELEN. Plugin names must be
+ * unique.
+ *
+ * Returns 0 on success, EINVAL for a malformed registration, ENOTSUP for an
+ * id size larger than 8 bytes, or EEXIST if a plugin with the same name is
+ * already registered.
+ */
+int
+overlay_plugin_register(overlay_plugin_register_t *ovrp)
+{
+	overlay_plugin_t *opp, *ipp;
+
+	/* Sanity check parameters of the registration */
+	if (ovrp->ovep_version != OVEP_VERSION_ONE)
+		return (EINVAL);
+
+	if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL)
+		return (EINVAL);
+
+	if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0)
+		return (EINVAL);
+
+	if (ovrp->ovep_id_size < 1)
+		return (EINVAL);
+
+	/* Don't support anything that has an id size larger than 8 bytes */
+	if (ovrp->ovep_id_size > 8)
+		return (ENOTSUP);
+
+	if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID)
+		return (EINVAL);
+
+	if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0)
+		return (EINVAL);
+
+	if (ovrp->ovep_ops->ovpo_callbacks != 0)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_init == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_fini == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_encap == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_decap == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_socket == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_getprop == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_setprop == NULL)
+		return (EINVAL);
+	if (ovrp->ovep_ops->ovpo_propinfo == NULL)
+		return (EINVAL);
+
+
+	opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP);
+	opp->ovp_active = 0;
+	opp->ovp_name = ovrp->ovep_name;
+	opp->ovp_ops = ovrp->ovep_ops;
+	opp->ovp_props = ovrp->ovep_props;
+	opp->ovp_id_size = ovrp->ovep_id_size;
+	opp->ovp_flags = ovrp->ovep_flags;
+	opp->ovp_dest = ovrp->ovep_dest;
+
+	/*
+	 * Count the entries of the NULL-terminated property name array,
+	 * rejecting names too long to fit in an ioctl property name.
+	 * overlay_plugin_lock is not held yet, so the failure path must not
+	 * release it.
+	 */
+	opp->ovp_nprops = 0;
+	if (ovrp->ovep_props != NULL) {
+		while (ovrp->ovep_props[opp->ovp_nprops] != NULL) {
+			if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >=
+			    OVERLAY_PROP_NAMELEN) {
+				kmem_cache_free(overlay_plugin_cache, opp);
+				return (EINVAL);
+			}
+			opp->ovp_nprops++;
+		}
+	}
+
+	mutex_enter(&overlay_plugin_lock);
+	for (ipp = list_head(&overlay_plugin_list); ipp != NULL;
+	    ipp = list_next(&overlay_plugin_list, ipp)) {
+		if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) {
+			mutex_exit(&overlay_plugin_lock);
+			kmem_cache_free(overlay_plugin_cache, opp);
+			return (EEXIST);
+		}
+	}
+	list_insert_tail(&overlay_plugin_list, opp);
+	mutex_exit(&overlay_plugin_lock);
+
+	return (0);
+}
+
+/*
+ * Unregister the plugin with the given name. Returns ENOENT if no such
+ * plugin is registered, EBUSY if it still has active users, and 0 once it
+ * has been removed from the list and freed.
+ */
+int
+overlay_plugin_unregister(const char *name)
+{
+	overlay_plugin_t *opp;
+	boolean_t busy;
+
+	mutex_enter(&overlay_plugin_lock);
+	opp = list_head(&overlay_plugin_list);
+	while (opp != NULL && strcmp(opp->ovp_name, name) != 0)
+		opp = list_next(&overlay_plugin_list, opp);
+
+	if (opp == NULL) {
+		mutex_exit(&overlay_plugin_lock);
+		return (ENOENT);
+	}
+
+	/* ovp_active is protected by the plugin's own mutex. */
+	mutex_enter(&opp->ovp_mutex);
+	busy = (opp->ovp_active > 0) ? B_TRUE : B_FALSE;
+	mutex_exit(&opp->ovp_mutex);
+
+	if (busy == B_TRUE) {
+		mutex_exit(&overlay_plugin_lock);
+		return (EBUSY);
+	}
+
+	list_remove(&overlay_plugin_list, opp);
+	mutex_exit(&overlay_plugin_lock);
+
+	kmem_cache_free(overlay_plugin_cache, opp);
+	return (0);
+}
+
+/*
+ * Find a registered plugin by name and take a hold on it (ovp_active is
+ * incremented); release it with overlay_plugin_rele().
+ *
+ * If the plugin is not registered, a single attempt is made to load a module
+ * of that name from OVERLAY_MODDIR, after which the list is searched once
+ * more. Returns NULL if the plugin cannot be found.
+ */
+overlay_plugin_t *
+overlay_plugin_lookup(const char *name)
+{
+	overlay_plugin_t *opp;
+	int pass;
+
+	/* Pass 0 searches as-is; pass 1 searches again after a modload. */
+	for (pass = 0; pass < 2; pass++) {
+		/*
+		 * If we didn't find it, it may still exist, but just not have
+		 * been a loaded module. In that case, we'll do one attempt to
+		 * load it.
+		 */
+		if (pass != 0 && modload(OVERLAY_MODDIR, (char *)name) == -1)
+			return (NULL);
+
+		mutex_enter(&overlay_plugin_lock);
+		opp = list_head(&overlay_plugin_list);
+		while (opp != NULL && strcmp(name, opp->ovp_name) != 0)
+			opp = list_next(&overlay_plugin_list, opp);
+
+		if (opp != NULL) {
+			mutex_enter(&opp->ovp_mutex);
+			opp->ovp_active++;
+			mutex_exit(&opp->ovp_mutex);
+			mutex_exit(&overlay_plugin_lock);
+			return (opp);
+		}
+		mutex_exit(&overlay_plugin_lock);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Release a hold on a plugin taken by overlay_plugin_lookup(), decrementing
+ * ovp_active under the plugin's mutex.
+ */
+void
+overlay_plugin_rele(overlay_plugin_t *opp)
+{
+ mutex_enter(&opp->ovp_mutex);
+ ASSERT(opp->ovp_active > 0);
+ opp->ovp_active--;
+ mutex_exit(&opp->ovp_mutex);
+}
+
+/*
+ * Invoke func(plugin, arg) on each registered plugin, stopping early as soon
+ * as the callback returns non-zero. The callback runs with
+ * overlay_plugin_lock held.
+ */
+void
+overlay_plugin_walk(overlay_plugin_walk_f func, void *arg)
+{
+	overlay_plugin_t *cur;
+
+	mutex_enter(&overlay_plugin_lock);
+	cur = list_head(&overlay_plugin_list);
+	while (cur != NULL) {
+		if (func(cur, arg) != 0)
+			break;
+		cur = list_next(&overlay_plugin_list, cur);
+	}
+	mutex_exit(&overlay_plugin_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c
new file mode 100644
index 0000000000..ba1ea2a629
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_prop.c
@@ -0,0 +1,122 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+/*
+ * Routines for manipulating property information structures.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/overlay_impl.h>
+
+/*
+ * Reset the "possible values" area of a property info structure: it is
+ * treated as a mac_propval_range_t, which is zeroed, and the used size is
+ * set to that of the bare range header.
+ */
+void
+overlay_prop_init(overlay_prop_handle_t phdl)
+{
+	overlay_ioc_propinfo_t *pinfo = (overlay_ioc_propinfo_t *)phdl;
+	mac_propval_range_t *range = (mac_propval_range_t *)pinfo->oipi_poss;
+
+	bzero(range, sizeof (mac_propval_range_t));
+	pinfo->oipi_posssize = sizeof (mac_propval_range_t);
+}
+
+/*
+ * Record the property's name, truncated to fit OVERLAY_PROP_NAMELEN.
+ */
+void
+overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name)
+{
+	char *dst = ((overlay_ioc_propinfo_t *)phdl)->oipi_name;
+
+	(void) strlcpy(dst, name, OVERLAY_PROP_NAMELEN);
+}
+
+/*
+ * Record the property's protection (oipi_prot).
+ */
+void
+overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot)
+{
+	((overlay_ioc_propinfo_t *)phdl)->oipi_prot = prot;
+}
+
+/*
+ * Record the property's value type (oipi_type).
+ */
+void
+overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type)
+{
+	((overlay_ioc_propinfo_t *)phdl)->oipi_type = type;
+}
+
+/*
+ * Record the property's default value by copying len bytes from def.
+ * Returns E2BIG if len exceeds OVERLAY_PROP_SIZEMAX, EOVERFLOW if len is
+ * negative, and 0 on success. The size check is performed first.
+ */
+int
+overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len)
+{
+	overlay_ioc_propinfo_t *pinfo = (overlay_ioc_propinfo_t *)phdl;
+	int ret = 0;
+
+	if (len > OVERLAY_PROP_SIZEMAX) {
+		ret = E2BIG;
+	} else if (len < 0) {
+		ret = EOVERFLOW;
+	} else {
+		bcopy(def, pinfo->oipi_default, len);
+		pinfo->oipi_defsize = (uint32_t)len;
+	}
+
+	return (ret);
+}
+
+/*
+ * Mark the property as having no default value: the default size becomes
+ * zero and the first byte of the default buffer is cleared.
+ */
+void
+overlay_prop_set_nodefault(overlay_prop_handle_t phdl)
+{
+	overlay_ioc_propinfo_t *pinfo = (overlay_ioc_propinfo_t *)phdl;
+
+	pinfo->oipi_defsize = 0;
+	pinfo->oipi_default[0] = '\0';
+}
+
+/*
+ * Append a [min, max] uint32 range to the property's possible values.
+ *
+ * The request is silently ignored if the property already carries ranges of
+ * a different type, or if there is no room left in oipi_poss for another
+ * entry.
+ */
+void
+overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min,
+    uint32_t max)
+{
+	overlay_ioc_propinfo_t *pinfo = (overlay_ioc_propinfo_t *)phdl;
+	mac_propval_range_t *range = (mac_propval_range_t *)pinfo->oipi_poss;
+
+	if (range->mpr_count != 0 && range->mpr_type != MAC_PROPVAL_UINT32)
+		return;
+
+	if (pinfo->oipi_posssize + sizeof (mac_propval_uint32_range_t) >
+	    sizeof (pinfo->oipi_poss))
+		return;
+
+	/* Fill in the next free slot, then account for it. */
+	range->u.mpr_uint32[range->mpr_count].mpur_min = min;
+	range->u.mpr_uint32[range->mpr_count].mpur_max = max;
+	range->mpr_type = MAC_PROPVAL_UINT32;
+	range->mpr_count++;
+	pinfo->oipi_posssize += sizeof (mac_propval_uint32_range_t);
+}
+
+/*
+ * Append a string to the property's possible values. Strings are packed
+ * back to back, each with its terminating NUL, starting at mpur_nextbyte.
+ *
+ * The request is silently ignored if the property already carries ranges of
+ * a different type, or if the string (with its terminator) does not fit in
+ * the space remaining in oipi_poss.
+ */
+void
+overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str)
+{
+	size_t need = strlen(str) + 1;	/* Account for a null terminator */
+	overlay_ioc_propinfo_t *pinfo = (overlay_ioc_propinfo_t *)phdl;
+	mac_propval_range_t *range = (mac_propval_range_t *)pinfo->oipi_poss;
+	mac_propval_str_range_t *strs = &range->u.mpr_str;
+	size_t avail;
+
+	if (range->mpr_count != 0 && range->mpr_type != MAC_PROPVAL_STR)
+		return;
+
+	if (pinfo->oipi_posssize + need > sizeof (pinfo->oipi_poss))
+		return;
+
+	avail = sizeof (pinfo->oipi_poss) - pinfo->oipi_posssize;
+	(void) strlcpy((char *)&strs->mpur_data[strs->mpur_nextbyte], str,
+	    avail);
+
+	range->mpr_type = MAC_PROPVAL_STR;
+	range->mpr_count++;
+	strs->mpur_nextbyte += need;
+	pinfo->oipi_posssize += need;
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c
new file mode 100644
index 0000000000..cb1366708a
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_target.c
@@ -0,0 +1,1651 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay device target cache management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/ethernet.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/sysmacros.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
+#include <sys/vlan.h>
+#include <sys/crc32.h>
+#include <sys/cred.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+#include <sys/overlay_impl.h>
+#include <sys/sdt.h>
+
+/*
+ * This is a total straw man, but at least it's a prime number. We still have
+ * to go through and do a lot of evaluation to understand how these target
+ * caches should grow and shrink, and how they should respond to memory
+ * pressure and evictions. This just gives us a starting point that'll be
+ * 'good enough', until it's not.
+ */
+#define OVERLAY_HSIZE 823
+
+/*
+ * We use this data structure to keep track of what requests have been actively
+ * allocated to a given instance so we know what to put back on the pending
+ * list.
+ *
+ * Entries land on oth_outstanding when overlay_target_lookup_request() hands
+ * them out, and leave it again in overlay_target_lookup_respond() and
+ * overlay_target_lookup_drop(). The trailing comment on each member names
+ * the lock that protects it; RO members are presumably fixed once the handle
+ * has been set up -- TODO confirm against the open path.
+ */
+typedef struct overlay_target_hdl {
+ minor_t oth_minor; /* RO */
+ zoneid_t oth_zoneid; /* RO */
+ int oth_oflags; /* RO */
+ list_node_t oth_link; /* overlay_target_lock */
+ kmutex_t oth_lock;
+ list_t oth_outstanding; /* oth_lock */
+} overlay_target_hdl_t;
+
+/*
+ * Signatures of the three stages of a target ioctl: copying the user's
+ * argument in (producing a kernel buffer and its size), running the
+ * operation against that buffer, and copying the result back out.
+ */
+typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
+typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
+typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
+
+/*
+ * Description of a single target ioctl: its command number, access
+ * requirements and the functions that implement each stage.
+ */
+typedef struct overlay_target_ioctl {
+ int oti_cmd; /* ioctl id */
+ boolean_t oti_write; /* ioctl requires FWRITE */
+ boolean_t oti_ncopyout; /* copyout data? (polarity: see dispatcher) */
+ overlay_target_copyin_f oti_copyin; /* copyin func */
+ overlay_target_ioctl_f oti_func; /* function to call */
+ overlay_target_copyout_f oti_copyout; /* copyout func */
+ size_t oti_size; /* size of user level structure */
+} overlay_target_ioctl_t;
+
+/*
+ * Allocation caches for overlay_target_t and overlay_target_entry_t, the id
+ * space ("overlay_target_minors") and the soft state that holds one
+ * overlay_target_hdl_t per item.
+ */
+static kmem_cache_t *overlay_target_cache;
+static kmem_cache_t *overlay_entry_cache;
+static id_space_t *overlay_thdl_idspace;
+static void *overlay_thdl_state;
+
+/*
+ * When we support overlay devices in the NGZ, then all of these need to become
+ * zone aware, by plugging into the netstack engine and becoming per-netstack
+ * data.
+ *
+ * overlay_target_list holds the entries waiting for a lookup; it is
+ * manipulated under overlay_target_lock, and overlay_target_condvar is
+ * signalled whenever an entry is appended to it.
+ */
+static list_t overlay_thdl_list;
+static kmutex_t overlay_target_lock;
+static kcondvar_t overlay_target_condvar;
+static list_t overlay_target_list;
+static boolean_t overlay_target_excl;
+
+/*
+ * Outstanding data per hash table entry: the limit, in bytes, on the packets
+ * that may be queued on a single entry while its lookup is unresolved.
+ */
+static int overlay_ent_size = 128 * 1024;
+
+/*
+ * kmem cache constructor for overlay_target_t: sets up the target's
+ * condition variable and lock.  Always succeeds.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
+{
+	overlay_target_t *target = buf;
+
+	cv_init(&target->ott_cond, NULL, CV_DRIVER, NULL);
+	mutex_init(&target->ott_lock, NULL, MUTEX_DRIVER, NULL);
+
+	return (0);
+}
+
+/*
+ * kmem cache destructor for overlay_target_t: tears down the lock and the
+ * condition variable created by the constructor.
+ */
+/* ARGSUSED */
+static void
+overlay_target_cache_destructor(void *buf, void *arg)
+{
+	overlay_target_t *target = buf;
+
+	mutex_destroy(&target->ott_lock);
+	cv_destroy(&target->ott_cond);
+}
+
+/*
+ * kmem cache constructor for overlay_target_entry_t: hands out a zeroed
+ * entry with an initialized lock.  Always succeeds.
+ */
+/* ARGSUSED */
+static int
+overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
+{
+	overlay_target_entry_t *entry = buf;
+
+	/* The lock lives inside the entry, so zero first and init after. */
+	bzero(entry, sizeof (*entry));
+	mutex_init(&entry->ote_lock, NULL, MUTEX_DRIVER, NULL);
+
+	return (0);
+}
+
+/*
+ * kmem cache destructor for overlay_target_entry_t: releases the entry lock
+ * set up by the constructor.
+ */
+/* ARGSUSED */
+static void
+overlay_entry_cache_destructor(void *buf, void *arg)
+{
+	overlay_target_entry_t *entry = buf;
+
+	mutex_destroy(&entry->ote_lock);
+}
+
+/*
+ * refhash hash callback: hash the ETHERADDRL-byte MAC address at v using the
+ * CRC32 macro (initial value -1U, table crc32_table).
+ */
+static uint64_t
+overlay_mac_hash(const void *v)
+{
+ uint32_t crc;
+ /* CRC32() is a macro that stores its result into crc. */
+ CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
+ return (crc);
+}
+
+/*
+ * refhash comparison callback: returns zero when the two ETHERADDRL-byte MAC
+ * addresses are identical and non-zero otherwise.
+ */
+static int
+overlay_mac_cmp(const void *a, const void *b)
+{
+	int differ;
+
+	differ = bcmp(a, b, ETHERADDRL);
+	return (differ);
+}
+
+/*
+ * refhash destructor for a target entry (registered through refhash_create()
+ * in overlay_target_associate()).  Frees any packets still queued on the
+ * entry, resets its fields and returns it to overlay_entry_cache.
+ */
+/* ARGSUSED */
+static void
+overlay_target_entry_dtor(void *arg)
+{
+ overlay_target_entry_t *ote = arg;
+
+ ote->ote_flags = 0;
+ bzero(ote->ote_addr, ETHERADDRL);
+ ote->ote_ott = NULL;
+ ote->ote_odd = NULL;
+ /* Free the queued chain before forgetting the head/tail pointers. */
+ freemsgchain(ote->ote_chead);
+ ote->ote_chead = ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_vtime = 0;
+ kmem_cache_free(overlay_entry_cache, ote);
+}
+
+/*
+ * AVL comparator for target entries: orders them by MAC address, comparing
+ * byte by byte from the first octet.  Returns -1, 0 or 1.
+ */
+static int
+overlay_mac_avl(const void *a, const void *b)
+{
+	const overlay_target_entry_t *lhs = a;
+	const overlay_target_entry_t *rhs = b;
+	int idx = 0;
+
+	while (idx < ETHERADDRL) {
+		/* The first differing octet decides the ordering. */
+		if (lhs->ote_addr[idx] != rhs->ote_addr[idx]) {
+			return (lhs->ote_addr[idx] > rhs->ote_addr[idx] ?
+			    1 : -1);
+		}
+		idx++;
+	}
+
+	return (0);
+}
+
+/*
+ * One-time setup of the target subsystem: the handle soft state, the target
+ * and entry kmem caches, the global lock and condition variable, the pending
+ * and handle lists, and the minor-number id space.  A soft state
+ * initialization failure is fatal (VERIFY).
+ */
+void
+overlay_target_init(void)
+{
+ int ret;
+ ret = ddi_soft_state_init(&overlay_thdl_state,
+ sizeof (overlay_target_hdl_t), 1);
+ VERIFY(ret == 0);
+ overlay_target_cache = kmem_cache_create("overlay_target",
+ sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
+ overlay_target_cache_destructor, NULL, NULL, NULL, 0);
+ overlay_entry_cache = kmem_cache_create("overlay_entry",
+ sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
+ overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
+ mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
+ /* Pending entries are linked through ote_qlink. */
+ list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_qlink));
+ list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
+ offsetof(overlay_target_hdl_t, oth_link));
+ overlay_thdl_idspace = id_space_create("overlay_target_minors",
+ 1, INT32_MAX);
+}
+
+/*
+ * Undo overlay_target_init(), releasing each resource in the reverse of the
+ * order in which it was created.
+ */
+void
+overlay_target_fini(void)
+{
+ id_space_destroy(overlay_thdl_idspace);
+ list_destroy(&overlay_thdl_list);
+ list_destroy(&overlay_target_list);
+ cv_destroy(&overlay_target_condvar);
+ mutex_destroy(&overlay_target_lock);
+ kmem_cache_destroy(overlay_entry_cache);
+ kmem_cache_destroy(overlay_target_cache);
+ ddi_soft_state_fini(&overlay_thdl_state);
+}
+
+/*
+ * Release the target attached to odd, if there is one.  For a dynamic target
+ * every cached entry is first unlinked from the AVL tree and the refhash.
+ * No outstanding lookups may remain (ott_ocount is asserted to be zero).
+ * Note that odd->odd_target itself is not reset here.
+ */
+void
+overlay_target_free(overlay_dev_t *odd)
+{
+ if (odd->odd_target == NULL)
+ return;
+
+ if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
+ avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
+ overlay_target_entry_t *ote;
+
+ /*
+ * Our AVL tree and hashtable contain the same elements,
+ * therefore we should just remove it from the tree, but then
+ * delete the entries when we remove them from the hash table
+ * (which happens through the refhash dtor).
+ */
+ while ((ote = avl_first(ap)) != NULL)
+ avl_remove(ap, ote);
+
+ avl_destroy(ap);
+ /*
+ * NOTE(review): this removes the current element and then passes it
+ * to refhash_next(); presumably the refhash iterators keep a hold on
+ * the current element so this is safe -- confirm against refhash.
+ */
+ for (ote = refhash_first(rp); ote != NULL;
+ ote = refhash_next(rp, ote)) {
+ refhash_remove(rp, ote);
+ }
+ refhash_destroy(rp);
+ }
+
+ ASSERT(odd->odd_target->ott_ocount == 0);
+ kmem_cache_free(overlay_target_cache, odd->odd_target);
+}
+
+/*
+ * Report whether any target handle is currently on overlay_thdl_list.
+ * Returns 1 if at least one handle exists and 0 otherwise.
+ */
+int
+overlay_target_busy()
+{
+	int busy;
+
+	mutex_enter(&overlay_target_lock);
+	busy = list_is_empty(&overlay_thdl_list) ? 0 : 1;
+	mutex_exit(&overlay_target_lock);
+
+	return (busy);
+}
+
+/*
+ * Put entry on the global pending list so that a lookup request can pick it
+ * up, and account for it in its target's outstanding count.  Nothing is
+ * queued once the target has been flagged for teardown.
+ */
+static void
+overlay_target_queue(overlay_target_entry_t *entry)
+{
+	overlay_target_t *ott;
+
+	mutex_enter(&overlay_target_lock);
+	ott = entry->ote_ott;
+
+	mutex_enter(&ott->ott_lock);
+	if ((ott->ott_flags & OVERLAY_T_TEARDOWN) == 0) {
+		ott->ott_ocount++;
+		mutex_exit(&ott->ott_lock);
+
+		/* Wake one waiter blocked on an empty pending list. */
+		list_insert_tail(&overlay_target_list, entry);
+		cv_signal(&overlay_target_condvar);
+	} else {
+		mutex_exit(&ott->ott_lock);
+	}
+	mutex_exit(&overlay_target_lock);
+}
+
+/*
+ * Flag ott for teardown, which stops new entries from being queued, and then
+ * block until its outstanding count has drained to zero.  A NULL target is
+ * ignored.
+ */
+void
+overlay_target_quiesce(overlay_target_t *ott)
+{
+	if (ott != NULL) {
+		mutex_enter(&ott->ott_lock);
+		ott->ott_flags |= OVERLAY_T_TEARDOWN;
+		for (;;) {
+			if (ott->ott_ocount == 0)
+				break;
+			cv_wait(&ott->ott_cond, &ott->ott_lock);
+		}
+		mutex_exit(&ott->ott_lock);
+	}
+}
+
+/*
+ * Resolve the destination for the packet mp sent on odd, filling in sock and
+ * *slenp on success.
+ *
+ * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
+ * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else
+ * at this time, say for NVGRE, we drop all packets that don't match this.
+ *
+ * Returns OVERLAY_TARGET_OK when sock has been filled in,
+ * OVERLAY_TARGET_DROP when the packet cannot be sent, and
+ * OVERLAY_TARGET_ASYNC when mp has been queued on a cache entry pending a
+ * lookup (the entry then owns mp).
+ */
+int
+overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
+ socklen_t *slenp)
+{
+ int ret;
+ struct sockaddr_in6 *v6;
+ overlay_target_t *ott;
+ mac_header_info_t mhi;
+ overlay_target_entry_t *entry;
+
+ ASSERT(odd->odd_target != NULL);
+
+ /*
+ * At this point, the overlay device is in a mux which means that it's
+ * been activated. At this point, parts of the target, such as the mode
+ * and the destination are now read-only and we don't have to worry
+ * about synchronization for them.
+ */
+ ott = odd->odd_target;
+ if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+ return (OVERLAY_TARGET_DROP);
+
+ v6 = (struct sockaddr_in6 *)sock;
+ bzero(v6, sizeof (struct sockaddr_in6));
+ v6->sin6_family = AF_INET6;
+
+ /* Point-to-point targets have one fixed destination. */
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ mutex_enter(&ott->ott_lock);
+ bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
+ mutex_exit(&ott->ott_lock);
+ *slenp = sizeof (struct sockaddr_in6);
+
+ return (OVERLAY_TARGET_OK);
+ }
+
+ ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
+
+ /*
+ * Note we only want the MAC address here, therefore we won't bother
+ * using mac_vlan_header_info(). If any caller needs the vlan info at
+ * this point, this should change to a call to mac_vlan_header_info().
+ */
+ if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+ return (OVERLAY_TARGET_DROP);
+ mutex_enter(&ott->ott_lock);
+ entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ mhi.mhi_daddr);
+ if (entry == NULL) {
+ /*
+ * First packet for this MAC: create a pending entry that owns mp and
+ * queue it for a lookup.
+ */
+ entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI);
+ if (entry == NULL) {
+ mutex_exit(&ott->ott_lock);
+ return (OVERLAY_TARGET_DROP);
+ }
+ bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
+ entry->ote_chead = entry->ote_ctail = mp;
+ entry->ote_mbsize = msgsize(mp);
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ entry->ote_ott = ott;
+ entry->ote_odd = odd;
+ refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
+ mutex_exit(&ott->ott_lock);
+ overlay_target_queue(entry);
+ return (OVERLAY_TARGET_ASYNC);
+ }
+ /* Keep the entry alive while we examine it without ott_lock held. */
+ refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
+
+ mutex_enter(&entry->ote_lock);
+ if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(entry->ote_dest.otp_port);
+ *slenp = sizeof (struct sockaddr_in6);
+ ret = OVERLAY_TARGET_OK;
+ } else {
+ size_t mlen = msgsize(mp);
+
+ /* Cap the bytes queued on one unresolved entry. */
+ if (mlen + entry->ote_mbsize > overlay_ent_size) {
+ ret = OVERLAY_TARGET_DROP;
+ } else {
+ if (entry->ote_ctail != NULL) {
+ ASSERT(entry->ote_ctail->b_next ==
+ NULL);
+ entry->ote_ctail->b_next = mp;
+ entry->ote_ctail = mp;
+ } else {
+ entry->ote_chead = mp;
+ entry->ote_ctail = mp;
+ }
+ entry->ote_mbsize += mlen;
+ /*
+ * NOTE(review): overlay_target_queue() takes ott_lock while ote_lock
+ * is held here, whereas the cache ioctls take ote_lock while holding
+ * ott_lock -- confirm this ordering cannot deadlock.
+ */
+ if ((entry->ote_flags &
+ OVERLAY_ENTRY_F_PENDING) == 0) {
+ entry->ote_flags |=
+ OVERLAY_ENTRY_F_PENDING;
+ overlay_target_queue(entry);
+ }
+ ret = OVERLAY_TARGET_ASYNC;
+ }
+ }
+ mutex_exit(&entry->ote_lock);
+
+ mutex_enter(&ott->ott_lock);
+ refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
+
+ return (ret);
+}
+
+/*
+ * ioctl handler: report the state of the overlay device named by
+ * oti_linkid -- the destination kinds its plugin needs, whether it is
+ * degraded and/or activated, and its virtual network id.  Returns ENOENT if
+ * no such device exists.
+ */
+/* ARGSUSED */
+static int
+overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_info_t *oti = arg;
+
+ odd = overlay_hold_by_dlid(oti->oti_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ /* Snapshot the device state under its lock. */
+ mutex_enter(&odd->odd_lock);
+ oti->oti_flags = 0;
+ oti->oti_needs = odd->odd_plugin->ovp_dest;
+ if (odd->odd_flags & OVERLAY_F_DEGRADED)
+ oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
+ if (odd->odd_flags & OVERLAY_F_ACTIVATED)
+ oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
+ oti->oti_vnetid = odd->odd_vid;
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/*
+ * ioctl handler: associate a varpd instance with the overlay device named by
+ * ota_linkid and install a new target on it.
+ *
+ * The request is validated first: the id must be non-zero, the mode must be
+ * point or dynamic, and the destination kinds provided must match exactly
+ * what the device's plugin needs.  A point target that provides an IP must
+ * not use an unspecified, v4-compatible or v4-mapped-any address, and one
+ * that provides a port must not use port zero.
+ *
+ * Returns 0 on success, ENOENT if the device does not exist, EINVAL if
+ * validation fails, and EEXIST if the device already has a varpd
+ * association.
+ */
+/* ARGSUSED */
+static int
+overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_targ_associate_t *ota = arg;
+
+	odd = overlay_hold_by_dlid(ota->ota_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	if (ota->ota_id == 0) {
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	}
+
+	if (ota->ota_mode != OVERLAY_TARGET_POINT &&
+	    ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	}
+
+	if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	}
+
+	if (ota->ota_mode == OVERLAY_TARGET_POINT) {
+		if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
+			if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
+			    IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
+			    IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
+				overlay_hold_rele(odd);
+				return (EINVAL);
+			}
+		}
+
+		if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
+			if (ota->ota_point.otp_port == 0) {
+				overlay_hold_rele(odd);
+				return (EINVAL);
+			}
+		}
+	}
+
+	/*
+	 * Build the target before taking odd_lock: both allocations below
+	 * use KM_SLEEP.
+	 */
+	ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
+	ott->ott_flags = 0;
+	ott->ott_ocount = 0;
+	ott->ott_mode = ota->ota_mode;
+	ott->ott_dest = ota->ota_provides;
+	ott->ott_id = ota->ota_id;
+
+	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+		bcopy(&ota->ota_point, &ott->ott_u.ott_point,
+		    sizeof (overlay_target_point_t));
+	} else {
+		ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
+		    overlay_mac_hash, overlay_mac_cmp,
+		    overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+		    offsetof(overlay_target_entry_t, ote_reflink),
+		    offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+		avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
+		    sizeof (overlay_target_entry_t),
+		    offsetof(overlay_target_entry_t, ote_avllink));
+	}
+
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_VARPD) {
+		mutex_exit(&odd->odd_lock);
+		/*
+		 * Somebody else is already associated.  A dynamic target
+		 * owns a (still empty) AVL tree and refhash which must be
+		 * destroyed before the target goes back to the cache,
+		 * otherwise they are leaked.
+		 */
+		if (ott->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+			avl_destroy(&ott->ott_u.ott_dyn.ott_tree);
+			refhash_destroy(ott->ott_u.ott_dyn.ott_dhash);
+		}
+		kmem_cache_free(overlay_target_cache, ott);
+		overlay_hold_rele(odd);
+		return (EEXIST);
+	}
+
+	odd->odd_flags |= OVERLAY_F_VARPD;
+	odd->odd_target = ott;
+	mutex_exit(&odd->odd_lock);
+
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+
+/*
+ * ioctl handler: mark the overlay device named by otd_linkid as degraded,
+ * passing the caller-supplied otd_buf to overlay_fm_degrade().  Returns
+ * ENOENT if no such device exists.
+ */
+/* ARGSUSED */
+static int
+overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_degrade_t *otd = arg;
+
+ odd = overlay_hold_by_dlid(otd->otd_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ overlay_fm_degrade(odd, otd->otd_buf);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/*
+ * ioctl handler: call overlay_fm_restore() on the overlay device named by
+ * otid_linkid.  Returns ENOENT if no such device exists.
+ */
+/* ARGSUSED */
+static int
+overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_id_t *otid = arg;
+
+ odd = overlay_hold_by_dlid(otid->otid_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ overlay_fm_restore(odd);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/*
+ * ioctl handler: clear the varpd association flag on the overlay device
+ * named by otid_linkid.  Returns ENOENT if no such device exists.
+ *
+ * NOTE(review): only OVERLAY_F_VARPD is cleared; odd_target is left in place
+ * and a later associate overwrites it -- confirm the old target is released
+ * elsewhere.
+ */
+/* ARGSUSED */
+static int
+overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_id_t *otid = arg;
+
+ odd = overlay_hold_by_dlid(otid->otid_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ odd->odd_flags &= ~OVERLAY_F_VARPD;
+ mutex_exit(&odd->odd_lock);
+
+ overlay_hold_rele(odd);
+ return (0);
+
+}
+
+/*
+ * Give back the outstanding-lookup count that overlay_target_queue() took on
+ * ott and wake anyone waiting in overlay_target_quiesce().
+ */
+static void
+overlay_target_lookup_rele(overlay_target_t *ott)
+{
+	mutex_enter(&ott->ott_lock);
+	ott->ott_ocount--;
+	cv_signal(&ott->ott_cond);
+	mutex_exit(&ott->ott_lock);
+}
+
+/*
+ * ioctl handler: hand the next pending lookup to the caller.
+ *
+ * Waits up to one second for an entry to appear on the pending list and
+ * returns ETIME if none does.  On success the request described in arg is
+ * filled in from the first packet queued on the entry, and the entry is
+ * moved to the handle's outstanding list until it is answered or dropped.
+ *
+ * Entries that need no lookup are skipped: ones that became valid while
+ * queued, and ones whose first packet has an unparseable MAC header (that
+ * packet is freed).  Whenever an entry is taken off the pending list without
+ * being handed out, the outstanding count taken when it was queued is given
+ * back and, if it is not requeued, its PENDING flag is cleared, exactly as
+ * overlay_target_lookup_drop() does.  Otherwise overlay_target_quiesce()
+ * would wait forever and the entry could never be queued again.
+ */
+static int
+overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_lookup_t *otl = arg;
+	overlay_target_entry_t *entry;
+	clock_t ret, timeout;
+	mac_header_info_t mhi;
+
+	timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
+again:
+	mutex_enter(&overlay_target_lock);
+	while (list_is_empty(&overlay_target_list)) {
+		ret = cv_timedwait(&overlay_target_condvar,
+		    &overlay_target_lock, timeout);
+		if (ret == -1) {
+			mutex_exit(&overlay_target_lock);
+			return (ETIME);
+		}
+	}
+	entry = list_remove_head(&overlay_target_list);
+	mutex_exit(&overlay_target_lock);
+
+	mutex_enter(&entry->ote_lock);
+	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+		/* Resolved while it sat on the list; nothing to look up. */
+		ASSERT(entry->ote_chead == NULL);
+		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+		mutex_exit(&entry->ote_lock);
+		overlay_target_lookup_rele(entry->ote_ott);
+		goto again;
+	}
+	ASSERT(entry->ote_chead != NULL);
+
+	/*
+	 * If we have a bogon that doesn't have a valid mac header, drop it and
+	 * try again.
+	 */
+	if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
+	    &mhi) != 0) {
+		boolean_t queue = B_FALSE;
+		mblk_t *mp = entry->ote_chead;
+
+		entry->ote_chead = mp->b_next;
+		mp->b_next = NULL;
+		if (entry->ote_ctail == mp)
+			entry->ote_ctail = entry->ote_chead;
+		entry->ote_mbsize -= msgsize(mp);
+		if (entry->ote_chead != NULL)
+			queue = B_TRUE;
+		else
+			entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+		mutex_exit(&entry->ote_lock);
+
+		/*
+		 * Requeue (taking a fresh count) before giving back the old
+		 * one, so the outstanding count never dips to zero while
+		 * packets remain on the entry.
+		 */
+		if (queue == B_TRUE)
+			overlay_target_queue(entry);
+		freemsg(mp);
+		overlay_target_lookup_rele(entry->ote_ott);
+		goto again;
+	}
+
+	/* The entry's address doubles as the opaque request id. */
+	otl->otl_dlid = entry->ote_odd->odd_linkid;
+	otl->otl_reqid = (uintptr_t)entry;
+	otl->otl_varpdid = entry->ote_ott->ott_id;
+	otl->otl_vnetid = entry->ote_odd->odd_vid;
+
+	otl->otl_hdrsize = mhi.mhi_hdrsize;
+	otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
+	bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
+	bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
+	otl->otl_dsttype = mhi.mhi_dsttype;
+	otl->otl_sap = mhi.mhi_bindsap;
+	otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
+	mutex_exit(&entry->ote_lock);
+
+	mutex_enter(&thdl->oth_lock);
+	list_insert_tail(&thdl->oth_outstanding, entry);
+	mutex_exit(&thdl->oth_lock);
+
+	return (0);
+}
+
+/*
+ * ioctl handler: accept the answer to an outstanding lookup.
+ *
+ * The request id must name an entry on this handle's outstanding list,
+ * otherwise EINVAL is returned.  The answer becomes the entry's destination,
+ * the entry is marked valid, and the packets that were queued on it are
+ * transmitted; anything overlay_m_tx() hands back is freed.  Finally the
+ * target's outstanding count is dropped and any waiter is signalled.
+ */
+static int
+overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
+{
+ const overlay_targ_resp_t *otr = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+
+ /* Only ids that were actually handed out on this handle are accepted. */
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == otr->otr_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ list_remove(&thdl->oth_outstanding, entry);
+ mutex_exit(&thdl->oth_lock);
+
+ mutex_enter(&entry->ote_lock);
+ bcopy(&otr->otr_answer, &entry->ote_dest,
+ sizeof (overlay_target_point_t));
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ /* Detach the queued packets so they can be sent without the lock. */
+ mp = entry->ote_chead;
+ entry->ote_chead = NULL;
+ entry->ote_ctail = NULL;
+ entry->ote_mbsize = 0;
+ entry->ote_vtime = gethrtime();
+ mutex_exit(&entry->ote_lock);
+
+ /*
+ * For now do an in-situ drain.
+ */
+ mp = overlay_m_tx(entry->ote_odd, mp);
+ freemsgchain(mp);
+
+ mutex_enter(&entry->ote_ott->ott_lock);
+ entry->ote_ott->ott_ocount--;
+ cv_signal(&entry->ote_ott->ott_cond);
+ mutex_exit(&entry->ote_ott->ott_lock);
+
+ return (0);
+}
+
+/*
+ * ioctl handler: drop the packet that triggered an outstanding lookup.
+ *
+ * The request id must name an entry on this handle's outstanding list,
+ * otherwise EINVAL is returned.  The first queued packet is freed; if more
+ * packets remain the entry is queued again for another lookup, otherwise its
+ * PENDING flag is cleared.  In every case the outstanding count taken for
+ * the original request is dropped.
+ */
+static int
+overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
+{
+ const overlay_targ_resp_t *otr = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+ boolean_t queue = B_FALSE;
+
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == otr->otr_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ list_remove(&thdl->oth_outstanding, entry);
+ mutex_exit(&thdl->oth_lock);
+
+ mutex_enter(&entry->ote_lock);
+
+ /* Safeguard against a confused varpd */
+ if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ DTRACE_PROBE1(overlay__target__valid__drop,
+ overlay_target_entry_t *, entry);
+ mutex_exit(&entry->ote_lock);
+ goto done;
+ }
+
+ /* Unlink just the head packet; the rest stay queued on the entry. */
+ mp = entry->ote_chead;
+ if (mp != NULL) {
+ entry->ote_chead = mp->b_next;
+ mp->b_next = NULL;
+ if (entry->ote_ctail == mp)
+ entry->ote_ctail = entry->ote_chead;
+ entry->ote_mbsize -= msgsize(mp);
+ }
+ if (entry->ote_chead != NULL) {
+ queue = B_TRUE;
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ } else {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ }
+ mutex_exit(&entry->ote_lock);
+
+ /* Requeueing takes a new outstanding count before the old one is dropped. */
+ if (queue == B_TRUE)
+ overlay_target_queue(entry);
+ freemsg(mp);
+
+done:
+ mutex_enter(&entry->ote_ott->ott_lock);
+ entry->ote_ott->ott_ocount--;
+ cv_signal(&entry->ote_ott->ott_cond);
+ mutex_exit(&entry->ote_ott->ott_lock);
+
+ return (0);
+}
+
+/*
+ * Copy-in routine for the packet ioctls.  Allocates a native
+ * overlay_targ_pkt_t, reports it and its size through outp and bsize, and
+ * fills it from ubuf.  For a 32-bit caller only the 32-bit layout is copied
+ * in and the buffer pointer is then widened in place.  On a copy failure the
+ * buffer is freed and EFAULT is returned.
+ */
+/* ARGSUSED */
+static int
+overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
+    int flags)
+{
+	overlay_targ_pkt_t *pkt;
+	boolean_t ilp32;
+	size_t insz;
+
+	pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
+	*outp = pkt;
+	*bsize = sizeof (overlay_targ_pkt_t);
+
+	ilp32 = (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) ?
+	    B_TRUE : B_FALSE;
+	insz = ilp32 ? sizeof (overlay_targ_pkt32_t) : *bsize;
+
+	if (ddi_copyin(ubuf, pkt, insz, flags & FKIOCTL) != 0) {
+		kmem_free(pkt, *bsize);
+		return (EFAULT);
+	}
+
+	if (ilp32) {
+		/* Both views alias the same buffer; widen the pointer. */
+		overlay_targ_pkt32_t *pkt32 = (overlay_targ_pkt32_t *)pkt;
+		uintptr_t addr = pkt32->otp_buf;
+
+		pkt->otp_buf = (void *)addr;
+	}
+
+	return (0);
+}
+
+/*
+ * Copy-out routine for the packet ioctls.  For a 32-bit caller the buffer
+ * pointer is narrowed in place and only the 32-bit layout is copied out;
+ * otherwise bufsize bytes are copied.  Returns EFAULT if the copy fails.
+ */
+static int
+overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
+    int flags)
+{
+	size_t outsz = bufsize;
+
+	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
+		/* Both views alias the same buffer; narrow the pointer. */
+		overlay_targ_pkt_t *pkt = buf;
+		overlay_targ_pkt32_t *pkt32 = buf;
+		uintptr_t addr = (uintptr_t)pkt->otp_buf;
+
+		pkt32->otp_buf = (caddr32_t)addr;
+		outsz = sizeof (overlay_targ_pkt32_t);
+	}
+
+	return (ddi_copyout(buf, ubuf, outsz, flags & FKIOCTL) != 0 ?
+	    EFAULT : 0);
+}
+
+/*
+ * ioctl handler: copy the first packet queued on an outstanding entry out to
+ * the caller's buffer.
+ *
+ * The request id must name an entry on this handle's outstanding list
+ * (EINVAL otherwise, and also when the entry has no packet queued).  At most
+ * otp_size bytes are copied; otp_size is updated to the number of bytes that
+ * will be copied.  Returns EFAULT if copying to the user buffer fails.
+ */
+static int
+overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_pkt_t *pkt = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+ size_t mlen;
+ size_t boff;
+
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == pkt->otp_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ /* Take the entry lock before letting go of the list lock. */
+ mutex_enter(&entry->ote_lock);
+ mutex_exit(&thdl->oth_lock);
+ mp = entry->ote_chead;
+ /* Protect against a rogue varpd */
+ if (mp == NULL) {
+ mutex_exit(&entry->ote_lock);
+ return (EINVAL);
+ }
+ mlen = MIN(msgsize(mp), pkt->otp_size);
+ pkt->otp_size = mlen;
+ boff = 0;
+ /*
+ * Walk the message's b_cont chain, copying each block out in turn.
+ * NOTE(review): ddi_copyout() is called with ote_lock held -- confirm
+ * that is acceptable here.
+ */
+ while (mlen > 0) {
+ size_t wlen = MIN(MBLKL(mp), mlen);
+ if (ddi_copyout(mp->b_rptr,
+ (void *)((uintptr_t)pkt->otp_buf + boff),
+ wlen, 0) != 0) {
+ mutex_exit(&entry->ote_lock);
+ return (EFAULT);
+ }
+ mlen -= wlen;
+ boff += wlen;
+ mp = mp->b_cont;
+ }
+ mutex_exit(&entry->ote_lock);
+ return (0);
+}
+
+/*
+ * ioctl handler: inject a caller-supplied frame into an overlay device as if
+ * it had been received.
+ *
+ * The frame may be at most ETHERMAX + VLAN_TAGSZ bytes (EINVAL otherwise).
+ * The device is selected by otp_linkid, or, when that is UINT64_MAX, by the
+ * outstanding request otp_reqid on this handle.  Returns ENOMEM if no
+ * message block can be allocated, EFAULT if the frame cannot be copied in,
+ * and ENOENT if the device or request cannot be found.
+ *
+ * A device found through overlay_hold_by_dlid() is held for the duration of
+ * the call and the hold is released before returning; previously that hold
+ * was never dropped.
+ */
+static int
+overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_pkt_t *pkt = arg;
+	overlay_target_entry_t *entry;
+	overlay_dev_t *odd;
+	boolean_t held = B_FALSE;
+	mblk_t *mp;
+
+	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+		return (EINVAL);
+
+	mp = allocb(pkt->otp_size, 0);
+	if (mp == NULL)
+		return (ENOMEM);
+
+	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+		freeb(mp);
+		return (EFAULT);
+	}
+	mp->b_wptr += pkt->otp_size;
+
+	if (pkt->otp_linkid != UINT64_MAX) {
+		odd = overlay_hold_by_dlid(pkt->otp_linkid);
+		if (odd == NULL) {
+			freeb(mp);
+			return (ENOENT);
+		}
+		held = B_TRUE;
+	} else {
+		/* No hold is taken when the device comes from an entry. */
+		mutex_enter(&thdl->oth_lock);
+		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+		    entry = list_next(&thdl->oth_outstanding, entry)) {
+			if ((uintptr_t)entry == pkt->otp_reqid)
+				break;
+		}
+
+		if (entry == NULL) {
+			mutex_exit(&thdl->oth_lock);
+			freeb(mp);
+			return (ENOENT);
+		}
+		odd = entry->ote_odd;
+		mutex_exit(&thdl->oth_lock);
+	}
+
+	/* Bracket the receive so the device knows I/O is in progress. */
+	mutex_enter(&odd->odd_lock);
+	overlay_io_start(odd, OVERLAY_F_IN_RX);
+	mutex_exit(&odd->odd_lock);
+
+	mac_rx(odd->odd_mh, NULL, mp);
+
+	mutex_enter(&odd->odd_lock);
+	overlay_io_done(odd, OVERLAY_F_IN_RX);
+	mutex_exit(&odd->odd_lock);
+
+	if (held == B_TRUE)
+		overlay_hold_rele(odd);
+
+	return (0);
+}
+
+/*
+ * ioctl handler: transmit a caller-supplied frame through an overlay device.
+ *
+ * The frame may be at most ETHERMAX + VLAN_TAGSZ bytes (EINVAL otherwise).
+ * The device is selected by otp_linkid, or, when that is UINT64_MAX, by the
+ * outstanding request otp_reqid on this handle.  Returns ENOMEM if no
+ * message block can be allocated, EFAULT if the frame cannot be copied in,
+ * and ENOENT if the device or request cannot be found.  Anything
+ * overlay_m_tx() hands back is freed.
+ *
+ * A device found through overlay_hold_by_dlid() is held for the duration of
+ * the call and the hold is released before returning; previously that hold
+ * was never dropped.
+ */
+static int
+overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_pkt_t *pkt = arg;
+	overlay_target_entry_t *entry;
+	overlay_dev_t *odd;
+	boolean_t held = B_FALSE;
+	mblk_t *mp;
+
+	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+		return (EINVAL);
+
+	mp = allocb(pkt->otp_size, 0);
+	if (mp == NULL)
+		return (ENOMEM);
+
+	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+		freeb(mp);
+		return (EFAULT);
+	}
+	mp->b_wptr += pkt->otp_size;
+
+	if (pkt->otp_linkid != UINT64_MAX) {
+		odd = overlay_hold_by_dlid(pkt->otp_linkid);
+		if (odd == NULL) {
+			freeb(mp);
+			return (ENOENT);
+		}
+		held = B_TRUE;
+	} else {
+		/* No hold is taken when the device comes from an entry. */
+		mutex_enter(&thdl->oth_lock);
+		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+		    entry = list_next(&thdl->oth_outstanding, entry)) {
+			if ((uintptr_t)entry == pkt->otp_reqid)
+				break;
+		}
+
+		if (entry == NULL) {
+			mutex_exit(&thdl->oth_lock);
+			freeb(mp);
+			return (ENOENT);
+		}
+		odd = entry->ote_odd;
+		mutex_exit(&thdl->oth_lock);
+	}
+
+	mp = overlay_m_tx(odd, mp);
+	freemsgchain(mp);
+
+	if (held == B_TRUE)
+		overlay_hold_rele(odd);
+
+	return (0);
+}
+
+/*
+ * Kernel-side working copy of a list request.  otli_count is B_TRUE when the
+ * caller supplied no entry slots and therefore only wants a count; otli_cur
+ * is the number of devices visited so far; otli_nents is the number of slots
+ * in otli_ents.
+ */
+typedef struct overlay_targ_list_int {
+ boolean_t otli_count;
+ uint32_t otli_cur;
+ uint32_t otli_nents;
+ uint32_t otli_ents[];
+} overlay_targ_list_int_t;
+
+/*
+ * Copy-in routine for the list ioctl.  Reads the user's header to learn how
+ * many entry slots were supplied, allocates a zeroed working buffer with
+ * that many slots and, when there is at least one, copies the user's entries
+ * in.  Returns EFAULT on a copy failure and EINVAL when the slot count is
+ * too large.
+ */
+static int
+overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
+ int flags)
+{
+ overlay_targ_list_t n;
+ overlay_targ_list_int_t *otl;
+
+ if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+ /*
+ * Bound the user-supplied entry count so that the size computed below
+ * cannot overflow.
+ */
+ if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
+ return (EINVAL);
+ *bsize = sizeof (overlay_targ_list_int_t) +
+ sizeof (uint32_t) * n.otl_nents;
+ otl = kmem_zalloc(*bsize, KM_SLEEP);
+ otl->otli_cur = 0;
+ otl->otli_nents = n.otl_nents;
+ if (otl->otli_nents != 0) {
+ otl->otli_count = B_FALSE;
+ if (ddi_copyin((void *)((uintptr_t)ubuf +
+ offsetof(overlay_targ_list_t, otl_ents)),
+ otl->otli_ents, n.otl_nents * sizeof (uint32_t),
+ flags & FKIOCTL) != 0) {
+ kmem_free(otl, *bsize);
+ return (EFAULT);
+ }
+ } else {
+ /* No slots supplied: the caller only wants the device count. */
+ otl->otli_count = B_TRUE;
+ }
+
+ *outp = otl;
+ return (0);
+}
+
+/*
+ * overlay_dev_iter() callback for the list ioctl.  Stores the device's link
+ * id in the next slot while slots remain, and always advances the running
+ * count so the caller learns how many devices exist.  Always returns 0.
+ */
+static int
+overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
+{
+	overlay_targ_list_int_t *list = arg;
+	uint32_t slot = list->otli_cur;
+
+	if (slot < list->otli_nents)
+		list->otli_ents[slot] = odd->odd_linkid;
+	list->otli_cur = slot + 1;
+
+	return (0);
+}
+
+/*
+ * ioctl handler: walk every overlay device with overlay_dev_iter(),
+ * recording link ids and the device count in the working buffer arg.
+ * Always returns 0.
+ */
+/* ARGSUSED */
+static int
+overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
+ return (0);
+}
+
+/*
+ * Copy-out routine for the list ioctl.  The running device count is written
+ * to the start of the user structure; unless this was a count-only request,
+ * all otli_nents entry slots are then written to the user's entry array.
+ * Returns EFAULT if either copy fails.
+ */
+/* ARGSUSED */
+static int
+overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
+{
+	overlay_targ_list_int_t *list = buf;
+	void *uents;
+
+	if (ddi_copyout(&list->otli_cur, ubuf, sizeof (uint32_t),
+	    flags & FKIOCTL) != 0)
+		return (EFAULT);
+
+	/* Count-only requests carry no entries to hand back. */
+	if (list->otli_count != B_FALSE)
+		return (0);
+
+	uents = (void *)((uintptr_t)ubuf +
+	    offsetof(overlay_targ_list_t, otl_ents));
+	if (ddi_copyout(list->otli_ents, uents,
+	    sizeof (uint32_t) * list->otli_nents, flags & FKIOCTL) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+/*
+ * ioctl handler: look up a target cache entry.
+ *
+ * For a point target the single destination is returned.  For a dynamic
+ * target the entry for otce_mac is returned if it exists and has any of the
+ * OVERLAY_ENTRY_F_VALID_MASK flags set; an entry marked DROP is reported
+ * with OVERLAY_TARGET_CACHE_DROP instead of a destination.
+ *
+ * Returns ENOENT if the device or a usable entry does not exist, ENXIO if
+ * the device has no varpd association and ENOTSUP for other target modes.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
+{
+ int ret = 0;
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_targ_cache_t *otc = arg;
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_POINT &&
+ ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ /* Take the target lock before letting go of the device lock. */
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
+ } else {
+ overlay_target_entry_t *ote;
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote != NULL) {
+ mutex_enter(&ote->ote_lock);
+ if ((ote->ote_flags &
+ OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+ if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_DROP;
+ } else {
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ote->ote_dest,
+ &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
+ }
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+ mutex_exit(&ote->ote_lock);
+ } else {
+ ret = ENOENT;
+ }
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (ret);
+}
+
+/*
+ * ioctl handler: create or update the cache entry for otce_mac on a dynamic
+ * target.
+ *
+ * With OVERLAY_TARGET_CACHE_DROP the entry is marked DROP; otherwise it is
+ * marked VALID, given the supplied destination, and any packets queued on it
+ * are transmitted.
+ *
+ * Returns EINVAL for unknown flags, ENOENT if the device does not exist,
+ * ENXIO if it has no varpd association and ENOTSUP if the target is not
+ * dynamic.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t *ote;
+ overlay_targ_cache_t *otc = arg;
+ mblk_t *mp = NULL;
+
+ if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+ return (EINVAL);
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote == NULL) {
+ /*
+ * NOTE(review): this KM_SLEEP allocation happens with ott_lock held --
+ * confirm that is acceptable.
+ */
+ ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
+ bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
+ ote->ote_chead = ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_ott = ott;
+ ote->ote_odd = odd;
+ mutex_enter(&ote->ote_lock);
+ refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
+ } else {
+ mutex_enter(&ote->ote_lock);
+ }
+
+ /*
+ * NOTE(review): each branch only sets its own flag and never clears the
+ * other.  overlay_target_lookup() tests DROP before VALID, so marking an
+ * entry VALID after it was marked DROP appears to leave it dropping --
+ * confirm this is intended.
+ */
+ if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
+ ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
+ } else {
+ ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
+ sizeof (overlay_target_point_t));
+ /* Detach the queued packets so they can be sent without the locks. */
+ mp = ote->ote_chead;
+ ote->ote_chead = NULL;
+ ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_vtime = gethrtime();
+ }
+
+ mutex_exit(&ote->ote_lock);
+ mutex_exit(&ott->ott_lock);
+
+ if (mp != NULL) {
+ mp = overlay_m_tx(ote->ote_odd, mp);
+ freemsgchain(mp);
+ }
+
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+/*
+ * ioctl handler: invalidate the cache entry for otce_mac on a dynamic target
+ * by clearing its OVERLAY_ENTRY_F_VALID_MASK flags.  The entry itself stays
+ * in the hash and tree.
+ *
+ * Returns ENOENT if the device or entry does not exist, ENXIO if the device
+ * has no varpd association and ENOTSUP if the target is not dynamic.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
+{
+ int ret = 0;
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t *ote;
+ overlay_targ_cache_t *otc = arg;
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ /* Take the target lock before letting go of the device lock. */
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote != NULL) {
+ mutex_enter(&ote->ote_lock);
+ ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+ mutex_exit(&ote->ote_lock);
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (ret);
+}
+
+/*
+ * ioctl handler: invalidate every cache entry of a dynamic target by
+ * clearing the OVERLAY_ENTRY_F_VALID_MASK flags on each one.  The entries
+ * themselves stay in the hash and tree.
+ *
+ * Returns ENOENT if the device does not exist, ENXIO if it has no varpd
+ * association and ENOTSUP if the target is not dynamic.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
+{
+	avl_tree_t *avl;
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_target_entry_t *ote;
+	overlay_targ_cache_t *otc = arg;
+
+	odd = overlay_hold_by_dlid(otc->otc_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENXIO);
+	}
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	/* Take the target lock before letting go of the device lock. */
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+	avl = &ott->ott_u.ott_dyn.ott_tree;
+
+	/*
+	 * The tree holds every entry, so walking it covers the whole cache.
+	 * (A per-MAC hash lookup that used to follow this loop has been
+	 * dropped: its result was never used.)
+	 */
+	for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
+		mutex_enter(&ote->ote_lock);
+		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+		mutex_exit(&ote->ote_lock);
+	}
+
+	mutex_exit(&ott->ott_lock);
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+/*
+ * Copy-in routine for OVERLAY_TARG_CACHE_ITER.
+ *
+ * Reads the fixed-size iterator header from ubuf, validates the requested
+ * entry count, and allocates a kernel buffer large enough for the header plus
+ * otci_count entries. The buffer and its size are returned through outp and
+ * bsize; overlay_target_ioctl() frees it with kmem_free() once the ioctl has
+ * been processed.
+ *
+ * The buffer is zero-filled: the copy-out routine exports the header and the
+ * entry array to userland, and not every byte of an entry (structure padding,
+ * the destination of entries without a valid destination) is written by the
+ * iterator, so uninitialised kernel heap must never be in there.
+ *
+ * Returns EFAULT if the header cannot be copied in, E2BIG if otci_count
+ * exceeds OVERLAY_TARGET_ITER_MAX, EINVAL if otci_count is zero, 0 otherwise.
+ */
+static int
+overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
+    int flags)
+{
+	overlay_targ_cache_iter_t base, *iter;
+
+	if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
+	    flags & FKIOCTL) != 0)
+		return (EFAULT);
+
+	if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
+		return (E2BIG);
+
+	if (base.otci_count == 0)
+		return (EINVAL);
+
+	*bsize = sizeof (overlay_targ_cache_iter_t) +
+	    base.otci_count * sizeof (overlay_targ_cache_entry_t);
+	iter = kmem_zalloc(*bsize, KM_SLEEP);
+	bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
+	*outp = iter;
+
+	return (0);
+}
+
+/*
+ * Iteration cursor for OVERLAY_TARG_CACHE_ITER. overlay_target_cache_iter()
+ * casts the request's otci_marker field to this layout, so the cursor travels
+ * to userland and back between calls.
+ */
+typedef struct overlay_targ_cache_marker {
+ /* MAC address at which the next call resumes its walk of the tree */
+ uint8_t otcm_mac[ETHERADDRL];
+ /* non-zero once the iteration has finished; later calls return 0 entries */
+ uint16_t otcm_done;
+} overlay_targ_cache_marker_t;
+
+/*
+ * OVERLAY_TARG_CACHE_ITER handler: copy up to otci_count cache entries into
+ * the request buffer, resuming from the cursor stored in otci_marker.
+ *
+ * On return otci_count holds the number of entries written. When the walk has
+ * reached the end of the cache the cursor's otcm_done is set, and any later
+ * call with that cursor returns zero entries.
+ *
+ * A point target reports exactly one entry (all-zero MAC, the point
+ * destination) and is then done. A dynamic target reports every entry whose
+ * valid-mask bits are set, in AVL tree order.
+ *
+ * Returns ENOENT if the link does not exist, ENXIO if the device does not
+ * have OVERLAY_F_VARPD set, ENOTSUP for targets that are neither dynamic nor
+ * point, and 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_target_t *ott;
+	overlay_target_entry_t lookup, *ent;
+	overlay_targ_cache_marker_t *mark;
+	avl_index_t where;
+	avl_tree_t *avl;
+	uint16_t written = 0;
+
+	overlay_targ_cache_iter_t *iter = arg;
+	mark = (void *)&iter->otci_marker;
+
+	if (mark->otcm_done != 0) {
+		iter->otci_count = 0;
+		return (0);
+	}
+
+	odd = overlay_hold_by_dlid(iter->otci_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENXIO);
+	}
+	ott = odd->odd_target;
+	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
+	    ott->ott_mode != OVERLAY_TARGET_POINT) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+
+	/*
+	 * Holding this lock across the entire iteration probably isn't very
+	 * good. We should perhaps add an r/w lock for the avl tree. But we'll
+	 * wait until we know it's necessary before we do more.
+	 */
+	mutex_enter(&ott->ott_lock);
+	mutex_exit(&odd->odd_lock);
+
+	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+		overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
+		bzero(out->otce_mac, ETHERADDRL);
+		out->otce_flags = 0;
+		bcopy(&ott->ott_u.ott_point, &out->otce_dest,
+		    sizeof (overlay_target_point_t));
+		written++;
+		mark->otcm_done = 1;
+		/*
+		 * A point target has a single destination and nothing to
+		 * walk, so stop here rather than falling into the tree walk
+		 * below. NOTE(review): ott_u looks like a union of the point
+		 * and dynamic state, in which case touching ott_dyn in this
+		 * mode would read the point data as an AVL tree -- confirm
+		 * against overlay_target_t.
+		 */
+		goto done;
+	}
+
+	avl = &ott->ott_u.ott_dyn.ott_tree;
+	bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
+	ent = avl_find(avl, &lookup, &where);
+
+	/*
+	 * NULL ent means that the entry does not exist, so we want to start
+	 * with the closest node in the tree. This means that we implicitly rely
+	 * on the tree's order and the first node will be the mac
+	 * 00:00:00:00:00:00 and the last will be ff:ff:ff:ff:ff:ff.
+	 */
+	if (ent == NULL) {
+		ent = avl_nearest(avl, where, AVL_AFTER);
+		if (ent == NULL) {
+			mark->otcm_done = 1;
+			goto done;
+		}
+	}
+
+	for (; ent != NULL && written < iter->otci_count;
+	    ent = AVL_NEXT(avl, ent)) {
+		overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
+		mutex_enter(&ent->ote_lock);
+		/* Entries with no valid-mask bit set are not reported. */
+		if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
+			mutex_exit(&ent->ote_lock);
+			continue;
+		}
+		bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
+		out->otce_flags = 0;
+		if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
+			out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
+		if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) {
+			bcopy(&ent->ote_dest, &out->otce_dest,
+			    sizeof (overlay_target_point_t));
+		} else {
+			/*
+			 * No destination to report; make sure stale buffer
+			 * contents are not handed back to userland.
+			 */
+			bzero(&out->otce_dest,
+			    sizeof (overlay_target_point_t));
+		}
+		written++;
+		mutex_exit(&ent->ote_lock);
+	}
+
+	/*
+	 * If the buffer filled up first, ent is the first entry that has not
+	 * been reported yet: remember its MAC so the next call resumes there.
+	 */
+	if (ent != NULL) {
+		bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
+	} else {
+		mark->otcm_done = 1;
+	}
+
+done:
+	iter->otci_count = written;
+	mutex_exit(&ott->ott_lock);
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+/*
+ * Copy-out routine for OVERLAY_TARG_CACHE_ITER. Only the iterator header and
+ * the otci_count entries recorded in it are copied to ubuf; bufsize (the size
+ * of the whole kernel buffer) is not used.
+ *
+ * Returns EFAULT if the copy to userland fails, 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
+    int flags)
+{
+	const overlay_targ_cache_iter_t *hdr = buf;
+	size_t nbytes = sizeof (overlay_targ_cache_iter_t);
+
+	nbytes += hdr->otci_count * sizeof (overlay_targ_cache_entry_t);
+
+	if (ddi_copyout(buf, ubuf, nbytes, flags & FKIOCTL) == 0)
+		return (0);
+
+	return (EFAULT);
+}
+
+/*
+ * Dispatch table consulted by overlay_target_ioctl(). The table ends with an
+ * all-zero entry; the lookup stops at the first entry whose command is 0.
+ *
+ * overlay_target_ioctl() reads these members of each entry:
+ *   oti_cmd      - ioctl command the entry handles
+ *   oti_write    - when B_TRUE the caller must have the device open FWRITE
+ *   oti_ncopyout - when B_TRUE the buffer is copied back out on success
+ *   oti_copyin   - optional custom copy-in; NULL means a plain ddi_copyin()
+ *                  of oti_size bytes
+ *   oti_func     - the handler itself
+ *   oti_copyout  - optional custom copy-out; NULL means a plain ddi_copyout()
+ *   oti_size     - size of the fixed request structure
+ *
+ * NOTE(review): the initializers are positional and presumably follow the
+ * order listed above (the two booleans being oti_write then oti_ncopyout) --
+ * confirm against the definition of overlay_target_ioctl_t.
+ */
+static overlay_target_ioctl_t overlay_target_ioctab[] = {
+ { OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
+ NULL, overlay_target_info,
+ NULL, sizeof (overlay_targ_info_t) },
+ { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
+ NULL, overlay_target_associate,
+ NULL, sizeof (overlay_targ_associate_t) },
+ { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
+ NULL, overlay_target_disassociate,
+ NULL, sizeof (overlay_targ_id_t) },
+ { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
+ NULL, overlay_target_degrade,
+ NULL, sizeof (overlay_targ_degrade_t) },
+ { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
+ NULL, overlay_target_restore,
+ NULL, sizeof (overlay_targ_id_t) },
+ { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
+ NULL, overlay_target_lookup_request,
+ NULL, sizeof (overlay_targ_lookup_t) },
+ { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
+ NULL, overlay_target_lookup_respond,
+ NULL, sizeof (overlay_targ_resp_t) },
+ { OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
+ NULL, overlay_target_lookup_drop,
+ NULL, sizeof (overlay_targ_resp_t) },
+ { OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
+ overlay_target_pkt_copyin,
+ overlay_target_packet,
+ overlay_target_pkt_copyout,
+ sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
+ overlay_target_pkt_copyin,
+ overlay_target_inject,
+ NULL, sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
+ overlay_target_pkt_copyin,
+ overlay_target_resend,
+ NULL, sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
+ overlay_target_list_copyin,
+ overlay_target_ioctl_list,
+ overlay_target_list_copyout,
+ sizeof (overlay_targ_list_t) },
+ { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
+ NULL, overlay_target_cache_get,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_set,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_remove,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_flush,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
+ overlay_target_cache_iter_copyin,
+ overlay_target_cache_iter,
+ overlay_target_cache_iter_copyout,
+ sizeof (overlay_targ_cache_iter_t) },
+ { 0 }
+};
+
+/*
+ * open(9E) entry point for the overlay target device.
+ *
+ * Only minor 0 may be opened; each successful open is given a freshly
+ * allocated minor with its own overlay_target_hdl_t soft state, and *devp is
+ * rewritten to that minor. *devp is left untouched on failure.
+ *
+ * Returns:
+ *   EPERM   the caller fails secpolicy_dl_config() or is not in the global
+ *           zone
+ *   ENXIO   a minor other than 0 was opened, or soft state allocation failed
+ *   EINVAL  block-device open, flags other than FREAD/FWRITE/FEXCL, FWRITE
+ *           without FEXCL, or neither FREAD nor FWRITE
+ *   EEXIST  FEXCL was requested while another exclusive open exists
+ *   0       success
+ */
+int
+overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
+{
+	minor_t mid;
+	overlay_target_hdl_t *thdl;
+
+	if (secpolicy_dl_config(credp) != 0)
+		return (EPERM);
+
+	if (getminor(*devp) != 0)
+		return (ENXIO);
+
+	/*
+	 * The OTYP_* open types are enumerated values rather than bit flags
+	 * (OTYP_BLK is 0), so this has to be an equality test; masking with
+	 * OTYP_BLK can never be true.
+	 */
+	if (otype == OTYP_BLK)
+		return (EINVAL);
+
+	if (flags & ~(FREAD | FWRITE | FEXCL))
+		return (EINVAL);
+
+	/* Writers must be exclusive. */
+	if ((flags & FWRITE) &&
+	    !(flags & FEXCL))
+		return (EINVAL);
+
+	if (!(flags & FREAD) && !(flags & FWRITE))
+		return (EINVAL);
+
+	if (crgetzoneid(credp) != GLOBAL_ZONEID)
+		return (EPERM);
+
+	mid = id_alloc(overlay_thdl_idspace);
+	if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
+		id_free(overlay_thdl_idspace, mid);
+		return (ENXIO);
+	}
+
+	thdl = ddi_get_soft_state(overlay_thdl_state, mid);
+	VERIFY(thdl != NULL);
+	thdl->oth_minor = mid;
+	thdl->oth_zoneid = crgetzoneid(credp);
+	thdl->oth_oflags = flags;
+	mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
+	list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
+	    offsetof(overlay_target_entry_t, ote_qlink));
+
+	mutex_enter(&overlay_target_lock);
+	if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
+		/* Somebody else already holds the exclusive open; undo. */
+		mutex_exit(&overlay_target_lock);
+		list_destroy(&thdl->oth_outstanding);
+		mutex_destroy(&thdl->oth_lock);
+		ddi_soft_state_free(overlay_thdl_state, mid);
+		id_free(overlay_thdl_idspace, mid);
+		return (EEXIST);
+	} else if ((flags & FEXCL) != 0) {
+		VERIFY(overlay_target_excl == B_FALSE);
+		overlay_target_excl = B_TRUE;
+	}
+	list_insert_tail(&overlay_thdl_list, thdl);
+	mutex_exit(&overlay_target_lock);
+
+	/* Hand the per-open minor back only once the open has succeeded. */
+	*devp = makedevice(getmajor(*devp), mid);
+
+	return (0);
+}
+
+/*
+ * ioctl(9E) entry point for the overlay target device.
+ *
+ * The command is looked up in overlay_target_ioctab. The request is copied in
+ * (with the entry's own copy-in routine when it has one, otherwise a plain
+ * ddi_copyin() of oti_size bytes), passed to the entry's handler and, when
+ * the handler succeeds and the entry asks for it, copied back out. The kernel
+ * buffer is always freed before returning.
+ *
+ * Returns EPERM if the caller fails secpolicy_dl_config(), ENXIO if the minor
+ * has no soft state, ENOTTY for an unknown command, EBADF if the command
+ * needs FWRITE and the descriptor lacks it, EFAULT on copy failures, and
+ * otherwise whatever the copy-in routine, handler or copy-out routine
+ * returns.
+ */
+/* ARGSUSED */
+int
+overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	overlay_target_ioctl_t *ent;
+	overlay_target_hdl_t *thdl;
+	void *uaddr = (void *)(uintptr_t)arg;
+	caddr_t kbuf;
+	size_t klen;
+	int err;
+
+	if (secpolicy_dl_config(credp) != 0)
+		return (EPERM);
+
+	thdl = ddi_get_soft_state(overlay_thdl_state, getminor(dev));
+	if (thdl == NULL)
+		return (ENXIO);
+
+	/* Walk the table until the command or the zero terminator is hit. */
+	ent = &overlay_target_ioctab[0];
+	while (ent->oti_cmd != 0 && ent->oti_cmd != cmd)
+		ent++;
+	if (ent->oti_cmd == 0)
+		return (ENOTTY);
+
+	if (ent->oti_write == B_TRUE && (mode & FWRITE) == 0)
+		return (EBADF);
+
+	if (ent->oti_copyin != NULL) {
+		/* The custom routine allocates kbuf and reports its size. */
+		err = ent->oti_copyin(uaddr, (void **)&kbuf, &klen, mode);
+		if (err != 0)
+			return (err);
+	} else {
+		klen = ent->oti_size;
+		kbuf = kmem_alloc(klen, KM_SLEEP);
+		if (ddi_copyin(uaddr, kbuf, klen, mode & FKIOCTL) != 0) {
+			kmem_free(kbuf, klen);
+			return (EFAULT);
+		}
+	}
+
+	err = ent->oti_func(thdl, kbuf);
+	if (err == 0 && ent->oti_size != 0 && ent->oti_ncopyout == B_TRUE) {
+		if (ent->oti_copyout != NULL) {
+			err = ent->oti_copyout(uaddr, kbuf, klen, mode);
+		} else if (ddi_copyout(kbuf, uaddr, klen,
+		    mode & FKIOCTL) != 0) {
+			err = EFAULT;
+		}
+	}
+
+	kmem_free(kbuf, klen);
+	return (err);
+}
+
+/*
+ * close(9E) entry point for the overlay target device. Tears down the
+ * per-open handle created by overlay_target_open().
+ *
+ * Returns ENXIO if the minor has no soft state, 0 otherwise.
+ */
+/* ARGSUSED */
+int
+overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
+{
+ overlay_target_hdl_t *thdl;
+ overlay_target_entry_t *entry;
+ minor_t mid = getminor(dev);
+
+ if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
+ return (ENXIO);
+
+ /* overlay_target_lock is taken first, then the handle's own lock. */
+ mutex_enter(&overlay_target_lock);
+ list_remove(&overlay_thdl_list, thdl);
+ mutex_enter(&thdl->oth_lock);
+ /*
+ * Entries still outstanding on this handle go back onto the tail of
+ * the global overlay_target_list, and a waiter on overlay_target_condvar
+ * is woken -- presumably so another handle can pick them up.
+ */
+ while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
+ list_insert_tail(&overlay_target_list, entry);
+ cv_signal(&overlay_target_condvar);
+ mutex_exit(&thdl->oth_lock);
+ /* Give up the exclusive-open slot if this handle owned it. */
+ if ((thdl->oth_oflags & FEXCL) != 0) {
+ VERIFY(overlay_target_excl == B_TRUE);
+ overlay_target_excl = B_FALSE;
+ }
+ mutex_exit(&overlay_target_lock);
+
+ /* The handle is off every list now; release its resources. */
+ list_destroy(&thdl->oth_outstanding);
+ mutex_destroy(&thdl->oth_lock);
+ mid = thdl->oth_minor;
+ ddi_soft_state_free(overlay_thdl_state, mid);
+ id_free(overlay_thdl_idspace, mid);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
new file mode 100644
index 0000000000..92144b3985
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
@@ -0,0 +1,394 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * VXLAN encapsulation module
+ *
+ *
+ * The VXLAN header looks as follows in network byte order:
+ *
+ * |0 3| 4 |5 31|
+ * +----------+---+------------------------+
+ * | Reserved | I | Reserved |
+ * +---------------------------------------+
+ * | Virtual Network ID | Reserved |
+ * +----------------------------+----------+
+ * |0 23|24 31|
+ *
+ * All reserved values must be 0. The I bit must be 1. We call the top
+ * word the VXLAN magic field for the time being. The second word is
+ * definitely not the most friendly way to operate. Specifically, the ID
+ * is a 24-bit big endian value, but we have to make sure not to use the
+ * reserved byte.
+ *
+ * For us, VXLAN encapsulation is a fairly straightforward implementation. It
+ * only has two properties, a listen_ip and a listen_port. These determine on
+ * what address we should be listening on. While we do not have a default
+ * address to listen upon, we do have a default port, which is the IANA assigned
+ * port for VXLAN -- 4789.
+ */
+
+#include <sys/overlay_plugin.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/byteorder.h>
+#include <sys/vxlan.h>
+#include <inet/ip.h>
+#include <netinet/in.h>
+#include <sys/strsun.h>
+#include <netinet/udp.h>
+
+/* Name under which the plugin is registered and unregistered. */
+static const char *vxlan_ident = "vxlan";
+/* Listen port used until vxlan/listen_port is set. */
+static uint16_t vxlan_defport = IPPORT_VXLAN;
+
+/*
+ * Should we enable UDP source port hashing for fanout.
+ */
+boolean_t vxlan_fanout = B_TRUE;
+
+/*
+ * This represents the size in bytes that we want to allocate when allocating a
+ * vxlan header block. This is intended such that lower levels can try and use
+ * the message block that we allocate for the IP and UDP header. The hope is
+ * that even if this is tunneled, that this is enough space.
+ *
+ * The vxlan_noalloc_min value represents the minimum amount of space we need to
+ * consider not allocating a message block and just passing it down the stack in
+ * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet
+ * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header.
+ */
+uint_t vxlan_alloc_size = 128;
+uint_t vxlan_noalloc_min = 54;
+
+/*
+ * Property names, NULL terminated. The property routines refer to these by
+ * index: [0] is the listen address, [1] is the listen port.
+ */
+static const char *vxlan_props[] = {
+ "vxlan/listen_ip",
+ "vxlan/listen_port",
+ NULL
+};
+
+/*
+ * Per-instance plugin state, allocated by vxlan_o_init() and released by
+ * vxlan_o_fini(). vxl_lock is held around every access to the listen port and
+ * listen address fields.
+ */
+typedef struct vxlan {
+ kmutex_t vxl_lock;
+ /* handle passed to vxlan_o_init() */
+ overlay_handle_t vxl_oh;
+ /* listen port in host byte order; starts as vxlan_defport */
+ uint16_t vxl_lport;
+ /* B_TRUE once vxlan/listen_ip has been set and vxl_laddr is meaningful */
+ boolean_t vxl_hladdr;
+ /* listen address */
+ struct in6_addr vxl_laddr;
+} vxlan_t;
+
+/*
+ * Plugin init entry point: allocate the per-instance state for the overlay
+ * handle oh and hand it back through outp. The instance starts out with the
+ * default listen port and no listen address. Always returns 0.
+ */
+static int
+vxlan_o_init(overlay_handle_t oh, void **outp)
+{
+	vxlan_t *state = kmem_alloc(sizeof (vxlan_t), KM_SLEEP);
+
+	mutex_init(&state->vxl_lock, NULL, MUTEX_DRIVER, NULL);
+	state->vxl_hladdr = B_FALSE;
+	state->vxl_lport = vxlan_defport;
+	state->vxl_oh = oh;
+
+	*outp = state;
+	return (0);
+}
+
+/*
+ * Plugin fini entry point: destroy the lock of the instance state created by
+ * vxlan_o_init() and free the state itself.
+ */
+static void
+vxlan_o_fini(void *arg)
+{
+	vxlan_t *state = arg;
+
+	mutex_destroy(&state->vxl_lock);
+	kmem_free(state, sizeof (vxlan_t));
+}
+
+/*
+ * Plugin socket entry point: describe the socket this instance listens on.
+ *
+ * *dp, *fp and *pp are always set to AF_INET6, SOCK_DGRAM and 0, and addr is
+ * always reset to an empty AF_INET6 sockaddr_in6. When a listen address has
+ * been configured the listen port and address are filled in as well and
+ * *slenp is set to the size of a sockaddr_in6.
+ *
+ * Returns EINVAL if no listen address has been set yet, 0 otherwise.
+ */
+static int
+vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr,
+    socklen_t *slenp)
+{
+	vxlan_t *state = arg;
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+	int err = 0;
+
+	*dp = AF_INET6;
+	*fp = SOCK_DGRAM;
+	*pp = 0;
+
+	bzero(sin6, sizeof (struct sockaddr_in6));
+	sin6->sin6_family = AF_INET6;
+
+	/*
+	 * We should consider a more expressive private errno set that
+	 * providers can use.
+	 */
+	mutex_enter(&state->vxl_lock);
+	if (state->vxl_hladdr == B_FALSE) {
+		err = EINVAL;
+	} else {
+		sin6->sin6_port = htons(state->vxl_lport);
+		sin6->sin6_addr = state->vxl_laddr;
+	}
+	mutex_exit(&state->vxl_lock);
+
+	if (err == 0)
+		*slenp = sizeof (struct sockaddr_in6);
+
+	return (err);
+}
+
+/*
+ * Plugin sockopt entry point: when the vxlan_fanout tunable is enabled, set
+ * UDP_SRCPORT_HASH to UDP_HASH_VXLAN on the socket. Returns 0 when fanout is
+ * disabled, otherwise the result of ksocket_setsockopt().
+ */
+static int
+vxlan_o_sockopt(ksocket_t ksock)
+{
+	int hash = UDP_HASH_VXLAN;
+
+	if (vxlan_fanout == B_FALSE)
+		return (0);
+
+	return (ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH,
+	    &hash, sizeof (hash), kcred));
+}
+
+/*
+ * Plugin encap entry point: prepend a VXLAN header carrying einfop->ovdi_id
+ * to the message mp.
+ *
+ * The header is written in place in front of mp when mp's data block is not
+ * shared and has at least vxlan_noalloc_min bytes of headroom. Otherwise a
+ * new vxlan_alloc_size byte block is allocated, the header is placed at its
+ * end and mp is chained behind it. The resulting head of the message is
+ * returned through outp.
+ *
+ * Returns ENOMEM if the header block cannot be allocated, 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop,
+    mblk_t **outp)
+{
+	mblk_t *ob;
+	vxlan_hdr_t *vxh;
+
+	ASSERT(einfop->ovdi_id < (1 << 24));
+
+	/*
+	 * Compare the available headroom as an offset rather than computing
+	 * a pointer that may lie below the start of the data block.
+	 */
+	if (DB_REF(mp) != 1 ||
+	    (size_t)(mp->b_rptr - DB_BASE(mp)) < vxlan_noalloc_min) {
+		/*
+		 * This allocation could get hot. We may want to have a good
+		 * way to cache and handle this allocation the same way that IP
+		 * does with keeping around a message block per entry, or
+		 * basically treating this as an immutable message block in the
+		 * system. Basically freemsg() will be a nop, but we'll do the
+		 * right thing with respect to the rest of the chain.
+		 */
+		ob = allocb(vxlan_alloc_size, 0);
+		if (ob == NULL)
+			return (ENOMEM);
+
+		/* Start empty at the very end so headers grow downwards. */
+		ob->b_wptr = DB_LIM(ob);
+		ob->b_rptr = ob->b_wptr;
+		ob->b_cont = mp;
+	} else {
+		ob = mp;
+	}
+	ob->b_rptr -= VXLAN_HDR_LEN;
+
+	/* Both header words go on the wire in network byte order. */
+	vxh = (vxlan_hdr_t *)ob->b_rptr;
+	vxh->vxlan_flags = htonl(VXLAN_F_VDI);
+	vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT);
+	*outp = ob;
+
+	return (0);
+}
+
+/*
+ * Plugin decap entry point: parse the VXLAN header at the front of mp.
+ *
+ * On success dinfop->ovdi_id holds the virtual network id taken from the
+ * header and dinfop->ovdi_hdr_size is VXLAN_HDR_LEN. The message itself is
+ * not modified.
+ *
+ * Returns EINVAL if the first block is shorter than a VXLAN header or the
+ * VXLAN_F_VDI flag is not set, 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop)
+{
+	const vxlan_hdr_t *hdr;
+	uint32_t hflags;
+
+	if (MBLKL(mp) < sizeof (vxlan_hdr_t))
+		return (EINVAL);
+
+	hdr = (const vxlan_hdr_t *)mp->b_rptr;
+	hflags = ntohl(hdr->vxlan_flags);
+	if ((hflags & VXLAN_F_VDI) == 0)
+		return (EINVAL);
+
+	dinfop->ovdi_hdr_size = VXLAN_HDR_LEN;
+	dinfop->ovdi_id = ntohl(hdr->vxlan_id) >> VXLAN_ID_SHIFT;
+
+	return (0);
+}
+
+/*
+ * Plugin getprop entry point: copy the value of property pr_name into buf.
+ * On entry *bufsize is the size of buf; on success it is the number of bytes
+ * stored.
+ *
+ *   vxlan/listen_ip    a struct in6_addr, or zero bytes when no address has
+ *                      been set
+ *   vxlan/listen_port  a uint64_t
+ *
+ * Returns EOVERFLOW if buf is too small for the property, EINVAL for an
+ * unknown property name, 0 otherwise.
+ */
+static int
+vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize)
+{
+	vxlan_t *state = arg;
+
+	/* vxlan/listen_ip */
+	if (strcmp(pr_name, vxlan_props[0]) == 0) {
+		if (*bufsize < sizeof (struct in6_addr))
+			return (EOVERFLOW);
+
+		mutex_enter(&state->vxl_lock);
+		if (state->vxl_hladdr != B_FALSE) {
+			bcopy(&state->vxl_laddr, buf,
+			    sizeof (struct in6_addr));
+			*bufsize = sizeof (struct in6_addr);
+		} else {
+			*bufsize = 0;
+		}
+		mutex_exit(&state->vxl_lock);
+		return (0);
+	}
+
+	/* vxlan/listen_port */
+	if (strcmp(pr_name, vxlan_props[1]) == 0) {
+		uint64_t port;
+
+		if (*bufsize < sizeof (uint64_t))
+			return (EOVERFLOW);
+
+		mutex_enter(&state->vxl_lock);
+		port = state->vxl_lport;
+		mutex_exit(&state->vxl_lock);
+
+		bcopy(&port, buf, sizeof (uint64_t));
+		*bufsize = sizeof (uint64_t);
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Plugin setprop entry point: set property pr_name from the bufsize bytes at
+ * buf.
+ *
+ *   vxlan/listen_ip    a struct in6_addr. IPv4-compatible, multicast and 6to4
+ *                      addresses are rejected, as are IPv4-mapped addresses
+ *                      whose IPv4 part is multicast.
+ *   vxlan/listen_port  a uint64_t in the range 1 to UINT16_MAX.
+ *
+ * Returns EINVAL for an unknown property, a wrongly sized value or a value
+ * that fails the checks above; 0 otherwise.
+ */
+static int
+vxlan_o_setprop(void *arg, const char *pr_name, const void *buf,
+    uint32_t bufsize)
+{
+	vxlan_t *vxl = arg;
+
+	/* vxlan/listen_ip */
+	if (strcmp(pr_name, vxlan_props[0]) == 0) {
+		const struct in6_addr *ipv6 = buf;
+		if (bufsize != sizeof (struct in6_addr))
+			return (EINVAL);
+
+		if (IN6_IS_ADDR_V4COMPAT(ipv6))
+			return (EINVAL);
+
+		if (IN6_IS_ADDR_MULTICAST(ipv6))
+			return (EINVAL);
+
+		if (IN6_IS_ADDR_6TO4(ipv6))
+			return (EINVAL);
+
+		if (IN6_IS_ADDR_V4MAPPED(ipv6)) {
+			ipaddr_t v4;
+			IN6_V4MAPPED_TO_IPADDR(ipv6, v4);
+			if (IN_MULTICAST(v4))
+				return (EINVAL);
+		}
+
+		mutex_enter(&vxl->vxl_lock);
+		vxl->vxl_hladdr = B_TRUE;
+		bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr));
+		mutex_exit(&vxl->vxl_lock);
+
+		return (0);
+	}
+
+	/* vxlan/listen_port */
+	if (strcmp(pr_name, vxlan_props[1]) == 0) {
+		uint64_t val;
+
+		if (bufsize != sizeof (uint64_t))
+			return (EINVAL);
+
+		/*
+		 * Copy the value out instead of dereferencing buf as a
+		 * uint64_t: nothing here guarantees buf's alignment, and this
+		 * mirrors how vxlan_o_getprop() stores the value.
+		 */
+		bcopy(buf, &val, sizeof (uint64_t));
+		if (val == 0 || val > UINT16_MAX)
+			return (EINVAL);
+
+		mutex_enter(&vxl->vxl_lock);
+		vxl->vxl_lport = (uint16_t)val;
+		mutex_exit(&vxl->vxl_lock);
+		return (0);
+	}
+	return (EINVAL);
+}
+
+/*
+ * Plugin propinfo entry point: describe property pr_name on the property
+ * handle phdl (name, permissions, type, and default or range where there is
+ * one). Returns EINVAL for an unknown property name, 0 otherwise.
+ */
+static int
+vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl)
+{
+ /* vxlan/listen_ip */
+ if (strcmp(pr_name, vxlan_props[0]) == 0) {
+ overlay_prop_set_name(phdl, vxlan_props[0]);
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP);
+ overlay_prop_set_nodefault(phdl);
+ return (0);
+ }
+
+ /* vxlan/listen_port */
+ if (strcmp(pr_name, vxlan_props[1]) == 0) {
+ overlay_prop_set_name(phdl, vxlan_props[1]);
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ /*
+ * NOTE(review): the default is handed over as a 2-byte uint16_t, while
+ * vxlan_o_getprop() and vxlan_o_setprop() exchange this property as a
+ * uint64_t -- confirm overlay_prop_set_default() and its consumers cope
+ * with the narrower size.
+ */
+ (void) overlay_prop_set_default(phdl, &vxlan_defport,
+ sizeof (vxlan_defport));
+ overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX);
+ return (0);
+ }
+
+ return (EINVAL);
+}
+
+/*
+ * Entry points handed to the overlay framework through ovep_ops in _init().
+ * The initializer is positional. NOTE(review): the meaning of the leading 0
+ * is not visible here -- confirm against struct overlay_plugin_ops in
+ * <sys/overlay_plugin.h>.
+ */
+static struct overlay_plugin_ops vxlan_o_ops = {
+ 0,
+ vxlan_o_init,
+ vxlan_o_fini,
+ vxlan_o_encap,
+ vxlan_o_decap,
+ vxlan_o_socket,
+ vxlan_o_sockopt,
+ vxlan_o_getprop,
+ vxlan_o_setprop,
+ vxlan_o_propinfo
+};
+
+/* Module linkage: this is a misc module with no driver entry points. */
+static struct modlmisc vxlan_modlmisc = {
+ &mod_miscops,
+ "VXLAN encap plugin"
+};
+
+static struct modlinkage vxlan_modlinkage = {
+ MODREV_1,
+ &vxlan_modlmisc
+};
+
+/*
+ * Module load entry point: register the plugin with the overlay framework
+ * and then install the module. If the install fails the plugin is
+ * unregistered again. The registration structure is only needed for the
+ * duration of the call and is freed on every path.
+ *
+ * Returns ENOTSUP if the registration structure cannot be allocated for
+ * OVEP_VERSION, otherwise the result of registration or mod_install().
+ */
+int
+_init(void)
+{
+	overlay_plugin_register_t *reg;
+	int ret;
+
+	if ((reg = overlay_plugin_alloc(OVEP_VERSION)) == NULL)
+		return (ENOTSUP);
+
+	reg->ovep_name = vxlan_ident;
+	reg->ovep_ops = &vxlan_o_ops;
+	reg->ovep_id_size = VXLAN_ID_LEN;
+	reg->ovep_flags = OVEP_F_VLAN_TAG;
+	reg->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
+	reg->ovep_props = vxlan_props;
+
+	ret = overlay_plugin_register(reg);
+	if (ret == 0) {
+		ret = mod_install(&vxlan_modlinkage);
+		if (ret != 0)
+			(void) overlay_plugin_unregister(vxlan_ident);
+	}
+
+	overlay_plugin_free(reg);
+	return (ret);
+}
+
+/* Module info entry point: report on this module through mod_info(). */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&vxlan_modlinkage, modinfop));
+}
+
+/*
+ * Module unload entry point: unregister the plugin, then remove the module.
+ * If the plugin cannot be unregistered its error is returned and the module
+ * stays installed.
+ *
+ * NOTE(review): if mod_remove() fails after the plugin has been unregistered,
+ * the module stays loaded without a registered plugin -- confirm whether the
+ * plugin needs to be registered again on that path.
+ */
+int
+_fini(void)
+{
+ int err;
+
+ if ((err = overlay_plugin_unregister(vxlan_ident)) != 0)
+ return (err);
+
+ return (mod_remove(&vxlan_modlinkage));
+}
diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h
index 1f732c2f65..8b4358cb8d 100644
--- a/usr/src/uts/common/netinet/in.h
+++ b/usr/src/uts/common/netinet/in.h
@@ -3,6 +3,7 @@
* Use is subject to license terms.
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
*/
/*
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index c05f5ca58f..ed4fcbef5a 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -436,6 +436,9 @@ CHKHDRS= \
ontrap.h \
open.h \
openpromio.h \
+ overlay.h \
+ overlay_common.h \
+ overlay_target.h \
panic.h \
param.h \
pathconf.h \
diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h
index 2f519a8eda..093a4dc0c3 100644
--- a/usr/src/uts/common/sys/dld_ioc.h
+++ b/usr/src/uts/common/sys/dld_ioc.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SYS_DLD_IOC_H
@@ -59,6 +60,7 @@ extern "C" {
#define IPTUN_IOC 0x454A
#define BRIDGE_IOC 0xB81D
#define IBPART_IOC 0x6171
+#define OVERLAY_IOC 0x2005
/* GLDv3 modules use these macros to generate unique ioctl commands */
#define DLDIOC(cmdid) DLD_IOC_CMD(DLD_IOC, (cmdid))
@@ -68,6 +70,7 @@ extern "C" {
#define IPTUNIOC(cmdid) DLD_IOC_CMD(IPTUN_IOC, (cmdid))
#define BRIDGEIOC(cmdid) DLD_IOC_CMD(BRIDGE_IOC, (cmdid))
#define IBPARTIOC(cmdid) DLD_IOC_CMD(IBPART_IOC, (cmdid))
+#define OVERLAYIOC(cmdid) DLD_IOC_CMD(OVERLAY_IOC, (cmdid))
#ifdef _KERNEL
diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h
index e2893a2295..b60e53b267 100644
--- a/usr/src/uts/common/sys/dls_mgmt.h
+++ b/usr/src/uts/common/sys/dls_mgmt.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _DLS_MGMT_H
@@ -46,13 +47,15 @@ typedef enum {
DATALINK_CLASS_SIMNET = 0x20,
DATALINK_CLASS_BRIDGE = 0x40,
DATALINK_CLASS_IPTUN = 0x80,
- DATALINK_CLASS_PART = 0x100
+ DATALINK_CLASS_PART = 0x100,
+ DATALINK_CLASS_OVERLAY = 0x200
} datalink_class_t;
#define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \
DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \
DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \
- DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART)
+ DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \
+ DATALINK_CLASS_OVERLAY)
/*
* A combination of flags and media.
diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h
index 965dca263c..01cb27644c 100644
--- a/usr/src/uts/common/sys/mac_client_priv.h
+++ b/usr/src/uts/common/sys/mac_client_priv.h
@@ -188,6 +188,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t);
extern void *mac_get_devinfo(mac_handle_t);
extern boolean_t mac_is_vnic(mac_handle_t);
+extern boolean_t mac_is_overlay(mac_handle_t);
extern uint32_t mac_no_notification(mac_handle_t);
extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t);
diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h
index da645ad382..21f2c10a8e 100644
--- a/usr/src/uts/common/sys/mac_impl.h
+++ b/usr/src/uts/common/sys/mac_impl.h
@@ -609,6 +609,7 @@ struct mac_impl_s {
#define MIS_LEGACY 0x0040
#define MIS_NO_ACTIVE 0x0080
#define MIS_POLL_DISABLE 0x0100
+#define MIS_IS_OVERLAY 0x0200
#define mi_getstat mi_callbacks->mc_getstat
#define mi_start mi_callbacks->mc_start
diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h
index 2cb326814a..431de67ff5 100644
--- a/usr/src/uts/common/sys/mac_provider.h
+++ b/usr/src/uts/common/sys/mac_provider.h
@@ -109,7 +109,8 @@ typedef enum {
MAC_CAPAB_NO_ZCOPY = 0x00100000, /* boolean only, no data */
MAC_CAPAB_LEGACY = 0x00200000, /* data is mac_capab_legacy_t */
MAC_CAPAB_VRRP = 0x00400000, /* data is mac_capab_vrrp_t */
- MAC_CAPAB_TRANSCEIVER = 0x01000000, /* mac_capab_transciever_t */
+ MAC_CAPAB_OVERLAY = 0x00800000, /* boolean only, no data */
+ MAC_CAPAB_TRANSCEIVER = 0x01000000, /* mac_capab_transceiver_t */
MAC_CAPAB_LED = 0x02000000 /* data is mac_capab_led_t */
} mac_capab_t;
diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h
new file mode 100644
index 0000000000..12d0dbca51
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay.h
@@ -0,0 +1,96 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_H
+#define _SYS_OVERLAY_H
+
+/*
+ * Overlay device support
+ */
+
+#include <sys/param.h>
+#include <sys/dld_ioc.h>
+#include <sys/mac.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * ioctl commands of the overlay driver, built with the OVERLAYIOC() macro
+ * from <sys/dld_ioc.h>.
+ */
+#define OVERLAY_IOC_CREATE OVERLAYIOC(1)
+#define OVERLAY_IOC_DELETE OVERLAYIOC(2)
+#define OVERLAY_IOC_PROPINFO OVERLAYIOC(3)
+#define OVERLAY_IOC_GETPROP OVERLAYIOC(4)
+#define OVERLAY_IOC_SETPROP OVERLAYIOC(5)
+#define OVERLAY_IOC_NPROPS OVERLAYIOC(6)
+#define OVERLAY_IOC_ACTIVATE OVERLAYIOC(7)
+#define OVERLAY_IOC_STATUS OVERLAYIOC(8)
+
+/* Presumably the argument of OVERLAY_IOC_CREATE -- confirm with the driver. */
+typedef struct overlay_ioc_create {
+ datalink_id_t oic_linkid;
+ /* presumably padding so that oic_vnetid is 8-byte aligned -- confirm */
+ uint32_t oic_filler;
+ uint64_t oic_vnetid;
+ char oic_encap[MAXLINKNAMELEN];
+} overlay_ioc_create_t;
+
+/* Presumably the argument of OVERLAY_IOC_ACTIVATE -- confirm with the driver. */
+typedef struct overlay_ioc_activate {
+ datalink_id_t oia_linkid;
+} overlay_ioc_activate_t;
+
+/* Presumably the argument of OVERLAY_IOC_DELETE -- confirm with the driver. */
+typedef struct overlay_ioc_delete {
+ datalink_id_t oid_linkid;
+} overlay_ioc_delete_t;
+
+/* Presumably the argument of OVERLAY_IOC_NPROPS -- confirm with the driver. */
+typedef struct overlay_ioc_nprops {
+ datalink_id_t oipn_linkid;
+ int32_t oipn_nprops;
+} overlay_ioc_nprops_t;
+
+/*
+ * Presumably the argument of OVERLAY_IOC_PROPINFO -- confirm with the driver.
+ * The name and value buffers are sized by OVERLAY_PROP_NAMELEN and
+ * OVERLAY_PROP_SIZEMAX from <sys/overlay_common.h>.
+ */
+typedef struct overlay_ioc_propinfo {
+ datalink_id_t oipi_linkid;
+ int32_t oipi_id;
+ char oipi_name[OVERLAY_PROP_NAMELEN];
+ /* presumably an overlay_prop_type_t value -- confirm */
+ uint_t oipi_type;
+ /* presumably overlay_prop_prot_t bits -- confirm */
+ uint_t oipi_prot;
+ uint8_t oipi_default[OVERLAY_PROP_SIZEMAX];
+ /* presumably the number of bytes used in oipi_default -- confirm */
+ uint32_t oipi_defsize;
+ /* presumably the number of bytes used in oipi_poss -- confirm */
+ uint32_t oipi_posssize;
+ uint8_t oipi_poss[OVERLAY_PROP_SIZEMAX];
+} overlay_ioc_propinfo_t;
+
+/*
+ * Presumably the argument of OVERLAY_IOC_GETPROP and OVERLAY_IOC_SETPROP --
+ * confirm with the driver.
+ */
+typedef struct overlay_ioc_prop {
+ datalink_id_t oip_linkid;
+ int32_t oip_id;
+ char oip_name[OVERLAY_PROP_NAMELEN];
+ uint8_t oip_value[OVERLAY_PROP_SIZEMAX];
+ /* presumably the number of bytes used in oip_value -- confirm */
+ uint32_t oip_size;
+} overlay_ioc_prop_t;
+
+/* Presumably the values carried in ois_status of overlay_ioc_status_t. */
+typedef enum overlay_status {
+ OVERLAY_I_OK = 0x00,
+ OVERLAY_I_DEGRADED = 0x01
+} overlay_status_t;
+
+/* Presumably the argument of OVERLAY_IOC_STATUS -- confirm with the driver. */
+typedef struct overlay_ioc_status {
+ datalink_id_t ois_linkid;
+ /* presumably an overlay_status_t value -- confirm */
+ uint_t ois_status;
+ char ois_message[OVERLAY_STATUS_BUFLEN];
+} overlay_ioc_status_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_H */
diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h
new file mode 100644
index 0000000000..5c4b651f2c
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_common.h
@@ -0,0 +1,65 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_COMMON_H
+#define _SYS_OVERLAY_COMMON_H
+
+/*
+ * Common overlay definitions
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Mode of an overlay target. In overlay_target.c the cache remove and flush
+ * ioctls accept only OVERLAY_TARGET_DYNAMIC targets, while cache iteration
+ * also accepts OVERLAY_TARGET_POINT, which it reports as a single
+ * destination.
+ */
+typedef enum overlay_target_mode {
+ OVERLAY_TARGET_NONE = 0x0,
+ OVERLAY_TARGET_POINT,
+ OVERLAY_TARGET_DYNAMIC
+} overlay_target_mode_t;
+
+/*
+ * Bit flags an encapsulation plugin ORs together in ovep_dest when it
+ * registers; the VXLAN plugin uses OVERLAY_PLUGIN_D_IP |
+ * OVERLAY_PLUGIN_D_PORT. OVERLAY_PLUGIN_D_MASK is the OR of the three
+ * individual flags.
+ */
+typedef enum overlay_plugin_dest {
+ OVERLAY_PLUGIN_D_INVALID = 0x0,
+ OVERLAY_PLUGIN_D_ETHERNET = 0x1,
+ OVERLAY_PLUGIN_D_IP = 0x2,
+ OVERLAY_PLUGIN_D_PORT = 0x4,
+ OVERLAY_PLUGIN_D_MASK = 0x7
+} overlay_plugin_dest_t;
+
+/*
+ * Type of a plugin property value. The VXLAN plugin exchanges its
+ * OVERLAY_PROP_T_UINT property as a uint64_t and its OVERLAY_PROP_T_IP
+ * property as a struct in6_addr.
+ */
+typedef enum overlay_prop_type {
+ OVERLAY_PROP_T_INT = 0x1, /* signed int */
+ OVERLAY_PROP_T_UINT, /* unsigned int */
+ OVERLAY_PROP_T_IP, /* struct in6_addr */
+ OVERLAY_PROP_T_STRING /* OVERLAY_PROP_SIZEMAX */
+} overlay_prop_type_t;
+
+/*
+ * Property permission bits. OVERLAY_PROP_PERM_RW is READ | WRITE and
+ * OVERLAY_PROP_PERM_RRW is REQ | READ | WRITE; OVERLAY_PROP_PERM_MASK covers
+ * all three bits. NOTE(review): REQ presumably marks a property that must be
+ * set -- confirm with the code that consumes it.
+ */
+typedef enum overlay_prop_prot {
+ OVERLAY_PROP_PERM_REQ = 0x1,
+ OVERLAY_PROP_PERM_READ = 0x2,
+ OVERLAY_PROP_PERM_WRITE = 0x4,
+ OVERLAY_PROP_PERM_RW = 0x6,
+ OVERLAY_PROP_PERM_RRW = 0x7,
+ OVERLAY_PROP_PERM_MASK = 0x7
+} overlay_prop_prot_t;
+
+/*
+ * Sizes, in bytes, of the fixed buffers in the ioctl structures of
+ * <sys/overlay.h>: property names, property values (including default and
+ * possible values), and the status message.
+ */
+#define OVERLAY_PROP_NAMELEN 64
+#define OVERLAY_PROP_SIZEMAX 256
+#define OVERLAY_STATUS_BUFLEN 256
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_COMMON_H */
diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h
new file mode 100644
index 0000000000..0095c75eeb
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_impl.h
@@ -0,0 +1,205 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_IMPL_H
+#define _SYS_OVERLAY_IMPL_H
+
+/*
+ * Overlay device support
+ */
+
+#include <sys/overlay.h>
+#include <sys/overlay_common.h>
+#include <sys/overlay_plugin.h>
+#include <sys/overlay_target.h>
+#include <sys/ksynch.h>
+#include <sys/list.h>
+#include <sys/avl.h>
+#include <sys/ksocket.h>
+#include <sys/socket.h>
+#include <sys/refhash.h>
+#include <sys/ethernet.h>
+#include <sys/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define OVEP_VERSION_ONE 0x1
+
+typedef struct overlay_plugin {
+ kmutex_t ovp_mutex;
+ list_node_t ovp_link; /* overlay_plugin_lock */
+ uint_t ovp_active; /* ovp_mutex */
+ const char *ovp_name; /* RO */
+ const overlay_plugin_ops_t *ovp_ops; /* RO */
+ const char *const *ovp_props; /* RO */
+ uint_t ovp_nprops; /* RO */
+ uint_t ovp_id_size; /* RO */
+ overlay_plugin_flags_t ovp_flags; /* RO */
+ overlay_plugin_dest_t ovp_dest; /* RO */
+} overlay_plugin_t;
+
+typedef struct overlay_mux {
+ list_node_t omux_lnode;
+ ksocket_t omux_ksock; /* RO */
+ overlay_plugin_t *omux_plugin; /* RO: associated encap */
+ int omux_domain; /* RO: socket domain */
+ int omux_family; /* RO: socket family */
+ int omux_protocol; /* RO: socket protocol */
+ struct sockaddr *omux_addr; /* RO: socket address */
+ socklen_t omux_alen; /* RO: sockaddr len */
+ kmutex_t omux_lock; /* Protects everything below */
+ uint_t omux_count; /* Active instances */
+ avl_tree_t omux_devices; /* Tree of devices */
+} overlay_mux_t;
+
+typedef enum overlay_target_flag {
+ OVERLAY_T_TEARDOWN = 0x1
+} overlay_target_flag_t;
+
+typedef struct overlay_target {
+ kmutex_t ott_lock;
+ kcondvar_t ott_cond;
+ overlay_target_mode_t ott_mode; /* RO */
+ overlay_plugin_dest_t ott_dest; /* RO */
+ uint64_t ott_id; /* RO */
+ overlay_target_flag_t ott_flags; /* ott_lock */
+ uint_t ott_ocount; /* ott_lock */
+ union { /* ott_lock */
+ overlay_target_point_t ott_point;
+ struct overlay_target_dyn {
+ refhash_t *ott_dhash;
+ avl_tree_t ott_tree;
+ } ott_dyn;
+ } ott_u;
+} overlay_target_t;
+
+typedef enum overlay_dev_flag {
+ OVERLAY_F_ACTIVATED = 0x01, /* Activate ioctl completed */
+ OVERLAY_F_IN_MUX = 0x02, /* Currently in a mux */
+ OVERLAY_F_IN_TX = 0x04, /* Currently doing tx */
+ OVERLAY_F_IN_RX = 0x08, /* Currently doing rx */
+ OVERLAY_F_IOMASK = 0x0c, /* A mask for rx and tx */
+ OVERLAY_F_MDDROP = 0x10, /* Drop traffic for metadata update */
+ OVERLAY_F_STOPMASK = 0x1e, /* None set when stopping */
+ OVERLAY_F_VARPD = 0x20, /* varpd plugin exists */
+ OVERLAY_F_DEGRADED = 0x40, /* device is degraded */
+ OVERLAY_F_MASK = 0x7f /* mask of everything */
+} overlay_dev_flag_t;
+
+typedef struct overlay_dev {
+ kmutex_t odd_lock;
+ kcondvar_t odd_iowait;
+ list_node_t odd_link; /* overlay_dev_lock */
+ mac_handle_t odd_mh; /* RO */
+ overlay_plugin_t *odd_plugin; /* RO */
+ datalink_id_t odd_linkid; /* RO */
+ void *odd_pvoid; /* RO -- only used by plugin */
+ uint_t odd_ref; /* protected by odd_lock */
+ uint_t odd_mtu; /* protected by odd_lock */
+ overlay_dev_flag_t odd_flags; /* protected by odd_lock */
+ uint_t odd_rxcount; /* protected by odd_lock */
+ uint_t odd_txcount; /* protected by odd_lock */
+ overlay_mux_t *odd_mux; /* protected by odd_lock */
+ uint64_t odd_vid; /* RO if active else odd_lock */
+ avl_node_t odd_muxnode; /* managed by mux */
+ overlay_target_t *odd_target; /* See big theory statement */
+ char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */
+} overlay_dev_t;
+
+typedef enum overlay_target_entry_flags {
+ OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */
+ OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */
+ OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */
+ OVERLAY_ENTRY_F_VALID_MASK = 0x06 /* VALID | DROP */
+} overlay_target_entry_flags_t;
+
+typedef struct overlay_target_entry {
+ kmutex_t ote_lock;
+ refhash_link_t ote_reflink; /* hashtable link */
+ avl_node_t ote_avllink; /* iteration link */
+ list_node_t ote_qlink;
+ overlay_target_entry_flags_t ote_flags; /* RW: state flags */
+ uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */
+ overlay_target_t *ote_ott; /* RO */
+ overlay_dev_t *ote_odd; /* RO */
+ overlay_target_point_t ote_dest; /* RW: destination */
+ mblk_t *ote_chead; /* RW: blocked mb chain head */
+ mblk_t *ote_ctail; /* RW: blocked mb chain tail */
+ size_t ote_mbsize; /* RW: outstanding mblk size */
+ hrtime_t ote_vtime; /* RW: valid timestamp */
+} overlay_target_entry_t;
+
+
+#define OVERLAY_CTL "overlay"
+
+extern dev_info_t *overlay_dip;
+
+extern mblk_t *overlay_m_tx(void *, mblk_t *);
+
+typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *);
+extern void overlay_dev_iter(overlay_dev_iter_f, void *);
+
+extern void overlay_plugin_init(void);
+extern overlay_plugin_t *overlay_plugin_lookup(const char *);
+extern void overlay_plugin_rele(overlay_plugin_t *);
+extern void overlay_plugin_fini(void);
+typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *);
+extern void overlay_plugin_walk(overlay_plugin_walk_f, void *);
+
+extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t);
+extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t);
+
+extern void overlay_mux_init(void);
+extern void overlay_mux_fini(void);
+
+extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int,
+ struct sockaddr *, socklen_t, int *);
+extern void overlay_mux_close(overlay_mux_t *);
+extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *);
+extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *);
+extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *);
+
+extern void overlay_prop_init(overlay_prop_handle_t);
+
+extern void overlay_target_init(void);
+extern int overlay_target_busy(void);
+extern int overlay_target_open(dev_t *, int, int, cred_t *);
+extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+extern int overlay_target_close(dev_t, int, int, cred_t *);
+extern void overlay_target_free(overlay_dev_t *);
+
+#define OVERLAY_TARGET_OK 0
+#define OVERLAY_TARGET_DROP 1
+#define OVERLAY_TARGET_ASYNC 2
+extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *,
+ socklen_t *);
+extern void overlay_target_quiesce(overlay_target_t *);
+extern void overlay_target_fini(void);
+
+extern void overlay_fm_init(void);
+extern void overlay_fm_fini(void);
+extern void overlay_fm_degrade(overlay_dev_t *, const char *);
+extern void overlay_fm_restore(overlay_dev_t *);
+
+extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t);
+extern void overlay_hold_rele(overlay_dev_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_IMPL_H */
diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h
new file mode 100644
index 0000000000..3392973562
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_plugin.h
@@ -0,0 +1,324 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#ifndef _SYS_OVERLAY_PLUGIN_H
+#define _SYS_OVERLAY_PLUGIN_H
+
+/*
+ * overlay plugin interface for encapsulation/decapsulation modules
+ *
+ * This header file defines how encapsulation and decapsulation plugins
+ * interact within the broader system. At this time, these interfaces are
+ * considered private to illumos and therefore are subject to change. As we gain
+ * more experience with a few of the different encapsulation formats, say nvgre
+ * or geneve, then we can move to make this a more-stable interface.
+ *
+ * A plugin is a general kernel module that uses the miscellaneous mod-linkage.
+ *
+ * In its _init(9E) routine, it must register itself with the overlay
+ * subsystem. To do this, it allocates an overlay_plugin_register_t via
+ * overlay_plugin_alloc(), that it then fills out with various required
+ * information and then attempts to register with the system via a call to
+ * overlay_plugin_register(). If that succeeds, it should then call
+ * mod_install(9F). If the mod_install(9F) fails, then it should call
+ * overlay_plugin_unregister(). Regardless of success or failure, it should call
+ * overlay_plugin_free() to ensure that any memory that may be associated with
+ * the registration is freed.
+ *
+ * When the module's _fini(9E) is called, overlay_plugin_unregister() should be
+ * called first. It may return an error, such as EBUSY. In such cases, it should
+ * be returned as the return status of _fini(9E). This is quite necessary; it
+ * ensures that if the module is in use it doesn't get unloaded out from under
+ * the broader subsystem while it's still in use. A driver can use that to
+ * know that there are no current instances of its private data.
+ *
+ * ------------------
+ * Plugin Definitions
+ * ------------------
+ *
+ * A plugin is required to fill in both an operations vector and a series of
+ * information to the callback routine. Here are the routines and their
+ * purposes. The full signatures are available below.
+ *
+ * overlay_plugin_init_t
+ *
+ * This interface is used to create a new instance of a plugin. An instance
+ * of a plugin will be created for each overlay device that is created. For
+ * example, if a device is created with VXLAN ID 23 and ID 42, then there
+ * will be two different calls to this function.
+ *
+ * This function gives the plugin a chance to create a private data
+ * structure that will be returned on subsequent calls to the system.
+ *
+ * overlay_plugin_fini_t
+ *
+ * This is the opposite of overlay_plugin_init_t. It will be called when it
+ * is safe to remove any private data that is associated with this instance
+ * of the plugin.
+ *
+ * overlay_plugin_propinfo_t
+ *
+ * This is called with the name of a property that is registered when the
+ * plugin is created. This function will be called with the name of the
+ * property that information is being requested about. The plugin is
+ * responsible for filling out information such as setting the name, the
+ * type of property it is, the protection of the property (can a user
+ * update it?), whether the property is required, an optional default value
+ * for the property, and an optional set of values or ranges that are
+ * allowed.
+ *
+ * overlay_plugin_getprop_t
+ *
+ * Return the value of the named property from the current instance of the
+ * plugin.
+ *
+ * overlay_plugin_setprop_t
+ *
+ * Set the value of the named property to the specified value for the
+ * current instance of the plugin. Note, that it is the plugin's
+ * responsibility to ensure that the value of the property is valid and to
+ * update state as appropriate.
+ *
+ * overlay_plugin_socket_t
+ *
+ * Every overlay device has a corresponding socket that it uses to send and
+ * receive traffic. This routine is used to get the parameters that should
+ * be used to define such a socket. The actual socket may be multiplexed
+ * with other uses of it.
+ *
+ * overlay_plugin_sockopt_t
+ *
+ * Allow a plugin to set any necessary socket options that it needs on the
+ * kernel socket that is being used by a mux. This will only be called once
+ * for a given mux, if additional devices are added to a mux, it will not
+ * be called additional times.
+ *
+ * overlay_plugin_encap_t
+ *
+ * In this routine you're given a message block and information about the
+ * packet, such as the identifier and are asked to fill out a message block
+ * that represents the encapsulation header and optionally manipulate the
+ * input message if required.
+ *
+ * overlay_plugin_decap_t
+ *
+ * In this routine, you're given the encapsulated message block. The
+ * requirement is to decapsulate it and determine what is the correct
+ * overlay identifier for this network and to fill in the header size so
+ * the broader system knows how much of this data should be considered
+ * consumed.
+ *
+ * ovpo_callbacks
+ *
+ * This should be set to zero, it's reserved for future use.
+ *
+ * Once these properties are defined, the module should define the following
+ * members in the overlay_plugin_register_t.
+ *
+ * ovep_version
+ *
+ * Should be set to the value of the macro OVEP_VERSION.
+ *
+ * ovep_name
+ *
+ * Should be set to a character string that has the name of the module.
+ * Generally this should match the name of the kernel module; however, this
+ * is the name that users will use to refer to this module when creating
+ * devices.
+ *
+ * ovep_ops
+ *
+ * Should point to an overlay_plugin_ops_t set to the functions above.
+ *
+ * ovep_props
+ *
+ * This is an array of character strings that holds the names of the
+ * properties of the encapsulation plugin.
+ *
+ *
+ * ovep_id_size
+ *
+ * This is the size in bytes of the valid range for the identifier. The
+ * valid identifier range is considered a ovep_id_size byte unsigned
+ * integer, [ 0, 1 << (ovep_id_size * 8) ).
+ *
+ * ovep_flags
+ *
+ * A series of flags that indicate optional features that are supported.
+ * Valid flags include:
+ *
+ * OVEP_F_VLAN_TAG
+ *
+ * The encapsulation format allows for the encapsulated
+ * packet to maintain a VLAN tag.
+ *
+ * ovep_dest
+ *
+ * Describes the kind of destination that the overlay plugin supports for
+ * sending traffic. For example, vxlan uses UDP, therefore it requires both
+ * an IP address and a port; however, nvgre uses the gre header and
+ * therefore only requires an IP address. The following flags may be
+ * combined:
+ *
+ * OVERLAY_PLUGIN_D_ETHERNET
+ *
+ * Indicates that to send a packet to its destination, we
+ * require a link-layer ethernet address.
+ *
+ * OVERLAY_PLUGIN_D_IP
+ *
+ * Indicates that to send a packet to its destination, we
+ * require an IP address. Note, all IP addresses are
+ * transmitted as IPv6 addresses and for an IPv4
+ * destination, using an IPv4-mapped IPv6 address is the
+ * expected way to transmit that.
+ *
+ * OVERLAY_PLUGIN_D_PORT
+ *
+ * Indicates that to send a packet to its destination, a
+ * port is required, this usually indicates that the
+ * protocol uses something like TCP or UDP.
+ *
+ *
+ * -------------------------------------------------
+ * Downcalls, Upcalls, and Synchronization Guarantees
+ * -------------------------------------------------
+ *
+ * Every instance of a given module is independent. The kernel only guarantees
+ * that it will probably perform downcalls into different instances in parallel
+ * at some point. No locking is provided by the framework for synchronization
+ * across instances. If a module finds itself needing that, it will be up to it
+ * to provide it.
+ *
+ * In a given instance, the kernel may call into entry points in parallel. If
+ * the instance has private data, it should likely synchronize it. The one
+ * guarantee that we do make, is that calls to getprop and setprop will be done
+ * synchronized by a caller holding the MAC perimeter.
+ *
+ * While servicing a downcall from the general overlay device framework, a
+ * kernel module should not make any upcalls, excepting those functions that are
+ * defined in this header file, eg. the property related callbacks. Importantly,
+ * it cannot make any assumptions about what locks may or may not be held by the
+ * broader system. The only thing that it is safe for it to use are its own
+ * locks.
+ *
+ * ----------------
+ * Downcall Context
+ * ----------------
+ *
+ * For all of the downcalls, excepting the overlay_plugin_encap_t and
+ * overlay_plugin_decap_t, the calls will be made either in kernel or user
+ * context, the module should not assume either way.
+ *
+ * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user,
+ * kernel or interrupt context; however, it is guaranteed that the interrupt
+ * will be below LOCK_LEVEL, and therefore it is safe to grab locks.
+ */
+
+#include <sys/stream.h>
+#include <sys/mac_provider.h>
+#include <sys/ksocket.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define OVEP_VERSION 0x1
+
+typedef enum overlay_plugin_flags {
+ OVEP_F_VLAN_TAG = 0x01 /* Supports VLAN Tags */
+} overlay_plugin_flags_t;
+
+/*
+ * The ID space could easily be more than a 64-bit number, even
+ * though today it's anywhere from a 24 to 64-bit value. How should we
+ * future proof ourselves here?
+ */
+typedef struct ovep_encap_info {
+ uint64_t ovdi_id;
+ size_t ovdi_hdr_size;
+} ovep_encap_info_t;
+
+typedef struct __overlay_prop_handle *overlay_prop_handle_t;
+typedef struct __overlay_handle *overlay_handle_t;
+
+/*
+ * Plugins are guaranteed that calls to setprop are serialized. However, any
+ * number of other calls can be going on in parallel otherwise.
+ */
+typedef int (*overlay_plugin_encap_t)(void *, mblk_t *,
+ ovep_encap_info_t *, mblk_t **);
+typedef int (*overlay_plugin_decap_t)(void *, mblk_t *,
+ ovep_encap_info_t *);
+typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **);
+typedef void (*overlay_plugin_fini_t)(void *);
+typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *,
+ struct sockaddr *, socklen_t *);
+typedef int (*overlay_plugin_sockopt_t)(ksocket_t);
+typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *,
+ uint32_t *);
+typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *,
+ uint32_t);
+typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t);
+
+typedef struct overlay_plugin_ops {
+ uint_t ovpo_callbacks; /* reserved for future use, set to zero */
+ overlay_plugin_init_t ovpo_init;
+ overlay_plugin_fini_t ovpo_fini;
+ overlay_plugin_encap_t ovpo_encap;
+ overlay_plugin_decap_t ovpo_decap;
+ overlay_plugin_socket_t ovpo_socket;
+ overlay_plugin_sockopt_t ovpo_sockopt;
+ overlay_plugin_getprop_t ovpo_getprop;
+ overlay_plugin_setprop_t ovpo_setprop;
+ overlay_plugin_propinfo_t ovpo_propinfo;
+} overlay_plugin_ops_t;
+
+typedef struct overlay_plugin_register {
+ uint_t ovep_version; /* set to OVEP_VERSION */
+ const char *ovep_name; /* name users refer to the plugin by */
+ const overlay_plugin_ops_t *ovep_ops;
+ const char **ovep_props; /* names of the plugin's properties */
+ uint_t ovep_id_size; /* size of the identifier in bytes */
+ uint_t ovep_flags; /* OVEP_F_* optional feature flags */
+ uint_t ovep_dest; /* OVERLAY_PLUGIN_D_* flags, may be combined */
+} overlay_plugin_register_t;
+
+/*
+ * Functions that interact with registration
+ */
+extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t);
+extern void overlay_plugin_free(overlay_plugin_register_t *);
+extern int overlay_plugin_register(overlay_plugin_register_t *);
+extern int overlay_plugin_unregister(const char *);
+
+/*
+ * Property information callbacks
+ */
+extern void overlay_prop_set_name(overlay_prop_handle_t, const char *);
+extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t);
+extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t);
+extern int overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t);
+extern void overlay_prop_set_nodefault(overlay_prop_handle_t);
+extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t,
+ uint32_t);
+extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_OVERLAY_PLUGIN_H */
diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h
new file mode 100644
index 0000000000..775c7d27b8
--- /dev/null
+++ b/usr/src/uts/common/sys/overlay_target.h
@@ -0,0 +1,295 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#ifndef _OVERLAY_TARGET_H
+#define _OVERLAY_TARGET_H
+
+/*
+ * Overlay device varpd ioctl interface (/dev/overlay)
+ */
+
+#include <sys/types.h>
+#include <sys/ethernet.h>
+#include <netinet/in.h>
+#include <sys/overlay_common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct overlay_target_point {
+ uint8_t otp_mac[ETHERADDRL];
+ struct in6_addr otp_ip;
+ uint16_t otp_port;
+} overlay_target_point_t;
+
+#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8))
+
+#define OVERLAY_TARG_INFO (OVERLAY_TARG_IOCTL | 0x01)
+
+typedef enum overlay_targ_info_flags {
+ OVERLAY_TARG_INFO_F_ACTIVE = 0x01,
+ OVERLAY_TARG_INFO_F_DEGRADED = 0x02
+} overlay_targ_info_flags_t;
+
+/*
+ * Get target information about an overlay device
+ */
+typedef struct overlay_targ_info {
+ datalink_id_t oti_linkid;
+ uint32_t oti_needs;
+ uint64_t oti_flags;
+ uint64_t oti_vnetid;
+} overlay_targ_info_t;
+
+/*
+ * Declare an association between a given varpd instance and a datalink.
+ */
+#define OVERLAY_TARG_ASSOCIATE (OVERLAY_TARG_IOCTL | 0x02)
+
+typedef struct overlay_targ_associate {
+ datalink_id_t ota_linkid;
+ uint32_t ota_mode;
+ uint64_t ota_id;
+ uint32_t ota_provides;
+ overlay_target_point_t ota_point;
+} overlay_targ_associate_t;
+
+/*
+ * Remove an association from a device. If the device has already been started,
+ * this implies OVERLAY_TARG_DEGRADE.
+ */
+#define OVERLAY_TARG_DISASSOCIATE (OVERLAY_TARG_IOCTL | 0x3)
+
+/*
+ * Tells the kernel that while a varpd instance still exists, it basically isn't
+ * making any forward progress, so the device should consider itself degraded.
+ */
+#define OVERLAY_TARG_DEGRADE (OVERLAY_TARG_IOCTL | 0x4)
+
+typedef struct overlay_targ_degrade {
+ datalink_id_t otd_linkid;
+ uint32_t otd_pad;
+ char otd_buf[OVERLAY_STATUS_BUFLEN];
+} overlay_targ_degrade_t;
+
+/*
+ * Tells the kernel to remove the degraded status that it set on a device.
+ */
+#define OVERLAY_TARG_RESTORE (OVERLAY_TARG_IOCTL | 0x5)
+
+typedef struct overlay_targ_id {
+ datalink_id_t otid_linkid;
+} overlay_targ_id_t;
+
+/*
+ * The following ioctls are all used to support dynamic lookups from userland,
+ * generally serviced by varpd.
+ *
+ * The way this is designed to work is that user land will have threads sitting
+ * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit
+ * waiting for work for up to approximately one second of time before they will
+ * be sent back out to user land to give user land a chance to clean itself up
+ * or more generally, come back into the kernel for work. Once these threads
+ * return, they will have a request with which more action can be done. The
+ * following ioctls can all be used to answer the request.
+ *
+ * OVERLAY_TARG_RESPOND - overlay_targ_resp_t
+ *
+ * The overlay_targ_resp_t has the appropriate information from
+ * which a reply can be generated. The information is filled into
+ * an overlay_target_point_t as appropriate based on the
+ * overlay_plugin_dest_t type.
+ *
+ *
+ * OVERLAY_TARG_DROP - overlay_targ_resp_t
+ *
+ * The overlay_targ_resp_t should identify a request for which to
+ * drop a packet.
+ *
+ *
+ * OVERLAY_TARG_INJECT - overlay_targ_pkt_t
+ *
+ * The overlay_targ_pkt_t injects a fully formed packet into the
+ * virtual network. It may either be identified by its data link id
+ * or by the request id. If both are specified, the
+ * datalink id will be used. Note, that an injection is not
+ * considered a reply and if this corresponds to a request, then
+ * that individual packet must still be dropped.
+ *
+ *
+ * OVERLAY_TARG_PKT - overlay_targ_pkt_t
+ *
+ * This ioctl can be used to copy data from a given request into a
+ * user buffer. This can be used in combination with
+ * OVERLAY_TARG_INJECT to implement services such as a proxy-arp.
+ *
+ *
+ * OVERLAY_TARG_RESEND - overlay_targ_pkt_t
+ *
+ * This ioctl is similar to the OVERLAY_TARG_INJECT, except instead
+ * of receiving it on the local mac handle, it queues it for
+ * retransmission again. This is useful if you have a packet that
+ * was originally destined for some broadcast or multicast address
+ * that you now want to send to a unicast address.
+ */
+#define OVERLAY_TARG_LOOKUP (OVERLAY_TARG_IOCTL | 0x10)
+#define OVERLAY_TARG_RESPOND (OVERLAY_TARG_IOCTL | 0x11)
+#define OVERLAY_TARG_DROP (OVERLAY_TARG_IOCTL | 0x12)
+#define OVERLAY_TARG_INJECT (OVERLAY_TARG_IOCTL | 0x13)
+#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14)
+#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15)
+
+typedef struct overlay_targ_lookup {
+ uint64_t otl_dlid;
+ uint64_t otl_reqid;
+ uint64_t otl_varpdid;
+ uint64_t otl_vnetid;
+ uint64_t otl_hdrsize;
+ uint64_t otl_pktsize;
+ uint8_t otl_srcaddr[ETHERADDRL];
+ uint8_t otl_dstaddr[ETHERADDRL];
+ uint32_t otl_dsttype;
+ uint32_t otl_sap;
+ int32_t otl_vlan;
+} overlay_targ_lookup_t;
+
+typedef struct overlay_targ_resp {
+ uint64_t otr_reqid;
+ overlay_target_point_t otr_answer;
+} overlay_targ_resp_t;
+
+typedef struct overlay_targ_pkt {
+ uint64_t otp_linkid;
+ uint64_t otp_reqid;
+ uint64_t otp_size;
+ void *otp_buf;
+} overlay_targ_pkt_t;
+
+#ifdef _KERNEL
+
+#pragma pack(4)
+typedef struct overlay_targ_pkt32 {
+ uint64_t otp_linkid;
+ uint64_t otp_reqid;
+ uint64_t otp_size;
+ caddr32_t otp_buf;
+} overlay_targ_pkt32_t;
+#pragma pack()
+
+#endif /* _KERNEL */
+
+/*
+ * This provides a way to get a list of active overlay devices independently
+ * from dlmgmtd. At the end of the day the kernel always knows what will exist
+ * and this allows varpd which is an implementation of libdladm not to end up
+ * needing to call back into dlmgmtd via libdladm and create an unfortunate
+ * dependency cycle.
+ */
+
+#define OVERLAY_TARG_LIST (OVERLAY_TARG_IOCTL | 0x20)
+
+typedef struct overlay_targ_list {
+ uint32_t otl_nents;
+ uint32_t otl_ents[];
+} overlay_targ_list_t;
+
+/*
+ * The following family of ioctls all manipulate the target cache of a given
+ * device.
+ *
+ * OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t
+ *
+ * The overlay_targ_cache_t should have its link identifier and
+ * the desired mac address filled in. On return, it will fill in
+ * the otce_dest member, if the entry exists in the table.
+ *
+ *
+ * OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t
+ *
+ * The cache table entry of the mac address referred to by otce_mac
+ * and otc_linkid will be filled in with the details provided in
+ * the otce_dest member.
+ *
+ * OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t
+ *
+ * Removes the cache entry identified by otce_mac from the table.
+ * Note that this does not stop any in-flight lookups or deal with
+ * any data that is awaiting a lookup.
+ *
+ *
+ * OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t
+ *
+ * Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the
+ * entire table identified by otc_linkid. All other parameters are
+ * ignored.
+ *
+ *
+ * OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t
+ *
+ * Iterates over the contents of a target cache identified by
+ * otci_linkid. Iteration is guaranteed to be exactly once for
+ * items which are in the hashtable at the beginning and end of
+ * iteration. For items which are added or removed after iteration
+ * has begun, only at most once semantics are guaranteed. Consumers
+ * should ensure that otci_marker is zeroed before starting
+ * iteration and should preserve its contents across calls.
+ *
+ * Before calling in, otci_count should be set to the number of
+ * entries that space has been allocated for in otci_ents. The
+ * value will be updated to indicate the total number written out.
+ */
+
+#define OVERLAY_TARG_CACHE_GET (OVERLAY_TARG_IOCTL | 0x30)
+#define OVERLAY_TARG_CACHE_SET (OVERLAY_TARG_IOCTL | 0x31)
+#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32)
+#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33)
+#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34)
+
+/*
+ * This is a pretty arbitrary number that we're constraining ourselves to
+ * for iteration. Basically the goal is to make sure that we can't have a user
+ * ask us to allocate too much memory on their behalf at any time. A more
+ * dynamic form may be necessary some day.
+ */
+#define OVERLAY_TARGET_ITER_MAX 500
+
+#define OVERLAY_TARGET_CACHE_DROP 0x01
+
+typedef struct overlay_targ_cache_entry {
+ uint8_t otce_mac[ETHERADDRL];
+ uint16_t otce_flags;
+ overlay_target_point_t otce_dest;
+} overlay_targ_cache_entry_t;
+
+typedef struct overlay_targ_cache {
+ datalink_id_t otc_linkid;
+ overlay_targ_cache_entry_t otc_entry;
+} overlay_targ_cache_t;
+
+typedef struct overlay_targ_cache_iter {
+ datalink_id_t otci_linkid; /* identifies the target cache to iterate */
+ uint32_t otci_pad;
+ uint64_t otci_marker; /* zero before starting; preserve across calls */
+ uint16_t otci_count; /* in: space in otci_ents, out: number written */
+ uint8_t otci_pad2[3];
+ overlay_targ_cache_entry_t otci_ents[];
+} overlay_targ_cache_iter_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _OVERLAY_TARGET_H */
diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel
index b586675d85..290eae88ff 100644
--- a/usr/src/uts/intel/Makefile.intel
+++ b/usr/src/uts/intel/Makefile.intel
@@ -706,6 +706,12 @@ MAC_KMODS += mac_wifi
MAC_KMODS += mac_ib
#
+# Overlay related modules (/kernel/overlay)
+#
+DRV_KMODS += overlay
+OVERLAY_KMODS += vxlan
+
+#
# socketmod (kernel/socketmod)
#
SOCKET_KMODS += sockpfp
diff --git a/usr/src/uts/intel/overlay/Makefile b/usr/src/uts/intel/overlay/Makefile
new file mode 100644
index 0000000000..deb77fcd6d
--- /dev/null
+++ b/usr/src/uts/intel/overlay/Makefile
@@ -0,0 +1,46 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+UTSBASE = ../..
+
+MODULE = overlay
+OBJECTS = $(OVERLAY_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+
+include $(UTSBASE)/intel/Makefile.intel
+
+ALL_TARGET = $(BINARY) $(SRC_CONFFILE)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+CONF_SRCDIR = $(UTSBASE)/common/io/overlay
+MAPFILE = $(UTSBASE)/common/io/overlay/overlay.mapfile
+
+LDFLAGS += -Nmisc/mac -Ndrv/dld -Nmisc/dls -Nmisc/ksocket
+
+# needs work
+SMATCH=off
+
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+install: $(INSTALL_DEPS)
+
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/vxlan/Makefile b/usr/src/uts/intel/vxlan/Makefile
new file mode 100644
index 0000000000..530c66ee4c
--- /dev/null
+++ b/usr/src/uts/intel/vxlan/Makefile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+UTSBASE = ../..
+
+MODULE = vxlan
+OBJECTS = $(OVERLAY_VXLAN_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_OVERLAY_DIR)/$(MODULE)
+
+include $(UTSBASE)/intel/Makefile.intel
+
+ALL_TARGET = $(BINARY)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+LDFLAGS += -Ndrv/overlay -Ndrv/ip
+
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+install: $(INSTALL_DEPS)
+
+include $(UTSBASE)/intel/Makefile.targ