diff options
author | Robert Mustacchi <rm@joyent.com> | 2015-02-23 23:22:14 +0000 |
---|---|---|
committer | Andy Fiddaman <omnios@citrus-it.co.uk> | 2022-02-22 10:37:31 +0000 |
commit | 36589d6bb0cdae89e166b57b0d64ae56d53247d9 (patch) | |
tree | 987cd02be74c303307fa448d91ff82f57be47f95 | |
parent | 68df0c4f60a2e57680d6d1e6dba32ffa2d035538 (diff) | |
download | illumos-joyent-36589d6bb0cdae89e166b57b0d64ae56d53247d9.tar.gz |
13500 Want support for overlay networks
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Jason King <jason.king@joyent.com>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Andy Fiddaman <omnios@citrus-it.co.uk>
Reviewed by: Andy Fiddaman <andy@omnios.org>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Mike Gerdts <mike.gerdts@joyent.com>
Reviewed by: Mike Zeller <mike.zeller@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Rob Gulewich <robert.gulewich@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
100 files changed, 17706 insertions, 70 deletions
diff --git a/exception_lists/packaging b/exception_lists/packaging index f5cdbbf189..dc74d6d3bf 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -400,6 +400,18 @@ usr/include/libidspace.h # VXLAN # usr/include/sys/vxlan.h +lib/libvarpd.so +lib/amd64/libvarpd.so +# +# Overlay +# +usr/include/libdloverlay.h +usr/include/libvarpd.h +usr/include/libvarpd_client.h +usr/include/libvarpd_provider.h +usr/include/sys/overlay.h +usr/include/sys/overlay_common.h +usr/include/sys/overlay_target.h # # Private interfaces in libsec # diff --git a/usr/src/Targetdirs b/usr/src/Targetdirs index b78dc9df7f..94ec907b77 100644 --- a/usr/src/Targetdirs +++ b/usr/src/Targetdirs @@ -309,6 +309,7 @@ DIRS= \ /usr/lib/mdb/kvm \ /usr/lib/mdb/proc \ /usr/lib/nfs \ + /usr/lib/varpd \ /usr/net \ /usr/net/servers \ /usr/lib/pool \ @@ -490,6 +491,7 @@ DIRS64= \ /usr/lib/security/$(MACH64) \ /usr/lib/smbsrv/$(MACH64) \ /usr/lib/abi/$(MACH64) \ + /usr/lib/varpd/$(MACH64) \ /usr/sbin/$(MACH64) \ /usr/ucb/$(MACH64) \ /usr/ucblib/$(MACH64) \ @@ -544,6 +546,7 @@ SYM.DIRS64= \ /usr/lib/lwp/64 \ /usr/lib/secure/64 \ /usr/lib/security/64 \ + /usr/lib/varpd/64 \ /usr/xpg4/lib/64 \ /var/ld/64 \ /usr/ucblib/64 @@ -647,6 +650,7 @@ $(BUILD64) $(ROOT)/usr/lib/lwp/64:= LINKDEST=$(MACH64) $(BUILD64) $(ROOT)/usr/lib/link_audit/64:= LINKDEST=$(MACH64) $(BUILD64) $(ROOT)/usr/lib/secure/64:= LINKDEST=$(MACH64) $(BUILD64) $(ROOT)/usr/lib/security/64:= LINKDEST=$(MACH64) +$(BUILD64) $(ROOT)/usr/lib/varpd/64:= LINKDEST=$(MACH64) $(BUILD64) $(ROOT)/usr/xpg4/lib/64:= LINKDEST=$(MACH64) $(BUILD64) $(ROOT)/var/ld/64:= LINKDEST=$(MACH64) $(BUILD64) $(ROOT)/usr/ucblib/64:= LINKDEST=$(MACH64) diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index 8159ad677b..4f496112e8 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -446,6 +446,7 @@ COMMON_SUBDIRS= \ utmpd \ uuidgen \ valtools \ + varpd \ vgrind \ vi \ volcheck \ diff --git a/usr/src/cmd/cmd-inet/etc/services 
b/usr/src/cmd/cmd-inet/etc/services index 37514ac0a7..4562baff66 100644 --- a/usr/src/cmd/cmd-inet/etc/services +++ b/usr/src/cmd/cmd-inet/etc/services @@ -1,6 +1,7 @@ # # Copyright 2010 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2015 Joyent, Inc. # # CDDL HEADER START # @@ -33,7 +34,7 @@ systat 11/tcp users daytime 13/tcp daytime 13/udp netstat 15/tcp -qotd 17/tcp # Quote of the Day +qotd 17/tcp # Quote of the Day chargen 19/tcp ttytst source chargen 19/udp ttytst source ftp-data 20/tcp @@ -80,7 +81,7 @@ imap3 220/tcp imap3 220/udp clearcase 371/tcp clearcase 371/udp -ldap 389/tcp # Lightweight Directory Access Protocol +ldap 389/tcp # Lightweight Directory Access Protocol ldap 389/udp # Lightweight Directory Access Protocol https 443/tcp https 443/udp @@ -227,6 +228,7 @@ eklogin 2105/tcp # Kerberos encrypted rlogin lockd 4045/udp # NFS lock daemon/manager lockd 4045/tcp ipsec-nat-t 4500/udp # IPsec NAT-Traversal +vxlan 4789/udp # Virtual eXtensible Local Area Network (VXLAN) mdns 5353/udp # Multicast DNS mdns 5353/tcp vnc-server 5900/tcp # VNC Server diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c index 30e3e1863e..7397fcdb40 100644 --- a/usr/src/cmd/devfsadm/misc_link.c +++ b/usr/src/cmd/devfsadm/misc_link.c @@ -204,6 +204,9 @@ static devfsadm_create_t misc_cbt[] = { { "pseudo", "ddi_pseudo", "tpm", TYPE_EXACT | DRV_EXACT, ILEVEL_0, minor_name }, + { "pseudo", "ddi_pseudo", "overlay", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, minor_name + } }; DEVFSADM_CREATE_INIT_V0(misc_cbt); diff --git a/usr/src/cmd/dladm/Makefile b/usr/src/cmd/dladm/Makefile index 6171822797..bba8a8cede 100644 --- a/usr/src/cmd/dladm/Makefile +++ b/usr/src/cmd/dladm/Makefile @@ -20,6 +20,7 @@ # # Copyright 2010 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2015 Joyent, Inc. # # Copyright (c) 2018, Joyent, Inc. 
@@ -38,7 +39,7 @@ XGETFLAGS += -a -x $(PROG).xcl LDLIBS += -L$(ROOT)/lib -lsocket LDLIBS += -ldladm -ldlpi -lkstat -lsecdb -lbsm -lofmt -linetutil -ldevinfo -LDLIBS += $(ZLAZYLOAD) -lrstp $(ZNOLAZYLOAD) +LDLIBS += $(ZLAZYLOAD) -lrstp $(ZNOLAZYLOAD) -lnsl -lumem -lcustr CERRWARN += -_gcc=-Wno-switch CERRWARN += -_gcc=-Wno-unused-label diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c index 04a520f537..9d4d345bca 100644 --- a/usr/src/cmd/dladm/dladm.c +++ b/usr/src/cmd/dladm/dladm.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. * Copyright 2020 Peter Tribble. * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ @@ -63,6 +64,7 @@ #include <libdliptun.h> #include <libdlsim.h> #include <libdlbridge.h> +#include <libdloverlay.h> #include <libinetutil.h> #include <libvrrpadm.h> #include <bsm/adt.h> @@ -78,6 +80,7 @@ #include <stddef.h> #include <stp_in.h> #include <ofmt.h> +#include <libcustr.h> #define MAXPORT 256 #define MAXVNIC 256 @@ -196,6 +199,7 @@ static ofmt_cb_t print_lacp_cb, print_phys_one_mac_cb; static ofmt_cb_t print_xaggr_cb, print_aggr_stats_cb; static ofmt_cb_t print_phys_one_hwgrp_cb, print_wlan_attr_cb; static ofmt_cb_t print_wifi_status_cb, print_link_attr_cb; +static ofmt_cb_t print_overlay_cb, print_overlay_fma_cb, print_overlay_targ_cb; typedef void cmdfunc_t(int, char **, const char *); @@ -223,6 +227,8 @@ static cmdfunc_t do_create_bridge, do_modify_bridge, do_delete_bridge; static cmdfunc_t do_add_bridge, do_remove_bridge, do_show_bridge; static cmdfunc_t do_create_iptun, do_modify_iptun, do_delete_iptun; static cmdfunc_t do_show_iptun, do_up_iptun, do_down_iptun; +static cmdfunc_t do_create_overlay, do_delete_overlay, do_modify_overlay; +static cmdfunc_t do_show_overlay; static void do_up_vnic_common(int, char **, const char *, boolean_t); @@ -258,8 +264,11 @@ 
static void die(const char *, ...); static void die_optdup(int); static void die_opterr(int, int, const char *); static void die_dlerr(dladm_status_t, const char *, ...); +static void die_dlerrlist(dladm_status_t, dladm_errlist_t *, + const char *, ...); static void warn(const char *, ...); static void warn_dlerr(dladm_status_t, const char *, ...); +static void warn_dlerrlist(dladm_errlist_t *); typedef struct cmd { char *c_name; @@ -406,6 +415,17 @@ static cmd_t cmds[] = { " <bridge>\n" " show-bridge -t [-p] [-o <field>,...] [-s [-i <interval>]]" " <bridge>\n" }, + { "create-overlay", do_create_overlay, + " create-overlay [-t] -e <encap> -s <search> -v <vnetid>\n" + "\t\t [ -p <prop>=<value>[,...]] <overlay>" }, + { "delete-overlay", do_delete_overlay, + " delete-overlay <overlay>" }, + { "modify-overlay", do_modify_overlay, + " modify-overlay -d mac | -f | -s mac=ip:port " + "<overlay>" }, + { "show-overlay", do_show_overlay, + " show-overlay [-f | -t] [[-p] -o <field>,...] " + "[<overlay>]\n" }, { "show-usage", do_show_usage, " show-usage [-a] [-d | -F <format>] " "[-s <DD/MM/YYYY,HH:MM:SS>]\n" @@ -1430,6 +1450,82 @@ static ofmt_field_t bridge_trill_fields[] = { offsetof(bridge_trill_fields_buf_t, bridget_nexthop), print_default_cb }, { NULL, 0, 0, NULL}}; +static const struct option overlay_create_lopts[] = { + { "encap", required_argument, NULL, 'e' }, + { "prop", required_argument, NULL, 'p' }, + { "search", required_argument, NULL, 's' }, + { "temporary", no_argument, NULL, 't' }, + { "vnetid", required_argument, NULL, 'v' }, + { NULL, 0, NULL, 0 } +}; + +static const struct option overlay_modify_lopts[] = { + { "delete-entry", required_argument, NULL, 'd' }, + { "flush-table", no_argument, NULL, 'f' }, + { "set-entry", required_argument, NULL, 's' }, + { NULL, 0, NULL, 0 } +}; + +static const struct option overlay_show_lopts[] = { + { "fma", no_argument, NULL, 'f' }, + { "target", no_argument, NULL, 't' }, + { "parsable", no_argument, NULL, 'p' }, + { 
"parseable", no_argument, NULL, 'p' }, + { "output", required_argument, NULL, 'o' }, + { NULL, 0, NULL, 0 } +}; + +/* + * Structures for dladm show-overlay + */ +typedef enum { + OVERLAY_LINK, + OVERLAY_PROPERTY, + OVERLAY_PERM, + OVERLAY_REQ, + OVERLAY_VALUE, + OVERLAY_DEFAULT, + OVERLAY_POSSIBLE +} overlay_field_index_t; + +static const ofmt_field_t overlay_fields[] = { +/* name, field width, index */ +{ "LINK", 19, OVERLAY_LINK, print_overlay_cb }, +{ "PROPERTY", 19, OVERLAY_PROPERTY, print_overlay_cb }, +{ "PERM", 5, OVERLAY_PERM, print_overlay_cb }, +{ "REQ", 4, OVERLAY_REQ, print_overlay_cb }, +{ "VALUE", 11, OVERLAY_VALUE, print_overlay_cb }, +{ "DEFAULT", 10, OVERLAY_DEFAULT, print_overlay_cb }, +{ "POSSIBLE", 10, OVERLAY_POSSIBLE, print_overlay_cb }, +{ NULL, 0, 0, NULL } +}; + +typedef enum { + OVERLAY_FMA_LINK, + OVERLAY_FMA_STATUS, + OVERLAY_FMA_DETAILS +} overlay_fma_field_index_t; + +static const ofmt_field_t overlay_fma_fields[] = { +{ "LINK", 20, OVERLAY_FMA_LINK, print_overlay_fma_cb }, +{ "STATUS", 8, OVERLAY_FMA_STATUS, print_overlay_fma_cb }, +{ "DETAILS", 52, OVERLAY_FMA_DETAILS, print_overlay_fma_cb }, +{ NULL, 0, 0, NULL } +}; + +typedef enum { + OVERLAY_TARG_LINK, + OVERLAY_TARG_TARGET, + OVERLAY_TARG_DEST +} overlay_targ_field_index_t; + +static const ofmt_field_t overlay_targ_fields[] = { +{ "LINK", 20, OVERLAY_TARG_LINK, print_overlay_targ_cb }, +{ "TARGET", 18, OVERLAY_TARG_TARGET, print_overlay_targ_cb }, +{ "DESTINATION", 42, OVERLAY_TARG_DEST, print_overlay_targ_cb }, +{ NULL, 0, 0, NULL } +}; + static char *progname; static sig_atomic_t signalled; @@ -1439,6 +1535,12 @@ static sig_atomic_t signalled; */ static dladm_handle_t handle = NULL; +/* + * Global error list that all routines can use. It's initialized by the main + * code. 
+ */ +static dladm_errlist_t errlist; + #define DLADM_ETHERSTUB_NAME "etherstub" #define DLADM_IS_ETHERSTUB(id) (id == DATALINK_INVALID_LINKID) @@ -1506,6 +1608,8 @@ main(int argc, char *argv[]) "could not open /dev/dld"); } + dladm_errlist_init(&errlist); + cmdp->c_fn(argc - 1, &argv[1], cmdp->c_usage); dladm_close(handle); @@ -4801,7 +4905,7 @@ do_create_vnic(int argc, char *argv[], const char *use) status = dladm_vnic_create(handle, name, dev_linkid, mac_addr_type, mac_addr, maclen, &mac_slot, mac_prefix_len, vid, vrid, af, - &linkid, proplist, flags); + &linkid, proplist, &errlist, flags); switch (status) { case DLADM_STATUS_OK: break; @@ -4812,7 +4916,8 @@ do_create_vnic(int argc, char *argv[], const char *use) break; default: - die_dlerr(status, "vnic creation over %s failed", devname); + die_dlerrlist(status, &errlist, "vnic creation over %s failed", + devname); } dladm_free_props(proplist); @@ -5311,7 +5416,7 @@ do_create_etherstub(int argc, char *argv[], const char *use) status = dladm_vnic_create(handle, name, DATALINK_INVALID_LINKID, VNIC_MAC_ADDR_TYPE_AUTO, mac_addr, ETHERADDRL, NULL, 0, 0, - VRRP_VRID_NONE, AF_UNSPEC, NULL, NULL, flags); + VRRP_VRID_NONE, AF_UNSPEC, NULL, NULL, &errlist, flags); if (status != DLADM_STATUS_OK) die_dlerr(status, "etherstub creation failed"); } @@ -8953,6 +9058,21 @@ warn_dlerr(dladm_status_t err, const char *format, ...) (void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg)); } +static void +warn_dlerrlist(dladm_errlist_t *errlist) +{ + if (errlist != NULL && errlist->el_count > 0) { + int i; + for (i = 0; i < errlist->el_count; i++) { + (void) fprintf(stderr, gettext("%s: warning: "), + progname); + + (void) fprintf(stderr, "%s\n", + gettext(errlist->el_errs[i])); + } + } +} + /* * Also closes the dladm handle if it is not NULL. */ @@ -8978,6 +9098,34 @@ die_dlerr(dladm_status_t err, const char *format, ...) exit(EXIT_FAILURE); } +/* + * Like die_dlerr, but uses the errlist for additional information. 
+ */ +/* PRINTFLIKE3 */ +static void +die_dlerrlist(dladm_status_t err, dladm_errlist_t *errlist, + const char *format, ...) +{ + va_list alist; + char errmsg[DLADM_STRSIZE]; + + warn_dlerrlist(errlist); + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + (void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg)); + + /* close dladm handle if it was opened */ + if (handle != NULL) + dladm_close(handle); + + exit(EXIT_FAILURE); + +} + /* PRINTFLIKE1 */ static void die(const char *format, ...) @@ -9685,3 +9833,680 @@ do_up_part(int argc, char *argv[], const char *use) (void) dladm_part_up(handle, partid, 0); } + +static void +do_create_overlay(int argc, char *argv[], const char *use) +{ + int opt; + char *encap = NULL, *endp, *search = NULL; + char name[MAXLINKNAMELEN]; + dladm_status_t status; + uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST; + uint64_t vid = 0; + boolean_t havevid = B_FALSE; + char propstr[DLADM_STRSIZE]; + dladm_arg_list_t *proplist = NULL; + + bzero(propstr, sizeof (propstr)); + while ((opt = getopt_long(argc, argv, ":te:v:p:s:", + overlay_create_lopts, NULL)) != -1) { + switch (opt) { + case 'e': + encap = optarg; + break; + case 's': + search = optarg; + break; + case 't': + flags &= ~DLADM_OPT_PERSIST; + break; + case 'p': + (void) strlcat(propstr, optarg, DLADM_STRSIZE); + if (strlcat(propstr, ",", DLADM_STRSIZE) >= + DLADM_STRSIZE) + die("property list too long '%s'", propstr); + break; + case 'v': + vid = strtoul(optarg, &endp, 10); + if (*endp != '\0' || (vid == 0 && errno == EINVAL)) + die("couldn't parse virtual networkd id: %s", + optarg); + if (vid == ULONG_MAX && errno == ERANGE) + die("virtual networkd id too large: %s", + optarg); + havevid = B_TRUE; + break; + default: + die_opterr(optopt, opt, use); + } + } + + /* + * Overlays do not currently support persistence. 
+ * This will be addressed by https://www.illumos.org/issues/14434 + */ + if ((flags & DLADM_OPT_PERSIST) != 0) + die("overlays do not (yet) support persistence, use -t"); + + if (havevid == B_FALSE) + die("missing required virtual network id"); + + if (encap == NULL) + die("missing required encapsulation plugin"); + + if (search == NULL) + die("missing required search plugin"); + + if (optind != (argc - 1)) + die("missing device name"); + + if (strlcpy(name, argv[optind], MAXLINKNAMELEN) >= MAXLINKNAMELEN) + die("link name too long '%s'", argv[optind]); + + if (!dladm_valid_linkname(name)) + die("invalid link name '%s'", argv[optind]); + + if (strlen(encap) + 1 > MAXLINKNAMELEN) + die("encapsulation plugin name too long '%s'", encap); + + if (strlen(search) + 1 > MAXLINKNAMELEN) + die("search plugin name too long '%s'", encap); + + if (dladm_parse_link_props(propstr, &proplist, B_FALSE) + != DLADM_STATUS_OK) + die("invalid overlay property"); + + status = dladm_overlay_create(handle, name, encap, search, vid, + proplist, &errlist, flags); + dladm_free_props(proplist); + if (status != DLADM_STATUS_OK) { + die_dlerrlist(status, &errlist, "overlay creation failed"); + } +} + +/* ARGSUSED */ +static void +do_delete_overlay(int argc, char *argv[], const char *use) +{ + datalink_id_t linkid = DATALINK_ALL_LINKID; + dladm_status_t status; + + if (argc != 2) { + usage(); + } + + status = dladm_name2info(handle, argv[1], &linkid, NULL, NULL, NULL); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "failed to delete %s", argv[1]); + + status = dladm_overlay_delete(handle, linkid); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "failed to delete %s", argv[1]); +} + +typedef struct showoverlay_state { + ofmt_handle_t sho_ofmt; + const char *sho_linkname; + dladm_overlay_propinfo_handle_t sho_info; + uint8_t sho_value[DLADM_OVERLAY_PROP_SIZEMAX]; + uint32_t sho_size; +} showoverlay_state_t; + +typedef struct showoverlay_fma_state { + ofmt_handle_t shof_ofmt; + const 
char *shof_linkname; + dladm_overlay_status_t *shof_status; +} showoverlay_fma_state_t; + +typedef struct showoverlay_targ_state { + ofmt_handle_t shot_ofmt; + const char *shot_linkname; + const struct ether_addr *shot_key; + const dladm_overlay_point_t *shot_point; +} showoverlay_targ_state_t; + +static void +print_overlay_value(char *outbuf, uint_t bufsize, uint_t type, const void *pbuf, + const size_t psize) +{ + const struct in6_addr *ipv6; + struct in_addr ip; + + switch (type) { + case OVERLAY_PROP_T_INT: + if (psize != 1 && psize != 2 && psize != 4 && psize != 8) { + (void) snprintf(outbuf, bufsize, "?"); + break; + } + if (psize == 1) + (void) snprintf(outbuf, bufsize, "%d", *(int8_t *)pbuf); + if (psize == 2) + (void) snprintf(outbuf, bufsize, "%d", + *(int16_t *)pbuf); + if (psize == 4) + (void) snprintf(outbuf, bufsize, "%d", + *(int32_t *)pbuf); + if (psize == 8) + (void) snprintf(outbuf, bufsize, "%d", + *(int64_t *)pbuf); + break; + case OVERLAY_PROP_T_UINT: + if (psize != 1 && psize != 2 && psize != 4 && psize != 8) { + (void) snprintf(outbuf, bufsize, "?"); + break; + } + if (psize == 1) + (void) snprintf(outbuf, bufsize, "%d", + *(uint8_t *)pbuf); + if (psize == 2) + (void) snprintf(outbuf, bufsize, "%d", + *(uint16_t *)pbuf); + if (psize == 4) + (void) snprintf(outbuf, bufsize, "%d", + *(uint32_t *)pbuf); + if (psize == 8) + (void) snprintf(outbuf, bufsize, "%d", + *(uint64_t *)pbuf); + break; + case OVERLAY_PROP_T_IP: + if (psize != sizeof (struct in6_addr)) { + warn("malformed overlay IP property: %d bytes\n", + psize); + (void) snprintf(outbuf, bufsize, "--"); + break; + } + + ipv6 = pbuf; + if (IN6_IS_ADDR_V4MAPPED(ipv6)) { + IN6_V4MAPPED_TO_INADDR(ipv6, &ip); + if (inet_ntop(AF_INET, &ip, outbuf, bufsize) == NULL) { + warn("malformed overlay IP property\n"); + (void) snprintf(outbuf, bufsize, "--"); + break; + } + } else { + if (inet_ntop(AF_INET6, ipv6, outbuf, bufsize) == + NULL) { + warn("malformed overlay IP property\n"); + (void) 
snprintf(outbuf, bufsize, "--"); + break; + } + } + + break; + case OVERLAY_PROP_T_STRING: + (void) snprintf(outbuf, bufsize, "%s", pbuf); + break; + default: + abort(); + } + + return; + +} + +static boolean_t +print_overlay_cb(ofmt_arg_t *ofarg, char *buf, uint_t bufsize) +{ + dladm_status_t status; + showoverlay_state_t *sp = ofarg->ofmt_cbarg; + dladm_overlay_propinfo_handle_t infop = sp->sho_info; + const char *pname; + uint_t type, prot; + const void *def; + uint32_t defsize; + const mac_propval_range_t *rangep; + + if ((status = dladm_overlay_prop_info(infop, &pname, &type, &prot, &def, + &defsize, &rangep)) != DLADM_STATUS_OK) { + warn_dlerr(status, "failed to get get property info"); + return (B_TRUE); + } + + switch (ofarg->ofmt_id) { + case OVERLAY_LINK: + (void) snprintf(buf, bufsize, "%s", sp->sho_linkname); + break; + case OVERLAY_PROPERTY: + (void) snprintf(buf, bufsize, "%s", pname); + break; + case OVERLAY_PERM: + if ((prot & OVERLAY_PROP_PERM_RW) == OVERLAY_PROP_PERM_RW) { + (void) snprintf(buf, bufsize, "%s", "rw"); + } else if ((prot & OVERLAY_PROP_PERM_RW) == + OVERLAY_PROP_PERM_READ) { + (void) snprintf(buf, bufsize, "%s", "r-"); + } else { + (void) snprintf(buf, bufsize, "%s", "--"); + } + break; + case OVERLAY_REQ: + (void) snprintf(buf, bufsize, "%s", + prot & OVERLAY_PROP_PERM_REQ ? 
"y" : "-"); + break; + case OVERLAY_VALUE: + if (sp->sho_size == 0) { + (void) snprintf(buf, bufsize, "%s", "--"); + } else { + print_overlay_value(buf, bufsize, type, sp->sho_value, + sp->sho_size); + } + break; + case OVERLAY_DEFAULT: + if (defsize == 0) { + (void) snprintf(buf, bufsize, "%s", "--"); + } else { + print_overlay_value(buf, bufsize, type, def, defsize); + } + break; + case OVERLAY_POSSIBLE: { + int i; + char **vals, *ptr, *lim; + if (rangep->mpr_count == 0) { + (void) snprintf(buf, bufsize, "%s", "--"); + break; + } + + vals = malloc((sizeof (char *) + DLADM_PROP_VAL_MAX) * + rangep->mpr_count); + if (vals == NULL) + die("insufficient memory"); + for (i = 0; i < rangep->mpr_count; i++) { + vals[i] = (char *)vals + sizeof (char *) * + rangep->mpr_count + i * DLADM_MAX_PROP_VALCNT; + } + + if (dladm_range2strs(rangep, vals) != 0) { + free(vals); + (void) snprintf(buf, bufsize, "%s", "?"); + break; + } + + ptr = buf; + lim = buf + bufsize; + for (i = 0; i < rangep->mpr_count; i++) { + ptr += snprintf(ptr, lim - ptr, "%s,", vals[i]); + if (ptr >= lim) + break; + } + if (rangep->mpr_count > 0) + buf[strlen(buf) - 1] = '\0'; + free(vals); + break; + } + default: + abort(); + } + return (B_TRUE); +} + +static int +dladm_overlay_show_one(dladm_handle_t handle, datalink_id_t linkid, + dladm_overlay_propinfo_handle_t phdl, void *arg) +{ + showoverlay_state_t *sp = arg; + sp->sho_info = phdl; + + sp->sho_size = sizeof (sp->sho_value); + if (dladm_overlay_get_prop(handle, linkid, phdl, &sp->sho_value, + &sp->sho_size) != DLADM_STATUS_OK) + return (DLADM_WALK_CONTINUE); + + ofmt_print(sp->sho_ofmt, sp); + return (DLADM_WALK_CONTINUE); +} + +static int +show_one_overlay(dladm_handle_t hdl, datalink_id_t linkid, void *arg) +{ + char buf[MAXLINKNAMELEN]; + showoverlay_state_t state; + datalink_class_t class; + + if (dladm_datalink_id2info(hdl, linkid, NULL, &class, NULL, buf, + MAXLINKNAMELEN) != DLADM_STATUS_OK || + class != DATALINK_CLASS_OVERLAY) + return 
(DLADM_WALK_CONTINUE); + + state.sho_linkname = buf; + state.sho_ofmt = arg; + + dladm_errlist_reset(&errlist); + (void) dladm_overlay_walk_prop(handle, linkid, dladm_overlay_show_one, + &state, &errlist); + warn_dlerrlist(&errlist); + + return (DLADM_WALK_CONTINUE); +} + +static boolean_t +print_overlay_targ_cb(ofmt_arg_t *ofarg, char *buf, uint_t bufsize) +{ + char keybuf[ETHERADDRSTRL]; + const showoverlay_targ_state_t *shot = ofarg->ofmt_cbarg; + const dladm_overlay_point_t *point = shot->shot_point; + char macbuf[ETHERADDRSTRL]; + char ipbuf[INET6_ADDRSTRLEN]; + custr_t *cus; + + switch (ofarg->ofmt_id) { + case OVERLAY_TARG_LINK: + (void) snprintf(buf, bufsize, shot->shot_linkname); + break; + case OVERLAY_TARG_TARGET: + if ((point->dop_flags & DLADM_OVERLAY_F_DEFAULT) != 0) { + (void) snprintf(buf, bufsize, "*:*:*:*:*:*"); + } else { + if (ether_ntoa_r(shot->shot_key, keybuf) == NULL) { + warn("encountered malformed mac address key\n"); + return (B_FALSE); + } + (void) snprintf(buf, bufsize, "%s", keybuf); + } + break; + case OVERLAY_TARG_DEST: + if (custr_alloc_buf(&cus, buf, bufsize) != 0) { + die("ran out of memory for printing the overlay " + "target destination"); + } + + if (point->dop_dest & OVERLAY_PLUGIN_D_ETHERNET) { + if (ether_ntoa_r(&point->dop_mac, macbuf) == NULL) { + warn("encountered malformed mac address target " + "for key %s\n", keybuf); + return (B_FALSE); + } + (void) custr_append(cus, macbuf); + } + + if (point->dop_dest & OVERLAY_PLUGIN_D_IP) { + if (IN6_IS_ADDR_V4MAPPED(&point->dop_ip)) { + struct in_addr v4; + IN6_V4MAPPED_TO_INADDR(&point->dop_ip, &v4); + if (inet_ntop(AF_INET, &v4, ipbuf, + sizeof (ipbuf)) == NULL) + abort(); + } else if (inet_ntop(AF_INET6, &point->dop_ip, ipbuf, + sizeof (ipbuf)) == NULL) { + /* + * The only failures we should get are + * EAFNOSUPPORT and ENOSPC because of buffer + * exhaustion. In either of these cases, that + * means something has gone horribly wrong. 
+ */ + abort(); + } + if (point->dop_dest & OVERLAY_PLUGIN_D_ETHERNET) + (void) custr_appendc(cus, ','); + (void) custr_append(cus, ipbuf); + } + + if (point->dop_dest & OVERLAY_PLUGIN_D_PORT) { + if (point->dop_dest & OVERLAY_PLUGIN_D_IP) + (void) custr_appendc(cus, ':'); + else if (point->dop_dest & OVERLAY_PLUGIN_D_ETHERNET) + (void) custr_appendc(cus, ','); + (void) custr_append_printf(cus, "%u", point->dop_port); + } + + custr_free(cus); + + break; + } + return (B_TRUE); +} + +/* ARGSUSED */ +static int +show_one_overlay_table_entry(dladm_handle_t handle, datalink_id_t linkid, + const struct ether_addr *key, const dladm_overlay_point_t *point, void *arg) +{ + showoverlay_targ_state_t *shot = arg; + + shot->shot_key = key; + shot->shot_point = point; + ofmt_print(shot->shot_ofmt, shot); + + return (DLADM_WALK_CONTINUE); +} + +/* ARGSUSED */ +static int +show_one_overlay_table(dladm_handle_t handle, datalink_id_t linkid, void *arg) +{ + char linkbuf[MAXLINKNAMELEN]; + showoverlay_targ_state_t shot; + datalink_class_t class; + + if (dladm_datalink_id2info(handle, linkid, NULL, &class, NULL, linkbuf, + MAXLINKNAMELEN) != DLADM_STATUS_OK || + class != DATALINK_CLASS_OVERLAY) + return (DLADM_WALK_CONTINUE); + + shot.shot_ofmt = arg; + shot.shot_linkname = linkbuf; + + (void) dladm_overlay_walk_cache(handle, linkid, + show_one_overlay_table_entry, &shot); + + return (DLADM_WALK_CONTINUE); +} + +static boolean_t +print_overlay_fma_cb(ofmt_arg_t *ofarg, char *buf, uint_t bufsize) +{ + showoverlay_fma_state_t *shof = ofarg->ofmt_cbarg; + dladm_overlay_status_t *st = shof->shof_status; + + switch (ofarg->ofmt_id) { + case OVERLAY_FMA_LINK: + (void) snprintf(buf, bufsize, "%s", shof->shof_linkname); + break; + case OVERLAY_FMA_STATUS: + (void) snprintf(buf, bufsize, st->dos_degraded == B_TRUE ? + "DEGRADED": "ONLINE"); + break; + case OVERLAY_FMA_DETAILS: + (void) snprintf(buf, bufsize, "%s", st->dos_degraded == B_TRUE ? 
+ st->dos_fmamsg : "-"); + break; + default: + abort(); + } + return (B_TRUE); +} + +/* ARGSUSED */ +static void +show_one_overlay_fma_cb(dladm_handle_t handle, datalink_id_t linkid, + dladm_overlay_status_t *stat, void *arg) +{ + showoverlay_fma_state_t *shof = arg; + shof->shof_status = stat; + ofmt_print(shof->shof_ofmt, shof); +} + + +static int +show_one_overlay_fma(dladm_handle_t handle, datalink_id_t linkid, void *arg) +{ + dladm_status_t status; + char linkbuf[MAXLINKNAMELEN]; + datalink_class_t class; + showoverlay_fma_state_t shof; + + if (dladm_datalink_id2info(handle, linkid, NULL, &class, NULL, linkbuf, + MAXLINKNAMELEN) != DLADM_STATUS_OK || + class != DATALINK_CLASS_OVERLAY) { + die("datalink %s is not an overlay device\n", linkbuf); + } + + shof.shof_ofmt = arg; + shof.shof_linkname = linkbuf; + + status = dladm_overlay_status(handle, linkid, + show_one_overlay_fma_cb, &shof); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "failed to obtain device status for %s", + linkbuf); + + return (DLADM_WALK_CONTINUE); +} + +static void +do_show_overlay(int argc, char *argv[], const char *use) +{ + int i, opt; + datalink_id_t linkid = DATALINK_ALL_LINKID; + dladm_status_t status; + int (*funcp)(dladm_handle_t, datalink_id_t, void *); + char *fields_str = NULL; + const ofmt_field_t *fieldsp; + ofmt_status_t oferr; + boolean_t parse; + ofmt_handle_t ofmt; + uint_t ofmtflags; + int err; + + + funcp = show_one_overlay; + fieldsp = overlay_fields; + parse = B_FALSE; + ofmtflags = OFMT_WRAP; + while ((opt = getopt_long(argc, argv, ":o:pft", overlay_show_lopts, + NULL)) != -1) { + switch (opt) { + case 'f': + funcp = show_one_overlay_fma; + fieldsp = overlay_fma_fields; + break; + case 'o': + fields_str = optarg; + break; + case 'p': + parse = B_TRUE; + ofmtflags = OFMT_PARSABLE; + break; + case 't': + funcp = show_one_overlay_table; + fieldsp = overlay_targ_fields; + break; + default: + die_opterr(optopt, opt, use); + } + } + + if (fields_str != NULL && 
strcasecmp(fields_str, "all") == 0) + fields_str = NULL; + + oferr = ofmt_open(fields_str, fieldsp, ofmtflags, 0, &ofmt); + ofmt_check(oferr, parse, ofmt, die, warn); + + err = 0; + if (argc > optind) { + for (i = optind; i < argc; i++) { + status = dladm_name2info(handle, argv[i], &linkid, + NULL, NULL, NULL); + if (status != DLADM_STATUS_OK) { + warn_dlerr(status, "failed to find %s", + argv[i]); + err = 1; + continue; + } + (void) funcp(handle, linkid, ofmt); + } + } else { + (void) dladm_walk_datalink_id(funcp, handle, ofmt, + DATALINK_CLASS_OVERLAY, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } + ofmt_close(ofmt); + + exit(err); +} + +static void +do_modify_overlay(int argc, char *argv[], const char *use) +{ + int opt, ocnt = 0; + boolean_t flush, set, delete; + struct ether_addr e; + char *dest = NULL; + datalink_id_t linkid = DATALINK_ALL_LINKID; + dladm_status_t status; + + flush = set = delete = B_FALSE; + while ((opt = getopt_long(argc, argv, ":fd:s:", overlay_modify_lopts, + NULL)) != -1) { + switch (opt) { + case 'd': + if (delete == B_TRUE) + die_optdup('d'); + delete = B_TRUE; + ocnt++; + if (ether_aton_r(optarg, &e) == NULL) + die("invalid mac address: %s\n", optarg); + break; + case 'f': + if (flush == B_TRUE) + die_optdup('f'); + flush = B_TRUE; + ocnt++; + break; + case 's': + if (set == B_TRUE) + die_optdup('s'); + set = B_TRUE; + ocnt++; + dest = strchr(optarg, '='); + *dest = '\0'; + dest++; + if (dest == NULL) + die("malformed value, expected mac=dest, " + "got: %s\n", optarg); + if (ether_aton_r(optarg, &e) == NULL) + die("invalid mac address: %s\n", optarg); + break; + default: + die_opterr(optopt, opt, use); + } + } + + if (ocnt == 0) + die("need to specify one of -d, -f, or -s"); + if (ocnt > 1) + die("only one of -d, -f, or -s may be used"); + + if (argv[optind] == NULL) + die("missing required overlay device\n"); + if (argc > optind + 1) + die("only one overlay device may be specified\n"); + + status = dladm_name2info(handle, 
argv[optind], &linkid, NULL, NULL, + NULL); + if (status != DLADM_STATUS_OK) { + die_dlerr(status, "failed to find overlay %s", argv[optind]); + } + + if (flush == B_TRUE) { + status = dladm_overlay_cache_flush(handle, linkid); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "failed to flush target cache for " + "overlay %s", argv[optind]); + } + + if (delete == B_TRUE) { + status = dladm_overlay_cache_delete(handle, linkid, &e); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "failed to flush target %s from " + "overlay target cache %s", optarg, argv[optind]); + } + + if (set == B_TRUE) { + status = dladm_overlay_cache_set(handle, linkid, &e, dest); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "failed to set target %s for overlay " + "target cache %s", optarg, argv[optind]); + } + +} diff --git a/usr/src/cmd/varpd/Makefile b/usr/src/cmd/varpd/Makefile new file mode 100644 index 0000000000..4d9e29cd26 --- /dev/null +++ b/usr/src/cmd/varpd/Makefile @@ -0,0 +1,64 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. 
+# + +PROG= varpd +OBJS = varpd.o +SRCS = $(OBJS:%.o=../%.c) +MANIFEST = varpd.xml +ROOTLIBVARPD = $(ROOTLIB)/varpd +ROOTLIBVARPDPROG= $(PROG:%=$(ROOTLIBVARPD)/%) + + +include ../Makefile.cmd +include ../Makefile.ctf + +ROOTMANIFESTDIR= $(ROOTSVCNETWORK) + +CLEANFILES += $(OBJS) +CPPFLAGS += -D_REENTRANT +CFLAGS += $(CCVERBOSE) +LDLIBS += -lvarpd -lumem -lscf +$(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG + +CSTD= $(CSTD_GNU99) + +.KEEP_STATE: + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDLIBS) + $(POST_PROCESS) + +clean: + -$(RM) $(CLEANFILES) + +%.o: ../%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +check: $(CHKMANIFEST) + +install: $(PROG) $(ROOTLIBVARPDPROG) $(ROOTMANIFEST) + +$(ROOTLIBVARPD): + $(INS.dir) + +$(ROOTLIBVARPD)/%: % $(ROOTLIBVARPD) + $(INS.file) + +FRC: + +include ../Makefile.targ diff --git a/usr/src/cmd/varpd/varpd.c b/usr/src/cmd/varpd/varpd.c new file mode 100644 index 0000000000..1b013417f8 --- /dev/null +++ b/usr/src/cmd/varpd/varpd.c @@ -0,0 +1,526 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2021 Joyent, Inc. + */ + +/* + * virtual arp daemon -- varpd + * + * The virtual arp daemon is the user land counterpart to the overlay driver. To + * truly understand its purpose and how it fits into things, you should read the + * overlay big theory statement in uts/common/io/overlay/overlay.c. 
+ * + * varpd's purpose is to provide a means for looking up the destination on the + * underlay network for a host on an overlay network and to also be a door + * server such that dladm(1M) via libdladm can configure and get useful status + * information. The heavy lifting is all done by libvarpd and the various lookup + * plugins. + * + * When varpd first starts up, we take care of chdiring into /var/run/varpd, + * which is also where we create /var/run/varpd/varpd.door, our door server. + * After that we daemonize and only after we daemonize do we go ahead and load + * plugins. The reason that we don't load plugins before daemonizing is that + * they could very well be creating threads and thus lose them all. In general, + * we want to make things easier on our children and not require them to be + * fork safe. + * + * Once it's spun up, the main varpd thread sits in sigsuspend and really just + * hangs out waiting for something; libvarpd handles everything else. + */ + +#include <libvarpd.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <signal.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <libgen.h> +#include <stdarg.h> +#include <stdlib.h> +#include <paths.h> +#include <limits.h> +#include <sys/corectl.h> +#include <signal.h> +#include <strings.h> +#include <sys/wait.h> +#include <unistd.h> +#include <thread.h> +#include <priv.h> +#include <libscf.h> + +#define VARPD_EXIT_REQUESTED SMF_EXIT_OK +#define VARPD_EXIT_FATAL SMF_EXIT_ERR_FATAL +#define VARPD_EXIT_USAGE SMF_EXIT_ERR_CONFIG + +#define VARPD_RUNDIR "/var/run/varpd" +#define VARPD_DEFAULT_DOOR "/var/run/varpd/varpd.door" + +#define VARPD_PG "varpd" +#define VARPD_PROP_INC "include_path" + +static varpd_handle_t *varpd_handle; +static const char *varpd_pname; +static volatile boolean_t varpd_exit = B_FALSE; + +/* + * Debug builds are automatically wired up for umem debugging.
+ */ +#ifdef DEBUG +const char * +_umem_debug_init() +{ + return ("default,verbose"); +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); +} +#endif /* DEBUG */ + +static void +varpd_vwarn(FILE *out, const char *fmt, va_list ap) +{ + int error = errno; + + (void) fprintf(out, "%s: ", varpd_pname); + (void) vfprintf(out, fmt, ap); + + if (fmt[strlen(fmt) - 1] != '\n') + (void) fprintf(out, ": %s\n", strerror(error)); +} + +static void +varpd_fatal(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + varpd_vwarn(stderr, fmt, ap); + va_end(ap); + + exit(VARPD_EXIT_FATAL); +} + +static void +varpd_dfatal(int dfd, const char *fmt, ...) +{ + int status = VARPD_EXIT_FATAL; + va_list ap; + + va_start(ap, fmt); + varpd_vwarn(stdout, fmt, ap); + va_end(ap); + + /* Take a single shot at this */ + (void) write(dfd, &status, sizeof (status)); + exit(status); +} + +/* ARGSUSED */ +static int +varpd_plugin_walk_cb(varpd_handle_t *vph, const char *name, void *unused) +{ + (void) printf("loaded %s!\n", name); + return (0); +} + +static int +varpd_dir_setup(void) +{ + int fd; + + if (mkdir(VARPD_RUNDIR, 0700) != 0) { + if (errno != EEXIST) + varpd_fatal("failed to create %s: %s", VARPD_RUNDIR, + strerror(errno)); + } + + fd = open(VARPD_RUNDIR, O_RDONLY); + if (fd < 0) + varpd_fatal("failed to open %s: %s", VARPD_RUNDIR, + strerror(errno)); + + if (fchown(fd, UID_NETADM, GID_NETADM) != 0) + varpd_fatal("failed to chown %s: %s\n", VARPD_RUNDIR, + strerror(errno)); + + return (fd); +} + +/* + * Because varpd is generally run under SMF, we opt to keep its stdout and + * stderr to be whatever our parent set them up to be. 
+ */ +static void +varpd_fd_setup(void) +{ + int dupfd; + + closefrom(STDERR_FILENO + 1); + dupfd = open(_PATH_DEVNULL, O_RDONLY); + if (dupfd < 0) + varpd_fatal("failed to open %s: %s", _PATH_DEVNULL, + strerror(errno)); + if (dup2(dupfd, STDIN_FILENO) == -1) + varpd_fatal("failed to dup out stdin: %s", strerror(errno)); +} + +/* + * We borrow fmd's daemonization style. Basically, the parent waits for the + * child to successfully set up a door and recover all of the old configurations + * before we say that we're good to go. + */ +static int +varpd_daemonize(int dirfd) +{ + char path[PATH_MAX]; + struct rlimit rlim; + sigset_t set, oset; + int estatus, pfds[2]; + pid_t child; + priv_set_t *pset; + + /* + * Set a per-process core path to be inside of /var/run/varpd. Make sure + * that we aren't limited in our dump size. + */ + (void) snprintf(path, sizeof (path), + "/var/run/varpd/core.%s.%%p", varpd_pname); + (void) core_set_process_path(path, strlen(path) + 1, getpid()); + + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + (void) setrlimit(RLIMIT_CORE, &rlim); + + /* + * Claim as many file descriptors as the system will let us. + */ + if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) { + rlim.rlim_cur = rlim.rlim_max; + (void) setrlimit(RLIMIT_NOFILE, &rlim); + } + + /* + * chdir /var/run/varpd + */ + if (fchdir(dirfd) != 0) + varpd_fatal("failed to chdir to %s", VARPD_RUNDIR); + + + /* + * At this point block all signals going in so we don't have the parent + * mistakingly exit when the child is running, but never block SIGABRT. + */ + if (sigfillset(&set) != 0) + abort(); + if (sigdelset(&set, SIGABRT) != 0) + abort(); + if (sigprocmask(SIG_BLOCK, &set, &oset) != 0) + abort(); + + /* + * Do the fork+setsid dance. 
+ */ + if (pipe(pfds) != 0) + varpd_fatal("failed to create pipe for daemonizing"); + + if ((child = fork()) == -1) + varpd_fatal("failed to fork for daemonizing"); + + if (child != 0) { + /* We'll be exiting shortly, so allow for silent failure */ + (void) close(pfds[1]); + if (read(pfds[0], &estatus, sizeof (estatus)) == + sizeof (estatus)) + _exit(estatus); + + if (waitpid(child, &estatus, 0) == child && WIFEXITED(estatus)) + _exit(WEXITSTATUS(estatus)); + + _exit(VARPD_EXIT_FATAL); + } + + /* + * Drop privileges here. + * + * We should make sure we keep around PRIV_NET_PRIVADDR and + * PRIV_SYS_DLCONFIG, but drop everything else; however, keep basic + * privs and have our child drop them. + * + * We should also run as netadm:netadm and drop all of our groups. + */ + if (setgroups(0, NULL) != 0) + abort(); + if (setgid(GID_NETADM) == -1 || seteuid(UID_NETADM) == -1) + abort(); + if ((pset = priv_allocset()) == NULL) + abort(); + priv_basicset(pset); + if (priv_delset(pset, PRIV_PROC_EXEC) == -1 || + priv_delset(pset, PRIV_PROC_INFO) == -1 || + priv_delset(pset, PRIV_PROC_FORK) == -1 || + priv_delset(pset, PRIV_PROC_SESSION) == -1 || + priv_delset(pset, PRIV_FILE_LINK_ANY) == -1 || + priv_addset(pset, PRIV_SYS_DL_CONFIG) == -1 || + priv_addset(pset, PRIV_NET_PRIVADDR) == -1) { + abort(); + } + /* + * Remove privs from the permitted set. That will cause them to be + * removed from the effective set. We want to make sure that in the case + * of a vulnerability, something can't get back in here and wreak more + * havoc. But if we want non-basic privs in the effective set, we have + * to request them explicitly. 
+ */ + if (setppriv(PRIV_SET, PRIV_PERMITTED, pset) == -1) + abort(); + if (setppriv(PRIV_SET, PRIV_EFFECTIVE, pset) == -1) + abort(); + + priv_freeset(pset); + + if (close(pfds[0]) != 0) + abort(); + if (setsid() == -1) + abort(); + if (sigprocmask(SIG_SETMASK, &oset, NULL) != 0) + abort(); + (void) umask(0022); + + return (pfds[1]); +} + +static int +varpd_setup_lookup_threads(void) +{ + int ret; + long i, ncpus = sysconf(_SC_NPROCESSORS_ONLN) * 2 + 1; + + if (ncpus <= 0) + abort(); + for (i = 0; i < ncpus; i++) { + thread_t thr; + + ret = thr_create(NULL, 0, libvarpd_overlay_lookup_run, + varpd_handle, THR_DETACHED | THR_DAEMON, &thr); + if (ret != 0) + return (ret); + } + + return (0); +} + +static void +varpd_cleanup(void) +{ + varpd_exit = B_TRUE; +} + +/* + * Load default information from SMF and apply any of if necessary. We recognize + * the following properties: + * + * varpd/include_path Treat these as a series of -i options. + * + * If we're not under SMF, just move on. + */ +static void +varpd_load_smf(int dfd) +{ + char *fmri, *inc; + scf_simple_prop_t *prop; + + if ((fmri = getenv("SMF_FMRI")) == NULL) + return; + + if ((prop = scf_simple_prop_get(NULL, fmri, VARPD_PG, + VARPD_PROP_INC)) == NULL) + return; + + while ((inc = scf_simple_prop_next_astring(prop)) != NULL) { + int err = libvarpd_plugin_load(varpd_handle, inc); + if (err != 0) { + varpd_dfatal(dfd, "failed to load from %s: %s\n", + inc, strerror(err)); + } + } + + scf_simple_prop_free(prop); +} + +/* + * There are a bunch of things we need to do to be a proper daemon here. + * + * o Ensure that /var/run/varpd exists or create it + * o make stdin /dev/null (stdout?) + * o Ensure any other fds that we somehow inherited are closed, eg. + * closefrom() + * o Properly daemonize + * o Mask all signals except sigabrt before creating our first door -- all + * other doors will inherit from that. + * o Have the main thread sigsuspend looking for most things that are + * actionable... 
+ */ +int +main(int argc, char *argv[]) +{ + int err, c, dirfd, dfd, i; + const char *doorpath = VARPD_DEFAULT_DOOR; + sigset_t set; + struct sigaction act; + int nincpath = 0, nextincpath = 0; + char **incpath = NULL; + + varpd_pname = basename(argv[0]); + + /* + * We want to clean up our file descriptors before we do anything else + * as we can't assume that libvarpd won't open file descriptors, etc. + */ + varpd_fd_setup(); + + if ((err = libvarpd_create(&varpd_handle)) != 0) { + varpd_fatal("failed to open a libvarpd handle"); + return (1); + } + + while ((c = getopt(argc, argv, ":i:d:")) != -1) { + switch (c) { + case 'i': + if (nextincpath == nincpath) { + if (nincpath == 0) + nincpath = 16; + else + nincpath *= 2; + incpath = realloc(incpath, sizeof (char *) * + nincpath); + if (incpath == NULL) { + (void) fprintf(stderr, "failed to " + "allocate memory for the %dth " + "-I option: %s\n", nextincpath + 1, + strerror(errno)); + } + + } + incpath[nextincpath] = optarg; + nextincpath++; + break; + case 'd': + doorpath = optarg; + break; + default: + (void) fprintf(stderr, "unknown option: %c\n", c); + return (1); + } + } + + dirfd = varpd_dir_setup(); + + (void) libvarpd_plugin_walk(varpd_handle, varpd_plugin_walk_cb, NULL); + + dfd = varpd_daemonize(dirfd); + + /* + * Now that we're in the child, go ahead and load all of our plug-ins. + * We do this, in part, because these plug-ins may need threads of their + * own and fork won't preserve those and we'd rather the plug-ins don't + * have to learn about fork-handlers. 
+ */ + for (i = 0; i < nextincpath; i++) { + err = libvarpd_plugin_load(varpd_handle, incpath[i]); + if (err != 0) { + varpd_dfatal(dfd, "failed to load from %s: %s\n", + incpath[i], strerror(err)); + } + } + + varpd_load_smf(dfd); + + if ((err = libvarpd_persist_enable(varpd_handle, VARPD_RUNDIR)) != 0) + varpd_dfatal(dfd, "failed to enable varpd persistence: %s\n", + strerror(err)); + + if ((err = libvarpd_persist_restore(varpd_handle)) != 0) + varpd_dfatal(dfd, "failed to enable varpd persistence: %s\n", + strerror(err)); + + /* + * The ur-door thread will inherit from this signal mask. So set it to + * what we want before doing anything else. In addition, so will our + * threads that handle varpd lookups. + */ + if (sigfillset(&set) != 0) + varpd_dfatal(dfd, "failed to fill a signal set..."); + + if (sigdelset(&set, SIGABRT) != 0) + varpd_dfatal(dfd, "failed to unmask SIGABRT"); + + if (sigprocmask(SIG_BLOCK, &set, NULL) != 0) + varpd_dfatal(dfd, "failed to set our door signal mask"); + + if ((err = varpd_setup_lookup_threads()) != 0) + varpd_dfatal(dfd, "failed to create lookup threads: %s\n", + strerror(err)); + + if ((err = libvarpd_door_server_create(varpd_handle, doorpath)) != 0) + varpd_dfatal(dfd, "failed to create door server at %s: %s\n", + doorpath, strerror(err)); + + /* + * At this point, finish up signal initialization and finally go ahead, + * notify the parent that we're okay, and enter the sigsuspend loop. 
+ */ + bzero(&act, sizeof (struct sigaction)); + act.sa_handler = varpd_cleanup; + if (sigfillset(&act.sa_mask) != 0) + varpd_dfatal(dfd, "failed to fill sigaction mask"); + act.sa_flags = 0; + if (sigaction(SIGHUP, &act, NULL) != 0) + varpd_dfatal(dfd, "failed to register HUP handler"); + if (sigdelset(&set, SIGHUP) != 0) + varpd_dfatal(dfd, "failed to remove HUP from mask"); + if (sigaction(SIGQUIT, &act, NULL) != 0) + varpd_dfatal(dfd, "failed to register QUIT handler"); + if (sigdelset(&set, SIGQUIT) != 0) + varpd_dfatal(dfd, "failed to remove QUIT from mask"); + if (sigaction(SIGINT, &act, NULL) != 0) + varpd_dfatal(dfd, "failed to register INT handler"); + if (sigdelset(&set, SIGINT) != 0) + varpd_dfatal(dfd, "failed to remove INT from mask"); + if (sigaction(SIGTERM, &act, NULL) != 0) + varpd_dfatal(dfd, "failed to register TERM handler"); + if (sigdelset(&set, SIGTERM) != 0) + varpd_dfatal(dfd, "failed to remove TERM from mask"); + + err = 0; + (void) write(dfd, &err, sizeof (err)); + (void) close(dfd); + + for (;;) { + if (sigsuspend(&set) == -1) + if (errno == EFAULT) + abort(); + if (varpd_exit == B_TRUE) + break; + } + + libvarpd_door_server_destroy(varpd_handle); + libvarpd_destroy(varpd_handle); + + return (VARPD_EXIT_REQUESTED); +} diff --git a/usr/src/cmd/varpd/varpd.xml b/usr/src/cmd/varpd/varpd.xml new file mode 100644 index 0000000000..df7015a3d6 --- /dev/null +++ b/usr/src/cmd/varpd/varpd.xml @@ -0,0 +1,67 @@ +<?xml version="1.0"?> +<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1"> +<!-- +This file and its contents are supplied under the terms of the +Common Development and Distribution License ("CDDL"), version 1.0. +You may only use this file in accordance with the terms of version +1.0 of the CDDL. + +A full copy of the text of the CDDL should have accompanied this +source. A copy of the CDDL is also available via the Internet at +http://www.illumos.org/license/CDDL. + +Copyright 2018, Joyent, Inc. 
+--> + +<service_bundle type="manifest" name="illumos:varpd" > + + <service name="network/varpd" type="service" version="1" > + + <create_default_instance enabled="true" /> + + <single_instance/> + + <dependency name="varpd-network-physical" + grouping="require_all" + restart_on="none" + type="service"> + <service_fmri value="svc:/network/physical:default" /> + </dependency> + + <dependency name="varpd-device-local" + grouping="require_all" + restart_on="none" + type="service"> + <service_fmri value="svc:/system/device/local:default" /> + </dependency> + + <exec_method + type="method" + name="start" + exec="/usr/lib/varpd/varpd" + timeout_seconds="60" /> + + <exec_method + type="method" + name="stop" + exec=":kill" + timeout_seconds="10" /> + + <property_group name='varpd' type='application'> + <property name='include_path' type='astring'> + <astring_list> + <value_node value='/usr/lib/varpd'/> + </astring_list> + </property> + </property_group> + + <stability value='Unstable' /> + + <template> + <common_name> + <loctext xml:lang="C">virtual ARP daemon + </loctext> + </common_name> + </template> + </service> +</service_bundle> diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile index aa163cc3bf..d1a33d262d 100644 --- a/usr/src/lib/Makefile +++ b/usr/src/lib/Makefile @@ -272,6 +272,7 @@ SUBDIRS += \ sun_fc \ sun_sas \ udapl \ + varpd \ watchmalloc \ $($(MACH)_SUBDIRS) @@ -491,6 +492,7 @@ HDRSUBDIRS= \ smbsrv \ smhba \ udapl \ + varpd \ $($(MACH)_HDRSUBDIRS) i386_HDRSUBDIRS= \ @@ -609,7 +611,7 @@ libdhcputil: libgen libinetutil libdlpi libdiskmgt: libdevid libdevinfo libadm libefi libkstat libsysevent $(INTEL_BLD)libdiskmgt: libfdisk libdladm: libdevinfo libinetutil libscf librcm libexacct libkstat \ - libpool + libpool varpd libdlpi: libinetutil libdladm libds: libsysevent libdtrace: libproc libgen libctf libmapmalloc @@ -721,6 +723,8 @@ storage: libdevice libdevinfo libdevid sun_fc: libdevinfo libsysevent sun_sas: libdevinfo libsysevent libkstat libdevid 
udapl: libdevinfo libdladm +varpd: libavl libidspace libumem libnsl libnvpair libmd5 librename \ + libcustr # # The reason this rule checks for the existence of the diff --git a/usr/src/lib/libdladm/Makefile b/usr/src/lib/libdladm/Makefile index 5202579b6c..e4825d91da 100644 --- a/usr/src/lib/libdladm/Makefile +++ b/usr/src/lib/libdladm/Makefile @@ -20,6 +20,7 @@ # # # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright 2015, Joyent, Inc. # # @@ -29,7 +30,7 @@ HDRS = libdladm.h libdladm_impl.h libdllink.h libdlaggr.h \ libdlwlan.h libdlwlan_impl.h libdlvnic.h libdlvlan.h \ libdlmgmt.h libdlflow.h libdlflow_impl.h libdlstat.h \ libdlether.h libdlsim.h libdlbridge.h libdliptun.h \ - libdlib.h + libdlib.h libdloverlay.h HDRDIR = common @@ -71,7 +72,13 @@ TYPELIST = \ dlmgmt_getconfsnapshot_retval_t \ dlmgmt_door_zoneboot_t \ dlmgmt_remapid_retval_t \ - dlmgmt_createid_retval_t + dlmgmt_createid_retval_t \ + overlay_ioc_create_t \ + overlay_ioc_activate_t \ + overlay_ioc_delete_t \ + overlay_ioc_nprops_t \ + overlay_ioc_propinfo_t \ + overlay_ioc_prop_t all := TARGET = all clean := TARGET = clean diff --git a/usr/src/lib/libdladm/Makefile.com b/usr/src/lib/libdladm/Makefile.com index d170a97998..13a5e8384a 100644 --- a/usr/src/lib/libdladm/Makefile.com +++ b/usr/src/lib/libdladm/Makefile.com @@ -28,7 +28,8 @@ VERS = .1 OBJECTS = libdladm.o secobj.o linkprop.o libdllink.o libdlaggr.o \ libdlwlan.o libdlvnic.o libdlmgmt.o libdlvlan.o libdlib.o\ flowattr.o flowprop.o propfuncs.o libdlflow.o libdlstat.o \ - usage.o libdlether.o libdlsim.o libdlbridge.o libdliptun.o + usage.o libdlether.o libdlsim.o libdlbridge.o libdliptun.o \ + libdloverlay.o include ../../Makefile.lib @@ -37,7 +38,7 @@ include ../../Makefile.rootfs LIBS = $(DYNLIB) LDLIBS += -ldevinfo -lc -linetutil -lsocket -lscf -lrcm -lnvpair \ - -lexacct -lkstat -lpool + -lexacct -lkstat -lpool -lvarpd SRCDIR = ../common @@ -51,9 +52,10 @@ CPPFLAGS += -I$(SRCDIR) -D_REENTRANT # 
not linted SMATCH=off +CSTD= $(CSTD_GNU99) + .KEEP_STATE: all: $(LIBS) - include $(SRC)/lib/Makefile.targ diff --git a/usr/src/lib/libdladm/common/libdladm.c b/usr/src/lib/libdladm/common/libdladm.c index eb099376a4..55e6d3e1e0 100644 --- a/usr/src/lib/libdladm/common/libdladm.c +++ b/usr/src/lib/libdladm/common/libdladm.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ /* @@ -37,6 +38,9 @@ #include <strings.h> #include <dirent.h> #include <stdlib.h> +#include <assert.h> +#include <stdio.h> +#include <stdarg.h> #include <netinet/in.h> #include <arpa/inet.h> #include <sys/param.h> @@ -440,6 +444,9 @@ dladm_status2str(dladm_status_t status, char *buf) case DLADM_STATUS_PERSIST_ON_TEMP: s = "can't create persistent object on top of temporary object"; break; + case DLADM_STATUS_BAD_ENCAP: + s = "invalid encapsulation protocol"; + break; default: s = "<unknown error>"; break; @@ -672,6 +679,9 @@ dladm_class2str(datalink_class_t class, char *buf) case DATALINK_CLASS_PART: s = "part"; break; + case DATALINK_CLASS_OVERLAY: + s = "overlay"; + break; default: s = "unknown"; break; @@ -1157,15 +1167,15 @@ dladm_strs2range(char **prop_val, uint_t val_cnt, mac_propval_type_t type, * Convert a mac_propval_range_t structure into an array of elements. */ dladm_status_t -dladm_range2list(mac_propval_range_t *rangep, void *elem, uint_t *nelem) +dladm_range2list(const mac_propval_range_t *rangep, void *elem, uint_t *nelem) { int i, j, k; dladm_status_t status = DLADM_STATUS_OK; switch (rangep->mpr_type) { case MAC_PROPVAL_UINT32: { - mac_propval_uint32_range_t *ur; - uint32_t *elem32 = elem; + const mac_propval_uint32_range_t *ur; + uint32_t *elem32 = elem; k = 0; ur = &rangep->mpr_range_uint32[0]; @@ -1193,13 +1203,13 @@ dladm_range2list(mac_propval_range_t *rangep, void *elem, uint_t *nelem) * of single elements or ranges. 
*/ int -dladm_range2strs(mac_propval_range_t *rangep, char **prop_val) +dladm_range2strs(const mac_propval_range_t *rangep, char **prop_val) { int i; switch (rangep->mpr_type) { case MAC_PROPVAL_UINT32: { - mac_propval_uint32_range_t *ur; + const mac_propval_uint32_range_t *ur; /* Write ranges and individual elements */ ur = &rangep->mpr_range_uint32[0]; @@ -1216,6 +1226,20 @@ dladm_range2strs(mac_propval_range_t *rangep, char **prop_val) } return (0); } + case MAC_PROPVAL_STR: { + const mac_propval_str_range_t *str; + size_t coff, len; + + coff = 0; + str = &rangep->u.mpr_str; + for (i = 0; i < rangep->mpr_count; i++) { + len = strlen(&str->mpur_data[coff]); + (void) strlcpy(prop_val[i], &str->mpur_data[coff], + DLADM_PROP_VAL_MAX); + coff += len + 1; + } + return (0); + } default: break; } @@ -1293,3 +1317,54 @@ dladm_list2range(void *elem, uint_t nelem, mac_propval_type_t type, return (status); } + +void +dladm_errlist_init(dladm_errlist_t *erl) +{ + bzero(erl, sizeof (dladm_errlist_t)); +} + +void +dladm_errlist_reset(dladm_errlist_t *erl) +{ + uint_t i; + + for (i = 0; i < erl->el_count; i++) + free(erl->el_errs[i]); + free(erl->el_errs); + dladm_errlist_init(erl); +} + +dladm_status_t +dladm_errlist_append(dladm_errlist_t *erl, const char *fmt, ...) 
+{ + int ret; + va_list ap; + char *m = NULL; + + if (erl->el_count == erl->el_alloc) { + int alloc; + void *addr; + if (erl->el_alloc == 0) { + assert(erl->el_errs == NULL); + alloc = 32; + } else { + alloc = erl->el_alloc + 32; + } + addr = realloc(erl->el_errs, sizeof (char *) * alloc); + if (addr == NULL) + return (DLADM_STATUS_NOMEM); + + erl->el_errs = addr; + erl->el_alloc = alloc; + } + + va_start(ap, fmt); + ret = vasprintf(&m, fmt, ap); + va_end(ap); + if (ret == -1) + return (dladm_errno2status(errno)); + erl->el_errs[erl->el_count] = m; + erl->el_count++; + return (DLADM_STATUS_OK); +} diff --git a/usr/src/lib/libdladm/common/libdladm.h b/usr/src/lib/libdladm/common/libdladm.h index 350c9c50f3..5a97bacaa0 100644 --- a/usr/src/lib/libdladm/common/libdladm.h +++ b/usr/src/lib/libdladm/common/libdladm.h @@ -23,6 +23,7 @@ */ /* + * Copyright 2015, Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association */ @@ -179,7 +180,8 @@ typedef enum { DLADM_STATUS_INVALID_PKEY_TBL_SIZE, DLADM_STATUS_PORT_NOPROTO, DLADM_STATUS_INVALID_MTU, - DLADM_STATUS_PERSIST_ON_TEMP + DLADM_STATUS_PERSIST_ON_TEMP, + DLADM_STATUS_BAD_ENCAP } dladm_status_t; typedef enum { @@ -233,6 +235,12 @@ typedef struct dladm_arg_list { char *al_buf; } dladm_arg_list_t; +typedef struct dladm_errlist { + uint_t el_count; + uint_t el_alloc; + char **el_errs; +} dladm_errlist_t; + typedef enum { DLADM_LOGTYPE_LINK = 1, DLADM_LOGTYPE_FLOW @@ -294,12 +302,15 @@ extern dladm_status_t dladm_zone_halt(dladm_handle_t, zoneid_t); extern dladm_status_t dladm_strs2range(char **, uint_t, mac_propval_type_t, mac_propval_range_t **); -extern dladm_status_t dladm_range2list(mac_propval_range_t *, void*, +extern dladm_status_t dladm_range2list(const mac_propval_range_t *, void *, uint_t *); -extern int dladm_range2strs(mac_propval_range_t *, char **); +extern int dladm_range2strs(const mac_propval_range_t *, char **); extern dladm_status_t dladm_list2range(void *, uint_t, mac_propval_type_t, 
mac_propval_range_t **); +extern void dladm_errlist_init(dladm_errlist_t *); +extern void dladm_errlist_reset(dladm_errlist_t *); + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/libdladm/common/libdladm_impl.h b/usr/src/lib/libdladm/common/libdladm_impl.h index 20db1cb1d7..9cd91d56c1 100644 --- a/usr/src/lib/libdladm/common/libdladm_impl.h +++ b/usr/src/lib/libdladm/common/libdladm_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* @@ -173,6 +174,12 @@ typedef struct resource_prop_s { */ #define FBRIDGE "bridge" /* string */ +/* + * For error lists + */ +extern dladm_status_t dladm_errlist_append(dladm_errlist_t *, + const char *, ...); + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/libdladm/common/libdloverlay.c b/usr/src/lib/libdladm/common/libdloverlay.c new file mode 100644 index 0000000000..a25be3d201 --- /dev/null +++ b/usr/src/lib/libdladm/common/libdloverlay.c @@ -0,0 +1,885 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#include <libdladm_impl.h> +#include <libdllink.h> +#include <libdloverlay.h> +#include <sys/dld.h> +#include <sys/overlay.h> +#include <strings.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <limits.h> +#include <libvarpd_client.h> + +#define VARPD_PROPERTY_NAME "varpd/id" + +static const char *dladm_overlay_doorpath = "/var/run/varpd/varpd.door"; + +typedef struct dladm_overlay_propinfo { + boolean_t dop_isvarpd; + union { + overlay_ioc_propinfo_t *dop_overlay; + varpd_client_prop_handle_t *dop_varpd; + } dop_un; +} dladm_overlay_propinfo_t; + +dladm_status_t +dladm_overlay_prop_info(dladm_overlay_propinfo_handle_t phdl, + const char **namep, uint_t *typep, uint_t *protp, const void **defp, + uint32_t *sizep, const mac_propval_range_t **possp) +{ + dladm_overlay_propinfo_t *infop = (dladm_overlay_propinfo_t *)phdl; + overlay_ioc_propinfo_t *oinfop = infop->dop_un.dop_overlay; + + if (infop->dop_isvarpd == B_FALSE) { + if (namep != NULL) + *namep = oinfop->oipi_name; + if (typep != NULL) + *typep = oinfop->oipi_type; + if (protp != NULL) + *protp = oinfop->oipi_prot; + if (defp != NULL) + *defp = oinfop->oipi_default; + if (sizep != NULL) + *sizep = oinfop->oipi_defsize; + if (possp != NULL) { + *possp = (const mac_propval_range_t *)oinfop->oipi_poss; + } + + } else { + int ret; + ret = libvarpd_c_prop_info(infop->dop_un.dop_varpd, namep, + typep, protp, defp, sizep, possp); + if (ret != 0) + return (dladm_errno2status(ret)); + + } + + return (DLADM_STATUS_OK); +} + +static dladm_status_t +dladm_overlay_parse_prop(overlay_prop_type_t type, void *buf, uint32_t *sizep, + const char *val) +{ + int ret; + int64_t ival; + uint64_t uval; + char *eptr; + struct in6_addr ipv6; + struct in_addr ip; + + switch (type) { + case OVERLAY_PROP_T_INT: + errno = 0; + ival = strtol(val, &eptr, 10); + if ((ival == 0 && errno == EINVAL) || + ((ival == LONG_MAX || ival == LONG_MIN) && + errno == 
ERANGE)) + return (DLADM_STATUS_BADARG); + bcopy(&ival, buf, sizeof (int64_t)); + *sizep = sizeof (int64_t); + break; + case OVERLAY_PROP_T_UINT: + errno = 0; + uval = strtol(val, &eptr, 10); + if ((uval == 0 && errno == EINVAL) || + (uval == ULONG_MAX && errno == ERANGE)) + return (DLADM_STATUS_BADARG); + bcopy(&uval, buf, sizeof (uint64_t)); + *sizep = sizeof (uint64_t); + break; + case OVERLAY_PROP_T_STRING: + ret = strlcpy((char *)buf, val, OVERLAY_PROP_SIZEMAX); + if (ret >= OVERLAY_PROP_SIZEMAX) + return (DLADM_STATUS_BADARG); + *sizep = ret + 1; + break; + case OVERLAY_PROP_T_IP: + /* + * Always try to parse the IP as an IPv6 address. If that fails, + * try to interpret it as an IPv4 address and transform it into + * an IPv6 mapped IPv4 address. + */ + if (inet_pton(AF_INET6, val, &ipv6) != 1) { + if (inet_pton(AF_INET, val, &ip) != 1) + return (DLADM_STATUS_BADARG); + + IN6_INADDR_TO_V4MAPPED(&ip, &ipv6); + } + bcopy(&ipv6, buf, sizeof (struct in6_addr)); + *sizep = sizeof (struct in6_addr); + break; + default: + abort(); + } + + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +dladm_overlay_varpd_setprop(dladm_handle_t handle, varpd_client_handle_t *chdl, + uint64_t inst, const char *name, char *const *valp, uint_t cnt) +{ + int ret; + uint32_t size; + uint8_t buf[LIBVARPD_PROP_SIZEMAX]; + varpd_client_prop_handle_t *phdl; + uint_t type; + dladm_status_t status; + + if ((ret = libvarpd_c_prop_handle_alloc(chdl, inst, &phdl)) != 0) + return (dladm_errno2status(ret)); + + if ((ret = libvarpd_c_prop_info_fill_by_name(phdl, name)) != 0) { + libvarpd_c_prop_handle_free(phdl); + return (dladm_errno2status(ret)); + } + + if ((ret = libvarpd_c_prop_info(phdl, NULL, &type, NULL, NULL, NULL, + NULL)) != 0) { + libvarpd_c_prop_handle_free(phdl); + return (dladm_errno2status(ret)); + } + + if ((status = dladm_overlay_parse_prop(type, buf, &size, valp[0])) != + DLADM_STATUS_OK) { + libvarpd_c_prop_handle_free(phdl); + return (status); + } + + ret 
= libvarpd_c_prop_set(phdl, buf, size); + libvarpd_c_prop_handle_free(phdl); + + return (dladm_errno2status(ret)); +} + +dladm_status_t +dladm_overlay_setprop(dladm_handle_t handle, datalink_id_t linkid, + const char *name, char *const *valp, uint_t cnt) +{ + int ret; + dladm_status_t status; + overlay_ioc_propinfo_t info; + overlay_ioc_prop_t prop; + + if (linkid == DATALINK_INVALID_LINKID || + name == NULL || valp == NULL || cnt != 1) + return (DLADM_STATUS_BADARG); + + bzero(&info, sizeof (overlay_ioc_propinfo_t)); + info.oipi_linkid = linkid; + info.oipi_id = -1; + if (strlcpy(info.oipi_name, name, OVERLAY_PROP_NAMELEN) >= + OVERLAY_PROP_NAMELEN) + return (DLADM_STATUS_BADARG); + + status = DLADM_STATUS_OK; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_PROPINFO, &info); + if (ret != 0) + status = dladm_errno2status(errno); + + if (status != DLADM_STATUS_OK) + return (status); + + prop.oip_linkid = linkid; + prop.oip_id = info.oipi_id; + prop.oip_name[0] = '\0'; + if ((ret = dladm_overlay_parse_prop(info.oipi_type, prop.oip_value, + &prop.oip_size, valp[0])) != DLADM_STATUS_OK) + return (ret); + + status = DLADM_STATUS_OK; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_SETPROP, &prop); + if (ret != 0) + status = dladm_errno2status(errno); + + return (ret); +} + +/* + * Tell the user about any unset required properties. 
+ */ +static int +dladm_overlay_activate_cb(dladm_handle_t handle, datalink_id_t linkid, + dladm_overlay_propinfo_handle_t phdl, void *arg) +{ + dladm_status_t status; + uint8_t buf[DLADM_OVERLAY_PROP_SIZEMAX]; + uint_t prot; + size_t size = sizeof (buf); + const char *name; + dladm_errlist_t *errs = arg; + + if ((status = dladm_overlay_prop_info(phdl, &name, NULL, &prot, NULL, + NULL, NULL)) != DLADM_STATUS_OK) + return (status); + + if ((prot & OVERLAY_PROP_PERM_REQ) == 0) + return (DLADM_WALK_CONTINUE); + + if (dladm_overlay_get_prop(handle, linkid, phdl, buf, &size) != + DLADM_STATUS_OK) + return (DLADM_WALK_CONTINUE); + + if (size == 0) + (void) dladm_errlist_append(errs, "unset required property: %s", + name); + + return (DLADM_WALK_CONTINUE); +} + +/* + * We need to clean up the world here. The problem is that we may or may not + * actually have everything created. While in the normal case, we'd always have + * an overlay device, assigned datalink id, and a varpd instance, we might not + * have any of those, except for the datalink instance. Therefore, as long as + * the id refers to a valid overlay, we should try to clean up as much of the + * state as possible and most importantly, we need to make sure we delete the + * datalink id. If we fail to do that, then that name will become lost to time. 
+ */ +dladm_status_t +dladm_overlay_delete(dladm_handle_t handle, datalink_id_t linkid) +{ + datalink_class_t class; + overlay_ioc_delete_t oid; + varpd_client_handle_t *chdl; + int ret; + uint32_t flags; + uint64_t varpdid; + + if (dladm_datalink_id2info(handle, linkid, &flags, &class, NULL, + NULL, 0) != DLADM_STATUS_OK) + return (DLADM_STATUS_BADARG); + + if (class != DATALINK_CLASS_OVERLAY) + return (DLADM_STATUS_BADARG); + + oid.oid_linkid = linkid; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_DELETE, &oid); + if (ret != 0 && errno != ENOENT) { + return (dladm_errno2status(errno)); + } + + if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0) { + return (dladm_errno2status(ret)); + } + + if ((ret = libvarpd_c_instance_lookup(chdl, linkid, &varpdid)) != 0) { + if (ret == ENOENT) { + goto finish; + } + (void) libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + ret = libvarpd_c_instance_destroy(chdl, varpdid); +finish: + (void) libvarpd_c_destroy(chdl); + (void) dladm_destroy_datalink_id(handle, linkid, flags); + + return (dladm_errno2status(ret)); +} + +dladm_status_t +dladm_overlay_get_prop(dladm_handle_t handle, datalink_id_t linkid, + dladm_overlay_propinfo_handle_t infohdl, void *buf, size_t *sizep) +{ + int ret; + overlay_ioc_prop_t oip; + dladm_overlay_propinfo_t *infop = (dladm_overlay_propinfo_t *)infohdl; + + /* + * It'd be nice if we had a better or more specific error for this. If + * this kind of error becomes common place, let's get a better dladm + * error. 
+ */ + if (*sizep < DLADM_OVERLAY_PROP_SIZEMAX) + return (dladm_errno2status(ERANGE)); + + if (infop->dop_isvarpd == B_FALSE) { + bzero(&oip, sizeof (overlay_ioc_prop_t)); + oip.oip_linkid = linkid; + oip.oip_id = infop->dop_un.dop_overlay->oipi_id; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_GETPROP, &oip); + if (ret != 0) + return (dladm_errno2status(errno)); + bcopy(oip.oip_value, buf, DLADM_OVERLAY_PROP_SIZEMAX); + *sizep = oip.oip_size; + } else { + uint32_t size = *sizep; + + ret = libvarpd_c_prop_get(infop->dop_un.dop_varpd, buf, &size); + if (ret != 0) + return (dladm_errno2status(errno)); + *sizep = size; + } + + return (DLADM_STATUS_OK); +} + +static dladm_status_t +dladm_overlay_walk_varpd_prop(dladm_handle_t handle, datalink_id_t linkid, + uint64_t varpdid, dladm_overlay_prop_f func, void *arg) +{ + int ret, i; + varpd_client_handle_t *chdl; + varpd_client_prop_handle_t *phdl; + uint_t nprops; + dladm_status_t status; + + if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0) + return (dladm_errno2status(ret)); + + if ((ret = libvarpd_c_prop_handle_alloc(chdl, varpdid, &phdl)) != 0) { + (void) libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + if ((ret = libvarpd_c_prop_nprops(chdl, varpdid, &nprops)) != 0) { + libvarpd_c_prop_handle_free(phdl); + (void) libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + status = DLADM_STATUS_OK; + for (i = 0; i < nprops; i++) { + dladm_overlay_propinfo_t dop; + + bzero(&dop, sizeof (dop)); + dop.dop_isvarpd = B_TRUE; + dop.dop_un.dop_varpd = phdl; + + if ((ret = libvarpd_c_prop_info_fill(phdl, i)) != 0) { + status = dladm_errno2status(ret); + break; + } + + ret = func(handle, linkid, + (dladm_overlay_propinfo_handle_t)&dop, arg); + if (ret == DLADM_WALK_TERMINATE) + break; + } + + libvarpd_c_prop_handle_free(phdl); + libvarpd_c_destroy(chdl); + + return (status); +} + +dladm_status_t +dladm_overlay_walk_prop(dladm_handle_t handle, datalink_id_t linkid, + 
dladm_overlay_prop_f func, void *arg, dladm_errlist_t *errs) +{ + int i, ret; + datalink_class_t class; + overlay_ioc_nprops_t oin; + overlay_ioc_propinfo_t oipi; + dladm_overlay_propinfo_t dop; + uint64_t varpdid = UINT64_MAX; + + if (dladm_datalink_id2info(handle, linkid, NULL, &class, NULL, + NULL, 0) != DLADM_STATUS_OK) + return (DLADM_STATUS_BADARG); + + if (class != DATALINK_CLASS_OVERLAY) + return (DLADM_STATUS_BADARG); + + bzero(&oin, sizeof (overlay_ioc_nprops_t)); + oin.oipn_linkid = linkid; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_NPROPS, &oin); + if (ret != 0) + return (dladm_errno2status(errno)); + + for (i = 0; i < oin.oipn_nprops; i++) { + bzero(&dop, sizeof (dladm_overlay_propinfo_t)); + bzero(&oipi, sizeof (overlay_ioc_propinfo_t)); + oipi.oipi_linkid = linkid; + oipi.oipi_id = i; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_PROPINFO, &oipi); + if (ret != 0) { + (void) dladm_errlist_append(errs, "failed to get " + "propinfo for property %d: %s", i, strerror(errno)); + return (dladm_errno2status(errno)); + } + + dop.dop_isvarpd = B_FALSE; + dop.dop_un.dop_overlay = &oipi; + ret = func(handle, linkid, + (dladm_overlay_propinfo_handle_t)&dop, arg); + if (ret == DLADM_WALK_TERMINATE) + break; + + if (strcmp(oipi.oipi_name, VARPD_PROPERTY_NAME) == 0) { + uint8_t buf[DLADM_OVERLAY_PROP_SIZEMAX]; + size_t bufsize = sizeof (buf); + uint64_t *vp; + + if (dladm_overlay_get_prop(handle, linkid, + (dladm_overlay_propinfo_handle_t)&dop, buf, + &bufsize) != DLADM_STATUS_OK) + continue; + + vp = (uint64_t *)buf; + varpdid = *vp; + } + } + + /* Should this really be possible? 
*/ + if (varpdid == UINT64_MAX) + return (DLADM_STATUS_OK); + + return (dladm_overlay_walk_varpd_prop(handle, linkid, varpdid, func, + arg)); +} + +dladm_status_t +dladm_overlay_create(dladm_handle_t handle, const char *name, + const char *encap, const char *search, uint64_t vid, + dladm_arg_list_t *props, dladm_errlist_t *errs, uint32_t flags) +{ + int ret, i; + dladm_status_t status; + datalink_id_t linkid; + overlay_ioc_create_t oic; + overlay_ioc_activate_t oia; + size_t slen; + varpd_client_handle_t *vch; + uint64_t id; + + status = dladm_create_datalink_id(handle, name, DATALINK_CLASS_OVERLAY, + DL_ETHER, flags, &linkid); + if (status != DLADM_STATUS_OK) + return (status); + + bzero(&oic, sizeof (oic)); + oic.oic_linkid = linkid; + oic.oic_vnetid = vid; + (void) strlcpy(oic.oic_encap, encap, MAXLINKNAMELEN); + + status = DLADM_STATUS_OK; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_CREATE, &oic); + if (ret != 0) { + /* + * It'd be nice if we had private errors so we could better + * distinguish between different classes of errors. + */ + status = dladm_errno2status(errno); + } + + if (status != DLADM_STATUS_OK) { + (void) dladm_destroy_datalink_id(handle, linkid, flags); + return (status); + } + + slen = strlen(search); + for (i = 0; props != NULL && i < props->al_count; i++) { + dladm_arg_info_t *aip = &props->al_info[i]; + + /* + * If it's a property for the search plugin, eg. it has the + * prefix '<search>/', then we don't set the property on the + * overlay device and instead set it on the varpd instance. 
+ */ + if (strncmp(aip->ai_name, search, slen) == 0 && + aip->ai_name[slen] == '/') + continue; + status = dladm_overlay_setprop(handle, linkid, aip->ai_name, + aip->ai_val, aip->ai_count); + if (status != DLADM_STATUS_OK) { + (void) dladm_errlist_append(errs, + "failed to set property %s", + aip->ai_name); + (void) dladm_overlay_delete(handle, linkid); + return (status); + } + } + + if ((ret = libvarpd_c_create(&vch, dladm_overlay_doorpath)) != 0) { + (void) dladm_errlist_append(errs, + "failed to create libvarpd handle: %s", strerror(ret)); + (void) dladm_overlay_delete(handle, linkid); + return (dladm_errno2status(ret)); + } + + if ((ret = libvarpd_c_instance_create(vch, linkid, search, + &id)) != 0) { + (void) dladm_errlist_append(errs, + "failed to create varpd instance: %s", strerror(ret)); + libvarpd_c_destroy(vch); + (void) dladm_overlay_delete(handle, linkid); + return (dladm_errno2status(ret)); + } + + for (i = 0; props != NULL && i < props->al_count; i++) { + dladm_arg_info_t *aip = &props->al_info[i]; + + /* + * Skip arguments we've processed already. 
+ */ + if (strncmp(aip->ai_name, search, slen) != 0) + continue; + + if (aip->ai_name[slen] != '/') + continue; + + ret = dladm_overlay_varpd_setprop(handle, vch, id, aip->ai_name, + aip->ai_val, aip->ai_count); + if (ret != 0) { + (void) dladm_errlist_append(errs, + "failed to set varpd prop: %s\n", + aip->ai_name); + (void) libvarpd_c_instance_destroy(vch, id); + libvarpd_c_destroy(vch); + (void) dladm_overlay_delete(handle, linkid); + return (dladm_errno2status(ret)); + } + } + + if ((ret = libvarpd_c_instance_activate(vch, id)) != 0) { + (void) dladm_errlist_append(errs, + "failed to activate varpd instance: %s", strerror(ret)); + (void) dladm_overlay_walk_varpd_prop(handle, linkid, id, + dladm_overlay_activate_cb, errs); + (void) libvarpd_c_instance_destroy(vch, id); + libvarpd_c_destroy(vch); + (void) dladm_overlay_delete(handle, linkid); + return (dladm_errno2status(ret)); + + } + + bzero(&oia, sizeof (oia)); + oia.oia_linkid = linkid; + status = DLADM_STATUS_OK; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_ACTIVATE, &oia); + if (ret != 0) { + ret = errno; + (void) dladm_errlist_append(errs, "failed to activate " + "device: %s", strerror(ret)); + (void) libvarpd_c_instance_destroy(vch, id); + (void) dladm_overlay_walk_prop(handle, linkid, + dladm_overlay_activate_cb, errs, errs); + status = dladm_errno2status(ret); + (void) libvarpd_c_instance_destroy(vch, id); + } + + libvarpd_c_destroy(vch); + if (status != DLADM_STATUS_OK) + (void) dladm_overlay_delete(handle, linkid); + + return (status); +} + + + +typedef struct overlay_walk_cb { + dladm_handle_t owc_handle; + datalink_id_t owc_linkid; + void *owc_arg; + dladm_overlay_cache_f owc_func; + uint_t owc_mode; + uint_t owc_dest; +} overlay_walk_cb_t; + +/* ARGSUSED */ +static int +dladm_overlay_walk_cache_cb(varpd_client_handle_t *chdl, uint64_t varpdid, + const struct ether_addr *key, const varpd_client_cache_entry_t *entry, + void *arg) +{ + overlay_walk_cb_t *owc = arg; + dladm_overlay_point_t point; + 
+ bzero(&point, sizeof (dladm_overlay_point_t)); + point.dop_dest = owc->owc_dest; + point.dop_mac = entry->vcp_mac; + point.dop_flags = entry->vcp_flags; + point.dop_ip = entry->vcp_ip; + point.dop_port = entry->vcp_port; + + if (owc->owc_mode == OVERLAY_TARGET_POINT) + point.dop_flags |= DLADM_OVERLAY_F_DEFAULT; + + if (owc->owc_func(owc->owc_handle, owc->owc_linkid, key, &point, + owc->owc_arg) == DLADM_WALK_TERMINATE) + return (1); + return (0); +} + +dladm_status_t +dladm_overlay_walk_cache(dladm_handle_t handle, datalink_id_t linkid, + dladm_overlay_cache_f func, void *arg) +{ + int ret; + uint_t mode, dest; + uint64_t varpdid; + varpd_client_handle_t *chdl; + overlay_walk_cb_t cbarg; + + if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0) + return (dladm_errno2status(ret)); + + if ((ret = libvarpd_c_instance_lookup(chdl, linkid, &varpdid)) != 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + if ((ret = libvarpd_c_instance_target_mode(chdl, varpdid, + &dest, &mode)) != 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + cbarg.owc_handle = handle; + cbarg.owc_linkid = linkid; + cbarg.owc_arg = arg; + cbarg.owc_func = func; + cbarg.owc_dest = dest; + cbarg.owc_mode = mode; + ret = libvarpd_c_instance_cache_walk(chdl, varpdid, + dladm_overlay_walk_cache_cb, &cbarg); + libvarpd_c_destroy(chdl); + + return (dladm_errno2status(ret)); +} + +/* ARGSUSED */ +dladm_status_t +dladm_overlay_cache_flush(dladm_handle_t handle, datalink_id_t linkid) +{ + int ret; + uint64_t varpdid; + varpd_client_handle_t *chdl; + + if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0) + return (dladm_errno2status(ret)); + + if ((ret = libvarpd_c_instance_lookup(chdl, linkid, &varpdid)) != 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + ret = libvarpd_c_instance_cache_flush(chdl, varpdid); + libvarpd_c_destroy(chdl); + + return (dladm_errno2status(ret)); +} + +/* ARGSUSED */ 
+dladm_status_t +dladm_overlay_cache_delete(dladm_handle_t handle, datalink_id_t linkid, + const struct ether_addr *key) +{ + int ret; + uint64_t varpdid; + varpd_client_handle_t *chdl; + + if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0) + return (dladm_errno2status(ret)); + + if ((ret = libvarpd_c_instance_lookup(chdl, linkid, &varpdid)) != 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + ret = libvarpd_c_instance_cache_delete(chdl, varpdid, key); + libvarpd_c_destroy(chdl); + + return (dladm_errno2status(ret)); +} + +/* ARGSUSED */ +dladm_status_t +dladm_overlay_cache_set(dladm_handle_t handle, datalink_id_t linkid, + const struct ether_addr *key, char *val) +{ + int ret; + uint_t dest; + uint64_t varpdid; + char *ip, *port = NULL; + varpd_client_handle_t *chdl; + varpd_client_cache_entry_t vcp; + + + if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0) + return (dladm_errno2status(ret)); + + if ((ret = libvarpd_c_instance_lookup(chdl, linkid, &varpdid)) != 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + if ((ret = libvarpd_c_instance_target_mode(chdl, varpdid, + &dest, NULL)) != 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + /* + * Mode tells us what we should expect in val. It we have more than one + * thing listed, the canonical format of it right now is mac,ip:port. 
+ */ + bzero(&vcp, sizeof (varpd_client_cache_entry_t)); + + if (strcasecmp(val, "drop") == 0) { + vcp.vcp_flags = OVERLAY_TARGET_CACHE_DROP; + goto send; + } + + if (dest & OVERLAY_PLUGIN_D_ETHERNET) { + if (ether_aton_r(val, &vcp.vcp_mac) == NULL) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(EINVAL)); + } + } + + if (dest & OVERLAY_PLUGIN_D_IP) { + if (dest & OVERLAY_PLUGIN_D_ETHERNET) { + if ((ip = strchr(val, ',')) == NULL) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + ip++; + } else { + ip = val; + } + + if (dest & OVERLAY_PLUGIN_D_PORT) { + if ((port = strchr(val, ':')) == NULL) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + *port = '\0'; + port++; + } + + /* Try v6, then fall back to v4 */ + ret = inet_pton(AF_INET6, ip, &vcp.vcp_ip); + if (ret == -1) + abort(); + if (ret == 0) { + struct in_addr v4; + + ret = inet_pton(AF_INET, ip, &v4); + if (ret == -1) + abort(); + if (ret == 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + IN6_INADDR_TO_V4MAPPED(&v4, &vcp.vcp_ip); + } + } + + if (dest & OVERLAY_PLUGIN_D_PORT) { + char *eptr; + unsigned long l; + if (port == NULL && (dest & OVERLAY_PLUGIN_D_ETHERNET)) { + if ((port = strchr(val, ',')) == NULL) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(EINVAL)); + } + } else if (port == NULL) + port = val; + + errno = 0; + l = strtoul(port, &eptr, 10); + if (errno != 0 || *eptr != '\0') { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(EINVAL)); + } + if (l == 0 || l > UINT16_MAX) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(EINVAL)); + } + vcp.vcp_port = l; + } + +send: + ret = libvarpd_c_instance_cache_set(chdl, varpdid, key, &vcp); + + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); +} + +/* ARGSUSED */ +dladm_status_t +dladm_overlay_cache_get(dladm_handle_t handle, datalink_id_t linkid, + const struct ether_addr *key, dladm_overlay_point_t *point) +{ + int ret; + uint_t dest, 
mode; + uint64_t varpdid; + varpd_client_handle_t *chdl; + varpd_client_cache_entry_t entry; + + if ((ret = libvarpd_c_create(&chdl, dladm_overlay_doorpath)) != 0) + return (dladm_errno2status(ret)); + + if ((ret = libvarpd_c_instance_lookup(chdl, linkid, &varpdid)) != 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + if ((ret = libvarpd_c_instance_target_mode(chdl, varpdid, + &dest, &mode)) != 0) { + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); + } + + ret = libvarpd_c_instance_cache_get(chdl, varpdid, key, &entry); + if (ret == 0) { + point->dop_dest = dest; + point->dop_mac = entry.vcp_mac; + point->dop_flags = entry.vcp_flags; + point->dop_ip = entry.vcp_ip; + point->dop_port = entry.vcp_port; + if (mode == OVERLAY_TARGET_POINT) + point->dop_flags |= DLADM_OVERLAY_F_DEFAULT; + } + + libvarpd_c_destroy(chdl); + return (dladm_errno2status(ret)); +} + +dladm_status_t +dladm_overlay_status(dladm_handle_t handle, datalink_id_t linkid, + dladm_overlay_status_f func, void *arg) +{ + int ret; + dladm_status_t status; + overlay_ioc_status_t ois; + dladm_overlay_status_t dos; + + ois.ois_linkid = linkid; + status = DLADM_STATUS_OK; + ret = ioctl(dladm_dld_fd(handle), OVERLAY_IOC_STATUS, &ois); + if (ret != 0) + status = dladm_errno2status(errno); + if (status != DLADM_STATUS_OK) + return (status); + + dos.dos_degraded = ois.ois_status == OVERLAY_I_DEGRADED ? B_TRUE : + B_FALSE; + (void) strlcpy(dos.dos_fmamsg, ois.ois_message, + sizeof (dos.dos_fmamsg)); + func(handle, linkid, &dos, arg); + return (DLADM_STATUS_OK); +} diff --git a/usr/src/lib/libdladm/common/libdloverlay.h b/usr/src/lib/libdladm/common/libdloverlay.h new file mode 100644 index 0000000000..39b01ccae3 --- /dev/null +++ b/usr/src/lib/libdladm/common/libdloverlay.h @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. + */ + +#ifndef _LIBDLOVERLAY_H +#define _LIBDLOVERLAY_H + +/* + * libdladm Overlay device routines + */ + +#include <libdladm.h> +#include <libdladm_impl.h> +#include <sys/overlay.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define DLADM_OVERLAY_F_DROP 0x0001 +#define DLADM_OVERLAY_F_DEFAULT 0xf000 + +typedef struct dladm_overlay_point { + uint_t dop_dest; + struct ether_addr dop_mac; + uint16_t dop_flags; + struct in6_addr dop_ip; + uint16_t dop_port; +} dladm_overlay_point_t; + +typedef struct dladm_overlay_status { + boolean_t dos_degraded; + char dos_fmamsg[256]; +} dladm_overlay_status_t; + +extern dladm_status_t dladm_overlay_create(dladm_handle_t, const char *, + const char *, const char *, uint64_t, dladm_arg_list_t *, dladm_errlist_t *, + uint32_t); +extern dladm_status_t dladm_overlay_delete(dladm_handle_t, datalink_id_t); + +typedef void (*dladm_overlay_status_f)(dladm_handle_t, datalink_id_t, + dladm_overlay_status_t *, void *); +extern dladm_status_t dladm_overlay_status(dladm_handle_t, datalink_id_t, + dladm_overlay_status_f, void *); + +extern dladm_status_t dladm_overlay_cache_flush(dladm_handle_t, datalink_id_t); +extern dladm_status_t dladm_overlay_cache_delete(dladm_handle_t, datalink_id_t, + const struct ether_addr *); +extern dladm_status_t dladm_overlay_cache_set(dladm_handle_t, datalink_id_t, + const struct ether_addr *, char *); +extern dladm_status_t dladm_overlay_cache_get(dladm_handle_t, datalink_id_t, + const struct ether_addr *, dladm_overlay_point_t *); + +#define DLADM_OVERLAY_PROP_SIZEMAX 256 +#define DLADM_OVERLAY_PROP_NAMELEN 32 + +typedef struct __dladm_overlay_propinfo *dladm_overlay_propinfo_handle_t; + 
+extern dladm_status_t dladm_overlay_prop_info(dladm_overlay_propinfo_handle_t, + const char **, uint_t *, uint_t *, const void **, uint32_t *, + const mac_propval_range_t **); +extern dladm_status_t dladm_overlay_get_prop(dladm_handle_t, datalink_id_t, + dladm_overlay_propinfo_handle_t, void *buf, size_t *bufsize); + +typedef int (*dladm_overlay_prop_f)(dladm_handle_t, datalink_id_t, + dladm_overlay_propinfo_handle_t, void *); +extern dladm_status_t dladm_overlay_walk_prop(dladm_handle_t, datalink_id_t, + dladm_overlay_prop_f, void *arg, dladm_errlist_t *); + +typedef int (*dladm_overlay_cache_f)(dladm_handle_t, datalink_id_t, + const struct ether_addr *, const dladm_overlay_point_t *, void *); +extern dladm_status_t dladm_overlay_walk_cache(dladm_handle_t, datalink_id_t, + dladm_overlay_cache_f, void *); + +/* + * Some day we'll want to support being able to set properties after creation. + * If we do, the following strawman API might serve us well. + * + * extern dladm_status_t dladm_overlay_prop_lookup(dladm_handle_t, + * datalink_id_t, const char *, dladm_overlay_propinfo_handle_t *); + * extern void dladm_overlay_prop_handle_free(dladm_handle_t, datalink_id_t, + * dladm_overlay_propinfo_handle_t *); + * extern dladm_status_t dladm_overlay_set_prop(dladm_handle_t, datalink_id_t, + * dladm_propinfo_handle_t, void *buf, size_t *bufsize); + * extern dladm_status_t dladm_overlay_str_to_buf(dladm_handle_t, datalink_id_t, + * dladm_overlay_propinfo_handle_t *, const char *, void *, size_t *); + * extern dladm_status_t dladm_overlay_buf_to_str(dladm_handle_t, datalink_id_t, + * dladm_overlay_propinfo_handle_t *, const void *, const size_t, char *, + * size_t *); + */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBDLOVERLAY_H */ diff --git a/usr/src/lib/libdladm/common/libdlvlan.c b/usr/src/lib/libdladm/common/libdlvlan.c index 943728dc03..34c1e6682d 100644 --- a/usr/src/lib/libdladm/common/libdlvlan.c +++ b/usr/src/lib/libdladm/common/libdlvlan.c @@ -64,7 +64,7 @@ 
dladm_vlan_create(dladm_handle_t handle, const char *vlan, datalink_id_t linkid, { return (dladm_vnic_create(handle, vlan, linkid, VNIC_MAC_ADDR_TYPE_PRIMARY, NULL, 0, NULL, 0, vid, VRRP_VRID_NONE, - AF_UNSPEC, vlan_id_out, proplist, flags | DLADM_OPT_VLAN)); + AF_UNSPEC, vlan_id_out, proplist, NULL, flags | DLADM_OPT_VLAN)); } /* diff --git a/usr/src/lib/libdladm/common/libdlvnic.c b/usr/src/lib/libdladm/common/libdlvnic.c index bad25e69ed..73c001b744 100644 --- a/usr/src/lib/libdladm/common/libdlvnic.c +++ b/usr/src/lib/libdladm/common/libdlvnic.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ @@ -400,7 +401,7 @@ dladm_vnic_create(dladm_handle_t handle, const char *vnic, datalink_id_t linkid, vnic_mac_addr_type_t mac_addr_type, uchar_t *mac_addr, uint_t mac_len, int *mac_slot, uint_t mac_prefix_len, uint16_t vid, vrid_t vrid, int af, datalink_id_t *vnic_id_out, dladm_arg_list_t *proplist, - uint32_t flags) + dladm_errlist_t *errs, uint32_t flags) { dladm_vnic_attr_t attr; datalink_id_t vnic_id; @@ -567,8 +568,14 @@ dladm_vnic_create(dladm_handle_t handle, const char *vnic, datalink_id_t linkid, status = dladm_set_linkprop(handle, vnic_id, aip->ai_name, aip->ai_val, aip->ai_count, DLADM_OPT_PERSIST); - if (status != DLADM_STATUS_OK) + if (status != DLADM_STATUS_OK) { + char errmsg[DLADM_STRSIZE]; + (void) dladm_errlist_append(errs, + "failed to set property %s: %s", + aip->ai_name, + dladm_status2str(status, errmsg)); break; + } } } diff --git a/usr/src/lib/libdladm/common/libdlvnic.h b/usr/src/lib/libdladm/common/libdlvnic.h index 94b656aadf..839b2de9f2 100644 --- a/usr/src/lib/libdladm/common/libdlvnic.h +++ b/usr/src/lib/libdladm/common/libdlvnic.h @@ -55,7 +55,8 @@ typedef struct dladm_vnic_attr { extern dladm_status_t dladm_vnic_create(dladm_handle_t, const char *, datalink_id_t, vnic_mac_addr_type_t, 
uchar_t *, uint_t, int *, uint_t, uint16_t, vrid_t, int, - datalink_id_t *, dladm_arg_list_t *, uint32_t); + datalink_id_t *, dladm_arg_list_t *, + dladm_errlist_t *, uint32_t); extern dladm_status_t dladm_vnic_delete(dladm_handle_t, datalink_id_t, uint32_t); diff --git a/usr/src/lib/libdladm/common/mapfile-vers b/usr/src/lib/libdladm/common/mapfile-vers index 63a86529fc..eba6118ace 100644 --- a/usr/src/lib/libdladm/common/mapfile-vers +++ b/usr/src/lib/libdladm/common/mapfile-vers @@ -20,6 +20,7 @@ # # # Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright 2015 Joyent, Inc. # # @@ -269,6 +270,23 @@ SYMBOL_VERSION SUNWprivate_1.1 { dladm_strs2range; dladm_range2list; dladm_list2range; + + dladm_errlist_init; + dladm_errlist_reset; + dladm_errlist_append; + + dladm_overlay_create; + dladm_overlay_delete; + dladm_overlay_status; + dladm_overlay_prop_info; + dladm_overlay_get_prop; + dladm_overlay_walk_prop; + + dladm_overlay_cache_set; + dladm_overlay_cache_get; + dladm_overlay_cache_delete; + dladm_overlay_cache_flush; + dladm_overlay_walk_cache; local: *; }; diff --git a/usr/src/lib/varpd/Makefile b/usr/src/lib/varpd/Makefile new file mode 100644 index 0000000000..0962119d1c --- /dev/null +++ b/usr/src/lib/varpd/Makefile @@ -0,0 +1,33 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. 
+# + +SUBDIRS = libvarpd direct files + +all := TARGET = all +clean := TARGET = clean +clobber := TARGET = clobber +check := TARGET = check +install := TARGET = install +install_h := TARGET = install_h + +.KEEP_STATE: + +all clean clobber install install_h check: $(SUBDIRS) +direct files svp: libvarpd + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: diff --git a/usr/src/lib/varpd/Makefile.plugin b/usr/src/lib/varpd/Makefile.plugin new file mode 100644 index 0000000000..48f188500c --- /dev/null +++ b/usr/src/lib/varpd/Makefile.plugin @@ -0,0 +1,19 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +ROOTLIBDIR = $(ROOT)/usr/lib/varpd +ROOTLIBDIR64 = $(ROOT)/usr/lib/varpd/$(MACH64) + +MAPFILES += ../../libvarpd/common/mapfile-plugin diff --git a/usr/src/lib/varpd/direct/Makefile b/usr/src/lib/varpd/direct/Makefile new file mode 100644 index 0000000000..511ea1f94d --- /dev/null +++ b/usr/src/lib/varpd/direct/Makefile @@ -0,0 +1,39 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. 
+# + +include ../../Makefile.lib + +SUBDIRS = $(MACH) +$(BUILD64)SUBDIRS += $(MACH64) + +all := TARGET = all +clean := TARGET = clean +clobber := TARGET = clobber +install := TARGET = install + +.KEEP_STATE: + +all clean clobber install: $(SUBDIRS) + +install_h: + +check: + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: + +include ../../Makefile.targ diff --git a/usr/src/lib/varpd/direct/Makefile.com b/usr/src/lib/varpd/direct/Makefile.com new file mode 100644 index 0000000000..4e8564bae0 --- /dev/null +++ b/usr/src/lib/varpd/direct/Makefile.com @@ -0,0 +1,35 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +LIBRARY = libvarpd_direct.a +VERS = .1 +OBJECTS = libvarpd_direct.o + +include ../../../Makefile.lib +include ../../Makefile.plugin + +LIBS = $(DYNLIB) +LDLIBS += -lc -lumem -lnvpair +CPPFLAGS += -I../common + +CSTD= $(CSTD_GNU99) + +SRCDIR = ../common + +.KEEP_STATE: + +all: $(LIBS) + +include ../../../Makefile.targ diff --git a/usr/src/lib/varpd/direct/amd64/Makefile b/usr/src/lib/varpd/direct/amd64/Makefile new file mode 100644 index 0000000000..1881990d79 --- /dev/null +++ b/usr/src/lib/varpd/direct/amd64/Makefile @@ -0,0 +1,19 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright 2015 Joyent, Inc. +# + +include ../Makefile.com +include ../../../Makefile.lib.64 + +install: all $(ROOTLIBS64) $(ROOTLINKS64) diff --git a/usr/src/lib/varpd/direct/common/libvarpd_direct.c b/usr/src/lib/varpd/direct/common/libvarpd_direct.c new file mode 100644 index 0000000000..ed9f79fc7f --- /dev/null +++ b/usr/src/lib/varpd/direct/common/libvarpd_direct.c @@ -0,0 +1,411 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Point to point plug-in for varpd. + * + * This plugin implements a simple point to point plugin for a packet. It + * represents the traditional tunnel, just in overlay form. As such, the only + * properties it needs are those to determine where to send everything. At this + * time, we don't allow a multicast address; however, there's no reason that the + * direct plugin shouldn't in theory support multicast, though when implementing + * it the best path will become clear. + * + * In general this module has been designed to make it easy to support a + * destination of either IP or IP and port; however, we restrict it to the + * latter as we don't currently have an implementation that would allow us to + * test that. 
+ */ + +#include <libvarpd_provider.h> +#include <umem.h> +#include <errno.h> +#include <thread.h> +#include <synch.h> +#include <strings.h> +#include <assert.h> +#include <limits.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <libnvpair.h> + +typedef struct varpd_direct { + overlay_plugin_dest_t vad_dest; /* RO */ + mutex_t vad_lock; /* Protects the rest */ + boolean_t vad_hip; + boolean_t vad_hport; + struct in6_addr vad_ip; + uint16_t vad_port; +} varpd_direct_t; + +static const char *varpd_direct_props[] = { + "direct/dest_ip", + "direct/dest_port" +}; + +static boolean_t +varpd_direct_valid_dest(overlay_plugin_dest_t dest) +{ + if (dest & ~(OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + return (B_FALSE); + + if (!(dest & (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))) + return (B_FALSE); + + return (B_TRUE); +} + +/* ARGSUSED */ +static int +varpd_direct_create(varpd_provider_handle_t *hdl, void **outp, + overlay_plugin_dest_t dest) +{ + int ret; + varpd_direct_t *vdp; + + if (varpd_direct_valid_dest(dest) == B_FALSE) + return (ENOTSUP); + + vdp = umem_alloc(sizeof (varpd_direct_t), UMEM_DEFAULT); + if (vdp == NULL) + return (ENOMEM); + + if ((ret = mutex_init(&vdp->vad_lock, USYNC_THREAD | LOCK_ERRORCHECK, + NULL)) != 0) { + umem_free(vdp, sizeof (varpd_direct_t)); + return (ret); + } + + vdp->vad_dest = dest; + vdp->vad_hip = B_FALSE; + vdp->vad_hport = B_FALSE; + *outp = vdp; + return (0); +} + +static int +varpd_direct_start(void *arg) +{ + varpd_direct_t *vdp = arg; + + mutex_enter(&vdp->vad_lock); + if (vdp->vad_hip == B_FALSE ||((vdp->vad_dest & OVERLAY_PLUGIN_D_IP) && + vdp->vad_hport == B_FALSE)) { + mutex_exit(&vdp->vad_lock); + return (EAGAIN); + } + mutex_exit(&vdp->vad_lock); + + return (0); +} + +/* ARGSUSED */ +static void +varpd_direct_stop(void *arg) +{ +} + +static void +varpd_direct_destroy(void *arg) +{ + varpd_direct_t *vdp = arg; + + if (mutex_destroy(&vdp->vad_lock) != 0) + 
abort(); + umem_free(vdp, sizeof (varpd_direct_t)); +} + +static int +varpd_direct_default(void *arg, overlay_target_point_t *otp) +{ + varpd_direct_t *vdp = arg; + + mutex_enter(&vdp->vad_lock); + bcopy(&vdp->vad_ip, &otp->otp_ip, sizeof (struct in6_addr)); + otp->otp_port = vdp->vad_port; + mutex_exit(&vdp->vad_lock); + + return (VARPD_LOOKUP_OK); +} + +static int +varpd_direct_nprops(void *arg, uint_t *nprops) +{ + const varpd_direct_t *vdp = arg; + + *nprops = 0; + if (vdp->vad_dest & OVERLAY_PLUGIN_D_ETHERNET) + *nprops += 1; + + if (vdp->vad_dest & OVERLAY_PLUGIN_D_IP) + *nprops += 1; + + if (vdp->vad_dest & OVERLAY_PLUGIN_D_PORT) + *nprops += 1; + + assert(*nprops == 1 || *nprops == 2); + + return (0); +} + +static int +varpd_direct_propinfo(void *arg, uint_t propid, varpd_prop_handle_t *vph) +{ + varpd_direct_t *vdp = arg; + + /* + * Because we only support IP + port combos right now, prop 0 should + * always be the IP. We don't support a port without an IP. + */ + assert(vdp->vad_dest & OVERLAY_PLUGIN_D_IP); + if (propid == 0) { + libvarpd_prop_set_name(vph, varpd_direct_props[0]); + libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW); + libvarpd_prop_set_type(vph, OVERLAY_PROP_T_IP); + libvarpd_prop_set_nodefault(vph); + return (0); + } + + if (propid == 1 && vdp->vad_dest & OVERLAY_PLUGIN_D_PORT) { + libvarpd_prop_set_name(vph, varpd_direct_props[1]); + libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW); + libvarpd_prop_set_type(vph, OVERLAY_PROP_T_UINT); + libvarpd_prop_set_nodefault(vph); + libvarpd_prop_set_range_uint32(vph, 1, UINT16_MAX); + return (0); + } + + return (EINVAL); +} + +static int +varpd_direct_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep) +{ + varpd_direct_t *vdp = arg; + + /* direct/dest_ip */ + if (strcmp(pname, varpd_direct_props[0]) == 0) { + if (*sizep < sizeof (struct in6_addr)) + return (EOVERFLOW); + mutex_enter(&vdp->vad_lock); + if (vdp->vad_hip == B_FALSE) { + *sizep = 0; + } else { + bcopy(&vdp->vad_ip, 
buf, sizeof (struct in6_addr)); + *sizep = sizeof (struct in6_addr); + } + mutex_exit(&vdp->vad_lock); + return (0); + } + + /* direct/dest_port */ + if (strcmp(pname, varpd_direct_props[1]) == 0) { + uint64_t val; + + if (*sizep < sizeof (uint64_t)) + return (EOVERFLOW); + mutex_enter(&vdp->vad_lock); + if (vdp->vad_hport == B_FALSE) { + *sizep = 0; + } else { + val = vdp->vad_port; + bcopy(&val, buf, sizeof (uint64_t)); + *sizep = sizeof (uint64_t); + } + mutex_exit(&vdp->vad_lock); + return (0); + } + + return (EINVAL); +} + +static int +varpd_direct_setprop(void *arg, const char *pname, const void *buf, + const uint32_t size) +{ + varpd_direct_t *vdp = arg; + + /* direct/dest_ip */ + if (strcmp(pname, varpd_direct_props[0]) == 0) { + const struct in6_addr *ipv6 = buf; + + if (size < sizeof (struct in6_addr)) + return (EOVERFLOW); + + if (IN6_IS_ADDR_V4COMPAT(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_6TO4(ipv6)) + return (EINVAL); + + mutex_enter(&vdp->vad_lock); + bcopy(buf, &vdp->vad_ip, sizeof (struct in6_addr)); + vdp->vad_hip = B_TRUE; + mutex_exit(&vdp->vad_lock); + return (0); + } + + /* direct/dest_port */ + if (strcmp(pname, varpd_direct_props[1]) == 0) { + const uint64_t *valp = buf; + if (size < sizeof (uint64_t)) + return (EOVERFLOW); + + if (*valp == 0 || *valp > UINT16_MAX) + return (EINVAL); + + mutex_enter(&vdp->vad_lock); + vdp->vad_port = (uint16_t)*valp; + vdp->vad_hport = B_TRUE; + mutex_exit(&vdp->vad_lock); + return (0); + } + + return (EINVAL); +} + +static int +varpd_direct_save(void *arg, nvlist_t *nvp) +{ + int ret; + varpd_direct_t *vdp = arg; + + mutex_enter(&vdp->vad_lock); + if (vdp->vad_hport == B_TRUE) { + if ((ret = nvlist_add_uint16(nvp, varpd_direct_props[1], + vdp->vad_port)) != 0) { + mutex_exit(&vdp->vad_lock); + return (ret); + } + } + + if (vdp->vad_hip == B_TRUE) { + char buf[INET6_ADDRSTRLEN]; + + if (inet_ntop(AF_INET6, &vdp->vad_ip, buf, sizeof (buf)) == + NULL) + abort(); + if ((ret = nvlist_add_string(nvp, 
varpd_direct_props[0], + buf)) != 0) { + mutex_exit(&vdp->vad_lock); + return (ret); + } + } + mutex_exit(&vdp->vad_lock); + + return (0); +} + +/* ARGSUSED */ +static int +varpd_direct_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl, + overlay_plugin_dest_t dest, void **outp) +{ + int ret; + char *ipstr; + varpd_direct_t *vdp; + + if (varpd_direct_valid_dest(dest) == B_FALSE) + return (ENOTSUP); + + vdp = umem_alloc(sizeof (varpd_direct_t), UMEM_DEFAULT); + if (vdp == NULL) + return (ENOMEM); + + if ((ret = mutex_init(&vdp->vad_lock, USYNC_THREAD | LOCK_ERRORCHECK, + NULL)) != 0) { + umem_free(vdp, sizeof (varpd_direct_t)); + return (ret); + } + + if ((ret = nvlist_lookup_uint16(nvp, varpd_direct_props[1], + &vdp->vad_port)) != 0) { + if (ret != ENOENT) { + if (mutex_destroy(&vdp->vad_lock) != 0) + abort(); + umem_free(vdp, sizeof (varpd_direct_t)); + return (ret); + } + vdp->vad_hport = B_FALSE; + } else { + vdp->vad_hport = B_TRUE; + } + + if ((ret = nvlist_lookup_string(nvp, varpd_direct_props[0], + &ipstr)) != 0) { + if (ret != ENOENT) { + if (mutex_destroy(&vdp->vad_lock) != 0) + abort(); + umem_free(vdp, sizeof (varpd_direct_t)); + return (ret); + } + vdp->vad_hip = B_FALSE; + } else { + ret = inet_pton(AF_INET6, ipstr, &vdp->vad_ip); + /* + * inet_pton is only defined to return -1 with errno set to + * EAFNOSUPPORT, which really, shouldn't happen. 
+ */ + if (ret == -1) { + assert(errno == EAFNOSUPPORT); + abort(); + } + if (ret == 0) { + if (mutex_destroy(&vdp->vad_lock) != 0) + abort(); + umem_free(vdp, sizeof (varpd_direct_t)); + return (EINVAL); + } + } + + *outp = vdp; + return (0); +} + +static const varpd_plugin_ops_t varpd_direct_ops = { + 0, + varpd_direct_create, + varpd_direct_start, + varpd_direct_stop, + varpd_direct_destroy, + varpd_direct_default, + NULL, + varpd_direct_nprops, + varpd_direct_propinfo, + varpd_direct_getprop, + varpd_direct_setprop, + varpd_direct_save, + varpd_direct_restore +}; + +#pragma init(varpd_direct_init) +static void +varpd_direct_init(void) +{ + int err; + varpd_plugin_register_t *vpr; + + vpr = libvarpd_plugin_alloc(VARPD_CURRENT_VERSION, &err); + if (vpr == NULL) + return; + + vpr->vpr_mode = OVERLAY_TARGET_POINT; + vpr->vpr_name = "direct"; + vpr->vpr_ops = &varpd_direct_ops; + (void) libvarpd_plugin_register(vpr); + libvarpd_plugin_free(vpr); +} diff --git a/usr/src/lib/varpd/direct/common/mapfile-vers b/usr/src/lib/varpd/direct/common/mapfile-vers new file mode 100644 index 0000000000..6b7c5a5067 --- /dev/null +++ b/usr/src/lib/varpd/direct/common/mapfile-vers @@ -0,0 +1,35 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. 
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION SUNWprivate { + local: + *; +}; diff --git a/usr/src/lib/varpd/direct/i386/Makefile b/usr/src/lib/varpd/direct/i386/Makefile new file mode 100644 index 0000000000..4398507523 --- /dev/null +++ b/usr/src/lib/varpd/direct/i386/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +include ../Makefile.com + +install: all $(ROOTLIBS) $(ROOTLINKS) diff --git a/usr/src/lib/varpd/files/Makefile b/usr/src/lib/varpd/files/Makefile new file mode 100644 index 0000000000..511ea1f94d --- /dev/null +++ b/usr/src/lib/varpd/files/Makefile @@ -0,0 +1,39 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. 
+# + +include ../../Makefile.lib + +SUBDIRS = $(MACH) +$(BUILD64)SUBDIRS += $(MACH64) + +all := TARGET = all +clean := TARGET = clean +clobber := TARGET = clobber +install := TARGET = install + +.KEEP_STATE: + +all clean clobber install: $(SUBDIRS) + +install_h: + +check: + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: + +include ../../Makefile.targ diff --git a/usr/src/lib/varpd/files/Makefile.com b/usr/src/lib/varpd/files/Makefile.com new file mode 100644 index 0000000000..13ff2149ce --- /dev/null +++ b/usr/src/lib/varpd/files/Makefile.com @@ -0,0 +1,36 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +LIBRARY = libvarpd_files.a +VERS = .1 +OBJECTS = libvarpd_files.o \ + libvarpd_files_json.o + +include ../../../Makefile.lib +include ../../Makefile.plugin + +LIBS = $(DYNLIB) +LDLIBS += -lc -lumem -lnvpair -lsocket -lcustr +CPPFLAGS += -I../common + +CSTD= $(CSTD_GNU99) + +SRCDIR = ../common + +.KEEP_STATE: + +all: $(LIBS) + +include ../../../Makefile.targ diff --git a/usr/src/lib/varpd/files/amd64/Makefile b/usr/src/lib/varpd/files/amd64/Makefile new file mode 100644 index 0000000000..1881990d79 --- /dev/null +++ b/usr/src/lib/varpd/files/amd64/Makefile @@ -0,0 +1,19 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +include ../Makefile.com +include ../../../Makefile.lib.64 + +install: all $(ROOTLIBS64) $(ROOTLINKS64) diff --git a/usr/src/lib/varpd/files/common/libvarpd_files.c b/usr/src/lib/varpd/files/common/libvarpd_files.c new file mode 100644 index 0000000000..84cb27f9e8 --- /dev/null +++ b/usr/src/lib/varpd/files/common/libvarpd_files.c @@ -0,0 +1,605 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +/* + * Files based plug-in for varpd + * + * This is a dynamic varpd plug-in that has a static backing store. It's really + * nothing more than a glorified version of /etc/ethers, though it facilitates + * a bit more. The files module allows for the full set of mappings to be fixed + * at creation time. In addition, it also provides support for proxying ARP, + * NDP, and DHCP. + * + * At this time, the plugin requires that the destination type involve both an + * IP address and a port; however, there's no reason that this cannot be made + * more flexible as we have additional encapsulation algorithms that support it. + * The plug-in only has a single property, which is the location of the JSON + * file. The JSON file itself looks something like: + * + * { + * "aa:bb:cc:dd:ee:ff": { + * "arp": "10.23.69.1", + * "ndp": "2600:3c00::f03c:91ff:fe96:a264", + * "ip": "192.168.1.1", + * "port": 8080 + * }, + * ... 
+ * } + */ + +#include <libvarpd_provider.h> +#include <umem.h> +#include <errno.h> +#include <thread.h> +#include <synch.h> +#include <strings.h> +#include <assert.h> +#include <limits.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/ethernet.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#include <libvarpd_files_json.h> + +typedef struct varpd_files { + overlay_plugin_dest_t vaf_dest; /* RO */ + varpd_provider_handle_t *vaf_hdl; /* RO */ + char *vaf_path; /* WO */ + nvlist_t *vaf_nvl; /* WO */ + uint64_t vaf_nmisses; /* Atomic */ + uint64_t vaf_narp; /* Atomic */ +} varpd_files_t; + +static const char *varpd_files_props[] = { + "files/config" +}; + +static boolean_t +varpd_files_valid_dest(overlay_plugin_dest_t dest) +{ + if (dest & ~(OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + return (B_FALSE); + + if (!(dest & (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))) + return (B_FALSE); + + return (B_TRUE); +} + +static int +varpd_files_create(varpd_provider_handle_t *hdl, void **outp, + overlay_plugin_dest_t dest) +{ + varpd_files_t *vaf; + + if (varpd_files_valid_dest(dest) == B_FALSE) + return (ENOTSUP); + + vaf = umem_alloc(sizeof (varpd_files_t), UMEM_DEFAULT); + if (vaf == NULL) + return (ENOMEM); + + bzero(vaf, sizeof (varpd_files_t)); + vaf->vaf_dest = dest; + vaf->vaf_path = NULL; + vaf->vaf_nvl = NULL; + vaf->vaf_hdl = hdl; + *outp = vaf; + return (0); +} + +static int +varpd_files_normalize_nvlist(varpd_files_t *vaf, nvlist_t *nvl) +{ + int ret; + nvlist_t *out; + nvpair_t *pair; + + if ((ret = nvlist_alloc(&out, NV_UNIQUE_NAME, 0)) != 0) + return (ret); + + for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) { + char *name, fname[ETHERADDRSTRL]; + nvlist_t *data; + struct ether_addr ether, *e; + e = ðer; + + if (nvpair_type(pair) != DATA_TYPE_NVLIST) { + nvlist_free(out); + 
return (EINVAL); + } + + name = nvpair_name(pair); + if ((ret = nvpair_value_nvlist(pair, &data)) != 0) { + nvlist_free(out); + return (EINVAL); + } + + if (ether_aton_r(name, e) == NULL) { + nvlist_free(out); + return (EINVAL); + } + + if (ether_ntoa_r(e, fname) == NULL) { + nvlist_free(out); + return (ENOMEM); + } + + if ((ret = nvlist_add_nvlist(out, fname, data)) != 0) { + nvlist_free(out); + return (EINVAL); + } + } + + vaf->vaf_nvl = out; + return (0); +} + +static int +varpd_files_start(void *arg) +{ + int fd, ret; + void *maddr; + struct stat st; + nvlist_t *nvl; + varpd_files_t *vaf = arg; + + if (vaf->vaf_path == NULL) + return (EAGAIN); + + if ((fd = open(vaf->vaf_path, O_RDONLY)) < 0) + return (errno); + + if (fstat(fd, &st) != 0) { + ret = errno; + if (close(fd) != 0) + abort(); + return (ret); + } + + maddr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, + fd, 0); + if (maddr == NULL) { + ret = errno; + if (close(fd) != 0) + abort(); + return (ret); + } + + ret = nvlist_parse_json(maddr, st.st_size, &nvl, + NVJSON_FORCE_INTEGER, NULL); + if (ret == 0) { + ret = varpd_files_normalize_nvlist(vaf, nvl); + nvlist_free(nvl); + } + if (munmap(maddr, st.st_size) != 0) + abort(); + if (close(fd) != 0) + abort(); + + return (ret); +} + +static void +varpd_files_stop(void *arg) +{ + varpd_files_t *vaf = arg; + + nvlist_free(vaf->vaf_nvl); + vaf->vaf_nvl = NULL; +} + +static void +varpd_files_destroy(void *arg) +{ + varpd_files_t *vaf = arg; + + assert(vaf->vaf_nvl == NULL); + if (vaf->vaf_path != NULL) { + umem_free(vaf->vaf_path, strlen(vaf->vaf_path) + 1); + vaf->vaf_path = NULL; + } + umem_free(vaf, sizeof (varpd_files_t)); +} + +static void +varpd_files_lookup(void *arg, varpd_query_handle_t *qh, + const overlay_targ_lookup_t *otl, overlay_target_point_t *otp) +{ + char macstr[ETHERADDRSTRL], *ipstr; + nvlist_t *nvl; + varpd_files_t *vaf = arg; + int32_t port; + static const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + + /* We 
don't support a default */ + if (otl == NULL) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + if (otl->otl_sap == ETHERTYPE_ARP) { + libvarpd_plugin_proxy_arp(vaf->vaf_hdl, qh, otl); + return; + } + + if (otl->otl_sap == ETHERTYPE_IPV6 && + otl->otl_dstaddr[0] == 0x33 && + otl->otl_dstaddr[1] == 0x33) { + libvarpd_plugin_proxy_ndp(vaf->vaf_hdl, qh, otl); + return; + } + + if (otl->otl_sap == ETHERTYPE_IP && + bcmp(otl->otl_dstaddr, bcast, ETHERADDRL) == 0) { + char *mac; + struct ether_addr a, *addr; + + addr = &a; + if (ether_ntoa_r((struct ether_addr *)otl->otl_srcaddr, + macstr) == NULL) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + if (nvlist_lookup_string(nvl, "dhcp-proxy", &mac) != 0) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + if (ether_aton_r(mac, addr) == NULL) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + libvarpd_plugin_proxy_dhcp(vaf->vaf_hdl, qh, otl); + return; + } + + if (ether_ntoa_r((struct ether_addr *)otl->otl_dstaddr, + macstr) == NULL) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + if (nvlist_lookup_int32(nvl, "port", &port) != 0) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + if (port <= 0 || port > UINT16_MAX) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + otp->otp_port = port; + + if (nvlist_lookup_string(nvl, "ip", &ipstr) != 0) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + + /* + * Try to parse it as a v6 address and then if it's not, try to + * transform it into a v4 address which we'll then wrap it into a v4 + * mapped address. 
+ */ + if (inet_pton(AF_INET6, ipstr, &otp->otp_ip) != 1) { + uint32_t v4; + if (inet_pton(AF_INET, ipstr, &v4) != 1) { + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_DROP); + return; + } + IN6_IPADDR_TO_V4MAPPED(v4, &otp->otp_ip); + } + + libvarpd_plugin_query_reply(qh, VARPD_LOOKUP_OK); +} + +/* ARGSUSED */ +static int +varpd_files_nprops(void *arg, uint_t *nprops) +{ + *nprops = 1; + return (0); +} + +/* ARGSUSED */ +static int +varpd_files_propinfo(void *arg, uint_t propid, varpd_prop_handle_t *vph) +{ + if (propid != 0) + return (EINVAL); + + libvarpd_prop_set_name(vph, varpd_files_props[0]); + libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_RRW); + libvarpd_prop_set_type(vph, OVERLAY_PROP_T_STRING); + libvarpd_prop_set_nodefault(vph); + return (0); +} + +static int +varpd_files_getprop(void *arg, const char *pname, void *buf, uint32_t *sizep) +{ + varpd_files_t *vaf = arg; + + if (strcmp(pname, varpd_files_props[0]) != 0) + return (EINVAL); + + if (vaf->vaf_path != NULL) { + size_t len = strlen(vaf->vaf_path) + 1; + if (*sizep < len) + return (EOVERFLOW); + *sizep = len; + (void) strlcpy(buf, vaf->vaf_path, *sizep); + + } else { + *sizep = 0; + } + + return (0); +} + +static int +varpd_files_setprop(void *arg, const char *pname, const void *buf, + const uint32_t size) +{ + varpd_files_t *vaf = arg; + + if (strcmp(pname, varpd_files_props[0]) != 0) + return (EINVAL); + + if (vaf->vaf_path != NULL) + umem_free(vaf->vaf_path, strlen(vaf->vaf_path) + 1); + + vaf->vaf_path = umem_alloc(size, UMEM_DEFAULT); + if (vaf->vaf_path == NULL) + return (ENOMEM); + (void) strlcpy(vaf->vaf_path, buf, size); + return (0); +} + +static int +varpd_files_save(void *arg, nvlist_t *nvp) +{ + int ret; + varpd_files_t *vaf = arg; + + if (vaf->vaf_path == NULL) + return (0); + + if ((ret = nvlist_add_string(nvp, varpd_files_props[0], + vaf->vaf_path)) != 0) + return (ret); + + if ((ret = nvlist_add_uint64(nvp, "files/vaf_nmisses", + vaf->vaf_nmisses)) != 0) + return (ret); + + if ((ret 
= nvlist_add_uint64(nvp, "files/vaf_narp",
+	    vaf->vaf_narp)) != 0)
+		return (ret);
+	return (0);
+}
+
+/*
+ * Recreate an instance from the list written by varpd_files_save() and hand
+ * it back through 'outp'.
+ *
+ * varpd_files_save() writes nothing at all for an instance whose path was
+ * never set, so the path and both counters are optional here: a missing path
+ * leaves vaf_path NULL and a missing counter restores as zero.
+ *
+ * Returns 0 on success, EINVAL for an unsupported destination type or an
+ * unreadable counter, the nvlist_lookup_string() error for an unreadable
+ * path, or ENOMEM when an allocation fails.
+ */
+static int
+varpd_files_restore(nvlist_t *nvp, varpd_provider_handle_t *hdl,
+    overlay_plugin_dest_t dest, void **outp)
+{
+	varpd_files_t *vaf;
+	char *str;
+	int ret;
+	uint64_t nmisses, narp;
+
+	if (varpd_files_valid_dest(dest) == B_FALSE)
+		return (EINVAL);
+
+	ret = nvlist_lookup_string(nvp, varpd_files_props[0], &str);
+	if (ret != 0 && ret != ENOENT)
+		return (ret);
+	else if (ret == ENOENT)
+		str = NULL;
+
+	ret = nvlist_lookup_uint64(nvp, "files/vaf_nmisses", &nmisses);
+	if (ret == ENOENT)
+		nmisses = 0;
+	else if (ret != 0)
+		return (EINVAL);
+
+	ret = nvlist_lookup_uint64(nvp, "files/vaf_narp", &narp);
+	if (ret == ENOENT)
+		narp = 0;
+	else if (ret != 0)
+		return (EINVAL);
+
+	vaf = umem_alloc(sizeof (varpd_files_t), UMEM_DEFAULT);
+	if (vaf == NULL)
+		return (ENOMEM);
+
+	bzero(vaf, sizeof (varpd_files_t));
+	vaf->vaf_dest = dest;
+	if (str != NULL) {
+		size_t len = strlen(str) + 1;
+		vaf->vaf_path = umem_alloc(len, UMEM_DEFAULT);
+		if (vaf->vaf_path == NULL) {
+			umem_free(vaf, sizeof (varpd_files_t));
+			return (ENOMEM);
+		}
+		(void) strlcpy(vaf->vaf_path, str, len);
+	}
+
+	/* Carry the saved counters over rather than discarding them. */
+	vaf->vaf_nmisses = nmisses;
+	vaf->vaf_narp = narp;
+	vaf->vaf_hdl = hdl;
+	*outp = vaf;
+	return (0);
+}
+
+static void
+varpd_files_proxy_arp(void *arg, varpd_arp_handle_t *vah, int kind,
+    const struct sockaddr *sock, uint8_t *out)
+{
+	varpd_files_t *vaf = arg;
+	const struct sockaddr_in *ip;
+	const struct sockaddr_in6 *ip6;
+	nvpair_t *pair;
+
+	if (kind != VARPD_QTYPE_ETHERNET) {
+		libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
+		return;
+	}
+
+	if (sock->sa_family != AF_INET && sock->sa_family != AF_INET6) {
+		libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP);
+		return;
+	}
+
+	ip = (const struct sockaddr_in *)sock;
+	ip6 = (const struct sockaddr_in6 *)sock;
+	for (pair = nvlist_next_nvpair(vaf->vaf_nvl, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(vaf->vaf_nvl, pair)) {
+		char *mac, *ipstr;
+		nvlist_t *data;
+		struct in_addr ia;
+		struct in6_addr ia6;
+		struct ether_addr ether, *e;
+		e = &ether;
+
+		if (nvpair_type(pair)
!= DATA_TYPE_NVLIST) + continue; + + mac = nvpair_name(pair); + if (nvpair_value_nvlist(pair, &data) != 0) + continue; + + + if (sock->sa_family == AF_INET) { + if (nvlist_lookup_string(data, "arp", &ipstr) != 0) + continue; + + if (inet_pton(AF_INET, ipstr, &ia) != 1) + continue; + + if (bcmp(&ia, &ip->sin_addr, + sizeof (struct in_addr)) != 0) + continue; + } else { + if (nvlist_lookup_string(data, "ndp", &ipstr) != 0) + continue; + + if (inet_pton(AF_INET6, ipstr, &ia6) != 1) + continue; + + if (bcmp(&ia6, &ip6->sin6_addr, + sizeof (struct in6_addr)) != 0) + continue; + } + + if (ether_aton_r(mac, e) == NULL) { + libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP); + return; + } + + bcopy(e, out, ETHERADDRL); + libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_OK); + return; + } + + libvarpd_plugin_arp_reply(vah, VARPD_LOOKUP_DROP); +} + +static void +varpd_files_proxy_dhcp(void *arg, varpd_dhcp_handle_t *vdh, int type, + const overlay_targ_lookup_t *otl, uint8_t *out) +{ + varpd_files_t *vaf = arg; + nvlist_t *nvl; + char macstr[ETHERADDRSTRL], *mac; + struct ether_addr a, *addr; + + addr = &a; + if (type != VARPD_QTYPE_ETHERNET) { + libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); + return; + } + + if (ether_ntoa_r((struct ether_addr *)otl->otl_srcaddr, + macstr) == NULL) { + libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); + return; + } + + if (nvlist_lookup_nvlist(vaf->vaf_nvl, macstr, &nvl) != 0) { + libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); + return; + } + + if (nvlist_lookup_string(nvl, "dhcp-proxy", &mac) != 0) { + libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); + return; + } + + if (ether_aton_r(mac, addr) == NULL) { + libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_DROP); + return; + } + + bcopy(addr, out, ETHERADDRL); + libvarpd_plugin_dhcp_reply(vdh, VARPD_LOOKUP_OK); +} + +static const varpd_plugin_ops_t varpd_files_ops = { + 0, + varpd_files_create, + varpd_files_start, + varpd_files_stop, + varpd_files_destroy, + NULL, + 
varpd_files_lookup, + varpd_files_nprops, + varpd_files_propinfo, + varpd_files_getprop, + varpd_files_setprop, + varpd_files_save, + varpd_files_restore, + varpd_files_proxy_arp, + varpd_files_proxy_dhcp +}; + +#pragma init(varpd_files_init) +static void +varpd_files_init(void) +{ + int err; + varpd_plugin_register_t *vpr; + + vpr = libvarpd_plugin_alloc(VARPD_CURRENT_VERSION, &err); + if (vpr == NULL) + return; + + vpr->vpr_mode = OVERLAY_TARGET_DYNAMIC; + vpr->vpr_name = "files"; + vpr->vpr_ops = &varpd_files_ops; + (void) libvarpd_plugin_register(vpr); + libvarpd_plugin_free(vpr); +} diff --git a/usr/src/lib/varpd/files/common/libvarpd_files_json.c b/usr/src/lib/varpd/files/common/libvarpd_files_json.c new file mode 100644 index 0000000000..53e63c6244 --- /dev/null +++ b/usr/src/lib/varpd/files/common/libvarpd_files_json.c @@ -0,0 +1,936 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <strings.h> +#include <errno.h> +#include <libnvpair.h> +#include <sys/ccompile.h> + +#include "libvarpd_files_json.h" + +typedef enum json_type { + JSON_TYPE_NOTHING = 0, + JSON_TYPE_STRING = 1, + JSON_TYPE_INTEGER, + JSON_TYPE_DOUBLE, + JSON_TYPE_BOOLEAN, + JSON_TYPE_NULL, + JSON_TYPE_OBJECT, + JSON_TYPE_ARRAY +} json_type_t; + +typedef enum parse_state { + PARSE_ERROR = -1, + PARSE_DONE = 0, + PARSE_REST, + PARSE_OBJECT, + PARSE_KEY_STRING, + PARSE_COLON, + PARSE_STRING, + PARSE_OBJECT_COMMA, + PARSE_ARRAY, + PARSE_BAREWORD, + PARSE_NUMBER, + PARSE_ARRAY_VALUE, + PARSE_ARRAY_COMMA +} parse_state_t; + +#define JSON_MARKER ".__json_" +#define JSON_MARKER_ARRAY JSON_MARKER "array" + +typedef struct parse_frame { + parse_state_t pf_ps; + nvlist_t *pf_nvl; + + char *pf_key; + void *pf_value; + json_type_t pf_value_type; + int pf_array_index; + + struct parse_frame *pf_next; +} parse_frame_t; + +typedef struct state { + const char *s_in; + unsigned long s_pos; + unsigned long s_len; + + parse_frame_t *s_top; + + nvlist_parse_json_flags_t s_flags; + + /* + * This string buffer is used for temporary storage by the + * "collect_*()" family of functions. + */ + custr_t *s_collect; + + int s_errno; + custr_t *s_errstr; +} state_t; + +typedef void (*parse_handler_t)(state_t *); + +static void +movestate(state_t *s, parse_state_t ps) +{ + if (s->s_flags & NVJSON_DEBUG) { + (void) fprintf(stderr, "nvjson: move state %d -> %d\n", + s->s_top->pf_ps, ps); + } + s->s_top->pf_ps = ps; +} + +static void +posterror(state_t *s, int erno, const char *error) +{ + /* + * If the caller wants error messages printed to stderr, do that + * first. + */ + if (s->s_flags & NVJSON_ERRORS_TO_STDERR) { + (void) fprintf(stderr, "nvjson error (pos %ld, errno %d): %s\n", + s->s_pos, erno, error); + } + + /* + * Try and store the error message for the caller. 
This may fail if + * the error was related to memory pressure, and that condition still + * exists. + */ + s->s_errno = erno; + if (s->s_errstr != NULL) { + (void) custr_append(s->s_errstr, error); + } + + movestate(s, PARSE_ERROR); +} + +static int +pushstate(state_t *s, parse_state_t ps, parse_state_t retps) +{ + parse_frame_t *n; + + if (s->s_flags & NVJSON_DEBUG) { + (void) fprintf(stderr, "nvjson: push state %d -> %d (ret %d)\n", + s->s_top->pf_ps, ps, retps); + } + + if ((n = calloc(1, sizeof (*n))) == NULL) { + posterror(s, errno, "pushstate calloc failure"); + return (-1); + } + + /* + * Store the state we'll return to when popping this + * frame: + */ + s->s_top->pf_ps = retps; + + /* + * Store the initial state for the new frame, and + * put it on top of the stack: + */ + n->pf_ps = ps; + n->pf_value_type = JSON_TYPE_NOTHING; + + n->pf_next = s->s_top; + s->s_top = n; + + return (0); +} + +static char +popchar(state_t *s) +{ + if (s->s_pos > s->s_len) { + return (0); + } + return (s->s_in[s->s_pos++]); +} + +static char +peekchar(state_t *s) +{ + if (s->s_pos > s->s_len) { + return (0); + } + return (s->s_in[s->s_pos]); +} + +static void +discard_whitespace(state_t *s) +{ + while (isspace(peekchar(s))) { + (void) popchar(s); + } +} + +static char *escape_pairs[] = { + "\"\"", "\\\\", "//", "b\b", "f\f", "n\n", "r\r", "t\t", NULL +}; + +static char +collect_string_escape(state_t *s) +{ + int i; + char c = popchar(s); + + if (c == '\0') { + posterror(s, EPROTO, "EOF mid-escape sequence"); + return (-1); + } + + /* + * Handle four-digit Unicode escapes up to and including \u007f. + * Strings that cannot be represented as 7-bit clean ASCII are not + * currently supported. + */ + if (c == 'u') { + int res; + int ndigs = 0; + char digs[5]; + + /* + * Deal with 4-digit unicode escape. 
+ */ + while (ndigs < 4) { + if ((digs[ndigs++] = popchar(s)) == '\0') { + posterror(s, EPROTO, "EOF mid-escape " + "sequence"); + return (-1); + } + } + digs[4] = '\0'; + if ((res = atoi(digs)) > 127) { + posterror(s, EPROTO, "unicode escape above 0x7f"); + return (-1); + } + + if (custr_appendc(s->s_collect, res) != 0) { + posterror(s, errno, "custr_appendc failure"); + return (-1); + } + return (0); + } + + /* + * See if this is a C-style escape character we recognise. + */ + for (i = 0; escape_pairs[i] != NULL; i++) { + char *ep = escape_pairs[i]; + if (ep[0] == c) { + if (custr_appendc(s->s_collect, ep[1]) != 0) { + posterror(s, errno, "custr_appendc failure"); + return (-1); + } + return (0); + } + } + + posterror(s, EPROTO, "unrecognised escape sequence"); + return (-1); +} + +static int +collect_string(state_t *s) +{ + custr_reset(s->s_collect); + + for (;;) { + char c; + + switch (c = popchar(s)) { + case '"': + /* + * Legal End of String. + */ + return (0); + + case '\0': + posterror(s, EPROTO, "EOF mid-string"); + return (-1); + + case '\\': + /* + * Escape Characters and Sequences. 
+ */ + if (collect_string_escape(s) != 0) { + return (-1); + } + break; + + default: + if (custr_appendc(s->s_collect, c) != 0) { + posterror(s, errno, "custr_appendc failure"); + return (-1); + } + break; + } + } +} + +static int +collect_bareword(state_t *s) +{ + custr_reset(s->s_collect); + + for (;;) { + if (!islower(peekchar(s))) { + return (0); + } + + if (custr_appendc(s->s_collect, popchar(s)) != 0) { + posterror(s, errno, "custr_appendc failure"); + return (-1); + } + } +} + +static void +hdlr_bareword(state_t *s) +{ + const char *str; + + if (collect_bareword(s) != 0) { + return; + } + + str = custr_cstr(s->s_collect); + if (strcmp(str, "true") == 0) { + s->s_top->pf_value_type = JSON_TYPE_BOOLEAN; + s->s_top->pf_value = (void *)B_TRUE; + } else if (strcmp(str, "false") == 0) { + s->s_top->pf_value_type = JSON_TYPE_BOOLEAN; + s->s_top->pf_value = (void *)B_FALSE; + } else if (strcmp(str, "null") == 0) { + s->s_top->pf_value_type = JSON_TYPE_NULL; + } else { + posterror(s, EPROTO, "expected 'true', 'false' or 'null'"); + return; + } + + movestate(s, PARSE_DONE); +} + +/* ARGSUSED */ +static int +collect_number(state_t *s, boolean_t *isint, int32_t *result, + double *fresult __unused) +{ + boolean_t neg = B_FALSE; + int t; + + custr_reset(s->s_collect); + + if (peekchar(s) == '-') { + neg = B_TRUE; + (void) popchar(s); + } + /* + * Read the 'int' portion: + */ + if (!isdigit(peekchar(s))) { + posterror(s, EPROTO, "malformed number: expected digit (0-9)"); + return (-1); + } + for (;;) { + if (!isdigit(peekchar(s))) { + break; + } + if (custr_appendc(s->s_collect, popchar(s)) != 0) { + posterror(s, errno, "custr_append failure"); + return (-1); + } + } + if (peekchar(s) == '.' || peekchar(s) == 'e' || peekchar(s) == 'E') { + posterror(s, ENOTSUP, "do not yet support FRACs or EXPs"); + return (-1); + } + + t = atoi(custr_cstr(s->s_collect)); + + *isint = B_TRUE; + *result = (neg == B_TRUE) ? 
(-t) : t; + return (0); +} + +static void +hdlr_number(state_t *s) +{ + boolean_t isint; + int32_t result; + double fresult; + + if (collect_number(s, &isint, &result, &fresult) != 0) { + return; + } + + if (isint == B_TRUE) { + s->s_top->pf_value = (void *)(uintptr_t)result; + s->s_top->pf_value_type = JSON_TYPE_INTEGER; + } else { + s->s_top->pf_value = malloc(sizeof (fresult)); + bcopy(&fresult, s->s_top->pf_value, sizeof (fresult)); + s->s_top->pf_value_type = JSON_TYPE_DOUBLE; + } + + movestate(s, PARSE_DONE); +} + +static void +hdlr_rest(state_t *s) +{ + char c; + discard_whitespace(s); + c = popchar(s); + switch (c) { + case '{': + movestate(s, PARSE_OBJECT); + return; + + case '[': + movestate(s, PARSE_ARRAY); + return; + + default: + posterror(s, EPROTO, "EOF before object or array"); + return; + } +} + +static int +add_empty_child(state_t *s) +{ + /* + * Here, we create an empty nvlist to represent this object + * or array: + */ + nvlist_t *empty; + if (nvlist_alloc(&empty, NV_UNIQUE_NAME, 0) != 0) { + posterror(s, errno, "nvlist_alloc failure"); + return (-1); + } + if (s->s_top->pf_next != NULL) { + /* + * If we're a child of the frame above, we store ourselves in + * that frame's nvlist: + */ + nvlist_t *nvl = s->s_top->pf_next->pf_nvl; + char *key = s->s_top->pf_next->pf_key; + + if (nvlist_add_nvlist(nvl, key, empty) != 0) { + posterror(s, errno, "nvlist_add_nvlist failure"); + nvlist_free(empty); + return (-1); + } + nvlist_free(empty); + if (nvlist_lookup_nvlist(nvl, key, &empty) != 0) { + posterror(s, errno, "nvlist_lookup_nvlist failure"); + return (-1); + } + } + s->s_top->pf_nvl = empty; + return (0); +} + +static int +decorate_array(state_t *s) +{ + int idx = s->s_top->pf_array_index; + /* + * When we are done creating an array, we store a 'length' + * property on it, as well as an internal-use marker value. 
+ */ + if (nvlist_add_boolean(s->s_top->pf_nvl, JSON_MARKER_ARRAY) != 0 || + nvlist_add_uint32(s->s_top->pf_nvl, "length", idx) != 0) { + posterror(s, errno, "nvlist_add failure"); + return (-1); + } + + return (0); +} + +static void +hdlr_array(state_t *s) +{ + s->s_top->pf_value_type = JSON_TYPE_ARRAY; + + if (add_empty_child(s) != 0) { + return; + } + + discard_whitespace(s); + + switch (peekchar(s)) { + case ']': + (void) popchar(s); + + if (decorate_array(s) != 0) { + return; + } + + movestate(s, PARSE_DONE); + return; + + default: + movestate(s, PARSE_ARRAY_VALUE); + return; + } +} + +static void +hdlr_array_comma(state_t *s) +{ + discard_whitespace(s); + + switch (popchar(s)) { + case ']': + if (decorate_array(s) != 0) { + return; + } + + movestate(s, PARSE_DONE); + return; + case ',': + movestate(s, PARSE_ARRAY_VALUE); + return; + default: + posterror(s, EPROTO, "expected ',' or ']'"); + return; + } +} + +static void +hdlr_array_value(state_t *s) +{ + char c; + + /* + * Generate keyname from the next array index: + */ + if (s->s_top->pf_key != NULL) { + (void) fprintf(stderr, "pf_key not null! 
was %s\n", + s->s_top->pf_key); + abort(); + } + + if (asprintf(&s->s_top->pf_key, "%d", s->s_top->pf_array_index++) < 0) { + posterror(s, errno, "asprintf failure"); + return; + } + + discard_whitespace(s); + + /* + * Select which type handler we need for the next value: + */ + switch (c = peekchar(s)) { + case '"': + (void) popchar(s); + (void) pushstate(s, PARSE_STRING, PARSE_ARRAY_COMMA); + return; + + case '{': + (void) popchar(s); + (void) pushstate(s, PARSE_OBJECT, PARSE_ARRAY_COMMA); + return; + + case '[': + (void) popchar(s); + (void) pushstate(s, PARSE_ARRAY, PARSE_ARRAY_COMMA); + return; + + default: + if (islower(c)) { + (void) pushstate(s, PARSE_BAREWORD, + PARSE_ARRAY_COMMA); + return; + } else if (c == '-' || isdigit(c)) { + (void) pushstate(s, PARSE_NUMBER, PARSE_ARRAY_COMMA); + return; + } else { + posterror(s, EPROTO, "unexpected character at start " + "of value"); + return; + } + } +} + +static void +hdlr_object(state_t *s) +{ + s->s_top->pf_value_type = JSON_TYPE_OBJECT; + + if (add_empty_child(s) != 0) { + return; + } + + discard_whitespace(s); + + switch (popchar(s)) { + case '}': + movestate(s, PARSE_DONE); + return; + + case '"': + movestate(s, PARSE_KEY_STRING); + return; + + default: + posterror(s, EPROTO, "expected key or '}'"); + return; + } +} + +static void +hdlr_key_string(state_t *s) +{ + if (collect_string(s) != 0) { + return; + } + + /* + * Record the key name of the next value. 
+ */ + if ((s->s_top->pf_key = strdup(custr_cstr(s->s_collect))) == NULL) { + posterror(s, errno, "strdup failure"); + return; + } + + movestate(s, PARSE_COLON); +} + +static void +hdlr_colon(state_t *s) +{ + char c; + discard_whitespace(s); + + if ((c = popchar(s)) != ':') { + posterror(s, EPROTO, "expected ':'"); + return; + } + + discard_whitespace(s); + + /* + * Select which type handler we need for the value after the colon: + */ + switch (c = peekchar(s)) { + case '"': + (void) popchar(s); + (void) pushstate(s, PARSE_STRING, PARSE_OBJECT_COMMA); + return; + + case '{': + (void) popchar(s); + (void) pushstate(s, PARSE_OBJECT, PARSE_OBJECT_COMMA); + return; + + case '[': + (void) popchar(s); + (void) pushstate(s, PARSE_ARRAY, PARSE_OBJECT_COMMA); + return; + + default: + if (islower(c)) { + (void) pushstate(s, PARSE_BAREWORD, PARSE_OBJECT_COMMA); + return; + } else if (c == '-' || isdigit(c)) { + (void) pushstate(s, PARSE_NUMBER, PARSE_OBJECT_COMMA); + return; + } else { + (void) posterror(s, EPROTO, "unexpected character at " + "start of value"); + return; + } + } +} + +static void +hdlr_object_comma(state_t *s) +{ + discard_whitespace(s); + + switch (popchar(s)) { + case '}': + movestate(s, PARSE_DONE); + return; + + case ',': + discard_whitespace(s); + if (popchar(s) != '"') { + posterror(s, EPROTO, "expected '\"'"); + return; + } + movestate(s, PARSE_KEY_STRING); + return; + + default: + posterror(s, EPROTO, "expected ',' or '}'"); + return; + } +} + +static void +hdlr_string(state_t *s) +{ + if (collect_string(s) != 0) { + return; + } + + s->s_top->pf_value_type = JSON_TYPE_STRING; + if ((s->s_top->pf_value = strdup(custr_cstr(s->s_collect))) == NULL) { + posterror(s, errno, "strdup failure"); + return; + } + + movestate(s, PARSE_DONE); +} + +static int +store_value(state_t *s) +{ + nvlist_t *targ = s->s_top->pf_next->pf_nvl; + char *key = s->s_top->pf_next->pf_key; + json_type_t type = s->s_top->pf_value_type; + int ret = 0; + + switch (type) { + case 
JSON_TYPE_STRING: + if (nvlist_add_string(targ, key, s->s_top->pf_value) != 0) { + posterror(s, errno, "nvlist_add_string failure"); + ret = -1; + } + free(s->s_top->pf_value); + break; + + case JSON_TYPE_BOOLEAN: + if (nvlist_add_boolean_value(targ, key, + (boolean_t)s->s_top->pf_value) != 0) { + posterror(s, errno, "nvlist_add_boolean_value " + "failure"); + ret = -1; + } + break; + + case JSON_TYPE_NULL: + if (nvlist_add_boolean(targ, key) != 0) { + posterror(s, errno, "nvlist_add_boolean failure"); + ret = -1; + } + break; + + case JSON_TYPE_INTEGER: + if (nvlist_add_int32(targ, key, + (int32_t)(uintptr_t)s->s_top->pf_value) != 0) { + posterror(s, errno, "nvlist_add_int32 failure"); + ret = -1; + } + break; + + case JSON_TYPE_ARRAY: + case JSON_TYPE_OBJECT: + /* + * Objects and arrays are already 'stored' in their target + * nvlist on creation. See: hdlr_object, hdlr_array. + */ + break; + + default: + (void) fprintf(stderr, "ERROR: could not store unknown " + "type %d\n", type); + abort(); + } + + s->s_top->pf_value = NULL; + free(s->s_top->pf_next->pf_key); + s->s_top->pf_next->pf_key = NULL; + return (ret); +} + +static parse_frame_t * +parse_frame_free(parse_frame_t *pf, boolean_t free_nvl) +{ + parse_frame_t *next = pf->pf_next; + if (pf->pf_key != NULL) { + free(pf->pf_key); + } + if (pf->pf_value != NULL) { + abort(); + } + if (free_nvl && pf->pf_nvl != NULL) { + nvlist_free(pf->pf_nvl); + } + free(pf); + return (next); +} + +static parse_handler_t hdlrs[] = { + NULL, /* PARSE_DONE */ + hdlr_rest, /* PARSE_REST */ + hdlr_object, /* PARSE_OBJECT */ + hdlr_key_string, /* PARSE_KEY_STRING */ + hdlr_colon, /* PARSE_COLON */ + hdlr_string, /* PARSE_STRING */ + hdlr_object_comma, /* PARSE_OBJECT_COMMA */ + hdlr_array, /* PARSE_ARRAY */ + hdlr_bareword, /* PARSE_BAREWORD */ + hdlr_number, /* PARSE_NUMBER */ + hdlr_array_value, /* PARSE_ARRAY_VALUE */ + hdlr_array_comma /* PARSE_ARRAY_COMMA */ +}; +#define NUM_PARSE_HANDLERS (int)(sizeof (hdlrs) / sizeof 
(hdlrs[0])) + +int +nvlist_parse_json(const char *buf, size_t buflen, nvlist_t **nvlp, + nvlist_parse_json_flags_t flag, nvlist_parse_json_error_t *errout) +{ + state_t s; + + /* + * Check for valid flags: + */ + if ((flag & NVJSON_FORCE_INTEGER) && (flag & NVJSON_FORCE_DOUBLE)) { + errno = EINVAL; + return (-1); + } + if ((flag & ~NVJSON_ALL) != 0) { + errno = EINVAL; + return (-1); + } + + /* + * Initialise parsing state structure: + */ + bzero(&s, sizeof (s)); + s.s_in = buf; + s.s_pos = 0; + s.s_len = buflen; + s.s_flags = flag; + + /* + * Allocate the collect buffer string. + */ + if (custr_alloc(&s.s_collect) != 0) { + s.s_errno = errno; + if (errout != NULL) { + (void) snprintf(errout->nje_message, + sizeof (errout->nje_message), + "custr alloc failure: %s", + strerror(errno)); + } + goto out; + } + + /* + * If the caller has requested error information, allocate the error + * string now. + */ + if (errout != NULL) { + if (custr_alloc_buf(&s.s_errstr, errout->nje_message, + sizeof (errout->nje_message)) != 0) { + s.s_errno = errno; + (void) snprintf(errout->nje_message, + sizeof (errout->nje_message), + "custr alloc failure: %s", + strerror(errno)); + goto out; + } + custr_reset(s.s_errstr); + } + + /* + * Allocate top-most stack frame: + */ + if ((s.s_top = calloc(1, sizeof (*s.s_top))) == NULL) { + s.s_errno = errno; + goto out; + } + + s.s_top->pf_ps = PARSE_REST; + for (;;) { + if (s.s_top->pf_ps < 0) { + /* + * The parser reported an error. + */ + goto out; + } + + if (s.s_top->pf_ps == PARSE_DONE) { + if (s.s_top->pf_next == NULL) { + /* + * Last frame, so we're really + * done. + */ + *nvlp = s.s_top->pf_nvl; + goto out; + } else { + /* + * Otherwise, pop a frame and continue in + * previous state. 
Copy out the value we + * created in the old frame: + */ + if (store_value(&s) != 0) { + goto out; + } + + /* + * Free old frame: + */ + s.s_top = parse_frame_free(s.s_top, B_FALSE); + } + } + + /* + * Dispatch to parser handler routine for this state: + */ + if (s.s_top->pf_ps >= NUM_PARSE_HANDLERS || + hdlrs[s.s_top->pf_ps] == NULL) { + (void) fprintf(stderr, "no handler for state %d\n", + s.s_top->pf_ps); + abort(); + } + hdlrs[s.s_top->pf_ps](&s); + } + +out: + if (errout != NULL) { + /* + * Copy out error number and parse position. The custr_t for + * the error message was backed by the buffer in the error + * object, so no copying is required. + */ + errout->nje_errno = s.s_errno; + errout->nje_pos = s.s_pos; + } + + /* + * Free resources: + */ + while (s.s_top != NULL) { + s.s_top = parse_frame_free(s.s_top, s.s_errno == 0 ? B_FALSE : + B_TRUE); + } + custr_free(s.s_collect); + custr_free(s.s_errstr); + + errno = s.s_errno; + return (s.s_errno == 0 ? 0 : -1); +} diff --git a/usr/src/lib/varpd/files/common/libvarpd_files_json.h b/usr/src/lib/varpd/files/common/libvarpd_files_json.h new file mode 100644 index 0000000000..9fe765741b --- /dev/null +++ b/usr/src/lib/varpd/files/common/libvarpd_files_json.h @@ -0,0 +1,52 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. 
#ifndef _LIBVARPD_FILES_JSON_H
#define	_LIBVARPD_FILES_JSON_H

/*
 * Interface to the JSON-to-nvlist parser, nvlist_parse_json().
 */

#include <libnvpair.h>
#include <libcustr.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Flags accepted by nvlist_parse_json().  Each flag is a distinct bit so
 * that flags can be OR'd together.
 */
typedef enum nvlist_parse_json_flags {
	NVJSON_FORCE_INTEGER = 0x01,
	NVJSON_FORCE_DOUBLE = 0x02,
	NVJSON_ERRORS_TO_STDERR = 0x04,
	NVJSON_DEBUG = 0x08
} nvlist_parse_json_flags_t;

/*
 * Optional error report filled in by nvlist_parse_json().
 */
typedef struct nvlist_parse_json_error {
	int nje_errno;			/* error number */
	long nje_pos;			/* parse position in the input */
	char nje_message[512];		/* error message text */
} nvlist_parse_json_error_t;

/*
 * Mask of every defined flag.
 */
#define	NVJSON_ALL \
	(NVJSON_FORCE_INTEGER | \
	NVJSON_FORCE_DOUBLE | \
	NVJSON_ERRORS_TO_STDERR | \
	NVJSON_DEBUG)

extern int nvlist_parse_json(const char *, size_t, nvlist_t **,
    nvlist_parse_json_flags_t, nvlist_parse_json_error_t *);

#ifdef __cplusplus
}
#endif

#endif /* _LIBVARPD_FILES_JSON_H */
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION SUNWprivate { + local: + *; +}; diff --git a/usr/src/lib/varpd/files/i386/Makefile b/usr/src/lib/varpd/files/i386/Makefile new file mode 100644 index 0000000000..4398507523 --- /dev/null +++ b/usr/src/lib/varpd/files/i386/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +include ../Makefile.com + +install: all $(ROOTLIBS) $(ROOTLINKS) diff --git a/usr/src/lib/varpd/libvarpd/Makefile b/usr/src/lib/varpd/libvarpd/Makefile new file mode 100644 index 0000000000..034ba30c1d --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/Makefile @@ -0,0 +1,54 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. 
#
# Top-level Makefile for libvarpd: installs the public headers and
# recurses into the per-architecture build directories.
#

include ../../Makefile.lib

# Headers listed here live in $(HDRDIR).
HDRS =		libvarpd.h libvarpd_client.h libvarpd_provider.h
HDRDIR =	common
SUBDIRS =	$(MACH)
# $(MACH64) is appended through the $(BUILD64) prefix macro.
$(BUILD64)SUBDIRS += $(MACH64)

# Library and type names consumed by the 'check' target via $(TYPECHECK).
TYPECHECK_LIB = libvarpd.so.1
TYPELIST = \
	varpd_client_instance_arg_t \
	varpd_client_nprops_arg_t \
	varpd_client_propinfo_arg_t \
	varpd_client_eresp_t \
	varpd_persist_header_t \
	overlay_targ_cache_entry_t \
	overlay_targ_cache_t \
	overlay_targ_cache_iter_t

# Record the requested goal in TARGET so the $(SUBDIRS) rule below can
# pass the same goal to the recursive $(MAKE).
all :=		TARGET = all
clean :=	TARGET = clean
clobber :=	TARGET = clobber
install :=	TARGET = install

.KEEP_STATE:

all clean clobber: $(SUBDIRS)

install: $(SUBDIRS) $(VARPD_MAPFILES) install_h

install_h: $(ROOTHDRS)

check: $(CHECKHDRS) $(TYPECHECK)

# Descend into each architecture directory; FRC makes these targets depend
# on a prerequisite with no recipe and no prerequisites of its own.
$(SUBDIRS): FRC
	@cd $@; pwd; $(MAKE) $(TARGET)

FRC:

include ../../Makefile.targ
#
# usr/src/lib/varpd/libvarpd/Makefile.com
#
# Build definitions for libvarpd that are included by the
# per-architecture Makefiles.
#

LIBRARY =	libvarpd.a
VERS =		.1
OBJECTS =	libvarpd.o \
		libvarpd_arp.o \
		libvarpd_client.o \
		libvarpd_door.o \
		libvarpd_overlay.o \
		libvarpd_panic.o \
		libvarpd_persist.o \
		libvarpd_prop.o \
		libvarpd_plugin.o \
		libvarpd_util.o

include ../../../Makefile.lib

# install this library in the root filesystem
include ../../../Makefile.rootfs

# Only the shared object is built.
LIBS =		$(DYNLIB)
LDLIBS +=	-lc -lavl -lumem -lidspace -lnvpair -lmd5 -lrename
CPPFLAGS +=	-I../common

CERRWARN +=	-erroff=E_STRUCT_DERIVED_FROM_FLEX_MBR

CSTD=	$(CSTD_GNU99)

# Sources are shared between architectures.
SRCDIR =	../common

.KEEP_STATE:

all: $(LIBS)

include ../../../Makefile.targ

#
# usr/src/lib/varpd/libvarpd/amd64/Makefile
#
# 64-bit build: the common definitions plus the 64-bit library rules.
#

include ../Makefile.com
include ../../../Makefile.lib.64

install: all $(ROOTLIBS64) $(ROOTLINKS64)
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * varpd library + */ + +#include <stdlib.h> +#include <errno.h> +#include <umem.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/avl.h> +#include <stddef.h> +#include <stdio.h> +#include <strings.h> + +#include <libvarpd_impl.h> + +static int +libvarpd_instance_comparator(const void *lp, const void *rp) +{ + const varpd_instance_t *lpp, *rpp; + lpp = lp; + rpp = rp; + + if (lpp->vri_id > rpp->vri_id) + return (1); + if (lpp->vri_id < rpp->vri_id) + return (-1); + return (0); +} + +static int +libvarpd_instance_lcomparator(const void *lp, const void *rp) +{ + const varpd_instance_t *lpp, *rpp; + lpp = lp; + rpp = rp; + + if (lpp->vri_linkid > rpp->vri_linkid) + return (1); + if (lpp->vri_linkid < rpp->vri_linkid) + return (-1); + return (0); +} + +int +libvarpd_create(varpd_handle_t **vphp) +{ + int ret; + varpd_impl_t *vip; + char buf[32]; + + if (vphp == NULL) + return (EINVAL); + + *vphp = NULL; + vip = umem_alloc(sizeof (varpd_impl_t), UMEM_DEFAULT); + if (vip == NULL) + return (errno); + + bzero(vip, sizeof (varpd_impl_t)); + (void) snprintf(buf, sizeof (buf), "varpd_%p", vip); + vip->vdi_idspace = id_space_create(buf, LIBVARPD_ID_MIN, + LIBVARPD_ID_MAX); + if (vip->vdi_idspace == NULL) { + int ret = errno; + umem_free(vip, sizeof (varpd_impl_t)); + return (ret); + } + + vip->vdi_qcache = umem_cache_create("query", sizeof (varpd_query_t), 0, + NULL, NULL, NULL, NULL, NULL, 0); + if (vip->vdi_qcache == NULL) { + int ret = errno; + id_space_destroy(vip->vdi_idspace); + umem_free(vip, sizeof (varpd_impl_t)); + return (ret); + } + + if ((ret = libvarpd_overlay_init(vip)) != 0) { + umem_cache_destroy(vip->vdi_qcache); + id_space_destroy(vip->vdi_idspace); + umem_free(vip, sizeof (varpd_impl_t)); + return (ret); + } + + libvarpd_persist_init(vip); + + avl_create(&vip->vdi_plugins, 
libvarpd_plugin_comparator, + sizeof (varpd_plugin_t), offsetof(varpd_plugin_t, vpp_node)); + + avl_create(&vip->vdi_instances, libvarpd_instance_comparator, + sizeof (varpd_instance_t), offsetof(varpd_instance_t, vri_inode)); + avl_create(&vip->vdi_linstances, libvarpd_instance_lcomparator, + sizeof (varpd_instance_t), offsetof(varpd_instance_t, vri_lnode)); + + if (mutex_init(&vip->vdi_lock, USYNC_THREAD | LOCK_ERRORCHECK, + NULL) != 0) + libvarpd_panic("failed to create mutex: %d", errno); + + vip->vdi_doorfd = -1; + *vphp = (varpd_handle_t *)vip; + return (0); +} + +void +libvarpd_destroy(varpd_handle_t *vhp) +{ + varpd_impl_t *vip = (varpd_impl_t *)vhp; + + libvarpd_overlay_lookup_quiesce(vhp); + if (mutex_destroy(&vip->vdi_lock) != 0) + libvarpd_panic("failed to destroy mutex: %d", errno); + libvarpd_persist_fini(vip); + libvarpd_overlay_fini(vip); + umem_cache_destroy(vip->vdi_qcache); + id_space_destroy(vip->vdi_idspace); + umem_free(vip, sizeof (varpd_impl_t)); +} + +int +libvarpd_instance_create(varpd_handle_t *vhp, datalink_id_t linkid, + const char *pname, varpd_instance_handle_t **outp) +{ + int ret; + varpd_impl_t *vip = (varpd_impl_t *)vhp; + varpd_plugin_t *plugin; + varpd_instance_t *inst, lookup; + overlay_plugin_dest_t dest; + uint64_t vid; + + /* + * We should really have our own errnos. 
+ */ + plugin = libvarpd_plugin_lookup(vip, pname); + if (plugin == NULL) + return (ENOENT); + + if ((ret = libvarpd_overlay_info(vip, linkid, &dest, NULL, &vid)) != 0) + return (ret); + + inst = umem_alloc(sizeof (varpd_instance_t), UMEM_DEFAULT); + if (inst == NULL) + return (ENOMEM); + + inst->vri_id = id_alloc(vip->vdi_idspace); + if (inst->vri_id == -1) + libvarpd_panic("failed to allocate id from vdi_idspace: %d", + errno); + inst->vri_linkid = linkid; + inst->vri_vnetid = vid; + inst->vri_mode = plugin->vpp_mode; + inst->vri_dest = dest; + inst->vri_plugin = plugin; + inst->vri_impl = vip; + inst->vri_flags = 0; + if ((ret = plugin->vpp_ops->vpo_create((varpd_provider_handle_t *)inst, + &inst->vri_private, dest)) != 0) { + id_free(vip->vdi_idspace, inst->vri_id); + umem_free(inst, sizeof (varpd_instance_t)); + return (ret); + } + + if (mutex_init(&inst->vri_lock, USYNC_THREAD | LOCK_ERRORCHECK, + NULL) != 0) + libvarpd_panic("failed to create mutex: %d", errno); + + mutex_enter(&vip->vdi_lock); + lookup.vri_id = inst->vri_id; + if (avl_find(&vip->vdi_instances, &lookup, NULL) != NULL) + libvarpd_panic("found duplicate instance with id %d", + lookup.vri_id); + avl_add(&vip->vdi_instances, inst); + lookup.vri_linkid = inst->vri_linkid; + if (avl_find(&vip->vdi_linstances, &lookup, NULL) != NULL) + libvarpd_panic("found duplicate linstance with id %d", + lookup.vri_linkid); + avl_add(&vip->vdi_linstances, inst); + mutex_exit(&vip->vdi_lock); + *outp = (varpd_instance_handle_t *)inst; + return (0); +} + +uint64_t +libvarpd_instance_id(varpd_instance_handle_t *ihp) +{ + varpd_instance_t *inst = (varpd_instance_t *)ihp; + return (inst->vri_id); +} + +uint64_t +libvarpd_plugin_vnetid(varpd_provider_handle_t *vhp) +{ + varpd_instance_t *inst = (varpd_instance_t *)vhp; + return (inst->vri_vnetid); +} + +varpd_instance_handle_t * +libvarpd_instance_lookup(varpd_handle_t *vhp, uint64_t id) +{ + varpd_impl_t *vip = (varpd_impl_t *)vhp; + varpd_instance_t lookup, *retp; 
+ + lookup.vri_id = id; + mutex_enter(&vip->vdi_lock); + retp = avl_find(&vip->vdi_instances, &lookup, NULL); + mutex_exit(&vip->vdi_lock); + return ((varpd_instance_handle_t *)retp); +} + +/* + * If this function becomes external to varpd, we need to change it to return a + * varpd_instance_handle_t. + */ +varpd_instance_t * +libvarpd_instance_lookup_by_dlid(varpd_impl_t *vip, datalink_id_t linkid) +{ + varpd_instance_t lookup, *retp; + + lookup.vri_linkid = linkid; + mutex_enter(&vip->vdi_lock); + retp = avl_find(&vip->vdi_linstances, &lookup, NULL); + mutex_exit(&vip->vdi_lock); + return (retp); +} + +/* + * When an instance is being destroyed, that means we should deactivate it, as + * well as clean it up. That means here, the proper order is calling the plug-in + * stop and then the destroy function. + */ +void +libvarpd_instance_destroy(varpd_instance_handle_t *ihp) +{ + varpd_instance_t *inst = (varpd_instance_t *)ihp; + varpd_impl_t *vip = inst->vri_impl; + + /* + * First things first, remove it from global visibility. + */ + mutex_enter(&vip->vdi_lock); + avl_remove(&vip->vdi_instances, inst); + avl_remove(&vip->vdi_linstances, inst); + mutex_exit(&vip->vdi_lock); + + mutex_enter(&inst->vri_lock); + + /* + * We need to clean up this instance, that means remove it from + * persistence and stopping it. Then finally we'll have to clean it up + * entirely. 
+ */ + if (inst->vri_flags & VARPD_INSTANCE_F_ACTIVATED) { + inst->vri_flags &= ~VARPD_INSTANCE_F_ACTIVATED; + libvarpd_torch_instance(vip, inst); + inst->vri_plugin->vpp_ops->vpo_stop(inst->vri_private); + inst->vri_plugin->vpp_ops->vpo_destroy(inst->vri_private); + inst->vri_private = NULL; + } + mutex_exit(&inst->vri_lock); + + /* Do the full clean up of the instance */ + if (mutex_destroy(&inst->vri_lock) != 0) + libvarpd_panic("failed to destroy instance vri_lock"); + id_free(vip->vdi_idspace, inst->vri_id); + umem_free(inst, sizeof (varpd_instance_t)); +} + +int +libvarpd_instance_activate(varpd_instance_handle_t *ihp) +{ + int ret; + varpd_instance_t *inst = (varpd_instance_t *)ihp; + + mutex_enter(&inst->vri_lock); + + if (inst->vri_flags & VARPD_INSTANCE_F_ACTIVATED) { + ret = EEXIST; + goto out; + } + + if ((ret = inst->vri_plugin->vpp_ops->vpo_start(inst->vri_private)) != + 0) + goto out; + + if ((ret = libvarpd_persist_instance(inst->vri_impl, inst)) != 0) + goto out; + + /* + * If this fails, we don't need to call stop, as the caller should end + * up calling destroy on the instance, which takes care of calling stop + * and destroy. 
+ */ + if ((ret = libvarpd_overlay_associate(inst)) != 0) + goto out; + + inst->vri_flags |= VARPD_INSTANCE_F_ACTIVATED; + +out: + mutex_exit(&inst->vri_lock); + return (ret); +} + +static void +libvarpd_prefork(void) +{ + libvarpd_plugin_prefork(); +} + +static void +libvarpd_postfork(void) +{ + libvarpd_plugin_postfork(); +} + +#pragma init(libvarpd_init) +static void +libvarpd_init(void) +{ + libvarpd_plugin_init(); + if (pthread_atfork(libvarpd_prefork, libvarpd_postfork, + libvarpd_postfork) != 0) + libvarpd_panic("failed to create varpd atfork: %d", errno); +} + +#pragma fini(libvarpd_fini) +static void +libvarpd_fini(void) +{ + libvarpd_plugin_fini(); +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd.h b/usr/src/lib/varpd/libvarpd/common/libvarpd.h new file mode 100644 index 0000000000..106d4272d9 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd.h @@ -0,0 +1,77 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
#ifndef _LIBVARPD_H
#define	_LIBVARPD_H

/*
 * varpd interfaces
 */

#include <sys/types.h>
#include <stdint.h>
#include <sys/mac.h>
#include <libvarpd_client.h>
#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Opaque handle types.  Only forward declarations are exposed here; the
 * structures behind them are not defined in this header.
 */
typedef struct __varpd_handle varpd_handle_t;
typedef struct __varpd_prop_handle varpd_prop_handle_t;
typedef struct __varpd_instance_handle varpd_instance_handle_t;

/* Library handle lifetime. */
extern int libvarpd_create(varpd_handle_t **);
extern void libvarpd_destroy(varpd_handle_t *);

/* Persistence control. */
extern int libvarpd_persist_enable(varpd_handle_t *, const char *);
extern int libvarpd_persist_restore(varpd_handle_t *);
extern int libvarpd_persist_disable(varpd_handle_t *);

/* Instance management. */
extern int libvarpd_instance_create(varpd_handle_t *, datalink_id_t,
    const char *, varpd_instance_handle_t **);
extern uint64_t libvarpd_instance_id(varpd_instance_handle_t *);
extern varpd_instance_handle_t *libvarpd_instance_lookup(varpd_handle_t *,
    uint64_t);
extern void libvarpd_instance_destroy(varpd_instance_handle_t *);
extern int libvarpd_instance_activate(varpd_instance_handle_t *);

/* Plugin loading and enumeration. */
extern int libvarpd_plugin_load(varpd_handle_t *, const char *);
typedef int (*libvarpd_plugin_walk_f)(varpd_handle_t *, const char *, void *);
extern int libvarpd_plugin_walk(varpd_handle_t *, libvarpd_plugin_walk_f,
    void *);

/* Property handles and property access. */
extern int libvarpd_prop_handle_alloc(varpd_handle_t *,
    varpd_instance_handle_t *, varpd_prop_handle_t **);
extern void libvarpd_prop_handle_free(varpd_prop_handle_t *);
extern int libvarpd_prop_nprops(varpd_instance_handle_t *, uint_t *);
extern int libvarpd_prop_info_fill(varpd_prop_handle_t *, uint_t);
extern int libvarpd_prop_info(varpd_prop_handle_t *, const char **, uint_t *,
    uint_t *, const void **, uint32_t *, const mac_propval_range_t **);
extern int libvarpd_prop_get(varpd_prop_handle_t *, void *, uint32_t *);
extern int libvarpd_prop_set(varpd_prop_handle_t *, const void *, uint32_t);

/* Door server. */
extern int libvarpd_door_server_create(varpd_handle_t *, const char *);
extern void libvarpd_door_server_destroy(varpd_handle_t *);

/*
 * Overlay lookup processing.  The void * argument and return type of
 * libvarpd_overlay_lookup_run() presumably make it usable as a thread
 * start routine -- TODO confirm against its callers.
 */
extern void *libvarpd_overlay_lookup_run(void *);
extern void libvarpd_overlay_lookup_quiesce(varpd_handle_t *);

#ifdef __cplusplus
}
#endif

#endif /* _LIBVARPD_H */
+ */ + +/* + * Common routines for implementing proxy arp + */ + +#include <sys/types.h> +#include <net/if.h> +#include <netinet/if_ether.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/udp.h> +#include <netinet/dhcp.h> +#include <libvarpd_impl.h> +#include <sys/vlan.h> +#include <strings.h> +#include <assert.h> + +#define IPV6_VERSION 6 + +typedef struct varpd_arp_query { + int vaq_type; + char vaq_buf[ETHERMAX + VLAN_TAGSZ]; + size_t vaq_bsize; + uint8_t vaq_lookup[ETHERADDRL]; + struct sockaddr_storage vaq_sock; + varpd_instance_t *vaq_inst; + struct ether_arp *vaq_ea; + varpd_query_handle_t *vaq_query; + const overlay_targ_lookup_t *vaq_otl; + ip6_t *vaq_ip6; + nd_neighbor_solicit_t *vaq_ns; +} varpd_arp_query_t; + +typedef struct varpd_dhcp_query { + char vdq_buf[ETHERMAX + VLAN_TAGSZ]; + size_t vdq_bsize; + uint8_t vdq_lookup[ETHERADDRL]; + const overlay_targ_lookup_t *vdq_otl; + varpd_instance_t *vdq_inst; + varpd_query_handle_t *vdq_query; + struct ether_header *vdq_ether; +} varpd_dhcp_query_t; + +static const uint8_t libvarpd_arp_bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff }; + +void +libvarpd_plugin_proxy_arp(varpd_provider_handle_t *hdl, + varpd_query_handle_t *vqh, const overlay_targ_lookup_t *otl) +{ + varpd_arp_query_t *vaq; + varpd_instance_t *inst = (varpd_instance_t *)hdl; + struct ether_arp *ea; + struct sockaddr_in *ip; + + vaq = umem_alloc(sizeof (varpd_arp_query_t), UMEM_DEFAULT); + if (vaq == NULL) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + return; + } + vaq->vaq_bsize = sizeof (vaq->vaq_buf); + + if (otl->otl_sap != ETHERTYPE_ARP) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + /* + * An ARP packet should not be very large because it's definited to only + * be allowed to have a single entry at a given time. 
But our data must + * be at least as large as an ether_arp and our header must be at least + * as large as a standard ethernet header. + */ + if (otl->otl_hdrsize + otl->otl_pktsize > vaq->vaq_bsize || + otl->otl_pktsize < sizeof (struct ether_arp) || + otl->otl_hdrsize < sizeof (struct ether_header)) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + if (libvarpd_overlay_packet(inst->vri_impl, otl, vaq->vaq_buf, + &vaq->vaq_bsize) != 0) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + if (otl->otl_hdrsize + otl->otl_pktsize < vaq->vaq_bsize) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + ea = (void *)((uintptr_t)vaq->vaq_buf + (uintptr_t)otl->otl_hdrsize); + + /* + * Make sure it matches something that we know about. + */ + if (ntohs(ea->ea_hdr.ar_hrd) != ARPHRD_ETHER || + ntohs(ea->ea_hdr.ar_pro) != ETHERTYPE_IP || + ea->ea_hdr.ar_hln != ETHERADDRL || + ea->ea_hdr.ar_pln != sizeof (ea->arp_spa) || + ntohs(ea->ea_hdr.ar_op) != ARPOP_REQUEST) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + /* + * Now that we've verified that our data is sane, see if we're doing a + * gratuitous arp and if so, drop it. Otherwise, we may end up + * triggering duplicate address detection. 
+ */ + if (bcmp(ea->arp_spa, ea->arp_tpa, sizeof (ea->arp_spa)) == 0) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + bzero(&vaq->vaq_sock, sizeof (struct sockaddr_storage)); + ip = (struct sockaddr_in *)&vaq->vaq_sock; + ip->sin_family = AF_INET; + bcopy(ea->arp_tpa, &ip->sin_addr, sizeof (ea->arp_tpa)); + + vaq->vaq_type = AF_INET; + vaq->vaq_inst = inst; + vaq->vaq_ea = ea; + vaq->vaq_query = vqh; + vaq->vaq_otl = otl; + + if (inst->vri_plugin->vpp_ops->vpo_arp == NULL) + libvarpd_panic("%s plugin asked to do arp, but has no method", + inst->vri_plugin->vpp_name); + + inst->vri_plugin->vpp_ops->vpo_arp(inst->vri_private, + (varpd_arp_handle_t *)vaq, VARPD_QTYPE_ETHERNET, + (struct sockaddr *)ip, vaq->vaq_lookup); +} + +static void +libvarpd_proxy_arp_fini(varpd_arp_query_t *vaq) +{ + struct ether_header *ether; + struct sockaddr_in *ip; + + ip = (struct sockaddr_in *)&vaq->vaq_sock; + /* + * Modify our packet in place for a reply. We need to swap around the + * sender and target addresses. + */ + vaq->vaq_ea->ea_hdr.ar_op = htons(ARPOP_REPLY); + bcopy(vaq->vaq_ea->arp_sha, vaq->vaq_ea->arp_tha, ETHERADDRL); + bcopy(vaq->vaq_lookup, vaq->vaq_ea->arp_sha, ETHERADDRL); + bcopy(vaq->vaq_ea->arp_spa, &ip->sin_addr, + sizeof (vaq->vaq_ea->arp_spa)); + bcopy(vaq->vaq_ea->arp_tpa, vaq->vaq_ea->arp_spa, + sizeof (vaq->vaq_ea->arp_spa)); + bcopy(&ip->sin_addr, vaq->vaq_ea->arp_tpa, + sizeof (vaq->vaq_ea->arp_spa)); + + /* + * Finally go ahead and fix up the mac header and reply to the sender + * explicitly. 
+ */ + ether = (struct ether_header *)vaq->vaq_buf; + bcopy(&ether->ether_shost, &ether->ether_dhost, ETHERADDRL); + bcopy(vaq->vaq_lookup, &ether->ether_shost, ETHERADDRL); + + (void) libvarpd_overlay_inject(vaq->vaq_inst->vri_impl, vaq->vaq_otl, + vaq->vaq_buf, vaq->vaq_bsize); + + libvarpd_plugin_query_reply(vaq->vaq_query, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); +} + +/* + * Sum the IPv6 source and destination addresses, the payload length, the + * ICMPv6 next-header value and mlen bytes of buf as 16-bit words, folding any + * carries back into the low 16 bits. The result is returned uncomplemented and + * mlen must be even. + */ +static uint16_t +libvarpd_icmpv6_checksum(const ip6_t *v6hdr, const uint16_t *buf, uint16_t mlen) +{ + int i; + uint16_t *v; + uint32_t sum = 0; + + assert(mlen % 2 == 0); + v = (uint16_t *)&v6hdr->ip6_src; + for (i = 0; i < sizeof (struct in6_addr); i += 2, v++) + sum += *v; + v = (uint16_t *)&v6hdr->ip6_dst; + for (i = 0; i < sizeof (struct in6_addr); i += 2, v++) + sum += *v; + sum += htons(mlen); +#ifdef _BIG_ENDIAN + sum += IPPROTO_ICMPV6; +#else + sum += IPPROTO_ICMPV6 << 8; +#endif /* _BIG_ENDIAN */ + + for (i = 0; i < mlen; i += 2, buf++) + sum += *buf; + + while ((sum >> 16) != 0) + sum = (sum & 0xffff) + (sum >> 16); + + return (sum & 0xffff); +} + +/* + * Proxying NDP is much more involved than proxying ARP. For starters, NDP + * neighbor solicitations are implemented in terms of IPv6 ICMP as opposed to + * its own Ethertype. Therefore, we're going to have to grab a packet if it's a + * multicast packet and then determine if we actually want to do anything with + * it. 
+ */ +void +libvarpd_plugin_proxy_ndp(varpd_provider_handle_t *hdl, + varpd_query_handle_t *vqh, const overlay_targ_lookup_t *otl) +{ + size_t bsize, plen; + varpd_arp_query_t *vaq; + ip6_t *v6hdr; + nd_neighbor_solicit_t *ns; + nd_opt_hdr_t *opt; + struct sockaddr_in6 *s6; + + varpd_instance_t *inst = (varpd_instance_t *)hdl; + uint8_t *eth = NULL; + + vaq = umem_alloc(sizeof (varpd_arp_query_t), UMEM_DEFAULT); + if (vaq == NULL) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + return; + } + vaq->vaq_bsize = sizeof (vaq->vaq_buf); + + if (otl->otl_dstaddr[0] != 0x33 || + otl->otl_dstaddr[1] != 0x33) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + /* + * If we have more than a standard frame size for the ICMP neighbor + * solicitation, drop it. Similarly if there isn't enough data present + * for us, drop it. + */ + if (otl->otl_hdrsize + otl->otl_pktsize > vaq->vaq_bsize) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + if (otl->otl_pktsize < sizeof (ip6_t) + + sizeof (nd_neighbor_solicit_t)) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + if (libvarpd_overlay_packet(inst->vri_impl, otl, vaq->vaq_buf, + &vaq->vaq_bsize) != 0) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + bsize = vaq->vaq_bsize; + bsize -= otl->otl_hdrsize; + assert(bsize > sizeof (ip6_t)); + + v6hdr = (ip6_t *)(vaq->vaq_buf + otl->otl_hdrsize); + if (((v6hdr->ip6_vfc & 0xf0) >> 4) != IPV6_VERSION) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + if (v6hdr->ip6_nxt != IPPROTO_ICMPV6) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + /* + * In addition to 
getting these requests on the multicast address for + * node solicitation, we may also end up getting them on a generic + * multicast address due to timeouts or other choices by various OSes. + * We should be fairly liberal and accept both, even though the standard + * wants them sent to a solicitation address. + */ + if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&v6hdr->ip6_dst) && + !IN6_IS_ADDR_MC_LINKLOCAL(&v6hdr->ip6_dst)) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + bsize -= sizeof (ip6_t); + plen = ntohs(v6hdr->ip6_plen); + if (plen < sizeof (nd_neighbor_solicit_t) || bsize < plen) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + /* + * Now we know that this is an ICMPv6 request targeting the right + * IPv6 multicast prefix. Let's go through and verify that ICMPv6 + * indicates that we have the real thing and ensure that per RFC 4861 + * the target address is not a multicast address. Further, because this + * is a multicast on Ethernet, we must have a source link-layer address. + * + * We should probably enforce that we have a valid ICMP checksum at some + * point. 
+ */ + ns = (nd_neighbor_solicit_t *)(vaq->vaq_buf + otl->otl_hdrsize + + sizeof (ip6_t)); + if (ns->nd_ns_type != ND_NEIGHBOR_SOLICIT || ns->nd_ns_code != 0) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + if (IN6_IS_ADDR_MULTICAST(&ns->nd_ns_target) || + IN6_IS_ADDR_V4MAPPED(&ns->nd_ns_target) || + IN6_IS_ADDR_LOOPBACK(&ns->nd_ns_target)) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + plen -= sizeof (nd_neighbor_solicit_t); + opt = (nd_opt_hdr_t *)(ns+1); + while (plen >= sizeof (struct nd_opt_hdr)) { + /* A zero-length or overlong option is clearly bogus */ + if (opt->nd_opt_len == 0 || opt->nd_opt_len * 8 > plen) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + if (opt->nd_opt_type == ND_OPT_SOURCE_LINKADDR) { + eth = (uint8_t *)((uintptr_t)opt + + sizeof (nd_opt_hdr_t)); + } + plen -= opt->nd_opt_len * 8; + opt = (nd_opt_hdr_t *)((uintptr_t)opt + + opt->nd_opt_len * 8); + } + + if (eth == NULL) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } + + bzero(&vaq->vaq_sock, sizeof (struct sockaddr_storage)); + s6 = (struct sockaddr_in6 *)&vaq->vaq_sock; + s6->sin6_family = AF_INET6; + bcopy(&ns->nd_ns_target, &s6->sin6_addr, sizeof (s6->sin6_addr)); + + if (inst->vri_plugin->vpp_ops->vpo_arp == NULL) + libvarpd_panic("%s plugin asked to do arp, but has no method", + inst->vri_plugin->vpp_name); + + vaq->vaq_type = AF_INET6; + vaq->vaq_inst = inst; + vaq->vaq_ea = NULL; + vaq->vaq_query = vqh; + vaq->vaq_otl = otl; + vaq->vaq_ns = ns; + vaq->vaq_ip6 = v6hdr; + inst->vri_plugin->vpp_ops->vpo_arp(inst->vri_private, + (varpd_arp_handle_t *)vaq, VARPD_QTYPE_ETHERNET, + (struct sockaddr *)s6, vaq->vaq_lookup); +} + +static void +libvarpd_proxy_ndp_fini(varpd_arp_query_t *vaq) +{ + char resp[ETHERMAX + VLAN_TAGSZ]; 
+ struct ether_header *ether; + nd_neighbor_advert_t *na; + nd_opt_hdr_t *opt; + ip6_t *v6hdr; + size_t roff = 0; + + /* + * Now we need to assemble a neighbor advertisement (NA) as a response. + * Unlike with arp, we opt to use a new packet just to make things a bit + * simpler and saner here. + */ + v6hdr = vaq->vaq_ip6; + bcopy(vaq->vaq_buf, resp, vaq->vaq_otl->otl_hdrsize); + ether = (struct ether_header *)resp; + bcopy(&ether->ether_shost, &ether->ether_dhost, ETHERADDRL); + bcopy(vaq->vaq_lookup, &ether->ether_shost, ETHERADDRL); + roff += vaq->vaq_otl->otl_hdrsize; + bcopy(v6hdr, resp + roff, sizeof (ip6_t)); + v6hdr = (ip6_t *)(resp + roff); + bcopy(&v6hdr->ip6_src, &v6hdr->ip6_dst, sizeof (struct in6_addr)); + bcopy(&vaq->vaq_ns->nd_ns_target, &v6hdr->ip6_src, + sizeof (struct in6_addr)); + roff += sizeof (ip6_t); + na = (nd_neighbor_advert_t *)(resp + roff); + na->nd_na_type = ND_NEIGHBOR_ADVERT; + na->nd_na_code = 0; + /* + * RFC 4443 defines that we should set the checksum to zero before we + * calculate it. + */ + na->nd_na_cksum = 0; + /* + * Nota bene, the header <netinet/icmp6.h> has already transformed this + * into the appropriate host order. Don't use htonl. + */ + na->nd_na_flags_reserved = ND_NA_FLAG_SOLICITED | ND_NA_FLAG_OVERRIDE; + bcopy(&vaq->vaq_ns->nd_ns_target, &na->nd_na_target, + sizeof (struct in6_addr)); + roff += sizeof (nd_neighbor_advert_t); + + opt = (nd_opt_hdr_t *)(resp + roff); + opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + opt->nd_opt_len = 1; + roff += sizeof (nd_opt_hdr_t); + bcopy(vaq->vaq_lookup, resp + roff, ETHERADDRL); + roff += ETHERADDRL; + + /* + * Now that we've filled in the packet, go back and compute the checksum + * and fill in the IPv6 payload size. 
+ */ + v6hdr->ip6_plen = htons(roff - sizeof (ip6_t) - + vaq->vaq_otl->otl_hdrsize); + na->nd_na_cksum = ~libvarpd_icmpv6_checksum(v6hdr, (uint16_t *)na, + ntohs(v6hdr->ip6_plen)) & 0xffff; + + (void) libvarpd_overlay_inject(vaq->vaq_inst->vri_impl, vaq->vaq_otl, + resp, roff); + + libvarpd_plugin_query_reply(vaq->vaq_query, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); +} + +void +libvarpd_plugin_arp_reply(varpd_arp_handle_t *vah, int action) +{ + varpd_arp_query_t *vaq = (varpd_arp_query_t *)vah; + + if (vaq == NULL) + libvarpd_panic("unknown plugin passed invalid " + "varpd_arp_handle_t"); + + if (action == VARPD_LOOKUP_DROP) { + libvarpd_plugin_query_reply(vaq->vaq_query, VARPD_LOOKUP_DROP); + umem_free(vaq, sizeof (varpd_arp_query_t)); + return; + } else if (action != VARPD_LOOKUP_OK) + libvarpd_panic("%s plugin returned invalid action %d", + vaq->vaq_inst->vri_plugin->vpp_name, action); + + switch (vaq->vaq_type) { + case AF_INET: + libvarpd_proxy_arp_fini(vaq); + break; + case AF_INET6: + libvarpd_proxy_ndp_fini(vaq); + break; + default: + libvarpd_panic("encountered unknown vaq_type: %d", + vaq->vaq_type); + } +} + +void +libvarpd_plugin_proxy_dhcp(varpd_provider_handle_t *hdl, + varpd_query_handle_t *vqh, const overlay_targ_lookup_t *otl) +{ + varpd_dhcp_query_t *vdq; + struct ether_header *ether; + struct ip *ip; + struct udphdr *udp; + varpd_instance_t *inst = (varpd_instance_t *)hdl; + + vdq = umem_alloc(sizeof (varpd_dhcp_query_t), UMEM_DEFAULT); + if (vdq == NULL) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + return; + } + vdq->vdq_bsize = sizeof (vdq->vdq_buf); + + if (otl->otl_sap != ETHERTYPE_IP) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } + + if (bcmp(otl->otl_dstaddr, libvarpd_arp_bcast, ETHERADDRL) != 0) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } + + if 
(otl->otl_hdrsize + otl->otl_pktsize > vdq->vdq_bsize || + otl->otl_pktsize < sizeof (struct ip) + sizeof (struct udphdr) + + sizeof (struct dhcp) || + otl->otl_hdrsize < sizeof (struct ether_header)) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } + + if (libvarpd_overlay_packet(inst->vri_impl, otl, vdq->vdq_buf, + &vdq->vdq_bsize) != 0) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } + + if (vdq->vdq_bsize != otl->otl_hdrsize + otl->otl_pktsize) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } + + ether = (struct ether_header *)vdq->vdq_buf; + ip = (struct ip *)(vdq->vdq_buf + otl->otl_hdrsize); + + /* Only an IPv4 packet carrying UDP can be DHCP; drop anything else. */ + if (ip->ip_v != IPVERSION || ip->ip_p != IPPROTO_UDP) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } + + if (otl->otl_hdrsize + ip->ip_hl * 4 + sizeof (struct udphdr) > + vdq->vdq_bsize) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } + + udp = (struct udphdr *)(vdq->vdq_buf + otl->otl_hdrsize + + ip->ip_hl * 4); + + if (ntohs(udp->uh_sport) != IPPORT_BOOTPC || + ntohs(udp->uh_dport) != IPPORT_BOOTPS) { + libvarpd_plugin_query_reply(vqh, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } + + vdq->vdq_ether = ether; + vdq->vdq_inst = inst; + vdq->vdq_query = vqh; + vdq->vdq_otl = otl; + + if (inst->vri_plugin->vpp_ops->vpo_dhcp == NULL) + libvarpd_panic("%s plugin asked to do dhcp, but has no method", + inst->vri_plugin->vpp_name); + + inst->vri_plugin->vpp_ops->vpo_dhcp(inst->vri_private, + (varpd_dhcp_handle_t *)vdq, VARPD_QTYPE_ETHERNET, otl, + vdq->vdq_lookup); +} + +void +libvarpd_plugin_dhcp_reply(varpd_dhcp_handle_t *vdh, int action) +{ + varpd_dhcp_query_t *vdq = (varpd_dhcp_query_t 
*)vdh; + + if (vdq == NULL) + libvarpd_panic("unknown plugin passed invalid " + "varpd_dhcp_handle_t"); + + if (action == VARPD_LOOKUP_DROP) { + libvarpd_plugin_query_reply(vdq->vdq_query, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); + return; + } else if (action != VARPD_LOOKUP_OK) + libvarpd_panic("%s plugin returned invalid action %d", + vdq->vdq_inst->vri_plugin->vpp_name, action); + + bcopy(vdq->vdq_lookup, &vdq->vdq_ether->ether_dhost, ETHERADDRL); + (void) libvarpd_overlay_resend(vdq->vdq_inst->vri_impl, vdq->vdq_otl, + vdq->vdq_buf, vdq->vdq_bsize); + + libvarpd_plugin_query_reply(vdq->vdq_query, VARPD_LOOKUP_DROP); + umem_free(vdq, sizeof (varpd_dhcp_query_t)); +} + +/* + * Inject a gratuitous ARP packet to the specified mac address. + */ +void +libvarpd_inject_arp(varpd_provider_handle_t *vph, const uint16_t vlan, + const uint8_t *srcmac, const struct in_addr *srcip, const uint8_t *dstmac) +{ + char buf[500]; + size_t bsize = 0; + struct ether_arp *ea; + varpd_instance_t *inst = (varpd_instance_t *)vph; + + if (vlan != 0) { + struct ether_vlan_header *eh; + eh = (struct ether_vlan_header *)(buf + bsize); + bsize += sizeof (struct ether_vlan_header); + bcopy(dstmac, &eh->ether_dhost, ETHERADDRL); + bcopy(srcmac, &eh->ether_shost, ETHERADDRL); + eh->ether_tpid = htons(ETHERTYPE_VLAN); + eh->ether_tci = htons(VLAN_TCI(0, ETHER_CFI, vlan)); + eh->ether_type = htons(ETHERTYPE_ARP); + } else { + struct ether_header *eh; + eh = (struct ether_header *)(buf + bsize); + bsize += sizeof (struct ether_header); + bcopy(dstmac, &eh->ether_dhost, ETHERADDRL); + bcopy(srcmac, &eh->ether_shost, ETHERADDRL); + eh->ether_type = htons(ETHERTYPE_ARP); + } + + ea = (struct ether_arp *)(buf + bsize); + bsize += sizeof (struct ether_arp); + ea->ea_hdr.ar_hrd = htons(ARPHRD_ETHER); + ea->ea_hdr.ar_pro = htons(ETHERTYPE_IP); + ea->ea_hdr.ar_hln = ETHERADDRL; + ea->ea_hdr.ar_pln = sizeof (struct in_addr); + ea->ea_hdr.ar_op = htons(ARPOP_REQUEST); + 
bcopy(srcmac, ea->arp_sha, ETHERADDRL); + bcopy(srcip, ea->arp_spa, sizeof (struct in_addr)); + bcopy(libvarpd_arp_bcast, ea->arp_tha, ETHERADDRL); + bcopy(srcip, ea->arp_tpa, sizeof (struct in_addr)); + + (void) libvarpd_overlay_instance_inject(inst, buf, bsize); +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c new file mode 100644 index 0000000000..1254c14e19 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.c @@ -0,0 +1,626 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * varpd client interfaces + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <umem.h> +#include <unistd.h> +#include <string.h> +#include <strings.h> +#include <door.h> + +#include <libvarpd_impl.h> + +typedef struct varpd_client { + int vcl_doorfd; +} varpd_client_t; + +typedef struct varpd_client_prop_info { + varpd_client_t *vcprop_client; + uint64_t vcprop_instance; + uint_t vcprop_propid; + uint_t vcprop_type; + uint_t vcprop_prot; + uint32_t vcprop_defsize; + uint32_t vcprop_psize; + char vcprop_name[LIBVARPD_PROP_NAMELEN]; + uint8_t vcprop_default[LIBVARPD_PROP_SIZEMAX]; + uint8_t vcprop_poss[LIBVARPD_PROP_SIZEMAX]; +} varpd_client_prop_info_t; + +static int +libvarpd_c_door_call(varpd_client_t *client, varpd_client_arg_t *argp, + size_t altsize) +{ + int ret; + door_arg_t darg; + + darg.data_ptr = (char *)argp; + darg.desc_ptr = NULL; + darg.desc_num = 0; + darg.rbuf = (char *)argp; + if (altsize != 0) { + darg.data_size = 
altsize; + darg.rsize = altsize; + } else { + darg.data_size = sizeof (varpd_client_arg_t); + darg.rsize = sizeof (varpd_client_arg_t); + } + + do { + ret = door_call(client->vcl_doorfd, &darg); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + switch (errno) { + case E2BIG: + case EFAULT: + case EINVAL: + case ENOTSUP: + case EOVERFLOW: + case ENFILE: + libvarpd_panic("unhandleable errno from door_call: %d", + errno); + } + ret = errno; + } + + return (ret); +} + +int +libvarpd_c_create(varpd_client_handle_t **chpp, const char *doorname) +{ + varpd_client_t *client; + + client = umem_alloc(sizeof (varpd_client_t), UMEM_DEFAULT); + if (client == NULL) + return (ENOMEM); + + client->vcl_doorfd = open(doorname, O_RDWR); + if (client->vcl_doorfd < 0) { + int ret = errno; + umem_free(client, sizeof (varpd_client_t)); + return (ret); + } + + *chpp = (varpd_client_handle_t *)client; + return (0); +} + +void +libvarpd_c_destroy(varpd_client_handle_t *chp) +{ + varpd_client_t *client = (varpd_client_t *)chp; + if (close(client->vcl_doorfd) != 0) + libvarpd_panic("failed to close door fd %d: %d", + client->vcl_doorfd, errno); + + umem_free(chp, sizeof (varpd_client_t)); +} + +int +libvarpd_c_instance_create(varpd_client_handle_t *chp, datalink_id_t linkid, + const char *search, uint64_t *cidp) +{ + int ret; + varpd_client_t *client = (varpd_client_t *)chp; + varpd_client_arg_t carg; + varpd_client_create_arg_t *cap = &carg.vca_un.vca_create; + + if (strlen(search) >= LIBVARPD_PROP_NAMELEN) + return (EINVAL); + carg.vca_command = VARPD_CLIENT_CREATE; + carg.vca_errno = 0; + cap->vcca_linkid = linkid; + (void) strlcpy(cap->vcca_plugin, search, LIBVARPD_PROP_NAMELEN); + + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + *cidp = cap->vcca_id; + + return (0); +} + +int +libvarpd_c_instance_activate(varpd_client_handle_t *chp, uint64_t cid) +{ + int ret; + varpd_client_t *client = 
(varpd_client_t *)chp; + varpd_client_arg_t carg; + varpd_client_instance_arg_t *vciap = &carg.vca_un.vca_instance; + + carg.vca_command = VARPD_CLIENT_ACTIVATE; + carg.vca_errno = 0; + vciap->vcia_id = cid; + + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + return (0); +} + +int +libvarpd_c_instance_destroy(varpd_client_handle_t *chp, uint64_t cid) +{ + int ret; + varpd_client_t *client = (varpd_client_t *)chp; + varpd_client_arg_t carg; + varpd_client_instance_arg_t *vciap = &carg.vca_un.vca_instance; + + carg.vca_command = VARPD_CLIENT_DESTROY; + carg.vca_errno = 0; + vciap->vcia_id = cid; + + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + return (0); +} + +int +libvarpd_c_prop_nprops(varpd_client_handle_t *chp, uint64_t cid, uint_t *nprops) +{ + int ret; + varpd_client_t *client = (varpd_client_t *)chp; + varpd_client_arg_t carg; + varpd_client_nprops_arg_t *vcnap = &carg.vca_un.vca_nprops; + + carg.vca_command = VARPD_CLIENT_NPROPS; + carg.vca_errno = 0; + vcnap->vcna_id = cid; + vcnap->vcna_nprops = 0; + + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + *nprops = vcnap->vcna_nprops; + return (0); +} + +int +libvarpd_c_prop_handle_alloc(varpd_client_handle_t *chp, uint64_t cid, + varpd_client_prop_handle_t **phdlp) +{ + varpd_client_prop_info_t *infop; + + infop = umem_alloc(sizeof (varpd_client_prop_info_t), UMEM_DEFAULT); + if (infop == NULL) + return (ENOMEM); + + bzero(infop, sizeof (varpd_client_prop_info_t)); + infop->vcprop_client = (varpd_client_t *)chp; + infop->vcprop_instance = cid; + infop->vcprop_propid = UINT_MAX; + *phdlp = (varpd_client_prop_handle_t *)infop; + return (0); +} + +void +libvarpd_c_prop_handle_free(varpd_client_prop_handle_t *phdl) +{ + umem_free(phdl, sizeof 
(varpd_client_prop_info_t)); + phdl = NULL; +} + +static void +libvarpd_c_prop_info_from_door(varpd_client_prop_info_t *infop, + const varpd_client_propinfo_arg_t *vcfap) +{ + infop->vcprop_propid = vcfap->vcfa_propid; + infop->vcprop_type = vcfap->vcfa_type; + infop->vcprop_prot = vcfap->vcfa_prot; + infop->vcprop_defsize = vcfap->vcfa_defsize; + infop->vcprop_psize = vcfap->vcfa_psize; + bcopy(vcfap->vcfa_name, infop->vcprop_name, LIBVARPD_PROP_NAMELEN); + bcopy(vcfap->vcfa_default, infop->vcprop_default, + LIBVARPD_PROP_SIZEMAX); + bcopy(vcfap->vcfa_poss, infop->vcprop_poss, LIBVARPD_PROP_SIZEMAX); +} + +int +libvarpd_c_prop_info_fill_by_name(varpd_client_prop_handle_t *phdl, + const char *name) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_propinfo_arg_t *vcfap = &carg.vca_un.vca_info; + varpd_client_prop_info_t *infop = (varpd_client_prop_info_t *)phdl; + + if (strlen(name) >= LIBVARPD_PROP_NAMELEN) + return (EINVAL); + bzero(&carg, sizeof (varpd_client_arg_t)); + carg.vca_command = VARPD_CLIENT_PROPINFO; + carg.vca_errno = 0; + vcfap->vcfa_id = infop->vcprop_instance; + vcfap->vcfa_propid = UINT_MAX; + (void) strlcpy(vcfap->vcfa_name, name, LIBVARPD_PROP_NAMELEN); + + ret = libvarpd_c_door_call(infop->vcprop_client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + libvarpd_c_prop_info_from_door(infop, vcfap); + return (0); +} + +int +libvarpd_c_prop_info_fill(varpd_client_prop_handle_t *phdl, uint_t propid) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_propinfo_arg_t *vcfap = &carg.vca_un.vca_info; + varpd_client_prop_info_t *infop = (varpd_client_prop_info_t *)phdl; + + bzero(&carg, sizeof (varpd_client_arg_t)); + carg.vca_command = VARPD_CLIENT_PROPINFO; + carg.vca_errno = 0; + vcfap->vcfa_id = infop->vcprop_instance; + vcfap->vcfa_propid = propid; + + ret = libvarpd_c_door_call(infop->vcprop_client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return 
(carg.vca_errno); + + libvarpd_c_prop_info_from_door(infop, vcfap); + return (0); +} + +/* + * Report the property information cached in phdl. Returns EINVAL if the + * handle's property id is still UINT_MAX. Each non-NULL output pointer is set + * from the handle; the name, default and possible-value pointers refer to + * storage inside the handle itself. + */ +int +libvarpd_c_prop_info(varpd_client_prop_handle_t *phdl, const char **namep, + uint_t *typep, uint_t *protp, const void **defp, uint32_t *defsizep, + const mac_propval_range_t **possp) +{ + varpd_client_prop_info_t *infop = (varpd_client_prop_info_t *)phdl; + if (infop->vcprop_propid == UINT_MAX) + return (EINVAL); + + if (namep != NULL) + *namep = infop->vcprop_name; + if (typep != NULL) + *typep = infop->vcprop_type; + if (protp != NULL) + *protp = infop->vcprop_prot; + if (defp != NULL) + *defp = infop->vcprop_default; + if (defsizep != NULL) + *defsizep = infop->vcprop_defsize; + if (possp != NULL) + *possp = (const mac_propval_range_t *)infop->vcprop_poss; + return (0); +} + +int +libvarpd_c_prop_get(varpd_client_prop_handle_t *phdl, void *buf, uint32_t *len) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_prop_arg_t *vcpap = &carg.vca_un.vca_prop; + varpd_client_prop_info_t *infop = (varpd_client_prop_info_t *)phdl; + + if (len == NULL || buf == NULL || infop->vcprop_propid == UINT_MAX) + return (EINVAL); + if (*len < LIBVARPD_PROP_SIZEMAX) + return (EOVERFLOW); + + bzero(&carg, sizeof (varpd_client_arg_t)); + carg.vca_command = VARPD_CLIENT_GETPROP; + carg.vca_errno = 0; + vcpap->vcpa_id = infop->vcprop_instance; + vcpap->vcpa_propid = infop->vcprop_propid; + + ret = libvarpd_c_door_call(infop->vcprop_client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + /* + * If the buffer size is too large then something odd has certainly + * happened here, it means that varpd has gone rogue. In such a case we + * return a rather odd error, though we don't believe that this should + * generally happen. 
+ */ + if (vcpap->vcpa_bufsize > LIBVARPD_PROP_SIZEMAX) + return (E2BIG); + + bcopy(vcpap->vcpa_buf, buf, vcpap->vcpa_bufsize); + *len = vcpap->vcpa_bufsize; + return (0); +} + +int +libvarpd_c_prop_set(varpd_client_prop_handle_t *phdl, const void *buf, + uint32_t len) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_prop_arg_t *vcpap = &carg.vca_un.vca_prop; + varpd_client_prop_info_t *infop = (varpd_client_prop_info_t *)phdl; + + if (len == 0 || buf == NULL || infop->vcprop_propid == UINT_MAX) + return (EINVAL); + if (len > LIBVARPD_PROP_SIZEMAX) + return (EOVERFLOW); + + carg.vca_command = VARPD_CLIENT_SETPROP; + carg.vca_errno = 0; + vcpap->vcpa_id = infop->vcprop_instance; + vcpap->vcpa_propid = infop->vcprop_propid; + vcpap->vcpa_bufsize = len; + bcopy(buf, vcpap->vcpa_buf, len); + + ret = libvarpd_c_door_call(infop->vcprop_client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + return (0); +} + +int +libvarpd_c_instance_lookup(varpd_client_handle_t *chp, datalink_id_t linkid, + uint64_t *instp) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_lookup_arg_t *vclap = &carg.vca_un.vca_lookup; + varpd_client_t *client = (varpd_client_t *)chp; + + carg.vca_command = VARPD_CLIENT_LOOKUP; + carg.vca_errno = 0; + vclap->vcla_linkid = linkid; + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + if (instp != NULL) + *instp = vclap->vcla_id; + + return (0); +} + +int +libvarpd_c_instance_target_mode(varpd_client_handle_t *chp, uint64_t cid, + uint_t *dtype, uint_t *mtype) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_target_mode_arg_t *vctmap = &carg.vca_un.vca_mode; + varpd_client_t *client = (varpd_client_t *)chp; + + carg.vca_command = VARPD_CLIENT_TARGET_MODE; + carg.vca_errno = 0; + vctmap->vtma_id = cid; + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if 
(carg.vca_errno != 0) + return (carg.vca_errno); + if (ret == 0) { + if (mtype != NULL) + *mtype = vctmap->vtma_mode; + if (dtype != NULL) + *dtype = vctmap->vtma_dest; + } + + return (ret); +} + +int +libvarpd_c_instance_cache_flush(varpd_client_handle_t *chp, uint64_t cid) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_target_cache_arg_t *vctcap = &carg.vca_un.vca_cache; + varpd_client_t *client = (varpd_client_t *)chp; + + carg.vca_command = VARPD_CLIENT_CACHE_FLUSH; + carg.vca_errno = 0; + + vctcap->vtca_id = cid; + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + return (0); +} + +int +libvarpd_c_instance_cache_delete(varpd_client_handle_t *chp, uint64_t cid, + const struct ether_addr *key) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_target_cache_arg_t *vctcap = &carg.vca_un.vca_cache; + varpd_client_t *client = (varpd_client_t *)chp; + + if (key == NULL) + return (EINVAL); + + carg.vca_command = VARPD_CLIENT_CACHE_DELETE; + carg.vca_errno = 0; + vctcap->vtca_id = cid; + bcopy(key, vctcap->vtca_key, ETHERADDRL); + + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + return (0); +} + +int +libvarpd_c_instance_cache_get(varpd_client_handle_t *chp, uint64_t cid, + const struct ether_addr *key, varpd_client_cache_entry_t *entry) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_target_cache_arg_t *vctcap = &carg.vca_un.vca_cache; + varpd_client_t *client = (varpd_client_t *)chp; + + if (key == NULL || entry == NULL) + return (EINVAL); + + carg.vca_command = VARPD_CLIENT_CACHE_GET; + carg.vca_errno = 0; + vctcap->vtca_id = cid; + bcopy(key, vctcap->vtca_key, ETHERADDRL); + bzero(&vctcap->vtca_entry, sizeof (varpd_client_cache_entry_t)); + + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return 
(carg.vca_errno); + + bcopy(&vctcap->vtca_entry, entry, sizeof (varpd_client_cache_entry_t)); + return (0); +} + +int +libvarpd_c_instance_cache_set(varpd_client_handle_t *chp, uint64_t cid, + const struct ether_addr *key, const varpd_client_cache_entry_t *entry) +{ + int ret; + varpd_client_arg_t carg; + varpd_client_target_cache_arg_t *vctcap = &carg.vca_un.vca_cache; + varpd_client_t *client = (varpd_client_t *)chp; + + if (key == NULL || entry == NULL) + return (EINVAL); + + carg.vca_command = VARPD_CLIENT_CACHE_SET; + carg.vca_errno = 0; + vctcap->vtca_id = cid; + bcopy(key, vctcap->vtca_key, ETHERADDRL); + bcopy(entry, &vctcap->vtca_entry, sizeof (varpd_client_cache_entry_t)); + + ret = libvarpd_c_door_call(client, &carg, 0); + if (ret != 0) + return (ret); + + if (carg.vca_errno != 0) + return (carg.vca_errno); + + return (0); +} + +int +libvarpd_c_instance_cache_walk(varpd_client_handle_t *chp, uint64_t cid, + varpd_client_cache_f func, void *arg) +{ + int ret = 0; + size_t bufsize = sizeof (varpd_client_arg_t) + + 100 * sizeof (varpd_client_cache_entry_t); + varpd_client_t *client = (varpd_client_t *)chp; + varpd_client_arg_t *cargp; + varpd_client_target_walk_arg_t *vctwap; + + /* + * Because the number of entries involved in a walk may be large, we + * dynamically allocate a number of queries to make at a single time. + * This also means that the average door request doesn't inflate by the + * number of entries we want. For now, let's always grab 100 entries in + * a request. 
+ */ + cargp = umem_zalloc(bufsize, UMEM_DEFAULT); + if (cargp == NULL) + return (ENOMEM); + vctwap = &cargp->vca_un.vca_walk; + for (;;) { + int i; + + cargp->vca_command = VARPD_CLIENT_CACHE_WALK; + cargp->vca_errno = 0; + vctwap->vtcw_id = cid; + vctwap->vtcw_count = 100; + + ret = libvarpd_c_door_call(client, cargp, bufsize); + if (ret != 0) + break; + + if (cargp->vca_errno != 0) { + ret = cargp->vca_errno; + break; + } + + if (vctwap->vtcw_count == 0) { + ret = 0; + break; + } + + /* varpd must never hand back more entries than we asked for. */ + if (vctwap->vtcw_count > 100) { + ret = E2BIG; + break; + } + + for (i = 0; i < vctwap->vtcw_count; i++) { + varpd_client_cache_entry_t ent; + + ent.vcp_flags = vctwap->vtcw_ents[i].otce_flags; + bcopy(vctwap->vtcw_ents[i].otce_dest.otp_mac, + &ent.vcp_mac, ETHERADDRL); + ent.vcp_ip = vctwap->vtcw_ents[i].otce_dest.otp_ip; + ent.vcp_port = vctwap->vtcw_ents[i].otce_dest.otp_port; + ret = func(chp, cid, + (struct ether_addr *)vctwap->vtcw_ents[i].otce_mac, + &ent, arg); + if (ret != 0) { + ret = 0; + goto done; + } + } + } + +done: + umem_free(cargp, bufsize); + return (ret); +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h new file mode 100644 index 0000000000..459711b385 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_client.h @@ -0,0 +1,92 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
#ifndef _LIBVARPD_CLIENT_H
#define	_LIBVARPD_CLIENT_H

/*
 * varpd interfaces
 *
 * Client-side library interface for talking to the varpd daemon. The
 * functions declared here return 0 on success and an errno value on
 * failure unless noted otherwise.
 */

#include <sys/types.h>
#include <stdint.h>
#include <sys/mac.h>
#include <sys/overlay_target.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Opaque handles. Consumers only ever hold pointers to these; the library
 * casts them to its private representations internally.
 */
typedef struct __varpd_client_handle varpd_client_handle_t;
typedef struct __varpd_client_prop_handle varpd_client_prop_handle_t;

/*
 * Client view of a single target cache entry, i.e. the destination that a
 * given MAC address (the cache key, passed separately) maps to.
 */
typedef struct varpd_client_cache_entry {
	struct ether_addr	vcp_mac;	/* destination MAC address */
	uint16_t		vcp_flags;	/* kernel cache entry flags */
	struct in6_addr		vcp_ip;		/* destination IP address */
	uint16_t		vcp_port;	/* destination port */
} varpd_client_cache_entry_t;

/*
 * We just use the values from the kernel for now.
 */
#define	LIBVARPD_PROP_SIZEMAX	OVERLAY_PROP_SIZEMAX
#define	LIBVARPD_PROP_NAMELEN	OVERLAY_PROP_NAMELEN

/*
 * Client handle and instance lifecycle.
 */
extern int libvarpd_c_create(varpd_client_handle_t **, const char *);
extern void libvarpd_c_destroy(varpd_client_handle_t *);
extern int libvarpd_c_instance_create(varpd_client_handle_t *, datalink_id_t,
    const char *, uint64_t *);
extern int libvarpd_c_instance_activate(varpd_client_handle_t *, uint64_t);
extern int libvarpd_c_instance_destroy(varpd_client_handle_t *, uint64_t);

/*
 * Property enumeration and access for an instance (identified by its
 * uint64_t id).
 */
extern int libvarpd_c_prop_nprops(varpd_client_handle_t *, uint64_t, uint_t *);
extern int libvarpd_c_prop_handle_alloc(varpd_client_handle_t *, uint64_t,
    varpd_client_prop_handle_t **);
extern void libvarpd_c_prop_handle_free(varpd_client_prop_handle_t *);
extern int libvarpd_c_prop_info_fill(varpd_client_prop_handle_t *, uint_t);
extern int libvarpd_c_prop_info_fill_by_name(varpd_client_prop_handle_t *,
    const char *);
extern int libvarpd_c_prop_info(varpd_client_prop_handle_t *, const char **,
    uint_t *, uint_t *, const void **, uint32_t *,
    const mac_propval_range_t **);
extern int libvarpd_c_prop_get(varpd_client_prop_handle_t *, void *,
    uint32_t *);
extern int libvarpd_c_prop_set(varpd_client_prop_handle_t *, const void *,
    uint32_t);

/*
 * Instance lookup and target cache manipulation. The cache is keyed by MAC
 * address. libvarpd_c_instance_cache_set() returns EINVAL when either the
 * key or the entry pointer is NULL.
 */
extern int libvarpd_c_instance_lookup(varpd_client_handle_t *, datalink_id_t,
    uint64_t *);
extern int libvarpd_c_instance_target_mode(varpd_client_handle_t *, uint64_t,
    uint_t *, uint_t *);
extern int libvarpd_c_instance_cache_flush(varpd_client_handle_t *, uint64_t);
extern int libvarpd_c_instance_cache_delete(varpd_client_handle_t *, uint64_t,
    const struct ether_addr *);
extern int libvarpd_c_instance_cache_get(varpd_client_handle_t *, uint64_t,
    const struct ether_addr *, varpd_client_cache_entry_t *);
extern int libvarpd_c_instance_cache_set(varpd_client_handle_t *, uint64_t,
    const struct ether_addr *, const varpd_client_cache_entry_t *);

/*
 * Callback for libvarpd_c_instance_cache_walk(). It is invoked once per cache
 * entry with the client handle, the instance id, the entry's key, the entry
 * itself and the caller's private argument. Returning non-zero ends the walk
 * early; the walk itself still returns 0 in that case.
 */
typedef int (*varpd_client_cache_f)(varpd_client_handle_t *, uint64_t,
    const struct ether_addr *, const varpd_client_cache_entry_t *, void *);
extern int libvarpd_c_instance_cache_walk(varpd_client_handle_t *, uint64_t,
    varpd_client_cache_f, void *);


#ifdef __cplusplus
}
#endif

#endif /* _LIBVARPD_CLIENT_H */
+ */ + +/* + * varpd door server logic + */ + +#include <door.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stropts.h> +#include <stdlib.h> +#include <strings.h> +#include <priv.h> +#include <libvarpd_impl.h> + +typedef int (libvarpd_door_f)(varpd_impl_t *, varpd_client_arg_t *, ucred_t *); + +static boolean_t +libvarpd_door_privileged(ucred_t *credp) +{ + const priv_set_t *ps; + + ps = ucred_getprivset(credp, PRIV_EFFECTIVE); + if (ps == NULL) + return (B_FALSE); + + return (priv_ismember(ps, PRIV_SYS_NET_CONFIG)); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_create(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + int ret; + varpd_instance_handle_t *ihdl; + varpd_client_create_arg_t *vccap = &vcap->vca_un.vca_create; + + vccap->vcca_plugin[LIBVARPD_PROP_NAMELEN-1] = '\0'; + ret = libvarpd_instance_create((varpd_handle_t *)vip, + vccap->vcca_linkid, vccap->vcca_plugin, &ihdl); + if (ret == 0) + vccap->vcca_id = libvarpd_instance_id(ihdl); + + return (ret); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_activate(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_handle_t *ihp; + varpd_client_instance_arg_t *vciap = &vcap->vca_un.vca_instance; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vciap->vcia_id); + if (ihp == NULL) + return (ENOENT); + return (libvarpd_instance_activate(ihp)); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_destroy(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_handle_t *ihp; + varpd_client_instance_arg_t *vciap = &vcap->vca_un.vca_instance; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vciap->vcia_id); + if (ihp == NULL) + return (ENOENT); + libvarpd_instance_destroy(ihp); + return (0); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_nprops(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_handle_t *ihp; + 
varpd_client_nprops_arg_t *vcnap = &vcap->vca_un.vca_nprops; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vcnap->vcna_id); + if (ihp == NULL) + return (ENOENT); + + return (libvarpd_prop_nprops(ihp, &vcnap->vcna_nprops)); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_propinfo(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + int ret; + varpd_instance_handle_t *ihp; + varpd_prop_handle_t *phdl; + varpd_client_propinfo_arg_t *vcfap = &vcap->vca_un.vca_info; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vcfap->vcfa_id); + if (ihp == NULL) + return (ENOENT); + ret = libvarpd_prop_handle_alloc((varpd_handle_t *)vip, ihp, &phdl); + if (ret != 0) + return (ret); + + if (vcfap->vcfa_propid != UINT_MAX) { + ret = libvarpd_prop_info_fill(phdl, vcfap->vcfa_propid); + if (ret != 0) { + libvarpd_prop_handle_free(phdl); + return (ret); + } + } else { + uint_t i, nprop; + const char *name; + + vcfap->vcfa_name[LIBVARPD_PROP_NAMELEN-1] = '\0'; + ret = libvarpd_prop_nprops(ihp, &nprop); + if (ret != 0) { + libvarpd_prop_handle_free(phdl); + return (ret); + } + for (i = 0; i < nprop; i++) { + ret = libvarpd_prop_info_fill(phdl, i); + if (ret != 0) { + libvarpd_prop_handle_free(phdl); + return (ret); + } + ret = libvarpd_prop_info(phdl, &name, NULL, NULL, NULL, + NULL, NULL); + if (ret != 0) { + libvarpd_prop_handle_free(phdl); + return (ret); + } + if (strcmp(vcfap->vcfa_name, name) == 0) + break; + } + + if (i == nprop) { + libvarpd_prop_handle_free(phdl); + return (ENOENT); + } + vcfap->vcfa_propid = i; + } + libvarpd_prop_door_convert(phdl, vcfap); + libvarpd_prop_handle_free(phdl); + return (0); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_getprop(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + int ret; + uint32_t size; + varpd_instance_handle_t *ihp; + varpd_prop_handle_t *phdl; + varpd_client_prop_arg_t *vcpap = &vcap->vca_un.vca_prop; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, 
vcpap->vcpa_id); + if (ihp == NULL) + return (ENOENT); + ret = libvarpd_prop_handle_alloc((varpd_handle_t *)vip, ihp, &phdl); + if (ret != 0) + return (ret); + + ret = libvarpd_prop_info_fill(phdl, vcpap->vcpa_propid); + if (ret != 0) { + libvarpd_prop_handle_free(phdl); + return (ret); + } + + ret = libvarpd_prop_get(phdl, vcpap->vcpa_buf, &size); + if (ret == 0) + vcpap->vcpa_bufsize = size; + libvarpd_prop_handle_free(phdl); + return (0); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_setprop(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + int ret; + varpd_instance_handle_t *ihp; + varpd_prop_handle_t *phdl; + varpd_client_prop_arg_t *vcpap = &vcap->vca_un.vca_prop; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vcpap->vcpa_id); + if (ihp == NULL) + return (ENOENT); + ret = libvarpd_prop_handle_alloc((varpd_handle_t *)vip, ihp, &phdl); + if (ret != 0) + return (ret); + + ret = libvarpd_prop_info_fill(phdl, vcpap->vcpa_propid); + if (ret != 0) { + libvarpd_prop_handle_free(phdl); + return (ret); + } + + ret = libvarpd_prop_set(phdl, vcpap->vcpa_buf, vcpap->vcpa_bufsize); + libvarpd_prop_handle_free(phdl); + return (ret); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_lookup(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_t *inst; + varpd_client_lookup_arg_t *vclap = &vcap->vca_un.vca_lookup; + + inst = libvarpd_instance_lookup_by_dlid(vip, vclap->vcla_linkid); + if (inst == NULL) + return (ENOENT); + + vclap->vcla_id = inst->vri_id; + return (0); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_target(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_handle_t *ihp; + varpd_instance_t *inst; + varpd_client_target_mode_arg_t *vtmap = &vcap->vca_un.vca_mode; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vtmap->vtma_id); + if (ihp == NULL) + return (ENOENT); + inst = (varpd_instance_t *)ihp; + vtmap->vtma_dest = inst->vri_dest; + vtmap->vtma_mode = 
inst->vri_mode; + return (0); +} + +static int +libvarpd_door_f_flush(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_handle_t *ihp; + varpd_client_target_cache_arg_t *vtcap = &vcap->vca_un.vca_cache; + + if (libvarpd_door_privileged(credp) == B_FALSE) + return (EPERM); + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vtcap->vtca_id); + if (ihp == NULL) + return (ENOENT); + return (libvarpd_overlay_cache_flush((varpd_instance_t *)ihp)); +} + +static int +libvarpd_door_f_delete(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_handle_t *ihp; + varpd_client_target_cache_arg_t *vtcap = &vcap->vca_un.vca_cache; + + if (libvarpd_door_privileged(credp) == B_FALSE) + return (EPERM); + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vtcap->vtca_id); + if (ihp == NULL) + return (ENOENT); + return (libvarpd_overlay_cache_delete((varpd_instance_t *)ihp, + vtcap->vtca_key)); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_get(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_handle_t *ihp; + varpd_client_target_cache_arg_t *vtcap = &vcap->vca_un.vca_cache; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vtcap->vtca_id); + if (ihp == NULL) + return (ENOENT); + return (libvarpd_overlay_cache_get((varpd_instance_t *)ihp, + vtcap->vtca_key, &vtcap->vtca_entry)); +} + +static int +libvarpd_door_f_set(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t *credp) +{ + varpd_instance_handle_t *ihp; + varpd_client_target_cache_arg_t *vtcap = &vcap->vca_un.vca_cache; + + if (libvarpd_door_privileged(credp) == B_FALSE) + return (EPERM); + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vtcap->vtca_id); + if (ihp == NULL) + return (ENOENT); + + return (libvarpd_overlay_cache_set((varpd_instance_t *)ihp, + vtcap->vtca_key, &vtcap->vtca_entry)); +} + +/* ARGSUSED */ +static int +libvarpd_door_f_walk(varpd_impl_t *vip, varpd_client_arg_t *vcap, + ucred_t 
*credp) +{ + varpd_instance_handle_t *ihp; + varpd_client_target_walk_arg_t *vctwp = &vcap->vca_un.vca_walk; + + ihp = libvarpd_instance_lookup((varpd_handle_t *)vip, vctwp->vtcw_id); + if (ihp == NULL) + return (ENOENT); + + return (libvarpd_overlay_cache_walk_fill((varpd_instance_t *)ihp, + &vctwp->vtcw_marker, &vctwp->vtcw_count, vctwp->vtcw_ents)); +} + +static libvarpd_door_f *libvarpd_door_table[] = { + libvarpd_door_f_create, + libvarpd_door_f_activate, + libvarpd_door_f_destroy, + libvarpd_door_f_nprops, + libvarpd_door_f_propinfo, + libvarpd_door_f_getprop, + libvarpd_door_f_setprop, + libvarpd_door_f_lookup, + libvarpd_door_f_target, + libvarpd_door_f_flush, + libvarpd_door_f_delete, + libvarpd_door_f_get, + libvarpd_door_f_set, + libvarpd_door_f_walk +}; + +/* ARGSUSED */ +static void +libvarpd_door_server(void *cookie, char *argp, size_t argsz, door_desc_t *dp, + uint_t ndesc) +{ + int ret; + varpd_client_eresp_t err; + ucred_t *credp = NULL; + varpd_impl_t *vip = cookie; + varpd_client_arg_t *vcap = (varpd_client_arg_t *)argp; + + err.vce_command = VARPD_CLIENT_INVALID; + if (argsz < sizeof (varpd_client_arg_t)) { + err.vce_errno = EINVAL; + goto errout; + } + + if ((ret = door_ucred(&credp)) != 0) { + err.vce_errno = ret; + goto errout; + } + + if (vcap->vca_command == VARPD_CLIENT_INVALID || + vcap->vca_command >= VARPD_CLIENT_MAX) { + err.vce_errno = EINVAL; + goto errout; + } + + vcap->vca_errno = 0; + ret = libvarpd_door_table[vcap->vca_command - 1](vip, vcap, credp); + if (ret != 0) + vcap->vca_errno = ret; + + ucred_free(credp); + (void) door_return(argp, argsz, NULL, 0); + return; + +errout: + ucred_free(credp); + (void) door_return((char *)&err, sizeof (err), NULL, 0); +} + +int +libvarpd_door_server_create(varpd_handle_t *vhp, const char *path) +{ + int fd, ret; + varpd_impl_t *vip = (varpd_impl_t *)vhp; + + mutex_enter(&vip->vdi_lock); + if (vip->vdi_doorfd >= 0) { + mutex_exit(&vip->vdi_lock); + return (EEXIST); + } + + vip->vdi_doorfd = 
door_create(libvarpd_door_server, vip, + DOOR_REFUSE_DESC | DOOR_NO_CANCEL); + if (vip->vdi_doorfd == -1) { + mutex_exit(&vip->vdi_lock); + return (errno); + } + + if ((fd = open(path, O_CREAT | O_RDWR, 0666)) == -1) { + ret = errno; + if (door_revoke(vip->vdi_doorfd) != 0) + libvarpd_panic("failed to revoke door: %d", + errno); + mutex_exit(&vip->vdi_lock); + return (errno); + } + + if (fchown(fd, UID_NETADM, GID_NETADM) != 0) { + ret = errno; + if (door_revoke(vip->vdi_doorfd) != 0) + libvarpd_panic("failed to revoke door: %d", + errno); + mutex_exit(&vip->vdi_lock); + return (ret); + } + + if (close(fd) != 0) + libvarpd_panic("failed to close door fd %d: %d", + fd, errno); + (void) fdetach(path); + if (fattach(vip->vdi_doorfd, path) != 0) { + ret = errno; + if (door_revoke(vip->vdi_doorfd) != 0) + libvarpd_panic("failed to revoke door: %d", + errno); + mutex_exit(&vip->vdi_lock); + return (ret); + } + + mutex_exit(&vip->vdi_lock); + return (0); +} + +void +libvarpd_door_server_destroy(varpd_handle_t *vhp) +{ + varpd_impl_t *vip = (varpd_impl_t *)vhp; + + mutex_enter(&vip->vdi_lock); + if (vip->vdi_doorfd != 0) { + if (door_revoke(vip->vdi_doorfd) != 0) + libvarpd_panic("failed to revoke door: %d", + errno); + vip->vdi_doorfd = -1; + } + mutex_exit(&vip->vdi_lock); +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h new file mode 100644 index 0000000000..f8530a7112 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_impl.h @@ -0,0 +1,248 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
/*
 * State behind a varpd_handle_t. The trailing comment on each member names
 * the lock that protects it; "RO" presumably marks members that are not
 * modified after the handle is set up (TODO confirm).
 */
typedef struct varpd_impl {
	mutex_t		vdi_lock;
	rwlock_t	vdi_pfdlock;
	avl_tree_t	vdi_plugins;	/* vdi_lock */
	avl_tree_t	vdi_instances;	/* vdi_lock */
	avl_tree_t	vdi_linstances;	/* vdi_lock */
	id_space_t	*vdi_idspace;	/* RO */
	umem_cache_t	*vdi_qcache;	/* RO */
	int		vdi_overlayfd;	/* RO */
	int		vdi_doorfd;	/* vdi_lock */
	/*
	 * NOTE(review): this was annotated "vdi_plock", but no member of that
	 * name exists; vdi_pfdlock is presumably what was meant -- confirm.
	 */
	int		vdi_persistfd;	/* vdi_pfdlock */
	cond_t		vdi_lthr_cv;	/* vdi_lock */
	boolean_t	vdi_lthr_quiesce;	/* vdi_lock */
	uint_t		vdi_lthr_count;	/* vdi_lock */
} varpd_impl_t;
/*
 * Commands a client may place in varpd_client_arg_t`vca_command.
 *
 * The door server dispatches a request by indexing libvarpd_door_table with
 * (command - 1), so the order of the values between VARPD_CLIENT_INVALID and
 * VARPD_CLIENT_MAX must match the order of that table exactly. Any new
 * command has to be added in the same position in both places.
 * VARPD_CLIENT_INVALID and anything >= VARPD_CLIENT_MAX are rejected with
 * EINVAL.
 */
typedef enum varpd_client_command {
	VARPD_CLIENT_INVALID = 0x0,
	VARPD_CLIENT_CREATE,
	VARPD_CLIENT_ACTIVATE,
	VARPD_CLIENT_DESTROY,
	VARPD_CLIENT_NPROPS,
	VARPD_CLIENT_PROPINFO,
	VARPD_CLIENT_GETPROP,
	VARPD_CLIENT_SETPROP,
	VARPD_CLIENT_LOOKUP,
	VARPD_CLIENT_TARGET_MODE,
	VARPD_CLIENT_CACHE_FLUSH,
	VARPD_CLIENT_CACHE_DELETE,
	VARPD_CLIENT_CACHE_GET,
	VARPD_CLIENT_CACHE_SET,
	VARPD_CLIENT_CACHE_WALK,
	VARPD_CLIENT_MAX	/* sentinel; not a valid command */
} varpd_client_command_t;
overlay_targ_lookup_t *, void *, size_t *); +extern int libvarpd_overlay_inject(varpd_impl_t *, + const overlay_targ_lookup_t *, void *, size_t); +extern int libvarpd_overlay_instance_inject(varpd_instance_t *, void *, size_t); +extern int libvarpd_overlay_resend(varpd_impl_t *, + const overlay_targ_lookup_t *, void *, size_t); +typedef int (*libvarpd_overlay_iter_f)(varpd_impl_t *, datalink_id_t, void *); +extern int libvarpd_overlay_iter(varpd_impl_t *, libvarpd_overlay_iter_f, + void *); +extern int libvarpd_overlay_cache_flush(varpd_instance_t *); +extern int libvarpd_overlay_cache_delete(varpd_instance_t *, const uint8_t *); +extern int libvarpd_overlay_cache_delete(varpd_instance_t *, const uint8_t *); +extern int libvarpd_overlay_cache_get(varpd_instance_t *, const uint8_t *, + varpd_client_cache_entry_t *); +extern int libvarpd_overlay_cache_set(varpd_instance_t *, const uint8_t *, + const varpd_client_cache_entry_t *); +extern int libvarpd_overlay_cache_walk_fill(varpd_instance_t *, uint64_t *, + uint64_t *, overlay_targ_cache_entry_t *); + +extern void libvarpd_persist_init(varpd_impl_t *); +extern void libvarpd_persist_fini(varpd_impl_t *); +extern int libvarpd_persist_instance(varpd_impl_t *, varpd_instance_t *); +extern void libvarpd_torch_instance(varpd_impl_t *, varpd_instance_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBVARPD_IMPL_H */ diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c new file mode 100644 index 0000000000..167c004a90 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_overlay.c @@ -0,0 +1,588 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Interactions with /dev/overlay + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <assert.h> +#include <unistd.h> +#include <stdlib.h> +#include <stropts.h> +#include <strings.h> +#include <umem.h> + +#include <libvarpd_impl.h> +#include <sys/overlay_target.h> + +#define OVERLAY_PATH "/dev/overlay" + +int +libvarpd_overlay_init(varpd_impl_t *vip) +{ + vip->vdi_overlayfd = open(OVERLAY_PATH, O_RDWR | O_EXCL); + if (vip->vdi_overlayfd == -1) + return (errno); + return (0); +} + +void +libvarpd_overlay_fini(varpd_impl_t *vip) +{ + assert(vip->vdi_overlayfd > 0); + if (close(vip->vdi_overlayfd) != 0) + libvarpd_panic("failed to close /dev/overlay fd %d: %d", + vip->vdi_overlayfd, errno); +} + +int +libvarpd_overlay_info(varpd_impl_t *vip, datalink_id_t linkid, + overlay_plugin_dest_t *destp, uint64_t *flags, uint64_t *vnetid) +{ + overlay_targ_info_t oti; + + oti.oti_linkid = linkid; + if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_INFO, &oti) != 0) + return (errno); + + if (destp != NULL) + *destp = oti.oti_needs; + if (flags != NULL) + *flags = oti.oti_flags; + if (vnetid != NULL) + *vnetid = oti.oti_vnetid; + return (0); +} + +int +libvarpd_overlay_associate(varpd_instance_t *inst) +{ + overlay_targ_associate_t ota; + varpd_impl_t *vip = inst->vri_impl; + + bzero(&ota, sizeof (overlay_targ_associate_t)); + ota.ota_linkid = inst->vri_linkid; + ota.ota_mode = inst->vri_mode; + ota.ota_id = inst->vri_id; + ota.ota_provides = inst->vri_dest; + + if (ota.ota_mode == OVERLAY_TARGET_POINT) { + int ret; + ret = inst->vri_plugin->vpp_ops->vpo_default(inst->vri_private, + &ota.ota_point); + if (ret != VARPD_LOOKUP_OK) + return (ret); + } + + if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_ASSOCIATE, &ota) != 0) + return (errno); + + return (0); +} + +int 
+libvarpd_overlay_disassociate(varpd_instance_t *inst) +{ + overlay_targ_id_t otid; + varpd_impl_t *vip = inst->vri_impl; + + otid.otid_linkid = inst->vri_linkid; + if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_DISASSOCIATE, &otid) != 0) + return (errno); + return (0); +} + +int +libvarpd_overlay_degrade_datalink(varpd_impl_t *vip, datalink_id_t linkid, + const char *msg) +{ + overlay_targ_degrade_t otd; + + otd.otd_linkid = linkid; + (void) strlcpy(otd.otd_buf, msg, OVERLAY_STATUS_BUFLEN); + if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_DEGRADE, &otd) != 0) + return (errno); + return (0); + +} + +int +libvarpd_overlay_degrade(varpd_instance_t *inst, const char *msg) +{ + return (libvarpd_overlay_degrade_datalink(inst->vri_impl, + inst->vri_linkid, msg)); +} + +int +libvarpd_overlay_restore(varpd_instance_t *inst) +{ + overlay_targ_id_t otid; + varpd_impl_t *vip = inst->vri_impl; + + otid.otid_linkid = inst->vri_linkid; + if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_RESTORE, &otid) != 0) + return (errno); + return (0); +} + +int +libvarpd_overlay_packet(varpd_impl_t *vip, const overlay_targ_lookup_t *otl, + void *buf, size_t *buflen) +{ + int ret; + overlay_targ_pkt_t otp; + + otp.otp_linkid = UINT64_MAX; + otp.otp_reqid = otl->otl_reqid; + otp.otp_size = *buflen; + otp.otp_buf = buf; + + do { + ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_PKT, &otp); + } while (ret != 0 && errno == EINTR); + if (ret != 0 && errno == EFAULT) + libvarpd_panic("OVERLAY_TARG_PKT ioctl efault"); + else if (ret != 0) + ret = errno; + + if (ret == 0) + *buflen = otp.otp_size; + + return (ret); +} + +static int +libvarpd_overlay_inject_common(varpd_impl_t *vip, varpd_instance_t *inst, + const overlay_targ_lookup_t *otl, void *buf, size_t buflen, int cmd) +{ + int ret; + overlay_targ_pkt_t otp; + + if (otl == NULL) { + otp.otp_linkid = inst->vri_linkid; + otp.otp_reqid = 0; + } else { + otp.otp_linkid = UINT64_MAX; + otp.otp_reqid = otl->otl_reqid; + } + otp.otp_size = buflen; + otp.otp_buf = buf; 
+ + do { + ret = ioctl(vip->vdi_overlayfd, cmd, &otp); + } while (ret != 0 && errno == EINTR); + if (ret != 0 && errno == EFAULT) + libvarpd_panic("overlay_inject_common ioctl EFAULT"); + else if (ret != 0) + ret = errno; + + return (ret); +} + +int +libvarpd_overlay_inject(varpd_impl_t *vip, const overlay_targ_lookup_t *otl, + void *buf, size_t buflen) +{ + return (libvarpd_overlay_inject_common(vip, NULL, otl, buf, buflen, + OVERLAY_TARG_INJECT)); +} + +int +libvarpd_overlay_instance_inject(varpd_instance_t *inst, void *buf, + size_t buflen) +{ + return (libvarpd_overlay_inject_common(inst->vri_impl, inst, NULL, buf, + buflen, OVERLAY_TARG_INJECT)); +} + +int +libvarpd_overlay_resend(varpd_impl_t *vip, const overlay_targ_lookup_t *otl, + void *buf, size_t buflen) +{ + return (libvarpd_overlay_inject_common(vip, NULL, otl, buf, buflen, + OVERLAY_TARG_RESEND)); +} + +static void +libvarpd_overlay_lookup_reply(varpd_impl_t *vip, + const overlay_targ_lookup_t *otl, overlay_targ_resp_t *otr, int cmd) +{ + int ret; + + otr->otr_reqid = otl->otl_reqid; + do { + ret = ioctl(vip->vdi_overlayfd, cmd, otr); + } while (ret != 0 && errno == EINTR); + + /* + * The only errors that should cause us to end up here are due to + * programmer errors. Arguably the EINVAL case indicates that something + * is a bit off; however, at this time we don't opt to kill varpd. + */ + if (ret != 0 && errno != EINVAL) + libvarpd_panic("received bad errno from lookup_reply " + "(cmd %d): %d\n", cmd, errno); +} + +static void +libvarpd_overlay_lookup_handle(varpd_impl_t *vip) +{ + int ret; + varpd_query_t *vqp; + overlay_targ_lookup_t *otl; + overlay_targ_resp_t *otr; + varpd_instance_t *inst; + + vqp = umem_cache_alloc(vip->vdi_qcache, UMEM_DEFAULT); + otl = &vqp->vq_lookup; + otr = &vqp->vq_response; + /* + * abort doesn't really help here that much, maybe we can instead try + * and for a reap or something? 
+ */ + if (vqp == NULL) + libvarpd_panic("failed to allocate memory for lookup " + "handle..., we should not panic()"); + ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_LOOKUP, otl); + if (ret != 0 && errno != ETIME && errno != EINTR) + libvarpd_panic("received bad errno from OVERLAY_TARG_LOOKUP: " + "%d", errno); + + if (ret != 0) { + umem_cache_free(vip->vdi_qcache, vqp); + return; + } + + inst = (varpd_instance_t *)libvarpd_instance_lookup( + (varpd_handle_t *)vip, otl->otl_varpdid); + if (inst == NULL) { + libvarpd_overlay_lookup_reply(vip, otl, otr, + OVERLAY_TARG_DROP); + umem_cache_free(vip->vdi_qcache, vqp); + return; + } + vqp->vq_instance = inst; + + inst->vri_plugin->vpp_ops->vpo_lookup(inst->vri_private, + (varpd_query_handle_t *)vqp, otl, &otr->otr_answer); +} + +/* Use "void *" for vhp here to play nicely with thr_create(). */ +void * +libvarpd_overlay_lookup_run(void *vhp) +{ + varpd_impl_t *vip = (varpd_impl_t *)vhp; + + mutex_enter(&vip->vdi_lock); + if (vip->vdi_lthr_quiesce == B_TRUE) { + mutex_exit(&vip->vdi_lock); + return (NULL); + } + vip->vdi_lthr_count++; + + for (;;) { + mutex_exit(&vip->vdi_lock); + libvarpd_overlay_lookup_handle(vip); + mutex_enter(&vip->vdi_lock); + if (vip->vdi_lthr_quiesce == B_TRUE) + break; + } + assert(vip->vdi_lthr_count > 0); + vip->vdi_lthr_count--; + (void) cond_signal(&vip->vdi_lthr_cv); + mutex_exit(&vip->vdi_lock); + return (NULL); +} + +void +libvarpd_overlay_lookup_quiesce(varpd_handle_t *vhp) +{ + varpd_impl_t *vip = (varpd_impl_t *)vhp; + + mutex_enter(&vip->vdi_lock); + if (vip->vdi_lthr_count == 0) { + mutex_exit(&vip->vdi_lock); + return; + } + vip->vdi_lthr_quiesce = B_TRUE; + while (vip->vdi_lthr_count > 0) + (void) cond_wait(&vip->vdi_lthr_cv, &vip->vdi_lock); + vip->vdi_lthr_quiesce = B_FALSE; + mutex_exit(&vip->vdi_lock); +} + +int +libvarpd_overlay_iter(varpd_impl_t *vip, libvarpd_overlay_iter_f func, + void *arg) +{ + uint32_t curents = 0, i; + size_t size; + overlay_targ_list_t *otl; + + for 
(;;) { + size = sizeof (overlay_targ_list_t) + + sizeof (uint32_t) * curents; + otl = umem_alloc(size, UMEM_DEFAULT); + if (otl == NULL) + return (ENOMEM); + + otl->otl_nents = curents; + if (ioctl(vip->vdi_overlayfd, OVERLAY_TARG_LIST, otl) != 0) { + if (errno == EFAULT) + libvarpd_panic("OVERLAY_TARG_LIST ioctl " + "efault"); + umem_free(otl, size); + if (errno == EINTR) + continue; + else + return (errno); + } + + if (otl->otl_nents == curents) + break; + + curents = otl->otl_nents; + umem_free(otl, size); + } + + for (i = 0; i < otl->otl_nents; i++) { + if (func(vip, otl->otl_ents[i], arg) != 0) + break; + } + umem_free(otl, size); + return (0); +} + +int +libvarpd_overlay_cache_flush(varpd_instance_t *inst) +{ + int ret; + overlay_targ_cache_t cache; + varpd_impl_t *vip = inst->vri_impl; + + bzero(&cache, sizeof (overlay_targ_cache_t)); + cache.otc_linkid = inst->vri_linkid; + + ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_FLUSH, &cache); + if (ret != 0 && errno == EFAULT) + libvarpd_panic("OVERLAY_TARG_CACHE_FLUSH ioctl efault"); + else if (ret != 0) + ret = errno; + + return (ret); +} + +int +libvarpd_overlay_cache_delete(varpd_instance_t *inst, const uint8_t *key) +{ + int ret; + overlay_targ_cache_t cache; + varpd_impl_t *vip = inst->vri_impl; + + bzero(&cache, sizeof (overlay_targ_cache_t)); + cache.otc_linkid = inst->vri_linkid; + bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL); + + ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_REMOVE, &cache); + if (ret != 0 && errno == EFAULT) + libvarpd_panic("OVERLAY_TARG_CACHE_REMOVE ioctl efault"); + else if (ret != 0) + ret = errno; + + return (ret); + +} + +int +libvarpd_overlay_cache_get(varpd_instance_t *inst, const uint8_t *key, + varpd_client_cache_entry_t *entry) +{ + int ret; + overlay_targ_cache_t cache; + varpd_impl_t *vip = inst->vri_impl; + + bzero(&cache, sizeof (overlay_targ_cache_t)); + cache.otc_linkid = inst->vri_linkid; + bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL); + + ret = 
ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_GET, &cache); + if (ret != 0 && errno == EFAULT) + libvarpd_panic("OVERLAY_TARG_CACHE_GET ioctl efault"); + else if (ret != 0) + return (errno); + + bcopy(cache.otc_entry.otce_dest.otp_mac, &entry->vcp_mac, ETHERADDRL); + entry->vcp_flags = cache.otc_entry.otce_flags; + entry->vcp_ip = cache.otc_entry.otce_dest.otp_ip; + entry->vcp_port = cache.otc_entry.otce_dest.otp_port; + + return (0); +} + +int +libvarpd_overlay_cache_set(varpd_instance_t *inst, const uint8_t *key, + const varpd_client_cache_entry_t *entry) +{ + int ret; + overlay_targ_cache_t cache; + varpd_impl_t *vip = inst->vri_impl; + + bzero(&cache, sizeof (overlay_targ_cache_t)); + cache.otc_linkid = inst->vri_linkid; + bcopy(key, cache.otc_entry.otce_mac, ETHERADDRL); + bcopy(&entry->vcp_mac, cache.otc_entry.otce_dest.otp_mac, ETHERADDRL); + cache.otc_entry.otce_flags = entry->vcp_flags; + cache.otc_entry.otce_dest.otp_ip = entry->vcp_ip; + cache.otc_entry.otce_dest.otp_port = entry->vcp_port; + + ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_SET, &cache); + if (ret != 0 && errno == EFAULT) + libvarpd_panic("OVERLAY_TARG_CACHE_SET ioctl efault"); + else if (ret != 0) + return (errno); + + return (0); +} + +int +libvarpd_overlay_cache_walk_fill(varpd_instance_t *inst, uint64_t *markerp, + uint64_t *countp, overlay_targ_cache_entry_t *ents) +{ + int ret; + size_t asize; + overlay_targ_cache_iter_t *iter; + varpd_impl_t *vip = inst->vri_impl; + + if (*countp > 200) + return (E2BIG); + + asize = sizeof (overlay_targ_cache_iter_t) + + *countp * sizeof (overlay_targ_cache_entry_t); + iter = umem_alloc(asize, UMEM_DEFAULT); + if (iter == NULL) + return (ENOMEM); + + iter->otci_linkid = inst->vri_linkid; + iter->otci_marker = *markerp; + iter->otci_count = *countp; + ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_ITER, iter); + if (ret != 0 && errno == EFAULT) + libvarpd_panic("OVERLAY_TARG_CACHE_ITER ioctl efault"); + else if (ret != 0) { + ret = errno; + 
goto out; + } + + *markerp = iter->otci_marker; + *countp = iter->otci_count; + bcopy(iter->otci_ents, ents, + *countp * sizeof (overlay_targ_cache_entry_t)); +out: + umem_free(iter, asize); + return (ret); +} + +void +libvarpd_plugin_query_reply(varpd_query_handle_t *vqh, int action) +{ + varpd_query_t *vqp = (varpd_query_t *)vqh; + + if (vqp == NULL) + libvarpd_panic("unknown plugin passed invalid " + "varpd_query_handle_t"); + + if (action == VARPD_LOOKUP_DROP) + libvarpd_overlay_lookup_reply(vqp->vq_instance->vri_impl, + &vqp->vq_lookup, &vqp->vq_response, OVERLAY_TARG_DROP); + else if (action == VARPD_LOOKUP_OK) + libvarpd_overlay_lookup_reply(vqp->vq_instance->vri_impl, + &vqp->vq_lookup, &vqp->vq_response, OVERLAY_TARG_RESPOND); + else + libvarpd_panic("plugin %s passed in an invalid action: %d", + vqp->vq_instance->vri_plugin->vpp_name, action); + + umem_cache_free(vqp->vq_instance->vri_impl->vdi_qcache, vqp); +} + +void +libvarpd_inject_varp(varpd_provider_handle_t *vph, const uint8_t *mac, + const overlay_target_point_t *otp) +{ + int ret; + overlay_targ_cache_t otc; + varpd_instance_t *inst = (varpd_instance_t *)vph; + varpd_impl_t *vip = inst->vri_impl; + + if (otp == NULL) { + (void) libvarpd_overlay_cache_delete(inst, mac); + return; + } + + otc.otc_linkid = inst->vri_linkid; + otc.otc_entry.otce_flags = 0; + bcopy(mac, otc.otc_entry.otce_mac, ETHERADDRL); + bcopy(otp, &otc.otc_entry.otce_dest, sizeof (overlay_target_point_t)); + + ret = ioctl(vip->vdi_overlayfd, OVERLAY_TARG_CACHE_SET, &otc); + if (ret != 0) { + switch (errno) { + case EBADF: + case EFAULT: + case ENOTSUP: + libvarpd_panic("received bad errno from " + "OVERLAY_TARG_CACHE_SET: %d", errno); + default: + break; + } + } +} + +void +libvarpd_fma_degrade(varpd_provider_handle_t *vph, const char *msg) +{ + int ret; + varpd_instance_t *inst = (varpd_instance_t *)vph; + + ret = libvarpd_overlay_degrade(inst, msg); + switch (ret) { + case ENOENT: + case EFAULT: + libvarpd_panic("received bad 
errno from degrade ioctl: %d", + errno); + default: + break; + } +} + +void +libvarpd_fma_restore(varpd_provider_handle_t *vph) +{ + int ret; + varpd_instance_t *inst = (varpd_instance_t *)vph; + + ret = libvarpd_overlay_restore(inst); + switch (ret) { + case ENOENT: + case EFAULT: + libvarpd_panic("received bad errno from restore ioctl: %d", + errno); + default: + break; + } +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_panic.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_panic.c new file mode 100644 index 0000000000..ba6bc26bf6 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_panic.c @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +/* + * No, 'tis not so deep as a well, nor so wide as a church door; but 'tis + * enough, 'twill serve. Ask for me tomorrow, and you shall find me a grave man. + * + * This file maintains various routines for handling when we die. + */ + +#include <stdio.h> +#include <stdarg.h> +#include <errno.h> +#include <thread.h> +#include <stdlib.h> + +/* + * Normally these would be static, but if they're static, that throws off lint + * because it thinks we never use them, which is kind of the point, because we + * only read them in the core... + */ +int varpd_panic_errno; +char varpd_panic_buf[1024]; +thread_t varpd_panic_thread; + +void +libvarpd_panic(const char *fmt, ...) +{ + va_list ap; + + /* Always save errno first! 
*/ + varpd_panic_errno = errno; + varpd_panic_thread = thr_self(); + + if (fmt != NULL) { + va_start(ap, fmt); + (void) vsnprintf(varpd_panic_buf, sizeof (varpd_panic_buf), fmt, + ap); + } + abort(); +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c new file mode 100644 index 0000000000..27cc802a9c --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_persist.c @@ -0,0 +1,586 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +/* + * varpd persistence backend + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <strings.h> +#include <librename.h> +#include <md5.h> +#include <sys/sysmacros.h> +#include <dirent.h> +#include <sys/mman.h> +#include <umem.h> +#include <sys/debug.h> + +#include <libvarpd_impl.h> + +static uint8_t varpd_persist_magic[4] = { + 'v', + 'a', + 'r', + 'p', +}; + +#define VARPD_PERSIST_MAXWRITE 4096 +#define VARPD_PERSIST_VERSION_ONE 1 +#define VARPD_PERSIST_SUFFIX ".varpd" + +typedef struct varpd_persist_header { + uint8_t vph_magic[4]; + uint32_t vph_version; + uint8_t vph_md5[16]; +} varpd_persist_header_t; + +void +libvarpd_persist_init(varpd_impl_t *vip) +{ + vip->vdi_persistfd = -1; + if (rwlock_init(&vip->vdi_pfdlock, USYNC_THREAD, NULL) != 0) + libvarpd_panic("failed to create rw vdi_pfdlock"); +} + +void +libvarpd_persist_fini(varpd_impl_t *vip) +{ + /* + * Clean up for someone that left something behind. 
+ */ + if (vip->vdi_persistfd != -1) { + if (close(vip->vdi_persistfd) != 0) + libvarpd_panic("failed to close persist fd %d: %d", + vip->vdi_persistfd, errno); + vip->vdi_persistfd = -1; + } + if (rwlock_destroy(&vip->vdi_pfdlock) != 0) + libvarpd_panic("failed to destroy rw vdi_pfdlock"); +} + +int +libvarpd_persist_enable(varpd_handle_t *vhp, const char *rootdir) +{ + int fd; + struct stat st; + varpd_impl_t *vip = (varpd_impl_t *)vhp; + + fd = open(rootdir, O_RDONLY); + if (fd < 0) + return (errno); + + if (fstat(fd, &st) != 0) { + int ret = errno; + if (close(fd) != 0) + libvarpd_panic("failed to close rootdir fd (%s) %d: %d", + rootdir, fd, errno); + return (ret); + } + + if (!S_ISDIR(st.st_mode)) { + if (close(fd) != 0) + libvarpd_panic("failed to close rootdir fd (%s) %d: %d", + rootdir, fd, errno); + return (EINVAL); + } + + + VERIFY0(rw_wrlock(&vip->vdi_pfdlock)); + if (vip->vdi_persistfd != -1) { + VERIFY0(rw_unlock(&vip->vdi_pfdlock)); + if (close(fd) != 0) + libvarpd_panic("failed to close rootdir fd (%s) %d: %d", + rootdir, fd, errno); + return (EEXIST); + } + vip->vdi_persistfd = fd; + VERIFY0(rw_unlock(&vip->vdi_pfdlock)); + + return (0); +} + +static int +libvarpd_persist_write(int fd, const void *buf, size_t buflen) +{ + ssize_t ret; + off_t off = 0; + + while (buflen > 0) { + ret = write(fd, (void *)((uintptr_t)buf + off), + MIN(buflen, VARPD_PERSIST_MAXWRITE)); + if (ret == -1 && errno == EINTR) + continue; + if (ret == -1) + return (errno); + + off += ret; + buflen -= ret; + } + + return (0); +} + +static int +libvarpd_persist_nvlist(int dirfd, uint64_t id, nvlist_t *nvl) +{ + int err, fd; + size_t size; + varpd_persist_header_t hdr; + librename_atomic_t *lrap; + char *buf = NULL, *name; + + if ((err = nvlist_pack(nvl, &buf, &size, NV_ENCODE_XDR, 0)) != 0) + return (err); + + if (asprintf(&name, "%llu%s", (unsigned long long)id, ".varpd") == -1) { + err = errno; + free(buf); + return (err); + } + + if ((err = librename_atomic_fdinit(dirfd, 
name, NULL, 0600, 0, + &lrap)) != 0) { + free(name); + free(buf); + return (err); + } + + fd = librename_atomic_fd(lrap); + + bzero(&hdr, sizeof (varpd_persist_header_t)); + bcopy(varpd_persist_magic, hdr.vph_magic, sizeof (varpd_persist_magic)); + hdr.vph_version = VARPD_PERSIST_VERSION_ONE; + md5_calc(hdr.vph_md5, buf, size); + + if ((err = libvarpd_persist_write(fd, &hdr, + sizeof (varpd_persist_header_t))) != 0) { + librename_atomic_fini(lrap); + free(name); + free(buf); + return (err); + } + + if ((err = libvarpd_persist_write(fd, buf, size)) != 0) { + librename_atomic_fini(lrap); + free(name); + free(buf); + return (err); + } + + do { + err = librename_atomic_commit(lrap); + } while (err == EINTR); + + librename_atomic_fini(lrap); + free(name); + free(buf); + return (err); +} + +int +libvarpd_persist_instance(varpd_impl_t *vip, varpd_instance_t *inst) +{ + int err = 0; + nvlist_t *nvl = NULL, *cvl = NULL; + + VERIFY0(rw_rdlock(&vip->vdi_pfdlock)); + /* Check if persistence exists */ + if (vip->vdi_persistfd == -1) + goto out; + + if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0)) != 0) + goto out; + + if ((err = nvlist_alloc(&cvl, NV_UNIQUE_NAME, 0)) != 0) + goto out; + + if ((err = nvlist_add_uint64(nvl, "vri_id", inst->vri_id)) != 0) + goto out; + + if ((err = nvlist_add_uint32(nvl, "vri_linkid", inst->vri_linkid)) != 0) + goto out; + + if ((err = nvlist_add_uint32(nvl, "vri_dest", + (uint32_t)inst->vri_dest)) != 0) + goto out; + if ((err = nvlist_add_uint32(nvl, "vri_mode", + (uint32_t)inst->vri_mode)) != 0) + goto out; + + if ((err = nvlist_add_string(nvl, "vri_plugin", + inst->vri_plugin->vpp_name)) != 0) + goto out; + + err = inst->vri_plugin->vpp_ops->vpo_save(inst->vri_private, cvl); + if (err != 0) + goto out; + + if ((err = nvlist_add_nvlist(nvl, "vri_private", cvl)) != 0) + goto out; + + err = libvarpd_persist_nvlist(vip->vdi_persistfd, inst->vri_id, nvl); +out: + nvlist_free(nvl); + nvlist_free(cvl); + VERIFY0(rw_unlock(&vip->vdi_pfdlock)); + 
return (err); +} + +void +libvarpd_torch_instance(varpd_impl_t *vip, varpd_instance_t *inst) +{ + char buf[32]; + int ret; + + VERIFY0(rw_rdlock(&vip->vdi_pfdlock)); + if (vip->vdi_persistfd == -1) { + VERIFY0(rw_unlock(&vip->vdi_pfdlock)); + return; + } + + if (snprintf(buf, sizeof (buf), "%lld.varpd", inst->vri_id) >= 32) + libvarpd_panic("somehow exceeded static value for " + "libvarpd_torch_instance buffer"); + + do { + ret = unlinkat(vip->vdi_persistfd, buf, 0); + } while (ret == -1 && errno == EINTR); + if (ret != 0) { + switch (errno) { + case ENOENT: + break; + default: + libvarpd_panic("failed to unlinkat %d`%s: %s", + vip->vdi_persistfd, buf, strerror(errno)); + } + } + + VERIFY0(rw_unlock(&vip->vdi_pfdlock)); +} + +static int +libvarpd_persist_restore_instance(varpd_impl_t *vip, nvlist_t *nvl) +{ + int err; + nvlist_t *pvl; + uint64_t id, flags, vid; + uint32_t linkid, dest, mode; + char *pluginstr; + varpd_plugin_t *plugin; + overlay_plugin_dest_t adest; + varpd_instance_t *inst, lookup; + + if (nvlist_lookup_uint64(nvl, "vri_id", &id) != 0) + return (EINVAL); + + if (nvlist_lookup_uint32(nvl, "vri_linkid", &linkid) != 0) + return (EINVAL); + + if (nvlist_lookup_uint32(nvl, "vri_dest", &dest) != 0) + return (EINVAL); + + if (nvlist_lookup_uint32(nvl, "vri_mode", &mode) != 0) + return (EINVAL); + + if (nvlist_lookup_string(nvl, "vri_plugin", &pluginstr) != 0) + return (EINVAL); + + if (nvlist_lookup_nvlist(nvl, "vri_private", &pvl) != 0) + return (EINVAL); + + plugin = libvarpd_plugin_lookup(vip, pluginstr); + if (plugin == NULL) + return (EINVAL); + + if (plugin->vpp_mode != mode) + return (EINVAL); + + if (libvarpd_overlay_info(vip, linkid, &adest, &flags, &vid) != 0) + return (EINVAL); + + if (dest != adest) + return (EINVAL); + + inst = umem_alloc(sizeof (varpd_instance_t), UMEM_DEFAULT); + if (inst == NULL) + libvarpd_panic("failed to allocate instance for restore"); + + inst->vri_id = id_alloc_specific(vip->vdi_idspace, id); + if (inst->vri_id != 
id) { + umem_free(inst, sizeof (varpd_instance_t)); + return (EINVAL); + } + + inst->vri_linkid = linkid; + inst->vri_vnetid = vid; + inst->vri_mode = plugin->vpp_mode; + inst->vri_dest = dest; + inst->vri_plugin = plugin; + inst->vri_impl = vip; + inst->vri_flags = 0; + if (plugin->vpp_ops->vpo_restore(pvl, (varpd_provider_handle_t *)inst, + dest, &inst->vri_private) != 0) { + id_free(vip->vdi_idspace, id); + umem_free(inst, sizeof (varpd_instance_t)); + return (EINVAL); + } + + if (mutex_init(&inst->vri_lock, USYNC_THREAD | LOCK_ERRORCHECK, + NULL) != 0) + libvarpd_panic("failed to create vri_lock mutex"); + + mutex_enter(&vip->vdi_lock); + lookup.vri_id = inst->vri_id; + if (avl_find(&vip->vdi_instances, &lookup, NULL) != NULL) + libvarpd_panic("found duplicate instance with id %d", + lookup.vri_id); + avl_add(&vip->vdi_instances, inst); + lookup.vri_linkid = inst->vri_linkid; + if (avl_find(&vip->vdi_linstances, &lookup, NULL) != NULL) + libvarpd_panic("found duplicate linstance with id %d", + lookup.vri_linkid); + avl_add(&vip->vdi_linstances, inst); + mutex_exit(&vip->vdi_lock); + + if (plugin->vpp_ops->vpo_start(inst->vri_private) != 0) { + libvarpd_instance_destroy((varpd_instance_handle_t *)inst); + return (EINVAL); + } + + if (flags & OVERLAY_TARG_INFO_F_ACTIVE) + (void) libvarpd_overlay_disassociate(inst); + + if (libvarpd_overlay_associate(inst) != 0) { + libvarpd_instance_destroy((varpd_instance_handle_t *)inst); + return (EINVAL); + } + + if (flags & OVERLAY_TARG_INFO_F_DEGRADED) { + if ((err = libvarpd_overlay_restore(inst)) != 0) { + libvarpd_panic("failed to restore instance %p: %d\n", + inst, err); + } + } + + mutex_enter(&inst->vri_lock); + inst->vri_flags |= VARPD_INSTANCE_F_ACTIVATED; + mutex_exit(&inst->vri_lock); + + return (0); +} + +static int +libvarpd_persist_restore_one(varpd_impl_t *vip, int fd) +{ + int err; + size_t fsize; + struct stat st; + void *buf, *datap; + varpd_persist_header_t *hdr; + uint8_t md5[16]; + nvlist_t *nvl; + + if 
(fstat(fd, &st) != 0) + return (errno); + + if (st.st_size <= sizeof (varpd_persist_header_t)) + return (EINVAL); + fsize = st.st_size - sizeof (varpd_persist_header_t); + + buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (buf == MAP_FAILED) + return (errno); + + hdr = buf; + if (bcmp(varpd_persist_magic, hdr->vph_magic, + sizeof (varpd_persist_magic)) != 0) { + if (munmap(buf, st.st_size) != 0) + libvarpd_panic("failed to munmap %p: %d", buf, errno); + return (EINVAL); + } + + if (hdr->vph_version != VARPD_PERSIST_VERSION_ONE) { + if (munmap(buf, st.st_size) != 0) + libvarpd_panic("failed to munmap %p: %d", buf, errno); + return (EINVAL); + } + + datap = (void *)((uintptr_t)buf + sizeof (varpd_persist_header_t)); + md5_calc(md5, datap, fsize); + if (bcmp(md5, hdr->vph_md5, sizeof (uint8_t) * 16) != 0) { + if (munmap(buf, st.st_size) != 0) + libvarpd_panic("failed to munmap %p: %d", buf, errno); + return (EINVAL); + } + + err = nvlist_unpack(datap, fsize, &nvl, 0); + if (munmap(buf, st.st_size) != 0) + libvarpd_panic("failed to munmap %p: %d", buf, errno); + + if (err != 0) + return (EINVAL); + + err = libvarpd_persist_restore_instance(vip, nvl); + nvlist_free(nvl); + return (err); +} + +/* ARGSUSED */ +static int +libvarpd_check_degrade_cb(varpd_impl_t *vip, datalink_id_t linkid, void *arg) +{ + varpd_instance_t *inst; + + mutex_enter(&vip->vdi_lock); + for (inst = avl_first(&vip->vdi_instances); inst != NULL; + inst = AVL_NEXT(&vip->vdi_instances, inst)) { + if (inst->vri_linkid == linkid) { + mutex_exit(&vip->vdi_lock); + return (0); + } + } + + mutex_exit(&vip->vdi_lock); + + (void) libvarpd_overlay_degrade_datalink(vip, linkid, + "no varpd instance exists"); + return (0); +} + +static void +libvarpd_check_degrade(varpd_impl_t *vip) +{ + (void) libvarpd_overlay_iter(vip, libvarpd_check_degrade_cb, NULL); +} + +int +libvarpd_persist_restore(varpd_handle_t *vhp) +{ + int dirfd; + int ret = 0; + DIR *dirp = NULL; + struct dirent *dp; + 
varpd_impl_t *vip = (varpd_impl_t *)vhp; + + VERIFY0(rw_rdlock(&vip->vdi_pfdlock)); + if ((dirfd = dup(vip->vdi_persistfd)) < 0) { + ret = errno; + goto out; + } + + if ((dirp = fdopendir(dirfd)) == NULL) { + ret = errno; + if (close(dirfd) != 0) + libvarpd_panic("failed to close dirfd %d: %d", + dirfd, errno); + goto out; + } + + for (;;) { + int fd; + uint64_t id; + char *eptr; + struct stat st; + + errno = 0; + dp = readdir(dirp); + if (dp == NULL) { + ret = errno; + break; + } + + if (strcmp(dp->d_name, ".") == 0 || + strcmp(dp->d_name, "..") == 0) + continue; + + /* + * Leave files that we don't recognize alone. A valid file has + * the format `%llu.varpd`. + */ + errno = 0; + id = strtoull(dp->d_name, &eptr, 10); + if ((id == 0 && errno == EINVAL) || + (id == ULLONG_MAX && errno == ERANGE)) + continue; + + if (strcmp(eptr, VARPD_PERSIST_SUFFIX) != 0) + continue; + + fd = openat(vip->vdi_persistfd, dp->d_name, O_RDONLY); + if (fd < 0) { + ret = errno; + break; + } + + if (fstat(fd, &st) != 0) { + ret = errno; + break; + } + + if (!S_ISREG(st.st_mode)) { + if (close(fd) != 0) + libvarpd_panic("failed to close fd (%s) %d: " + "%d\n", dp->d_name, fd, errno); + continue; + } + + ret = libvarpd_persist_restore_one(vip, fd); + if (close(fd) != 0) + libvarpd_panic("failed to close fd (%s) %d: " + "%d\n", dp->d_name, fd, errno); + /* + * This is an invalid file. We'll unlink it to save us this + * trouble in the future. 
+ */ + if (ret != 0) { + if (unlinkat(vip->vdi_persistfd, dp->d_name, 0) != 0) { + ret = errno; + break; + } + } + } + + libvarpd_check_degrade(vip); + +out: + if (dirp != NULL) + (void) closedir(dirp); + VERIFY0(rw_unlock(&vip->vdi_pfdlock)); + return (ret); +} + +int +libvarpd_persist_disable(varpd_handle_t *vhp) +{ + varpd_impl_t *vip = (varpd_impl_t *)vhp; + + VERIFY0(rw_wrlock(&vip->vdi_pfdlock)); + if (vip->vdi_persistfd == -1) { + mutex_exit(&vip->vdi_lock); + VERIFY0(rw_unlock(&vip->vdi_pfdlock)); + return (ENOENT); + } + if (close(vip->vdi_persistfd) != 0) + libvarpd_panic("failed to close persist fd %d: %d", + vip->vdi_persistfd, errno); + vip->vdi_persistfd = -1; + VERIFY0(rw_unlock(&vip->vdi_pfdlock)); + return (0); +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_plugin.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_plugin.c new file mode 100644 index 0000000000..176306a3f7 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_plugin.c @@ -0,0 +1,256 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +/* + * varpd plugin management + */ + +#include <libvarpd_impl.h> +#include <errno.h> +#include <umem.h> +#include <assert.h> +#include <strings.h> +#include <dlfcn.h> +#include <link.h> +#include <stdio.h> + +static varpd_impl_t *varpd_load_handle; +static const char *varpd_load_path; +static mutex_t varpd_load_lock; +static cond_t varpd_load_cv; + +int +libvarpd_plugin_comparator(const void *lp, const void *rp) +{ + int ret; + const varpd_plugin_t *lpp, *rpp; + + lpp = lp; + rpp = rp; + + ret = strcmp(lpp->vpp_name, rpp->vpp_name); + if (ret > 0) + return (1); + if (ret < 0) + return (-1); + return (0); +} + +varpd_plugin_register_t * +libvarpd_plugin_alloc(uint_t version, int *errp) +{ + int err; + varpd_plugin_register_t *vprp; + + if (errp == NULL) + errp = &err; + + if (version != VARPD_VERSION_ONE) { + (void) fprintf(stderr, + "unsupported registration version %u - %s\n", + version, varpd_load_path); + *errp = EINVAL; + return (NULL); + } + + vprp = umem_alloc(sizeof (varpd_plugin_register_t), UMEM_DEFAULT); + if (vprp == NULL) { + (void) fprintf(stderr, + "failed to allocate registration handle - %s\n", + varpd_load_path); + *errp = ENOMEM; + return (NULL); + } + + vprp->vpr_version = VARPD_VERSION_ONE; + + return (vprp); +} + +void +libvarpd_plugin_free(varpd_plugin_register_t *vprp) +{ + umem_free(vprp, sizeof (varpd_plugin_register_t)); +} + +int +libvarpd_plugin_register(varpd_plugin_register_t *vprp) +{ + varpd_plugin_t *vpp; + varpd_plugin_t lookup; + + vpp = umem_alloc(sizeof (varpd_plugin_t), UMEM_DEFAULT); + if (vpp == NULL) { + (void) fprintf(stderr, + "failed to allocate memory for the varpd_plugin_t - %s\n", + varpd_load_path); + return (ENOMEM); + } + + /* Watch out for an evil plugin */ + if (vprp->vpr_version != VARPD_VERSION_ONE) { + (void) fprintf(stderr, + "unsupported registration version %u - %s\n", + vprp->vpr_version, varpd_load_path); + return (EINVAL); + } + + mutex_enter(&varpd_load_lock); + if (varpd_load_handle == NULL) + 
libvarpd_panic("varpd_load_handle was unexpectedly null"); + + mutex_enter(&varpd_load_handle->vdi_lock); + lookup.vpp_name = vprp->vpr_name; + if (avl_find(&varpd_load_handle->vdi_plugins, &lookup, NULL) != NULL) { + (void) fprintf(stderr, + "module already exists with requested name '%s' - %s\n", + vprp->vpr_name, varpd_load_path); + mutex_exit(&varpd_load_handle->vdi_lock); + mutex_exit(&varpd_load_lock); + umem_free(vpp, sizeof (varpd_plugin_t)); + return (EEXIST); + } + vpp->vpp_name = strdup(vprp->vpr_name); + if (vpp->vpp_name == NULL) { + (void) fprintf(stderr, + "failed to allocate memory to duplicate name - %s\n", + varpd_load_path); + mutex_exit(&varpd_load_handle->vdi_lock); + mutex_exit(&varpd_load_lock); + umem_free(vpp, sizeof (varpd_plugin_t)); + return (ENOMEM); + } + + vpp->vpp_mode = vprp->vpr_mode; + vpp->vpp_ops = vprp->vpr_ops; + if (mutex_init(&vpp->vpp_lock, USYNC_THREAD | LOCK_ERRORCHECK, + NULL) != 0) + libvarpd_panic("failed to create plugin's vpp_lock"); + vpp->vpp_active = 0; + avl_add(&varpd_load_handle->vdi_plugins, vpp); + mutex_exit(&varpd_load_handle->vdi_lock); + mutex_exit(&varpd_load_lock); + + return (0); +} + +varpd_plugin_t * +libvarpd_plugin_lookup(varpd_impl_t *vip, const char *name) +{ + varpd_plugin_t lookup, *ret; + + lookup.vpp_name = name; + mutex_enter(&vip->vdi_lock); + ret = avl_find(&vip->vdi_plugins, &lookup, NULL); + mutex_exit(&vip->vdi_lock); + + return (ret); +} + +/* ARGSUSED */ +static int +libvarpd_plugin_load_cb(varpd_impl_t *vip, const char *path, void *unused) +{ + void *dlp; + + varpd_load_path = path; + dlp = dlopen(path, RTLD_LOCAL | RTLD_NOW); + if (dlp == NULL) + (void) fprintf(stderr, "dlopen failed - %s\n", path); + path = NULL; + + return (0); +} + +int +libvarpd_plugin_load(varpd_handle_t *vph, const char *path) +{ + int ret = 0; + varpd_impl_t *vip = (varpd_impl_t *)vph; + + if (vip == NULL || path == NULL) + return (EINVAL); + mutex_enter(&varpd_load_lock); + while (varpd_load_handle != NULL) 
+ (void) cond_wait(&varpd_load_cv, &varpd_load_lock); + varpd_load_handle = vip; + mutex_exit(&varpd_load_lock); + + ret = libvarpd_dirwalk(vip, path, ".so", libvarpd_plugin_load_cb, NULL); + + mutex_enter(&varpd_load_lock); + varpd_load_handle = NULL; + (void) cond_signal(&varpd_load_cv); + mutex_exit(&varpd_load_lock); + + return (ret); +} + +int +libvarpd_plugin_walk(varpd_handle_t *vph, libvarpd_plugin_walk_f func, + void *arg) +{ + varpd_impl_t *vip = (varpd_impl_t *)vph; + varpd_plugin_t *vpp; + + mutex_enter(&vip->vdi_lock); + for (vpp = avl_first(&vip->vdi_plugins); vpp != NULL; + vpp = AVL_NEXT(&vip->vdi_plugins, vpp)) { + if (func(vph, vpp->vpp_name, arg) != 0) { + mutex_exit(&vip->vdi_lock); + return (1); + } + } + mutex_exit(&vip->vdi_lock); + return (0); +} + +void +libvarpd_plugin_init(void) +{ + if (mutex_init(&varpd_load_lock, USYNC_THREAD | LOCK_RECURSIVE | + LOCK_ERRORCHECK, NULL) != 0) + libvarpd_panic("failed to create varpd_load_lock"); + + if (cond_init(&varpd_load_cv, USYNC_THREAD, NULL) != 0) + libvarpd_panic("failed to create varpd_load_cv"); + + varpd_load_handle = NULL; +} + +void +libvarpd_plugin_fini(void) +{ + assert(varpd_load_handle == NULL); + if (mutex_destroy(&varpd_load_lock) != 0) + libvarpd_panic("failed to destroy varpd_load_lock"); + if (cond_destroy(&varpd_load_cv) != 0) + libvarpd_panic("failed to destroy varpd_load_cv"); +} + +void +libvarpd_plugin_prefork(void) +{ + mutex_enter(&varpd_load_lock); + while (varpd_load_handle != NULL) + (void) cond_wait(&varpd_load_cv, &varpd_load_lock); +} + +void +libvarpd_plugin_postfork(void) +{ + (void) cond_signal(&varpd_load_cv); + mutex_exit(&varpd_load_lock); +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_prop.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_prop.c new file mode 100644 index 0000000000..f3a1492408 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_prop.c @@ -0,0 +1,300 @@ +/* + * This file and its contents are supplied under the terms of the + 
* Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * varpd property management + */ + +#include <libvarpd_impl.h> +#include <errno.h> +#include <strings.h> +#include <sys/mac.h> +#include <umem.h> + +typedef struct varpd_prop_info { + varpd_impl_t *vprop_vip; + varpd_instance_t *vprop_instance; + uint_t vprop_type; + uint_t vprop_prot; + uint32_t vprop_defsize; + uint32_t vprop_psize; + char vprop_name[LIBVARPD_PROP_NAMELEN]; + uint8_t vprop_default[LIBVARPD_PROP_SIZEMAX]; + uint8_t vprop_poss[LIBVARPD_PROP_SIZEMAX]; +} varpd_prop_info_t; + +/* Internal Properties */ +static int varpd_nintprops = 1; +static const char *varpd_intprops[] = { + "search" +}; + +static int +libvarpd_prop_get_search(varpd_prop_info_t *infop, void *buf, uint32_t *sizep) +{ + varpd_plugin_t *vpp = infop->vprop_instance->vri_plugin; + size_t nlen; + + nlen = strlen(vpp->vpp_name) + 1; + if (nlen > *sizep) + return (EOVERFLOW); + *sizep = nlen; + (void) strlcpy(buf, vpp->vpp_name, *sizep); + return (0); +} + +void +libvarpd_prop_set_name(varpd_prop_handle_t *phdl, const char *name) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + (void) strlcpy(infop->vprop_name, name, OVERLAY_PROP_NAMELEN); +} + +void +libvarpd_prop_set_prot(varpd_prop_handle_t *phdl, overlay_prop_prot_t perm) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + infop->vprop_prot = perm; +} + +void +libvarpd_prop_set_type(varpd_prop_handle_t *phdl, overlay_prop_type_t type) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + infop->vprop_type = type; +} + +int +libvarpd_prop_set_default(varpd_prop_handle_t *phdl, void *buf, ssize_t len) +{ + 
varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + + if (len > LIBVARPD_PROP_SIZEMAX) + return (E2BIG); + + if (len < 0) + return (EOVERFLOW); + + bcopy(buf, infop->vprop_default, len); + infop->vprop_defsize = len; + return (0); +} + +void +libvarpd_prop_set_nodefault(varpd_prop_handle_t *phdl) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + + infop->vprop_default[0] = '\0'; + infop->vprop_defsize = 0; +} + +void +libvarpd_prop_set_range_uint32(varpd_prop_handle_t *phdl, uint32_t min, + uint32_t max) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->vprop_poss; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32) + return; + + if (infop->vprop_psize + sizeof (mac_propval_uint32_range_t) > + sizeof (infop->vprop_poss)) + return; + + infop->vprop_psize += sizeof (mac_propval_uint32_range_t); + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_UINT32; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max; +} + +void +libvarpd_prop_set_range_str(varpd_prop_handle_t *phdl, const char *str) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + size_t len = strlen(str) + 1; /* Account for a null terminator */ + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->vprop_poss; + mac_propval_str_range_t *pstr = &rangep->u.mpr_str; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR) + return; + + if (infop->vprop_psize + len > sizeof (infop->vprop_poss)) + return; + + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_STR; + (void) strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str, + sizeof (infop->vprop_poss) - infop->vprop_psize); + pstr->mpur_nextbyte += len; + infop->vprop_psize += len; +} + +int +libvarpd_prop_handle_alloc(varpd_handle_t *vph, varpd_instance_handle_t *inst, + varpd_prop_handle_t **phdlp) +{ + varpd_prop_info_t *infop; + + infop = 
umem_alloc(sizeof (varpd_prop_info_t), UMEM_DEFAULT); + if (infop == NULL) + return (ENOMEM); + + bzero(infop, sizeof (varpd_prop_info_t)); + infop->vprop_vip = (varpd_impl_t *)vph; + infop->vprop_instance = (varpd_instance_t *)inst; + + *phdlp = (varpd_prop_handle_t *)infop; + return (0); +} + +void +libvarpd_prop_handle_free(varpd_prop_handle_t *phdl) +{ + umem_free(phdl, sizeof (varpd_prop_info_t)); +} + +int +libvarpd_prop_nprops(varpd_instance_handle_t *ihdl, uint_t *np) +{ + int ret; + varpd_instance_t *instp = (varpd_instance_t *)ihdl; + + ret = instp->vri_plugin->vpp_ops->vpo_nprops(instp->vri_private, np); + if (ret != 0) + return (ret); + *np += varpd_nintprops; + return (0); +} + +static int +libvarpd_prop_info_fill_int_cb(varpd_handle_t *handle, const char *name, + void *arg) +{ + varpd_prop_handle_t *vph = arg; + libvarpd_prop_set_range_str(vph, name); + return (0); +} + +static int +libvarpd_prop_info_fill_int(varpd_prop_handle_t *vph, uint_t propid) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)vph; + if (propid >= varpd_nintprops) + abort(); + libvarpd_prop_set_name(vph, varpd_intprops[0]); + libvarpd_prop_set_prot(vph, OVERLAY_PROP_PERM_READ); + libvarpd_prop_set_type(vph, OVERLAY_PROP_T_STRING); + libvarpd_prop_set_nodefault(vph); + (void) libvarpd_plugin_walk( + (varpd_handle_t *)infop->vprop_instance->vri_impl, + libvarpd_prop_info_fill_int_cb, vph); + return (0); +} + +int +libvarpd_prop_info_fill(varpd_prop_handle_t *phdl, uint_t propid) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + varpd_instance_t *instp = infop->vprop_instance; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->vprop_poss; + + infop->vprop_psize = sizeof (mac_propval_range_t); + + bzero(rangep, sizeof (mac_propval_range_t)); + if (propid < varpd_nintprops) { + return (libvarpd_prop_info_fill_int(phdl, propid)); + } else { + varpd_plugin_t *vpp = instp->vri_plugin; + return (vpp->vpp_ops->vpo_propinfo(instp->vri_private, + propid - 
varpd_nintprops, phdl)); + } +} + +int +libvarpd_prop_info(varpd_prop_handle_t *phdl, const char **namep, + uint_t *typep, uint_t *protp, const void **defp, uint32_t *sizep, + const mac_propval_range_t **possp) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + if (namep != NULL) + *namep = infop->vprop_name; + if (typep != NULL) + *typep = infop->vprop_type; + if (protp != NULL) + *protp = infop->vprop_prot; + if (defp != NULL) + *defp = infop->vprop_default; + if (sizep != NULL) + *sizep = infop->vprop_psize; + if (possp != NULL) + *possp = (mac_propval_range_t *)infop->vprop_poss; + return (0); +} + +int +libvarpd_prop_get(varpd_prop_handle_t *phdl, void *buf, uint32_t *sizep) +{ + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + varpd_instance_t *instp = infop->vprop_instance; + + if (infop->vprop_name[0] == '\0') + return (EINVAL); + + if (strcmp(varpd_intprops[0], infop->vprop_name) == 0) { + /* search property */ + return (libvarpd_prop_get_search(infop, buf, sizep)); + } + + return (instp->vri_plugin->vpp_ops->vpo_getprop(instp->vri_private, + infop->vprop_name, buf, sizep)); +} + +int +libvarpd_prop_set(varpd_prop_handle_t *phdl, const void *buf, uint32_t size) +{ + int i; + varpd_prop_info_t *infop = (varpd_prop_info_t *)phdl; + varpd_instance_t *instp = infop->vprop_instance; + + if (infop->vprop_name[0] == '\0') + return (EINVAL); + + for (i = 0; i < varpd_nintprops; i++) { + if (strcmp(infop->vprop_name, varpd_intprops[i]) == 0) { + return (EPERM); + } + } + + return (instp->vri_plugin->vpp_ops->vpo_setprop(instp->vri_private, + infop->vprop_name, buf, size)); +} + +void +libvarpd_prop_door_convert(const varpd_prop_handle_t *phdl, + varpd_client_propinfo_arg_t *vcfap) +{ + const varpd_prop_info_t *infop = (const varpd_prop_info_t *)phdl; + + vcfap->vcfa_type = infop->vprop_type; + vcfap->vcfa_prot = infop->vprop_prot; + vcfap->vcfa_defsize = infop->vprop_defsize; + vcfap->vcfa_psize = infop->vprop_psize; + bcopy(infop->vprop_name, 
vcfap->vcfa_name, LIBVARPD_PROP_NAMELEN); + bcopy(infop->vprop_default, vcfap->vcfa_default, LIBVARPD_PROP_SIZEMAX); + bcopy(infop->vprop_poss, vcfap->vcfa_poss, LIBVARPD_PROP_SIZEMAX); +} diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h b/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h new file mode 100644 index 0000000000..c44a8f6941 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_provider.h @@ -0,0 +1,417 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LIBVARPD_PROVIDER_H +#define _LIBVARPD_PROVIDER_H + +/* + * varpd provider interface for lookup modules + * + * This header file defines all the structures and functions that a given lookup + * module needs to implement and perform its purpose. At this time, all of these + * interfaces are considered private to illumos and therefore are subject to + * change. At some point we will move to more broadly stabilize these interfaces + * and commit to them. Until such time, expect breakage for out of gate + * consumers. + * + * A plugin is a dynamic shared object that is placed inside of varpd's default + * module. + * + * The shared object must define an initializer, such as with #pragma init. This + * function will be run with the module is dlopened by libvarpd. In that init + * function, the function must allocate a varpd_plugin_register by calling + * libvarpd_plugin_alloc() and specifying VARPD_CURRENT_VERSION. If that + * succeeds, then it should proceed to fill out the registration and then call, + * libvarpd_plugin_register() with it. 
Regardless of whether it succeeds or + * fails, it should call libvarpd_plugin_free(). In the case of failure, there + * is not much that the module should do, other than log some message to + * stderr. + * + * Once libvarpd_plugin_register() returns, the module should assume that any + * of the operations it defined in the operation vector may be called and + * therefore it is recommended that any other required initialization should be + * performed at that time. + * + * At this time, once a plugin is loaded, it will not be unloaded. Therefore, + * there is no corresponding requirement to unregister, though that may come in + * a future version. + * + * ----------------------------- + * Plugin Types and Destinations + * ----------------------------- + * + * There are two different kinds of plugins in this world, there are point to + * point plugins and there are dynamic plugins. The key difference is in how + * packets are routed through the system. In a point to point plugin, a single + * destination is used when the instance is started. In dynamic plugins, + * destinations are looked up as they are required and an instance of a plugin + * is required to provide that. + * + * These point to point plugins define a type of OVERLAY_TARGET_POINT and the + * dynamic plugins instead define a type of OVERLAY_TARGET_DYNAMIC. + * + * Encapsulation plugins have multiple types of destinations. They may require + * an Ethernet address (OVERLAY_PLUGIN_D_ETHERNET), IP address + * (OVERLAY_PLUGIN_D_IP), and a port (OVERLAY_PLUGIN_D_PORT). For example, + * consider vxlan, it requires an IP and a port; while a hypothetical nvgre, + * would only require an IP. + * + * A plugin is allowed to describe which of these fields that it supports and + * given which encapsulation plugin it is paired with, it can support a varying + * degree of properties. For example, consider the example of the direct plugin. + * It has a notion of a destination port and a destination IP. 
If it is paired + * with a plugin that only requires an IP, then it wouldn't need to show a + * property that's related to a destination port. + * + * ------------------ + * Plugin Definitions + * ------------------ + * + * A plugin is required to fill in both an operations vector and a series of + * additional metadata that is passed in to libvarpd_plugin_register(). The + * following lists all of the routines and their purposes. The full signatures + * are available in the body of the header file. + * + * varpd_plugin_create_f + * + * Create a new instance of a plugin. Each instance refers to a different + * overlay device and thus a different overlay identifier. Each instance + * has its own property space and is unique. This function gives the chance + * for the plugin to create and provide any private data that it will + * require. + * + * In addition, the plugin is given the type of destination that is + * required and it is its job to determine whether or not it supports it. + * + * varpd_plugin_destroy_f + * + * This is the opposite of varpd_plugin_create_f. It is called to allow the + * plugin to reclaim any resources with the private argument that it passed + * out as part of the create function. + * + * varpd_plugin_start_f + * + * This routine is called to indicate that an instance should be started. + * This is a plugin's chance to verify that it has all of its required + * properties set and to take care of any action that needs to be handled + * to begin the plugin. After this point it will be legal to have the + * varpd_plugin_default_f, varpd_plugin_lookup_f, varpd_plugin_arp_f and + * varpd_plugin_dhcp_f endpoints called. + * + * varpd_plugin_stop_f + * + * This routine is called to indicate that an instance is stopping; it is + * the opposite of varpd_plugin_start_f. This is a chance to clean up + * resources that are a side effect of having started the instance.
+ * + * varpd_plugin_default_f + * + * This routine is defined by plugins of type OVERLAY_TARGET_POINT. It is + * used to answer the question of where should all traffic for this + * instance be destined. Plugins of type OVERLAY_TARGET_DYNAMIC should + * leave this entry set to NULL. + * + * On success, the default routine should return VARPD_LOOKUP_OK. On + * failure, it should return the macro VARPD_LOOKUP_DROP. + * + * varpd_plugin_lookup_f + * + * This routine must be defined by plugins of type OVERLAY_TARGET_DYNAMIC. + * It is used to lookup the destination for a given request. Each request + * comes in with its own MAC address this allows a plugin to direct it to + * any remote location. + * + * This is designed as an asynchronous API. Once a lookup is completed it + * should call libvarpd_plugin_query_reply() and pass as the second + * argument either VARPD_LOOKUP_OK to indicate that it went alright or it + * should reply VARPD_LOOKUP_DROP to indicate that the packet should be + * dropped. + * + * In addition, there are several utility routines that can take care of + * various kinds of traffic automatically. For example, if an ARP, NDP, or + * DHCP packet comes in, there are utilities such as + * libvarpd_plugin_proxy_arp(), libvarpd_plugin_proxy_ndp() and + * libvarpd_plugin_proxy_dhcp(), which allows the system to do the heavy + * lifting of validating the packet once it finds that it matches certain + * properties. + * + * varpd_plugin_arp_f + * + * This is an optional entry for plugins of type OVERLAY_TARGET_DYNAMIC. + * This is called after a plugin calls libvarpd_plugin_proxy_arp() and is + * used to ask the plugin to perform an ARP or NDP query. The type of query + * is passed in in the third argument, the only valid value for which will + * be VARPD_QTYPE_ETHERNET, to indicate we're doing an Ethernet lookup. + * + * The layer three IP address that is being looked up will be included in + * the struct sockaddr. 
The sockaddr(3SOCKET)'s sa_family will be set to + * indicate the type, e.g. AF_INET or AF_INET6 and that will indicate the + * kind of sockaddr that will be used. For more information see + * sockaddr(3SOCKET). The implementation ensures that enough space for the + * link layer address will exist. + * + * This is an asynchronous lookup. Once the answer has been written, a + * plugin should call libvarpd_plugin_arp_reply() and if it was successful, + * VARPD_LOOKUP_OK should be passed in and if it failed, VARPD_LOOKUP_DROP + * should be passed in instead. + * + * varpd_plugin_dhcp_f + * + * This is an optional entry for plugins of type OVERLAY_TARGET_DYNAMIC. + * This is called after a plugin calls libvarpd_plugin_proxy_dhcp() and + * is used to ask the plugin to determine where is the DHCP server that + * this packet should actually be sent to. What is happening here is that + * rather than broadcast the initial DHCP request, we instead unicast it to + * a specified DHCP server that this operation vector indicates. + * + * The plugin is given a type, the same as the ARP plugin which indicates + * the kind of link layer address, the only valid type is + * VARPD_QTYPE_ETHERNET, other types should be rejected. Then, like the arp + * entry point, the dhcp entry point should determine the link layer + * address of the DHCP server and write that out in the appropriate memory + * and call libvarpd_plugin_dhcp_reply() when done. Similar to the arp + * entry point, it should use VARPD_LOOKUP_OK to indicate that it was + * filled in and VARPD_LOOKUP_DROP to indicate that it was not. + * + * varpd_plugin_nprops_f + * + * This is used by a plugin to indicate the number of properties that + * should exist for this instance. Recall from the section on Plugin + * Types and Destinations, that the number of entries here may vary. As + * such, the plugin should return the number that is appropriate for the + * instance.
+ * + * This number will be used to obtain information about a property via the + * propinfo functions. However, the getprop and setprop interfaces will + * always use names to indicate the property it is getting and setting. + * This difference is structured this way to deal with property discovery + * and to make the getprop and setprop interfaces slightly easier for other + * parts of the broader varpd/dladm infrastructure. + * + * varpd_plugin_propinfo_f + * + * This interface is used to get information about a property; the property + * that information is being requested for is passed in via the + * second argument. Here, callers should set properties such as the name, + * the protection, whether or not the property is required, set any default + * value, if it exists, and if relevant, set the valid range of values. + * + * varpd_plugin_getprop_f + * + * This is used to get the value of a property, if it is set. The passed in + * length indicates the length of the buffer that is used for updating + * properties. If it is not of sufficient size, the function should return + * an error and not update the buffer. Otherwise, it should update the size + * pointer with the valid size. + * + * varpd_plugin_setprop_f + * + * This is used to set the value of a property. An endpoint should validate + * that the property is valid before updating it. In addition, it should + * update its state as appropriate. + * + * varpd_plugin_save_f + * + * This is used to serialize the state of a given instance of a plugin such + * that if varpd crashes, it can be recovered. The plugin should write all + * state into the nvlist that it is passed in, it may use any keys and + * values that it wants. The only consumer of that nvlist will be the + * plugin itself when the restore endpoint is called. + * + * varpd_plugin_restore_f + * + * This is called by the server to restore an instance that used to exist, + * but was lost due to a crash.
This is a combination of calling create and + * setting properties. The plugin should restore any private state that it + * can find recorded from the nvlist. The only items in the nvlist will be + * those that were written out during a previous call to + * varpd_plugin_save_f. + * + * + * Once all of these interfaces are implemented, the plugin should define the + * following members in the varpd_plugin_register_t. + * + * vpr_version + * + * This indicates the version of the plugin. Plugins should set this to the + * macro VARPD_CURRENT_VERSION. + * + * vpr_mode + * + * This indicates the mode of the plugin. The plugin's mode should be one + * of OVERLAY_TARGET_POINT or OVERLAY_TARGET_DYNAMIC. For more discussion + * of these types and the differences, see the section on Plugin Types and + * Destinations. + * + * vpr_name + * + * This is the name of the plugin. This is how users will refer to it in + * the context of running dladm(1M) commands. Note, this name must be + * unique across the different plugins, as it will cause others with the + * same name not to be registered. + * + * vpr_ops + * + * This is the operations vector as described above. Importantly, the + * member vpo_callbacks must be set to zero; this is being used for future + * expansion of the structure. + * + * + * -------------------------------------------------- + * Downcalls, Upcalls, and Synchronization Guarantees + * -------------------------------------------------- + * + * Every instance of a plugin is independent. Calls into a plugin may be made + * for different instances in parallel. Any necessary locking is left to the + * plugin module. Within an instance, various calls may come in parallel. + * + * The primary guarantees are that none of the varpd_plugin_save_f, + * varpd_plugin_lookup_f, varpd_plugin_default_f, varpd_plugin_arp_f, and + * varpd_plugin_dhcp_f will be called until after a call to varpd_plugin_start_f + * has been made.
Similarly, they will not be called after a call to + * varpd_plugin_stop_f. + * + * The functions documented in this header may be called back into from any + * context, including from the operation vectors. + */ + +#include <libvarpd.h> +#include <libnvpair.h> +#include <sys/socket.h> +#include <sys/overlay_target.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define VARPD_VERSION_ONE 1 +#define VARPD_CURRENT_VERSION VARPD_VERSION_ONE + +typedef struct __varpd_provier_handle varpd_provider_handle_t; +typedef struct __varpd_query_handle varpd_query_handle_t; +typedef struct __varpd_arp_handle varpd_arp_handle_t; +typedef struct __varpd_dhcp_handle varpd_dhcp_handle_t; + +typedef int (*varpd_plugin_create_f)(varpd_provider_handle_t *, void **, + overlay_plugin_dest_t); +typedef int (*varpd_plugin_start_f)(void *); +typedef void (*varpd_plugin_stop_f)(void *); +typedef void (*varpd_plugin_destroy_f)(void *); + +#define VARPD_LOOKUP_OK (0) +#define VARPD_LOOKUP_DROP (-1) +typedef int (*varpd_plugin_default_f)(void *, overlay_target_point_t *); +typedef void (*varpd_plugin_lookup_f)(void *, varpd_query_handle_t *, + const overlay_targ_lookup_t *, overlay_target_point_t *); + +#define VARPD_QTYPE_ETHERNET 0x0 +typedef void (*varpd_plugin_arp_f)(void *, varpd_arp_handle_t *, int, + const struct sockaddr *, uint8_t *); +typedef void (*varpd_plugin_dhcp_f)(void *, varpd_dhcp_handle_t *, int, + const overlay_targ_lookup_t *, uint8_t *); + +typedef int (*varpd_plugin_nprops_f)(void *, uint_t *); +typedef int (*varpd_plugin_propinfo_f)(void *, const uint_t, + varpd_prop_handle_t *); +typedef int (*varpd_plugin_getprop_f)(void *, const char *, void *, uint32_t *); +typedef int (*varpd_plugin_setprop_f)(void *, const char *, const void *, + const uint32_t); + +typedef int (*varpd_plugin_save_f)(void *, nvlist_t *); +typedef int (*varpd_plugin_restore_f)(nvlist_t *, varpd_provider_handle_t *, + overlay_plugin_dest_t, void **); + +typedef struct varpd_plugin_ops { + uint_t 
vpo_callbacks; + varpd_plugin_create_f vpo_create; + varpd_plugin_start_f vpo_start; + varpd_plugin_stop_f vpo_stop; + varpd_plugin_destroy_f vpo_destroy; + varpd_plugin_default_f vpo_default; + varpd_plugin_lookup_f vpo_lookup; + varpd_plugin_nprops_f vpo_nprops; + varpd_plugin_propinfo_f vpo_propinfo; + varpd_plugin_getprop_f vpo_getprop; + varpd_plugin_setprop_f vpo_setprop; + varpd_plugin_save_f vpo_save; + varpd_plugin_restore_f vpo_restore; + varpd_plugin_arp_f vpo_arp; + varpd_plugin_dhcp_f vpo_dhcp; +} varpd_plugin_ops_t; + +typedef struct varpd_plugin_register { + uint_t vpr_version; + uint_t vpr_mode; + const char *vpr_name; + const varpd_plugin_ops_t *vpr_ops; +} varpd_plugin_register_t; + +extern varpd_plugin_register_t *libvarpd_plugin_alloc(uint_t, int *); +extern void libvarpd_plugin_free(varpd_plugin_register_t *); +extern int libvarpd_plugin_register(varpd_plugin_register_t *); + +/* + * Blowing up and logging + */ +extern void libvarpd_panic(const char *, ...) __NORETURN; + +/* + * Misc. 
Information APIs + */ +extern uint64_t libvarpd_plugin_vnetid(varpd_provider_handle_t *); + +/* + * Lookup Replying query and proxying + */ +extern void libvarpd_plugin_query_reply(varpd_query_handle_t *, int); + +extern void libvarpd_plugin_proxy_arp(varpd_provider_handle_t *, + varpd_query_handle_t *, const overlay_targ_lookup_t *); +extern void libvarpd_plugin_proxy_ndp(varpd_provider_handle_t *, + varpd_query_handle_t *, const overlay_targ_lookup_t *); +extern void libvarpd_plugin_arp_reply(varpd_arp_handle_t *, int); + +extern void libvarpd_plugin_proxy_dhcp(varpd_provider_handle_t *, + varpd_query_handle_t *, const overlay_targ_lookup_t *); +extern void libvarpd_plugin_dhcp_reply(varpd_dhcp_handle_t *, int); + + +/* + * Property information callbacks + */ +extern void libvarpd_prop_set_name(varpd_prop_handle_t *, const char *); +extern void libvarpd_prop_set_prot(varpd_prop_handle_t *, overlay_prop_prot_t); +extern void libvarpd_prop_set_type(varpd_prop_handle_t *, overlay_prop_type_t); +extern int libvarpd_prop_set_default(varpd_prop_handle_t *, void *, ssize_t); +extern void libvarpd_prop_set_nodefault(varpd_prop_handle_t *); +extern void libvarpd_prop_set_range_uint32(varpd_prop_handle_t *, uint32_t, + uint32_t); +extern void libvarpd_prop_set_range_str(varpd_prop_handle_t *, const char *); + +/* + * Various injecting and invalidation routines + */ +extern void libvarpd_inject_varp(varpd_provider_handle_t *, const uint8_t *, + const overlay_target_point_t *); +extern void libvarpd_inject_arp(varpd_provider_handle_t *, const uint16_t, + const uint8_t *, const struct in_addr *, const uint8_t *); +extern void libvarpd_fma_degrade(varpd_provider_handle_t *, const char *); +extern void libvarpd_fma_restore(varpd_provider_handle_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBVARPD_PROVIDER_H */ diff --git a/usr/src/lib/varpd/libvarpd/common/libvarpd_util.c b/usr/src/lib/varpd/libvarpd/common/libvarpd_util.c new file mode 100644 index 
0000000000..92e50b5f1b --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/libvarpd_util.c @@ -0,0 +1,91 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <libvarpd_impl.h> +#include <assert.h> +#include <stdio.h> +#include <sys/types.h> +#include <dirent.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +const char * +libvarpd_isaext(void) +{ +#if defined(__amd64) + return ("64"); +#elif defined(__i386) + return (""); +#else +#error "unknown ISA" +#endif +} + +int +libvarpd_dirwalk(varpd_impl_t *vip, const char *path, const char *suffix, + libvarpd_dirwalk_f func, void *arg) +{ + int ret; + size_t slen; + char *dirpath, *filepath; + DIR *dirp; + struct dirent *dp; + assert(vip != NULL && path != NULL); + + if (asprintf(&dirpath, "%s/%s", path, libvarpd_isaext()) == -1) + return (errno); + + if ((dirp = opendir(dirpath)) == NULL) { + ret = errno; + return (ret); + } + + slen = strlen(suffix); + for (;;) { + size_t len; + + errno = 0; + dp = readdir(dirp); + if (dp == NULL) { + ret = errno; + break; + } + + len = strlen(dp->d_name); + if (len <= slen) + continue; + + if (strcmp(suffix, dp->d_name + (len - slen)) != 0) + continue; + + if (asprintf(&filepath, "%s/%s", dirpath, dp->d_name) == -1) { + ret = errno; + break; + } + + if (func(vip, filepath, arg) != 0) { + free(filepath); + ret = 0; + break; + } + + free(filepath); + } + + (void) closedir(dirp); + free(dirpath); + return (ret); +} diff --git a/usr/src/lib/varpd/libvarpd/common/mapfile-plugin b/usr/src/lib/varpd/libvarpd/common/mapfile-plugin new file mode 100644 
index 0000000000..8cef7f669f --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/mapfile-plugin @@ -0,0 +1,61 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +# +# Functions that plugins call but that libvarpd defines. EXTERN tells the +# link-editor not to treat them as undefined when a plugin is built. +# +SYMBOL_SCOPE { + global: + libvarpd_fma_degrade { FLAGS = EXTERN }; + libvarpd_inject_arp { FLAGS = EXTERN }; + libvarpd_inject_ndp { FLAGS = EXTERN }; + libvarpd_inject_varp { FLAGS = EXTERN }; + libvarpd_fma_restore { FLAGS = EXTERN }; + libvarpd_panic { FLAGS = EXTERN }; + libvarpd_plugin_alloc { FLAGS = EXTERN }; + libvarpd_plugin_arp_reply { FLAGS = EXTERN }; + libvarpd_plugin_dhcp_reply { FLAGS = EXTERN }; + libvarpd_plugin_free { FLAGS = EXTERN }; + libvarpd_plugin_proxy_arp { FLAGS = EXTERN }; + libvarpd_plugin_proxy_dhcp { FLAGS = EXTERN }; + libvarpd_plugin_proxy_ndp { FLAGS = EXTERN }; + libvarpd_plugin_query_reply { FLAGS = EXTERN }; + libvarpd_plugin_register { FLAGS = EXTERN }; + libvarpd_plugin_vnetid { FLAGS = EXTERN }; + libvarpd_prop_set_name { FLAGS = EXTERN }; + libvarpd_prop_set_prot { FLAGS = EXTERN }; + libvarpd_prop_set_type { FLAGS = EXTERN }; + libvarpd_prop_set_default { FLAGS = EXTERN }; + libvarpd_prop_set_nodefault { FLAGS = EXTERN }; + libvarpd_prop_set_range_uint32 { FLAGS = EXTERN }; +
libvarpd_prop_set_range_str { FLAGS = EXTERN }; +}; diff --git a/usr/src/lib/varpd/libvarpd/common/mapfile-vers b/usr/src/lib/varpd/libvarpd/common/mapfile-vers new file mode 100644 index 0000000000..7aa930cb54 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/common/mapfile-vers @@ -0,0 +1,113 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance.
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION SUNWprivate { + global: + libvarpd_c_create; + libvarpd_c_destroy; + libvarpd_c_instance_activate; + libvarpd_c_instance_create; + libvarpd_c_instance_destroy; + libvarpd_c_prop_nprops; + libvarpd_c_prop_handle_alloc; + libvarpd_c_prop_handle_free; + libvarpd_c_prop_info_fill; + libvarpd_c_prop_info_fill_by_name; + libvarpd_c_prop_info; + libvarpd_c_prop_get; + libvarpd_c_prop_set; + + libvarpd_c_instance_lookup; + libvarpd_c_instance_target_mode; + libvarpd_c_instance_cache_flush; + libvarpd_c_instance_cache_delete; + libvarpd_c_instance_cache_get; + libvarpd_c_instance_cache_set; + libvarpd_c_instance_cache_walk; + + libvarpd_create; + libvarpd_destroy; + + libvarpd_door_server_create; + libvarpd_door_server_destroy; + + libvarpd_fma_degrade; + libvarpd_fma_restore; + + libvarpd_inject_varp; + libvarpd_inject_arp; + + libvarpd_instance_activate; + libvarpd_instance_create; + libvarpd_instance_destroy; + libvarpd_instance_lookup; + libvarpd_instance_id; + + libvarpd_panic; + + libvarpd_persist_disable; + libvarpd_persist_enable; + libvarpd_persist_restore; + + libvarpd_plugin_alloc; + libvarpd_plugin_load; + libvarpd_plugin_free; + libvarpd_plugin_arp_reply; + libvarpd_plugin_dhcp_reply; + libvarpd_plugin_query_reply; + libvarpd_plugin_proxy_arp; + libvarpd_plugin_proxy_dhcp; + libvarpd_plugin_proxy_ndp; + libvarpd_plugin_register; + libvarpd_plugin_walk; + libvarpd_plugin_vnetid; + + libvarpd_prop_set_default; + libvarpd_prop_set_nodefault; + libvarpd_prop_set_name; + libvarpd_prop_set_prot; + libvarpd_prop_set_range_uint32; + libvarpd_prop_set_range_str; + libvarpd_prop_set_type; + + libvarpd_prop_handle_alloc; + libvarpd_prop_handle_free; + libvarpd_prop_nprops; + libvarpd_prop_info_fill; + libvarpd_prop_info; + libvarpd_prop_get; + libvarpd_prop_set; + + libvarpd_overlay_lookup_quiesce; + libvarpd_overlay_lookup_run; + local: + *; +}; diff --git a/usr/src/lib/varpd/libvarpd/i386/Makefile 
b/usr/src/lib/varpd/libvarpd/i386/Makefile new file mode 100644 index 0000000000..4398507523 --- /dev/null +++ b/usr/src/lib/varpd/libvarpd/i386/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +include ../Makefile.com + +install: all $(ROOTLIBS) $(ROOTLINKS) diff --git a/usr/src/man/man1m/dladm.1m b/usr/src/man/man1m/dladm.1m index 6e03105132..e76b8998c7 100644 --- a/usr/src/man/man1m/dladm.1m +++ b/usr/src/man/man1m/dladm.1m @@ -178,6 +178,14 @@ dladm \- administer data links .LP .nf +\fBdladm create-overlay\fR [\fB-t\fR] \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR \fB-v\fR \fIvnetid\fR [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR +\fBdladm delete-overlay\fR \fIoverlay\fR +\fBdladm modify-overlay\fR \fB-d\fR \fImac\fR | \fB-f\fR | \fB-s\fR \fImac=ip:port\fR \fIoverlay\fR +\fBdladm show-overlay\fR [ \fB-f\fR | \fB-t\fR ] [[\fB-p\fR] \fB-o\fR \fIfield\fR[,...]] [\fIoverlay\fR] +.fi + +.LP +.nf \fBdladm show-usage\fR [\fB-a\fR] \fB-f\fR \fIfilename\fR [\fB-p\fR \fIplotfile\fR \fB-F\fR \fIformat\fR] [\fB-s\fR \fItime\fR] [\fB-e\fR \fItime\fR] [\fIlink\fR] .fi @@ -264,9 +272,9 @@ A WiFi datalink. .ad .sp .6 .RS 4n -A virtual network interface created on a link or an \fBetherstub\fR. It is a -pseudo device that can be treated as if it were an network interface card on a -machine. +A virtual network interface created on a link, an \fBetherstub\fR, or \fBan +overlay\fR. It is a pseudo device that can be treated as if it were an network +interface card on a machine. 
.RE .sp @@ -334,6 +342,20 @@ use any alphanumeric characters, as well as underscore (\fB_\fR), period characters. .RE +.sp +.ne 2 +.na +.B overlay +.ad +.sp .6 +.RS 4n +An overlay instance, identified by an administratively-chosen name. An overlay +can be used to create or join an existing software defined network. +VNICs created on an overlay will appear to be connected by a local virtual +switch and will also be connected to interfaces on matching overlays provided by +other hosts. For more information on overlay devices, see \fBoverlay\fR(5). +.RE + .SS "Options" Each \fBdladm\fR subcommand has its own set of options. However, many of the subcommands have the following as a common option: @@ -4370,6 +4392,348 @@ The tunnel destination address. .sp .ne 2 .na +\fBdladm create-overlay\fR \fB-e\fR \fIencap\fR \fB-s\fR \fIsearch\fR +\fB-v\fR \fIvnetid\fR [\fB-p\fR \fIprop\fR=\fIvalue\fR[,...]] \fIoverlay\fR +.ad +.sp .6 +.RS 4n +Create an overlay device named \fIoverlay\fR. +.sp +Overlay devices are similar to etherstubs. VNICs can be created on top +of them. However, unlike an etherstub which is local to the system, an +overlay device can be configured to communicate to remote hosts, +providing a means for network virtualization. The way in which it does +this is described by the encapsulation module and the search plugin. For +more information on these, see \fBoverlay\fR(5). +.sp +An overlay device has a series of required and optional properties. These +properties vary based upon the search and encapsulation modules and are fully +specified in \fBoverlay\fR(5). Not every property needs to be specified - some +have default values which will be used if nothing specific is specified. For +example, the default port for VXLAN comes from its IANA standard. If a +required property is missing, the command will fail and inform you of the +missing properties. +.sp +.ne 2 +.na +\fB\fB-t\fR, \fB--temporary\fR\fR +.ad +.sp .6 +.RS 4n +Specifies that the overlay is temporary. 
Temporary overlays last until +the next reboot. +.RE + +.sp +.ne 2 +.na +\fB-e\fR \fIencap\fR, \fB--encap\fR=\fIencap\fR +.ad +.sp .6 +.RS 4n +Use \fIencap\fR as the encapsulation plugin for the overlay device +\fIoverlay\fR. The encapsulation plugin determines how packets are transformed +before being put on the wire. +.RE + +.sp +.ne 2 +.na +\fB-s\fR \fIsearch\fR, \fB--search\fR=\fIsearch\fR +.ad +.sp .6 +.RS 4n +Use \fIsearch\fR as the search plugin for \fIoverlay\fR. The search plugin +determines how non-local targets are found and where packets are directed to. +.RE + +.sp +.ne 2 +.na +\fB\fB-p\fR \fIprop\fR=\fIvalue\fR,..., \fB--prop\fR +\fIprop\fR=\fIvalue\fR,...\fR +.ad +.sp .6 +.RS 4n +A comma-separated list of properties to set to the specified values. +.RE + +.sp +.ne 2 +.na +\fB-v\fR \fIvnetid\fR, \fB--vnetid\fR=\fIvnetid\fR +.ad +.sp .6 +.RS 4n +Sets the virtual network identifier to \fIvnetid\fR. A virtual network +identifier is similar to a VLAN identifier, in that it identifies a +unique virtual network. All overlay devices on the system share the same space +for the virtual network identifier. However, the valid range of identifiers is +determined by the encapsulation plugin specified by \fB-e\fR. +.RE + +.RE + +.sp +.ne 2 +.na +\fBdladm delete-overlay\fR \fIoverlay\fR +.ad +.sp .6 +.RS 4n +Delete the specified overlay. This will fail if there are VNICs on top of the +device. +.RE + +.sp +.ne 2 +.na +\fBdladm modify-overlay\fR \fB-d\fR \fImac\fR | \fB-f\fR | \fB-s\fR \fImac=ip:port\fR \fIoverlay\fR +.ad +.sp .6 +.RS 4n +Modifies the target tables for the specified overlay. +.sp +The different options allow for different ways of modifying the target table. +One of \fB-d\fR, \fB-f\fR, or \fB-s\fR is required. This is not applicable for +all kinds of overlay devices. For more information, see \fBoverlay\fR(5).
+.sp +.ne 2 +.na +\fB-d\fR \fImac\fR, \fB--delete-entry\fR=\fImac\fR +.ad +.sp .6 +.RS 4n +Deletes the entry for \fImac\fR from the target table for \fIoverlay\fR. Note, +if a lookup is pending or outstanding, this does not cancel it or stop it from +updating the value. +.RE + +.sp +.ne 2 +.na +\fB-f\fR, \fB--flush-table\fR +.ad +.sp .6 +.RS 4n +Flushes all values in the target table for \fIoverlay\fR. +.RE + +.sp +.ne 2 +.na +\fB-s\fR \fImac\fR=\fIvalue\fR, \fB--set-entry\fR=\fImac\fR=\fIvalue\fR +.ad +.sp .6 +.RS 4n +Sets the value of \fIoverlay\fR's target table entry for \fImac\fR to the +specified value. The specified value varies based upon the encapsulation plugin. The +value may be a combination of a MAC address, IP address, and port. Generally, +this looks like [\fImac\fR,][\fIIP\fR:][\fIport\fR]. If a component is the last +one, then there is no need for a separator. For example, if just the MAC address or IP +is needed, it would look like \fImac\fR and \fIIP\fR respectively. +.RE + +.RE + +.sp +.ne 2 +.na +\fBdladm show-overlay\fR [ \fB-f\fR | \fB-t\fR ] [[\fB-p\fR] \fB-o\fR \fIfield\fR[,...]] [\fIoverlay\fR] +.ad +.sp .6 +.RS 4n +Shows overlay configuration (the default), internal target tables (\fB-t\fR), or +the FMA state (\fB-f\fR), either for all overlays or the specified overlay. +.sp +By default (with neither \fB-f\fR nor \fB-t\fR specified), the following fields +will be displayed: +.sp +.ne 2 +.na +\fB\fBLINK\fR\fR +.ad +.sp .6 +.RS 4n +The name of the overlay. +.RE + +.sp +.ne 2 +.na +\fB\fBPROPERTY\fR\fR +.ad +.sp .6 +.RS 4n +The name of the property. +.RE + +.sp +.ne 2 +.na +\fB\fBPERM\fR\fR +.ad +.sp .6 +.RS 4n +The read/write permissions of the property. The value shown is one of \fBr-\fR +or \fBrw\fR. +.RE + +.sp +.ne 2 +.na +\fB\fBVALUE\fR\fR +.ad +.sp .6 +.RS 4n +The current property value. If the value is not set, it is shown as \fB--\fR. +If it is unknown, the value is shown as \fB?\fR.
+.RE + +.sp +.ne 2 +.na +\fB\fBDEFAULT\fR\fR +.ad +.sp .6 +.RS 4n +The default value of the property. If the property has no default value, +\fB--\fR is shown. +.RE + +.sp +.ne 2 +.na +\fB\fBPOSSIBLE\fR\fR +.ad +.sp .6 +.RS 4n +A comma-separated list of the values the property can have. If the values span +a numeric range, \fImin\fR - \fImax\fR might be shown as shorthand. If the +possible values are unknown or unbounded, \fB--\fR is shown. +.RE + +.sp +When the \fB-f\fR option is specified, the following fields will be displayed: +.sp +.ne 2 +.na +\fB\fBLINK\fR\fR +.ad +.sp .6 +.RS 4n +The name of the overlay. +.RE + +.sp +.ne 2 +.na +\fB\fBSTATUS\fR\fR +.ad +.sp .6 +.RS 4n +Either \fBONLINE\fR or \fBDEGRADED\fR. +.RE + +.sp +.ne 2 +.na +\fB\fBDETAILS\fR\fR +.ad +.sp .6 +.RS 4n +When the \fBoverlay\fR's status is \fBONLINE\fR, then this has the value +\fB--\fR. Otherwise, when it is \fBDEGRADED\fR, this field provides a more +detailed explanation as to why it's degraded. +.RE + +.sp +When the \fB-t\fR option is specified, the following fields will be displayed: +.sp +.ne 2 +.na +\fB\fBLINK\fR\fR +.ad +.sp .6 +.RS 4n +The name of the overlay. +.RE + +.sp +.ne 2 +.na +\fB\fBTARGET\fR\fR +.ad +.sp .6 +.RS 4n +The target MAC address of a table entry. +.RE + +.sp +.ne 2 +.na +\fB\fBDESTINATION\fR\fR +.ad +.sp .6 +.RS 4n +The address that an encapsulated packet will be sent to when a packet has the +address specified by \fBTARGET\fR. +.RE + +The \fBshow-overlay\fR command supports the following options: + +.sp +.ne 2 +.na +\fB-f\fR, \fB--fma\fR +.ad +.sp .6 +.RS 4n +Displays information about an overlay device's FMA state. For more +information on the FMA state, see \fBoverlay\fR(5). +.RE + +.sp +.ne 2 +.na +\fB\fB-o\fR \fIfield\fR[,...], \fB--output\fR=\fIfield\fR\fR +.ad +.sp .6 +.RS 4n +A case-insensitive, comma-separated list of output fields to display.
The field +name must be one of the fields listed above, or the special value \fBall\fR, to +display all fields. The fields applicable to the \fB-o\fR option are limited to +those listed under each output mode. For example, if using \fB-L\fR, only the +fields listed under \fB-L\fR, above, can be used with \fB-o\fR. +.RE + +.sp +.ne 2 +.na +\fB\fB-p\fR, \fB--parsable\fR\fR +.ad +.sp .6 +.RS 4n +Display using a stable machine-parsable format. The \fB-o\fR option is +required with \fB-p\fR. See "Parsable Output Format", below. +.RE + +.sp +.ne 2 +.na +\fB-t\fR, \fB--target\fR +.ad +.sp .6 +.RS 4n +Displays information about an overlay device's target table. For more +information on the target table, see \fBoverlay\fR(5). +.RE + +.RE + +.sp +.ne 2 +.na \fB\fBdladm show-usage\fR [\fB-a\fR] \fB-f\fR \fIfilename\fR [\fB-p\fR \fIplotfile\fR \fB-F\fR \fIformat\fR] [\fB-s\fR \fItime\fR] [\fB-e\fR \fItime\fR] [\fIlink\fR]\fR @@ -5606,7 +5970,7 @@ Interface Stability Committed .SH SEE ALSO \fBacctadm\fR(1M), \fBautopush\fR(1M), \fBifconfig\fR(1M), \fBipsecconf\fR(1M), \fBndd\fR(1M), \fBpsrset\fR(1M), \fBwpad\fR(1M), \fBzonecfg\fR(1M), -\fBattributes\fR(5), \fBieee802.3\fR(5), \fBdlpi\fR(7P) +\fBattributes\fR(5), \fBieee802.3\fR(5), \fBoverlay\fR(5), \fBdlpi\fR(7P) .SH NOTES The preferred method of referring to an aggregation in the aggregation subcommands is by its link name. 
Referring to an aggregation by its integer diff --git a/usr/src/man/man4/Makefile b/usr/src/man/man4/Makefile index 40c7f78d41..941757d4f3 100644 --- a/usr/src/man/man4/Makefile +++ b/usr/src/man/man4/Makefile @@ -133,6 +133,7 @@ _MANFILES= Intro.4 \ nsmbrc.4 \ nss.4 \ nsswitch.conf.4 \ + overlay_files.4 \ packingrules.4 \ pam.conf.4 \ passwd.4 \ diff --git a/usr/src/man/man4/overlay_files.4 b/usr/src/man/man4/overlay_files.4 new file mode 100644 index 0000000000..b9e5387871 --- /dev/null +++ b/usr/src/man/man4/overlay_files.4 @@ -0,0 +1,187 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2015, Joyent, Inc. +.\" +.Dd Apr 13, 2015 +.Dt OVERLAY_FILES 4 +.Os +.Sh NAME +.Nm overlay_files +.Nd Overlay files plugin file format +.Sh DESCRIPTION +The +.Sy files +plugin provides a means for a dynamic overlay where the destinations are +determined based on a static description contained in a +.Sy JSON +file. +This manual describes the format of the file used by the +.Sy files/config +property. +To create and manage overlays with the +.Sy files +plugin, use +.Xr dladm 1M . +For more information on overlays, see +.Xr overlay 5 . +.Pp +Using the +.Sy files +module, a static and simple overlay network can be created. +This network does not support the use of +.Em broadcast +or +.Em multicast +traffic. +Both ARP and NDP traffic are proxied by the plugin itself. +In addition, the plugin allows for DHCP. +Instead of providing a traditional DHCP proxy, when an initial DHCP broadcast +goes out to a broadcast address, it will get rewritten to target a specific MAC +address. 
+The +.Sy files +plugin is useful as a proof of concept and for simple static networks +where addresses do not need to be reconfigured. +If more advanced topologies or more streamlined updates are required, consider +a different plugin. +.Pp +The file format is encoded as a series of +.Sy JSON +objects. +Each object has a key, which is a MAC address on the +.Sy overlay +network. +It has multiple values, some required, some optional, which describe various +properties. +The valid properties are: +.Bl -hang -width Ds +.It Sy ip +.Bd -filled -compact +The +.Sy ip +key indicates the IP address on the +.Sy underlay +network that houses the MAC address in question. +Packets directed to the MAC address will be encapsulated and sent to this +address. +This field is required. +.Pp +The value is a +.Em JSON String . +Both IPv4 and IPv6 addresses are supported and should be written out in their +traditional forms. +Follow the guidelines for writing addresses in +.Xr inet_aton 3SOCKET . +.Ed +.It Sy port +.Bd -filled -compact +The +.Sy port +key indicates the port on the +.Sy underlay +network that houses the MAC address in question. +This property is required if the encapsulation module requires a port for its +destination. +The value is a +.Em JSON Number . +.Ed +.It Sy arp +.Bd -filled -compact +The +.Sy arp +key stores the IPv4 address that corresponds to this MAC address on the +.Sy overlay +network. +This will be used to respond to ARP queries that would traditionally have been +received by the OS kernel. +If this address is not present, no IPv4 packets directed to this IP address will +be received by the network interface that has this MAC address, regardless of +what is configured on top of it. +.Pp +The value is a +.Em JSON String +and should be written out following the guidelines for IPv4 addresses in +.Xr inet_aton 3SOCKET .
+.Ed +.It Sy ndp +.Bd -filled -compact +The +.Sy ndp +key stores the IPv6 address that corresponds to this MAC address on the +.Sy overlay +network. +This will be used to respond to NDP queries that would traditionally have been +received by the OS kernel. +If this address is not present, no IPv6 packets directed to this IP address will +be received by the network interface that has this MAC address, regardless of +what is configured on top of it. +.Pp +The value is a +.Em JSON String +and should be written out following the guidelines for IPv6 addresses in +.Xr inet_aton 3SOCKET . +.Ed +.It Sy dhcp-proxy +.Bd -filled -compact +The +.Sy dhcp-proxy +key stores a MAC address that DHCP messages directed to a broadcast address get +rewritten to be sent to. +This can be viewed as a form of proxy DHCP, but is different in mechanism from a +traditional proxy. +The value is a +.Em JSON String +and should be written as a traditional MAC address string as described by +.Xr ether_aton 3SOCKET . +.Ed +.El +.Sh EXAMPLES +.Sy Example 1 +Sample configuration file +.Pp +This configuration file provides information for three different MAC +addresses. +Each MAC address has an entry which describes what its IPv4 +and IPv6 address is, as well as the IP address and port of the host on +the underlay network. +Finally, one host has a DHCP proxy entry to demonstrate how one might +configure DHCP. +.Bd -literal -offset indent +{ + "de:ad:be:ef:00:00": { + "arp": "10.55.55.2", + "ip": "10.88.88.69", + "ndp": "fe80::3", + "port": 4789 + }, + "de:ad:be:ef:00:01": { + "arp": "10.55.55.3", + "dhcp-proxy": "de:ad:be:ef:00:00", + "ip": "10.88.88.70", + "ndp": "fe80::4", + "port": 4789 + }, + "de:ad:be:ef:00:02": { + "arp": "10.55.55.4", + "ip": "10.88.88.71", + "ndp": "fe80::5", + "port": 4789 + } +} +.Ed +.Sh STABILITY +This file format is +.Sy committed ; +however, keys that are not listed here are reserved for future use. 
+.Sh SEE ALSO +.Xr dladm 1M , +.Xr overlay 5 diff --git a/usr/src/man/man5/Makefile b/usr/src/man/man5/Makefile index 9eb12d0164..ea0520872a 100644 --- a/usr/src/man/man5/Makefile +++ b/usr/src/man/man5/Makefile @@ -83,6 +83,7 @@ _MANFILES= Intro.5 \ ms.5 \ mutex.5 \ nfssec.5 \ + overlay.5 \ pam_allow.5 \ pam_authtok_check.5 \ pam_authtok_get.5 \ diff --git a/usr/src/man/man5/overlay.5 b/usr/src/man/man5/overlay.5 new file mode 100644 index 0000000000..41d1b18739 --- /dev/null +++ b/usr/src/man/man5/overlay.5 @@ -0,0 +1,521 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2015 Joyent, Inc. +.\" +.Dd Apr 09, 2015 +.Dt OVERLAY 5 +.Os +.Sh NAME +.Nm overlay +.Nd Overlay Devices +.Sh DESCRIPTION +Overlay devices are a GLDv3 device that allows users to create overlay +networks that can be used to form the basis of network virtualization +and software defined networking. +Overlay networks allow a single physical network, often called an +.Sy underlay +network, to provide the means for creating multiple logical, isolated, +and discrete layer two and layer three networks on top of it. +.Pp +Overlay devices are administered through +.Xr dladm 1M . +Overlay devices themselves cannot be plumbed up with +.Sy IP , +.Sy vnd , +or any other protocol. +Instead, like an +.Sy etherstub , +they allow for VNICs to be created on top of them. 
+Like an +.Sy etherstub , +an overlay device acts as a local switch; however, when it encounters a +non-local destination address, it instead looks up where it should send +the packet, encapsulates it, and sends it out another interface in the +system. +.Pp +A single overlay device encapsulates the logic to answer two different, +but related, questions: +.Pp +.Bl -enum -offset indent -compact +.It +How should a packet be transformed and put on the wire? +.It +Where should a transformed packet be sent? +.El +.Pp +Each of these questions is answered by a plugin. +The first question is answered by what's called an +.Em encapsulation plugin . +The second question is answered by what's called a +.Em search plugin . +Packets are encapsulated and decapsulated using the encapsulation plugin +by the kernel. +The search plugins are all user land plugins that are consumed by the +varpd service whose FMRI is +.Em svc:/network/varpd:default . +This separation allows for the kernel to be responsible for the data +path, while having the search plugins in userland allows the system to +provide a much more expressive interface. +.Ss Overlay Types +Overlay devices come in two different flavors, one where all packets are +always sent to a single address, the other, where the destination of a +packet varies based on the target MAC address of the packet. +This information is maintained in a +.Em target table , +which is independent and unique to each overlay device. +We call the plugins that send traffic to a single location, for example +a single unicast or multicast IP address, a +.Sy point to point +overlay and the overlay devices that can send traffic to different +locations based on the MAC address of that packet a +.Sy dynamic +overlay. +The plugin type is determined based on the type of the +.Sy search plugin . +These are all fully listed in the section +.Sx Plugins and their Properties . 
+.Ss Overlay Destination +Both encapsulation and search plugins define the kinds of destinations +that they know how to support. +An encapsulation plugin always has a single destination type that's +determined based on how the encapsulation is defined. +A search plugin, on the other hand, can support multiple combinations of +destinations. +A search plugin must support the destination type of the encapsulation +device. +The destination may require any of the following three pieces of +information, depending on the encapsulation plugin: +.Bl -hang -width Ds +.It Sy MAC Address +.Bd -filled -compact +An Ethernet MAC address is required to determine the destination. +.Ed +.It Sy IP Address +.Bd -filled -compact +An IP address is required. +Both IPv4 and IPv6 addresses are supported. +.Ed +.It Sy Port +.Bd -filled -compact +An IP protocol level (TCP, UDP, SCTP, etc.) port is required. +.Ed +.El +.Pp +The list of destination types that are supported by both the search and +encapsulation plugins is listed in the section +.Sx Plugins and their Properties . +.Ss varpd +The varpd service, mentioned above, is responsible for providing the +virtual ARP daemon. +Its responsibility is conceptually similar to ARP. +It runs all instances of search plugins in the system and is responsible +for answering the kernel's ARP-like questions for where packets should +be sent. +.Pp +The varpd service, svc:/network/varpd:default, must be enabled for +overlay devices to function. +If it is disabled while there are active devices, then most overlay +devices will not function correctly and likely will end up dropping +traffic. 
+.Sh PLUGINS AND PROPERTIES +Properties fall into three categories in the system: +.Bl -enum -offset indent -compact +.It +Generic properties all overlay devices have +.It +Properties specific to the encapsulation plugin +.It +Properties specific to the search plugin +.El +.Pp +Each property in the system has the following attributes, which mirror +the traditional +.Xr dladm 1M +link properties: +.Bl -hang -width Ds +.It Sy Name +.Bd -filled -compact +The name of a property is namespaced by its module and always structured +and referred to as module/property. +This allows for both an encapsulation and search plugin to have a +property with the same name. +Properties that are valid for all overlay devices and not specific to a +module do not generally use a module prefix. +.Pp +For example, the property +.Sy vxlan/listen_ip +is associated with the +.Sy vxlan +encapsulation module. +.Ed +.It Sy Type +.Bd -filled -compact +Each property in the system has a type. +.Xr dladm 1M +takes care of converting between the internal representation and a +value, but the type influences the acceptable input range. +The types are: +.Bl -hang -width Ds +.It Sy INT +A signed integer that is up to eight bytes long +.Pq Sy int64_t . +.It Sy UINT +An unsigned integer that is up to eight bytes long +.Pq Sy uint64_t . +.It Sy IP +Either an IPv4 or IPv6 address in traditional string form. +For example, 192.168.128.23 or 2001:470:8af4::1:1. +IPv4 addresses may also be encoded as IPv4-mapped IPv6 addresses. +.It Sy STRING +A string of ASCII or UTF-8 encoded characters terminated with a +.Sy NUL +byte. +The maximum string length, including the terminator, is currently +256 bytes. +.El +.Ed +.It Sy Permissions +.Bd -filled -compact +Each property has permissions associated with it, which indicate whether +the system considers them read-only properties or read-write properties. +A read-only property can never be updated once the device is created.
+This generally includes things like the overlay's encapsulation module. +.Ed +.It Sy Required +.Bd -filled -compact +This property indicates whether the property is required for the given +plugin. +If it is not specified during a call to +.Sy dladm create-overlay , +then the overlay cannot be successfully created. +Properties which have a +.Sy default +will use that value if one is not specified rather than cause the +overlay creation to fail. +.Ed +.It Sy Current Value +.Bd -filled -compact +The current value of a property, if the property has a value set. +Required properties always have a value set. +.Ed +.It Sy Default Value +.Bd -filled -compact +The default value is an optional part of a given property. +If a property does define a default value, then it will be used when an +overlay is created and no other value is given. +.Ed +.It Sy Value ranges +.Bd -filled -compact +Value ranges are an optional part of a given property. +They indicate a range or set of values that are valid and may be set for +a property. +A property may not declare such a range as it may be impractical or +unknown. +For example, most properties based on IP addresses will not +declare a range. +.Ed +.El +.Pp +The following sections describe both the modules and the properties that +exist for each module, noting their name, type, permissions, whether or +not they are required, and if there is a default value. +In addition, the effects of each property will be described. +.Ss Encapsulation Plugins +.Bl -hang -width Ds +.It Sy vxlan +The +.Sy vxlan +module is a UDP based encapsulation method. +It takes a frame that would be put on the wire, wraps it up in a VXLAN +header and places it in a UDP packet that gets sent out on the +underlying network. +For more details about the specific format of the VXLAN header, see +.Xr vxlan 7P . +.Pp +The +.Sy vxlan +module requires both an +.Sy IP address +and +.Sy port +to address it. 
+It has a 24-bit virtual network ID space, allowing for +virtual network identifiers that range from +.Sy 0 +- +.Sy 16777215 . +.Pp +The +.Sy vxlan +module has the following properties: +.Bl -hang -width Ds +.It Sy vxlan/listen_ip +.Bd -filled -compact +Type: +.Sy IP | +Permissions: +.Sy Read/Write | +.Sy Required +.Ed +.Bd -filled +The +.Sy vxlan/listen_ip +property determines the IP address that the system will accept VXLAN +encapsulated packets on for this overlay. +.Ed +.It Sy vxlan/listen_port +.Bd -filled -compact +Type: +.Sy UINT | +Permissions: +.Sy Read/Write | +.Sy Required +.Ed +.Bd -filled -compact +Default Value: +.Sy 4789 | +Range: +.Sy 0 - 65535 +.Ed +.Bd -filled +The +.Sy vxlan/listen_port +property determines the UDP port that the system will listen on for +VXLAN traffic for this overlay. +The default value is +.Sy 4789 , +the IANA assigned port for VXLAN. +.Ed +.El +.Pp +The +.Sy vxlan/listen_ip +and +.Sy vxlan/listen_port +properties determine how the system will accept VXLAN encapsulated +packets for this interface. +It does not determine the interface that packets will be sent out over. +Multiple overlays that all use VXLAN can share the same IP and port +combination, as the virtual network identifier can be used to tell the +different overlays apart. +.El +.Ss Search Plugins +Because search plugins may support multiple destinations, they may have +more properties listed than necessarily show up for a given overlay. +For example, the +.Sy direct +plugin supports destinations that are identified by both an IP address +and a port, or just an IP address. +In cases where the device is created over an overlay that only uses an +IP address for its destination, then it will not have the +.Sy direct/dest_port +property. +.Bl -hang -width Ds +.It Sy direct +The +.Sy direct +plugin is a point to point module that can be used to create an overlay +that forwards all non-local traffic to a single destination. 
+It supports destinations that are a combination of an +.Sy IP Address +and a +.Sy port . +.Pp +The +.Sy direct +plugin has the following properties: +.Bl -hang -width Ds +.It Sy direct/dest_ip +.Bd -filled -compact +Type: +.Sy IP | +Permissions: +.Sy Read/Write | +.Sy Required +.Ed +.Bd -filled +The +.Sy direct/dest_ip +property indicates the IP address that all traffic will be sent out. +Traffic will be sent out the corresponding interface based on +traditional IP routing rules and the configuration of the networking +stack of the global zone. +.Ed +.It Sy direct/dest_port +.Bd -filled -compact +Type: +.Sy UINT | +Permissions: +.Sy Read/Write | +.Sy Required +.Ed +.Bd -filled -compact +Default Value: +.Sy - | +Range: +.Sy 0 - 65535 +.Ed +.Bd -filled +The +.Sy direct/dest_port +property indicates the TCP or UDP port that all traffic will be directed +to. +.Ed +.El +.It Sy files +The +.Sy files +plugin implements a +.Sy dynamic +plugin that specifies where traffic should be sent based on a file. +It is a glorified version of /etc/ethers. +The +.Sy dynamic +plugin does not support broadcast or multicast traffic, but it has +support for proxy ARP, NDP, and DHCPv4. +For the full details of the file format, see +.Xr overlay_files 4 . +.Pp +The +.Sy files +plugin has the following property: +.Bl -hang -width Ds +.It Sy files/config +.Bd -filled -compact +Type: +.Sy String | +Permissions: +.Sy Read/Write | +.Sy Required +.Ed +.Bd -filled +The +.Sy files/config +property specifies an absolute path to a file to read. +The file is a JSON file that is formatted according to +.Xr overlay_files 4 . +.Ed +.El +.El +.Ss General Properties +Each overlay has the following properties which are used to give +additional information about the system. +None of these properties may be specified as part of a +.Sy dladm create-overlay , +instead they come from other arguments or from internal parts of the +system. 
+.Bl -hang -width Ds +.It Sy encap +.Bd -filled -compact +.Sy String | +Permissions: +.Sy Read Only +.Ed +.Bd -filled +The +.Sy encap +property contains the name of the encapsulation module that's in use. +.Ed +.It Sy mtu +.Bd -filled -compact +.Sy UINT | +Permissions: +.Sy Read/Write +.Ed +.Bd -filled -compact +Default Value: +.Sy 1400 | +Range: +.Sy 576 - 9000 +.Ed +.Bd -filled +The +.Sy mtu +property describes the maximum transmission unit of the overlay. +The default value is +.Sy 1400 +bytes, which ensures that in a traditional deployment with an MTU of +1500 bytes, the overhead that is added from encapsulation is all +accounted for. +It is the administrator's responsibility to ensure that +the device's MTU and the encapsulation overhead do not exceed that of +the interfaces that the encapsulated traffic will be sent out of. +.Pp +To modify the +.Sy mtu +property, use +.Sy dladm set-linkprop . +.Ed +.It Sy search +.Bd -filled -compact +.Sy String | +Permissions: +.Sy Read Only +.Ed +.Bd -filled +The +.Sy search +property contains the name of the search plugin that's in use. +.Ed +.It Sy varpd/id +.Bd -filled -compact +.Sy String | +Permissions: +.Sy Read Only +.Ed +.Bd -filled +The +.Sy varpd/id +property indicates the identifier which the +.Sy varpd +service uses for this overlay. +.Ed +.It Sy vnetid +.Bd -filled -compact +.Sy UINT | +Permissions: +.Sy Read/Write +.Ed +.Bd -filled +The +.Sy vnetid +property has the virtual network identifier that belongs to this overlay. +The valid range for the virtual network identifier depends on the +encapsulation engine. +.Ed +.El +.Sh FMA INTEGRATION +Overlay devices are wired into FMA, the illumos fault management +architecture, and generate error reports depending on the +.Sy search +plugin in use.
+Due to limitations in FMA today, when a single overlay +enters a degraded state, meaning that it cannot properly perform look +ups or another error occurred, then it degrades the overall +.Sy overlay +pseudo-device driver. +.Pp +For more fine-grained information about which overlay is actually in a +.Em degraded +state, one should run +.Sy dladm show-overlay -f . +In addition, for each overlay in a degraded state a more useful +diagnostic message is provided which describes the reason that caused +this overlay to enter into a degraded state. +.Pp +The overlay driver is self-healing. +If the problem corrects itself on its own, it will clear the fault on +the corresponding device. +.Sh SEE ALSO +.Xr dladm 1M , +.Xr overlay_files 4 , +.Xr vxlan 7P diff --git a/usr/src/man/man7p/Makefile b/usr/src/man/man7p/Makefile index 13cb58770d..9186b1ac20 100644 --- a/usr/src/man/man7p/Makefile +++ b/usr/src/man/man7p/Makefile @@ -16,30 +16,31 @@ include $(SRC)/Makefile.master -MANSECT= 7p - -MANFILES= arp.7p \ - dlpi.7p \ - icmp.7p \ - icmp6.7p \ - if_tcp.7p \ - inet.7p \ - inet6.7p \ - ip.7p \ - ip6.7p \ - ipsec.7p \ - ipsecah.7p \ - ipsecesp.7p \ - ndp.7p \ - pf_key.7p \ - rarp.7p \ - route.7p \ - routing.7p \ - sctp.7p \ - sip.7p \ - slp.7p \ - tcp.7p \ - udp.7p +MANSECT= 7p + +MANFILES= arp.7p \ + dlpi.7p \ + icmp.7p \ + icmp6.7p \ + if_tcp.7p \ + inet.7p \ + inet6.7p \ + ip.7p \ + ip6.7p \ + ipsec.7p \ + ipsecah.7p \ + ipsecesp.7p \ + ndp.7p \ + pf_key.7p \ + rarp.7p \ + route.7p \ + routing.7p \ + sctp.7p \ + sip.7p \ + slp.7p \ + tcp.7p \ + udp.7p \ + vxlan.7p MANLINKS= AH.7p \ ARP.7p \ @@ -51,7 +52,8 @@ MANLINKS= AH.7p \ SCTP.7p \ TCP.7p \ UDP.7p \ - if.7p + VXLAN.7p \ + if.7p ARP.7p := LINKSRC = arp.7p @@ -67,14 +69,16 @@ ESP.7p := LINKSRC = ipsecesp.7p NDP.7p := LINKSRC = ndp.7p -RARP.7p := LINKSRC = rarp.7p +RARP.7p := LINKSRC = rarp.7p -SCTP.7p := LINKSRC = sctp.7p +SCTP.7p := LINKSRC = sctp.7p TCP.7p := LINKSRC = tcp.7p UDP.7p := LINKSRC = udp.7p +VXLAN.7p := 
LINKSRC = vxlan.7p + .KEEP_STATE: include $(SRC)/man/Makefile.man diff --git a/usr/src/man/man7p/vxlan.7p b/usr/src/man/man7p/vxlan.7p new file mode 100644 index 0000000000..43c4756585 --- /dev/null +++ b/usr/src/man/man7p/vxlan.7p @@ -0,0 +1,130 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2015 Joyent, Inc. +.\" +.Dd Apr 10, 2015 +.Dt VXLAN 7P +.Os +.Sh NAME +.Nm VXLAN , +.Nm vxlan +.Nd Virtual eXtensible Local Area Network +.Sh SYNOPSIS +.In sys/vxlan.h +.Sh DESCRIPTION +.Nm +(RFC 7348) is a network encapsulation protocol that is used by +.Xr overlay 5 +devices. +A payload, commonly an Ethernet frame, is placed inside of a +UDP packet and prepended with an 8-byte +.Nm +header. +.Pp +The +.Nm +header contains two 32-bit words. +The first word is an 8-bit flags field followed by 24 reserved bits. +The second word is a 24-bit virtual network identifier followed by 8 +reserved bits. +The virtual network identifier identifies a unique +.Nm +and +is similar in concept to an IEEE 802.1Q VLAN identifier. +.Pp +The system provides access to +.Nm +through dladm overlays. +See +.Xr dladm 1M +and +.Xr overlay 5 +for more information. +.Pp +The +.In sys/vxlan.h +header provides information for working with the +.Nm +protocol. +The contents of this header are +.Sy uncommitted . +The header defines a structure that may be used to encode and decode a VXLAN +header. 
+It defines a packed structure type +.Sy vxlan_hdr_t +which represents the +.Nm +frame header and has the following members: +.Bd -literal + uint32_t vxlan_flags; /* flags in upper 8 bits */ + uint32_t vxlan_id; /* VXLAN ID in upper 24 bits */ +.Ed +.Sh EXAMPLES +.Sy Example 1 +Decoding a +.Nm +header +.Pp +The following example shows how to validate a +.Nm header. +For more information on this process, see RFC 7348. +.Bd -literal -offset indent +#include <sys/types.h> +#include <netinet/in.h> +#include <inttypes.h> +#include <sys/vxlan.h> + +\&... + +/* + * Validate the following bytes as a VXLAN header. If valid, return + * 0 and store the VXLAN identifier in *vidp. Otherwise, return an + * error. + */ +int +validate_vxlan(void *buf, int len, uint32_t *vidp) +{ + vxlan_hdr_t *hdr; + + if (len < sizeof (vxlan_hdr_t)) + return (EINVAL); + + hdr = buf; + if ((ntohl(hdr->vxlan_flags) & VXLAN_MAGIC) == 0) + return (EINVAL); + + *vidp = ntohl(hdr->vxlan_id) >> VXLAN_ID_SHIFT; + + return (0); +} +.Ed +.Sh STABILITY +The contents of +.In sys/vxlan.h +are +.Sy Uncommitted . +.Sh SEE ALSO +.Xr dladm 1M , +.Xr overlay 5 +.Rs +.%A Mahalingam, M. +.%A Dutt, D. +.%A Duda, K. +.%A Agarwal, P. +.%A Kreeger, L. +.%A Sridhar, T. +.%A Bursell, M. +.%A Wright, C. +.%T RFC 7348, Virtual eXtensible Local Area Network (VXLAN): A Framework +.%T for Overlaying Virtualized Layer 2 Networks over Layer 3 Networks +.%D August 2014 +.Re diff --git a/usr/src/pkg/manifests/system-network-overlay.p5m b/usr/src/pkg/manifests/system-network-overlay.p5m new file mode 100644 index 0000000000..8cdbd10775 --- /dev/null +++ b/usr/src/pkg/manifests/system-network-overlay.p5m @@ -0,0 +1,62 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source.
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 OmniOS Community Edition (OmniOSce) Association. +# + +<include global_zone_only_component> +set name=pkg.fmri value=pkg:/system/network/overlay@$(PKGVERS) +set name=pkg.summary value="illumos overlay driver" +set name=pkg.description value="Device driver implementing network overlays" +set name=info.classification \ + value=org.opensolaris.category.2008:Drivers/Networking +set name=variant.arch value=$(ARCH) +dir path=kernel group=sys +dir path=kernel/drv group=sys +dir path=kernel/drv/$(ARCH64) group=sys +file path=kernel/drv/$(ARCH64)/overlay group=sys +file path=kernel/drv/overlay.conf group=sys +dir path=kernel/overlay +dir path=kernel/overlay/$(ARCH64) +file path=kernel/overlay/$(ARCH64)/vxlan group=sys mode=0755 +dir path=lib +file path=lib/$(ARCH64)/libvarpd.so.1 mode=0755 \ + variant.opensolaris.zone=__NODEFAULT +file path=lib/libvarpd.so.1 mode=0755 variant.opensolaris.zone=__NODEFAULT +dir path=lib/svc +dir path=lib/svc/manifest group=sys +dir path=lib/svc/manifest/network group=sys +file path=lib/svc/manifest/network/varpd.xml mode=0444 +dir path=usr/lib +dir path=usr/lib/$(ARCH64) +dir path=usr/lib/varpd +dir path=usr/lib/varpd/$(ARCH64) +link path=usr/lib/varpd/$(ARCH64)/libvarpd_direct.so target=libvarpd_direct.so.1 +file path=usr/lib/varpd/$(ARCH64)/libvarpd_direct.so.1 +link path=usr/lib/varpd/$(ARCH64)/libvarpd_files.so target=libvarpd_files.so.1 +file path=usr/lib/varpd/$(ARCH64)/libvarpd_files.so.1 +link path=usr/lib/varpd/64 target=$(ARCH64) +link path=usr/lib/varpd/libvarpd_direct.so target=libvarpd_direct.so.1 +file path=usr/lib/varpd/libvarpd_direct.so.1 +link path=usr/lib/varpd/libvarpd_files.so target=libvarpd_files.so.1 +file path=usr/lib/varpd/libvarpd_files.so.1 +file path=usr/lib/varpd/varpd mode=0555 +dir path=usr/share/man +dir path=usr/share/man/man4 +file path=usr/share/man/man4/overlay_files.4 +dir 
path=usr/share/man/man5 +file path=usr/share/man/man5/overlay.5 +dir path=usr/share/man/man7p +link path=usr/share/man/man7p/VXLAN.7p target=vxlan.7p +file usr/share/man/man7p/vxlan.7p path=usr/share/man/man7p/vxlan.7p mode=0444 +driver name=overlay +license lic_CDDL license=lic_CDDL diff --git a/usr/src/test/util-tests/tests/dladm/Makefile b/usr/src/test/util-tests/tests/dladm/Makefile index e37ae56072..53fc2ce092 100644 --- a/usr/src/test/util-tests/tests/dladm/Makefile +++ b/usr/src/test/util-tests/tests/dladm/Makefile @@ -25,8 +25,6 @@ all: install: $(ROOTPROG) -lint: - clobber: clean clean: diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ index 1f686f2ca0..bb03cca4e2 100644 --- a/usr/src/uts/Makefile.targ +++ b/usr/src/uts/Makefile.targ @@ -177,6 +177,9 @@ $(ROOT_FONT_DIR)/%: $(OBJS_DIR)/% $(ROOT_MOD_DIR) $(ROOT_FONT_DIR) FRC $(ROOT_MAC_DIR)/%: $(OBJS_DIR)/% $(ROOT_MOD_DIR) $(ROOT_MAC_DIR) FRC $(INS.file) +$(ROOT_OVERLAY_DIR)/%: $(OBJS_DIR)/% $(ROOT_MOD_DIR) $(ROOT_OVERLAY_DIR) FRC + $(INS.file) + $(USR_DRV_DIR)/%: $(OBJS_DIR)/% $(USR_DRV_DIR) FRC $(INS.file) diff --git a/usr/src/uts/Makefile.uts b/usr/src/uts/Makefile.uts index a508c37287..841e84e40a 100644 --- a/usr/src/uts/Makefile.uts +++ b/usr/src/uts/Makefile.uts @@ -400,6 +400,7 @@ ROOT_DACF_DIR_32 = $(ROOT_MOD_DIR)/dacf ROOT_CRYPTO_DIR_32 = $(ROOT_MOD_DIR)/crypto ROOT_MAC_DIR_32 = $(ROOT_MOD_DIR)/mac ROOT_CC_DIR_32 = $(ROOT_MOD_DIR)/cc +ROOT_OVERLAY_DIR_32 = $(ROOT_MOD_DIR)/overlay ROOT_KICONV_DIR_32 = $(ROOT_MOD_DIR)/kiconv ROOT_KERN_DIR_64 = $(ROOT_MOD_DIR)/$(SUBDIR64) @@ -428,6 +429,7 @@ ROOT_DACF_DIR_64 = $(ROOT_MOD_DIR)/dacf/$(SUBDIR64) ROOT_CRYPTO_DIR_64 = $(ROOT_MOD_DIR)/crypto/$(SUBDIR64) ROOT_MAC_DIR_64 = $(ROOT_MOD_DIR)/mac/$(SUBDIR64) ROOT_CC_DIR_64 = $(ROOT_MOD_DIR)/cc/$(SUBDIR64) +ROOT_OVERLAY_DIR_64 = $(ROOT_MOD_DIR)/overlay/$(SUBDIR64) ROOT_KICONV_DIR_64 = $(ROOT_MOD_DIR)/kiconv/$(SUBDIR64) ROOT_KERN_DIR = $(ROOT_KERN_DIR_$(CLASS)) @@ -456,6 +458,7 @@ ROOT_DACF_DIR = 
$(ROOT_DACF_DIR_$(CLASS)) ROOT_CRYPTO_DIR = $(ROOT_CRYPTO_DIR_$(CLASS)) ROOT_MAC_DIR = $(ROOT_MAC_DIR_$(CLASS)) ROOT_CC_DIR = $(ROOT_CC_DIR_$(CLASS)) +ROOT_OVERLAY_DIR = $(ROOT_OVERLAY_DIR_$(CLASS)) ROOT_KICONV_DIR = $(ROOT_KICONV_DIR_$(CLASS)) ROOT_FIRMWARE_DIR = $(ROOT_MOD_DIR)/firmware @@ -475,6 +478,7 @@ ROOT_MOD_DIRS_32 += $(ROOT_CPU_DIR_32) $(ROOT_FONT_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_TOD_DIR_32) $(ROOT_DACF_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_CRYPTO_DIR_32) $(ROOT_MAC_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_CC_DIR_32) +ROOT_MOD_DIRS_32 += $(ROOT_OVERLAY_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_KICONV_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_FIRMWARE_DIR) @@ -568,7 +572,7 @@ PARALLEL_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \ $(MMU_KMODS) $(DACF_KMODS) $(EXPORT_KMODS) $(IPP_KMODS) \ $(CRYPTO_KMODS) $(PCBE_KMODS) \ $(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \ - $(BRAND_KMODS) $(KICONV_KMODS) $(CC_KMODS) \ + $(BRAND_KMODS) $(KICONV_KMODS) $(CC_KMODS) $(OVERLAY_KMODS) \ $(SOCKET_KMODS) KMODS = $(GENUNIX_KMODS) $(PARALLEL_KMODS) diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index b6729071bf..f87e659a2b 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -705,6 +705,11 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \ VNIC_OBJS += vnic_ctl.o vnic_dev.o +OVERLAY_OBJS += overlay.o overlay_fm.o overlay_mux.o overlay_plugin.o \ + overlay_prop.o overlay_target.o + +OVERLAY_VXLAN_OBJS += overlay_vxlan.o + SIMNET_OBJS += simnet.o IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index c3ca37feb5..870758e866 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -1004,6 +1004,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/npi/%.c $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/%.s $(COMPILE.s) -o $@ $< +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/%.c + 
$(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/plugins/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/pci-ide/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 58e6d95e99..5d42a69fa2 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -22,7 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright 2018, Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -388,7 +388,19 @@ udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max, if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))) return (def); - if (MBLKL(mp) < VXLAN_HDR_LEN) { + /* + * The following logic is VXLAN specific to get at the header, if we + * have formats, eg. GENEVE, then we should ignore this. + * + * The kernel overlay device often puts a first mblk_t for the data + * which is just the encap. If so, then we're going to use that and try + * to avoid a pull up. + */ + if (MBLKL(mp) == VXLAN_HDR_LEN) { + if (mp->b_cont == NULL) + return (def); + mp = mp->b_cont; + } else if (MBLKL(mp) < VXLAN_HDR_LEN) { return (def); } else { szused = VXLAN_HDR_LEN; diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index eca17349c3..dbcd9caea8 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent Inc. * Copyright (c) 2017, Joyent, Inc. 
*/ @@ -631,7 +632,7 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, cred_t *cred, int mode) { int err = EINVAL; - dls_dl_handle_t dlh = NULL; + dls_dl_handle_t dlh = NULL; dls_link_t *dlp = NULL; mac_perim_handle_t mph = NULL; dld_ioc_macprop_t *kprop; @@ -1327,7 +1328,7 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, { int ret = 0; mac_perim_handle_t mph = NULL; - dls_dl_handle_t dlh = NULL; + dls_dl_handle_t dlh = NULL; dls_link_t *dlp = NULL; dld_ioc_gettran_t *dgt = karg; @@ -1371,7 +1372,7 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, { int ret = 0; mac_perim_handle_t mph = NULL; - dls_dl_handle_t dlh = NULL; + dls_dl_handle_t dlh = NULL; dls_link_t *dlp = NULL; dld_ioc_tranio_t *dti = karg; uint8_t buf[256]; @@ -1424,7 +1425,7 @@ drv_ioc_getled(void *karg, intptr_t arg, int mode, cred_t *cred, { int ret = 0; mac_perim_handle_t mph = NULL; - dls_dl_handle_t dlh = NULL; + dls_dl_handle_t dlh = NULL; dls_link_t *dlp = NULL; dld_ioc_led_t *dil = karg; @@ -1470,7 +1471,7 @@ drv_ioc_setled(void *karg, intptr_t arg, int mode, cred_t *cred, { int ret = 0; mac_perim_handle_t mph = NULL; - dls_dl_handle_t dlh = NULL; + dls_dl_handle_t dlh = NULL; dls_link_t *dlp = NULL; dld_ioc_led_t *dil = karg; @@ -1585,7 +1586,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = { {SIMNET_IOC, "simnet", 0, NULL, 0}, {BRIDGE_IOC, "bridge", 0, NULL, 0}, {IPTUN_IOC, "iptun", 0, NULL, 0}, - {IBPART_IOC, "ibp", -1, NULL, 0} + {IBPART_IOC, "ibp", -1, NULL, 0}, + {OVERLAY_IOC, "overlay", 0, NULL, 0} }; #define DLDIOC_CNT \ (sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t)) diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index 605cb51bf7..167ed2b90f 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -4524,7 +4524,13 @@ mac_addr_len(mac_handle_t mh) boolean_t mac_is_vnic(mac_handle_t mh) { - return (((mac_impl_t 
*)mh)->mi_state_flags & MIS_IS_VNIC); + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0); +} + +boolean_t +mac_is_overlay(mac_handle_t mh) +{ + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0); } mac_handle_t diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index bfb41afe5e..0da404853c 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -605,6 +605,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg) * * TODO: Cleanup and tighten some of the assumptions. */ +boolean_t mac_check_overlay = B_TRUE; boolean_t mac_use_bw_heuristic = B_TRUE; static int mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) @@ -612,6 +613,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) uint64_t cpu_speed, bw = 0; int srings = 0; boolean_t bw_enabled = B_FALSE; + mac_client_impl_t *mcip = flent->fe_mcip; ASSERT(!(flent->fe_type & FLOW_USER)); if (flent->fe_resource_props.mrp_mask & MRP_MAXBW && @@ -639,7 +641,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) */ if (mac_soft_ring_enable) srings = srings * 2; + } else if (mac_check_overlay == B_TRUE && + (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) { + /* Is this a VNIC on an overlay? 
*/ + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; + if (mac_is_overlay(mh) == B_TRUE) { + srings = mac_rx_soft_ring_10gig_count; + } } + + } else { /* * Soft ring computation using CPU speed and specified diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index bfaf232d25..bcca602589 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -393,6 +393,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp) if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL)) mip->mi_state_flags |= MIS_IS_AGGR; + if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL)) + mip->mi_state_flags |= MIS_IS_OVERLAY; + mac_addr_factory_init(mip); mac_transceiver_init(mip); diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c new file mode 100644 index 0000000000..e43f3671b4 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -0,0 +1,2184 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay Devices + * + * Overlay devices provide a means for creating overlay networks, a means of + * multiplexing multiple logical, isolated, and discrete layer two and layer + * three networks on top of one physical network. + * + * In general, these overlay devices encapsulate the logic to answer two + * different questions: + * + * 1) How should I transform a packet to put it on the wire? + * 2) Where should I send a transformed packet? + * + * Each overlay device is presented to the user as a GLDv3 device. 
While the + * link itself cannot have an IP interface created on top of it, it allows for + * additional GLDv3 devices, such as a VNIC, to be created on top of it which + * can be plumbed up with IP interfaces. + * + * + * -------------------- + * General Architecture + * -------------------- + * + * The logical overlay device that a user sees in dladm(1M) is a combination of + * two different components that work together. The first component is this + * kernel module, which is responsible for answering question one -- how should + * I transform a packet to put it on the wire. + * + * The second component is what we call the virtual ARP daemon, or varpd. It is + * a userland component that is responsible for answering the second question -- + * Where should I send a transformed packet. Instances of the kernel overlay + * GLDv3 device ask varpd the question of where should a packet go. + * + * The split was done for a few reasons. Importantly, we wanted to keep the act + * of generating encapsulated packets in the kernel so as to ensure that the + * general data path was fast and also kept simple. On the flip side, while the + * question of where should something go may be simple, it may often be + * complicated and need to interface with several different external or + * distributed systems. In those cases, it's simpler to allow for the full + * flexibility of userland to be brought to bear to solve that problem and in + * general, the path isn't very common. + * + * The following is what makes up the logical overlay device that a user would + * create with dladm(1M). + * + * Kernel Userland + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * . +--------+ +--------+ +--------+ . . . + * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . . + * . +--------+ +--------+ +--------+ . . . + * . | | | . . . + * . | | | . . . + * . +------------+-----------+ . . . + * . | . . /dev/overlay . + * . +--------------+ . . . +------------+ . + * . | | . . . | | . 
+ * . | Overlay |======*=================| Virtual | . + * . | GLDv3 Device |========================| ARP Daemon | . + * . | | . . | | . + * . +--------------+ . . +------------+ . + * . | . . | . + * . | . . | . + * . +----------------+ . . +--------+ . + * . | Overlay | . . | varpd | . + * . | Encapsulation | . . | Lookup | . + * . | Plugin | . . | Plugin | . + * . +----------------+ . . +--------+ . + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * + * + * This image shows the two different components and where they live. + * Importantly, it also shows that both the kernel overlay device and the + * userland varpd support plugins. The plugins actually implement the + * things that users care about and the APIs have been designed to try to + * minimize the amount of things that a module writer needs to worry about. + * + * IDENTIFIERS + * + * Every overlay device is defined by a unique identifier which is the overlay + * identifier. Its purpose is similar to that of a VLAN identifier, it's a + * unique number that is used to differentiate between different entries on the + * wire. + * + * ENCAPSULATION + * + * An overlay encapsulation plugin is a kernel miscellaneous module whose + * purpose is to contain knowledge about how to transform packets to put them + * onto the wire and to take them off. An example of an encapsulation plugin is + * vxlan. It's also how support for things like nvgre or geneve would be brought + * into the system. + * + * Each encapsulation plugin defines a series of operation vectors and + * properties. For the full details on everything they should provide, please + * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible + * for telling the system what information is required to send a packet. 
For + * example, vxlan is defined to send everything over a UDP packet and therefore + * requires a port and an IP address, while nvgre on the other hand is its own + * IP type and therefore just requires an IP address. In addition, it also + * provides information about the kind of socket that should be created. This is + * used by the kernel multiplexor, more of that in the Kernel Components + * section. + * + * LOOKUPS + * + * The kernel communicates requests for lookups over the character device + * /dev/overlay. varpd is responsible for listening for requests on that device + * and answering them. The character device is specific to the target path and + * varpd. + * + * Much as the kernel overlay module handles the bulk of the scaffolding but + * leaves the important work to the encapsulation plugin, varpd provides a + * similar role and leaves the full brunt of lookups to a userland dynamic + * shared object which implements the logic of lookups. + * + * Each lookup plugin defines a series of operation vectors and properties. For + * the full details on everything that they should provide, please read + * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC + * address and asked to give an address on the physical network that it should + * be sent to. In addition, they handle questions related to how to handle + * things like broadcast and multicast traffic, etc. + * + * ---------- + * Properties + * ---------- + * + * A device from a dladm perspective has a unique set of properties that are + * combined from three different sources: + * + * 1) Generic properties that every overlay device has + * 2) Properties that are specific to the encapsulation plugin + * 3) Properties that are specific to the lookup plugin + * + * All of these are exposed in a single set of properties in dladm. Note that + * these are not necessarily traditional link properties. 
However, if something + * is both a traditional GLDv3 link property, say the MTU of a device, and a + * specific property here, then the driver ensures that all existing GLDv3 + * specific means of manipulating it are used and wraps up its private property + * interfaces to ensure that works. + * + * Properties in the second and third category are prefixed with the name of + * their module. For example, the vxlan encapsulation module has a property + * called the 'listen_ip'. This property would show up in dladm as + * 'vxlan/listen_ip'. This allows different plugins to both use similar names + * for similar properties and to also have independent name spaces so that + * overlapping names do not conflict with anything else. + * + * While the kernel combines both sets one and two into a single coherent view, + * it does not do anything with respect to the properties that are owned by the + * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in + * charge of bridging these two worlds into one magical experience for the user. + * It carries the burden of knowing about both overlay specific and varpd + * specific properties. Importantly, we want to maintain this distinction. We + * don't want to treat the kernel as an arbitrary key/value store for varpd and + * we want the kernel to own its own data and not have to ask userland for + * information that it owns. + * + * Every property in the system has the following attributes: + * + * o A name + * o A type + * o A size + * o Permissions + * o Default value + * o Valid value ranges + * o A value + * + * Everything except for the value is obtained by callers through the propinfo + * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX, + * currently 256 bytes. + * + * The following are the supported types of properties: + * + * OVERLAY_PROP_T_INT + * + * A signed integer, its length is 8 bytes, corresponding to an + * int64_t. 
+ * + * OVERLAY_PROP_T_UINT + * + * An unsigned integer, its length is 8 bytes, corresponding to a + * uint64_t. + * + * OVERLAY_PROP_T_IP + * + * A struct in6_addr, it has a fixed size. + * + * OVERLAY_PROP_T_STRING + * + * A null-terminated character string encoded in either ASCII or + * UTF-8. Note that the size of the string includes the null + * terminator. + * + * The next thing that we apply to a property is its permission. The permissions + * are put together by the bitwise or of the following flags and values. + * + * OVERLAY_PROP_PERM_REQ + * + * This indicates a required property. A property that is required + * must be set by a consumer before the device can be created. If a + * required property has a default value, this constraint is + * loosened because the default value defines the value. + * + * OVERLAY_PROP_PERM_READ + * + * This indicates that a property can be read. All properties will + * have this value set. + * + * OVERLAY_PROP_PERM_WRITE + * + * This indicates that a property can be written to and thus + * updated by userland. Properties that are only intended to + * display information, will not have OVERLAY_PROP_PERM_WRITE set. + * + * In addition, a few additional values are defined as a convenience to + * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of + * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second, + * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ, + * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a + * property should generally be a constant across its lifetime. + * + * A property may optionally have a default value. If it does have a default + * value, and that property is not set to be a different value, then the default + * value is inherited automatically. It also means that if the default value is + * acceptable, there is no need to set the value for a required property. 
For + * example, the vxlan module has the vxlan/listen_port property which is + * required, but has a default value of 4789 (the IANA assigned port). Because + * of that default value, there is no need for it to be set. + * + * Finally, a property may declare a list of valid values. These valid values + * are used for display purposes, they are not enforced by the broader system, + * but merely allow a means for the information to be communicated to the user + * through dladm(1M). Like a default value, this is optional. + * + * The general scaffolding does not do very much with respect to the getting and + * setting of properties. That is really owned by the individual plugins + * themselves. + * + * ----------------------------- + * Destinations and Plugin Types + * ----------------------------- + * + * Both encapsulation and lookup plugins define the kinds of destinations that + * they know how to support. There are three different pieces of information + * that can be used to address to a destination currently, all of which is + * summarized in the type overlay_point_t. Any combination of these is + * supported. + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * An Ethernet MAC address is required. + * + * OVERLAY_PLUGIN_D_IP + * + * An IP address is required. All IP addresses used by the overlay + * system are transmitted as IPv6 addresses. IPv4 addresses can be + * represented by using IPv4-mapped IPv6 addresses. + * + * OVERLAY_PLUGIN_D_PORT + * + * A TCP/UDP port is required. + * + * A kernel encapsulation plugin declares which of these that it requires, it's + * a static set. On the other hand, a userland lookup plugin can be built to + * support all of these or any combination thereof. It gets passed the required + * destination type, based on the kernel encapsulation method, and then it makes + * the determination as to whether or not it supports it. 
For example, the + * direct plugin can support either an IP or both an IP and a port, it simply + * doesn't display the direct/dest_port property in the cases where a port is + * not required to support this. + * + * The user lookup plugins have two different modes of operation which + * determines how they interact with the broader system and how look ups are + * performed. These types are: + * + * OVERLAY_TARGET_POINT + * + * A point to point plugin has a single static definition for where + * to send all traffic. Every packet in the system always gets sent + * to the exact same destination which is programmed into the + * kernel when the general device is activated. + * + * OVERLAY_TARGET_DYNAMIC + * + * A dynamic plugin does not have a single static definition. + * Instead, for each destination, the kernel makes an asynchronous + * request to varpd to determine where the packet should be routed, + * and if a specific destination is found, then that destination is + * cached in the overlay device's target cache. + * + * This distinction, while important for the general overlay device's operation, + * is not important to the encapsulation plugins. They don't need to know about + * any of these pieces. It's just a concern for varpd, the userland plugin, and + * the general overlay scaffolding. + * + * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not + * maintain a target cache, and instead just keeps track of the destination and + * always sends encapsulated packets to that address. When the target type is of + * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such + * destinations. These destinations are kept around in an instance of a + * reference hash that is specific to the given overlay device. Entries in the + * cache can be invalidated and replaced by varpd and its lookup plugins. 
+ * + * ---------------------------------- + * Kernel Components and Architecture + * ---------------------------------- + * + * There are multiple pieces inside the kernel that work together, there is the + * general overlay_dev_t structure, which is the logical GLDv3 device, but it + * itself has references to things like an instance of an encapsulation plugin, + * a pointer to a mux and a target cache. It can roughly be summarized in the + * following image: + * + * +------------------+ + * | global | + * | overlay list | + * | overlay_dev_list | + * +------------------+ + * | + * | +-----------------------+ +---------------+ + * +->| GLDv3 Device |----------->| GLDv3 Device | -> ... + * | overlay_dev_t | | overlay_dev_t | + * | | +---------------+ + * | | + * | mac_handle_t -----+---> GLDv3 handle to MAC + * | datalink_id_t -----+---> Datalink ID used by DLS + * | overlay_dev_flag_t ---+---> Device state + * | uint_t -----+---> Current device MTU + * | uint_t -----+---> In-progress RX operations + * | uint_t -----+---> In-progress TX operations + * | char[] -----+---> FMA degraded message + * | void * -----+---> plugin private data + * | overlay_target_t * ---+---------------------+ + * | overlay_plugin_t * ---+---------+ | + * +-----------------------+ | | + * ^ | | + * +--------------------+ | | | + * | Kernel Socket | | | | + * | Multiplexor | | | | + * | overlay_mux_t | | | | + * | | | | | + * | avl_tree_t -+--+ | | + * | uint_t -+--> socket family | | + * | uint_t -+--> socket type | | + * | uint_t -+--> socket protocol | | + * | ksocket_t -+--> I/O socket | | + * | struct sockaddr * -+--> ksocket address | | + * | overlay_plugin_t --+--------+ | | + * +--------------------+ | | | + * | | | + * +-------------------------+ | | | + * | Encap Plugin |<--+-----------+ | + * | overlay_plugin_t | | + * | | | + * | char * ---+--> plugin name | + * | overlay_plugin_ops_t * -+--> plugin downcalls | + * | char ** (props) ---+--> property list | + * | uint_t ---+--> 
id length | + * | overlay_plugin_flags_t -+--> plugin flags | + * | overlay_plugin_dest_t --+--> destination type v + * +-------------------------+ +-------------------------+ + * | Target Cache | + * | overlay_target_t | + * | | + * cache mode <--+- overlay_target_mode_t | + * dest type <--+- overlay_plugin_dest_t | + * cache flags <--+- overlay_target_flag_t | + * varpd id <--+- uint64_t | + * outstanding varpd reqs. <--+- uint_t | + * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t | + * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t | + * | +-------------------------+ + * +-----------------------+ + * | + * v + * +-------------------------------+ +------------------------+ + * | Target Entry |-->| Target Entry |--> ... + * | overlay_target_entry_t | | overlay_target_entry_t | + * | | +------------------------+ + * | | + * | overlay_target_entry_flags_t -+--> Entry flags + * | uint8_t[ETHERADDRL] ---+--> Target MAC address + * | overlay_target_point_t ---+--> Target underlay address + * | mblk_t * ---+--> outstanding mblk head + * | mblk_t * ---+--> outstanding mblk tail + * | size_t ---+--> outstanding mblk size + * +-------------------------------+ + * + * The primary entries that we care about are the overlay_dev_t, which + * correspond to each overlay device that is created with dladm(1M). Globally, + * these devices are maintained in a simple list_t which is protected with a + * lock. Hence, these include important information such as the mac_handle_t + * and a datalink_id_t which is used to interact with the broader MAC and DLS + * ecosystem. We also maintain additional information such as the current state, + * outstanding operations, the mtu, and importantly, the plugin's private data. + * This is the instance of an encapsulation plugin that gets created as part of + * creating an overlay device. Another aspect of this is that the overlay_dev_t + * also includes information with respect to FMA. 
For more information, see the + * FMA section. + * + * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin + * is the encapsulation plugin. This allows the device to make downcalls into it + * based on doing things like getting and setting properties. Otherwise, the + * plugin itself is a fairly straightforward entity. They are maintained in a + * (not pictured above) list. The plugins themselves mostly maintain things like + * the static list of properties, what kind of destination they require, and the + * operations vector. A given module may contain more if necessary. + * + * The next piece of the puzzle is the mux, or a multiplexor. The mux itself + * maintains a ksocket and it is through the mux that we send and receive + * message blocks. The mux represents a socket type and address, as well as a + * plugin. Multiple overlay_dev_t devices may then share the same mux. For + * example, consider the case where you have different instances of vxlan all on + * the same underlay network. These would all logically share the same IP + * address and port that packets are sent and received on; however, what differs + * is the decapsulation ID. + * + * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike + * a socket, we enable a direct callback on the ksocket. This means that + * whenever a message block chain is received, rather than sitting there and + * getting a callback in a context and kicking that back out to a taskq, the + * data instead comes into the callback function overlay_mux_recv(). + * + * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx + * function) to transmit. It receives encapsulated packets, decapsulates them to + * determine the overlay identifier, looks up the given device that matches that + * identifier, and then causes the broader MAC world to receive the packet with + * a call to mac_rx(). 
+ * + * Today, we don't do too much that's special with the ksocket; however, as + * hardware is gaining understanding for these encapsulation protocols, we'll + * probably want to think of better ways to get those capabilities passed down + * and potentially better ways to program receive filters so they get directly + * to us. Though, that's all fantasy future land. + * + * The next part of the puzzle is the target cache. The purpose of the target + * cache is to cache where we should send a packet on the underlay network, + * given its mac address. The target cache operates in two modes depending on + * whether the lookup module was declared to be OVERLAY_TARGET_POINT or + * OVERLAY_TARGET_DYNAMIC. + * + * In the case where the target cache has been programmed to be + * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t + * which has the destination to which we send everything, no matter the destination + * mac address. + * + * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things + * are much more interesting and as a result, more complicated. We primarily + * store lists of overlay_target_entry_t's which are stored in both an avl tree + * and a refhash_t. The primary look up path uses the refhash_t and the avl tree + * is only used for a few of the target ioctls used to dump data such that we + * can get a consistent iteration order for things like dladm show-overlay -t. + * The key that we use for the reference hashtable is based on the mac address + * in the cache and currently we just do a simple CRC32 to transform it into a + * hash. + * + * Each entry maintains a set of flags to indicate the current status of the + * request. The flags may indicate one of three states: that the current cache entry + * is valid, that the current cache entry has been directed to drop all output, + * and that the current cache entry is invalid and may be being looked up. 
In + * the case where it's valid, we just take the destination address and run with + * it. + * + * If it's invalid and a lookup has not been made, then we start the process + * that prepares a query that will make its way up to varpd. The cache + * entry maintains a message block chain of outstanding message blocks and a + * size. These lists are populated only when we don't know the answer as to + * where these should be sent. The size entry is used to cap the amount of + * outstanding data that we don't know the answer to. If we exceed a cap on the + * amount of outstanding data (currently 1 Mb), then we'll drop any additional + * packets. Once we get an answer indicating a valid destination, we transmit + * any outstanding data to that place. The full story on how we look that up + * is discussed in the section on the Target Cache Life Cycle. + * + * ------------------------ + * FMA and Degraded Devices + * ------------------------ + * + * Every kernel overlay device keeps track of its FMA state. Today in FMA we + * cannot represent partitions between resources nor can we represent that a + * given minor node of a pseudo device has failed -- if we degrade the overlay + * device, then the entire dev_info_t is degraded. However, we still want to be + * able to indicate to administrators that things may go wrong. + * + * To this end, we've added a notion of a degraded state to every overlay + * device. This state is primarily dictated by userland and it can happen for + * various reasons. Generally, because a userland lookup plugin has been + * partitioned, or something has gone wrong such that there is no longer any + * userland lookup module for a device, then we'll mark it degraded. + * + * As long as any of our minor instances is degraded, then we'll fire off the + * FMA event to note that. Once the last degraded instance is no longer + * degraded, then we'll end up telling FMA that we're all clean.
+ * + * To help administrators get a better sense of which of the various minor + * devices is wrong, we store the odd_fmamsg[] character array. This character + * array can be fetched with doing a dladm show-overlay -f. + * + * Note, that it's important that we do not update the link status of the + * devices. We want to remain up as much as possible. By changing the link in a + * degraded state, this may end up making things worse. We may still actually + * have information in the target cache and if we mark the link down, that'll + * result in not being able to use it. The reason being that this'll mark all + * the downstream VNICs down which will go to IP and from there we end up + * dealing with sadness. + * + * ----------------------- + * Target Cache Life Cycle + * ----------------------- + * + * This section only applies when we have a lookup plugin of + * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type + * OVERLAY_TARGET_POINT. + * + * While we got into the target cache in the general architecture section, it's + * worth going into more details as to how this actually works and showing some + * examples and state machines. Recall that a target cache entry basically has + * the following state transition diagram: + * + * Initial state + * . . . . . . first access . . . varpd lookup enqueued + * . . . + * . . . + * +-------+ . +----------+ . + * | No |------*---->| Invalid |-------*----+ + * | Entry | | Entry | | + * +-------+ +----------+ | + * varpd ^ ^ varpd | + * invalidate | | drop | + * . . . * * . . v + * +-------+ | | +---------+ + * | Entry |--->-----+ +----<----| Entry | + * | Valid |<----------*---------<----| Pending |->-+ varpd + * +-------+ . +---------+ * . . drop, but + * . varpd ^ | other queued + * . success | | entries + * +-----+ + * + * When the table is first created, it is empty. As we attempt to lookup entries + * and we find there is no entry at all, we'll create a new table entry for it. 
+ * At that point the entry is technically in an invalid state, that means that + * we have no valid data from varpd. In that case, we'll go ahead and queue the + * packet into the entry's pending chain, and queue a varpd lookup, setting the + * OVERLAY_ENTRY_F_PENDING flag in the process. + * + * If additional mblk_t's come in for this entry, we end up appending them to + * the tail of the chain, if and only if, we don't exceed the threshold for the + * amount of space they can take up. An entry remains pending until we get a + * varpd reply. If varpd replies with a valid result, we move to the valid + * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one + * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate. + * + * Once an entry is valid, it stays valid until user land tells us to invalidate + * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and + * OVERLAY_TARG_CACHE_SET respectively. + * + * If the lookup fails with a call to drop the packet, then the next state is + * determined by the state of the queue. If the set of outstanding entries is + * empty, then we just transition back to the invalid state. If instead, the + * set of outstanding entries is not empty, then we'll queue another entry and + * stay in the same state, repeating this until the number of requests is + * drained. + * + * The following images describe the flow of a given lookup and where the + * overlay_target_entry_t is at any given time. + * + * +-------------------+ + * | Invalid Entry | An entry starts off as an invalid entry + * | de:ad:be:ef:00:00 | and only exists in the target cache. + * +-------------------+ + * + * ~~~~ + * + * +---------------------+ + * | Global list_t | A mblk_t comes in for an entry. We + * | overlay_target_list | append it to the overlay_target_list. + * +---------------------+ + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry |--->...
+ * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +--------------------------+ + * | /dev/overlay minor state | User land said that it would look up an + * | overlay_target_hdl_t | entry for us. We remove it from the + * +--------------------------+ global list and add it to the handle's + * | outstanding list. + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry | + * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +-------------------+ + * | Valid Entry | varpd returned an answer with + * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache + * | 10.169.23.42:4789 | entry is now populated with a + * +-------------------+ destination and marked as valid + * + * + * The lookup mechanism is performed via a series of operations on the character + * pseudo-device /dev/overlay. The only thing that uses this device is the + * userland daemon varpd. /dev/overlay is a cloneable device, each open of it + * granting a new minor number which maintains its own state. We maintain this + * state so that way if an outstanding lookup was queued to something that + * crashed or closed its handle without responding, we can know about this and + * thus handle it appropriately. + * + * When a lookup is first created it's added to our global list of outstanding + * lookups. To service requests, userland is required to perform an ioctl to ask + * for a request. We will block it in the kernel a set amount of time waiting + * for a request. When we give a request to a given minor instance of the + * device, we remove it from the global list and append the request to the + * device's list of outstanding entries, for the reasons we discussed above. + * When a lookup comes in, we give user land a smaller amount of information + * specific to that packet, the overlay_targ_lookup_t. 
It includes a request id + * to identify this, and then the overlay id, the varpd id, the header and + * packet size, the source and destination mac address, the SAP, and any + * potential VLAN header. + * + * At that point, it stays in that outstanding list until one of two ioctls are + * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time, + * userland may also perform other operations. For example, it may use + * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth + * analysis of what to do beyond what we gave it initially. This is useful for + * providing proxy arp and the like. Finally, there are two other ioctls that + * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the + * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which + * causes us to encapsulate and send out the packet they've given us. + * + * + * Finally, through the target cache, several ioctls are provided to allow for + * interrogation and management of the cache. They allow for individual entries + * to be retrieved, set, or have the entire table flushed. For the full set of + * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h. + * + * ------------------ + * Sample Packet Flow + * ------------------ + * + * There's a lot of pieces here, hopefully an example of how this all fits + * together will help clarify and elucidate what's going on. We're going to + * first track an outgoing packet, eg. one that is sent from an IP interface on + * a VNIC on top of an overlay device, and then we'll look at what it means to + * respond to that. + * + * + * +----------------+ +--------------+ +------------------+ + * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches | + * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx | + * +----------------+ | VNIC device | | overlay_m_tx() | + * +--------------+ +------------------+ + * | + * . lookup . cache | + * . drop . 
miss v + * +---------+ . +--------+ . +------------------+ + * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk | + * | mblk_t | | lookup | | in the target | + * +---------+ | queued | | cache | + * ^ +--------+ +------------------+ + * on send | | | cache + * error . . * *. . lookup * . . hit + * | | success v + * | | +------------------+ + * +-----------------+ +--------------->| call plugin | + * | Send out | | ovpo_encap() to | + * | overlay_mux_t's |<----------------------------------| get encap mblk_t | + * | ksocket | +------------------+ + * +-----------------+ + * + * The receive end point looks a little different and looks more like: + * + * +------------------+ +----------------+ +-----------+ + * | mblk_t comes off |---->| enter netstack |--->| delivered |---+ + * | the physical | | IP stack | | to | * . . direct + * | device | +----------------+ | ksocket | | callback + * +------------------+ +-----------+ | + * . overlay id | + * . not found v + * +-----------+ . +-----------------+ +--------------------+ + * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() | + * | mblk_t | | ovpo_decap() to | +--------------------+ + * +-----------+ | decap mblk_t | + * +-----------------+ + * | + * * . . overlay id + * v found + * +--------+ +----------------+ + * | adjust |----->| call mac_rx | + * | mblk_t | | on original | + * +--------+ | decaped packet | + * +----------------+ + * + * ------------------ + * Netstack Awareness + * ------------------ + * + * In the above image we note that this enters a netstack. Today the only + * netstack that can be is the global zone as the overlay driver itself is not + * exactly netstack aware. What this really means is that varpd cannot run in a + * non-global zone and an overlay device cannot belong to a non-global zone. 
+ * Non-global zones can still have a VNIC assigned to them that's been created + * over the overlay device the same way they would if it had been created over + * an etherstub or a physical device. + * + * The majority of the work to make it netstack aware is straightforward and the + * biggest thing is to create a netstack module that allows us to hook into + * netstack (and thus zone) creation and destruction. From there, we need to + * amend the target cache lookup routines that we discussed earlier to not have + * a global outstanding list and a global list of handles, but rather, one per + * netstack. + * + * For the mux, we'll need to open the ksocket in the context of the zone, we + * can likely do this with a properly composed credential, but we'll need to do + * some more work on that path. Finally, we'll want to make sure the dld ioctls + * are aware of the zoneid of the caller and we use that appropriately and store + * it in the overlay_dev_t. + * + * ----------- + * GLDv3 Notes + * ----------- + * + * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more + * relevant and other parts are much less relevant for us. For example, the + * GLDv3 is used to toggle the device being put into and out of promiscuous + * mode, to program MAC addresses for unicast and multicast hardware filters. + * Today, an overlay device doesn't have a notion of promiscuous mode nor does + * it have a notion of unicast and multicast addresses programmed into the + * device. Instead, for the purposes of the hardware filter, we don't do + * anything and just always accept new addresses being added and removed. + * + * If the GLDv3 start function has not been called, then we will not use this + * device for I/O purposes. Any calls to transmit or receive should be dropped, + * though the GLDv3 guarantees us that transmit will not be called without + * calling start. Similarly, once stop is called, then no packets can be dealt + * with. 
+ * + * Today we don't support the stat interfaces, though there's no good reason + * that we shouldn't assemble some of the stats based on what we have in the + * future. + * + * When it comes to link properties, many of the traditional link properties do + * not apply and many others MAC handles for us. For example, we don't need to + * implement anything for overlay_m_getprop() to deal with returning the MTU, as + * MAC never calls into us for that. As such, there isn't much of anything to + * support in terms of properties. + * + * Today, we don't support any notion of hardware capabilities. However, if + * future NIC hardware or other changes to the system cause it to make sense for + * us to emulate logical groups, then we should do that. However, we still do + * implement a capab function so that we can identify ourselves as an overlay + * device to the broader MAC framework. This is done mostly so that a device + * created on top of us can have fanout rings as we don't try to lie about a + * speed for our device. + * + * The other question is what should be done for a device's MTU and margin. We + * set our minimum supported MTU to be the minimum value that an IP network may + * be set to 576 -- which mimics what an etherstub does. On the flip side, we + * have our upper bound set to 8900. This value comes from the fact that a lot + * of jumbo networks use their maximum as 9000. As such, we want to reserve 100 + * bytes, which isn't exactly the most accurate number, but it'll be good enough + * for now. Because of that, our default MTU off of these devices is 1400, as + * the default MTU for everything is usually 1500 or whatever the underlying + * device is at; however, this is a bit simpler than asking the netstack what + * are all the IP interfaces at. It also calls into question how PMTU and PMTU + * discovery should work here. 
The challenge, especially for + * OVERLAY_TARGET_DYNAMIC is that the MTU to any of the places will vary and it's + * not clear that if you have a single bad entry that the overall MTU should be + * lowered. Instead, we should figure out a better way of determining these + * kinds of PMTU errors and appropriately alerting the administrator via FMA. + * + * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether + * or not the underlying encapsulation device supports VLAN tags. If it does, + * then we'll set the margin to allow for it, otherwise, we will not. + */ + +#include <sys/conf.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/policy.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/ddifm.h> + +#include <sys/dls.h> +#include <sys/dld_ioc.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/mac_ether.h> +#include <sys/vlan.h> + +#include <sys/overlay_impl.h> + +dev_info_t *overlay_dip; +static kmutex_t overlay_dev_lock; +static list_t overlay_dev_list; +static uint8_t overlay_macaddr[ETHERADDRL] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + +typedef enum overlay_dev_prop { + OVERLAY_DEV_P_MTU = 0, + OVERLAY_DEV_P_VNETID, + OVERLAY_DEV_P_ENCAP, + OVERLAY_DEV_P_VARPDID +} overlay_dev_prop_t; + +#define OVERLAY_DEV_NPROPS 4 +static const char *overlay_dev_props[] = { + "mtu", + "vnetid", + "encap", + "varpd/id" +}; + +#define OVERLAY_MTU_MIN 576 +#define OVERLAY_MTU_DEF 1400 +#define OVERLAY_MTU_MAX 8900 + +overlay_dev_t * +overlay_hold_by_dlid(datalink_id_t id) +{ + overlay_dev_t *o; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (id == o->odd_linkid) { + mutex_enter(&o->odd_lock); + o->odd_ref++; +
mutex_exit(&o->odd_lock); + mutex_exit(&overlay_dev_lock); + return (o); + } + } + + mutex_exit(&overlay_dev_lock); + return (NULL); +} + +void +overlay_hold_rele(overlay_dev_t *odd) +{ + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_ref > 0); + odd->odd_ref--; + mutex_exit(&odd->odd_lock); +} + +void +overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) + odd->odd_rxcount++; + if (flag & OVERLAY_F_IN_TX) + odd->odd_txcount++; + odd->odd_flags |= flag; +} + +void +overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + boolean_t signal = B_FALSE; + + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) { + ASSERT(odd->odd_rxcount > 0); + odd->odd_rxcount--; + if (odd->odd_rxcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_RX; + } + } + if (flag & OVERLAY_F_IN_TX) { + ASSERT(odd->odd_txcount > 0); + odd->odd_txcount--; + if (odd->odd_txcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_TX; + } + } + + if (signal == B_TRUE) + cv_broadcast(&odd->odd_iowait); +} + +static void +overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + while (odd->odd_flags & flag) { + cv_wait(&odd->odd_iowait, &odd->odd_lock); + } +} + +void +overlay_dev_iter(overlay_dev_iter_f func, void *arg) +{ + overlay_dev_t *odd; + + mutex_enter(&overlay_dev_lock); + for (odd = list_head(&overlay_dev_list); odd != NULL; + odd = list_next(&overlay_dev_list, odd)) { + if (func(odd, arg) != 0) { + mutex_exit(&overlay_dev_lock); + return; + } + } + mutex_exit(&overlay_dev_lock); +} + +/* ARGSUSED */ +static int +overlay_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + return (ENOTSUP); +} + +static int +overlay_m_start(void *arg) +{ + overlay_dev_t 
*odd = arg; + overlay_mux_t *mux; + int ret, domain, family, prot; + struct sockaddr_storage storage; + socklen_t slen; + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) { + mutex_exit(&odd->odd_lock); + return (EAGAIN); + } + mutex_exit(&odd->odd_lock); + + ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain, + &family, &prot, (struct sockaddr *)&storage, &slen); + if (ret != 0) + return (ret); + + mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, + (struct sockaddr *)&storage, slen, &ret); + if (mux == NULL) + return (ret); + + overlay_mux_add_dev(mux, odd); + odd->odd_mux = mux; + mutex_enter(&odd->odd_lock); + ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX)); + odd->odd_flags |= OVERLAY_F_IN_MUX; + mutex_exit(&odd->odd_lock); + + return (0); +} + +static void +overlay_m_stop(void *arg) +{ + overlay_dev_t *odd = arg; + + /* + * The MAC Perimeter is held here, so we don't have to worry about + * synchronizing this with respect to metadata operations. + */ + mutex_enter(&odd->odd_lock); + VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX); + VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP)); + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + overlay_mux_close(odd->odd_mux); + odd->odd_mux = NULL; + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_IN_MUX; + odd->odd_flags &= ~OVERLAY_F_MDDROP; + VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0); + mutex_exit(&odd->odd_lock); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_promisc(void *arg, boolean_t on) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement. 
+ */ +/* ARGSUSED */ +static int +overlay_m_unicast(void *arg, const uint8_t *macaddr) +{ + return (0); +} + +mblk_t * +overlay_m_tx(void *arg, mblk_t *mp_chain) +{ + overlay_dev_t *odd = arg; + mblk_t *mp, *ep; + int ret; + ovep_encap_info_t einfo; + struct msghdr hdr; + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + freemsgchain(mp_chain); + return (NULL); + } + overlay_io_start(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + + bzero(&hdr, sizeof (struct msghdr)); + + bzero(&einfo, sizeof (ovep_encap_info_t)); + einfo.ovdi_id = odd->odd_vid; + mp = mp_chain; + while (mp != NULL) { + socklen_t slen; + struct sockaddr_storage storage; + + mp_chain = mp->b_next; + mp->b_next = NULL; + ep = NULL; + + ret = overlay_target_lookup(odd, mp, + (struct sockaddr *)&storage, &slen); + if (ret != OVERLAY_TARGET_OK) { + if (ret == OVERLAY_TARGET_DROP) + freemsg(mp); + mp = mp_chain; + continue; + } + + hdr.msg_name = &storage; + hdr.msg_namelen = slen; + + ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, + &einfo, &ep); + if (ret != 0 || ep == NULL) { + freemsg(mp); + goto out; + } + + ASSERT(ep->b_cont == mp || ep == mp); + ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); + if (ret != 0) + goto out; + + mp = mp_chain; + } + +out: + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + return (mp_chain); +} + +/* ARGSUSED */ +static void +overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) +{ + miocnak(q, mp, 0, ENOTSUP); +} + +/* ARGSUSED */ +static boolean_t +overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + /* + * Tell MAC we're an overlay. 
+ */ + if (cap == MAC_CAPAB_OVERLAY) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static int +overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + uint32_t mtu, old; + int err; + overlay_dev_t *odd = arg; + + if (pr_num != MAC_PROP_MTU) + return (ENOTSUP); + + bcopy(pr_val, &mtu, sizeof (mtu)); + if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) + return (EINVAL); + + mutex_enter(&odd->odd_lock); + old = odd->odd_mtu; + odd->odd_mtu = mtu; + err = mac_maxsdu_update(odd->odd_mh, mtu); + if (err != 0) + odd->odd_mtu = old; + mutex_exit(&odd->odd_lock); + + return (err); +} + +/* ARGSUSED */ +static int +overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static void +overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + if (pr_num != MAC_PROP_MTU) + return; + + mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); + mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); +} + +static mac_callbacks_t overlay_m_callbacks = { + .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | + MC_PROPINFO), + .mc_getstat = overlay_m_stat, + .mc_start = overlay_m_start, + .mc_stop = overlay_m_stop, + .mc_setpromisc = overlay_m_promisc, + .mc_multicst = overlay_m_multicast, + .mc_unicst = overlay_m_unicast, + .mc_tx = overlay_m_tx, + .mc_ioctl = overlay_m_ioctl, + .mc_getcapab = overlay_m_getcapab, + .mc_getprop = overlay_m_getprop, + .mc_setprop = overlay_m_setprop, + .mc_propinfo = overlay_m_propinfo +}; + +static boolean_t +overlay_valid_name(const char *name, size_t buflen) +{ + size_t actlen; + int err, i; + + for (i = 0; i < buflen; i++) { + if (name[i] == '\0') + break; + } + + if (i == 0 || i == buflen) + return (B_FALSE); + actlen = i; + if (strchr(name, '/') != NULL) + return (B_FALSE); + if (u8_validate((char 
*)name, actlen, NULL, + U8_VALIDATE_ENTIRE, &err) < 0) + return (B_FALSE); + return (B_TRUE); +} + +/* + * Handle the overlay create ioctl: validate the encapsulation plugin name and + * virtual network id, allocate the overlay_dev_t, and register it with MAC + * and DLS. Returns 0 on success or an errno value on failure. + */ +/* ARGSUSED */ +static int +overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int err; + uint64_t maxid; + overlay_dev_t *odd, *o; + mac_register_t *mac; + overlay_ioc_create_t *oicp = karg; + + if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE) + return (EINVAL); + + odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP); + odd->odd_linkid = oicp->oic_linkid; + odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap); + if (odd->odd_plugin == NULL) { + kmem_free(odd, sizeof (overlay_dev_t)); + return (ENOENT); + } + err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd, + &odd->odd_pvoid); + if (err != 0) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + + /* + * Make sure that our virtual network id is valid for the given plugin + * that we're working with. + */ + ASSERT(odd->odd_plugin->ovp_id_size <= 8); + maxid = UINT64_MAX; + if (odd->odd_plugin->ovp_id_size != 8) + maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL; + if (oicp->oic_vnetid > maxid) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + odd->odd_vid = oicp->oic_vnetid; + + mac = mac_alloc(MAC_VERSION); + if (mac == NULL) { + /* + * overlay_dev_lock is not taken until the duplicate check + * further down, so it must not be dropped on this path. 
+ */ + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + + mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + mac->m_driver = odd; + mac->m_dip = overlay_dip; + mac->m_dst_addr = NULL; + mac->m_callbacks = &overlay_m_callbacks; + mac->m_pdata = NULL; + mac->m_pdata_size = 0; + + mac->m_priv_props = NULL; + + /* Let mac handle this itself.
*/ + mac->m_instance = (uint_t)-1; + + /* + * There is no real source address that should be used here, but saying + * that we're not ethernet is going to cause its own problems. At the + * end of the day, this is fine. + */ + mac->m_src_addr = overlay_macaddr; + + /* + * Start with the default MTU as the max SDU. If the MTU is changed, the + * SDU will be changed to reflect that. + */ + mac->m_min_sdu = 1; + mac->m_max_sdu = OVERLAY_MTU_DEF; + mac->m_multicast_sdu = 0; + + /* + * The underlying device doesn't matter, instead this comes from the + * encapsulation protocol and whether or not they allow VLAN tags. + */ + if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) { + mac->m_margin = VLAN_TAGSZ; + } else { + mac->m_margin = 0; + } + + /* + * Today, we have no MAC virtualization, it may make sense in the future + * to go ahead and emulate some subset of this, but it doesn't today. + */ + mac->m_v12n = MAC_VIRT_NONE; + + /* + * Reject a duplicate datalink id, or a duplicate (plugin, vnetid) pair. + * The mac_register_t has not been consumed yet on these paths, so it + * must be freed here to avoid leaking it. 
+ */ + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (o->odd_linkid == oicp->oic_linkid) { + mutex_exit(&overlay_dev_lock); + mac_free(mac); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + + if (o->odd_vid == oicp->oic_vnetid && + o->odd_plugin == odd->odd_plugin) { + mutex_exit(&overlay_dev_lock); + mac_free(mac); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + } + + err = mac_register(mac, &odd->odd_mh); + mac_free(mac); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + err = dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + (void)
mac_unregister(odd->odd_mh); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); + odd->odd_ref = 0; + odd->odd_flags = 0; + list_insert_tail(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int i, ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_activate_t *oiap = karg; + overlay_ioc_propinfo_t *infop; + overlay_ioc_prop_t *oip; + overlay_prop_handle_t phdl; + + odd = overlay_hold_by_dlid(oiap->oia_linkid); + if (odd == NULL) + return (ENOENT); + + infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); + oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); + phdl = (overlay_prop_handle_t)infop; + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EEXIST); + } + mutex_exit(&odd->odd_lock); + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + const char *pname = odd->odd_plugin->ovp_props[i]; + bzero(infop, sizeof (overlay_ioc_propinfo_t)); + overlay_prop_init(phdl); + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + + if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) + continue; + bzero(oip, sizeof (overlay_ioc_prop_t)); + oip->oip_size = sizeof (oip->oip_value); + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, 
+ pname, oip->oip_value, &oip->oip_size); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + if (oip->oip_size == 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EINVAL); + } + } + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ENXIO); + } + + ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0); + odd->odd_flags |= OVERLAY_F_ACTIVATED; + + /* + * Now that we've activated ourselves, we should indicate to the world + * that we're up. Note that we may not be able to perform lookups at + * this time, but our notion of being 'up' isn't dependent on that + * ability. + */ + mac_link_update(odd->odd_mh, LINK_STATE_UP); + mutex_exit(&odd->odd_lock); + + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + overlay_ioc_delete_t *oidp = karg; + overlay_dev_t *odd; + datalink_id_t tid; + int ret; + + odd = overlay_hold_by_dlid(oidp->oid_linkid); + if (odd == NULL) { + return (ENOENT); + } + + mutex_enter(&odd->odd_lock); + /* If we're not the only hold, we're busy */ + if (odd->odd_ref != 1) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + if (odd->odd_flags & OVERLAY_F_IN_MUX) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + /* + * To remove this, we need to first remove it from dls and then remove + * it from mac. 
The act of removing it from mac will check if there are + * devices on top of this, eg. vnics. If there are, then that will fail + * and we'll have to go through and recreate the dls entry. Only after + * mac_unregister has succeeded, then we'll go through and actually free + * everything and drop the dev lock. + */ + ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE); + if (ret != 0) { + overlay_hold_rele(odd); + return (ret); + } + + ASSERT(oidp->oid_linkid == tid); + ret = mac_disable(odd->odd_mh); + if (ret != 0) { + (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + overlay_hold_rele(odd); + return (ret); + } + + overlay_target_quiesce(odd->odd_target); + + mutex_enter(&overlay_dev_lock); + list_remove(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + cv_destroy(&odd->odd_iowait); + mutex_destroy(&odd->odd_lock); + overlay_target_free(odd); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_nprops_t *on = karg; + + odd = overlay_hold_by_dlid(on->oipn_linkid); + if (odd == NULL) + return (ENOENT); + on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS; + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg) +{ + overlay_prop_handle_t phdl = arg; + overlay_prop_set_range_str(phdl, opp->ovp_name); + return (0); +} + +static int +overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id) +{ + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], name) == 0) { + *id = i; + return (0); + } + } + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) { + *id = i + OVERLAY_DEV_NPROPS; + return 
(0); + } + } + + return (ENOENT); +} + +static void +overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl) +{ + uint32_t def; + mac_propval_range_t range; + uint_t perm; + + ASSERT(MAC_PERIM_HELD(odd->odd_mh)); + + bzero(&range, sizeof (mac_propval_range_t)); + range.mpr_count = 1; + if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, + sizeof (def), &range, &perm) != 0) + return; + + if (perm == MAC_PROP_PERM_READ) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + else if (perm == MAC_PROP_PERM_WRITE) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); + else if (perm == MAC_PROP_PERM_RW) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, + range.mpr_range_uint32[0].mpur_max); +} + +/* ARGSUSED */ +static int +overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + int ret; + mac_perim_handle_t mph; + uint_t propid = UINT_MAX; + overlay_ioc_propinfo_t *oip = karg; + overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; + + odd = overlay_hold_by_dlid(oip->oipi_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_prop_init(phdl); + mac_perim_enter_by_mh(odd->odd_mh, &mph); + + /* + * If the id is -1, then the property that we're looking for is named in + * oipi_name and we should fill in its id. Otherwise, we've been given + * an id and we need to turn that into a name for our plugin's sake. The + * id is our own fabrication for property discovery. + */ + if (oip->oipi_id == -1) { + /* + * Determine if it's a known generic property or it belongs to a + * module by checking against the list of known names. 
+ */ + oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, + &propid)) != 0) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + oip->oipi_id = propid; + if (propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + oip->oipi_name, phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + + } + } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; + + if (id >= odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + odd->odd_plugin->ovp_props[id], phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oipi_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oipi_id >= 0); + propid = oip->oipi_id; + (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], + sizeof (oip->oipi_name)); + } + + switch (propid) { + case OVERLAY_DEV_P_MTU: + overlay_i_propinfo_mtu(odd, phdl); + break; + case OVERLAY_DEV_P_VNETID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + case OVERLAY_DEV_P_ENCAP: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); + overlay_prop_set_nodefault(phdl); + overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); + break; + case OVERLAY_DEV_P_VARPDID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + default: + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ENOENT); + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_getprop(void *karg, 
intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + int ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_prop_t *oip = karg; + uint_t propid, mtu; + + odd = overlay_hold_by_dlid(oip->oip_linkid); + if (odd == NULL) + return (ENOENT); + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + oip->oip_size = OVERLAY_PROP_SIZEMAX; + oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if (oip->oip_id == -1) { + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) + break; + } + + /* + * The name matched none of the generic device properties, so + * hand it to the encapsulation plugin. This test has to follow + * the loop; inside the loop body i is always less than + * OVERLAY_DEV_NPROPS. + */ + if (i == OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_getprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, &oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + + propid = i; + } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; + + /* Valid plugin property indices are 0 .. ovp_nprops - 1. */ + if (id >= odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, + odd->odd_plugin->ovp_props[id], oip->oip_value, + &oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oip_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oip_id >= 0); + propid = oip->oip_id; + } + + ret = 0; + switch (propid) { + case OVERLAY_DEV_P_MTU: + /* + * The MTU is always set and retrieved through MAC, to allow for + * MAC to do whatever it wants, as really that property belongs + * to MAC. This is important for things where vnics have hold on + * the MTU. + */ + mac_sdu_get(odd->odd_mh, NULL, &mtu); + bcopy(&mtu, oip->oip_value, sizeof (uint_t)); + oip->oip_size = sizeof (uint_t); + break; + case OVERLAY_DEV_P_VNETID: + /* + * While it's read-only while inside of a mux, we're not in a + * context that can guarantee that.
Therefore we always grab the + * overlay_dev_t's odd_lock. + */ + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint64_t); + break; + case OVERLAY_DEV_P_ENCAP: + oip->oip_size = strlcpy((char *)oip->oip_value, + odd->odd_plugin->ovp_name, oip->oip_size); + break; + case OVERLAY_DEV_P_VARPDID: + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + const uint64_t val = odd->odd_target->ott_id; + bcopy(&val, oip->oip_value, sizeof (uint64_t)); + oip->oip_size = sizeof (uint64_t); + } else { + oip->oip_size = 0; + } + mutex_exit(&odd->odd_lock); + break; + default: + ret = ENOENT; + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); +} + +static void +overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) +{ + mutex_enter(&odd->odd_lock); + + /* Simple case, not active */ + if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + return; + } + + /* + * In the hard case, we need to set the drop flag, quiesce I/O and then + * we can go ahead and do everything. 
+ */ + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + mutex_enter(&odd->odd_lock); + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + overlay_mux_add_dev(odd->odd_mux, odd); + + /* + * The device is back in the mux under its new id, so stop dropping + * traffic. OVERLAY_F_IN_MUX stays set because the device was just + * re-added above. + */ + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); + odd->odd_flags &= ~OVERLAY_F_MDDROP; + mutex_exit(&odd->odd_lock); +} + +/* + * OVERLAY_IOC_SETPROP handler: set one of the generic device properties, or + * forward the request to the encapsulation plugin when the property is not a + * generic one. Properties cannot be changed once the device is activated. + */ +/* ARGSUSED */ +static int +overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + int ret; + overlay_dev_t *odd; + overlay_ioc_prop_t *oip = karg; + uint_t propid = UINT_MAX; + mac_perim_handle_t mph; + uint64_t maxid, *vidp; + + if (oip->oip_size > OVERLAY_PROP_SIZEMAX) + return (EINVAL); + + odd = overlay_hold_by_dlid(oip->oip_linkid); + if (odd == NULL) + return (ENOENT); + + oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + /* Drop the hold taken by overlay_hold_by_dlid(). */ + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_exit(&odd->odd_lock); + if (oip->oip_id == -1) { + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) + break; + } + + /* + * The name matched none of the generic device properties, so + * hand it to the encapsulation plugin. This test has to follow + * the loop; inside the loop body i is always less than + * OVERLAY_DEV_NPROPS. + */ + if (i == OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_setprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + + propid = i; + } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; + + /* Valid plugin property indices are 0 .. ovp_nprops - 1. */ + if (id >= odd->odd_plugin->ovp_nprops) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid, + odd->odd_plugin->ovp_props[id], oip->oip_value, + oip->oip_size); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); + } else if (oip->oip_id < -1) { + mac_perim_exit(mph); +
overlay_hold_rele(odd); + return (EINVAL); + } else { + ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oip_id >= 0); + propid = oip->oip_id; + } + + ret = 0; + switch (propid) { + case OVERLAY_DEV_P_MTU: + ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu", + oip->oip_value, oip->oip_size); + break; + case OVERLAY_DEV_P_VNETID: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vidp = (uint64_t *)oip->oip_value; + ASSERT(odd->odd_plugin->ovp_id_size <= 8); + maxid = UINT64_MAX; + if (odd->odd_plugin->ovp_id_size != 8) + maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - + 1ULL; + if (*vidp >= maxid) { + ret = EINVAL; + break; + } + overlay_setprop_vnetid(odd, *vidp); + break; + case OVERLAY_DEV_P_ENCAP: + case OVERLAY_DEV_P_VARPDID: + ret = EPERM; + break; + default: + ret = ENOENT; + } + + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); +} + +/* ARGSUSED */ +static int +overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_status_t *os = karg; + + odd = overlay_hold_by_dlid(os->ois_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) { + os->ois_status = OVERLAY_I_DEGRADED; + if (odd->odd_fmamsg != NULL) { + (void) strlcpy(os->ois_message, odd->odd_fmamsg, + OVERLAY_STATUS_BUFLEN); + } else { + os->ois_message[0] = '\0'; + } + + } else { + os->ois_status = OVERLAY_I_OK; + os->ois_message[0] = '\0'; + } + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + + return (0); +} + +static dld_ioc_info_t overlay_ioc_list[] = { + { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t), + overlay_i_create, secpolicy_dl_config }, + { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t), + overlay_i_activate, secpolicy_dl_config }, + { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t), + overlay_i_delete, secpolicy_dl_config }, + { 
OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, + secpolicy_dl_config }, + { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_prop_t), overlay_i_getprop, + secpolicy_dl_config }, + { OVERLAY_IOC_SETPROP, DLDCOPYIN, + sizeof (overlay_ioc_prop_t), overlay_i_setprop, + secpolicy_dl_config }, + { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_nprops_t), overlay_i_nprops, + secpolicy_dl_config }, + { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_status_t), overlay_i_status, + NULL } +}; + +static int +overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int fmcap = DDI_FM_EREPORT_CAPABLE; + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (overlay_dip != NULL || ddi_get_instance(dip) != 0) + return (DDI_FAILURE); + + ddi_fm_init(dip, &fmcap, NULL); + + if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, + DLDIOCCNT(overlay_ioc_list)) != 0) { + ddi_remove_minor_node(dip, OVERLAY_CTL); + return (DDI_FAILURE); + } + + overlay_dip = dip; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *resp = (void *)overlay_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *resp = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + + return (error); +} + +static int +overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&overlay_dev_lock); + if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { + mutex_exit(&overlay_dev_lock); + return (EBUSY); + } + mutex_exit(&overlay_dev_lock); + + + dld_ioc_unregister(OVERLAY_IOC); + ddi_remove_minor_node(dip, 
OVERLAY_CTL); + ddi_fm_fini(dip); + overlay_dip = NULL; + return (DDI_SUCCESS); +} + +static struct cb_ops overlay_cbops = { + overlay_target_open, /* cb_open */ + overlay_target_close, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + overlay_target_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev, /* cb_awrite */ +}; + +static struct dev_ops overlay_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + overlay_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + overlay_attach, /* devo_attach */ + overlay_detach, /* devo_detach */ + nulldev, /* devo_reset */ + &overlay_cbops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + NULL, /* devo_power */ + ddi_quiesce_not_supported /* devo_quiesce */ +}; + +static struct modldrv overlay_modldrv = { + &mod_driverops, + "Overlay Network Driver", + &overlay_dev_ops +}; + +static struct modlinkage overlay_linkage = { + MODREV_1, + &overlay_modldrv +}; + +static int +overlay_init(void) +{ + mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&overlay_dev_list, sizeof (overlay_dev_t), + offsetof(overlay_dev_t, odd_link)); + overlay_mux_init(); + overlay_plugin_init(); + overlay_target_init(); + + return (DDI_SUCCESS); +} + +static void +overlay_fini(void) +{ + overlay_target_fini(); + overlay_plugin_fini(); + overlay_mux_fini(); + mutex_destroy(&overlay_dev_lock); + list_destroy(&overlay_dev_list); +} + +int +_init(void) +{ + int err; + + if ((err = overlay_init()) != DDI_SUCCESS) + return (err); + + mac_init_ops(NULL, "overlay"); + err = mod_install(&overlay_linkage); + if (err != DDI_SUCCESS) { + overlay_fini(); + return (err); + } + + return (0); +} + +int +_info(struct 
modinfo *modinfop) +{ + return (mod_info(&overlay_linkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + err = mod_remove(&overlay_linkage); + if (err != 0) + return (err); + + overlay_fini(); + return (0); +} diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf new file mode 100644 index 0000000000..4b62fafd94 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015, Joyent, Inc. +# + +name="overlay" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile new file mode 100644 index 0000000000..800d72dc2b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.mapfile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. 
+# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # Encapsulation Plugin interfaces + overlay_plugin_alloc; + overlay_plugin_free; + overlay_plugin_register; + overlay_plugin_unregister; + local: + *; +}; diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c new file mode 100644 index 0000000000..0701d08e8b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_fm.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay device FMA operations.
+ * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/ddifm.h> +#include <sys/overlay_impl.h> + +kmutex_t overlay_fm_lock; +uint_t overlay_fm_count; + +void +overlay_fm_init(void) +{ + overlay_fm_count = 0; + mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_fm_fini(void) +{ + VERIFY(overlay_fm_count == 0); + mutex_destroy(&overlay_fm_lock); +} + +void +overlay_fm_degrade(overlay_dev_t *odd, const char *msg) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + + if (msg != NULL) + (void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN); + + if (odd->odd_flags & OVERLAY_F_DEGRADED) + goto out; + + odd->odd_flags |= OVERLAY_F_DEGRADED; + overlay_fm_count++; + if (overlay_fm_count == 1) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_DEGRADED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} + +void +overlay_fm_restore(overlay_dev_t *odd) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_DEGRADED)) + goto out; + + odd->odd_fmamsg[0] = '\0'; + odd->odd_flags &= ~OVERLAY_F_DEGRADED; + overlay_fm_count--; + if (overlay_fm_count == 0) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_RESTORED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c new file mode 100644 index 0000000000..cd612763e1 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -0,0 +1,363 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Overlay device ksocket multiplexer. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/ksynch.h> +#include <sys/ksocket.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/pattr.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> + +#include <sys/overlay_impl.h> + +#include <sys/sdt.h> + +#define OVERLAY_FREEMSG(mp, reason) \ + DTRACE_PROBE2(overlay__freemsg, mblk_t *, mp, char *, reason) + +static list_t overlay_mux_list; +static kmutex_t overlay_mux_lock; + +void +overlay_mux_init(void) +{ + list_create(&overlay_mux_list, sizeof (overlay_mux_t), + offsetof(overlay_mux_t, omux_lnode)); + mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_mux_fini(void) +{ + mutex_destroy(&overlay_mux_lock); + list_destroy(&overlay_mux_list); +} + +static int +overlay_mux_comparator(const void *a, const void *b) +{ + const overlay_dev_t *odl, *odr; + odl = a; + odr = b; + if (odl->odd_vid > odr->odd_vid) + return (1); + else if (odl->odd_vid < odr->odd_vid) + return (-1); + else + return (0); +} + +/* + * This is the central receive data path. We need to decode the packet, if we + * can, and then deliver it to the appropriate overlay. + */ +/* ARGSUSED */ +static boolean_t +overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, + void *arg) +{ + mblk_t *mp, *nmp, *fmp; + overlay_mux_t *mux = arg; + + /* + * We may have a received a chain of messages. Each message in the + * chain will likely have a T_unitdata_ind attached to it as an M_PROTO. + * If we aren't getting that, we should probably drop that for the + * moment. 
+ */ + for (mp = mpchain; mp != NULL; mp = nmp) { + struct T_unitdata_ind *tudi; + ovep_encap_info_t infop; + overlay_dev_t od, *odd; + int ret; + + nmp = mp->b_next; + mp->b_next = NULL; + + if (DB_TYPE(mp) != M_PROTO) { + OVERLAY_FREEMSG(mp, "first one isn't M_PROTO"); + freemsg(mp); + continue; + } + + if (mp->b_cont == NULL) { + OVERLAY_FREEMSG(mp, "missing a b_cont"); + freemsg(mp); + continue; + } + + tudi = (struct T_unitdata_ind *)mp->b_rptr; + if (tudi->PRIM_type != T_UNITDATA_IND) { + OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *"); + freemsg(mp); + continue; + } + + /* + * In the future, we'll care about the source information + * for purposes of telling varpd for oob invalidation. But for + * now, just drop that block. + */ + fmp = mp; + mp = fmp->b_cont; + freeb(fmp); + + /* + * Until we have VXLAN-or-other-decap HW acceleration support + * (e.g. we support NICs that reach into VXLAN-encapsulated + * packets and check the inside-VXLAN IP packets' checksums, + * or do LSO with VXLAN), we should clear any HW-accelerated- + * performed bits. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* + * Decap and deliver. 
+ */ + bzero(&infop, sizeof (ovep_encap_info_t)); + ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop); + if (ret != 0) { + OVERLAY_FREEMSG(mp, "decap failed"); + freemsg(mp); + continue; + } + if (MBLKL(mp) > infop.ovdi_hdr_size) { + mp->b_rptr += infop.ovdi_hdr_size; + } else { + while (infop.ovdi_hdr_size != 0) { + size_t rem, blkl; + + if (mp == NULL) + break; + + blkl = MBLKL(mp); + rem = MIN(infop.ovdi_hdr_size, blkl); + infop.ovdi_hdr_size -= rem; + mp->b_rptr += rem; + if (rem == blkl) { + fmp = mp; + mp = fmp->b_cont; + fmp->b_cont = NULL; + OVERLAY_FREEMSG(mp, + "freed a fmp block"); + freemsg(fmp); + } + } + if (mp == NULL) { + OVERLAY_FREEMSG(mp, "freed it all..."); + continue; + } + } + + + od.odd_vid = infop.ovdi_id; + mutex_enter(&mux->omux_lock); + odd = avl_find(&mux->omux_devices, &od, NULL); + if (odd == NULL) { + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "no matching vid"); + freemsg(mp); + continue; + } + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "dev dropped"); + freemsg(mp); + continue; + } + overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + } + + return (B_TRUE); +} + +/* + * Register a given device with a socket backend. If no such device socket + * exists, create a new one. 
+ */ +overlay_mux_t * +overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol, + struct sockaddr *addr, socklen_t len, int *errp) +{ + int err; + overlay_mux_t *mux; + ksocket_t ksock; + + if (errp == NULL) + errp = &err; + + mutex_enter(&overlay_mux_lock); + for (mux = list_head(&overlay_mux_list); mux != NULL; + mux = list_next(&overlay_mux_list, mux)) { + if (domain == mux->omux_domain && + family == mux->omux_family && + protocol == mux->omux_protocol && + len == mux->omux_alen && + bcmp(addr, mux->omux_addr, len) == 0) { + + /* + * The address is already bound by a mux that belongs + * to a different plugin; it cannot be shared. + */ + if (opp != mux->omux_plugin) { + mutex_exit(&overlay_mux_lock); + *errp = EEXIST; + return (NULL); + } + + mutex_enter(&mux->omux_lock); + mux->omux_count++; + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + *errp = 0; + return (mux); + } + } + + /* + * Today we aren't zone-aware and only exist in the global zone. When we + * allow for things to exist in the non-global zone, we'll want to use a + * credential that's actually specific to the zone. + */ + *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP, + kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + return (NULL); + } + + *errp = ksocket_bind(ksock, addr, len, kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + /* + * Ask our lower layer to optionally toggle anything they need on this + * socket. Because a socket is owned by a single type of plugin, we can + * then ask it to perform any additional socket set up it'd like to do.
+ */ + if (opp->ovp_ops->ovpo_sockopt != NULL && + (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP); + list_link_init(&mux->omux_lnode); + mux->omux_ksock = ksock; + mux->omux_plugin = opp; + mux->omux_domain = domain; + mux->omux_family = family; + mux->omux_protocol = protocol; + mux->omux_addr = kmem_alloc(len, KM_SLEEP); + bcopy(addr, mux->omux_addr, len); + mux->omux_alen = len; + mux->omux_count = 1; + avl_create(&mux->omux_devices, overlay_mux_comparator, + sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode)); + mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL); + + + /* Once this is called, we need to expect to rx data */ + *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux); + if (*errp != 0) { + /* The mux was never put on the list; just tear it down. */ + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + mutex_destroy(&mux->omux_lock); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, len); + kmem_free(mux, sizeof (overlay_mux_t)); + return (NULL); + } + + list_insert_tail(&overlay_mux_list, mux); + mutex_exit(&overlay_mux_lock); + + *errp = 0; + return (mux); +} + +void +overlay_mux_close(overlay_mux_t *mux) +{ + mutex_enter(&overlay_mux_lock); + mutex_enter(&mux->omux_lock); + mux->omux_count--; + if (mux->omux_count != 0) { + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + return; + } + list_remove(&overlay_mux_list, mux); + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + + ksocket_close(mux->omux_ksock, kcred); + avl_destroy(&mux->omux_devices); + mutex_destroy(&mux->omux_lock); + kmem_free(mux->omux_addr, mux->omux_alen); + kmem_free(mux, sizeof (overlay_mux_t)); +} + +void +overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + avl_add(&mux->omux_devices, odd); + mutex_exit(&mux->omux_lock); +} + +void +overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); +
avl_remove(&mux->omux_devices, odd); + mutex_exit(&mux->omux_lock); +} + +int +overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp) +{ + int ret; + + /* + * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately, + * that isn't actually supported by UDP at this time. + */ + ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred); + if (ret != 0) + freemsg(mp); + + return (ret); +} diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c new file mode 100644 index 0000000000..348ddb92a2 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_plugin.c @@ -0,0 +1,281 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +/* + * Overlay device encapsulation plugin management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/sysmacros.h> +#include <sys/modctl.h> + +#include <sys/overlay_impl.h> + +static kmem_cache_t *overlay_plugin_cache; +static kmutex_t overlay_plugin_lock; +static list_t overlay_plugin_list; + +#define OVERLAY_MODDIR "overlay" + +/* ARGSUSED */ +static int +overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags) +{ + overlay_plugin_t *opp = buf; + + mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL); + list_link_init(&opp->ovp_link); + + return (0); +} + +/* ARGSUSED */ +static void +overlay_plugin_cache_destructor(void *buf, void *arg) +{ + overlay_plugin_t *opp = buf; + ASSERT(list_link_active(&opp->ovp_link) == 0); + mutex_destroy(&opp->ovp_mutex); +} + +void +overlay_plugin_init(void) +{ + mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0); + + /* + * In the future we may want to have a reaper to unload unused modules + * to help the kernel be able to reclaim memory. 
+ */ + overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache", + sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor, + overlay_plugin_cache_destructor, NULL, NULL, NULL, 0); + list_create(&overlay_plugin_list, sizeof (overlay_plugin_t), + offsetof(overlay_plugin_t, ovp_link)); +} + +void +overlay_plugin_fini(void) +{ + mutex_enter(&overlay_plugin_lock); + VERIFY(list_is_empty(&overlay_plugin_list)); + mutex_exit(&overlay_plugin_lock); + + list_destroy(&overlay_plugin_list); + kmem_cache_destroy(overlay_plugin_cache); + mutex_destroy(&overlay_plugin_lock); +} + +overlay_plugin_register_t * +overlay_plugin_alloc(uint_t version) +{ + overlay_plugin_register_t *ovrp; + /* Version 1 is the only one that exists */ + if (version != OVEP_VERSION_ONE) + return (NULL); + + ovrp = kmem_zalloc(sizeof (overlay_plugin_register_t), KM_SLEEP); + ovrp->ovep_version = version; + return (ovrp); +} + +void +overlay_plugin_free(overlay_plugin_register_t *ovrp) +{ + kmem_free(ovrp, sizeof (overlay_plugin_register_t)); +} + +int +overlay_plugin_register(overlay_plugin_register_t *ovrp) +{ + overlay_plugin_t *opp, *ipp; + + /* Sanity check parameters of the registration */ + if (ovrp->ovep_version != OVEP_VERSION_ONE) + return (EINVAL); + + if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL) + return (EINVAL); + + if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0) + return (EINVAL); + + if (ovrp->ovep_id_size < 1) + return (EINVAL); + + /* Don't support anything that has an id size larger than 8 bytes */ + if (ovrp->ovep_id_size > 8) + return (ENOTSUP); + + if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID) + return (EINVAL); + + if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0) + return (EINVAL); + + if (ovrp->ovep_ops->ovpo_callbacks != 0) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_init == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_fini == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_encap == NULL) + return (EINVAL); + if 
(ovrp->ovep_ops->ovpo_decap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_socket == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_getprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_setprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_propinfo == NULL) + return (EINVAL); + + + opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP); + opp->ovp_active = 0; + opp->ovp_name = ovrp->ovep_name; + opp->ovp_ops = ovrp->ovep_ops; + opp->ovp_props = ovrp->ovep_props; + opp->ovp_id_size = ovrp->ovep_id_size; + opp->ovp_flags = ovrp->ovep_flags; + opp->ovp_dest = ovrp->ovep_dest; + + opp->ovp_nprops = 0; + if (ovrp->ovep_props != NULL) { + while (ovrp->ovep_props[opp->ovp_nprops] != NULL) { + if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >= + OVERLAY_PROP_NAMELEN) { + /* overlay_plugin_lock is not held yet; just free opp */ + kmem_cache_free(overlay_plugin_cache, opp); + return (EINVAL); + } + opp->ovp_nprops++; + } + } + + mutex_enter(&overlay_plugin_lock); + for (ipp = list_head(&overlay_plugin_list); ipp != NULL; + ipp = list_next(&overlay_plugin_list, ipp)) { + if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EEXIST); + } + } + list_insert_tail(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + return (0); +} + +int +overlay_plugin_unregister(const char *name) +{ + overlay_plugin_t *opp; + + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (strcmp(opp->ovp_name, name) == 0) + break; + } + + if (opp == NULL) { + mutex_exit(&overlay_plugin_lock); + return (ENOENT); + } + + mutex_enter(&opp->ovp_mutex); + if (opp->ovp_active > 0) { + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (EBUSY); + } + mutex_exit(&opp->ovp_mutex); + + list_remove(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + 
kmem_cache_free(overlay_plugin_cache, opp); + return (0); +} + +overlay_plugin_t * +overlay_plugin_lookup(const char *name) +{ + overlay_plugin_t *opp; + boolean_t trymodload = B_FALSE; + + for (;;) { + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (strcmp(name, opp->ovp_name) == 0) { + mutex_enter(&opp->ovp_mutex); + opp->ovp_active++; + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (opp); + } + } + mutex_exit(&overlay_plugin_lock); + + if (trymodload == B_TRUE) + return (NULL); + + /* + * If we didn't find it, it may still exist, but just not have + * been a loaded module. In that case, we'll do one attempt to + * load it. + */ + if (modload(OVERLAY_MODDIR, (char *)name) == -1) + return (NULL); + trymodload = B_TRUE; + } + +} + +void +overlay_plugin_rele(overlay_plugin_t *opp) +{ + mutex_enter(&opp->ovp_mutex); + ASSERT(opp->ovp_active > 0); + opp->ovp_active--; + mutex_exit(&opp->ovp_mutex); +} + +void +overlay_plugin_walk(overlay_plugin_walk_f func, void *arg) +{ + overlay_plugin_t *opp; + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (func(opp, arg) != 0) { + mutex_exit(&overlay_plugin_lock); + return; + } + } + mutex_exit(&overlay_plugin_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c new file mode 100644 index 0000000000..ba1ea2a629 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_prop.c @@ -0,0 +1,122 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +/* + * Routines for manipulating property information structures. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/overlay_impl.h> + +void +overlay_prop_init(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + infop->oipi_posssize = sizeof (mac_propval_range_t); + bzero(rangep, sizeof (mac_propval_range_t)); +} + +void +overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + (void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN); +} + +void +overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_prot = prot; +} + +void +overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_type = type; +} + +int +overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + + if (len > OVERLAY_PROP_SIZEMAX) + return (E2BIG); + + if (len < 0) + return (EOVERFLOW); + + bcopy(def, infop->oipi_default, len); + infop->oipi_defsize = (uint32_t)len; + + return (0); +} + +void +overlay_prop_set_nodefault(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_default[0] = '\0'; + infop->oipi_defsize = 0; +} + +void +overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min, + uint32_t max) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t 
*)infop->oipi_poss; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32) + return; + + if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) > + sizeof (infop->oipi_poss)) + return; + + infop->oipi_posssize += sizeof (mac_propval_uint32_range_t); + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_UINT32; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max; +} + +void +overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str) +{ + size_t len = strlen(str) + 1; /* Account for a null terminator */ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + mac_propval_str_range_t *pstr = &rangep->u.mpr_str; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR) + return; + + if (infop->oipi_posssize + len > sizeof (infop->oipi_poss)) + return; + + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_STR; + strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str, + sizeof (infop->oipi_poss) - infop->oipi_posssize); + pstr->mpur_nextbyte += len; + infop->oipi_posssize += len; +} diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c new file mode 100644 index 0000000000..cb1366708a --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -0,0 +1,1651 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +/* + * Overlay device target cache management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/sysmacros.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> +#include <sys/vlan.h> +#include <sys/crc32.h> +#include <sys/cred.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#include <sys/overlay_impl.h> +#include <sys/sdt.h> + +/* + * This is total straw man, but at least it's a prime number. Here we're + * going to have to go through and do a lot of evaluation and understanding as + * to how these target caches should grow and shrink, as well as, memory + * pressure and evictions. This just gives us a starting point that'll be 'good + * enough', until it's not. + */ +#define OVERLAY_HSIZE 823 + +/* + * We use this data structure to keep track of what requests have been actively + * allocated to a given instance so we know what to put back on the pending + * list. + */ +typedef struct overlay_target_hdl { + minor_t oth_minor; /* RO */ + zoneid_t oth_zoneid; /* RO */ + int oth_oflags; /* RO */ + list_node_t oth_link; /* overlay_target_lock */ + kmutex_t oth_lock; + list_t oth_outstanding; /* oth_lock */ +} overlay_target_hdl_t; + +typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int); +typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *); +typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int); + +typedef struct overlay_target_ioctl { + int oti_cmd; /* ioctl id */ + boolean_t oti_write; /* ioctl requires FWRITE */ + boolean_t oti_ncopyout; /* copyout data? 
*/ + overlay_target_copyin_f oti_copyin; /* copyin func */ + overlay_target_ioctl_f oti_func; /* function to call */ + overlay_target_copyout_f oti_copyout; /* copyout func */ + size_t oti_size; /* size of user level structure */ +} overlay_target_ioctl_t; + +static kmem_cache_t *overlay_target_cache; +static kmem_cache_t *overlay_entry_cache; +static id_space_t *overlay_thdl_idspace; +static void *overlay_thdl_state; + +/* + * When we support overlay devices in the NGZ, then all of these need to become + * zone aware, by plugging into the netstack engine and becoming per-netstack + * data. + */ +static list_t overlay_thdl_list; +static kmutex_t overlay_target_lock; +static kcondvar_t overlay_target_condvar; +static list_t overlay_target_list; +static boolean_t overlay_target_excl; + +/* + * Outstanding data per hash table entry. + */ +static int overlay_ent_size = 128 * 1024; + +/* ARGSUSED */ +static int +overlay_target_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_t *ott = buf; + + mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_target_cache_destructor(void *buf, void *arg) +{ + overlay_target_t *ott = buf; + + cv_destroy(&ott->ott_cond); + mutex_destroy(&ott->ott_lock); +} + +/* ARGSUSED */ +static int +overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_entry_t *ote = buf; + + bzero(ote, sizeof (overlay_target_entry_t)); + mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_entry_cache_destructor(void *buf, void *arg) +{ + overlay_target_entry_t *ote = buf; + + mutex_destroy(&ote->ote_lock); +} + +static uint64_t +overlay_mac_hash(const void *v) +{ + uint32_t crc; + CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + return (crc); +} + +static int +overlay_mac_cmp(const void *a, const void *b) +{ + return (bcmp(a, b, ETHERADDRL)); +} + 
+/* ARGSUSED */ +static void +overlay_target_entry_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + + ote->ote_flags = 0; + bzero(ote->ote_addr, ETHERADDRL); + ote->ote_ott = NULL; + ote->ote_odd = NULL; + freemsgchain(ote->ote_chead); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = 0; + kmem_cache_free(overlay_entry_cache, ote); +} + +static int +overlay_mac_avl(const void *a, const void *b) +{ + int i; + const overlay_target_entry_t *l, *r; + l = a; + r = b; + + for (i = 0; i < ETHERADDRL; i++) { + if (l->ote_addr[i] > r->ote_addr[i]) + return (1); + else if (l->ote_addr[i] < r->ote_addr[i]) + return (-1); + } + + return (0); +} + +void +overlay_target_init(void) +{ + int ret; + ret = ddi_soft_state_init(&overlay_thdl_state, + sizeof (overlay_target_hdl_t), 1); + VERIFY(ret == 0); + overlay_target_cache = kmem_cache_create("overlay_target", + sizeof (overlay_target_t), 0, overlay_target_cache_constructor, + overlay_target_cache_destructor, NULL, NULL, NULL, 0); + overlay_entry_cache = kmem_cache_create("overlay_entry", + sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor, + overlay_entry_cache_destructor, NULL, NULL, NULL, 0); + mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL); + list_create(&overlay_target_list, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t), + offsetof(overlay_target_hdl_t, oth_link)); + overlay_thdl_idspace = id_space_create("overlay_target_minors", + 1, INT32_MAX); +} + +void +overlay_target_fini(void) +{ + id_space_destroy(overlay_thdl_idspace); + list_destroy(&overlay_thdl_list); + list_destroy(&overlay_target_list); + cv_destroy(&overlay_target_condvar); + mutex_destroy(&overlay_target_lock); + kmem_cache_destroy(overlay_entry_cache); + kmem_cache_destroy(overlay_target_cache); + 
ddi_soft_state_fini(&overlay_thdl_state); +} + +void +overlay_target_free(overlay_dev_t *odd) +{ + if (odd->odd_target == NULL) + return; + + if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { + refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; + avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; + overlay_target_entry_t *ote; + + /* + * Our AVL tree and hashtable contain the same elements, + * therefore we should just remove it from the tree, but then + * delete the entries when we remove them from the hash table + * (which happens through the refhash dtor). + */ + while ((ote = avl_first(ap)) != NULL) + avl_remove(ap, ote); + + avl_destroy(ap); + for (ote = refhash_first(rp); ote != NULL; + ote = refhash_next(rp, ote)) { + refhash_remove(rp, ote); + } + refhash_destroy(rp); + } + + ASSERT(odd->odd_target->ott_ocount == 0); + kmem_cache_free(overlay_target_cache, odd->odd_target); +} + +int +overlay_target_busy() +{ + int ret; + + mutex_enter(&overlay_target_lock); + ret = !list_is_empty(&overlay_thdl_list); + mutex_exit(&overlay_target_lock); + + return (ret); +} + +static void +overlay_target_queue(overlay_target_entry_t *entry) +{ + mutex_enter(&overlay_target_lock); + mutex_enter(&entry->ote_ott->ott_lock); + if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) { + mutex_exit(&entry->ote_ott->ott_lock); + mutex_exit(&overlay_target_lock); + return; + } + entry->ote_ott->ott_ocount++; + mutex_exit(&entry->ote_ott->ott_lock); + list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&overlay_target_lock); +} + +void +overlay_target_quiesce(overlay_target_t *ott) +{ + if (ott == NULL) + return; + mutex_enter(&ott->ott_lock); + ott->ott_flags |= OVERLAY_T_TEARDOWN; + while (ott->ott_ocount != 0) + cv_wait(&ott->ott_cond, &ott->ott_lock); + mutex_exit(&ott->ott_lock); +} + +/* + * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP | + * OVERLAY_PLUGIN_D_PORT. 
As we don't have an implementation of anything else at + * this time, say for NVGRE, we drop all packets that do not match this. + */ +int +overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, + socklen_t *slenp) +{ + int ret; + struct sockaddr_in6 *v6; + overlay_target_t *ott; + mac_header_info_t mhi; + overlay_target_entry_t *entry; + + ASSERT(odd->odd_target != NULL); + + /* + * At this point, the overlay device is in a mux which means that it's + * been activated. At this point, parts of the target, such as the mode + * and the destination are now read-only and we don't have to worry + * about synchronization for them. + */ + ott = odd->odd_target; + if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + return (OVERLAY_TARGET_DROP); + + v6 = (struct sockaddr_in6 *)sock; + bzero(v6, sizeof (struct sockaddr_in6)); + v6->sin6_family = AF_INET6; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + mutex_enter(&ott->ott_lock); + bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(ott->ott_u.ott_point.otp_port); + mutex_exit(&ott->ott_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (OVERLAY_TARGET_OK); + } + + ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC); + + /* + * Note we only want the MAC address here, therefore we won't bother + * using mac_vlan_header_info(). If any caller needs the vlan info at + * this point, this should change to a call to mac_vlan_header_info(). 
+ */ + if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + return (OVERLAY_TARGET_DROP); + mutex_enter(&ott->ott_lock); + entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + mhi.mhi_daddr); + if (entry == NULL) { + entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (entry == NULL) { + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_DROP); + } + bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; + entry->ote_odd = odd; + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); + mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + return (OVERLAY_TARGET_ASYNC); + } + refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(entry->ote_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + ret = OVERLAY_TARGET_OK; + } else { + size_t mlen = msgsize(mp); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + ret = OVERLAY_TARGET_DROP; + } else { + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == + NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & + OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= + OVERLAY_ENTRY_F_PENDING; + overlay_target_queue(entry); + } + ret = OVERLAY_TARGET_ASYNC; + } + } + mutex_exit(&entry->ote_lock); + + mutex_enter(&ott->ott_lock); + refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + return (ret); +} + +/* ARGSUSED */ 
+static int +overlay_target_info(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_info_t *oti = arg; + + odd = overlay_hold_by_dlid(oti->oti_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + oti->oti_flags = 0; + oti->oti_needs = odd->odd_plugin->ovp_dest; + if (odd->odd_flags & OVERLAY_F_DEGRADED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED; + if (odd->odd_flags & OVERLAY_F_ACTIVATED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; + oti->oti_vnetid = odd->odd_vid; + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_associate_t *ota = arg; + + odd = overlay_hold_by_dlid(ota->ota_linkid); + if (odd == NULL) + return (ENOENT); + + if (ota->ota_id == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode != OVERLAY_TARGET_POINT && + ota->ota_mode != OVERLAY_TARGET_DYNAMIC) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_provides != odd->odd_plugin->ovp_dest) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode == OVERLAY_TARGET_POINT) { + if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) { + if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + + if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) { + if (ota->ota_point.otp_port == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + } + + ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP); + ott->ott_flags = 0; + ott->ott_ocount = 0; + ott->ott_mode = ota->ota_mode; + ott->ott_dest = ota->ota_provides; + ott->ott_id = ota->ota_id; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + bcopy(&ota->ota_point, &ott->ott_u.ott_point, + sizeof 
(overlay_target_point_t)); + } else { + ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE, + overlay_mac_hash, overlay_mac_cmp, + overlay_target_entry_dtor, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_reflink), + offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP); + avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_avllink)); + } + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + mutex_exit(&odd->odd_lock); + /* + * The device is already associated. Tear down the (still + * empty) dynamic lookup structures created above before + * returning the target to its cache, otherwise they leak. + */ + if (ott->ott_mode == OVERLAY_TARGET_DYNAMIC) { + avl_destroy(&ott->ott_u.ott_dyn.ott_tree); + refhash_destroy(ott->ott_u.ott_dyn.ott_dhash); + } + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (EEXIST); + } + + odd->odd_flags |= OVERLAY_F_VARPD; + odd->odd_target = ott; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + + + return (0); +} + + +/* ARGSUSED */ +static int +overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_degrade_t *otd = arg; + + odd = overlay_hold_by_dlid(otd->otd_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_degrade(odd, otd->otd_buf); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_restore(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_restore(odd); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_VARPD; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + return (0); + +} + +static int +overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_lookup_t *otl = arg; + overlay_target_entry_t *entry; + clock_t 
ret, timeout; + mac_header_info_t mhi; + + timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC); +again: + mutex_enter(&overlay_target_lock); + while (list_is_empty(&overlay_target_list)) { + ret = cv_timedwait(&overlay_target_condvar, + &overlay_target_lock, timeout); + if (ret == -1) { + mutex_exit(&overlay_target_lock); + return (ETIME); + } + } + entry = list_remove_head(&overlay_target_list); + mutex_exit(&overlay_target_lock); + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + ASSERT(entry->ote_chead == NULL); + mutex_exit(&entry->ote_lock); + goto again; + } + ASSERT(entry->ote_chead != NULL); + + /* + * If we have a bogon that doesn't have a valid mac header, drop it and + * try again. + */ + if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead, + &mhi) != 0) { + boolean_t queue = B_FALSE; + mblk_t *mp = entry->ote_chead; + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + if (entry->ote_chead != NULL) + queue = B_TRUE; + mutex_exit(&entry->ote_lock); + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + goto again; + } + + otl->otl_dlid = entry->ote_odd->odd_linkid; + otl->otl_reqid = (uintptr_t)entry; + otl->otl_varpdid = entry->ote_ott->ott_id; + otl->otl_vnetid = entry->ote_odd->odd_vid; + + otl->otl_hdrsize = mhi.mhi_hdrsize; + otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; + bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); + otl->otl_dsttype = mhi.mhi_dsttype; + otl->otl_sap = mhi.mhi_bindsap; + otl->otl_vlan = VLAN_ID(mhi.mhi_tci); + mutex_exit(&entry->ote_lock); + + mutex_enter(&thdl->oth_lock); + list_insert_tail(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + return (0); +} + +static int +overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t 
*otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + bcopy(&otr->otr_answer, &entry->ote_dest, + sizeof (overlay_target_point_t)); + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + mp = entry->ote_chead; + entry->ote_chead = NULL; + entry->ote_ctail = NULL; + entry->ote_mbsize = 0; + entry->ote_vtime = gethrtime(); + mutex_exit(&entry->ote_lock); + + /* + * For now do an in-situ drain. + */ + mp = overlay_m_tx(entry->ote_odd, mp); + freemsgchain(mp); + + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +static int +overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + boolean_t queue = B_FALSE; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + + /* Safeguard against a confused varpd */ + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + DTRACE_PROBE1(overlay__target__valid__drop, + overlay_target_entry_t *, entry); + mutex_exit(&entry->ote_lock); + goto done; + } + + mp = entry->ote_chead; + if (mp != 
NULL) { + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + } + if (entry->ote_chead != NULL) { + queue = B_TRUE; + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + } else { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + } + mutex_exit(&entry->ote_lock); + + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + +done: + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_pkt_t *pkt; + overlay_targ_pkt32_t *pkt32; + + pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP); + *outp = pkt; + *bsize = sizeof (overlay_targ_pkt_t); + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + uintptr_t addr; + + if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + pkt32 = (overlay_targ_pkt32_t *)pkt; + addr = pkt32->otp_buf; + pkt->otp_buf = (void *)addr; + } else { + if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + } + return (0); +} + +static int +overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + overlay_targ_pkt_t *pkt = buf; + overlay_targ_pkt32_t *pkt32 = buf; + uintptr_t addr = (uintptr_t)pkt->otp_buf; + pkt32->otp_buf = (caddr32_t)addr; + if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + } else { + if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +static int +overlay_target_packet(overlay_target_hdl_t *thdl, void *arg) +{ + 
overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + size_t mlen; + size_t boff; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + mutex_enter(&entry->ote_lock); + mutex_exit(&thdl->oth_lock); + mp = entry->ote_chead; + /* Protect against a rogue varpd */ + if (mp == NULL) { + mutex_exit(&entry->ote_lock); + return (EINVAL); + } + mlen = MIN(msgsize(mp), pkt->otp_size); + pkt->otp_size = mlen; + boff = 0; + while (mlen > 0) { + size_t wlen = MIN(MBLKL(mp), mlen); + if (ddi_copyout(mp->b_rptr, + (void *)((uintptr_t)pkt->otp_buf + boff), + wlen, 0) != 0) { + mutex_exit(&entry->ote_lock); + return (EFAULT); + } + mlen -= wlen; + boff += wlen; + mp = mp->b_cont; + } + mutex_exit(&entry->ote_lock); + return (0); +} + +static int +overlay_target_inject(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mutex_enter(&odd->odd_lock); + 
overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + return (0); +} + +static int +overlay_target_resend(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mp = overlay_m_tx(odd, mp); + freemsgchain(mp); + + return (0); +} + +typedef struct overlay_targ_list_int { + boolean_t otli_count; + uint32_t otli_cur; + uint32_t otli_nents; + uint32_t otli_ents[]; +} overlay_targ_list_int_t; + +static int +overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_list_t n; + overlay_targ_list_int_t *otl; + + if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + /* + */ + if (n.otl_nents >= INT32_MAX / sizeof (uint32_t)) + return (EINVAL); + *bsize = sizeof (overlay_targ_list_int_t) + + sizeof (uint32_t) * n.otl_nents; + otl = kmem_zalloc(*bsize, KM_SLEEP); + otl->otli_cur = 0; + otl->otli_nents = n.otl_nents; + if (otl->otli_nents != 0) { + otl->otli_count 
= B_FALSE; + if (ddi_copyin((void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + otl->otli_ents, n.otl_nents * sizeof (uint32_t), + flags & FKIOCTL) != 0) { + kmem_free(otl, *bsize); + return (EFAULT); + } + } else { + otl->otli_count = B_TRUE; + } + + *outp = otl; + return (0); +} + +static int +overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg) +{ + overlay_targ_list_int_t *otl = arg; + + if (otl->otli_cur < otl->otli_nents) + otl->otli_ents[otl->otli_cur] = odd->odd_linkid; + otl->otli_cur++; + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_iter(overlay_target_ioctl_list_cb, arg); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags) +{ + overlay_targ_list_int_t *otl = buf; + + if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (otl->otli_count == B_FALSE) { + if (ddi_copyout(otl->otli_ents, + (void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + sizeof (uint32_t) * otl->otli_nents, + flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_POINT && + ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + otc->otc_entry.otce_flags = 0; 
+ bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } else { + overlay_target_entry_t *ote; + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + if ((ote->ote_flags & + OVERLAY_ENTRY_F_VALID_MASK) != 0) { + if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_DROP; + } else { + otc->otc_entry.otce_flags = 0; + bcopy(&ote->ote_dest, + &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } + ret = 0; + } else { + ret = ENOENT; + } + mutex_exit(&ote->ote_lock); + } else { + ret = ENOENT; + } + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + mblk_t *mp = NULL; + + if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP) + return (EINVAL); + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote == NULL) { + ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_ott = ott; + ote->ote_odd = odd; + mutex_enter(&ote->ote_lock); + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } 
else { + mutex_enter(&ote->ote_lock); + } + + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { + ote->ote_flags |= OVERLAY_ENTRY_F_DROP; + } else { + ote->ote_flags |= OVERLAY_ENTRY_F_VALID; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + sizeof (overlay_target_point_t)); + mp = ote->ote_chead; + ote->ote_chead = NULL; + ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = gethrtime(); + } + + mutex_exit(&ote->ote_lock); + mutex_exit(&ott->ott_lock); + + if (mp != NULL) { + mp = overlay_m_tx(ote->ote_odd, mp); + freemsgchain(mp); + } + + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + ret = 0; + } else { + ret = ENOENT; + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) +{ + avl_tree_t *avl; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if 
(!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + avl = &ott->ott_u.ott_dyn.ott_tree; + + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_cache_iter_t base, *iter; + + if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (base.otci_count > OVERLAY_TARGET_ITER_MAX) + return (E2BIG); + + if (base.otci_count == 0) + return (EINVAL); + + *bsize = sizeof (overlay_targ_cache_iter_t) + + base.otci_count * sizeof (overlay_targ_cache_entry_t); + iter = kmem_alloc(*bsize, KM_SLEEP); + bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t)); + *outp = iter; + + return (0); +} + +typedef struct overlay_targ_cache_marker { + uint8_t otcm_mac[ETHERADDRL]; + uint16_t otcm_done; +} overlay_targ_cache_marker_t; + +/* ARGSUSED */ +static int +overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t lookup, *ent; + overlay_targ_cache_marker_t *mark; + avl_index_t where; + avl_tree_t *avl; + uint16_t written = 0; + + overlay_targ_cache_iter_t *iter = arg; + mark = (void *)&iter->otci_marker; + + if (mark->otcm_done != 0) { + iter->otci_count = 0; + return (0); + } + + odd = overlay_hold_by_dlid(iter->otci_linkid); + if (odd == NULL) + return (ENOENT); + + 
mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC && + ott->ott_mode != OVERLAY_TARGET_POINT) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + + /* + * Holding this lock across the entire iteration probably isn't very + * good. We should perhaps add an r/w lock for the avl tree. But we'll + * wait until we know it's necessary before we do more. + */ + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[0]; + bzero(out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mark->otcm_done = 1; + } + + avl = &ott->ott_u.ott_dyn.ott_tree; + bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL); + ent = avl_find(avl, &lookup, &where); + + /* + * NULL ent means that the entry does not exist, so we want to start + * with the closest node in the tree. This means that we implicitly rely + * on the tree's order and the first node will be the mac 00:00:00:00:00:00 + * and the last will be ff:ff:ff:ff:ff:ff. 
+ */ + if (ent == NULL) { + ent = avl_nearest(avl, where, AVL_AFTER); + if (ent == NULL) { + mark->otcm_done = 1; + goto done; + } + } + + for (; ent != NULL && written < iter->otci_count; + ent = AVL_NEXT(avl, ent)) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[written]; + mutex_enter(&ent->ote_lock); + if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) { + mutex_exit(&ent->ote_lock); + continue; + } + bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + if (ent->ote_flags & OVERLAY_ENTRY_F_DROP) + out->otce_flags |= OVERLAY_TARGET_CACHE_DROP; + if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) + bcopy(&ent->ote_dest, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mutex_exit(&ent->ote_lock); + } + + if (ent != NULL) { + bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL); + } else { + mark->otcm_done = 1; + } + +done: + iter->otci_count = written; + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + size_t outsize; + const overlay_targ_cache_iter_t *iter = buf; + + outsize = sizeof (overlay_targ_cache_iter_t) + + iter->otci_count * sizeof (overlay_targ_cache_entry_t); + + if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static overlay_target_ioctl_t overlay_target_ioctab[] = { + { OVERLAY_TARG_INFO, B_TRUE, B_TRUE, + NULL, overlay_target_info, + NULL, sizeof (overlay_targ_info_t) }, + { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_associate, + NULL, sizeof (overlay_targ_associate_t) }, + { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_disassociate, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE, + NULL, overlay_target_degrade, + NULL, sizeof (overlay_targ_degrade_t) }, + { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE, + NULL, overlay_target_restore, + NULL, 
sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE, + NULL, overlay_target_lookup_request, + NULL, sizeof (overlay_targ_lookup_t) }, + { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_respond, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_DROP, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_drop, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_PKT, B_TRUE, B_TRUE, + overlay_target_pkt_copyin, + overlay_target_packet, + overlay_target_pkt_copyout, + sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_inject, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_resend, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_LIST, B_FALSE, B_TRUE, + overlay_target_list_copyin, + overlay_target_ioctl_list, + overlay_target_list_copyout, + sizeof (overlay_targ_list_t) }, + { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE, + NULL, overlay_target_cache_get, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE, + NULL, overlay_target_cache_set, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE, + NULL, overlay_target_cache_remove, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE, + NULL, overlay_target_cache_flush, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE, + overlay_target_cache_iter_copyin, + overlay_target_cache_iter, + overlay_target_cache_iter_copyout, + sizeof (overlay_targ_cache_iter_t) }, + { 0 } +}; + +int +overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp) +{ + minor_t mid; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if (getminor(*devp) != 0) + return (ENXIO); + + if (otype & OTYP_BLK) + return (EINVAL); + + if (flags & ~(FREAD | FWRITE | FEXCL)) + return 
(EINVAL); + + if ((flags & FWRITE) && + !(flags & FEXCL)) + return (EINVAL); + + if (!(flags & FREAD) && !(flags & FWRITE)) + return (EINVAL); + + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EPERM); + + mid = id_alloc(overlay_thdl_idspace); + if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) { + id_free(overlay_thdl_idspace, mid); + return (ENXIO); + } + + thdl = ddi_get_soft_state(overlay_thdl_state, mid); + VERIFY(thdl != NULL); + thdl->oth_minor = mid; + thdl->oth_zoneid = crgetzoneid(credp); + thdl->oth_oflags = flags; + mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + *devp = makedevice(getmajor(*devp), mid); + + mutex_enter(&overlay_target_lock); + if ((flags & FEXCL) && overlay_target_excl == B_TRUE) { + mutex_exit(&overlay_target_lock); + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + return (EEXIST); + } else if ((flags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_FALSE); + overlay_target_excl = B_TRUE; + } + list_insert_tail(&overlay_thdl_list, thdl); + mutex_exit(&overlay_target_lock); + + return (0); +} + +/* ARGSUSED */ +int +overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + overlay_target_ioctl_t *ioc; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, + getminor(dev))) == NULL) + return (ENXIO); + + for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) { + int ret; + caddr_t buf; + size_t bufsize; + + if (ioc->oti_cmd != cmd) + continue; + + if (ioc->oti_write == B_TRUE && !(mode & FWRITE)) + return (EBADF); + + if (ioc->oti_copyin == NULL) { + bufsize = ioc->oti_size; + buf = kmem_alloc(bufsize, KM_SLEEP); + if (ddi_copyin((void 
*)(uintptr_t)arg, buf, bufsize, + mode & FKIOCTL) != 0) { + kmem_free(buf, bufsize); + return (EFAULT); + } + } else { + if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg, + (void **)&buf, &bufsize, mode)) != 0) + return (ret); + } + + ret = ioc->oti_func(thdl, buf); + if (ret == 0 && ioc->oti_size != 0 && + ioc->oti_ncopyout == B_TRUE) { + if (ioc->oti_copyout == NULL) { + if (ddi_copyout(buf, (void *)(uintptr_t)arg, + bufsize, mode & FKIOCTL) != 0) + ret = EFAULT; + } else { + ret = ioc->oti_copyout((void *)(uintptr_t)arg, + buf, bufsize, mode); + } + } + + kmem_free(buf, bufsize); + return (ret); + } + + return (ENOTTY); +} + +/* ARGSUSED */ +int +overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp) +{ + overlay_target_hdl_t *thdl; + overlay_target_entry_t *entry; + minor_t mid = getminor(dev); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL) + return (ENXIO); + + mutex_enter(&overlay_target_lock); + list_remove(&overlay_thdl_list, thdl); + mutex_enter(&thdl->oth_lock); + while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL) + list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&thdl->oth_lock); + if ((thdl->oth_oflags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_TRUE); + overlay_target_excl = B_FALSE; + } + mutex_exit(&overlay_target_lock); + + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + mid = thdl->oth_minor; + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + + return (0); +} diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c new file mode 100644 index 0000000000..92144b3985 --- /dev/null +++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c @@ -0,0 +1,394 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * VXLAN encapsulation module + * + * + * The VXLAN header looks as follows in network byte order: + * + * |0 3| 4 |5 31| + * +----------+---+------------------------+ + * | Reserved | I | Reserved | + * +---------------------------------------+ + * | Virtual Network ID | Reserved | + * +----------------------------+----------+ + * |0 23|24 31| + * + * All reserved values must be 0. The I bit must be 1. We call the top + * word the VXLAN magic field for the time being. The second word is + * definitely not the most friendly way to operate. Specifically, the ID + * is a 24-bit big endian value, but we have to make sure not to use the + * reserved byte. + * + * For us, VXLAN encapsulation is a fairly straightforward implementation. It + * only has two properties, a listen_ip and a listen_port. These determine on + * what address we should be listening on. While we do not have a default + * address to listen upon, we do have a default port, which is the IANA assigned + * port for VXLAN -- 4789. + */ + +#include <sys/overlay_plugin.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/byteorder.h> +#include <sys/vxlan.h> +#include <inet/ip.h> +#include <netinet/in.h> +#include <sys/strsun.h> +#include <netinet/udp.h> + +static const char *vxlan_ident = "vxlan"; +static uint16_t vxlan_defport = IPPORT_VXLAN; + +/* + * Should we enable UDP source port hashing for fanout. + */ +boolean_t vxlan_fanout = B_TRUE; + +/* + * This represents the size in bytes that we want to allocate when allocating a + * vxlan header block. 
This is intended such that lower levels can try and use + * the message block that we allocate for the IP and UDP header. The hope is + * that even if this is tunneled, that this is enough space. + * + * The vxlan_noalloc_min value represents the minimum amount of space we need to + * consider not allocating a message block and just passing it down the stack in + * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet + * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header. + */ +uint_t vxlan_alloc_size = 128; +uint_t vxlan_noalloc_min = 54; + +static const char *vxlan_props[] = { + "vxlan/listen_ip", + "vxlan/listen_port", + NULL +}; + +typedef struct vxlan { + kmutex_t vxl_lock; + overlay_handle_t vxl_oh; + uint16_t vxl_lport; + boolean_t vxl_hladdr; + struct in6_addr vxl_laddr; +} vxlan_t; + +static int +vxlan_o_init(overlay_handle_t oh, void **outp) +{ + vxlan_t *vxl; + + vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP); + *outp = vxl; + mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL); + vxl->vxl_oh = oh; + vxl->vxl_lport = vxlan_defport; + vxl->vxl_hladdr = B_FALSE; + + return (0); +} + +static void +vxlan_o_fini(void *arg) +{ + vxlan_t *vxl = arg; + + mutex_destroy(&vxl->vxl_lock); + kmem_free(arg, sizeof (vxlan_t)); +} + +static int +vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr, + socklen_t *slenp) +{ + vxlan_t *vxl = arg; + struct sockaddr_in6 *in; + + in = (struct sockaddr_in6 *)addr; + *dp = AF_INET6; + *fp = SOCK_DGRAM; + *pp = 0; + bzero(in, sizeof (struct sockaddr_in6)); + in->sin6_family = AF_INET6; + + /* + * We should consider a more expressive private errno set that + * providers can use. 
+ */ + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + mutex_exit(&vxl->vxl_lock); + return (EINVAL); + } + in->sin6_port = htons(vxl->vxl_lport); + in->sin6_addr = vxl->vxl_laddr; + mutex_exit(&vxl->vxl_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (0); +} + +static int +vxlan_o_sockopt(ksocket_t ksock) +{ + int val, err; + if (vxlan_fanout == B_FALSE) + return (0); + + val = UDP_HASH_VXLAN; + err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val, + sizeof (val), kcred); + return (err); +} + +/* ARGSUSED */ +static int +vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop, + mblk_t **outp) +{ + mblk_t *ob; + vxlan_hdr_t *vxh; + + ASSERT(einfop->ovdi_id < (1 << 24)); + + if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) { + /* + * This allocation could get hot. We may want to have a good + * way to cache and handle this allocation the same way that IP + * does with keeping around a message block per entry, or + * basically treating this as an immutable message block in the + * system. Basically freemsg() will be a nop, but we'll do the + * right thing with respect to the rest of the chain. 
+ */ + ob = allocb(vxlan_alloc_size, 0); + if (ob == NULL) + return (ENOMEM); + + ob->b_wptr = DB_LIM(ob); + ob->b_rptr = ob->b_wptr; + ob->b_cont = mp; + } else { + ob = mp; + } + ob->b_rptr -= VXLAN_HDR_LEN; + + vxh = (vxlan_hdr_t *)ob->b_rptr; + vxh->vxlan_flags = ntohl(VXLAN_F_VDI); + vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT); + *outp = ob; + + return (0); +} + +/* ARGSUSED */ +static int +vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop) +{ + vxlan_hdr_t *vxh; + + if (MBLKL(mp) < sizeof (vxlan_hdr_t)) + return (EINVAL); + vxh = (vxlan_hdr_t *)mp->b_rptr; + if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0) + return (EINVAL); + + dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT; + dinfop->ovdi_hdr_size = VXLAN_HDR_LEN; + + return (0); +} + +static int +vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + if (*bufsize < sizeof (struct in6_addr)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + *bufsize = 0; + } else { + bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr)); + *bufsize = sizeof (struct in6_addr); + } + mutex_exit(&vxl->vxl_lock); + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + uint64_t val; + if (*bufsize < sizeof (uint64_t)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + val = vxl->vxl_lport; + bcopy(&val, buf, sizeof (uint64_t)); + *bufsize = sizeof (uint64_t); + mutex_exit(&vxl->vxl_lock); + return (0); + } + + return (EINVAL); +} + +static int +vxlan_o_setprop(void *arg, const char *pr_name, const void *buf, + uint32_t bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + const struct in6_addr *ipv6 = buf; + if (bufsize != sizeof (struct in6_addr)) + return (EINVAL); + + if (IN6_IS_ADDR_V4COMPAT(ipv6)) + return (EINVAL); + + 
if (IN6_IS_ADDR_MULTICAST(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_6TO4(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_V4MAPPED(ipv6)) { + ipaddr_t v4; + IN6_V4MAPPED_TO_IPADDR(ipv6, v4); + if (IN_MULTICAST(v4)) + return (EINVAL); + } + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_hladdr = B_TRUE; + bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr)); + mutex_exit(&vxl->vxl_lock); + + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + const uint64_t *valp = buf; + if (bufsize != 8) + return (EINVAL); + + if (*valp == 0 || *valp > UINT16_MAX) + return (EINVAL); + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_lport = *valp; + mutex_exit(&vxl->vxl_lock); + return (0); + } + return (EINVAL); +} + +static int +vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl) +{ + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[0]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP); + overlay_prop_set_nodefault(phdl); + return (0); + } + + if (strcmp(pr_name, vxlan_props[1]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[1]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + (void) overlay_prop_set_default(phdl, &vxlan_defport, + sizeof (vxlan_defport)); + overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX); + return (0); + } + + return (EINVAL); +} + +static struct overlay_plugin_ops vxlan_o_ops = { + 0, + vxlan_o_init, + vxlan_o_fini, + vxlan_o_encap, + vxlan_o_decap, + vxlan_o_socket, + vxlan_o_sockopt, + vxlan_o_getprop, + vxlan_o_setprop, + vxlan_o_propinfo +}; + +static struct modlmisc vxlan_modlmisc = { + &mod_miscops, + "VXLAN encap plugin" +}; + +static struct modlinkage vxlan_modlinkage = { + MODREV_1, + &vxlan_modlmisc +}; + +int +_init(void) +{ + int err; + overlay_plugin_register_t *ovrp; + + ovrp = overlay_plugin_alloc(OVEP_VERSION); + if (ovrp 
== NULL) + return (ENOTSUP); + ovrp->ovep_name = vxlan_ident; + ovrp->ovep_ops = &vxlan_o_ops; + ovrp->ovep_id_size = VXLAN_ID_LEN; + ovrp->ovep_flags = OVEP_F_VLAN_TAG; + ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT; + ovrp->ovep_props = vxlan_props; + + if ((err = overlay_plugin_register(ovrp)) == 0) { + if ((err = mod_install(&vxlan_modlinkage)) != 0) { + (void) overlay_plugin_unregister(vxlan_ident); + } + } + + overlay_plugin_free(ovrp); + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vxlan_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + if ((err = overlay_plugin_unregister(vxlan_ident)) != 0) + return (err); + + return (mod_remove(&vxlan_modlinkage)); +} diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h index 1f732c2f65..8b4358cb8d 100644 --- a/usr/src/uts/common/netinet/in.h +++ b/usr/src/uts/common/netinet/in.h @@ -3,6 +3,7 @@ * Use is subject to license terms. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index c05f5ca58f..ed4fcbef5a 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -436,6 +436,9 @@ CHKHDRS= \ ontrap.h \ open.h \ openpromio.h \ + overlay.h \ + overlay_common.h \ + overlay_target.h \ panic.h \ param.h \ pathconf.h \ diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h index 2f519a8eda..093a4dc0c3 100644 --- a/usr/src/uts/common/sys/dld_ioc.h +++ b/usr/src/uts/common/sys/dld_ioc.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_DLD_IOC_H @@ -59,6 +60,7 @@ extern "C" { #define IPTUN_IOC 0x454A #define BRIDGE_IOC 0xB81D #define IBPART_IOC 0x6171 +#define OVERLAY_IOC 0x2005 /* GLDv3 modules use these macros to generate unique ioctl commands */ #define DLDIOC(cmdid) DLD_IOC_CMD(DLD_IOC, (cmdid)) @@ -68,6 +70,7 @@ extern "C" { #define IPTUNIOC(cmdid) DLD_IOC_CMD(IPTUN_IOC, (cmdid)) #define BRIDGEIOC(cmdid) DLD_IOC_CMD(BRIDGE_IOC, (cmdid)) #define IBPARTIOC(cmdid) DLD_IOC_CMD(IBPART_IOC, (cmdid)) +#define OVERLAYIOC(cmdid) DLD_IOC_CMD(OVERLAY_IOC, (cmdid)) #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index e2893a2295..b60e53b267 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _DLS_MGMT_H @@ -46,13 +47,15 @@ typedef enum { DATALINK_CLASS_SIMNET = 0x20, DATALINK_CLASS_BRIDGE = 0x40, DATALINK_CLASS_IPTUN = 0x80, - DATALINK_CLASS_PART = 0x100 + DATALINK_CLASS_PART = 0x100, + DATALINK_CLASS_OVERLAY = 0x200 } datalink_class_t; #define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \ DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \ - DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART) + DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \ + DATALINK_CLASS_OVERLAY) /* * A combination of flags and media. 
diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h index 965dca263c..01cb27644c 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -188,6 +188,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t); extern void *mac_get_devinfo(mac_handle_t); extern boolean_t mac_is_vnic(mac_handle_t); +extern boolean_t mac_is_overlay(mac_handle_t); extern uint32_t mac_no_notification(mac_handle_t); extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t); diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index da645ad382..21f2c10a8e 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -609,6 +609,7 @@ struct mac_impl_s { #define MIS_LEGACY 0x0040 #define MIS_NO_ACTIVE 0x0080 #define MIS_POLL_DISABLE 0x0100 +#define MIS_IS_OVERLAY 0x0200 #define mi_getstat mi_callbacks->mc_getstat #define mi_start mi_callbacks->mc_start diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 2cb326814a..431de67ff5 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -109,7 +109,8 @@ typedef enum { MAC_CAPAB_NO_ZCOPY = 0x00100000, /* boolean only, no data */ MAC_CAPAB_LEGACY = 0x00200000, /* data is mac_capab_legacy_t */ MAC_CAPAB_VRRP = 0x00400000, /* data is mac_capab_vrrp_t */ - MAC_CAPAB_TRANSCEIVER = 0x01000000, /* mac_capab_transciever_t */ + MAC_CAPAB_OVERLAY = 0x00800000, /* boolean only, no data */ + MAC_CAPAB_TRANSCEIVER = 0x01000000, /* mac_capab_transceiver_t */ MAC_CAPAB_LED = 0x02000000 /* data is mac_capab_led_t */ } mac_capab_t; diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h new file mode 100644 index 0000000000..12d0dbca51 --- /dev/null +++ b/usr/src/uts/common/sys/overlay.h @@ -0,0 +1,96 @@ +/* + * This file and its contents are supplied under the terms 
of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_H +#define _SYS_OVERLAY_H + +/* + * Overlay device support + */ + +#include <sys/param.h> +#include <sys/dld_ioc.h> +#include <sys/mac.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVERLAY_IOC_CREATE OVERLAYIOC(1) +#define OVERLAY_IOC_DELETE OVERLAYIOC(2) +#define OVERLAY_IOC_PROPINFO OVERLAYIOC(3) +#define OVERLAY_IOC_GETPROP OVERLAYIOC(4) +#define OVERLAY_IOC_SETPROP OVERLAYIOC(5) +#define OVERLAY_IOC_NPROPS OVERLAYIOC(6) +#define OVERLAY_IOC_ACTIVATE OVERLAYIOC(7) +#define OVERLAY_IOC_STATUS OVERLAYIOC(8) + +typedef struct overlay_ioc_create { + datalink_id_t oic_linkid; + uint32_t oic_filler; + uint64_t oic_vnetid; + char oic_encap[MAXLINKNAMELEN]; +} overlay_ioc_create_t; + +typedef struct overlay_ioc_activate { + datalink_id_t oia_linkid; +} overlay_ioc_activate_t; + +typedef struct overlay_ioc_delete { + datalink_id_t oid_linkid; +} overlay_ioc_delete_t; + +typedef struct overlay_ioc_nprops { + datalink_id_t oipn_linkid; + int32_t oipn_nprops; +} overlay_ioc_nprops_t; + +typedef struct overlay_ioc_propinfo { + datalink_id_t oipi_linkid; + int32_t oipi_id; + char oipi_name[OVERLAY_PROP_NAMELEN]; + uint_t oipi_type; + uint_t oipi_prot; + uint8_t oipi_default[OVERLAY_PROP_SIZEMAX]; + uint32_t oipi_defsize; + uint32_t oipi_posssize; + uint8_t oipi_poss[OVERLAY_PROP_SIZEMAX]; +} overlay_ioc_propinfo_t; + +typedef struct overlay_ioc_prop { + datalink_id_t oip_linkid; + int32_t oip_id; + char oip_name[OVERLAY_PROP_NAMELEN]; + uint8_t oip_value[OVERLAY_PROP_SIZEMAX]; + uint32_t oip_size; +} 
overlay_ioc_prop_t; + +typedef enum overlay_status { + OVERLAY_I_OK = 0x00, + OVERLAY_I_DEGRADED = 0x01 +} overlay_status_t; + +typedef struct overlay_ioc_status { + datalink_id_t ois_linkid; + uint_t ois_status; + char ois_message[OVERLAY_STATUS_BUFLEN]; +} overlay_ioc_status_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_H */ diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h new file mode 100644 index 0000000000..5c4b651f2c --- /dev/null +++ b/usr/src/uts/common/sys/overlay_common.h @@ -0,0 +1,65 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_COMMON_H +#define _SYS_OVERLAY_COMMON_H + +/* + * Common overlay definitions + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum overlay_target_mode { + OVERLAY_TARGET_NONE = 0x0, + OVERLAY_TARGET_POINT, + OVERLAY_TARGET_DYNAMIC +} overlay_target_mode_t; + +typedef enum overlay_plugin_dest { + OVERLAY_PLUGIN_D_INVALID = 0x0, + OVERLAY_PLUGIN_D_ETHERNET = 0x1, + OVERLAY_PLUGIN_D_IP = 0x2, + OVERLAY_PLUGIN_D_PORT = 0x4, + OVERLAY_PLUGIN_D_MASK = 0x7 +} overlay_plugin_dest_t; + +typedef enum overlay_prop_type { + OVERLAY_PROP_T_INT = 0x1, /* signed int */ + OVERLAY_PROP_T_UINT, /* unsigned int */ + OVERLAY_PROP_T_IP, /* sinaddr6 */ + OVERLAY_PROP_T_STRING /* OVERLAY_PROP_SIZEMAX */ +} overlay_prop_type_t; + +typedef enum overlay_prop_prot { + OVERLAY_PROP_PERM_REQ = 0x1, + OVERLAY_PROP_PERM_READ = 0x2, + OVERLAY_PROP_PERM_WRITE = 0x4, + OVERLAY_PROP_PERM_RW = 0x6, + OVERLAY_PROP_PERM_RRW = 0x7, + OVERLAY_PROP_PERM_MASK = 0x7 +} overlay_prop_prot_t; + +#define OVERLAY_PROP_NAMELEN 64 +#define OVERLAY_PROP_SIZEMAX 256 +#define OVERLAY_STATUS_BUFLEN 256 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_COMMON_H */ diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h new file mode 100644 index 0000000000..0095c75eeb --- /dev/null +++ b/usr/src/uts/common/sys/overlay_impl.h @@ -0,0 +1,205 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_IMPL_H +#define _SYS_OVERLAY_IMPL_H + +/* + * Overlay device support + */ + +#include <sys/overlay.h> +#include <sys/overlay_common.h> +#include <sys/overlay_plugin.h> +#include <sys/overlay_target.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/avl.h> +#include <sys/ksocket.h> +#include <sys/socket.h> +#include <sys/refhash.h> +#include <sys/ethernet.h> +#include <sys/list.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION_ONE 0x1 + +typedef struct overlay_plugin { + kmutex_t ovp_mutex; + list_node_t ovp_link; /* overlay_plugin_lock */ + uint_t ovp_active; /* ovp_mutex */ + const char *ovp_name; /* RO */ + const overlay_plugin_ops_t *ovp_ops; /* RO */ + const char *const *ovp_props; /* RO */ + uint_t ovp_nprops; /* RO */ + uint_t ovp_id_size; /* RO */ + overlay_plugin_flags_t ovp_flags; /* RO */ + overlay_plugin_dest_t ovp_dest; /* RO */ +} overlay_plugin_t; + +typedef struct overlay_mux { + list_node_t omux_lnode; + ksocket_t omux_ksock; /* RO */ + overlay_plugin_t *omux_plugin; /* RO: associated encap */ + int omux_domain; /* RO: socket domain */ + int omux_family; /* RO: socket family */ + int omux_protocol; /* RO: socket protocol */ + struct sockaddr *omux_addr; /* RO: socket address */ + socklen_t omux_alen; /* RO: sockaddr len */ + kmutex_t omux_lock; /* Protects everything below */ + uint_t omux_count; /* Active instances */ + avl_tree_t omux_devices; /* Tree of devices */ +} overlay_mux_t; + +typedef enum overlay_target_flag { + OVERLAY_T_TEARDOWN = 0x1 +} overlay_target_flag_t; + +typedef struct overlay_target { + kmutex_t ott_lock; + kcondvar_t ott_cond; + overlay_target_mode_t ott_mode; /* RO */ + overlay_plugin_dest_t ott_dest; /* RO */ + uint64_t ott_id; /* RO */ + overlay_target_flag_t ott_flags; /* ott_lock */ + uint_t ott_ocount; /* ott_lock */ + union { /* ott_lock */ + overlay_target_point_t ott_point; + struct overlay_target_dyn { + refhash_t *ott_dhash; + avl_tree_t ott_tree; + 
} ott_dyn; + } ott_u; +} overlay_target_t; + +typedef enum overlay_dev_flag { + OVERLAY_F_ACTIVATED = 0x01, /* Activate ioctl completed */ + OVERLAY_F_IN_MUX = 0x02, /* Currently in a mux */ + OVERLAY_F_IN_TX = 0x04, /* Currently doing tx */ + OVERLAY_F_IN_RX = 0x08, /* Currently doing rx */ + OVERLAY_F_IOMASK = 0x0c, /* A mask for rx and tx */ + OVERLAY_F_MDDROP = 0x10, /* Drop traffic for metadata update */ + OVERLAY_F_STOPMASK = 0x1e, /* None set when stopping */ + OVERLAY_F_VARPD = 0x20, /* varpd plugin exists */ + OVERLAY_F_DEGRADED = 0x40, /* device is degraded */ + OVERLAY_F_MASK = 0x7f /* mask of everything */ +} overlay_dev_flag_t; + +typedef struct overlay_dev { + kmutex_t odd_lock; + kcondvar_t odd_iowait; + list_node_t odd_link; /* overlay_dev_lock */ + mac_handle_t odd_mh; /* RO */ + overlay_plugin_t *odd_plugin; /* RO */ + datalink_id_t odd_linkid; /* RO */ + void *odd_pvoid; /* RO -- only used by plugin */ + uint_t odd_ref; /* protected by odd_lock */ + uint_t odd_mtu; /* protected by odd_lock */ + overlay_dev_flag_t odd_flags; /* protected by odd_lock */ + uint_t odd_rxcount; /* protected by odd_lock */ + uint_t odd_txcount; /* protected by odd_lock */ + overlay_mux_t *odd_mux; /* protected by odd_lock */ + uint64_t odd_vid; /* RO if active else odd_lock */ + avl_node_t odd_muxnode; /* managed by mux */ + overlay_target_t *odd_target; /* See big theory statement */ + char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */ +} overlay_dev_t; + +typedef enum overlay_target_entry_flags { + OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */ + OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */ + OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */ + OVERLAY_ENTRY_F_VALID_MASK = 0x06 +} overlay_target_entry_flags_t; + +typedef struct overlay_target_entry { + kmutex_t ote_lock; + refhash_link_t ote_reflink; /* hashtable link */ + avl_node_t ote_avllink; /* iteration link */ + list_node_t ote_qlink; + overlay_target_entry_flags_t ote_flags; 
/* RW: state flags */ + uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */ + overlay_target_t *ote_ott; /* RO */ + overlay_dev_t *ote_odd; /* RO */ + overlay_target_point_t ote_dest; /* RW: destination */ + mblk_t *ote_chead; /* RW: blocked mb chain head */ + mblk_t *ote_ctail; /* RW: blocked mb chain tail */ + size_t ote_mbsize; /* RW: outstanding mblk size */ + hrtime_t ote_vtime; /* RW: valid timestamp */ +} overlay_target_entry_t; + + +#define OVERLAY_CTL "overlay" + +extern dev_info_t *overlay_dip; + +extern mblk_t *overlay_m_tx(void *, mblk_t *); + +typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *); +extern void overlay_dev_iter(overlay_dev_iter_f, void *); + +extern void overlay_plugin_init(void); +extern overlay_plugin_t *overlay_plugin_lookup(const char *); +extern void overlay_plugin_rele(overlay_plugin_t *); +extern void overlay_plugin_fini(void); +typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *); +extern void overlay_plugin_walk(overlay_plugin_walk_f, void *); + +extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t); +extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t); + +extern void overlay_mux_init(void); +extern void overlay_mux_fini(void); + +extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int, + struct sockaddr *, socklen_t, int *); +extern void overlay_mux_close(overlay_mux_t *); +extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *); +extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *); +extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *); + +extern void overlay_prop_init(overlay_prop_handle_t); + +extern void overlay_target_init(void); +extern int overlay_target_busy(void); +extern int overlay_target_open(dev_t *, int, int, cred_t *); +extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +extern int overlay_target_close(dev_t, int, int, cred_t *); +extern void overlay_target_free(overlay_dev_t *); + 
+#define OVERLAY_TARGET_OK 0 +#define OVERLAY_TARGET_DROP 1 +#define OVERLAY_TARGET_ASYNC 2 +extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *, + socklen_t *); +extern void overlay_target_quiesce(overlay_target_t *); +extern void overlay_target_fini(void); + +extern void overlay_fm_init(void); +extern void overlay_fm_fini(void); +extern void overlay_fm_degrade(overlay_dev_t *, const char *); +extern void overlay_fm_restore(overlay_dev_t *); + +extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t); +extern void overlay_hold_rele(overlay_dev_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_IMPL_H */ diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h new file mode 100644 index 0000000000..3392973562 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_plugin.h @@ -0,0 +1,324 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_PLUGIN_H +#define _SYS_OVERLAY_PLUGIN_H + +/* + * overlay plugin interface for encapsulation/decapsulation modules + * + * This header file defines how encapsulation and decapsulation plugins + * interact within the broader system. At this time, these interfaces are + * considered private to illumos and therefore are subject to change. As we gain + * more experience with a few of the different encapsulation formats, say nvgre + * or geneve, then we can move to make this a more-stable interface. + * + * A plugin is a general kernel module that uses the miscellaneous mod-linkage. 
+ * + * In its _init(9E) routine, it must register itself with the overlay + * subsystem. To do this, it allocates an overlay_plugin_register_t via + * overlay_plugin_alloc(), which it then fills out with various required + * information and then attempts to register with the system via a call to + * overlay_plugin_register(). If that succeeds, it should then call + * mod_install(9F). If the mod_install(9F) fails, then it should call + * overlay_plugin_unregister(). Regardless of success or failure, it should call + * overlay_plugin_free() to ensure that any memory that may be associated with + * the registration is freed. + * + * When the module's _fini(9E) is called, overlay_plugin_unregister() should be + * called first. It may return an error, such as EBUSY. In such cases, it should + * be returned as the return status of _fini(9E). This is quite necessary: it + * ensures that if the module is in use it doesn't get unloaded out from under + * the broader subsystem while it's still in use. A driver can use that to + * know that there are no current instances of its private data. + * + * ------------------ + * Plugin Definitions + * ------------------ + * + * A plugin is required to fill in both an operations vector and a series of + * information to the callback routine. Here are the routines and their + * purposes. The full signatures are available below. + * + * overlay_plugin_init_t + * + * This interface is used to create a new instance of a plugin. An instance + * of a plugin will be created for each overlay device that is created. For + * example, if devices are created with VXLAN ID 23 and ID 42, then there + * will be two different calls to this function. + * + * This function gives the plugin a chance to create a private data + * structure that will be returned on subsequent calls to the system. + * + * overlay_plugin_fini_t + * + * This is the opposite of overlay_plugin_init_t. 
It will be called when it + * is safe to remove any private data that is associated with this instance + * of the plugin. + * + * overlay_plugin_propinfo_t + * + * This is called with the name of a property that is registered when the + * plugin is created. This function will be called with the name of the + * property that information is being requested about. The plugin is + * responsible for filling out information such as setting the name, the + * type of property it is, the protection of the property (can a user + * update it?), whether the property is required, an optional default value + * for the property, and an optional set of values or ranges that are + * allowed. + * + * overlay_plugin_getprop_t + * + * Return the value of the named property from the current instance of the + * plugin. + * + * overlay_plugin_setprop_t + * + * Set the value of the named property to the specified value for the + * current instance of the plugin. Note, that it is the plugin's + * responsibility to ensure that the value of the property is valid and to + * update state as appropriate. + * + * overlay_plugin_socket_t + * + * Every overlay device has a corresponding socket that it uses to send and + * receive traffic. This routine is used to get the parameters that should + * be used to define such a socket. The actual socket may be multiplexed + * with other uses of it. + * + * overlay_plugin_sockopt_t + * + * Allow a plugin to set any necessary socket options that it needs on the + * kernel socket that is being used by a mux. This will only be called once + * for a given mux, if additional devices are added to a mux, it will not + * be called additional times. + * + * overlay_plugin_encap_t + * + * In this routine you're given a message block and information about the + * packet, such as the identifier and are asked to fill out a message block + * that represents the encapsulation header and optionally manipulate the + * input message if required. 
+ * + * overlay_plugin_decap_t + * + * In this routine, you're given the encapsulated message block. The + * requirement is to decapsulate it and determine what is the correct + * overlay identifier for this network and to fill in the header size so + * the broader system knows how much of this data should be considered + * consumed. + * + * ovpo_callbacks + * + * This should be set to zero; it's reserved for future use. + * + * Once these properties are defined, the module should define the following + * members in the overlay_plugin_register_t. + * + * ovep_version + * + * Should be set to the value of the macro OVEP_VERSION. + * + * ovep_name + * + * Should be set to a character string that has the name of the module. + * Generally this should match the name of the kernel module; however, this + * is the name that users will use to refer to this module when creating + * devices. + * + * ovep_ops + * + * Should point to an overlay_plugin_ops_t that is filled in with the + * functions as described above. + * + * ovep_props + * + * This is an array of character strings that holds the names of the + * properties of the encapsulation plugin. + * + * + * ovep_id_size + * + * This is the size in bytes of the valid range for the identifier. The + * valid identifier range is considered an ovep_id_size-byte unsigned + * integer, [ 0, 1 << (ovep_id_size * 8) ). + * + * ovep_flags + * + * A series of flags that indicate optional features that are supported. + * Valid flags include: + * + * OVEP_F_VLAN_TAG + * + * The encapsulation format allows for the encapsulated + * packet to maintain a VLAN tag. + * + * ovep_dest + * + * Describes the kind of destination that the overlay plugin supports for + * sending traffic. For example, vxlan uses UDP, therefore it requires both + * an IP address and a port; however, nvgre uses the gre header and + * therefore only requires an IP address. 
The following flags may be + * combined: + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * Indicates that to send a packet to its destination, we + * require a link-layer ethernet address. + * + * OVERLAY_PLUGIN_D_IP + * + * Indicates that to send a packet to its destination, we + * require an IP address. Note, all IP addresses are + * transmitted as IPv6 addresses and for an IPv4 + * destination, using an IPv4-mapped IPv6 address is the + * expected way to transmit that. + * + * OVERLAY_PLUGIN_D_PORT + * + * Indicates that to send a packet to its destination, a + * port is required, this usually indicates that the + * protocol uses something like TCP or UDP. + * + * + * ------------------------------------------------- + * Downcalls, Upcalls, and Synchronization Guarantees + * ------------------------------------------------- + * + * Every instance of a given module is independent. The kernel only guarantees + * that it will probably perform downcalls into different instances in parallel + * at some point. No locking is provided by the framework for synchronization + * across instances. If a module finds itself needing that, it will be up to it + * to provide it. + * + * In a given instance, the kernel may call into entry points in parallel. If + * the instance has private data, it should likely synchronize it. The one + * guarantee that we do make, is that calls to getprop and setprop will be done + * synchronized by a caller holding the MAC perimeter. + * + * While servicing a downcall from the general overlay device framework, a + * kernel module should not make any upcalls, excepting those functions that are + * defined in this header file, eg. the property related callbacks. Importantly, + * it cannot make any assumptions about what locks may or may not be held by the + * broader system. The only thing that it is safe for it to use are its own + * locks. 
+ * + * ---------------- + * Downcall Context + * ---------------- + * + * For all of the downcalls, excepting the overlay_plugin_encap_t and + * overlay_plugin_decap_t, the calls will be made either in kernel or user + * context, the module should not assume either way. + * + * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user, + * kernel or interrupt context; however, it is guaranteed that the interrupt + * will be below LOCK_LEVEL, and therefore it is safe to grab locks. + */ + +#include <sys/stream.h> +#include <sys/mac_provider.h> +#include <sys/ksocket.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION 0x1 + +typedef enum overlay_plugin_flags { + OVEP_F_VLAN_TAG = 0x01 /* Supports VLAN Tags */ +} overlay_plugin_flags_t; + +/* + * The ID space could easily be more than a 64-bit number, even + * though today it's either a 24-64 bit value. How should we future + * proof ourselves here? + */ +typedef struct ovep_encap_info { + uint64_t ovdi_id; + size_t ovdi_hdr_size; +} ovep_encap_info_t; + +typedef struct __overlay_prop_handle *overlay_prop_handle_t; +typedef struct __overlay_handle *overlay_handle_t; + +/* + * Plugins are guaranteed that calls to setprop are serialized. However, any + * number of other calls can be going on in parallel otherwise. 
+ */ +typedef int (*overlay_plugin_encap_t)(void *, mblk_t *, + ovep_encap_info_t *, mblk_t **); +typedef int (*overlay_plugin_decap_t)(void *, mblk_t *, + ovep_encap_info_t *); +typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **); +typedef void (*overlay_plugin_fini_t)(void *); +typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *, + struct sockaddr *, socklen_t *); +typedef int (*overlay_plugin_sockopt_t)(ksocket_t); +typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *, + uint32_t *); +typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *, + uint32_t); +typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t); + +typedef struct overlay_plugin_ops { + uint_t ovpo_callbacks; + overlay_plugin_init_t ovpo_init; + overlay_plugin_fini_t ovpo_fini; + overlay_plugin_encap_t ovpo_encap; + overlay_plugin_decap_t ovpo_decap; + overlay_plugin_socket_t ovpo_socket; + overlay_plugin_sockopt_t ovpo_sockopt; + overlay_plugin_getprop_t ovpo_getprop; + overlay_plugin_setprop_t ovpo_setprop; + overlay_plugin_propinfo_t ovpo_propinfo; +} overlay_plugin_ops_t; + +typedef struct overlay_plugin_register { + uint_t ovep_version; + const char *ovep_name; + const overlay_plugin_ops_t *ovep_ops; + const char **ovep_props; + uint_t ovep_id_size; + uint_t ovep_flags; + uint_t ovep_dest; +} overlay_plugin_register_t; + +/* + * Functions that interact with registration + */ +extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t); +extern void overlay_plugin_free(overlay_plugin_register_t *); +extern int overlay_plugin_register(overlay_plugin_register_t *); +extern int overlay_plugin_unregister(const char *); + +/* + * Property information callbacks + */ +extern void overlay_prop_set_name(overlay_prop_handle_t, const char *); +extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t); +extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t); +extern int 
overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t); +extern void overlay_prop_set_nodefault(overlay_prop_handle_t); +extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t, + uint32_t); +extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h new file mode 100644 index 0000000000..775c7d27b8 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_target.h @@ -0,0 +1,295 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#ifndef _OVERLAY_TARGET_H +#define _OVERLAY_TARGET_H + +/* + * Overlay device varpd ioctl interface (/dev/overlay) + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <netinet/in.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct overlay_target_point { + uint8_t otp_mac[ETHERADDRL]; + struct in6_addr otp_ip; + uint16_t otp_port; +} overlay_target_point_t; + +#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8)) + +#define OVERLAY_TARG_INFO (OVERLAY_TARG_IOCTL | 0x01) + +typedef enum overlay_targ_info_flags { + OVERLAY_TARG_INFO_F_ACTIVE = 0x01, + OVERLAY_TARG_INFO_F_DEGRADED = 0x02 +} overlay_targ_info_flags_t; + +/* + * Get target information about an overlay device + */ +typedef struct overlay_targ_info { + datalink_id_t oti_linkid; + uint32_t oti_needs; + uint64_t oti_flags; + uint64_t oti_vnetid; +} overlay_targ_info_t; + +/* + * Declare an association between a given varpd instance and a datalink. + */ +#define OVERLAY_TARG_ASSOCIATE (OVERLAY_TARG_IOCTL | 0x02) + +typedef struct overlay_targ_associate { + datalink_id_t ota_linkid; + uint32_t ota_mode; + uint64_t ota_id; + uint32_t ota_provides; + overlay_target_point_t ota_point; +} overlay_targ_associate_t; + +/* + * Remove an association from a device. If the device has already been started, + * this implies OVERLAY_TARG_DEGRADE. + */ +#define OVERLAY_TARG_DISASSOCIATE (OVERLAY_TARG_IOCTL | 0x3) + +/* + * Tells the kernel that while a varpd instance still exists, it basically isn't + * making any forward progress, so the device should consider itself degraded. + */ +#define OVERLAY_TARG_DEGRADE (OVERLAY_TARG_IOCTL | 0x4) + +typedef struct overlay_targ_degrade { + datalink_id_t otd_linkid; + uint32_t otd_pad; + char otd_buf[OVERLAY_STATUS_BUFLEN]; +} overlay_targ_degrade_t; + +/* + * Tells the kernel to remove the degraded status that it set on a device. 
+ */ +#define OVERLAY_TARG_RESTORE (OVERLAY_TARG_IOCTL | 0x5) + +typedef struct overlay_targ_id { + datalink_id_t otid_linkid; +} overlay_targ_id_t; + +/* + * The following ioctls are all used to support dynamic lookups from userland, + * generally serviced by varpd. + * + * The way this is designed to work is that user land will have threads sitting + * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit + * waiting for work for up to approximately one second of time before they will + * be sent back out to user land to give user land a chance to clean itself up + * or more generally, come back into the kernel for work. Once these threads + * return, they will have a request with which more action can be done. The + * following ioctls can all be used to answer the request. + * + * OVERLAY_TARG_RESPOND - overlay_targ_resp_t + * + * The overlay_targ_resp_t has the appropriate information from + * which a reply can be generated. The information is filled into + * an overlay_targ_point_t as appropriate based on the + * overlay_plugin_dest_t type. + * + * + * OVERLAY_TARG_DROP - overlay_targ_resp_t + * + * The overlay_targ_resp_t should identify a request for which to + * drop a packet. + * + * + * OVERLAY_TARG_INJECT - overlay_targ_pkt_t + * + * The overlay_targ_pkt_t injects a fully formed packet into the + * virtual network. It may either be identified by its data link id + * or by the request id. If both are specified, the + * datalink id will be used. Note, that an injection is not + * considered a reply and if this corresponds to a request, then + * that individual packet must still be dropped. + * + * + * OVERLAY_TARG_PKT - overlay_targ_pkt_t + * + * This ioctl can be used to copy data from a given request into a + * user buffer. This can be used in combination with + * OVERLAY_TARG_INJECT to implement services such as a proxy-arp. 
+ * + * + * OVERLAY_TARG_RESEND - overlay_targ_pkt_t + * + * This ioctl is similar to the OVERLAY_TARG_INJECT, except instead + * of receiving it on the local mac handle, it queues it for + * retransmission again. This is useful if you have a packet that + * was originally destined for some broadcast or multicast address + * that you now want to send to a unicast address. + */ +#define OVERLAY_TARG_LOOKUP (OVERLAY_TARG_IOCTL | 0x10) +#define OVERLAY_TARG_RESPOND (OVERLAY_TARG_IOCTL | 0x11) +#define OVERLAY_TARG_DROP (OVERLAY_TARG_IOCTL | 0x12) +#define OVERLAY_TARG_INJECT (OVERLAY_TARG_IOCTL | 0x13) +#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14) +#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15) + +typedef struct overlay_targ_lookup { + uint64_t otl_dlid; + uint64_t otl_reqid; + uint64_t otl_varpdid; + uint64_t otl_vnetid; + uint64_t otl_hdrsize; + uint64_t otl_pktsize; + uint8_t otl_srcaddr[ETHERADDRL]; + uint8_t otl_dstaddr[ETHERADDRL]; + uint32_t otl_dsttype; + uint32_t otl_sap; + int32_t otl_vlan; +} overlay_targ_lookup_t; + +typedef struct overlay_targ_resp { + uint64_t otr_reqid; + overlay_target_point_t otr_answer; +} overlay_targ_resp_t; + +typedef struct overlay_targ_pkt { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + void *otp_buf; +} overlay_targ_pkt_t; + +#ifdef _KERNEL + +#pragma pack(4) +typedef struct overlay_targ_pkt32 { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + caddr32_t otp_buf; +} overlay_targ_pkt32_t; +#pragma pack() + +#endif /* _KERNEL */ + +/* + * This provides a way to get a list of active overlay devices independently + * from dlmgmtd. At the end of the day the kernel always knows what will exist + * and this allows varpd which is an implementation of libdladm not to end up + * needing to call back into dlmgmtd via libdladm and create an unfortunate + * dependency cycle. 
+ */ + +#define OVERLAY_TARG_LIST (OVERLAY_TARG_IOCTL | 0x20) + +typedef struct overlay_targ_list { + uint32_t otl_nents; + uint32_t otl_ents[]; +} overlay_targ_list_t; + +/* + * The following family of ioctls all manipulate the target cache of a given + * device. + * + * OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t + * + * The overlay_targ_cache_t should have its link identifier and + * the desired mac address filled in. On return, it will fill in + * the otc_dest member, if the entry exists in the table. + * + * + * OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t + * + * The cache table entry of the mac address referred to by otc_mac + * and otc_linkid will be filled in with the details provided in + * the otc_dest member. + * + * OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t + * + * Removes the cache entry identified by otc_mac from the table. + * Note that this does not stop any in-flight lookups or deal with + * any data that is awaiting a lookup. + * + * + * OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t + * + * Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the + * entire table identified by otc_linkid. All other parameters are + * ignored. + * + * + * OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t + * + * Iterates over the contents of a target cache identified by + * otci_linkid. Iteration is guaranteed to be exactly once for + * items which are in the hashtable at the beginning and end of + * iteration. For items which are added or removed after iteration + * has begun, only at-most-once semantics are guaranteed. Consumers + * should ensure that otci_marker is zeroed before starting + * iteration and should preserve its contents across calls. + * + * Before calling in, otci_count should be set to the number of + * entries that space has been allocated for in otci_ents. The + * value will be updated to indicate the total number written out. 
+ */ + +#define OVERLAY_TARG_CACHE_GET (OVERLAY_TARG_IOCTL | 0x30) +#define OVERLAY_TARG_CACHE_SET (OVERLAY_TARG_IOCTL | 0x31) +#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32) +#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33) +#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34) + +/* + * This is a pretty arbitrary number that we're constraining ourselves to + * for iteration. Basically the goal is to make sure that we can't have a user + * ask us to allocate too much memory on their behalf at any time. A more + * dynamic form may be necessary some day. + */ +#define OVERLAY_TARGET_ITER_MAX 500 + +#define OVERLAY_TARGET_CACHE_DROP 0x01 + +typedef struct overlay_targ_cache_entry { + uint8_t otce_mac[ETHERADDRL]; + uint16_t otce_flags; + overlay_target_point_t otce_dest; +} overlay_targ_cache_entry_t; + +typedef struct overlay_targ_cache { + datalink_id_t otc_linkid; + overlay_targ_cache_entry_t otc_entry; +} overlay_targ_cache_t; + +typedef struct overlay_targ_cache_iter { + datalink_id_t otci_linkid; + uint32_t otci_pad; + uint64_t otci_marker; + uint16_t otci_count; + uint8_t otci_pad2[3]; + overlay_targ_cache_entry_t otci_ents[]; +} overlay_targ_cache_iter_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _OVERLAY_TARGET_H */ diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index b586675d85..290eae88ff 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -706,6 +706,12 @@ MAC_KMODS += mac_wifi MAC_KMODS += mac_ib # +# Overlay related modules (/kernel/overlay) +# +DRV_KMODS += overlay +OVERLAY_KMODS += vxlan + +# # socketmod (kernel/socketmod) # SOCKET_KMODS += sockpfp diff --git a/usr/src/uts/intel/overlay/Makefile b/usr/src/uts/intel/overlay/Makefile new file mode 100644 index 0000000000..deb77fcd6d --- /dev/null +++ b/usr/src/uts/intel/overlay/Makefile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and 
Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +UTSBASE = ../.. + +MODULE = overlay +OBJECTS = $(OVERLAY_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) $(SRC_CONFFILE) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) +CONF_SRCDIR = $(UTSBASE)/common/io/overlay +MAPFILE = $(UTSBASE)/common/io/overlay/overlay.mapfile + +LDFLAGS += -Nmisc/mac -Ndrv/dld -Nmisc/dls -Nmisc/ksocket + +# needs work +SMATCH=off + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/vxlan/Makefile b/usr/src/uts/intel/vxlan/Makefile new file mode 100644 index 0000000000..530c66ee4c --- /dev/null +++ b/usr/src/uts/intel/vxlan/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +UTSBASE = ../.. 
+ +MODULE = vxlan +OBJECTS = $(OVERLAY_VXLAN_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_OVERLAY_DIR)/$(MODULE) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +LDFLAGS += -Ndrv/overlay -Ndrv/ip + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ |