summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authorAndy Fiddaman <omnios@citrus-it.co.uk>2021-04-19 13:14:49 +0000
committerAndy Fiddaman <omnios@citrus-it.co.uk>2021-07-12 21:32:19 +0000
commit069b2ef0d51cd626922df94af789ca0dc322222d (patch)
tree02799a0e51840262389c128c1bd046ca343c1d25 /usr/src
parentcfcb628093a526b143a37c1a2f112f2c7591ed70 (diff)
downloadillumos-joyent-069b2ef0d51cd626922df94af789ca0dc322222d.tar.gz
13738 Wire up bhyve's network backends for illumos
Reviewed by: Robert Mustacchi <rm@fingolfin.org> Reviewed by: Jorge Schrauwen <sjorge@blackdot.be> Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/bhyve/Makefile9
-rw-r--r--usr/src/cmd/bhyve/README.sync5
-rw-r--r--usr/src/cmd/bhyve/net_backends.c386
-rw-r--r--usr/src/cmd/bhyve/net_backends.h4
-rw-r--r--usr/src/cmd/bhyve/pci_e82545.c239
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_net.c931
-rw-r--r--usr/src/compat/bhyve/sys/cdefs.h7
-rw-r--r--usr/src/man/man1m/bhyve.1m47
-rw-r--r--usr/src/man/man4/bhyve_config.421
9 files changed, 848 insertions, 801 deletions
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
index e12878f71a..bbc966d67f 100644
--- a/usr/src/cmd/bhyve/Makefile
+++ b/usr/src/cmd/bhyve/Makefile
@@ -44,6 +44,7 @@ SRCS = acpi.c \
mem.c \
mevent.c \
mptbl.c \
+ net_backends.c \
net_utils.c \
pci_ahci.c \
pci_e82545.c \
@@ -98,11 +99,6 @@ SRCS = acpi.c \
#hda_codec.c \
#pci_hda.c \
-# The bhyve generic net-backend stuff has been ignored by us at the moment
-# because illumos users prefer to use viona for its superior network perf.
- #net_backends.c \
-
-
OBJS = $(SRCS:.c=.o)
CLOBBERFILES = $(ROOTUSRSBINPROG)
@@ -113,7 +109,8 @@ MEVENT_TEST_OBJS = $(MEVENT_TEST_SRCS:.c=.o)
CLEANFILES = $(PROG) $(MEVENT_TEST_PROG) $(MEVENT_TEST_OBJS)
-CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -_gcc=-Wno-parentheses
+CFLAGS += $(CCVERBOSE)
+CFLAGS += -_gcc=-Wimplicit-function-declaration -_gcc=-Wno-parentheses
CPPFLAGS = -I$(COMPAT)/bhyve -I$(CONTRIB)/bhyve \
-I$(COMPAT)/bhyve/amd64 -I$(CONTRIB)/bhyve/amd64 \
-I$(CONTRIB)/bhyve/dev/usb/controller \
diff --git a/usr/src/cmd/bhyve/README.sync b/usr/src/cmd/bhyve/README.sync
index bed6671c73..4f71c1420e 100644
--- a/usr/src/cmd/bhyve/README.sync
+++ b/usr/src/cmd/bhyve/README.sync
@@ -19,11 +19,6 @@ Date: Wed Mar 24 09:29:15 2021 -0700
Divergence Notes:
-A previous sync skipped commit 0ff7076bdbc6dae5ea44c0acdb567e1cede199d1 which
-introduced a generic backend functionality to network devices. Without that in
-place, subsequent updates reflect the absence of that subsystem. Integrating
-net backends has not been a priority, given the common use of viona on illumos.
-
The draft Save/Restore functionality, added in FreeBSD commit
483d953a86a2507355f8287c5107dc827a0ff516, has not been synced into illumos bhyve
yet. It is not built by default in FreeBSD, so we're not interested in taking
diff --git a/usr/src/cmd/bhyve/net_backends.c b/usr/src/cmd/bhyve/net_backends.c
index 30c26aea45..3f86b31ded 100644
--- a/usr/src/cmd/bhyve/net_backends.c
+++ b/usr/src/cmd/bhyve/net_backends.c
@@ -46,10 +46,12 @@ __FBSDID("$FreeBSD$");
#include <sys/uio.h>
#include <net/if.h>
+#ifdef __FreeBSD__
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>
+#endif
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
@@ -75,6 +77,11 @@ __FBSDID("$FreeBSD$");
#include <netgraph.h>
#endif
+#ifndef __FreeBSD__
+#include <libdlpi.h>
+#include <net/ethernet.h>
+#endif
+
#include "config.h"
#include "debug.h"
#include "iov.h"
@@ -151,6 +158,10 @@ struct net_backend {
int (*set_cap)(struct net_backend *be, uint64_t features,
unsigned int vnet_hdr_len);
+#ifndef __FreeBSD__
+ int (*get_mac)(struct net_backend *be, void *, size_t *);
+#endif
+
struct pci_vtnet_softc *sc;
int fd;
@@ -175,6 +186,8 @@ SET_DECLARE(net_backend_set, struct net_backend);
#define WPRINTF(params) PRINTLN params
+#ifdef __FreeBSD__
+
/*
* The tap backend
*/
@@ -893,6 +906,336 @@ static struct net_backend vale_backend = {
DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
+#else /* __FreeBSD__ */
+
+/*
+ * The illumos dlpi backend
+ */
+
+/*
+ * The size of the bounce buffer used to implement the peek callback.
+ * This value should be big enough to accommodate the largest of all possible
+ * frontend packet lengths. The value here matches the definition of
+ * VTNET_MAX_PKT_LEN in pci_virtio_net.c
+ */
+#define DLPI_BBUF_SIZE (65536 + 64)
+
+typedef struct be_dlpi_priv {
+ dlpi_handle_t bdp_dhp;
+ struct mevent *bdp_mevp;
+ /*
+ * A bounce buffer that allows us to implement the peek_recvlen
+ * callback. Each structure is only used by a single thread so
+ * one is enough.
+ */
+ uint8_t bdp_bbuf[DLPI_BBUF_SIZE];
+ ssize_t bdp_bbuflen;
+} be_dlpi_priv_t;
+
+static void
+be_dlpi_cleanup(net_backend_t *be)
+{
+ be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
+
+ if (priv->bdp_dhp != NULL)
+ dlpi_close(priv->bdp_dhp);
+ priv->bdp_dhp = NULL;
+
+ if (priv->bdp_mevp != NULL)
+ mevent_delete(priv->bdp_mevp);
+ priv->bdp_mevp = NULL;
+
+ priv->bdp_bbuflen = 0;
+ be->fd = -1;
+}
+
+static void
+be_dlpi_err(int ret, const char *dev, char *msg)
+{
+ WPRINTF(("%s: %s (%s)", dev, msg, dlpi_strerror(ret)));
+}
+
+static int
+be_dlpi_init(net_backend_t *be, const char *devname __unused,
+ nvlist_t *nvl, net_be_rxeof_t cb, void *param)
+{
+ be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
+ const char *vnic;
+ int ret;
+
+ if (cb == NULL) {
+ WPRINTF(("dlpi backend requires non-NULL callback"));
+ return (-1);
+ }
+
+ vnic = get_config_value_node(nvl, "vnic");
+ if (vnic == NULL) {
+ WPRINTF(("dlpi backend requires a VNIC"));
+ return (-1);
+ }
+
+ priv->bdp_bbuflen = 0;
+
+ ret = dlpi_open(vnic, &priv->bdp_dhp, DLPI_RAW);
+
+ if (ret != DLPI_SUCCESS) {
+ be_dlpi_err(ret, vnic, "open failed");
+ goto error;
+ }
+
+ if ((ret = dlpi_bind(priv->bdp_dhp, DLPI_ANY_SAP, NULL)) !=
+ DLPI_SUCCESS) {
+ be_dlpi_err(ret, vnic, "bind failed");
+ goto error;
+ }
+
+ if (get_config_bool_node_default(nvl, "promiscrxonly", true)) {
+ if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_RX_ONLY)) !=
+ DLPI_SUCCESS) {
+ be_dlpi_err(ret, vnic,
+ "enable promiscuous mode(rxonly) failed");
+ goto error;
+ }
+ }
+ if (get_config_bool_node_default(nvl, "promiscphys", false)) {
+ if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_PHYS)) !=
+ DLPI_SUCCESS) {
+ be_dlpi_err(ret, vnic,
+ "enable promiscuous mode(physical) failed");
+ goto error;
+ }
+ }
+ if (get_config_bool_node_default(nvl, "promiscsap", true)) {
+ if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_SAP)) !=
+ DLPI_SUCCESS) {
+ be_dlpi_err(ret, vnic,
+ "enable promiscuous mode(SAP) failed");
+ goto error;
+ }
+ }
+ if (get_config_bool_node_default(nvl, "promiscmulti", true)) {
+ if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_MULTI)) !=
+ DLPI_SUCCESS) {
+ be_dlpi_err(ret, vnic,
+ "enable promiscuous mode(muticast) failed");
+ goto error;
+ }
+ }
+
+ be->fd = dlpi_fd(priv->bdp_dhp);
+
+ if (fcntl(be->fd, F_SETFL, O_NONBLOCK) < 0) {
+ WPRINTF(("%s: enable O_NONBLOCK failed", vnic));
+ goto error;
+ }
+
+ priv->bdp_mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
+ if (priv->bdp_mevp == NULL) {
+ WPRINTF(("Could not register event"));
+ goto error;
+ }
+
+ return (0);
+
+error:
+ be_dlpi_cleanup(be);
+ return (-1);
+}
+
+/*
+ * Called to send a buffer chain out to the dlpi device
+ */
+static ssize_t
+be_dlpi_send(net_backend_t *be, const struct iovec *iov, int iovcnt)
+{
+ be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
+ ssize_t len = 0;
+ int ret;
+
+ if (iovcnt == 1) {
+ len = iov[0].iov_len;
+ ret = dlpi_send(priv->bdp_dhp, NULL, 0, iov[0].iov_base, len,
+ NULL);
+ } else {
+ void *buf = NULL;
+
+ len = iov_to_buf(iov, iovcnt, &buf);
+
+ if (len <= 0 || buf == NULL)
+ return (-1);
+
+ ret = dlpi_send(priv->bdp_dhp, NULL, 0, buf, len, NULL);
+ free(buf);
+ }
+
+ if (ret != DLPI_SUCCESS)
+ return (-1);
+
+ return (len);
+}
+
+static ssize_t
+be_dlpi_peek_recvlen(net_backend_t *be)
+{
+ be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
+ dlpi_recvinfo_t recv;
+ size_t len;
+ int ret;
+
+ /*
+ * We already have a packet in the bounce buffer.
+ * Just return its length.
+ */
+ if (priv->bdp_bbuflen > 0)
+ return (priv->bdp_bbuflen);
+
+ /*
+ * Read the next packet (if any) into the bounce buffer, so
+ * that we get to know its length and we can return that
+ * to the caller.
+ */
+ len = sizeof (priv->bdp_bbuf);
+ ret = dlpi_recv(priv->bdp_dhp, NULL, NULL, priv->bdp_bbuf, &len,
+ 0, &recv);
+ if (ret == DL_SYSERR) {
+ if (errno == EWOULDBLOCK)
+ return (0);
+ return (-1);
+ } else if (ret == DLPI_ETIMEDOUT) {
+ return (0);
+ } else if (ret != DLPI_SUCCESS) {
+ return (-1);
+ }
+
+ if (recv.dri_totmsglen > sizeof (priv->bdp_bbuf)) {
+ EPRINTLN("DLPI bounce buffer was too small! - needed %x bytes",
+ recv.dri_totmsglen);
+ }
+
+ priv->bdp_bbuflen = len;
+
+ return (len);
+}
+
+static ssize_t
+be_dlpi_recv(net_backend_t *be, const struct iovec *iov, int iovcnt)
+{
+ be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
+ size_t len;
+ int ret;
+
+ if (priv->bdp_bbuflen > 0) {
+ /*
+ * A packet is available in the bounce buffer, so
+ * we read it from there.
+ */
+ len = buf_to_iov(priv->bdp_bbuf, priv->bdp_bbuflen,
+ iov, iovcnt, 0);
+
+ /* Mark the bounce buffer as empty. */
+ priv->bdp_bbuflen = 0;
+
+ return (len);
+ }
+
+ len = iov[0].iov_len;
+ ret = dlpi_recv(priv->bdp_dhp, NULL, NULL,
+ (uint8_t *)iov[0].iov_base, &len, 0, NULL);
+ if (ret == DL_SYSERR) {
+ if (errno == EWOULDBLOCK)
+ return (0);
+ return (-1);
+ } else if (ret == DLPI_ETIMEDOUT) {
+ return (0);
+ } else if (ret != DLPI_SUCCESS) {
+ return (-1);
+ }
+
+ return (len);
+}
+
+static void
+be_dlpi_recv_enable(net_backend_t *be)
+{
+ be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
+
+ mevent_enable(priv->bdp_mevp);
+}
+
+static void
+be_dlpi_recv_disable(net_backend_t *be)
+{
+ be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
+
+ mevent_disable(priv->bdp_mevp);
+}
+
+static uint64_t
+be_dlpi_get_cap(net_backend_t *be)
+{
+ return (0); /* no capabilities for now */
+}
+
+static int
+be_dlpi_set_cap(net_backend_t *be, uint64_t features,
+ unsigned vnet_hdr_len)
+{
+ return ((features || vnet_hdr_len) ? -1 : 0);
+}
+
+static int
+be_dlpi_get_mac(net_backend_t *be, void *buf, size_t *buflen)
+{
+ be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
+ uchar_t physaddr[DLPI_PHYSADDR_MAX];
+ size_t physaddrlen = DLPI_PHYSADDR_MAX;
+ int ret;
+
+ if ((ret = dlpi_get_physaddr(priv->bdp_dhp, DL_CURR_PHYS_ADDR,
+ physaddr, &physaddrlen)) != DLPI_SUCCESS) {
+ be_dlpi_err(ret, dlpi_linkname(priv->bdp_dhp),
+ "read MAC address failed");
+ return (EINVAL);
+ }
+
+ if (physaddrlen != ETHERADDRL) {
+ WPRINTF(("%s: bad MAC address len %d",
+ dlpi_linkname(priv->bdp_dhp), physaddrlen));
+ return (EINVAL);
+ }
+
+ if (physaddrlen > *buflen) {
+ WPRINTF(("%s: MAC address too long (%d bytes required)",
+ dlpi_linkname(priv->bdp_dhp), physaddrlen));
+ return (ENOMEM);
+ }
+
+ *buflen = physaddrlen;
+ memcpy(buf, physaddr, *buflen);
+
+ return (0);
+}
+
+static struct net_backend dlpi_backend = {
+ .prefix = "dlpi",
+ .priv_size = sizeof(struct be_dlpi_priv),
+ .init = be_dlpi_init,
+ .cleanup = be_dlpi_cleanup,
+ .send = be_dlpi_send,
+ .peek_recvlen = be_dlpi_peek_recvlen,
+ .recv = be_dlpi_recv,
+ .recv_enable = be_dlpi_recv_enable,
+ .recv_disable = be_dlpi_recv_disable,
+ .get_cap = be_dlpi_get_cap,
+ .set_cap = be_dlpi_set_cap,
+ .get_mac = be_dlpi_get_mac,
+};
+
+DATA_SET(net_backend_set, dlpi_backend);
+
+#endif /* __FreeBSD__ */
+
+#ifdef __FreeBSD__
int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
@@ -911,6 +1254,35 @@ netbe_legacy_config(nvlist_t *nvl, const char *opts)
free(backend);
return (pci_parse_legacy_config(nvl, cp + 1));
}
+#else
+int
+netbe_legacy_config(nvlist_t *nvl, const char *opts)
+{
+ char *config, *name, *tofree, *value;
+
+ if (opts == NULL)
+ return (0);
+
+ /* Default to the 'dlpi' backend - can still be overridden by opts */
+ set_config_value_node(nvl, "backend", "dlpi");
+
+ config = tofree = strdup(opts);
+ if (config == NULL)
+ err(4, "netbe_legacy_config strdup()");
+ while ((name = strsep(&config, ",")) != NULL) {
+ value = strchr(name, '=');
+ if (value != NULL) {
+ *value++ = '\0';
+ set_config_value_node(nvl, name, value);
+ } else {
+ set_config_value_node(nvl, "vnic", name);
+ }
+ }
+ free(tofree);
+
+ return (0);
+}
+#endif
/*
* Initialize a backend and attach to the frontend.
@@ -1066,7 +1438,11 @@ netbe_rx_discard(struct net_backend *be)
static uint8_t dummybuf[65536 + 64];
struct iovec iov;
+#ifdef __FreeBSD__
iov.iov_base = dummybuf;
+#else
+ iov.iov_base = (caddr_t)dummybuf;
+#endif
iov.iov_len = sizeof(dummybuf);
return netbe_recv(be, &iov, 1);
@@ -1092,3 +1468,13 @@ netbe_get_vnet_hdr_len(struct net_backend *be)
return (be->be_vnet_hdr_len);
}
+
+#ifndef __FreeBSD__
+int
+netbe_get_mac(net_backend_t *be, void *buf, size_t *buflen)
+{
+ if (be->get_mac == NULL)
+ return (ENOTSUP);
+ return (be->get_mac(be, buf, buflen));
+}
+#endif
diff --git a/usr/src/cmd/bhyve/net_backends.h b/usr/src/cmd/bhyve/net_backends.h
index bc7834546b..ea4d059b6f 100644
--- a/usr/src/cmd/bhyve/net_backends.h
+++ b/usr/src/cmd/bhyve/net_backends.h
@@ -51,7 +51,9 @@ ssize_t netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt);
ssize_t netbe_rx_discard(net_backend_t *be);
void netbe_rx_disable(net_backend_t *be);
void netbe_rx_enable(net_backend_t *be);
-
+#ifndef __FreeBSD__
+int netbe_get_mac(net_backend_t *, void *, size_t *);
+#endif
/*
* Network device capabilities taken from the VirtIO standard.
diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c
index 598deff980..9f456f06f2 100644
--- a/usr/src/cmd/bhyve/pci_e82545.c
+++ b/usr/src/cmd/bhyve/pci_e82545.c
@@ -42,13 +42,11 @@ __FBSDID("$FreeBSD$");
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
-#ifndef __FreeBSD__
-#include <sys/filio.h>
-#endif
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
+
#include <err.h>
#include <errno.h>
#include <fcntl.h>
@@ -71,6 +69,7 @@ __FBSDID("$FreeBSD$");
#include "pci_emul.h"
#include "mevent.h"
#include "net_utils.h"
+#include "net_backends.h"
/* Hardware/register definitions XXX: move some to common code. */
#define E82545_VENDOR_ID_INTEL 0x8086
@@ -250,11 +249,10 @@ struct eth_uni {
struct e82545_softc {
struct pci_devinst *esc_pi;
struct vmctx *esc_ctx;
- struct mevent *esc_mevp;
struct mevent *esc_mevpitr;
pthread_mutex_t esc_mtx;
struct ether_addr esc_mac;
- int esc_tapfd;
+ net_backend_t *esc_be;
/* General */
uint32_t esc_CTRL; /* x0000 device ctl */
@@ -360,9 +358,7 @@ struct e82545_softc {
static void e82545_reset(struct e82545_softc *sc, int dev);
static void e82545_rx_enable(struct e82545_softc *sc);
static void e82545_rx_disable(struct e82545_softc *sc);
-#ifdef __FreeBSD__
static void e82545_rx_callback(int fd, enum ev_type type, void *param);
-#endif
static void e82545_tx_start(struct e82545_softc *sc);
static void e82545_tx_enable(struct e82545_softc *sc);
static void e82545_tx_disable(struct e82545_softc *sc);
@@ -556,7 +552,6 @@ e82545_eecd_strobe(struct e82545_softc *sc)
}
}
-#ifdef __FreeBSD__
static void
e82545_itr_callback(int fd, enum ev_type type, void *param)
{
@@ -575,7 +570,6 @@ e82545_itr_callback(int fd, enum ev_type type, void *param)
}
pthread_mutex_unlock(&sc->esc_mtx);
}
-#endif
static void
e82545_icr_assert(struct e82545_softc *sc, uint32_t bits)
@@ -601,11 +595,9 @@ e82545_icr_assert(struct e82545_softc *sc, uint32_t bits)
sc->esc_irq_asserted = 1;
pci_lintr_assert(sc->esc_pi);
if (sc->esc_ITR != 0) {
-#ifdef __FreeBSD__
sc->esc_mevpitr = mevent_add(
(sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */
EVF_TIMER, e82545_itr_callback, sc);
-#endif
}
}
}
@@ -631,11 +623,9 @@ e82545_ims_change(struct e82545_softc *sc, uint32_t bits)
sc->esc_irq_asserted = 1;
pci_lintr_assert(sc->esc_pi);
if (sc->esc_ITR != 0) {
-#ifdef __FreeBSD__
sc->esc_mevpitr = mevent_add(
(sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */
EVF_TIMER, e82545_itr_callback, sc);
-#endif
}
}
}
@@ -837,9 +827,6 @@ e82545_bufsz(uint32_t rctl)
return (256); /* Forbidden value. */
}
-#ifdef __FreeBSD__
-static uint8_t dummybuf[2048];
-
/* XXX one packet at a time until this is debugged */
static void
e82545_rx_callback(int fd, enum ev_type type, void *param)
@@ -857,7 +844,7 @@ e82545_rx_callback(int fd, enum ev_type type, void *param)
if (!sc->esc_rx_enabled || sc->esc_rx_loopback) {
DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped",
sc->esc_rx_enabled, sc->esc_rx_loopback);
- while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+ while (netbe_rx_discard(sc->esc_be) > 0) {
}
goto done1;
}
@@ -870,7 +857,7 @@ e82545_rx_callback(int fd, enum ev_type type, void *param)
if (left < maxpktdesc) {
DPRINTF("rx overflow (%d < %d) -- packet(s) dropped",
left, maxpktdesc);
- while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) {
+ while (netbe_rx_discard(sc->esc_be) > 0) {
}
goto done1;
}
@@ -887,9 +874,9 @@ e82545_rx_callback(int fd, enum ev_type type, void *param)
rxd->buffer_addr, bufsz);
vec[i].iov_len = bufsz;
}
- len = readv(sc->esc_tapfd, vec, maxpktdesc);
+ len = netbe_recv(sc->esc_be, vec, maxpktdesc);
if (len <= 0) {
- DPRINTF("tap: readv() returned %d\n", len);
+ DPRINTF("netbe_recv() returned %d", len);
goto done;
}
@@ -970,7 +957,6 @@ done1:
DPRINTF("rx_run done: head %x, tail %x", sc->esc_RDH, sc->esc_RDT);
pthread_mutex_unlock(&sc->esc_mtx);
}
-#endif
static uint16_t
e82545_carry(uint32_t sum)
@@ -983,11 +969,7 @@ e82545_carry(uint32_t sum)
}
static uint16_t
-#ifdef __FreeBSD__
e82545_buf_checksum(uint8_t *buf, int len)
-#else
-e82545_buf_checksum(caddr_t buf, int len)
-#endif
{
int i;
uint32_t sum = 0;
@@ -1024,7 +1006,11 @@ e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len)
odd = 0;
while (len > 0 && iovcnt > 0) {
now = MIN(len, iov->iov_len - off);
+#ifdef __FreeBSD__
s = e82545_buf_checksum(iov->iov_base + off, now);
+#else
+ s = e82545_buf_checksum((uint8_t *)iov->iov_base + off, now);
+#endif
sum += odd ? (s << 8) : s;
odd ^= (now & 1);
len -= now;
@@ -1069,10 +1055,10 @@ static void
e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt)
{
- if (sc->esc_tapfd == -1)
+ if (sc->esc_be == NULL)
return;
- (void) writev(sc->esc_tapfd, iov, iovcnt);
+ (void) netbe_send(sc->esc_be, iov, iovcnt);
}
static void
@@ -1094,19 +1080,16 @@ static int
e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
uint16_t dsize, uint16_t *rhead, int *tdwb)
{
-#ifdef __FreeBSD__
uint8_t *hdr, *hdrp;
-#else
- caddr_t hdr, hdrp;
-#endif
struct iovec iovb[I82545_MAX_TXSEGS + 2];
struct iovec tiov[I82545_MAX_TXSEGS + 2];
struct e1000_context_desc *cd;
struct ck_info ckinfo[2];
struct iovec *iov;
union e1000_tx_udesc *dsc;
- int desc, dtype, len, ntype, iovcnt, tlen, hdrlen, vlen, tcp, tso;
+ int desc, dtype, len, ntype, iovcnt, tlen, tcp, tso;
int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff;
+ unsigned hdrlen, vlen;
uint32_t tcpsum, tcpseq;
uint16_t ipcs, tcpcs, ipid, ohead;
@@ -1116,7 +1099,6 @@ e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
ntype = 0;
tso = 0;
ohead = head;
- hdr = NULL;
/* iovb[0/1] may be used for writable copy of headers. */
iov = &iovb[2];
@@ -1251,6 +1233,68 @@ e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
} else {
/* In case of TSO header length provided by software. */
hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len;
+
+ /*
+ * Cap the header length at 240 based on 7.2.4.5 of
+ * the Intel 82576EB (Rev 2.63) datasheet.
+ */
+ if (hdrlen > 240) {
+ WPRINTF("TSO hdrlen too large: %d", hdrlen);
+ goto done;
+ }
+
+ /*
+ * If VLAN insertion is requested, ensure the header
+ * at least holds the amount of data copied during
+ * VLAN insertion below.
+ *
+ * XXX: Realistic packets will include a full Ethernet
+ * header before the IP header at ckinfo[0].ck_start,
+ * but this check is sufficient to prevent
+ * out-of-bounds access below.
+ */
+ if (vlen != 0 && hdrlen < ETHER_ADDR_LEN*2) {
+ WPRINTF("TSO hdrlen too small for vlan insertion "
+ "(%d vs %d) -- dropped", hdrlen,
+ ETHER_ADDR_LEN*2);
+ goto done;
+ }
+
+ /*
+ * Ensure that the header length covers the used fields
+ * in the IP and TCP headers as well as the IP and TCP
+ * checksums. The following fields are accessed below:
+ *
+ * Header | Field | Offset | Length
+ * -------+-------+--------+-------
+ * IPv4 | len | 2 | 2
+ * IPv4 | ID | 4 | 2
+ * IPv6 | len | 4 | 2
+ * TCP | seq # | 4 | 4
+ * TCP | flags | 13 | 1
+ * UDP | len | 4 | 4
+ */
+ if (hdrlen < ckinfo[0].ck_start + 6 ||
+ hdrlen < ckinfo[0].ck_off + 2) {
+ WPRINTF("TSO hdrlen too small for IP fields (%d) "
+ "-- dropped", hdrlen);
+ goto done;
+ }
+ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) {
+ if (hdrlen < ckinfo[1].ck_start + 14 ||
+ (ckinfo[1].ck_valid &&
+ hdrlen < ckinfo[1].ck_off + 2)) {
+ WPRINTF("TSO hdrlen too small for TCP fields "
+ "(%d) -- dropped", hdrlen);
+ goto done;
+ }
+ } else {
+ if (hdrlen < ckinfo[1].ck_start + 8) {
+ WPRINTF("TSO hdrlen too small for UDP fields "
+ "(%d) -- dropped", hdrlen);
+ goto done;
+ }
+ }
}
/* Allocate, fill and prepend writable header vector. */
@@ -1270,9 +1314,14 @@ e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
}
iov--;
iovcnt++;
+#ifdef __FreeBSD__
iov->iov_base = hdr;
+#else
+ iov->iov_base = (caddr_t)hdr;
+#endif
iov->iov_len = hdrlen;
- }
+ } else
+ hdr = NULL;
/* Insert VLAN tag. */
if (vlen != 0) {
@@ -1283,7 +1332,11 @@ e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff;
hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8;
hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff;
+#ifdef __FreeBSD__
iov->iov_base = hdr;
+#else
+ iov->iov_base = (caddr_t)hdr;
+#endif
iov->iov_len += ETHER_VLAN_ENCAP_LEN;
/* Correct checksum offsets after VLAN tag insertion. */
ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN;
@@ -1311,10 +1364,12 @@ e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0;
mss = sc->esc_txctx.tcp_seg_setup.fields.mss;
paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff);
- DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs\r\n",
+ DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs",
tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt);
ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]);
- tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]);
+ tcpseq = 0;
+ if (tcp)
+ tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]);
ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off];
tcpcs = 0;
if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */
@@ -1326,7 +1381,11 @@ e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
/* Construct IOVs for the segment. */
/* Include whole original header. */
+#ifdef __FreeBSD__
tiov[0].iov_base = hdr;
+#else
+ tiov[0].iov_base = (caddr_t)hdr;
+#endif
tiov[0].iov_len = hdrlen;
tiovcnt = 1;
/* Include respective part of payload IOV. */
@@ -1340,7 +1399,7 @@ e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail,
} else
pvoff += nnow;
}
- DPRINTF("tx segment %d %d+%d bytes %d iovs\r\n",
+ DPRINTF("tx segment %d %d+%d bytes %d iovs",
seg, hdrlen, now, tiovcnt);
/* Update IP header. */
@@ -1435,7 +1494,7 @@ e82545_tx_run(struct e82545_softc *sc)
sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT);
}
-static void *
+static _Noreturn void *
e82545_tx_thread(void *param)
{
struct e82545_softc *sc = param;
@@ -1455,9 +1514,6 @@ e82545_tx_thread(void *param)
/* Process some tx descriptors. Lock dropped inside. */
e82545_tx_run(sc);
}
-#ifndef __FreeBSD__
- return (NULL);
-#endif
}
static void
@@ -2236,58 +2292,6 @@ e82545_reset(struct e82545_softc *sc, int drvr)
sc->esc_TXDCTL = 0;
}
-static void
-e82545_open_tap(struct e82545_softc *sc, const char *path)
-{
- char tbuf[80];
-#ifndef WITHOUT_CAPSICUM
- cap_rights_t rights;
-#endif
-
- if (path == NULL) {
- sc->esc_tapfd = -1;
- return;
- }
-
- strcpy(tbuf, "/dev/");
- strlcat(tbuf, path, sizeof(tbuf));
-
- sc->esc_tapfd = open(tbuf, O_RDWR);
- if (sc->esc_tapfd == -1) {
- DPRINTF("unable to open tap device %s\n", path);
- exit(4);
- }
-
- /*
- * Set non-blocking and register for read
- * notifications with the event loop
- */
- int opt = 1;
- if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) {
- WPRINTF("tap device O_NONBLOCK failed: %d\n", errno);
- close(sc->esc_tapfd);
- sc->esc_tapfd = -1;
- }
-
-#ifndef WITHOUT_CAPSICUM
- cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
- if (caph_rights_limit(sc->esc_tapfd, &rights) == -1)
- errx(EX_OSERR, "Unable to apply rights for sandbox");
-#endif
-
-#ifdef __FreeBSD__
- sc->esc_mevp = mevent_add(sc->esc_tapfd,
- EVF_READ,
- e82545_rx_callback,
- sc);
- if (sc->esc_mevp == NULL) {
- DPRINTF("Could not register mevent %d\n", EVF_READ);
- close(sc->esc_tapfd);
- sc->esc_tapfd = -1;
- }
-#endif
-}
-
static int
e82545_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
@@ -2342,51 +2346,36 @@ e82545_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
} else
net_genmac(pi, sc->esc_mac.octet);
- const char *tap = get_config_value_node(nvl, "tap");
- if (tap != NULL && (strncmp(tap, "tap", 3) == 0 ||
- strncmp(tap, "vmnet", 5) == 0))
- e82545_open_tap(sc, tap);
+ err = netbe_init(&sc->esc_be, nvl, e82545_rx_callback, sc);
+ if (err) {
+ free(sc);
+ return (err);
+ }
- /* H/w initiated reset */
- e82545_reset(sc, 0);
+#ifndef __FreeBSD__
+ size_t buflen = sizeof (sc->esc_mac.octet);
- return (0);
-}
+ err = netbe_get_mac(sc->esc_be, sc->esc_mac.octet, &buflen);
+ if (err != 0) {
+ free(sc);
+ return (err);
+ }
+#endif
-#ifndef __FreeBSD__
-static int
-e82545_legacy_config(nvlist_t *nvl, const char *opt)
-{
- char *config, *name, *tofree, *value;
+ netbe_rx_enable(sc->esc_be);
- if (opt == NULL)
- return (0);
+ /* H/w initiated reset */
+ e82545_reset(sc, 0);
- config = tofree = strdup(opt);
- while ((name = strsep(&config, ",")) != NULL) {
- value = strchr(name, '=');
- if (value != NULL) {
- *value++ = '\0';
- set_config_value_node(nvl, name, value);
- } else {
- set_config_value_node(nvl, "tap", name);
- }
- }
- free(tofree);
return (0);
}
-#endif
struct pci_devemu pci_de_e82545 = {
.pe_emu = "e1000",
.pe_init = e82545_init,
-#ifdef __FreeBSD__
.pe_legacy_config = netbe_legacy_config,
-#else
- .pe_legacy_config = e82545_legacy_config,
-#endif
.pe_barwrite = e82545_write,
- .pe_barread = e82545_read
+ .pe_barread = e82545_read,
};
PCI_EMUL_SET(pci_de_e82545);
diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c
index 6736603fb8..4b9ce3e58a 100644
--- a/usr/src/cmd/bhyve/pci_virtio_net.c
+++ b/usr/src/cmd/bhyve/pci_virtio_net.c
@@ -27,42 +27,18 @@
*
* $FreeBSD$
*/
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * Copyright 2013 Pluribus Networks Inc.
- * Copyright 2018 Joyent, Inc.
- */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#ifndef WITHOUT_CAPSICUM
-#include <sys/capsicum.h>
-#endif
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
-#ifdef __FreeBSD__
-#ifndef NETMAP_WITH_LIBS
-#define NETMAP_WITH_LIBS
-#endif
-#include <net/netmap_user.h>
-#endif
+#include <net/if.h> /* IFNAMSIZ */
-#ifndef WITHOUT_CAPSICUM
-#include <capsicum_helpers.h>
-#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
@@ -73,54 +49,30 @@ __FBSDID("$FreeBSD$");
#include <strings.h>
#include <unistd.h>
#include <assert.h>
-#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
-#include <sysexits.h>
-#ifndef __FreeBSD__
-#include <poll.h>
-#include <libdlpi.h>
-#endif
#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"
-#ifdef __FreeBSD__
#include "mevent.h"
-#endif
#include "virtio.h"
#include "net_utils.h"
+#include "net_backends.h"
+#include "iov.h"
#define VTNET_RINGSZ 1024
#define VTNET_MAXSEGS 256
-/*
- * Host capabilities. Note that we only offer a few of these.
- */
-#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
-#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
-#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
-#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
-#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
-#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
-#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
-#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
-#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
-#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
-#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
-#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
-#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
-#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
-#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
-#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
-#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
-#define VIRTIO_NET_F_GUEST_ANNOUNCE \
- (1 << 21) /* guest can send gratuitous pkts */
+#define VTNET_MAX_PKT_LEN (65536 + 64)
+
+#define VTNET_MIN_MTU ETHERMIN
+#define VTNET_MAX_MTU 65535
#define VTNET_S_HOSTCAPS \
- ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
+ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \
VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)
/*
@@ -143,19 +95,6 @@ struct virtio_net_config {
#define VTNET_MAXQ 3
/*
- * Fixed network header size
- */
-struct virtio_net_rxhdr {
- uint8_t vrh_flags;
- uint8_t vrh_gso_type;
- uint16_t vrh_hdr_len;
- uint16_t vrh_gso_size;
- uint16_t vrh_csum_start;
- uint16_t vrh_csum_offset;
- uint16_t vrh_bufs;
-} __packed;
-
-/*
* Debug printf
*/
static int pci_vtnet_debug;
@@ -169,27 +108,16 @@ struct pci_vtnet_softc {
struct virtio_softc vsc_vs;
struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
pthread_mutex_t vsc_mtx;
- struct mevent *vsc_mevp;
-#ifdef __FreeBSD
- int vsc_tapfd;
-#else
- dlpi_handle_t vsc_dhp;
- int vsc_dlpifd;
-#endif
- struct nm_desc *vsc_nmd;
+ net_backend_t *vsc_be;
+
+ bool features_negotiated; /* protected by rx_mtx */
- int vsc_rx_ready;
- bool features_negotiated; /* protected by rx_mtx */
int resetting; /* protected by tx_mtx */
uint64_t vsc_features; /* negotiated features */
-
- struct virtio_net_config vsc_config;
- struct virtio_consts vsc_consts;
-
+
pthread_mutex_t rx_mtx;
- int rx_vhdrlen;
int rx_merge; /* merged rx bufs in use */
pthread_t tx_tid;
@@ -197,9 +125,11 @@ struct pci_vtnet_softc {
pthread_cond_t tx_cond;
int tx_in_progress;
- void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
- void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
- int iovcnt, int len);
+ size_t vhdrlen;
+ size_t be_vhdrlen;
+
+ struct virtio_net_config vsc_config;
+ struct virtio_consts vsc_consts;
};
static void pci_vtnet_reset(void *);
@@ -230,7 +160,16 @@ pci_vtnet_reset(void *vsc)
/* Acquire the RX lock to block RX processing. */
pthread_mutex_lock(&sc->rx_mtx);
+ /*
+ * Make sure receive operation is disabled at least until we
+ * re-negotiate the features, since receive operation depends
+ * on the value of sc->rx_merge and the header length, which
+ * are both set in pci_vtnet_neg_features().
+ * Receive operation will be enabled again once the guest adds
+ * the first receive buffers and kicks us.
+ */
sc->features_negotiated = false;
+ netbe_rx_disable(sc->vsc_be);
/* Set sc->resetting and give a chance to the TX thread to stop. */
pthread_mutex_lock(&sc->tx_mtx);
@@ -241,10 +180,6 @@ pci_vtnet_reset(void *vsc)
pthread_mutex_lock(&sc->tx_mtx);
}
- sc->vsc_rx_ready = 0;
- sc->rx_merge = 1;
- sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
-
/*
* Now reset rings, MSI-X vectors, and negotiated capabilities.
* Do that with the TX lock held, since we need to reset
@@ -257,434 +192,237 @@ pci_vtnet_reset(void *vsc)
pthread_mutex_unlock(&sc->rx_mtx);
}
-/*
- * Called to send a buffer chain out to the tap device
- */
-#ifdef __FreeBSD__
-static void
-pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
-{
- static char pad[60]; /* all zero bytes */
-
- if (sc->vsc_tapfd == -1)
- return;
-
- /*
- * If the length is < 60, pad out to that and add the
- * extra zero'd segment to the iov. It is guaranteed that
- * there is always an extra iov available by the caller.
- */
- if (len < 60) {
- iov[iovcnt].iov_base = pad;
- iov[iovcnt].iov_len = 60 - len;
- iovcnt++;
- }
- (void) writev(sc->vsc_tapfd, iov, iovcnt);
-}
-#else
-static void
-pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
-{
- int i;
-
- for (i = 0; i < iovcnt; i++) {
- (void) dlpi_send(sc->vsc_dhp, NULL, 0,
- iov[i].iov_base, iov[i].iov_len, NULL);
- }
-}
-#endif /* __FreeBSD__ */
-
-#ifdef __FreeBSD__
-/*
- * Called when there is read activity on the tap file descriptor.
- * Each buffer posted by the guest is assumed to be able to contain
- * an entire ethernet frame + rx header.
- * MP note: the dummybuf is only used for discarding frames, so there
- * is no need for it to be per-vtnet or locked.
- */
-static uint8_t dummybuf[2048];
-#endif /* __FreeBSD__ */
-
static __inline struct iovec *
-rx_iov_trim(struct iovec *iov, int *niov, int tlen)
+iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int hlen)
{
struct iovec *riov;
- /* XXX short-cut: assume first segment is >= tlen */
- assert(iov[0].iov_len >= tlen);
+ if (iov[0].iov_len < hlen) {
+ /*
+ * Not enough header space in the first fragment.
+ * That's not ok for us.
+ */
+ return NULL;
+ }
- iov[0].iov_len -= tlen;
+ iov[0].iov_len -= hlen;
if (iov[0].iov_len == 0) {
- assert(*niov > 1);
- *niov -= 1;
+ *iovcnt -= 1;
+ if (*iovcnt == 0) {
+ /*
+ * Only space for the header. That's not
+ * enough for us.
+ */
+ return NULL;
+ }
riov = &iov[1];
} else {
- iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
+ iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + hlen);
riov = &iov[0];
}
return (riov);
}
+struct virtio_mrg_rxbuf_info {
+ uint16_t idx;
+ uint16_t pad;
+ uint32_t len;
+};
+
static void
-pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+pci_vtnet_rx(struct pci_vtnet_softc *sc)
{
- struct iovec iov[VTNET_MAXSEGS], *riov;
+ int prepend_hdr_len = sc->vhdrlen - sc->be_vhdrlen;
+ struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS];
+ struct iovec iov[VTNET_MAXSEGS + 1];
struct vqueue_info *vq;
- void *vrx;
- int n;
-#ifdef __FreeBSD__
- int len;
-#else
- size_t len;
- int ret;
-#endif
- uint16_t idx;
- /*
- * Should never be called without a valid tap fd
- */
-#ifdef __FreeBSD__
- assert(sc->vsc_tapfd != -1);
-#else
- assert(sc->vsc_dlpifd != -1);
-#endif
+ vq = &sc->vsc_queues[VTNET_RXQ];
/* Features must be negotiated */
if (!sc->features_negotiated) {
return;
}
- /*
- * But, will be called when the rx ring hasn't yet
- * been set up.
- */
- if (!sc->vsc_rx_ready) {
-#ifdef __FreeBSD__
- /*
- * Drop the packet and try later.
- */
- (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
-#endif
- return;
- }
+ for (;;) {
+ struct virtio_net_rxhdr *hdr;
+ uint32_t riov_bytes;
+ struct iovec *riov;
+ uint32_t ulen;
+ int riov_len;
+ int n_chains;
+ ssize_t rlen;
+ ssize_t plen;
+
+ plen = netbe_peek_recvlen(sc->vsc_be);
+ if (plen <= 0) {
+ /*
+ * No more packets (plen == 0), or backend errored
+ * (plen < 0). Interrupt if needed and stop.
+ */
+ vq_endchains(vq, /*used_all_avail=*/0);
+ return;
+ }
+ plen += prepend_hdr_len;
- /*
- * Check for available rx buffers
- */
- vq = &sc->vsc_queues[VTNET_RXQ];
- if (!vq_has_descs(vq)) {
/*
- * Drop the packet and try later. Interrupt on
- * empty, if that's negotiated.
+ * Get a descriptor chain to store the next ingress
+ * packet. In case of mergeable rx buffers, get as
+ * many chains as necessary in order to make room
+ * for plen bytes.
*/
-#ifdef __FreeBSD__
- (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ riov_bytes = 0;
+ riov_len = 0;
+ riov = iov;
+ n_chains = 0;
+ do {
+ int n = vq_getchain(vq, &info[n_chains].idx, riov,
+ VTNET_MAXSEGS - riov_len, NULL);
+
+ if (n == 0) {
+ /*
+ * No rx buffers. Enable RX kicks and double
+ * check.
+ */
+ vq_kick_enable(vq);
+ if (!vq_has_descs(vq)) {
+ /*
+ * Still no buffers. Return the unused
+ * chains (if any), interrupt if needed
+ * (including for NOTIFY_ON_EMPTY), and
+ * disable the backend until the next
+ * kick.
+ */
+ vq_retchains(vq, n_chains);
+ vq_endchains(vq, /*used_all_avail=*/1);
+ netbe_rx_disable(sc->vsc_be);
+ return;
+ }
+
+ /* More rx buffers found, so keep going. */
+ vq_kick_disable(vq);
+ continue;
+ }
+#ifndef __FreeBSD__
+ if (n == -1) {
+ /*
+ * An error from vq_getchain() means that
+ * an invalid descriptor was found.
+ */
+ vq_retchains(vq, n_chains);
+ vq_endchains(vq, /*used_all_avail=*/0);
+ return;
+ }
#endif
- vq_endchains(vq, 1);
- return;
- }
-
- do {
- /*
- * Get descriptor chain
- */
- n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
- assert(n >= 1 && n <= VTNET_MAXSEGS);
+ assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS);
+ riov_len += n;
+ if (!sc->rx_merge) {
+ n_chains = 1;
+ break;
+ }
+#ifndef __FreeBSD__
+ size_t c = count_iov(riov, n);
+ if (c > UINT32_MAX) {
+ vq_retchains(vq, n_chains);
+ vq_endchains(vq, /*used_all_avail=*/0);
+ return;
+ }
+ info[n_chains].len = (uint32_t)c;
+#else
+ info[n_chains].len = (uint32_t)count_iov(riov, n);
+#endif
+ riov_bytes += info[n_chains].len;
+ riov += n;
+ n_chains++;
+ } while (riov_bytes < plen && riov_len < VTNET_MAXSEGS);
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vrx = iov[0].iov_base;
- riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-#ifdef __FreeBSD__
- len = readv(sc->vsc_tapfd, riov, n);
+ riov = iov;
+#ifdef __FreeBSD__
+ hdr = riov[0].iov_base;
#else
- len = riov[0].iov_len;
- ret = dlpi_recv(sc->vsc_dhp, NULL, NULL,
- (uint8_t *)riov[0].iov_base, &len, 0, NULL);
- if (ret != DLPI_SUCCESS) {
- errno = EWOULDBLOCK;
- len = 0;
- }
+ hdr = (struct virtio_net_rxhdr *)riov[0].iov_base;
#endif
- if (len <= 0 && errno == EWOULDBLOCK) {
+ if (prepend_hdr_len > 0) {
/*
- * No more packets, but still some avail ring
- * entries. Interrupt if needed/appropriate.
+ * The frontend uses a virtio-net header, but the
+ * backend does not. We need to prepend a zeroed
+ * header.
*/
- vq_retchains(vq, 1);
- vq_endchains(vq, 0);
- return;
- }
-
- /*
- * The only valid field in the rx packet header is the
- * number of buffers if merged rx bufs were negotiated.
- */
- memset(vrx, 0, sc->rx_vhdrlen);
-
- if (sc->rx_merge) {
- struct virtio_net_rxhdr *vrxh;
-
- vrxh = vrx;
- vrxh->vrh_bufs = 1;
+ riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len);
+ if (riov == NULL) {
+ /*
+ * The first collected chain is nonsensical,
+ * as it is not even enough to store the
+ * virtio-net header. Just drop it.
+ */
+ vq_relchain(vq, info[0].idx, 0);
+ vq_retchains(vq, n_chains - 1);
+ continue;
+ }
+ memset(hdr, 0, prepend_hdr_len);
}
- /*
- * Release this chain and handle more chains.
- */
- vq_relchain(vq, idx, len + sc->rx_vhdrlen);
- } while (vq_has_descs(vq));
-
- /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
- vq_endchains(vq, 1);
-}
-
-#ifdef __FreeBSD__
-static __inline int
-pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
- int r, i;
- int len = 0;
-
- for (r = nmd->cur_tx_ring; ; ) {
- struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
- uint32_t cur, idx;
- char *buf;
-
- if (nm_ring_empty(ring)) {
- r++;
- if (r > nmd->last_tx_ring)
- r = nmd->first_tx_ring;
- if (r == nmd->cur_tx_ring)
- break;
+ rlen = netbe_recv(sc->vsc_be, riov, riov_len);
+ if (rlen != plen - prepend_hdr_len) {
+ /*
+ * If this happens it means there is something
+ * wrong with the backend (e.g., some other
+ * process is stealing our packets).
+ */
+ WPRINTF(("netbe_recv: expected %zd bytes, "
+ "got %zd", plen - prepend_hdr_len, rlen));
+ vq_retchains(vq, n_chains);
continue;
}
- cur = ring->cur;
- idx = ring->slot[cur].buf_idx;
- buf = NETMAP_BUF(ring, idx);
-
- for (i = 0; i < iovcnt; i++) {
- if (len + iov[i].iov_len > 2048)
- break;
- memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
- len += iov[i].iov_len;
- }
- ring->slot[cur].len = len;
- ring->head = ring->cur = nm_ring_next(ring, cur);
- nmd->cur_tx_ring = r;
- ioctl(nmd->fd, NIOCTXSYNC, NULL);
- break;
- }
- return (len);
-}
+ ulen = (uint32_t)plen;
-static __inline int
-pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
-{
- int len = 0;
- int i = 0;
- int r;
-
- for (r = nmd->cur_rx_ring; ; ) {
- struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
- uint32_t cur, idx;
- char *buf;
- size_t left;
-
- if (nm_ring_empty(ring)) {
- r++;
- if (r > nmd->last_rx_ring)
- r = nmd->first_rx_ring;
- if (r == nmd->cur_rx_ring)
- break;
- continue;
- }
- cur = ring->cur;
- idx = ring->slot[cur].buf_idx;
- buf = NETMAP_BUF(ring, idx);
- left = ring->slot[cur].len;
-
- for (i = 0; i < iovcnt && left > 0; i++) {
- if (iov[i].iov_len > left)
- iov[i].iov_len = left;
- memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
- len += iov[i].iov_len;
- left -= iov[i].iov_len;
+ /*
+ * Publish the used buffers to the guest, reporting the
+ * number of bytes that we wrote.
+ */
+ if (!sc->rx_merge) {
+ vq_relchain(vq, info[0].idx, ulen);
+ } else {
+ uint32_t iolen;
+ int i = 0;
+
+ do {
+ iolen = info[i].len;
+ if (iolen > ulen) {
+ iolen = ulen;
+ }
+ vq_relchain_prepare(vq, info[i].idx, iolen);
+ ulen -= iolen;
+ i++;
+ } while (ulen > 0);
+
+ hdr->vrh_bufs = i;
+ vq_relchain_publish(vq);
+ assert(i == n_chains);
}
- ring->head = ring->cur = nm_ring_next(ring, cur);
- nmd->cur_rx_ring = r;
- ioctl(nmd->fd, NIOCRXSYNC, NULL);
- break;
}
- for (; i < iovcnt; i++)
- iov[i].iov_len = 0;
- return (len);
}
/*
- * Called to send a buffer chain out to the vale port
+ * Called when there is read activity on the backend file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
*/
static void
-pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
- int len)
-{
- static char pad[60]; /* all zero bytes */
-
- if (sc->vsc_nmd == NULL)
- return;
-
- /*
- * If the length is < 60, pad out to that and add the
- * extra zero'd segment to the iov. It is guaranteed that
- * there is always an extra iov available by the caller.
- */
- if (len < 60) {
- iov[iovcnt].iov_base = pad;
- iov[iovcnt].iov_len = 60 - len;
- iovcnt++;
- }
- (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
-}
-
-static void
-pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
-{
- struct iovec iov[VTNET_MAXSEGS], *riov;
- struct vqueue_info *vq;
- void *vrx;
- int len, n;
- uint16_t idx;
-
- /*
- * Should never be called without a valid netmap descriptor
- */
- assert(sc->vsc_nmd != NULL);
-
- /* Features must be negotiated */
- if (!sc->features_negotiated) {
- return;
- }
-
- /*
- * But, will be called when the rx ring hasn't yet
- * been set up.
- */
- if (!sc->vsc_rx_ready) {
- /*
- * Drop the packet and try later.
- */
- (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
- return;
- }
-
- /*
- * Check for available rx buffers
- */
- vq = &sc->vsc_queues[VTNET_RXQ];
- if (!vq_has_descs(vq)) {
- /*
- * Drop the packet and try later. Interrupt on
- * empty, if that's negotiated.
- */
- (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
- vq_endchains(vq, 1);
- return;
- }
-
- do {
- /*
- * Get descriptor chain.
- */
- n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
- assert(n >= 1 && n <= VTNET_MAXSEGS);
-
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vrx = iov[0].iov_base;
- riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
-
- len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);
-
- if (len == 0) {
- /*
- * No more packets, but still some avail ring
- * entries. Interrupt if needed/appropriate.
- */
- vq_retchain(vq);
- vq_endchains(vq, 0);
- return;
- }
-
- /*
- * The only valid field in the rx packet header is the
- * number of buffers if merged rx bufs were negotiated.
- */
- memset(vrx, 0, sc->rx_vhdrlen);
-
- if (sc->rx_merge) {
- struct virtio_net_rxhdr *vrxh;
-
- vrxh = vrx;
- vrxh->vrh_bufs = 1;
- }
-
- /*
- * Release this chain and handle more chains.
- */
- vq_relchain(vq, idx, len + sc->rx_vhdrlen);
- } while (vq_has_descs(vq));
-
- /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
- vq_endchains(vq, 1);
-}
-#endif /* __FreeBSD__ */
-
-#ifdef __FreeBSD__
-static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->rx_mtx);
- sc->pci_vtnet_rx(sc);
+ pci_vtnet_rx(sc);
pthread_mutex_unlock(&sc->rx_mtx);
}
-#else
-static void *
-pci_vtnet_poll_thread(void *param)
-{
- struct pci_vtnet_softc *sc = param;
- pollfd_t pollset;
-
- pollset.fd = sc->vsc_dlpifd;
- pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;
-
- for (;;) {
- if (poll(&pollset, 1, -1) < 0) {
- if (errno == EINTR)
- continue;
- fprintf(stderr, "pci_vtnet_poll_thread poll() error %d\n", errno);
- continue;
- }
- pthread_mutex_lock(&sc->vsc_mtx);
- pci_vtnet_tap_rx(sc);
- pthread_mutex_unlock(&sc->vsc_mtx);
- }
-
- return (NULL);
-}
-#endif /* __FreeBSD__ */
+/* Called on RX kick. */
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
@@ -695,42 +433,64 @@ pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
* Enable RX only if features are negotiated.
*/
pthread_mutex_lock(&sc->rx_mtx);
- if (sc->vsc_rx_ready == 0 && sc->features_negotiated) {
- sc->vsc_rx_ready = 1;
- vq_kick_disable(vq);
+ if (!sc->features_negotiated) {
+ pthread_mutex_unlock(&sc->rx_mtx);
+ return;
}
+
+ vq_kick_disable(vq);
+ netbe_rx_enable(sc->vsc_be);
pthread_mutex_unlock(&sc->rx_mtx);
}
+/* TX virtqueue processing, called by the TX thread. */
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
- int i, n;
- int plen, tlen;
+ struct iovec *siov = iov;
uint16_t idx;
+ ssize_t len;
+ int n;
/*
- * Obtain chain of descriptors. The first one is
- * really the header descriptor, so we need to sum
- * up two lengths: packet length and transfer length.
+ * Obtain chain of descriptors. The first descriptor also
+ * contains the virtio-net header.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
- plen = 0;
- tlen = iov[0].iov_len;
- for (i = 1; i < n; i++) {
- plen += iov[i].iov_len;
- tlen += iov[i].iov_len;
+
+ if (sc->vhdrlen != sc->be_vhdrlen) {
+ /*
+ * The frontend uses a virtio-net header, but the backend
+ * does not. We simply strip the header and ignore it, as
+ * it should be zero-filled.
+ */
+ siov = iov_trim_hdr(siov, &n, sc->vhdrlen);
}
- DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
- sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);
+ if (siov == NULL) {
+ /* The chain is nonsensical. Just drop it. */
+ len = 0;
+ } else {
+ len = netbe_send(sc->vsc_be, siov, n);
+ if (len < 0) {
+ /*
+ * If send failed, report that 0 bytes
+ * were read.
+ */
+ len = 0;
+ }
+ }
- /* chain is processed, release it and set tlen */
- vq_relchain(vq, idx, tlen);
+ /*
+ * Return the processed chain to the guest, reporting
+ * the number of bytes that we read.
+ */
+ vq_relchain(vq, idx, len);
}
+/* Called on TX kick. */
static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
@@ -797,140 +557,23 @@ pci_vtnet_tx_thread(void *param)
/*
* Generate an interrupt if needed.
*/
- vq_endchains(vq, 1);
+ vq_endchains(vq, /*used_all_avail=*/1);
pthread_mutex_lock(&sc->tx_mtx);
}
+#ifndef __FreeBSD__
return (NULL);
+#endif
}
-#ifdef __FreeBSD__
+#ifdef notyet
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{
DPRINTF(("vtnet: control qnotify!"));
}
-#endif /* __FreeBSD__ */
-
-static void
-pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, const char *devname)
-{
- char tbuf[80];
-#ifndef WITHOUT_CAPSICUM
- cap_rights_t rights;
-#endif
-#ifndef __FreeBSD__
- uchar_t physaddr[DLPI_PHYSADDR_MAX];
- size_t physaddrlen = DLPI_PHYSADDR_MAX;
- int error;
-#endif
-
- strcpy(tbuf, "/dev/");
- strlcat(tbuf, devname, sizeof(tbuf));
-
- sc->pci_vtnet_rx = pci_vtnet_tap_rx;
- sc->pci_vtnet_tx = pci_vtnet_tap_tx;
-#ifdef __FreeBSD__
- sc->vsc_tapfd = open(tbuf, O_RDWR);
- if (sc->vsc_tapfd == -1) {
- WPRINTF(("open of tap device %s failed\n", tbuf));
- return;
- }
-
- /*
- * Set non-blocking and register for read
- * notifications with the event loop
- */
- int opt = 1;
- if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
- WPRINTF(("tap device O_NONBLOCK failed\n"));
- close(sc->vsc_tapfd);
- sc->vsc_tapfd = -1;
- }
-
-#ifndef WITHOUT_CAPSICUM
- cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
- if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
- errx(EX_OSERR, "Unable to apply rights for sandbox");
-#endif
-
- sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
- EVF_READ,
- pci_vtnet_rx_callback,
- sc);
- if (sc->vsc_mevp == NULL) {
- WPRINTF(("Could not register event\n"));
- close(sc->vsc_tapfd);
- sc->vsc_tapfd = -1;
- }
-#else
- if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
- WPRINTF(("open of vnic device %s failed\n", devname));
- }
-
- if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr,
- &physaddrlen) != DLPI_SUCCESS) {
- WPRINTF(("read MAC address of vnic device %s failed\n",
- devname));
- }
- if (physaddrlen != ETHERADDRL) {
- WPRINTF(("bad MAC address len %d on vnic device %s\n",
- physaddrlen, devname));
- }
- memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL);
-
- if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) {
- WPRINTF(("bind of vnic device %s failed\n", devname));
- }
-
- if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) {
- WPRINTF(("enable promiscous mode(physical) of vnic device %s "
- "failed\n", devname));
- }
- if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) {
- WPRINTF(("enable promiscous mode(SAP) of vnic device %s "
- "failed\n", devname));
- }
-
- sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp);
-
- if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) {
- WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n",
- devname));
- dlpi_close(sc->vsc_dhp);
- sc->vsc_dlpifd = -1;
- }
-
- error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc);
- assert(error == 0);
#endif
-}
-
-#ifdef __FreeBSD__
-static void
-pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
-{
- sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
- sc->pci_vtnet_tx = pci_vtnet_netmap_tx;
-
- sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
- if (sc->vsc_nmd == NULL) {
- WPRINTF(("open of netmap device %s failed\n", ifname));
- return;
- }
-
- sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
- EVF_READ,
- pci_vtnet_rx_callback,
- sc);
- if (sc->vsc_mevp == NULL) {
- WPRINTF(("Could not register event\n"));
- nm_close(sc->vsc_nmd);
- sc->vsc_nmd = NULL;
- }
-}
-#endif /* __FreeBSD__ */
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
@@ -938,11 +581,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
struct pci_vtnet_softc *sc;
const char *value;
char tname[MAXCOMLEN + 1];
-#ifdef __FreeBSD__
unsigned long mtu = ETHERMTU;
-#else
- int use_msix = 1;
-#endif
int err;
/*
@@ -974,7 +613,6 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
} else
net_genmac(pi, sc->vsc_config.mac);
-#ifdef __FreeBSD__
value = get_config_value_node(nvl, "mtu");
if (value != NULL) {
err = net_parsemtu(value, &mtu);
@@ -982,6 +620,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
free(sc);
return (err);
}
+
if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) {
err = EINVAL;
errno = EINVAL;
@@ -990,26 +629,28 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
}
sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU;
}
-#endif
+ sc->vsc_config.mtu = mtu;
/* Permit interfaces without a configured backend. */
if (get_config_value_node(nvl, "backend") != NULL) {
-#ifdef __FreeBSD__
err = netbe_init(&sc->vsc_be, nvl, pci_vtnet_rx_callback, sc);
if (err) {
free(sc);
return (err);
}
-#else
- pci_vtnet_tap_setup(sc, get_config_value_node(nvl, "backend"));
+#ifndef __FreeBSD__
+ size_t buflen = sizeof (sc->vsc_config.mac);
+
+ err = netbe_get_mac(sc->vsc_be, sc->vsc_config.mac, &buflen);
+ if (err != 0) {
+ free(sc);
+ return (err);
+ }
#endif
}
-
-#ifdef __FreeBSD__
sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
netbe_get_cap(sc->vsc_be);
-#endif
/*
* Since we do not actually support multiqueue,
@@ -1026,18 +667,23 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
/* Link is always up. */
sc->vsc_config.status = 1;
+
+ vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues);
+ sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
- if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
+ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) {
+ free(sc);
return (1);
+ }
/* use BAR 0 to map config regs in IO space */
vi_set_io_bar(&sc->vsc_vs, 0);
sc->resetting = 0;
- sc->rx_merge = 1;
- sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
+ sc->rx_merge = 0;
+ sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2;
pthread_mutex_init(&sc->rx_mtx, NULL);
/*
@@ -1062,8 +708,8 @@ pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
struct pci_vtnet_softc *sc = vsc;
void *ptr;
- if (offset < 6) {
- assert(offset + size <= 6);
+ if (offset < (int)sizeof(sc->vsc_config.mac)) {
+ assert(offset + size <= (int)sizeof(sc->vsc_config.mac));
/*
* The driver is allowed to change the MAC address
*/
@@ -1095,50 +741,33 @@ pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
sc->vsc_features = negotiated_features;
- if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
+ if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) {
+ sc->vhdrlen = sizeof(struct virtio_net_rxhdr);
+ sc->rx_merge = 1;
+ } else {
+ /*
+ * Without mergeable rx buffers, virtio-net header is 2
+ * bytes shorter than sizeof(struct virtio_net_rxhdr).
+ */
+ sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2;
sc->rx_merge = 0;
- /* non-merge rx header is 2 bytes shorter */
- sc->rx_vhdrlen -= 2;
}
+ /* Tell the backend to enable some capabilities it has advertised. */
+ netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen);
+ sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be);
+ assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen);
+
pthread_mutex_lock(&sc->rx_mtx);
sc->features_negotiated = true;
pthread_mutex_unlock(&sc->rx_mtx);
}
-#ifndef __FreeBSD__
-static int
-pci_vtnet_legacy_config(nvlist_t *nvl, const char *opt)
-{
- char *config, *name, *tofree, *value;
-
- if (opt == NULL)
- return (0);
-
- config = tofree = strdup(opt);
- while ((name = strsep(&config, ",")) != NULL) {
- value = strchr(name, '=');
- if (value != NULL) {
- *value++ = '\0';
- set_config_value_node(nvl, name, value);
- } else {
- set_config_value_node(nvl, "backend", name);
- }
- }
- free(tofree);
- return (0);
-}
-#endif
-
-struct pci_devemu pci_de_vnet = {
+static struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
-#ifdef __FreeBSD__
.pe_legacy_config = netbe_legacy_config,
-#else
- .pe_legacy_config = pci_vtnet_legacy_config,
-#endif
.pe_barwrite = vi_pci_write,
- .pe_barread = vi_pci_read
+ .pe_barread = vi_pci_read,
};
PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr/src/compat/bhyve/sys/cdefs.h b/usr/src/compat/bhyve/sys/cdefs.h
index 0f3146ea43..71dd205466 100644
--- a/usr/src/compat/bhyve/sys/cdefs.h
+++ b/usr/src/compat/bhyve/sys/cdefs.h
@@ -12,6 +12,7 @@
/*
* Copyright 2013 Pluribus Networks Inc.
* Copyright 2017 Joyent, Inc.
+ * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
*/
#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_
@@ -61,6 +62,12 @@
#define _Alignof(x) __alignof(x)
#endif
+#if defined(__cplusplus) && __cplusplus >= 201103L
+#define _Noreturn [[noreturn]]
+#else
+#define _Noreturn __dead2
+#endif
+
#if !__has_extension(c_static_assert)
#if (defined(__cplusplus) && __cplusplus >= 201103L) || \
__has_extension(cxx_static_assert)
diff --git a/usr/src/man/man1m/bhyve.1m b/usr/src/man/man1m/bhyve.1m
index ea61607829..a6c4637538 100644
--- a/usr/src/man/man1m/bhyve.1m
+++ b/usr/src/man/man1m/bhyve.1m
@@ -258,7 +258,9 @@ emulation is identical but uses a PCI vendor ID of
.It Li passthru
PCI pass-through device.
.It Li virtio-net-viona
-Virtio network interface.
+Accelerated Virtio network interface.
+.It Li virtio-net
+Legacy Virtio network interface.
.It Li virtio-blk
Virtio block storage interface.
.It Li virtio-rnd
@@ -295,7 +297,7 @@ If
is not specified, the device emulation has no backend and can be
considered unconnected.
.Pp
-Host Bridge Devices
+.Sy Host Bridge Devices
.Bl -tag -width 10n
.It Cm model Ns = Ns Ar model
Specify a hostbridge model to emulate.
@@ -324,7 +326,7 @@ and
.Va devid
must be specified.
.Pp
-Network backends:
+.Sy Accelerated Virtio Network Backends :
.Bl -tag -width 10n
.It Oo Cm vnic Ns = Oc Ns Ar vnic Ns Oo , Ns Cm feature_mask Ns = Ns Ar mask Oc
.Pp
@@ -337,7 +339,30 @@ Bits set in the
value are removed from the advertised features.
.El
.Pp
-Block storage devices:
+.Sy Other Network Backends :
+.Bl -tag -width 10n
+.It Oo Cm vnic Ns = Oc Ns Ar vnic Ns Oo , Ns Ar network-backend-options Oc
+.Pp
+.Ar vnic
+is the name of a configured virtual NIC on the system.
+.El
+.Pp
+The
+.Ar network-backend-options
+are:
+.Bl -tag -width 8n
+.It Cm promiscphys
+Enable promiscuous mode at the physical level (default: false)
+.It Cm promiscsap
+Enable promiscuous mode at the SAP level (default: true)
+.It Cm promiscmulti
+Enable promiscuous mode for all multicast addresses (default: true)
+.It Cm promiscrxonly
+The selected promiscuous modes are only enabled for received traffic
+(default: true).
+.El
+.Pp
+.Sy Block storage devices :
.Bl -tag -width 10n
.It Pa /filename Ns Oo , Ns Ar block-device-options Oc
.It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc
@@ -376,7 +401,7 @@ process.
Use the host TTY device for serial port I/O.
.El
.Pp
-Boot ROM device:
+.Sy Boot ROM device :
.Bl -tag -width 10n
.It Pa romfile
Map
@@ -384,7 +409,7 @@ Map
in the guest address space reserved for boot firmware.
.El
.Pp
-Pass-through devices:
+.Sy Pass-through devices :
.Bl -tag -width 10n
.It Pa /dev/ppt Ns Ar N
Connect to a PCI device on the host identified by the specificed path.
@@ -398,7 +423,7 @@ The host device must have been previously attached to the
.Sy ppt
driver.
.Pp
-Virtio console devices:
+.Sy Virtio console devices :
.Bl -tag -width 10n
.It Li port1= Ns Pa /path/to/port1.sock Ns ,anotherport= Ns Pa ...
A maximum of 16 ports per device can be created.
@@ -423,7 +448,7 @@ Emergency write is advertised, but no-op at present.
.El
.El
.Pp
-Framebuffer devices:
+.Sy Framebuffer devices :
.Bl -tag -width 10n
.It Xo
.Sm off
@@ -503,14 +528,14 @@ the session over an encrypted channel provided by IPsec or SSH.
.El
.El
.Pp
-xHCI USB devices:
+.Sy xHCI USB devices :
.Bl -tag -width 10n
.It Li tablet
A USB tablet device which provides precise cursor synchronization
when using VNC.
.El
.Pp
-NVMe devices:
+.Sy NVMe devices :
.Bl -tag -width 10n
.It Li path
Accepted device paths are:
@@ -531,7 +556,7 @@ Sector size (defaults to blockif sector size).
Serial number with maximum 20 characters.
.El
.Pp
-AHCI devices:
+.Sy AHCI devices :
.Bl -tag -width 10n
.It Li nmrr
Nominal Media Rotation Rate, known as RPM. value 1 will indicate device as Solid State Disk. default value is 0, not report.
diff --git a/usr/src/man/man4/bhyve_config.4 b/usr/src/man/man4/bhyve_config.4
index b2d563cf8f..23e1e33c5a 100644
--- a/usr/src/man/man4/bhyve_config.4
+++ b/usr/src/man/man4/bhyve_config.4
@@ -222,7 +222,9 @@ VirtIO block storage interface.
.It Li virtio-console
VirtIO console interface.
.It Li virtio-net-viona
-VirtIO network interface.
+Accelerated VirtIO network interface.
+.It Li virtio-net
+Legacy VirtIO network interface.
.It Li virtio-rnd
VirtIO random number generator interface.
.It Li xhci
@@ -272,7 +274,7 @@ Specify the logical and physical sector size of the emulated disk.
If the physical size is not specified, it is set to be equal to the logical
size.
.El
-.Ss virtio-net-viona Settings
+.Ss virtio-net-viona Network Backend Settings
Viona network devices use the following settings to configure their backend.
.Bl -column "feature_flags" "string" "Default"
.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
@@ -281,6 +283,21 @@ The VNIC to use for the network connection.
.It feature_mask Ta integer Ta 0 Ta
Specify a mask to apply to the virtio features advertised to the guest.
.El
+.Ss Other Network Backend Settings
+Other network devices use the following settings to configure their backend.
+.Bl -column "feature_flags" "string" "Default"
+.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
+.It vnic Ta string Ta Ta
+The VNIC to use for the network connection.
+.It promiscphys Ta bool Ta false Ta
+Enable promiscuous mode at the physical level.
+.It promiscsap Ta bool Ta true Ta
+Enable promiscuous mode at the SAP level.
+.It promiscmulti Ta bool Ta true Ta
+Enable promiscuous mode for all multicast addresses.
+.It promiscrxonly Ta bool Ta true Ta
+The selected promiscuous modes are only enabled for received traffic.
+.El
.Ss UART Device Settings
.Bl -column "Name" "Format" "Default"
.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description