author     Patrick Mooney <pmooney@pfmooney.com>  2018-01-03 21:11:35 +0000
committer  Patrick Mooney <pmooney@pfmooney.com>  2020-05-19 15:55:57 +0000
commit     b22a70abf81f995ecc990b8444e63308bc389d5c (patch)
tree       5142f78f319737bcd44477e4e3cf578ccd0617e4 /usr/src
parent     d77e6e0f12d19668c0e9068c0fcd7a2123da5373 (diff)
12679 want viona driver for bhyve
Portions contributed by: Ryan Zezeski <rpz@joyent.com>
Portions contributed by: John Levon <john.levon@joyent.com>
Portions contributed by: Jason King <jason.king@joyent.com>
Portions contributed by: Robert Mustacchi <rm@joyent.com>
Portions contributed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/cmd/bhyve/Makefile                           3
-rw-r--r--  usr/src/cmd/bhyve/pci_emul.c                         5
-rw-r--r--  usr/src/cmd/bhyve/pci_emul.h                         7
-rw-r--r--  usr/src/cmd/bhyve/pci_virtio_viona.c               494
-rw-r--r--  usr/src/cmd/devfsadm/i386/misc_link_i386.c           6
-rw-r--r--  usr/src/man/man9e/mac.9e                            22
-rw-r--r--  usr/src/pkg/manifests/system-bhyve.mf                3
-rw-r--r--  usr/src/uts/common/inet/ip/ip6_output.c             13
-rw-r--r--  usr/src/uts/common/inet/ip/ip_output.c               8
-rw-r--r--  usr/src/uts/common/inet/ipf/ip_fil_solaris.c       335
-rw-r--r--  usr/src/uts/common/inet/ipf/netinet/ipf_stack.h     16
-rw-r--r--  usr/src/uts/common/io/hook.c                         2
-rw-r--r--  usr/src/uts/common/sys/dlpi.h                        7
-rw-r--r--  usr/src/uts/common/sys/hook_impl.h                   4
-rw-r--r--  usr/src/uts/common/sys/neti.h                        5
-rw-r--r--  usr/src/uts/i86pc/Makefile.files                     6
-rw-r--r--  usr/src/uts/i86pc/Makefile.i86pc                     1
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona.c                1409
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona.mapfile            41
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona_hook.c            438
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona_impl.h            326
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona_main.c            991
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona_ring.c            638
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona_rx.c              718
-rw-r--r--  usr/src/uts/i86pc/io/viona/viona_tx.c              756
-rw-r--r--  usr/src/uts/i86pc/sys/viona_io.h                    49
-rw-r--r--  usr/src/uts/i86pc/sys/vmm_drv.h                      3
-rw-r--r--  usr/src/uts/i86pc/viona/Makefile                    13
-rw-r--r--  usr/src/uts/intel/ipf/ipf.global-objs.debug64        9
29 files changed, 4689 insertions, 1639 deletions
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
index e96868e006..2301e6c8a6 100644
--- a/usr/src/cmd/bhyve/Makefile
+++ b/usr/src/cmd/bhyve/Makefile
@@ -58,6 +58,7 @@ SRCS = acpi.c \
pci_virtio_console.c \
pci_virtio_net.c \
pci_virtio_rnd.c \
+ pci_virtio_viona.c \
pci_xhci.c \
pm.c \
post.c \
@@ -120,7 +121,7 @@ CSTD= $(CSTD_GNU99)
C99MODE= -xc99=%all
C99LMODE= -Xc99=%all
-$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -lmd -luuid -lvmmapi -lz
+$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lmd -luuid -lvmmapi -lz
$(MEVENT_TEST_PROG) := LDLIBS += -lsocket
.KEEP_STATE:
diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c
index 5118b31534..a71cc528aa 100644
--- a/usr/src/cmd/bhyve/pci_emul.c
+++ b/usr/src/cmd/bhyve/pci_emul.c
@@ -1597,6 +1597,11 @@ pci_lintr_update(struct pci_devinst *pi)
pci_irq_assert(pi);
}
pthread_mutex_unlock(&pi->pi_lintr.lock);
+#ifndef __FreeBSD__
+ if (pi->pi_d->pe_lintrupdate != NULL) {
+ pi->pi_d->pe_lintrupdate(pi);
+ }
+#endif /* __FreeBSD__ */
}
int
diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h
index 853badaadb..0053caed99 100644
--- a/usr/src/cmd/bhyve/pci_emul.h
+++ b/usr/src/cmd/bhyve/pci_emul.h
@@ -27,6 +27,9 @@
*
* $FreeBSD$
*/
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
#ifndef _PCI_EMUL_H_
#define _PCI_EMUL_H_
@@ -71,6 +74,10 @@ struct pci_devemu {
uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size);
+
+#ifndef __FreeBSD__
+ void (*pe_lintrupdate)(struct pci_devinst *pi);
+#endif /* __FreeBSD__ */
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c
index e5a5cb584f..9cafa7b111 100644
--- a/usr/src/cmd/bhyve/pci_virtio_viona.c
+++ b/usr/src/cmd/bhyve/pci_virtio_viona.c
@@ -34,7 +34,7 @@
* http://www.illumos.org/license/CDDL.
*
* Copyright 2015 Pluribus Networks Inc.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/cdefs.h>
@@ -85,18 +85,6 @@
#define VIONA_REGSZ VIONA_R_MAX+1
/*
- * Host capabilities
- */
-#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
-#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
-#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
-
-#define VIONA_S_HOSTCAPS \
- (VIRTIO_NET_F_MAC | \
- VIRTIO_NET_F_MRG_RXBUF | \
- VIRTIO_NET_F_STATUS)
-
-/*
* Queue definitions.
*/
#define VIONA_RXQ 0
@@ -108,7 +96,7 @@
/*
* Debug printf
*/
-static int pci_viona_debug;
+static volatile int pci_viona_debug;
#define DPRINTF(params) if (pci_viona_debug) printf params
#define WPRINTF(params) printf params
@@ -124,26 +112,20 @@ struct pci_viona_softc {
int vsc_isr;
datalink_id_t vsc_linkid;
- char vsc_linkname[MAXLINKNAMELEN];
int vsc_vnafd;
+ /* Configurable parameters */
+ char vsc_linkname[MAXLINKNAMELEN];
+ uint32_t vsc_feature_mask;
+ uint16_t vsc_vq_size;
+
uint32_t vsc_features;
uint8_t vsc_macaddr[6];
uint64_t vsc_pfn[VIONA_MAXQ];
uint16_t vsc_msix_table_idx[VIONA_MAXQ];
- /*
- * Flag to see if host is already sending data out.
- * If it is, no need to wait for lock and send interrupt to host
- * for new data.
- */
- boolean_t vsc_tx_kick_lock_held;
-
- pthread_t tx_tid;
- pthread_mutex_t tx_mtx;
- pthread_cond_t tx_cond;
+ boolean_t vsc_msix_active;
};
-#define viona_ctx(sc) ((sc)->vsc_pi->pi_vmctx)
/*
* Return the size of IO BAR that maps virtio header and device specific
@@ -160,47 +142,44 @@ pci_viona_iosize(struct pci_devinst *pi)
}
static uint16_t
-pci_viona_qsize(int qnum)
+pci_viona_qsize(struct pci_viona_softc *sc, int qnum)
{
/* XXX no ctl queue currently */
if (qnum == VIONA_CTLQ) {
return (0);
}
- /* XXX fixed currently. Maybe different for tx/rx/ctl */
- return (VIONA_RINGSZ);
+ return (sc->vsc_vq_size);
}
static void
pci_viona_ring_reset(struct pci_viona_softc *sc, int ring)
{
- int error;
-
assert(ring < VIONA_MAXQ);
switch (ring) {
case VIONA_RXQ:
- error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_RESET);
- if (error != 0) {
- WPRINTF(("ioctl viona rx ring reset failed %d\n",
- error));
- } else {
- sc->vsc_pfn[VIONA_RXQ] = 0;
- }
- break;
case VIONA_TXQ:
- error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_RESET);
- if (error != 0) {
- WPRINTF(("ioctl viona tx ring reset failed %d\n",
- error));
- } else {
- sc->vsc_pfn[VIONA_TXQ] = 0;
- }
break;
case VIONA_CTLQ:
default:
- break;
+ return;
+ }
+
+ for (;;) {
+ int res;
+
+ res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_RESET, ring);
+ if (res == 0) {
+ break;
+ } else if (errno != EINTR) {
+ WPRINTF(("ioctl viona ring %d reset failed %d\n",
+ ring, errno));
+ return;
+ }
}
+
+ sc->vsc_pfn[ring] = 0;
}
static void
@@ -220,11 +199,11 @@ static void *
pci_viona_poll_thread(void *param)
{
struct pci_viona_softc *sc = param;
- pollfd_t pollset;
- int error;
+ pollfd_t pollset;
+ const int fd = sc->vsc_vnafd;
- pollset.fd = sc->vsc_vnafd;
- pollset.events = POLLIN | POLLOUT;
+ pollset.fd = fd;
+ pollset.events = POLLRDBAND;
for (;;) {
if (poll(&pollset, 1, -1) < 0) {
@@ -236,23 +215,35 @@ pci_viona_poll_thread(void *param)
break;
}
}
- if (pollset.revents & POLLIN) {
- pci_generate_msix(sc->vsc_pi,
- sc->vsc_msix_table_idx[VIONA_RXQ]);
- error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_INTR_CLR);
- if (error != 0) {
- WPRINTF(("ioctl viona rx intr clear failed"
- " %d\n", error));
+ if (pollset.revents & POLLRDBAND) {
+ vioc_intr_poll_t vip;
+ uint_t i;
+ int res;
+ boolean_t assert_lintr = B_FALSE;
+ const boolean_t do_msix = pci_msix_enabled(sc->vsc_pi);
+
+ res = ioctl(fd, VNA_IOC_INTR_POLL, &vip);
+ for (i = 0; res > 0 && i < VIONA_VQ_MAX; i++) {
+ if (vip.vip_status[i] == 0) {
+ continue;
+ }
+ if (do_msix) {
+ pci_generate_msix(sc->vsc_pi,
+ sc->vsc_msix_table_idx[i]);
+ } else {
+ assert_lintr = B_TRUE;
+ }
+ res = ioctl(fd, VNA_IOC_RING_INTR_CLR, i);
+ if (res != 0) {
+ WPRINTF(("ioctl viona vq %d intr "
+ "clear failed %d\n", i, errno));
+ }
}
- }
-
- if (pollset.revents & POLLOUT) {
- pci_generate_msix(sc->vsc_pi,
- sc->vsc_msix_table_idx[VIONA_TXQ]);
- error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_INTR_CLR);
- if (error != 0) {
- WPRINTF(("ioctl viona tx intr clear failed"
- " %d\n", error));
+ if (assert_lintr) {
+ pthread_mutex_lock(&sc->vsc_mtx);
+ sc->vsc_isr |= VTCFG_ISR_QUEUES;
+ pci_lintr_assert(sc->vsc_pi);
+ pthread_mutex_unlock(&sc->vsc_mtx);
}
}
}
@@ -261,57 +252,6 @@ pci_viona_poll_thread(void *param)
}
static void
-pci_viona_ping_rxq(struct pci_viona_softc *sc)
-{
- int error;
-
- error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_KICK);
- if (error != 0) {
- WPRINTF(("ioctl viona rx ring kick failed %d\n", error));
- }
-}
-
-static void *
-pci_viona_tx_thread(void *param)
-{
- struct pci_viona_softc *sc = (struct pci_viona_softc *)param;
- int error;
-
- pthread_mutex_lock(&sc->tx_mtx);
- for (;;) {
- error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
- assert(error == 0);
- sc->vsc_tx_kick_lock_held = B_TRUE;
- error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_KICK);
- if (error != 0) {
- WPRINTF(("ioctl viona tx ring kick failed %d\n",
- error));
- }
- sc->vsc_tx_kick_lock_held = B_FALSE;
- }
- pthread_mutex_unlock(&sc->tx_mtx);
-
- return (NULL);
-}
-
-static void
-pci_viona_ping_txq(struct pci_viona_softc *sc)
-{
- /* Signal the tx thread for processing */
- if (sc->vsc_tx_kick_lock_held)
- return;
- pthread_mutex_lock(&sc->tx_mtx);
- pthread_cond_signal(&sc->tx_cond);
- pthread_mutex_unlock(&sc->tx_mtx);
-}
-
-static void
-pci_viona_ping_ctlq(struct pci_viona_softc *sc)
-{
- DPRINTF(("viona: control qnotify!\n\r"));
-}
-
-static void
pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
{
int qnum = sc->vsc_curq;
@@ -320,29 +260,19 @@ pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
assert(qnum < VIONA_MAXQ);
+ if (qnum == VIONA_CTLQ) {
+ return;
+ }
+
sc->vsc_pfn[qnum] = (pfn << VRING_PFN);
- vna_ri.ri_qsize = pci_viona_qsize(qnum);
+ vna_ri.ri_index = qnum;
+ vna_ri.ri_qsize = pci_viona_qsize(sc, qnum);
vna_ri.ri_qaddr = (pfn << VRING_PFN);
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_INIT, &vna_ri);
- switch (qnum) {
- case VIONA_RXQ:
- error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_INIT, &vna_ri);
- if (error != 0) {
- WPRINTF(("ioctl viona rx ring init failed %d\n",
- error));
- }
- break;
- case VIONA_TXQ:
- error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_INIT, &vna_ri);
- if (error != 0) {
- WPRINTF(("ioctl viona tx ring init failed %d\n",
- error));
- }
- break;
- case VIONA_CTLQ:
- default:
- break;
+ if (error != 0) {
+ WPRINTF(("ioctl viona ring %u init failed %d\n", qnum, errno));
}
}
@@ -350,30 +280,20 @@ static int
pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc)
{
vioc_create_t vna_create;
-#if notyet
- char devname[MAXNAMELEN];
- int ctlfd;
-#endif
int error;
- sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL);
+ sc->vsc_vnafd = open("/dev/viona", O_RDWR | O_EXCL);
if (sc->vsc_vnafd == -1) {
- WPRINTF(("open viona ctl failed\n"));
+ WPRINTF(("open viona ctl failed: %d\n", errno));
return (-1);
}
vna_create.c_linkid = sc->vsc_linkid;
- strlcpy(vna_create.c_vmname, vmname,
- sizeof (vna_create.c_vmname));
-#if notyet
- vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size,
- NULL);
- vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL),
- &vna_create.c_himem_size, NULL);
-#endif
+ vna_create.c_vmfd = vm_get_device_fd(ctx);
error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create);
if (error != 0) {
- WPRINTF(("ioctl viona create failed %d\n", error));
+ (void) close(sc->vsc_vnafd);
+ WPRINTF(("ioctl viona create failed %d\n", errno));
return (-1);
}
@@ -381,15 +301,99 @@ pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc)
}
static int
+pci_viona_parse_opts(struct pci_viona_softc *sc, char *opts)
+{
+ char *next, *cp, *vnic = NULL;
+ int err = 0;
+
+ sc->vsc_vq_size = VIONA_RINGSZ;
+ sc->vsc_feature_mask = 0;
+
+ for (; opts != NULL && *opts != '\0'; opts = next) {
+ char *val;
+
+ if ((cp = strchr(opts, ',')) != NULL) {
+ *cp = '\0';
+ next = cp + 1;
+ } else {
+ next = NULL;
+ }
+
+ if ((cp = strchr(opts, '=')) == NULL) {
+ /* vnic chosen with bare name */
+ if (vnic != NULL) {
+ fprintf(stderr,
+ "viona: unexpected vnic name '%s'", opts);
+ err = -1;
+ } else {
+ vnic = opts;
+ }
+ continue;
+ }
+
+ /* <param>=<value> handling */
+ val = cp + 1;
+ *cp = '\0';
+ if (strcmp(opts, "feature_mask") == 0) {
+ long num;
+
+ errno = 0;
+ num = strtol(val, NULL, 0);
+ if (errno != 0 || num < 0) {
+ fprintf(stderr,
+ "viona: invalid mask '%s'", val);
+ } else {
+ sc->vsc_feature_mask = num;
+ }
+ } else if (strcmp(opts, "vqsize") == 0) {
+ long num;
+
+ errno = 0;
+ num = strtol(val, NULL, 0);
+ if (errno != 0) {
+ fprintf(stderr,
+ "viona: invalid vsqize '%s'", val);
+ err = -1;
+ } else if (num <= 2 || num > 32768) {
+ fprintf(stderr,
+ "viona: vqsize out of range", num);
+ err = -1;
+ } else if ((1 << (ffs(num) - 1)) != num) {
+ fprintf(stderr,
+ "viona: vqsize must be power of 2", num);
+ err = -1;
+ } else {
+ sc->vsc_vq_size = num;
+ }
+ } else {
+ fprintf(stderr,
+ "viona: unrecognized option '%s'", opts);
+ err = -1;
+ }
+ }
+ if (vnic == NULL) {
+ fprintf(stderr, "viona: vnic name required");
+ sc->vsc_linkname[0] = '\0';
+ err = -1;
+ } else {
+ (void) strlcpy(sc->vsc_linkname, vnic, MAXLINKNAMELEN);
+ }
+
+ DPRINTF(("viona=%p dev=%s vqsize=%x feature_mask=%x\n", sc,
+ sc->vsc_linkname, sc->vsc_vq_size, sc->vsc_feature_mask));
+ return (err);
+}
+
+static int
pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
dladm_handle_t handle;
dladm_status_t status;
dladm_vnic_attr_t attr;
char errmsg[DLADM_STRSIZE];
- int error;
+ int error, i;
struct pci_viona_softc *sc;
- int i;
+ uint64_t ioport;
if (opts == NULL) {
printf("virtio-viona: vnic required\n");
@@ -404,7 +408,10 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pthread_mutex_init(&sc->vsc_mtx, NULL);
- strlcpy(sc->vsc_linkname, opts, MAXLINKNAMELEN);
+ if (pci_viona_parse_opts(sc, opts) != 0) {
+ free(sc);
+ return (1);
+ }
if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) {
WPRINTF(("could not open /dev/dld"));
@@ -430,7 +437,6 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
return (1);
}
- sc->vsc_tx_kick_lock_held = B_FALSE;
memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL);
dladm_close(handle);
@@ -449,42 +455,44 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+ pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
/* MSI-X support */
for (i = 0; i < VIONA_MAXQ; i++)
sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
- /*
- * BAR 1 used to map MSI-X table and PBA
- */
+ /* BAR 1 used to map MSI-X table and PBA */
if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) {
free(sc);
return (1);
}
- pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ);
+ /* BAR 0 for legacy-style virtio register access. */
+ error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ);
+ if (error != 0) {
+ WPRINTF(("could not allocate virtio BAR\n"));
+ free(sc);
+ return (1);
+ }
+
+ /* Install ioport hook for virtqueue notification */
+ ioport = pi->pi_bar[0].addr + VTCFG_R_QNOTIFY;
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, ioport);
+ if (error != 0) {
+ WPRINTF(("could not install ioport hook at %x\n", ioport));
+ free(sc);
+ return (1);
+ }
/*
- * Initialize tx semaphore & spawn TX processing thread
- * As of now, only one thread for TX desc processing is
- * spawned.
+ * Need a legacy interrupt for virtio compliance, even though MSI-X
+ * operation is _strongly_ suggested for adequate performance.
*/
- pthread_mutex_init(&sc->tx_mtx, NULL);
- pthread_cond_init(&sc->tx_cond, NULL);
- pthread_create(&sc->tx_tid, NULL, pci_viona_tx_thread, (void *)sc);
+ pci_lintr_request(pi);
return (0);
}
-/*
- * Function pointer array to handle queue notifications
- */
-static void (*pci_viona_qnotify[VIONA_MAXQ])(struct pci_viona_softc *) = {
- pci_viona_ping_rxq,
- pci_viona_ping_txq,
- pci_viona_ping_ctlq
-};
-
static uint64_t
viona_adjust_offset(struct pci_devinst *pi, uint64_t offset)
{
@@ -501,6 +509,109 @@ viona_adjust_offset(struct pci_devinst *pi, uint64_t offset)
}
static void
+pci_viona_ring_set_msix(struct pci_devinst *pi, uint_t ring)
+{
+ struct pci_viona_softc *sc = pi->pi_arg;
+ struct msix_table_entry mte;
+ uint16_t tab_index;
+ vioc_ring_msi_t vrm;
+ int res;
+
+ assert(ring <= VIONA_VQ_TX);
+
+ vrm.rm_index = ring;
+ vrm.rm_addr = 0;
+ vrm.rm_msg = 0;
+ tab_index = sc->vsc_msix_table_idx[ring];
+
+ if (tab_index != VIRTIO_MSI_NO_VECTOR && sc->vsc_msix_active) {
+ mte = pi->pi_msix.table[tab_index];
+ if ((mte.vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ vrm.rm_addr = mte.addr;
+ vrm.rm_msg = mte.msg_data;
+ }
+ }
+
+ res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_SET_MSI, &vrm);
+ if (res != 0) {
+ WPRINTF(("ioctl viona set_msi %d failed %d\n", ring, errno));
+ }
+}
+
+static void
+pci_viona_lintrupdate(struct pci_devinst *pi)
+{
+ struct pci_viona_softc *sc = pi->pi_arg;
+ boolean_t msix_on = B_FALSE;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ msix_on = pci_msix_enabled(pi) && (pi->pi_msix.function_mask == 0);
+ if ((sc->vsc_msix_active && !msix_on) ||
+ (msix_on && !sc->vsc_msix_active)) {
+ uint_t i;
+
+ sc->vsc_msix_active = msix_on;
+ /* Update in-kernel ring configs */
+ for (i = 0; i <= VIONA_VQ_TX; i++) {
+ pci_viona_ring_set_msix(pi, i);
+ }
+ }
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+pci_viona_msix_update(struct pci_devinst *pi, uint64_t offset)
+{
+ struct pci_viona_softc *sc = pi->pi_arg;
+ uint_t tab_index, i;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ if (!sc->vsc_msix_active) {
+ pthread_mutex_unlock(&sc->vsc_mtx);
+ return;
+ }
+
+ /*
+ * Rather than update every possible MSI-X vector, cheat and use the
+ * offset to calculate the entry within the table. Since this should
+ * only be called when a write to the table succeeds, the index should
+ * be valid.
+ */
+ tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+
+ for (i = 0; i <= VIONA_VQ_TX; i++) {
+ if (sc->vsc_msix_table_idx[i] != tab_index) {
+ continue;
+ }
+ pci_viona_ring_set_msix(pi, i);
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+pci_viona_qnotify(struct pci_viona_softc *sc, int ring)
+{
+ int error;
+
+ switch (ring) {
+ case VIONA_TXQ:
+ case VIONA_RXQ:
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_KICK, ring);
+ if (error != 0) {
+ WPRINTF(("ioctl viona ring %d kick failed %d\n",
+ ring, errno));
+ }
+ break;
+ case VIONA_CTLQ:
+ DPRINTF(("viona: control qnotify!\n"));
+ break;
+ default:
+ break;
+ }
+}
+
+static void
pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
@@ -510,7 +621,9 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
if (baridx == pci_msix_table_bar(pi) ||
baridx == pci_msix_pba_bar(pi)) {
- pci_emul_msix_twrite(pi, offset, size, value);
+ if (pci_emul_msix_twrite(pi, offset, size, value) == 0) {
+ pci_viona_msix_update(pi, offset);
+ }
return;
}
@@ -529,10 +642,14 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
switch (offset) {
case VTCFG_R_GUESTCAP:
assert(size == 4);
+ value &= ~(sc->vsc_feature_mask);
err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value);
- if (err != 0)
+ if (err != 0) {
WPRINTF(("ioctl feature negotiation returned"
- " err = %d\n", err));
+ " err = %d\n", errno));
+ } else {
+ sc->vsc_features = value;
+ }
break;
case VTCFG_R_PFN:
assert(size == 4);
@@ -546,7 +663,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
case VTCFG_R_QNOTIFY:
assert(size == 2);
assert(value < VIONA_MAXQ);
- (*pci_viona_qnotify[value])(sc);
+ pci_viona_qnotify(sc, value);
break;
case VTCFG_R_STATUS:
assert(size == 1);
@@ -560,6 +677,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
assert(size == 2);
assert(sc->vsc_curq != VIONA_CTLQ);
sc->vsc_msix_table_idx[sc->vsc_curq] = value;
+ pci_viona_ring_set_msix(pi, sc->vsc_curq);
break;
case VIONA_R_CFG0:
case VIONA_R_CFG1:
@@ -597,7 +715,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
pthread_mutex_unlock(&sc->vsc_mtx);
}
-uint64_t
+static uint64_t
pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
@@ -627,9 +745,11 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
case VTCFG_R_HOSTCAP:
assert(size == 4);
err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value);
- if (err != 0)
+ if (err != 0) {
WPRINTF(("ioctl get host features returned"
- " err = %d\n", err));
+ " err = %d\n", errno));
+ }
+ value &= ~sc->vsc_feature_mask;
break;
case VTCFG_R_GUESTCAP:
assert(size == 4);
@@ -641,7 +761,7 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
break;
case VTCFG_R_QNUM:
assert(size == 2);
- value = pci_viona_qsize(sc->vsc_curq);
+ value = pci_viona_qsize(sc, sc->vsc_curq);
break;
case VTCFG_R_QSEL:
assert(size == 2);
@@ -659,6 +779,9 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
assert(size == 1);
value = sc->vsc_isr;
sc->vsc_isr = 0; /* a read clears this flag */
+ if (value != 0) {
+ pci_lintr_deassert(pi);
+ }
break;
case VTCFG_R_CFGVEC:
assert(size == 2);
@@ -705,9 +828,10 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
}
struct pci_devemu pci_de_viona = {
- .pe_emu = "virtio-net-viona",
+ .pe_emu = "virtio-net-viona",
.pe_init = pci_viona_init,
.pe_barwrite = pci_viona_write,
- .pe_barread = pci_viona_read
+ .pe_barread = pci_viona_read,
+ .pe_lintrupdate = pci_viona_lintrupdate
};
PCI_EMUL_SET(pci_de_viona);
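
For orientation, the reworked interface above reduces to a small set of ioctls on /dev/viona: VNA_IOC_CREATE binds a link to a VM, VNA_IOC_RING_INIT programs a ring by index, and VNA_IOC_RING_KICK notifies it. Below is a rough sketch of a consumer, distilled from pci_viona_viona_init() and pci_viona_ring_init() above; the helper name, the fixed ring index, and the queue size are illustrative choices, and error cleanup is omitted.

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/viona_io.h>

	/*
	 * Sketch only: attach a viona link to a VM and set up ring 0 (RX).
	 * 'linkid' comes from dladm, 'vmfd' from vm_get_device_fd(), and
	 * 'qaddr' is the guest-physical address of the virtqueue.
	 */
	static int
	viona_sketch_setup(datalink_id_t linkid, int vmfd, uint64_t qaddr)
	{
		vioc_create_t create = { 0 };
		vioc_ring_init_t ri = { 0 };
		int fd;

		if ((fd = open("/dev/viona", O_RDWR | O_EXCL)) < 0)
			return (-1);

		create.c_linkid = linkid;
		create.c_vmfd = vmfd;
		if (ioctl(fd, VNA_IOC_CREATE, &create) != 0)
			return (-1);

		ri.ri_index = 0;	/* RX ring */
		ri.ri_qsize = 1024;	/* must be a power of two */
		ri.ri_qaddr = qaddr;
		if (ioctl(fd, VNA_IOC_RING_INIT, &ri) != 0)
			return (-1);

		(void) ioctl(fd, VNA_IOC_RING_KICK, 0);
		return (fd);
	}

On the bhyve command line, the same device is configured through pci_viona_parse_opts() above, e.g. -s <slot>,virtio-net-viona,<vnic>[,feature_mask=<mask>][,vqsize=<size>].
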
diff --git a/usr/src/cmd/devfsadm/i386/misc_link_i386.c b/usr/src/cmd/devfsadm/i386/misc_link_i386.c
index 4aeea7d294..0f8e64551d 100644
--- a/usr/src/cmd/devfsadm/i386/misc_link_i386.c
+++ b/usr/src/cmd/devfsadm/i386/misc_link_i386.c
@@ -85,6 +85,9 @@ static devfsadm_create_t misc_cbt[] = {
{ "pseudo", "ddi_pseudo", "ucode",
TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name,
},
+ { "pseudo", "ddi_pseudo", "viona",
+ TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name,
+ },
{ "pseudo", "ddi_pseudo", "vmm",
TYPE_EXACT | DRV_EXACT, ILEVEL_0, vmmctl,
}
@@ -114,6 +117,9 @@ static devfsadm_remove_t misc_remove_cbt[] = {
{ "serial", "^tty[a-z]$", RM_ALWAYS | RM_PRE,
ILEVEL_1, devfsadm_rm_all
},
+ { "pseudo", "^viona$", RM_ALWAYS | RM_PRE | RM_HOT,
+ ILEVEL_0, devfsadm_rm_all
+ },
{ "pseudo", "^vmmctl$", RM_ALWAYS | RM_PRE | RM_HOT,
ILEVEL_0, devfsadm_rm_all
}
diff --git a/usr/src/man/man9e/mac.9e b/usr/src/man/man9e/mac.9e
index 3a3f2ae90a..d3d066a564 100644
--- a/usr/src/man/man9e/mac.9e
+++ b/usr/src/man/man9e/mac.9e
@@ -570,24 +570,28 @@ The following set of flags may be combined through a bitwise inclusive OR:
.Bl -tag -width Ds
.It Sy HCKSUM_INET_PARTIAL
This indicates that the hardware can calculate a partial checksum for
-both IPv4 and IPv6; however, it requires the pseudo-header checksum be
-calculated for it.
+both IPv4 and IPv6 UDP and TCP packets; however, it requires the pseudo-header
+checksum be calculated for it.
The pseudo-header checksum will be available for the mblk_t when calling
.Xr mac_hcksum_get 9F .
-Note this does not imply that the hardware is capable of calculating the
-IPv4 header checksum.
+Note this does not imply that the hardware is capable of calculating
+the partial checksum for other L4 protocols or the IPv4 header checksum.
That should be indicated with the
.Sy HCKSUM_IPHDRCKSUM flag.
.It Sy HCKSUM_INET_FULL_V4
-This indicates that the hardware will fully calculate the L4 checksum
-for outgoing IPv4 packets and does not require a pseudo-header checksum.
+This indicates that the hardware will fully calculate the L4 checksum for
+outgoing IPv4 UDP or TCP packets only, and does not require a pseudo-header
+checksum.
Note this does not imply that the hardware is capable of calculating the
-IPv4 header checksum.
+checksum for other L4 protocols or the IPv4 header checksum.
That should be indicated with the
.Sy HCKSUM_IPHDRCKSUM .
.It Sy HCKSUM_INET_FULL_V6
-This indicates that the hardware will fully calculate the L4 checksum
-for outgoing IPv6 packets and does not require a pseudo-header checksum.
+This indicates that the hardware will fully calculate the L4 checksum for
+outgoing IPv6 UDP or TCP packets only, and does not require a pseudo-header
+checksum.
+Note this does not imply that the hardware is capable of calculating the
+checksum for any other L4 protocols.
.It Sy HCKSUM_IPHDRCKSUM
This indicates that the hardware supports calculating the checksum for
the IPv4 header itself.
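
The flags above are what a NIC driver reports from its mc_getcapab(9E) entry point when queried for MAC_CAPAB_HCKSUM. A minimal sketch follows (hypothetical driver; assumes the usual <sys/mac_provider.h> definitions), advertising full checksum offload for outgoing IPv4 TCP/UDP packets plus IPv4 header checksum offload.

	static boolean_t
	mydrv_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
	{
		switch (cap) {
		case MAC_CAPAB_HCKSUM: {
			uint32_t *txflags = cap_data;

			/* Full L4 checksum for IPv4 TCP/UDP + IPv4 header */
			*txflags = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM;
			return (B_TRUE);
		}
		default:
			return (B_FALSE);
		}
	}
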
diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf
index 2a51d4fc22..7fdeb81254 100644
--- a/usr/src/pkg/manifests/system-bhyve.mf
+++ b/usr/src/pkg/manifests/system-bhyve.mf
@@ -35,8 +35,11 @@ dir path=usr group=sys
dir path=usr/kernel/drv group=sys
dir path=usr/kernel/drv/$(ARCH64) group=sys
dir path=usr/sbin
+driver name=viona
driver name=vmm
+file path=usr/kernel/drv/$(ARCH64)/viona
file path=usr/kernel/drv/$(ARCH64)/vmm
+file path=usr/kernel/drv/viona.conf
file path=usr/kernel/drv/vmm.conf
file path=usr/sbin/bhyve mode=0555
file path=usr/sbin/bhyvectl mode=0555
diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c
index 6c5868ddde..143077ed32 100644
--- a/usr/src/uts/common/inet/ip/ip6_output.c
+++ b/usr/src/uts/common/inet/ip/ip6_output.c
@@ -23,6 +23,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -866,8 +867,16 @@ ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h,
ixa->ixa_raw_cksum_offset);
cksum = htons(protocol);
} else if (protocol == IPPROTO_ICMPV6) {
- cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
- cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */
+ /*
+ * Currently we assume no HW support for ICMP checksum calc.
+ *
+ * When HW support is advertised for ICMP, we'll want the
+ * following to be set:
+ * cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
+ * cksum = IP_ICMPV6_CSUM_COMP; Pseudo-header cksum
+ */
+
+ return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
} else {
ip_hdr_cksum:
/* No IP header checksum for IPv6 */
diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c
index 1017240521..a0157d3c48 100644
--- a/usr/src/uts/common/inet/ip/ip_output.c
+++ b/usr/src/uts/common/inet/ip/ip_output.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -1738,6 +1739,13 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
#endif
sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
goto ip_hdr_cksum;
+ } else if (protocol == IPPROTO_ICMP) {
+ /*
+ * Note that we always calculate a SW checksum for ICMP. In the
+ * future, if HW support for ICMP is advertised, we can change
+ * this.
+ */
+ return (ip_output_sw_cksum_v4(mp, ipha, ixa));
} else {
ip_hdr_cksum:
/* Calculate IPv4 header checksum */
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index b80cf53882..2e55e6fab8 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -22,6 +22,7 @@ static const char rcsid[] = "@(#)$Id: ip_fil_solaris.c,v 2.62.2.19 2005/07/13 21
#include <sys/filio.h>
#include <sys/systm.h>
#include <sys/strsubr.h>
+#include <sys/strsun.h>
#include <sys/cred.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
@@ -84,9 +85,19 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t,
static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t,
void *));
static int ipf_hook6 __P((hook_data_t, int, int, void *));
+
+static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *));
+static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t,
+ void *));
+
extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *));
extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *));
+static int ipf_hook_protocol_notify __P((hook_notify_cmd_t, void *,
+ const char *, const char *, const char *));
+static int ipf_hook_instance_notify __P((hook_notify_cmd_t, void *,
+ const char *, const char *, const char *));
+
#if SOLARIS2 < 10
#if SOLARIS2 >= 7
u_int *ip_ttl_ptr = NULL;
@@ -153,6 +164,12 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
char *hook6_loop_out = "ipfilter_hook6_loop_out";
char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+/* viona hook names */
+char *hook_viona_in = "ipfilter_hookviona_in";
+char *hook_viona_in_gz = "ipfilter_hookviona_in_gz";
+char *hook_viona_out = "ipfilter_hookviona_out";
+char *hook_viona_out_gz = "ipfilter_hookviona_out_gz";
+
/* ------------------------------------------------------------------------ */
/* Function: ipldetach */
/* Returns: int - 0 == success, else error. */
@@ -249,8 +266,40 @@ ipf_stack_t *ifs;
ifs->ifs_ipf_ipv4 = NULL;
}
+ /*
+ * Remove notification of viona hooks
+ */
+ net_instance_notify_unregister(ifs->ifs_netid,
+ ipf_hook_instance_notify);
+
#undef UNDO_HOOK
+ /*
+ * Normally, viona will unregister itself before ipldetach() is called,
+ * so these will be no-ops, but out of caution, we try to make sure
+ * we've removed any of our references.
+ */
+ (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL,
+ NH_PHYSICAL_IN);
+ (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL,
+ NH_PHYSICAL_OUT);
+
+ {
+ char netidstr[12]; /* Large enough for INT_MAX + NUL */
+ (void) snprintf(netidstr, sizeof (netidstr), "%d",
+ ifs->ifs_netid);
+
+ /*
+ * The notify callbacks expect the netid value passed as a
+ * string in the third argument. To prevent confusion if
+ * traced, we pass the same value the nethook framework would
+ * pass, even though the callback does not currently use the
+ * value.
+ */
+ (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr,
+ NULL, Hn_VIONA);
+ }
+
#ifdef IPFDEBUG
cmn_err(CE_CONT, "ipldetach()\n");
#endif
@@ -446,6 +495,21 @@ ipf_stack_t *ifs;
}
/*
+ * VIONA INET hooks. While the nethook framework allows us to register
+ * hooks for events that haven't been registered yet, we instead
+ * register and unregister our hooks in response to notifications
+ * about the viona hooks from the nethook framework. This prevents
+ * problems when the viona module gets unloaded while the ipf module
+ * does not. If we do not unregister our hooks after the viona module
+ * is unloaded, the viona module cannot later re-register them if it
+ * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded
+ * even on DEBUG kernels, they do not experience this issue.
+ */
+ if (net_instance_notify_register(id, ipf_hook_instance_notify,
+ ifs) != 0)
+ goto hookup_failed;
+
+ /*
* Reacquire ipf_global, now it is safe.
*/
WRITE_ENTER(&ifs->ifs_ipf_global);
@@ -508,6 +572,155 @@ hookup_failed:
return -1;
}
+/* ------------------------------------------------------------------------ */
+/*
+ * Called whenever a nethook protocol is registered or unregistered. Currently
+ * only used to add or remove the hooks for viona.
+ *
+ * While the function signature requires returning int, nothing
+ * in usr/src/uts/common/io/hook.c that invokes the callbacks
+ * captures the return value (nor is there currently any documentation
+ * on what return values should be). For now at least, we'll return 0
+ * on success (or 'not applicable') or an error value. Even if the
+ * nethook framework doesn't use the return value, it can be observed via
+ * dtrace if needed.
+ */
+static int
+ipf_hook_protocol_notify(hook_notify_cmd_t command, void *arg,
+ const char *name, const char *dummy __unused, const char *he_name)
+{
+ ipf_stack_t *ifs = arg;
+ hook_t **hookpp;
+ char *hook_name, *hint_name;
+ hook_func_t hookfn;
+ boolean_t *hookedp;
+ hook_hint_t hint;
+ boolean_t out;
+ int ret = 0;
+
+ const boolean_t gz = ifs->ifs_gz_controlled;
+
+ /* We currently only care about viona hook notifications */
+ if (strcmp(name, Hn_VIONA) != 0)
+ return (0);
+
+ if (strcmp(he_name, NH_PHYSICAL_IN) == 0) {
+ out = B_FALSE;
+ } else if (strcmp(he_name, NH_PHYSICAL_OUT) == 0) {
+ out = B_TRUE;
+ } else {
+ /*
+ * If we've added more hook events to viona, we must add
+ * the corresponding handling here (even if it's just to
+ * ignore it); otherwise the firewall may not work as
+ * intended.
+ */
+ cmn_err(CE_PANIC, "%s: unhandled hook event %s", __func__,
+ he_name);
+
+ return (0);
+ }
+
+ if (out) {
+ hookpp = &ifs->ifs_ipfhookviona_out;
+ hookfn = ipf_hookviona_out;
+ hookedp = &ifs->ifs_hookviona_physical_out;
+ name = gz ? hook_viona_out_gz : hook_viona_out;
+ hint = gz ? HH_AFTER : HH_BEFORE;
+ hint_name = gz ? hook_viona_out : hook_viona_out_gz;
+ } else {
+ hookpp = &ifs->ifs_ipfhookviona_in;
+ hookfn = ipf_hookviona_in;
+ hookedp = &ifs->ifs_hookviona_physical_in;
+ name = gz ? hook_viona_in_gz : hook_viona_in;
+ hint = gz ? HH_BEFORE : HH_AFTER;
+ hint_name = gz ? hook_viona_in : hook_viona_in_gz;
+ }
+
+ switch (command) {
+ default:
+ case HN_NONE:
+ break;
+ case HN_REGISTER:
+ HOOK_INIT(*hookpp, hookfn, (char *)name, ifs);
+ (*hookpp)->h_hint = hint;
+ (*hookpp)->h_hintvalue = (uintptr_t)hint_name;
+ ret = net_hook_register(ifs->ifs_ipf_viona,
+ (char *)he_name, *hookpp);
+ if (ret != 0) {
+ cmn_err(CE_NOTE, "%s: could not register hook "
+ "(hook family=%s hook=%s) err=%d", __func__,
+ name, he_name, ret);
+ *hookedp = B_FALSE;
+ return (ret);
+ }
+ *hookedp = B_TRUE;
+ break;
+ case HN_UNREGISTER:
+ if (ifs->ifs_ipf_viona == NULL)
+ break;
+
+ ret = *hookedp ? net_hook_unregister(ifs->ifs_ipf_viona,
+ (char *)he_name, *hookpp) : 0;
+ if ((ret == 0 || ret == ENXIO)) {
+ if (*hookpp != NULL) {
+ hook_free(*hookpp);
+ *hookpp = NULL;
+ }
+ *hookedp = B_FALSE;
+ }
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * Called whenever a new nethook instance is created. Currently only used
+ * with the Hn_VIONA nethooks. Similar to ipf_hook_protocol_notify, the
+ * function signature must return an int, though the result is never used.
+ * We elect to return 0 on success (or not applicable) or a non-zero value
+ * on error.
+ */
+static int
+ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg,
+ const char *netid, const char *dummy __unused, const char *instance)
+{
+ ipf_stack_t *ifs = arg;
+ int ret = 0;
+
+ /* We currently only care about viona hooks */
+ if (strcmp(instance, Hn_VIONA) != 0)
+ return (0);
+
+ switch (command) {
+ case HN_NONE:
+ default:
+ return (0);
+ case HN_REGISTER:
+ ifs->ifs_ipf_viona = net_protocol_lookup(ifs->ifs_netid,
+ NHF_VIONA);
+
+ if (ifs->ifs_ipf_viona == NULL)
+ return (EPROTONOSUPPORT);
+
+ ret = net_protocol_notify_register(ifs->ifs_ipf_viona,
+ ipf_hook_protocol_notify, ifs);
+ VERIFY(ret == 0 || ret == ESHUTDOWN);
+ break;
+ case HN_UNREGISTER:
+ if (ifs->ifs_ipf_viona == NULL)
+ break;
+ VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona,
+ ipf_hook_protocol_notify));
+ VERIFY0(net_protocol_release(ifs->ifs_ipf_viona));
+ ifs->ifs_ipf_viona = NULL;
+ break;
+ }
+
+ return (ret);
+}
+
static int fr_setipfloopback(set, ifs)
int set;
ipf_stack_t *ifs;
@@ -2043,6 +2256,124 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg)
return ipf_hook6(info, 1, FI_NOCKSUM, arg);
}
+/* Static constants used by ipf_hook_ether */
+static uint8_t ipf_eth_bcast_addr[ETHERADDRL] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+static uint8_t ipf_eth_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
+static uint8_t ipf_eth_ipv6_mcast[2] = { 0x33, 0x33 };
+
+/* ------------------------------------------------------------------------ */
+/* Function: ipf_hook_ether */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: token(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The ipf_hook_ether hook is currently private to illumos. It represents */
+/* a layer 2 datapath generally used by virtual machines. Currently the */
+/* hook is only used by the viona driver to pass along L2 frames for */
+/* inspection. It requires that the L2 ethernet header is contained within */
/* a single dblk_t (however layers above the L2 header have no restrictions */
+/* in ipf). ipf does not currently support filtering on L2 fields (e.g. */
+/* filtering on a MAC address or ethertype), however virtual machines do */
+/* not have native IP stack instances where ipf traditionally hooks in. */
+/* Instead this entry point is used to determine if the packet is unicast, */
+/* broadcast, or multicast. The IPv4 or IPv6 packet is then passed to the */
+/* traditional ip hooks for filtering. Non IPv4 or non IPv6 packets are */
+/* not subject to examination. */
+/* ------------------------------------------------------------------------ */
+int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg,
+ boolean_t out)
+{
+ struct ether_header *ethp;
+ hook_pkt_event_t *hpe = (hook_pkt_event_t *)info;
+ mblk_t *mp;
+ size_t offset, len;
+ uint16_t etype;
+ boolean_t v6;
+
+ /*
+ * viona will only pass us mblks with the L2 header contained in a
+ * single data block.
+ */
+ mp = *hpe->hpe_mp;
+ len = MBLKL(mp);
+
+ VERIFY3S(len, >=, sizeof (struct ether_header));
+
+ ethp = (struct ether_header *)mp->b_rptr;
+ if ((etype = ntohs(ethp->ether_type)) == ETHERTYPE_VLAN) {
+ struct ether_vlan_header *evh =
+ (struct ether_vlan_header *)ethp;
+
+ VERIFY3S(len, >=, sizeof (struct ether_vlan_header));
+
+ etype = ntohs(evh->ether_type);
+ offset = sizeof (*evh);
+ } else {
+ offset = sizeof (*ethp);
+ }
+
+ /*
+ * ipf only supports filtering IPv4 and IPv6. Ignore other types.
+ */
+ if (etype == ETHERTYPE_IP)
+ v6 = B_FALSE;
+ else if (etype == ETHERTYPE_IPV6)
+ v6 = B_TRUE;
+ else
+ return (0);
+
+ if (bcmp(ipf_eth_bcast_addr, ethp, ETHERADDRL) == 0)
+ hpe->hpe_flags |= HPE_BROADCAST;
+ else if (bcmp(ipf_eth_ipv4_mcast, ethp,
+ sizeof (ipf_eth_ipv4_mcast)) == 0)
+ hpe->hpe_flags |= HPE_MULTICAST;
+ else if (bcmp(ipf_eth_ipv6_mcast, ethp,
+ sizeof (ipf_eth_ipv6_mcast)) == 0)
+ hpe->hpe_flags |= HPE_MULTICAST;
+
+ /* Find the start of the IPv4 or IPv6 header */
+ for (; offset >= len; len = MBLKL(mp)) {
+ offset -= len;
+ mp = mp->b_cont;
+ if (mp == NULL) {
+ freemsg(*hpe->hpe_mp);
+ *hpe->hpe_mp = NULL;
+ return (-1);
+ }
+ }
+ hpe->hpe_mb = mp;
+ hpe->hpe_hdr = mp->b_rptr + offset;
+
+ return (v6 ? ipf_hook6(info, out, 0, arg) :
+ ipf_hook(info, out, 0, arg));
+}
+
+/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookviona_{in,out} */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
/* The viona hooks are private hooks to illumos. They represent a layer 2 */
/* datapath generally used to implement virtual machines, passing along */
/* L2 packets. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+int
+ipf_hookviona_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return (ipf_hook_ether(token, info, arg, B_FALSE));
+}
+
+int
+ipf_hookviona_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return (ipf_hook_ether(token, info, arg, B_TRUE));
+}
+
/* ------------------------------------------------------------------------ */
/* Function: ipf_hook4_loop_in */
/* Returns: int - 0 == packet ok, else problem, free packet if not done */
@@ -2386,7 +2717,7 @@ fr_info_t *fin;
#ifdef USE_INET6
struct in6_addr tmp_src6;
#endif
-
+
ASSERT(fin->fin_p == IPPROTO_TCP);
/*
@@ -2428,7 +2759,7 @@ fr_info_t *fin;
#endif
if (tcp != NULL) {
- /*
+ /*
* Adjust TCP header:
* swap ports,
* set flags,
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index a239f1c1ca..0ceea1e921 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -6,7 +6,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright 2014 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc. All rights reserved.
*/
#ifndef __IPF_STACK_H__
@@ -87,8 +87,8 @@ struct ipf_stack {
#endif
int ifs_ipf_locks_done;
- ipftoken_t *ifs_ipftokenhead;
- ipftoken_t **ifs_ipftokentail;
+ ipftoken_t *ifs_ipftokenhead;
+ ipftoken_t **ifs_ipftokentail;
ipfmutex_t ifs_ipl_mutex;
ipfmutex_t ifs_ipf_authmx;
@@ -126,6 +126,9 @@ struct ipf_stack {
hook_t *ifs_ipfhook6_loop_out;
hook_t *ifs_ipfhook6_nicevents;
+ hook_t *ifs_ipfhookviona_in;
+ hook_t *ifs_ipfhookviona_out;
+
/* flags to indicate whether hooks are registered. */
boolean_t ifs_hook4_physical_in;
boolean_t ifs_hook4_physical_out;
@@ -137,10 +140,13 @@ struct ipf_stack {
boolean_t ifs_hook6_nic_events;
boolean_t ifs_hook6_loopback_in;
boolean_t ifs_hook6_loopback_out;
+ boolean_t ifs_hookviona_physical_in;
+ boolean_t ifs_hookviona_physical_out;
int ifs_ipf_loopback;
net_handle_t ifs_ipf_ipv4;
net_handle_t ifs_ipf_ipv6;
+ net_handle_t ifs_ipf_viona;
/* ip_auth.c */
int ifs_fr_authsize;
@@ -167,8 +173,8 @@ struct ipf_stack {
ipfr_t **ifs_ipfr_nattail;
ipfr_t **ifs_ipfr_nattab;
- ipfr_t *ifs_ipfr_ipidlist;
- ipfr_t **ifs_ipfr_ipidtail;
+ ipfr_t *ifs_ipfr_ipidlist;
+ ipfr_t **ifs_ipfr_ipidtail;
ipfr_t **ifs_ipfr_ipidtab;
ipfrstat_t ifs_ipfr_stats;
diff --git a/usr/src/uts/common/io/hook.c b/usr/src/uts/common/io/hook.c
index eb139a37e2..44af26e7c4 100644
--- a/usr/src/uts/common/io/hook.c
+++ b/usr/src/uts/common/io/hook.c
@@ -1050,7 +1050,7 @@ hook_family_free(hook_family_int_t *hfi, hook_stack_t *hks)
/* Free container */
kmem_free(hfi, sizeof (*hfi));
- if (hks->hks_shutdown == 2)
+ if (hks != NULL && hks->hks_shutdown == 2)
hook_stack_remove(hks);
mutex_exit(&hook_stack_lock);
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index 5bc2bd41c5..54aad9307a 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -673,11 +674,11 @@ typedef struct {
#define HCKSUM_ENABLE 0x01 /* Set to enable hardware checksum */
/* capability */
#define HCKSUM_INET_PARTIAL 0x02 /* Partial 1's complement checksum */
- /* ability */
+ /* ability for TCP/UDP packets. */
#define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */
- /* ability for IPv4 packets. */
+ /* ability for IPv4 TCP/UDP packets. */
#define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */
- /* ability for IPv6 packets. */
+ /* ability for IPv6 TCP/UDP packets. */
#define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */
/* capability */
#ifdef _KERNEL
diff --git a/usr/src/uts/common/sys/hook_impl.h b/usr/src/uts/common/sys/hook_impl.h
index d8a15f0fe5..f3337bbacf 100644
--- a/usr/src/uts/common/sys/hook_impl.h
+++ b/usr/src/uts/common/sys/hook_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018, Joyent, Inc.
*/
/*
@@ -171,7 +172,7 @@ typedef struct hook_family_int {
cvwaitlock_t hfi_lock;
SLIST_ENTRY(hook_family_int) hfi_entry;
hook_event_int_head_t hfi_head;
- hook_family_t hfi_family;
+ hook_family_t hfi_family;
kstat_t *hfi_kstat;
struct hook_stack *hfi_stack;
hook_notify_head_t hfi_nhead;
@@ -209,6 +210,7 @@ typedef struct hook_stack_head hook_stack_head_t;
#define Hn_ARP "arp"
#define Hn_IPV4 "inet"
#define Hn_IPV6 "inet6"
+#define Hn_VIONA "viona_inet"
extern int hook_run(hook_family_int_t *, hook_event_token_t, hook_data_t);
extern int hook_register(hook_family_int_t *, char *, hook_t *);
diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h
index b21504109c..e7027f8ece 100644
--- a/usr/src/uts/common/sys/neti.h
+++ b/usr/src/uts/common/sys/neti.h
@@ -21,6 +21,8 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018, Joyent, Inc.
*/
#ifndef _SYS_NETI_H
@@ -46,6 +48,7 @@ struct msgb; /* avoiding sys/stream.h here */
#define NHF_INET "NHF_INET"
#define NHF_INET6 "NHF_INET6"
#define NHF_ARP "NHF_ARP"
+#define NHF_VIONA "NHF_VIONA"
/*
* Event identification
@@ -61,7 +64,7 @@ struct msgb; /* avoiding sys/stream.h here */
/*
* Network NIC hardware checksum capability
*/
-#define NET_HCK_NONE 0x00
+#define NET_HCK_NONE 0x00
#define NET_HCK_L3_FULL 0x01
#define NET_HCK_L3_PART 0x02
#define NET_HCK_L4_FULL 0x10
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index ca4ae0cd65..312c0f233d 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -276,7 +276,11 @@ VMM_OBJS += vmm.o \
vmm_support.o \
vmm_zsd.o
-VIONA_OBJS += viona.o
+VIONA_OBJS += viona_main.o \
+ viona_ring.o \
+ viona_rx.o \
+ viona_tx.o \
+ viona_hook.o \
#
# Build up defines and paths.
diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc
index b66b0ca2da..b60d24d82c 100644
--- a/usr/src/uts/i86pc/Makefile.i86pc
+++ b/usr/src/uts/i86pc/Makefile.i86pc
@@ -247,6 +247,7 @@ DRV_KMODS += ioat
DRV_KMODS += fipe
DRV_KMODS += imc imcstub
DRV_KMODS += vmm
+DRV_KMODS += viona
DRV_KMODS += cpudrv
diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c
deleted file mode 100644
index 2371a2f3ae..0000000000
--- a/usr/src/uts/i86pc/io/viona/viona.c
+++ /dev/null
@@ -1,1409 +0,0 @@
-/*
- * Copyright (c) 2013 Chris Torek <torek @ torek net>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- *
- * Copyright 2015 Pluribus Networks Inc.
- * Copyright 2017 Joyent, Inc.
- */
-
-#include <sys/conf.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/sunndi.h>
-#include <sys/sysmacros.h>
-#include <sys/strsubr.h>
-#include <sys/strsun.h>
-#include <vm/seg_kmem.h>
-
-#include <sys/dls.h>
-#include <sys/mac_client.h>
-
-#include <sys/viona_io.h>
-
-#define MB (1024UL * 1024)
-#define GB (1024UL * MB)
-
-/*
- * Min. octets in an ethernet frame minus FCS
- */
-#define MIN_BUF_SIZE 60
-
-#define VIONA_NAME "Virtio Network Accelerator"
-
-#define VIONA_CTL_MINOR 0
-#define VIONA_CTL_NODE_NAME "ctl"
-
-#define VIONA_CLI_NAME "viona"
-
-#define VTNET_MAXSEGS 32
-
-#define VRING_ALIGN 4096
-
-#define VRING_DESC_F_NEXT (1 << 0)
-#define VRING_DESC_F_WRITE (1 << 1)
-#define VRING_DESC_F_INDIRECT (1 << 2)
-
-#define VRING_AVAIL_F_NO_INTERRUPT 1
-
-#define VRING_USED_F_NO_NOTIFY 1
-
-#define BCM_NIC_DRIVER "bnxe"
-/*
- * Host capabilities
- */
-#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
-#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
-#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
-
-#define VIONA_S_HOSTCAPS \
- (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | \
- VIRTIO_NET_F_STATUS)
-
-#pragma pack(1)
-struct virtio_desc {
- uint64_t vd_addr;
- uint32_t vd_len;
- uint16_t vd_flags;
- uint16_t vd_next;
-};
-#pragma pack()
-
-#pragma pack(1)
-struct virtio_used {
- uint32_t vu_idx;
- uint32_t vu_tlen;
-};
-#pragma pack()
-
-#pragma pack(1)
-struct virtio_net_mrgrxhdr {
- uint8_t vrh_flags;
- uint8_t vrh_gso_type;
- uint16_t vrh_hdr_len;
- uint16_t vrh_gso_size;
- uint16_t vrh_csum_start;
- uint16_t vrh_csum_offset;
- uint16_t vrh_bufs;
-};
-struct virtio_net_hdr {
- uint8_t vrh_flags;
- uint8_t vrh_gso_type;
- uint16_t vrh_hdr_len;
- uint16_t vrh_gso_size;
- uint16_t vrh_csum_start;
- uint16_t vrh_csum_offset;
-};
-#pragma pack()
-
-typedef struct viona_vring_hqueue {
- /* Internal state */
- uint16_t hq_size;
- kmutex_t hq_a_mutex;
- kmutex_t hq_u_mutex;
- uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
-
- /* Host-context pointers to the queue */
- caddr_t hq_baseaddr;
- uint16_t *hq_avail_flags;
- uint16_t *hq_avail_idx; /* monotonically increasing */
- uint16_t *hq_avail_ring;
-
- uint16_t *hq_used_flags;
- uint16_t *hq_used_idx; /* monotonically increasing */
- struct virtio_used *hq_used_ring;
-} viona_vring_hqueue_t;
-
-
-typedef struct viona_link {
- datalink_id_t l_linkid;
-
- struct vm *l_vm;
- size_t l_vm_lomemsize;
- caddr_t l_vm_lomemaddr;
- size_t l_vm_himemsize;
- caddr_t l_vm_himemaddr;
-
- mac_handle_t l_mh;
- mac_client_handle_t l_mch;
-
- kmem_cache_t *l_desb_kmc;
-
- pollhead_t l_pollhead;
-
- viona_vring_hqueue_t l_rx_vring;
- uint_t l_rx_intr;
-
- viona_vring_hqueue_t l_tx_vring;
- kcondvar_t l_tx_cv;
- uint_t l_tx_intr;
- kmutex_t l_tx_mutex;
- int l_tx_outstanding;
- uint32_t l_features;
-} viona_link_t;
-
-typedef struct {
- frtn_t d_frtn;
- viona_link_t *d_link;
- uint_t d_ref;
- uint16_t d_cookie;
- int d_len;
-} viona_desb_t;
-
-typedef struct viona_soft_state {
- viona_link_t *ss_link;
-} viona_soft_state_t;
-
-typedef struct used_elem {
- uint16_t id;
- uint32_t len;
-} used_elem_t;
-
-static void *viona_state;
-static dev_info_t *viona_dip;
-static id_space_t *viona_minor_ids;
-/*
- * copy tx mbufs from virtio ring to avoid necessitating a wait for packet
- * transmission to free resources.
- */
-static boolean_t copy_tx_mblks = B_TRUE;
-
-extern struct vm *vm_lookup_by_name(char *name);
-extern uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len);
-
-static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
-static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
-static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
-static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
-static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
- cred_t *credp, int *rval);
-static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
- struct pollhead **phpp);
-
-static int viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create);
-static int viona_ioc_delete(viona_soft_state_t *ss);
-
-static int viona_vm_map(viona_link_t *link);
-static caddr_t viona_gpa2kva(viona_link_t *link, uint64_t gpa);
-static void viona_vm_unmap(viona_link_t *link);
-
-static int viona_ioc_rx_ring_init(viona_link_t *link,
- vioc_ring_init_t *u_ri);
-static int viona_ioc_tx_ring_init(viona_link_t *link,
- vioc_ring_init_t *u_ri);
-static int viona_ioc_rx_ring_reset(viona_link_t *link);
-static int viona_ioc_tx_ring_reset(viona_link_t *link);
-static void viona_ioc_rx_ring_kick(viona_link_t *link);
-static void viona_ioc_tx_ring_kick(viona_link_t *link);
-static int viona_ioc_rx_intr_clear(viona_link_t *link);
-static int viona_ioc_tx_intr_clear(viona_link_t *link);
-
-static void viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
- boolean_t loopback);
-static void viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq);
-
-static struct cb_ops viona_cb_ops = {
- viona_open,
- viona_close,
- nodev,
- nodev,
- nodev,
- nodev,
- nodev,
- viona_ioctl,
- nodev,
- nodev,
- nodev,
- viona_chpoll,
- ddi_prop_op,
- 0,
- D_MP | D_NEW | D_HOTPLUG,
- CB_REV,
- nodev,
- nodev
-};
-
-static struct dev_ops viona_ops = {
- DEVO_REV,
- 0,
- nodev,
- nulldev,
- nulldev,
- viona_attach,
- viona_detach,
- nodev,
- &viona_cb_ops,
- NULL,
- ddi_power,
- ddi_quiesce_not_needed
-};
-
-static struct modldrv modldrv = {
- &mod_driverops,
- VIONA_NAME,
- &viona_ops,
-};
-
-static struct modlinkage modlinkage = {
- MODREV_1, &modldrv, NULL
-};
-
-int
-_init(void)
-{
- int ret;
-
- ret = ddi_soft_state_init(&viona_state,
- sizeof (viona_soft_state_t), 0);
- if (ret == 0) {
- ret = mod_install(&modlinkage);
- if (ret != 0) {
- ddi_soft_state_fini(&viona_state);
- return (ret);
- }
- }
-
- return (ret);
-}
-
-int
-_fini(void)
-{
- int ret;
-
- ret = mod_remove(&modlinkage);
- if (ret == 0) {
- ddi_soft_state_fini(&viona_state);
- }
-
- return (ret);
-}
-
-int
-_info(struct modinfo *modinfop)
-{
- return (mod_info(&modlinkage, modinfop));
-}
-
-static void
-set_viona_tx_mode()
-{
- major_t bcm_nic_major;
- if ((bcm_nic_major = ddi_name_to_major(BCM_NIC_DRIVER))
- != DDI_MAJOR_T_NONE) {
- if (ddi_hold_installed_driver(bcm_nic_major) != NULL) {
- copy_tx_mblks = B_FALSE;
- ddi_rele_driver(bcm_nic_major);
- }
- }
-}
-
-static int
-viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
-{
- if (cmd != DDI_ATTACH) {
- return (DDI_FAILURE);
- }
-
- viona_minor_ids = id_space_create("viona_minor_id",
- VIONA_CTL_MINOR + 1, UINT16_MAX);
-
- if (ddi_create_minor_node(dip, VIONA_CTL_NODE_NAME,
- S_IFCHR, VIONA_CTL_MINOR, DDI_PSEUDO, 0) != DDI_SUCCESS) {
- return (DDI_FAILURE);
- }
-
- viona_dip = dip;
-
- set_viona_tx_mode();
- ddi_report_dev(viona_dip);
-
- return (DDI_SUCCESS);
-}
-
-static int
-viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
-{
- if (cmd != DDI_DETACH) {
- return (DDI_FAILURE);
- }
-
- id_space_destroy(viona_minor_ids);
-
- ddi_remove_minor_node(viona_dip, NULL);
-
- viona_dip = NULL;
-
- return (DDI_SUCCESS);
-}
-
-static int
-viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
-{
- int minor;
-
- if (otype != OTYP_CHR) {
- return (EINVAL);
- }
-
- if (drv_priv(credp) != 0) {
- return (EPERM);
- }
-
- if (getminor(*devp) != VIONA_CTL_MINOR) {
- return (ENXIO);
- }
-
- minor = id_alloc(viona_minor_ids);
- if (minor == 0) {
- /* All minors are busy */
- return (EBUSY);
- }
-
- if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
- id_free(viona_minor_ids, minor);
- }
-
- *devp = makedevice(getmajor(*devp), minor);
-
- return (0);
-}
-
-static int
-viona_close(dev_t dev, int flag, int otype, cred_t *credp)
-{
- int minor;
- viona_soft_state_t *ss;
-
- if (otype != OTYP_CHR) {
- return (EINVAL);
- }
-
- if (drv_priv(credp) != 0) {
- return (EPERM);
- }
-
- minor = getminor(dev);
-
- ss = ddi_get_soft_state(viona_state, minor);
- if (ss == NULL) {
- return (ENXIO);
- }
-
- viona_ioc_delete(ss);
-
- ddi_soft_state_free(viona_state, minor);
-
- id_free(viona_minor_ids, minor);
-
- return (0);
-}
-
-static int
-viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
- cred_t *credp, int *rval)
-{
- viona_soft_state_t *ss;
- int err = 0;
-
- ss = ddi_get_soft_state(viona_state, getminor(dev));
- if (ss == NULL) {
- return (ENXIO);
- }
-
- switch (cmd) {
- case VNA_IOC_CREATE:
- err = viona_ioc_create(ss, (vioc_create_t *)data);
- break;
- case VNA_IOC_DELETE:
- err = viona_ioc_delete(ss);
- break;
- case VNA_IOC_SET_FEATURES:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- ss->ss_link->l_features = *(int *)data & VIONA_S_HOSTCAPS;
- break;
- case VNA_IOC_GET_FEATURES:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- *(int *)data = VIONA_S_HOSTCAPS;
- break;
- case VNA_IOC_RX_RING_INIT:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- err = viona_ioc_rx_ring_init(ss->ss_link,
- (vioc_ring_init_t *)data);
- break;
- case VNA_IOC_RX_RING_RESET:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- err = viona_ioc_rx_ring_reset(ss->ss_link);
- break;
- case VNA_IOC_RX_RING_KICK:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- viona_ioc_rx_ring_kick(ss->ss_link);
- err = 0;
- break;
- case VNA_IOC_TX_RING_INIT:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- err = viona_ioc_tx_ring_init(ss->ss_link,
- (vioc_ring_init_t *)data);
- break;
- case VNA_IOC_TX_RING_RESET:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- err = viona_ioc_tx_ring_reset(ss->ss_link);
- break;
- case VNA_IOC_TX_RING_KICK:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- viona_ioc_tx_ring_kick(ss->ss_link);
- err = 0;
- break;
- case VNA_IOC_RX_INTR_CLR:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- err = viona_ioc_rx_intr_clear(ss->ss_link);
- break;
- case VNA_IOC_TX_INTR_CLR:
- if (ss->ss_link == NULL) {
- return (ENOSYS);
- }
- err = viona_ioc_tx_intr_clear(ss->ss_link);
- break;
- default:
- err = ENOTTY;
- break;
- }
-
- return (err);
-}
-
-static int
-viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
- struct pollhead **phpp)
-{
- viona_soft_state_t *ss;
-
- ss = ddi_get_soft_state(viona_state, getminor(dev));
- if (ss == NULL || ss->ss_link == NULL) {
- return (ENXIO);
- }
-
- *reventsp = 0;
-
- if (ss->ss_link->l_rx_intr && (events & POLLIN)) {
- *reventsp |= POLLIN;
- }
-
- if (ss->ss_link->l_tx_intr && (events & POLLOUT)) {
- *reventsp |= POLLOUT;
- }
-
- if (*reventsp == 0 && !anyyet) {
- *phpp = &ss->ss_link->l_pollhead;
- }
-
- return (0);
-}
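
The poll(2)/ioctl interplay served by this chpoll entry point is easiest to see from userland. A minimal sketch follows; it assumes only the VNA_IOC_*_INTR_CLR commands handled in viona_ioctl() above, and the header path <sys/viona_io.h> plus the helper name are assumptions of this illustration rather than something established by this change.

#include <poll.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/viona_io.h>	/* VNA_IOC_* definitions (assumed path) */

/* Block until either ring has posted an interrupt condition, then ack it. */
static void
wait_for_ring_intr(int viona_fd)
{
	struct pollfd pfd = { .fd = viona_fd, .events = POLLIN | POLLOUT };

	if (poll(&pfd, 1, -1) > 0) {
		if (pfd.revents & POLLIN)
			(void) ioctl(viona_fd, VNA_IOC_RX_INTR_CLR);
		if (pfd.revents & POLLOUT)
			(void) ioctl(viona_fd, VNA_IOC_TX_INTR_CLR);
	}
}
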
-
-static int
-viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create)
-{
- vioc_create_t k_create;
- viona_link_t *link;
- char cli_name[MAXNAMELEN];
- int err;
-
- if (ss->ss_link != NULL) {
- return (ENOSYS);
- }
- if (copyin(u_create, &k_create, sizeof (k_create)) != 0) {
- return (EFAULT);
- }
-
- link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
-
- link->l_linkid = k_create.c_linkid;
- link->l_vm = vm_lookup_by_name(k_create.c_vmname);
- if (link->l_vm == NULL) {
- err = ENXIO;
- goto bail;
- }
-
- link->l_vm_lomemsize = k_create.c_lomem_size;
- link->l_vm_himemsize = k_create.c_himem_size;
- err = viona_vm_map(link);
- if (err != 0) {
- goto bail;
- }
-
- err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
- if (err != 0) {
- cmn_err(CE_WARN, "viona create mac_open_by_linkid"
- " returned %d\n", err);
- goto bail;
- }
-
- snprintf(cli_name, sizeof (cli_name), "%s-%d",
- VIONA_CLI_NAME, link->l_linkid);
- err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
- if (err != 0) {
- cmn_err(CE_WARN, "viona create mac_client_open"
- " returned %d\n", err);
- goto bail;
- }
-
- link->l_features = VIONA_S_HOSTCAPS;
- link->l_desb_kmc = kmem_cache_create(cli_name,
- sizeof (viona_desb_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-
- mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL);
- mutex_init(&link->l_rx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL);
-	mutex_init(&link->l_tx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL);
- mutex_init(&link->l_tx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL);
- if (copy_tx_mblks) {
- mutex_init(&link->l_tx_mutex, NULL, MUTEX_DRIVER, NULL);
- cv_init(&link->l_tx_cv, NULL, CV_DRIVER, NULL);
- }
- ss->ss_link = link;
-
- return (0);
-
-bail:
- if (link->l_mch != NULL) {
- mac_client_close(link->l_mch, 0);
- }
- if (link->l_mh != NULL) {
- mac_close(link->l_mh);
- }
-
- kmem_free(link, sizeof (viona_link_t));
-
- return (err);
-}
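
For reference, a hedged userland sketch of exercising this handler via VNA_IOC_CREATE. The field names (c_linkid, c_vmname, c_lomem_size, c_himem_size) are those referenced by the copyin above; the assumption that c_vmname is a fixed-size array, the header choices, and the wrapper name are illustrative only.

#include <string.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/viona_io.h>	/* vioc_create_t, VNA_IOC_CREATE (assumed path) */

static int
viona_create_link(int viona_fd, uint32_t linkid, const char *vmname,
    uint64_t lomem, uint64_t himem)
{
	vioc_create_t vc;

	(void) memset(&vc, 0, sizeof (vc));
	vc.c_linkid = linkid;		/* datalink ID of the underlying NIC */
	(void) strlcpy(vc.c_vmname, vmname, sizeof (vc.c_vmname));
	vc.c_lomem_size = lomem;	/* guest memory below 4 GB */
	vc.c_himem_size = himem;	/* guest memory above 4 GB */

	return (ioctl(viona_fd, VNA_IOC_CREATE, &vc));
}
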
-
-static int
-viona_ioc_delete(viona_soft_state_t *ss)
-{
- viona_link_t *link;
-
- link = ss->ss_link;
- if (link == NULL) {
- return (ENOSYS);
- }
- if (copy_tx_mblks) {
- mutex_enter(&link->l_tx_mutex);
- while (link->l_tx_outstanding != 0) {
- cv_wait(&link->l_tx_cv, &link->l_tx_mutex);
- }
- mutex_exit(&link->l_tx_mutex);
- }
- if (link->l_mch != NULL) {
- mac_rx_clear(link->l_mch);
- mac_client_close(link->l_mch, 0);
- }
- if (link->l_mh != NULL) {
- mac_close(link->l_mh);
- }
-
- viona_vm_unmap(link);
- mutex_destroy(&link->l_tx_vring.hq_a_mutex);
- mutex_destroy(&link->l_tx_vring.hq_u_mutex);
- mutex_destroy(&link->l_rx_vring.hq_a_mutex);
- mutex_destroy(&link->l_rx_vring.hq_u_mutex);
- if (copy_tx_mblks) {
- mutex_destroy(&link->l_tx_mutex);
- cv_destroy(&link->l_tx_cv);
- }
-
- kmem_cache_destroy(link->l_desb_kmc);
-
- kmem_free(link, sizeof (viona_link_t));
-
- ss->ss_link = NULL;
-
- return (0);
-}
-
-static caddr_t
-viona_mapin_vm_chunk(viona_link_t *link, uint64_t gpa, size_t len)
-{
- caddr_t addr;
- size_t offset;
- pfn_t pfnum;
-
- if (len == 0)
- return (NULL);
-
- addr = vmem_alloc(heap_arena, len, VM_SLEEP);
- if (addr == NULL)
- return (NULL);
-
- for (offset = 0; offset < len; offset += PAGESIZE) {
- pfnum = btop(vm_gpa2hpa(link->l_vm, gpa + offset, PAGESIZE));
- ASSERT(pfnum);
- hat_devload(kas.a_hat, addr + offset, PAGESIZE, pfnum,
- PROT_READ | PROT_WRITE, HAT_LOAD_LOCK);
- }
-
- return (addr);
-}
-
-/*
- * Map the guest physical address space into the kernel virtual address space.
- */
-static int
-viona_vm_map(viona_link_t *link)
-{
- link->l_vm_lomemaddr = viona_mapin_vm_chunk(link,
- 0, link->l_vm_lomemsize);
- if (link->l_vm_lomemaddr == NULL)
- return (-1);
- link->l_vm_himemaddr = viona_mapin_vm_chunk(link,
- 4 * (1024 * 1024 * 1024UL), link->l_vm_himemsize);
- if (link->l_vm_himemsize && link->l_vm_himemaddr == NULL)
- return (-1);
-
- return (0);
-}
-
-/*
- * Translate a guest physical address into a kernel virtual address.
- */
-static caddr_t
-viona_gpa2kva(viona_link_t *link, uint64_t gpa)
-{
- if (gpa < link->l_vm_lomemsize)
- return (link->l_vm_lomemaddr + gpa);
-
- gpa -= (4 * GB);
- if (gpa < link->l_vm_himemsize)
- return (link->l_vm_himemaddr + gpa);
-
- return (NULL);
-}
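
As a worked example (memory sizes chosen purely for illustration): with l_vm_lomemsize of 2 GB and l_vm_himemsize of 1 GB, a guest physical address of 512 MB is below the low-memory bound and resolves to l_vm_lomemaddr + 512 MB; an address of 4 GB + 256 MB is rebased by subtracting 4 GB and resolves to l_vm_himemaddr + 256 MB; an address of 3 GB falls in neither window (it is beyond lomemsize, and after the 4 GB rebase it lands far outside himemsize), so the function returns NULL and the caller must treat the descriptor address as invalid.
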
-
-static void
-viona_vm_unmap(viona_link_t *link)
-{
- if (link->l_vm_lomemaddr) {
- hat_unload(kas.a_hat, link->l_vm_lomemaddr,
- link->l_vm_lomemsize, HAT_UNLOAD_UNLOCK);
- vmem_free(heap_arena, link->l_vm_lomemaddr,
- link->l_vm_lomemsize);
- }
- if (link->l_vm_himemaddr) {
- hat_unload(kas.a_hat, link->l_vm_himemaddr,
- link->l_vm_himemsize, HAT_UNLOAD_UNLOCK);
- vmem_free(heap_arena, link->l_vm_himemaddr,
- link->l_vm_himemsize);
- }
-}
-
-static int
-viona_ioc_ring_init_common(viona_link_t *link, viona_vring_hqueue_t *hq,
- vioc_ring_init_t *u_ri)
-{
- vioc_ring_init_t k_ri;
-
- if (copyin(u_ri, &k_ri, sizeof (k_ri)) != 0) {
- return (EFAULT);
- }
-
- hq->hq_size = k_ri.ri_qsize;
- hq->hq_baseaddr = viona_gpa2kva(link, k_ri.ri_qaddr);
- if (hq->hq_baseaddr == NULL)
- return (EINVAL);
-
- hq->hq_avail_flags = (uint16_t *)(viona_gpa2kva(link,
- k_ri.ri_qaddr + hq->hq_size * sizeof (struct virtio_desc)));
- if (hq->hq_avail_flags == NULL)
- return (EINVAL);
- hq->hq_avail_idx = hq->hq_avail_flags + 1;
- hq->hq_avail_ring = hq->hq_avail_flags + 2;
-
- hq->hq_used_flags = (uint16_t *)(viona_gpa2kva(link,
- P2ROUNDUP(k_ri.ri_qaddr +
- hq->hq_size * sizeof (struct virtio_desc) + 2, VRING_ALIGN)));
- if (hq->hq_used_flags == NULL)
- return (EINVAL);
- hq->hq_used_idx = hq->hq_used_flags + 1;
- hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
-
- /*
- * Initialize queue indexes
- */
- hq->hq_cur_aidx = 0;
-
- return (0);
-}
-
-static int
-viona_ioc_rx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri)
-{
- viona_vring_hqueue_t *hq;
- int rval;
-
- hq = &link->l_rx_vring;
-
- rval = viona_ioc_ring_init_common(link, hq, u_ri);
- if (rval != 0) {
- return (rval);
- }
-
- return (0);
-}
-
-static int
-viona_ioc_tx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri)
-{
- viona_vring_hqueue_t *hq;
-
- hq = &link->l_tx_vring;
-
- return (viona_ioc_ring_init_common(link, hq, u_ri));
-}
-
-static int
-viona_ioc_ring_reset_common(viona_vring_hqueue_t *hq)
-{
- /*
- * Reset all soft state
- */
- hq->hq_cur_aidx = 0;
-
- return (0);
-}
-
-static int
-viona_ioc_rx_ring_reset(viona_link_t *link)
-{
- viona_vring_hqueue_t *hq;
-
- mac_rx_clear(link->l_mch);
-
- hq = &link->l_rx_vring;
-
- return (viona_ioc_ring_reset_common(hq));
-}
-
-static int
-viona_ioc_tx_ring_reset(viona_link_t *link)
-{
- viona_vring_hqueue_t *hq;
-
- hq = &link->l_tx_vring;
-
- return (viona_ioc_ring_reset_common(hq));
-}
-
-static void
-viona_ioc_rx_ring_kick(viona_link_t *link)
-{
- viona_vring_hqueue_t *hq = &link->l_rx_vring;
-
- atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY);
-
- mac_rx_set(link->l_mch, viona_rx, link);
-}
-
-/*
- * Return the number of available descriptors in the vring taking care
- * of the 16-bit index wraparound.
- */
-static inline int
-viona_hq_num_avail(viona_vring_hqueue_t *hq)
-{
- uint16_t ndesc;
-
- /*
-	 * We're just computing (a-b) mod 2^16.
- *
- * The only glitch here is that in standard C,
- * uint16_t promotes to (signed) int when int has
- * more than 16 bits (pretty much always now), so
- * we have to force it back to unsigned.
- */
- ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx;
-
- ASSERT(ndesc <= hq->hq_size);
-
- return (ndesc);
-}
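
A stand-alone illustration of the promotion hazard described above (not driver code; the index values are arbitrary). Forcing the operands to unsigned before the subtraction and truncating back to uint16_t yields the correct wrapped distance:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint16_t avail_idx = 3;		/* producer index has wrapped past 0 */
	uint16_t cur_aidx = 65533;	/* consumer index has not */
	uint16_t ndesc = (unsigned)avail_idx - (unsigned)cur_aidx;

	/* Prints 6: (3 - 65533) mod 2^16 */
	(void) printf("%u descriptors pending\n", (unsigned)ndesc);
	return (0);
}
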
-
-static void
-viona_ioc_tx_ring_kick(viona_link_t *link)
-{
- viona_vring_hqueue_t *hq = &link->l_tx_vring;
-
- do {
- atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY);
- while (viona_hq_num_avail(hq)) {
- viona_tx(link, hq);
- }
- if (copy_tx_mblks) {
- mutex_enter(&link->l_tx_mutex);
- if (link->l_tx_outstanding != 0) {
- cv_wait_sig(&link->l_tx_cv, &link->l_tx_mutex);
- }
- mutex_exit(&link->l_tx_mutex);
- }
- atomic_and_16(hq->hq_used_flags, ~VRING_USED_F_NO_NOTIFY);
- } while (viona_hq_num_avail(hq));
-}
-
-static int
-viona_ioc_rx_intr_clear(viona_link_t *link)
-{
- link->l_rx_intr = 0;
-
- return (0);
-}
-
-static int
-viona_ioc_tx_intr_clear(viona_link_t *link)
-{
- link->l_tx_intr = 0;
-
- return (0);
-}
-#define VQ_MAX_DESCRIPTORS 512
-
-static int
-vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov,
- int n_iov, uint16_t *cookie)
-{
- int i;
- int ndesc, nindir;
- int idx, head, next;
- struct virtio_desc *vdir, *vindir, *vp;
-
- idx = hq->hq_cur_aidx;
- ndesc = (uint16_t)((unsigned)*hq->hq_avail_idx - (unsigned)idx);
-
- if (ndesc == 0)
- return (0);
- if (ndesc > hq->hq_size) {
- cmn_err(CE_NOTE, "ndesc (%d) out of range\n", ndesc);
- return (-1);
- }
-
- head = hq->hq_avail_ring[idx & (hq->hq_size - 1)];
- next = head;
-
- for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
- if (next >= hq->hq_size) {
- cmn_err(CE_NOTE, "descriptor index (%d)"
- "out of range\n", next);
- return (-1);
- }
-
- vdir = (struct virtio_desc *)(hq->hq_baseaddr +
- next * sizeof (struct virtio_desc));
- if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
- if (i > n_iov)
- return (-1);
- iov[i].iov_base = viona_gpa2kva(link, vdir->vd_addr);
- if (iov[i].iov_base == NULL) {
- cmn_err(CE_NOTE, "invalid guest physical"
- " address 0x%"PRIx64"\n", vdir->vd_addr);
- return (-1);
- }
- iov[i++].iov_len = vdir->vd_len;
- } else {
- nindir = vdir->vd_len / 16;
- if ((vdir->vd_len & 0xf) || nindir == 0) {
- cmn_err(CE_NOTE, "invalid indir len 0x%x\n",
- vdir->vd_len);
- return (-1);
- }
- vindir = (struct virtio_desc *)
- viona_gpa2kva(link, vdir->vd_addr);
- if (vindir == NULL) {
- cmn_err(CE_NOTE, "invalid guest physical"
- " address 0x%"PRIx64"\n", vdir->vd_addr);
- return (-1);
- }
- next = 0;
- for (;;) {
- vp = &vindir[next];
- if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
- cmn_err(CE_NOTE, "indirect desc"
- " has INDIR flag\n");
- return (-1);
- }
- if (i > n_iov)
- return (-1);
- iov[i].iov_base =
- viona_gpa2kva(link, vp->vd_addr);
- if (iov[i].iov_base == NULL) {
- cmn_err(CE_NOTE, "invalid guest"
- " physical address 0x%"PRIx64"\n",
- vp->vd_addr);
- return (-1);
- }
- iov[i++].iov_len = vp->vd_len;
-
- if (i > VQ_MAX_DESCRIPTORS)
- goto loopy;
- if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
- break;
-
- next = vp->vd_next;
- if (next >= nindir) {
- cmn_err(CE_NOTE, "invalid next"
- " %d > %d\n", next, nindir);
- return (-1);
- }
- }
- }
- if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) {
- *cookie = head;
- hq->hq_cur_aidx++;
- return (i);
- }
- }
-
-loopy:
- cmn_err(CE_NOTE, "%d > descriptor loop count\n", i);
-
- return (-1);
-}
-
-static void
-vq_pushchain(viona_vring_hqueue_t *hq, uint32_t len, uint16_t cookie)
-{
- struct virtio_used *vu;
- int uidx;
-
- uidx = *hq->hq_used_idx;
- vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)];
- vu->vu_idx = cookie;
- vu->vu_tlen = len;
- membar_producer();
- *hq->hq_used_idx = uidx;
-}
-
-static void
-vq_pushchain_mrgrx(viona_vring_hqueue_t *hq, int num_bufs, used_elem_t *elem)
-{
- struct virtio_used *vu;
- int uidx;
- int i;
-
- uidx = *hq->hq_used_idx;
- if (num_bufs == 1) {
- vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)];
- vu->vu_idx = elem[0].id;
- vu->vu_tlen = elem[0].len;
- } else {
- for (i = 0; i < num_bufs; i++) {
- vu = &hq->hq_used_ring[(uidx + i) & (hq->hq_size - 1)];
- vu->vu_idx = elem[i].id;
- vu->vu_tlen = elem[i].len;
- }
- uidx = uidx + num_bufs;
- }
- membar_producer();
- *hq->hq_used_idx = uidx;
-}
-
-/*
- * Copy bytes from mp to iov.
- * copied_buf: Total num_bytes copied from mblk to iov array.
- * buf: pointer to iov_base.
- * i: index of iov array. Mainly used to identify if we are
- * dealing with first iov array element.
- * rxhdr_size: Virtio header size. Two possibilities in case
- * of MRGRX buf, header has 2 additional bytes.
- * In case of mrgrx, virtio header should be part of iov[0].
- * In case of non-mrgrx, virtio header may or may not be part
- * of iov[0].
- */
-static int
-copy_in_mblk(mblk_t *mp, int copied_buf, caddr_t buf, struct iovec *iov,
- int i, int rxhdr_size)
-{
- int copied_chunk = 0;
- mblk_t *ml;
- int total_buf_len = iov->iov_len;
- /*
- * iov[0] might have header, adjust
- * total_buf_len accordingly
- */
- if (i == 0) {
- total_buf_len = iov->iov_len - rxhdr_size;
- }
- for (ml = mp; ml != NULL; ml = ml->b_cont) {
- size_t chunk = MBLKL(ml);
- /*
- * If chunk is less than
- * copied_buf we should move
- * to correct msgblk
- */
- if (copied_buf != 0) {
- if (copied_buf < chunk) {
- chunk -= copied_buf;
- } else {
- copied_buf -= chunk;
- continue;
- }
- }
- /*
- * iov[0] already has virtio header.
- * and if copied chunk is length of iov_len break
- */
- if (copied_chunk == total_buf_len) {
- break;
- }
- /*
- * Sometimes chunk is total mblk len, sometimes mblk is
- * divided into multiple chunks.
- */
- if (chunk > copied_buf) {
- if (chunk > copied_chunk) {
- if ((chunk + copied_chunk) > total_buf_len)
- chunk = (size_t)total_buf_len
- - copied_chunk;
- } else {
- if (chunk > (total_buf_len - copied_chunk))
- chunk = (size_t)((total_buf_len
- - copied_chunk) - chunk);
- }
- bcopy(ml->b_rptr + copied_buf, buf, chunk);
- } else {
- if (chunk > (total_buf_len - copied_chunk)) {
- chunk = (size_t)(total_buf_len - copied_chunk);
- }
- bcopy(ml->b_rptr + copied_buf, buf, chunk);
- }
- buf += chunk;
- copied_chunk += chunk;
- }
- return (copied_chunk);
-}
-
-static void
-viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
- boolean_t loopback)
-{
- viona_link_t *link = arg;
- viona_vring_hqueue_t *hq = &link->l_rx_vring;
- mblk_t *mp0 = mp;
-
- while (viona_hq_num_avail(hq)) {
- struct iovec iov[VTNET_MAXSEGS];
- size_t mblklen;
- int n, i = 0;
- uint16_t cookie;
- struct virtio_net_hdr *vrx = NULL;
- struct virtio_net_mrgrxhdr *vmrgrx = NULL;
-#if notyet
- mblk_t *ml;
-#endif
- caddr_t buf = NULL;
- int total_len = 0;
- int copied_buf = 0;
- int num_bufs = 0;
- int num_pops = 0;
- used_elem_t uelem[VTNET_MAXSEGS];
-
- if (mp == NULL) {
- break;
- }
- mblklen = msgsize(mp);
- if (mblklen == 0) {
- break;
- }
-
- mutex_enter(&hq->hq_a_mutex);
- n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie);
- mutex_exit(&hq->hq_a_mutex);
- if (n <= 0) {
- break;
- }
- num_pops++;
- if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) {
- int total_n = n;
- int mrgrxhdr_size = sizeof (struct virtio_net_mrgrxhdr);
- /*
- * Get a pointer to the rx header, and use the
- * data immediately following it for the packet buffer.
- */
- vmrgrx = (struct virtio_net_mrgrxhdr *)iov[0].iov_base;
- if (n == 1) {
- buf = iov[0].iov_base + mrgrxhdr_size;
- }
- while (mblklen > copied_buf) {
- if (total_n == i) {
- mutex_enter(&hq->hq_a_mutex);
- n = vq_popchain(link, hq, &iov[i],
- VTNET_MAXSEGS, &cookie);
- mutex_exit(&hq->hq_a_mutex);
- if (n <= 0) {
- freemsgchain(mp0);
- return;
- }
- num_pops++;
- total_n += n;
- }
- if (total_n > i) {
- int copied_chunk = 0;
- if (i != 0) {
- buf = iov[i].iov_base;
- }
- copied_chunk = copy_in_mblk(mp,
- copied_buf, buf, &iov[i], i,
- mrgrxhdr_size);
- copied_buf += copied_chunk;
- uelem[i].id = cookie;
- uelem[i].len = copied_chunk;
- if (i == 0) {
- uelem[i].len += mrgrxhdr_size;
- }
- }
- num_bufs++;
- i++;
- }
- } else {
- boolean_t virt_hdr_incl_iov = B_FALSE;
- int rxhdr_size = sizeof (struct virtio_net_hdr);
- /* First element is header */
- vrx = (struct virtio_net_hdr *)iov[0].iov_base;
- if (n == 1 || iov[0].iov_len > rxhdr_size) {
- buf = iov[0].iov_base + rxhdr_size;
- virt_hdr_incl_iov = B_TRUE;
- total_len += rxhdr_size;
- if (iov[0].iov_len < rxhdr_size) {
- // Buff too small to fit pkt. Drop it.
- freemsgchain(mp0);
- return;
- }
- } else {
- total_len = iov[0].iov_len;
- }
- if (iov[0].iov_len == rxhdr_size)
- i++;
- while (mblklen > copied_buf) {
- if (n > i) {
- int copied_chunk = 0;
- if (i != 0) {
- buf = iov[i].iov_base;
- }
- /*
- * In case of non-mrgrx buf, first
- * descriptor always has header and
- * rest of the descriptors have data.
- * But it is not guaranteed that first
- * descriptor will only have virtio
- * header. It might also have data.
- */
- if (virt_hdr_incl_iov) {
- copied_chunk = copy_in_mblk(mp,
- copied_buf, buf, &iov[i],
- i, rxhdr_size);
- } else {
- copied_chunk = copy_in_mblk(mp,
- copied_buf, buf, &iov[i],
- i, 0);
- }
- copied_buf += copied_chunk;
- total_len += copied_chunk;
- } else {
- /*
-					 * Drop packet as it can't fit
- * in buf provided by guest.
- */
- freemsgchain(mp0);
- return;
- }
- i++;
- }
- }
- /*
- * The only valid field in the rx packet header is the
- * number of buffers, which is always 1 without TSO
- * support.
- */
- if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) {
- memset(vmrgrx, 0, sizeof (struct virtio_net_mrgrxhdr));
- vmrgrx->vrh_bufs = num_bufs;
- /*
- * Make sure iov[0].iov_len >= MIN_BUF_SIZE
- * otherwise guest will consider it as invalid frame.
- */
- if (num_bufs == 1 && uelem[0].len < MIN_BUF_SIZE) {
- uelem[0].len = MIN_BUF_SIZE;
- }
- /*
- * Release this chain and handle more chains.
- */
- mutex_enter(&hq->hq_u_mutex);
- vq_pushchain_mrgrx(hq, num_pops, uelem);
- mutex_exit(&hq->hq_u_mutex);
- } else {
- memset(vrx, 0, sizeof (struct virtio_net_hdr));
- if (total_len < MIN_BUF_SIZE) {
- total_len = MIN_BUF_SIZE;
- }
- /*
- * Release this chain and handle more chains.
- */
- mutex_enter(&hq->hq_u_mutex);
- vq_pushchain(hq, total_len, cookie);
- mutex_exit(&hq->hq_u_mutex);
- }
-
- mp = mp->b_next;
- }
-
- if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
- if (atomic_cas_uint(&link->l_rx_intr, 0, 1) == 0) {
- pollwakeup(&link->l_pollhead, POLLIN);
- }
- }
-
- freemsgchain(mp0);
-}
-
-static void
-viona_desb_free(viona_desb_t *dp)
-{
- viona_link_t *link;
- viona_vring_hqueue_t *hq;
-#if notyet
- struct virtio_used *vu;
- int uidx;
-#endif
- uint_t ref;
-
- ref = atomic_dec_uint_nv(&dp->d_ref);
- if (ref != 0)
- return;
-
- link = dp->d_link;
- hq = &link->l_tx_vring;
-
- mutex_enter(&hq->hq_u_mutex);
- vq_pushchain(hq, dp->d_len, dp->d_cookie);
- mutex_exit(&hq->hq_u_mutex);
-
- kmem_cache_free(link->l_desb_kmc, dp);
-
- if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
- if (atomic_cas_uint(&link->l_tx_intr, 0, 1) == 0) {
- pollwakeup(&link->l_pollhead, POLLOUT);
- }
- }
- if (copy_tx_mblks) {
- mutex_enter(&link->l_tx_mutex);
- if (--link->l_tx_outstanding == 0) {
- cv_broadcast(&link->l_tx_cv);
- }
- mutex_exit(&link->l_tx_mutex);
- }
-}
-
-static void
-viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq)
-{
- struct iovec iov[VTNET_MAXSEGS];
- uint16_t cookie;
- int i, n;
- mblk_t *mp_head, *mp_tail, *mp;
- viona_desb_t *dp;
- mac_client_handle_t link_mch = link->l_mch;
-
- mp_head = mp_tail = NULL;
-
- mutex_enter(&hq->hq_a_mutex);
- n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie);
- mutex_exit(&hq->hq_a_mutex);
- ASSERT(n != 0);
-
- dp = kmem_cache_alloc(link->l_desb_kmc, KM_SLEEP);
- dp->d_frtn.free_func = viona_desb_free;
- dp->d_frtn.free_arg = (void *)dp;
- dp->d_link = link;
- dp->d_cookie = cookie;
-
- dp->d_ref = 0;
- dp->d_len = iov[0].iov_len;
-
- for (i = 1; i < n; i++) {
- dp->d_ref++;
- dp->d_len += iov[i].iov_len;
- if (copy_tx_mblks) {
- mp = desballoc((uchar_t *)iov[i].iov_base,
- iov[i].iov_len, BPRI_MED, &dp->d_frtn);
- ASSERT(mp);
- } else {
- mp = allocb(iov[i].iov_len, BPRI_MED);
- ASSERT(mp);
- bcopy((uchar_t *)iov[i].iov_base, mp->b_wptr,
- iov[i].iov_len);
- }
- mp->b_wptr += iov[i].iov_len;
- if (mp_head == NULL) {
- ASSERT(mp_tail == NULL);
- mp_head = mp;
- } else {
- ASSERT(mp_tail != NULL);
- mp_tail->b_cont = mp;
- }
- mp_tail = mp;
- }
- if (copy_tx_mblks == B_FALSE) {
- viona_desb_free(dp);
- }
- if (copy_tx_mblks) {
- mutex_enter(&link->l_tx_mutex);
- link->l_tx_outstanding++;
- mutex_exit(&link->l_tx_mutex);
- }
- mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
-}
diff --git a/usr/src/uts/i86pc/io/viona/viona.mapfile b/usr/src/uts/i86pc/io/viona/viona.mapfile
new file mode 100644
index 0000000000..cece86348c
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona.mapfile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {
+ global:
+ # DDI Interfaces
+ _fini;
+ _init;
+ _info;
+
+ local:
+ *;
+};
diff --git a/usr/src/uts/i86pc/io/viona/viona_hook.c b/usr/src/uts/i86pc/io/viona/viona_hook.c
new file mode 100644
index 0000000000..4520be04b0
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_hook.c
@@ -0,0 +1,438 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+
+#include "viona_impl.h"
+
+
+/*
+ * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock
+ */
+static list_t viona_neti_list;
+static kmutex_t viona_neti_lock;
+
+/*
+ * viona_neti is allocated and initialized during attach, and read-only
+ * until detach (where it's also freed)
+ */
+static net_instance_t *viona_neti;
+
+
+/*
+ * Generate a hook event for the packet in *mpp headed in the direction
+ * indicated by 'out'. If the packet is accepted, 0 is returned. If the
+ * packet is rejected, an error is returned. The hook function may or may not
+ * alter or even free *mpp. The caller is expected to deal with either
+ * situation.
+ */
+int
+viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out)
+{
+ viona_neti_t *nip = link->l_neti;
+ viona_nethook_t *vnh = &nip->vni_nethook;
+ hook_pkt_event_t info;
+ hook_event_t he;
+ hook_event_token_t het;
+ int ret;
+
+ he = out ? vnh->vnh_event_out : vnh->vnh_event_in;
+ het = out ? vnh->vnh_token_out : vnh->vnh_token_in;
+
+ if (!he.he_interested)
+ return (0);
+
+ info.hpe_protocol = vnh->vnh_neti;
+ info.hpe_ifp = (phy_if_t)link;
+ info.hpe_ofp = (phy_if_t)link;
+ info.hpe_mp = mpp;
+ info.hpe_flags = 0;
+
+ ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info);
+ if (ret == 0)
+ return (0);
+
+ if (out) {
+ VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring,
+ mblk_t *, *mpp, int, ret);
+ VIONA_RING_STAT_INCR(ring, tx_hookdrop);
+ } else {
+ VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring,
+ mblk_t *, *mpp, int, ret);
+ VIONA_RING_STAT_INCR(ring, rx_hookdrop);
+ }
+ return (ret);
+}
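
A hedged sketch of the calling pattern this function expects; the real callers live in the RX and TX paths elsewhere in this change, and the helper name below is illustrative. The VNETHOOK_INTERESTED_OUT macro comes from viona_impl.h.

#include "viona_impl.h"

/* Illustrative TX-side caller: run the outbound hook before transmitting. */
static void
sketch_tx_one(viona_link_t *link, viona_vring_t *ring, mblk_t *mp)
{
	if (VNETHOOK_INTERESTED_OUT(link->l_neti) &&
	    viona_hook(link, ring, &mp, B_TRUE) != 0) {
		/* Rejected; the hook may already have freed or replaced mp. */
		if (mp != NULL)
			freemsgchain(mp);
		return;
	}

	/* ... hand mp to mac_tx() as usual ... */
}
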
+
+/*
+ * netinfo stubs - required by the nethook framework, but otherwise unused
+ *
+ * Currently, all ipf rules are applied against all interfaces in a given
+ * netstack (e.g. all interfaces in a zone). In the future if we want to
+ * support being able to apply different rules to different interfaces, I
+ * believe we would need to implement some of these stubs to map an interface
+ * name in a rule (e.g. 'net0', back to an index or viona_link_t);
+ */
+static int
+viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused,
+ char *buf __unused, const size_t len __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getptmue(net_handle_t neti __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused, size_t nelem __unused,
+ net_ifaddr_t type[] __unused, void *storage __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused, zoneid_t *zid __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused, uint64_t *flags __unused)
+{
+ return (-1);
+}
+
+static phy_if_t
+viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused)
+{
+ return ((phy_if_t)-1);
+}
+
+static phy_if_t
+viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused)
+{
+ return ((phy_if_t)-1);
+}
+
+static lif_if_t
+viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused,
+ lif_if_t ifdata __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_inject(net_handle_t neti __unused, inject_t style __unused,
+ net_inject_t *packet __unused)
+{
+ return (-1);
+}
+
+static phy_if_t
+viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused,
+ struct sockaddr *next __unused)
+{
+ return ((phy_if_t)-1);
+}
+
+static int
+viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused)
+{
+ return (-1);
+}
+
+static int
+viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused)
+{
+ return (-1);
+}
+
+static net_protocol_t viona_netinfo = {
+ NETINFO_VERSION,
+ NHF_VIONA,
+ viona_neti_getifname,
+ viona_neti_getmtu,
+ viona_neti_getptmue,
+ viona_neti_getlifaddr,
+ viona_neti_getlifzone,
+ viona_neti_getlifflags,
+ viona_neti_phygetnext,
+ viona_neti_phylookup,
+ viona_neti_lifgetnext,
+ viona_neti_inject,
+ viona_neti_route,
+ viona_neti_ispchksum,
+ viona_neti_isvchksum
+};
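
On the consumer side, a hook function registered against the events created below receives the hook_pkt_event_t assembled in viona_hook() above. A minimal sketch of such a callback, assuming the standard hook_func_t signature from <sys/hook.h>; how the consumer registers it (e.g. ipf's use of the nethook registration interfaces) is omitted here.

#include <sys/stream.h>
#include <sys/hook.h>
#include <sys/hook_event.h>

/* ARGSUSED */
static int
sketch_viona_pkt_in(hook_event_token_t token, hook_data_t data, void *arg)
{
	hook_pkt_event_t *hpe = (hook_pkt_event_t *)data;
	mblk_t *mp = *hpe->hpe_mp;

	if (mp == NULL)
		return (0);

	/* Inspect mp; a nonzero return causes viona_hook() to drop it. */
	return (0);
}
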
+
+/*
+ * Create/register our nethooks
+ */
+static int
+viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name,
+ net_protocol_t *netip)
+{
+ int ret;
+
+ if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) {
+ cmn_err(CE_NOTE, "%s: net_protocol_register failed "
+ "(netid=%d name=%s)", __func__, nid, nh_name);
+ goto fail_init_proto;
+ }
+
+ HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name);
+ if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) {
+ cmn_err(CE_NOTE, "%s: net_family_register failed "
+ "(netid=%d name=%s err=%d)", __func__,
+ nid, nh_name, ret);
+ goto fail_init_family;
+ }
+
+ HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN);
+ if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti,
+ &vnh->vnh_event_in)) == NULL) {
+ cmn_err(CE_NOTE, "%s: net_event_register %s failed "
+ "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid,
+ nh_name);
+ goto fail_init_event_in;
+ }
+
+ HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT);
+ if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti,
+ &vnh->vnh_event_out)) == NULL) {
+ cmn_err(CE_NOTE, "%s: net_event_register %s failed "
+ "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid,
+ nh_name);
+ goto fail_init_event_out;
+ }
+ return (0);
+
+ /*
+ * On failure, we undo all the steps that succeeded in the
+ * reverse order of initialization, starting at the last
+ * successful step (the labels denoting the failing step).
+ */
+fail_init_event_out:
+ VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in));
+ VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in));
+ vnh->vnh_token_in = NULL;
+
+fail_init_event_in:
+ VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family));
+ VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family));
+
+fail_init_family:
+ VERIFY0(net_protocol_unregister(vnh->vnh_neti));
+ vnh->vnh_neti = NULL;
+
+fail_init_proto:
+ return (1);
+}
+
+/*
+ * Shut down the nethooks for a protocol family. This triggers notification
+ * callbacks to anything that has registered interest, allowing hook consumers
+ * to unhook prior to the removal of the hooks, and makes them unavailable to
+ * any future consumers as the first step of removal.
+ */
+static void
+viona_nethook_shutdown(viona_nethook_t *vnh)
+{
+ VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out));
+ VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in));
+ VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family));
+}
+
+/*
+ * Remove the nethooks for a protocol family.
+ */
+static void
+viona_nethook_fini(viona_nethook_t *vnh)
+{
+ VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out));
+ VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in));
+ VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family));
+ VERIFY0(net_protocol_unregister(vnh->vnh_neti));
+ vnh->vnh_neti = NULL;
+}
+
+/*
+ * Callback invoked by the neti module. This creates/registers our hooks
+ * {IPv4,IPv6}{in,out} with the nethook framework so they are available to
+ * interested consumers (e.g. ipf).
+ *
+ * During attach, viona_neti_create is called once for every netstack
+ * present on the system at the time of attach. Thereafter, it is called
+ * during the creation of additional netstack instances (i.e. zone boot). As a
+ * result, the viona_neti_t created during this call always exists prior to
+ * any viona instances that will use it to send hook events.
+ *
+ * It should never return NULL. If we cannot register our hooks, we do not
+ * set vnh_hooked of the respective protocol family, which will prevent the
+ * creation of any viona instances on this netstack (see viona_ioc_create).
+ * This can only occur if we are trying to create a new instance after a
+ * shutdown event (which means destruction is imminent).
+ */
+static void *
+viona_neti_create(const netid_t netid)
+{
+ viona_neti_t *nip;
+
+ VERIFY(netid != -1);
+
+ nip = kmem_zalloc(sizeof (*nip), KM_SLEEP);
+ nip->vni_netid = netid;
+ nip->vni_zid = net_getzoneidbynetid(netid);
+ mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t),
+ offsetof(viona_soft_state_t, ss_node));
+
+ if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA,
+ &viona_netinfo) == 0)
+ nip->vni_nethook.vnh_hooked = B_TRUE;
+
+ mutex_enter(&viona_neti_lock);
+ list_insert_tail(&viona_neti_list, nip);
+ mutex_exit(&viona_neti_lock);
+
+ return (nip);
+}
+
+/*
+ * Called during netstack teardown by the neti module. During teardown, all
+ * the shutdown callbacks are invoked, allowing consumers to release any holds
+ * and otherwise quiesce themselves prior to destruction, followed by the
+ * actual destruction callbacks.
+ */
+static void
+viona_neti_shutdown(netid_t nid, void *arg)
+{
+ viona_neti_t *nip = arg;
+
+ ASSERT(nip != NULL);
+ VERIFY(nid == nip->vni_netid);
+
+ mutex_enter(&viona_neti_lock);
+ list_remove(&viona_neti_list, nip);
+ mutex_exit(&viona_neti_lock);
+
+ if (nip->vni_nethook.vnh_hooked)
+ viona_nethook_shutdown(&nip->vni_nethook);
+}
+
+/*
+ * Called during netstack teardown by the neti module. Destroys the viona
+ * netinst data. This is invoked after all the netstack and neti shutdown
+ * callbacks have been invoked.
+ */
+static void
+viona_neti_destroy(netid_t nid, void *arg)
+{
+ viona_neti_t *nip = arg;
+
+ ASSERT(nip != NULL);
+ VERIFY(nid == nip->vni_netid);
+
+ mutex_enter(&nip->vni_lock);
+ while (nip->vni_ref != 0)
+ cv_wait(&nip->vni_ref_change, &nip->vni_lock);
+ mutex_exit(&nip->vni_lock);
+
+ VERIFY(!list_link_active(&nip->vni_node));
+
+ if (nip->vni_nethook.vnh_hooked)
+ viona_nethook_fini(&nip->vni_nethook);
+
+ mutex_destroy(&nip->vni_lock);
+ list_destroy(&nip->vni_dev_list);
+ kmem_free(nip, sizeof (*nip));
+}
+
+/*
+ * Find the viona netinst data by zone id. This is only used during
+ * viona instance creation (and thus is only called by a zone that is running).
+ */
+viona_neti_t *
+viona_neti_lookup_by_zid(zoneid_t zid)
+{
+ viona_neti_t *nip;
+
+ mutex_enter(&viona_neti_lock);
+ for (nip = list_head(&viona_neti_list); nip != NULL;
+ nip = list_next(&viona_neti_list, nip)) {
+ if (nip->vni_zid == zid) {
+ mutex_enter(&nip->vni_lock);
+ nip->vni_ref++;
+ mutex_exit(&nip->vni_lock);
+ mutex_exit(&viona_neti_lock);
+ return (nip);
+ }
+ }
+ mutex_exit(&viona_neti_lock);
+ return (NULL);
+}
+
+void
+viona_neti_rele(viona_neti_t *nip)
+{
+ mutex_enter(&nip->vni_lock);
+ VERIFY3S(nip->vni_ref, >, 0);
+ nip->vni_ref--;
+ mutex_exit(&nip->vni_lock);
+ cv_broadcast(&nip->vni_ref_change);
+}
+
+void
+viona_neti_attach(void)
+{
+ mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&viona_neti_list, sizeof (viona_neti_t),
+ offsetof(viona_neti_t, vni_node));
+
+ /* This can only fail if NETINFO_VERSION is wrong */
+ viona_neti = net_instance_alloc(NETINFO_VERSION);
+ VERIFY(viona_neti != NULL);
+
+ viona_neti->nin_name = "viona";
+ viona_neti->nin_create = viona_neti_create;
+ viona_neti->nin_shutdown = viona_neti_shutdown;
+ viona_neti->nin_destroy = viona_neti_destroy;
+ /* This can only fail if we've registered ourselves multiple times */
+ VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS);
+}
+
+void
+viona_neti_detach(void)
+{
+ /* This can only fail if we've not registered previously */
+ VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS);
+ net_instance_free(viona_neti);
+ viona_neti = NULL;
+
+ list_destroy(&viona_neti_list);
+ mutex_destroy(&viona_neti_lock);
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona_impl.h b/usr/src/uts/i86pc/io/viona/viona_impl.h
new file mode 100644
index 0000000000..5471b611a4
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_impl.h
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VIONA_IMPL_H
+#define _VIONA_IMPL_H
+
+#include <sys/ddi.h>
+#include <sys/list.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/uio.h>
+
+#include <sys/mac_client.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_priv.h>
+#include <sys/neti.h>
+#include <inet/ip.h>
+#include <inet/tcp.h>
+
+#include <sys/vmm_drv.h>
+#include <sys/viona_io.h>
+
+struct viona_link;
+typedef struct viona_link viona_link_t;
+struct viona_desb;
+typedef struct viona_desb viona_desb_t;
+struct viona_neti;
+typedef struct viona_neti viona_neti_t;
+
+enum viona_ring_state {
+ VRS_RESET = 0x0, /* just allocated or reset */
+ VRS_SETUP = 0x1, /* addrs setup and starting worker thread */
+ VRS_INIT = 0x2, /* worker thread started & waiting to run */
+ VRS_RUN = 0x3, /* running work routine */
+ VRS_STOP = 0x4, /* worker is exiting */
+};
+enum viona_ring_state_flags {
+ VRSF_REQ_START = 0x1, /* start running from INIT state */
+ VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */
+ VRSF_RENEW = 0x4, /* ring renewing lease */
+};
+
+typedef struct viona_vring {
+ viona_link_t *vr_link;
+
+ kmutex_t vr_lock;
+ kcondvar_t vr_cv;
+ uint16_t vr_state;
+ uint16_t vr_state_flags;
+ uint_t vr_xfer_outstanding;
+ kthread_t *vr_worker_thread;
+ vmm_lease_t *vr_lease;
+
+ /* ring-sized resources for TX activity */
+ viona_desb_t *vr_txdesb;
+ struct iovec *vr_txiov;
+
+ uint_t vr_intr_enabled;
+ uint64_t vr_msi_addr;
+ uint64_t vr_msi_msg;
+
+ /* Internal ring-related state */
+ kmutex_t vr_a_mutex; /* sync consumers of 'avail' */
+ kmutex_t vr_u_mutex; /* sync consumers of 'used' */
+ uint64_t vr_pa;
+ uint16_t vr_size;
+ uint16_t vr_mask; /* cached from vr_size */
+ uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ volatile struct virtio_desc *vr_descr;
+
+ volatile uint16_t *vr_avail_flags;
+ volatile uint16_t *vr_avail_idx;
+ volatile uint16_t *vr_avail_ring;
+ volatile uint16_t *vr_avail_used_event;
+
+ volatile uint16_t *vr_used_flags;
+ volatile uint16_t *vr_used_idx;
+ volatile struct virtio_used *vr_used_ring;
+ volatile uint16_t *vr_used_avail_event;
+
+ /* Per-ring error condition statistics */
+ struct viona_ring_stats {
+ uint64_t rs_ndesc_too_high;
+ uint64_t rs_bad_idx;
+ uint64_t rs_indir_bad_len;
+ uint64_t rs_indir_bad_nest;
+ uint64_t rs_indir_bad_next;
+ uint64_t rs_no_space;
+ uint64_t rs_too_many_desc;
+ uint64_t rs_desc_bad_len;
+
+ uint64_t rs_bad_ring_addr;
+
+ uint64_t rs_fail_hcksum;
+ uint64_t rs_fail_hcksum6;
+ uint64_t rs_fail_hcksum_proto;
+
+ uint64_t rs_bad_rx_frame;
+ uint64_t rs_rx_merge_overrun;
+ uint64_t rs_rx_merge_underrun;
+ uint64_t rs_rx_pad_short;
+ uint64_t rs_rx_mcast_check;
+ uint64_t rs_too_short;
+ uint64_t rs_tx_absent;
+
+ uint64_t rs_rx_hookdrop;
+ uint64_t rs_tx_hookdrop;
+ } vr_stats;
+} viona_vring_t;
+
+struct viona_link {
+ vmm_hold_t *l_vm_hold;
+ boolean_t l_destroyed;
+
+ viona_vring_t l_vrings[VIONA_VQ_MAX];
+
+ uint32_t l_features;
+ uint32_t l_features_hw;
+ uint32_t l_cap_csum;
+
+ uintptr_t l_notify_ioport;
+ void *l_notify_cookie;
+
+ datalink_id_t l_linkid;
+ mac_handle_t l_mh;
+ mac_client_handle_t l_mch;
+ mac_promisc_handle_t l_mph;
+
+ pollhead_t l_pollhead;
+
+ viona_neti_t *l_neti;
+};
+
+typedef struct viona_nethook {
+ net_handle_t vnh_neti;
+ hook_family_t vnh_family;
+ hook_event_t vnh_event_in;
+ hook_event_t vnh_event_out;
+ hook_event_token_t vnh_token_in;
+ hook_event_token_t vnh_token_out;
+ boolean_t vnh_hooked;
+} viona_nethook_t;
+
+struct viona_neti {
+ list_node_t vni_node;
+
+ netid_t vni_netid;
+ zoneid_t vni_zid;
+
+ viona_nethook_t vni_nethook;
+
+ kmutex_t vni_lock; /* Protects remaining members */
+ kcondvar_t vni_ref_change; /* Protected by vni_lock */
+ uint_t vni_ref; /* Protected by vni_lock */
+ list_t vni_dev_list; /* Protected by vni_lock */
+};
+
+typedef struct used_elem {
+ uint16_t id;
+ uint32_t len;
+} used_elem_t;
+
+typedef struct viona_soft_state {
+ kmutex_t ss_lock;
+ viona_link_t *ss_link;
+ list_node_t ss_node;
+} viona_soft_state_t;
+
+#pragma pack(1)
+struct virtio_desc {
+ uint64_t vd_addr;
+ uint32_t vd_len;
+ uint16_t vd_flags;
+ uint16_t vd_next;
+};
+
+struct virtio_used {
+ uint32_t vu_idx;
+ uint32_t vu_tlen;
+};
+
+struct virtio_net_mrgrxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+};
+
+struct virtio_net_hdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+};
+#pragma pack()
+
+#define VRING_NEED_BAIL(ring, proc) \
+ (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \
+ ((proc)->p_flag & SEXITING) != 0)
+
+
+#define VNETHOOK_INTERESTED_IN(neti) \
+ (neti)->vni_nethook.vnh_event_in.he_interested
+#define VNETHOOK_INTERESTED_OUT(neti) \
+ (neti)->vni_nethook.vnh_event_out.he_interested
+
+
+#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name)
+#define VIONA_PROBE1(name, arg1, arg2) \
+ DTRACE_PROBE1(viona__##name, arg1, arg2)
+#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \
+ DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4)
+#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \
+ DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6)
+#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \
+ arg9, arg10) \
+ DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
+ arg8, arg9, arg10)
+#define VIONA_PROBE_BAD_RING_ADDR(r, a) \
+ VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a))
+
+#define VIONA_RING_STAT_INCR(r, name) \
+ (((r)->vr_stats.rs_ ## name)++)
+
+
+#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \
+ IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH)
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+#define VRING_USED_F_NO_NOTIFY 1
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0)
+#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1)
+
+#define VIRTIO_NET_HDR_GSO_NONE 0
+#define VIRTIO_NET_HDR_GSO_TCPV4 1
+
+#define VIRTIO_NET_F_CSUM (1 << 0)
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1)
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX bufs */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* cfg status field present */
+#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24)
+#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28)
+#define VIRTIO_F_RING_EVENT_IDX (1 << 29)
+
+
+void viona_ring_alloc(viona_link_t *, viona_vring_t *);
+void viona_ring_free(viona_vring_t *);
+int viona_ring_reset(viona_vring_t *, boolean_t);
+int viona_ring_init(viona_link_t *, uint16_t, uint16_t, uint64_t);
+boolean_t viona_ring_lease_renew(viona_vring_t *);
+int vq_popchain(viona_vring_t *, struct iovec *, uint_t, uint16_t *);
+void vq_pushchain(viona_vring_t *, uint32_t, uint16_t);
+void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *);
+void viona_intr_ring(viona_vring_t *ring);
+
+void viona_rx_init(void);
+void viona_rx_fini(void);
+int viona_rx_set(viona_link_t *);
+void viona_rx_clear(viona_link_t *);
+void viona_worker_rx(viona_vring_t *, viona_link_t *);
+
+extern kmutex_t viona_force_copy_lock;
+void viona_worker_tx(viona_vring_t *, viona_link_t *);
+void viona_tx_ring_alloc(viona_vring_t *, const uint16_t);
+void viona_tx_ring_free(viona_vring_t *, const uint16_t);
+
+void viona_neti_attach(void);
+void viona_neti_detach(void);
+viona_neti_t *viona_neti_lookup_by_zid(zoneid_t);
+void viona_neti_rele(viona_neti_t *);
+int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t);
+
+#endif /* _VIONA_IMPL_H */
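
A brief consumer-side sketch of the contract implied by the ring helpers declared above (the real implementations land in viona_ring.c as part of this change; the helper name is illustrative and the descriptor handling is elided):

#include "viona_impl.h"

/* Pop one available chain, consume it, then return it to the used ring. */
static void
sketch_consume_one(viona_vring_t *ring)
{
	struct iovec iov[8];
	uint16_t cookie;
	uint32_t len = 0;
	int n;

	n = vq_popchain(ring, iov, 8, &cookie);
	if (n <= 0)
		return;

	/* ... read from or fill iov[0 .. n-1], accumulating len ... */

	vq_pushchain(ring, len, cookie);
	viona_intr_ring(ring);	/* notify the guest if it wants interrupts */
}
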
diff --git a/usr/src/uts/i86pc/io/viona/viona_main.c b/usr/src/uts/i86pc/io/viona/viona_main.c
new file mode 100644
index 0000000000..f51a1f9b12
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_main.c
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * viona - VirtIO-Net, Accelerated
+ *
+ * The purpose of viona is to provide high performance virtio-net devices to
+ * bhyve guests. It does so by sitting directly atop MAC, skipping all of the
+ * DLS/DLD stack.
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * A single viona instance is comprised of a "link" handle and two "rings".
+ * After opening the viona device, it must be associated with a MAC network
+ * interface and a bhyve (vmm) instance to form its link resource. This is
+ * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
+ * passed in to perform the initialization. With the MAC client opened, and a
+ * driver handle to the vmm instance established, the device is ready to be
+ * configured by the guest.
+ *
+ * The userspace portion of bhyve, which interfaces with the PCI device
+ * emulation framework, is meant to stay out of the datapath if at all
+ * possible. Configuration changes made via PCI are mapped to actions which
+ * will steer the operation of the in-kernel logic.
+ *
+ *
+ * -----------
+ * Ring Basics
+ * -----------
+ *
+ * Each viona link has two viona_vring_t entities, RX and TX, for handling data
+ * transfers to and from the guest. They represent an interface to the
+ * standard virtio ring structures. When initialized and active, each ring is
+ * backed by a kernel worker thread (parented to the bhyve process for the
+ * instance) which handles ring events. The RX worker has the simple task of
+ * watching for ring shutdown conditions. The TX worker does that in addition
+ * to processing all requests to transmit data. Data destined for the guest is
+ * delivered directly by MAC to viona_rx() when the ring is active.
+ *
+ *
+ * -----------
+ * Ring States
+ * -----------
+ *
+ * The viona_vring_t instances follow a simple path through the possible state
+ * values represented in viona_vring_t`vr_state:
+ *
+ * +<--------------------------------------------+
+ * | |
+ * V ^
+ * +-----------+ This is the initial state when a link is created or
+ * | VRS_RESET | when the ring has been explicitly reset.
+ * +-----------+
+ * | ^
+ * |---* ioctl(VNA_IOC_RING_INIT) issued |
+ * | |
+ * | ^
+ * V
+ * +-----------+ The ring parameters (size, guest physical addresses)
+ * | VRS_SETUP | have been set and start-up of the ring worker thread
+ * +-----------+ has begun.
+ * | ^
+ * | |
+ * |---* ring worker thread begins execution |
+ * | |
+ * +-------------------------------------------->+
+ * | | ^
+ * | |
+ * | * If ring shutdown is requested (by ioctl or impending
+ * | bhyve process death) while the worker thread is
+ * | starting, the worker will transition the ring to
+ * | VRS_RESET and exit.
+ * | ^
+ * | |
+ * | ^
+ * V
+ * +-----------+ The worker thread associated with the ring has started
+ * | VRS_INIT | executing. It has allocated any extra resources needed
+ * +-----------+ for the ring to operate.
+ * | ^
+ * | |
+ * +-------------------------------------------->+
+ * | | ^
+ * | |
+ * | * If ring shutdown is requested while the worker is
+ * | waiting in VRS_INIT, it will free any extra resources
+ * | and transition to VRS_RESET.
+ * | ^
+ * | |
+ * |--* ioctl(VNA_IOC_RING_KICK) issued |
+ * | ^
+ * V
+ * +-----------+ The worker thread associated with the ring is executing
+ * | VRS_RUN | workload specific to that ring.
+ * +-----------+
+ * | ^
+ * |---* ioctl(VNA_IOC_RING_RESET) issued |
+ * | (or bhyve process begins exit) ^
+ * |
+ * +-----------+ The worker thread associated with the ring is in the
+ * | VRS_STOP | process of exiting. All outstanding TX and RX
+ * +-----------+ requests are allowed to complete, but new requests
+ * | must be ignored.
+ * | ^
+ * | |
+ * +-------------------------------------------->+
+ *
+ *
+ * While the worker thread is not running, changes to vr_state are only made by
+ * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
+ * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
+ * has been started, only it may perform ring state transitions (still under
+ * the protection of vr_lock), when requested by outside consumers via
+ * vr_state_flags or when the containing bhyve process initiates an exit.
+ *
+ *
+ * ----------------------------
+ * Transmission mblk_t Handling
+ * ----------------------------
+ *
+ * For incoming frames destined for a bhyve guest, the data must first land in
+ * a host OS buffer from the physical NIC before it is copied into the awaiting
+ * guest buffer(s). Outbound frames transmitted by the guest are not bound by
+ * this limitation and can avoid extra copying before the buffers are accessed
+ * directly by the NIC. When a guest designates buffers to be transmitted,
+ * viona translates the guest-physical addresses contained in the ring
+ * descriptors to host-virtual addresses via vmm_drv_gpa2kva(). That pointer is
+ * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
+ * Doing so increments vr_xfer_outstanding, preventing the ring from being
+ * reset (allowing the link to drop its vmm handle to the guest) until all
+ * transmit mblks referencing guest memory have been processed. Allocation of
+ * the viona_desb_t entries is done during the VRS_INIT stage of the ring
+ * worker thread. The ring size informs that allocation as the number of
+ * concurrent transmissions is limited by the number of descriptors in the
+ * ring. This minimizes allocation in the transmit hot-path by acquiring those
+ * fixed-size resources during initialization.
+ *
+ * This optimization depends on the underlying NIC driver freeing the mblks in
+ * a timely manner after they have been transmitted by the hardware. Some
+ * drivers have been found to flush TX descriptors only when new transmissions
+ * are initiated. This means that there is no upper bound on the time needed
+ * for an mblk to be flushed, which can stall bhyve guests from shutting down,
+ * since their memory must be free of viona TX references prior to clean-up.
+ *
+ * This expectation of deterministic mblk_t processing is likely the reason
+ * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
+ * loaded will copy transmit data into fresh buffers rather than passing up
+ * zero-copy mblks. It is a hold-over from the original viona sources provided
+ * by Pluribus and its continued necessity has not been confirmed.
+ *
+ *
+ * ----------------------------
+ * Ring Notification Fast-paths
+ * ----------------------------
+ *
+ * Device operation for viona requires that notifications flow to and from the
+ * guest to indicate certain ring conditions. In order to minimize latency and
+ * processing overhead, the notification procedures are kept in-kernel whenever
+ * possible.
+ *
+ * Guest-to-host notifications, when new available descriptors have been placed
+ * in the ring, are posted via the 'queue notify' address in the virtio BAR.
+ * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
+ * install a callback hook on an ioport address. Guest exits for accesses to
+ * viona-hooked ioport addresses will result in direct calls to notify the
+ * appropriate ring worker without a trip to userland.
+ *
+ * Host-to-guest notifications in the form of interrupts enjoy similar
+ * acceleration. Each viona ring can be configured to send MSI notifications
+ * to the guest as virtio conditions dictate. This in-kernel interrupt
+ * configuration is kept synchronized through viona ioctls which are utilized
+ * during writes to the associated PCI config registers or MSI-X BAR.
+ *
+ * For guests which do not utilize MSI-X, viona falls back to the slow path for
+ * interrupts: userspace will poll(2) the viona handle, receiving notification
+ * when ring events necessitate the assertion of an interrupt.
+ *
+ *
+ * ---------------
+ * Nethook Support
+ * ---------------
+ *
+ * Viona provides four nethook events that consumers (e.g. ipf) can hook into
+ * to intercept packets as they go up or down the stack. Unfortunately,
+ * the nethook framework does not understand raw packets, so we can only
+ * generate events (in, out) for IPv4 and IPv6 packets. At driver attach,
+ * we register callbacks with the neti (netinfo) module that will be invoked
+ * for each netstack already present, as well as for any additional netstack
+ * instances created as the system operates. These callbacks will
+ * register/unregister the hooks with the nethook framework for each
+ * netstack instance. This registration occurs prior to creating any
+ * viona instances for a given netstack, and the unregistration for a netstack
+ * instance occurs after all viona instances of the netstack instance have
+ * been deleted.
+ */
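
To make the zero-copy transmit description above concrete, here is a hedged sketch of the desballoc(9F) pattern it refers to. The simplified structure and helper names are illustrative (the real logic and the actual viona_desb_t live in viona_tx.c), and the locking around vr_xfer_outstanding is an assumption of the sketch.

#include "viona_impl.h"

/* Simplified stand-in for viona_desb_t, showing the free-routine linkage. */
typedef struct sketch_desb {
	frtn_t		sd_frtn;
	viona_vring_t	*sd_ring;
} sketch_desb_t;

/* Invoked by the NIC stack once it is finished with the loaned buffer. */
static void
sketch_tx_done(sketch_desb_t *dp)
{
	viona_vring_t *ring = dp->sd_ring;

	/* The real code pushes the completed chain onto the used ring here. */
	mutex_enter(&ring->vr_lock);
	ring->vr_xfer_outstanding--;
	mutex_exit(&ring->vr_lock);
}

/* Wrap a guest TX buffer (already translated to a KVA) in a loaned mblk_t. */
static mblk_t *
sketch_wrap_guest_buf(viona_vring_t *ring, sketch_desb_t *dp, uchar_t *kva,
    size_t len)
{
	mblk_t *mp;

	dp->sd_ring = ring;
	dp->sd_frtn.free_func = sketch_tx_done;
	dp->sd_frtn.free_arg = (void *)dp;

	if ((mp = desballoc(kva, len, BPRI_MED, &dp->sd_frtn)) == NULL)
		return (NULL);

	mp->b_wptr += len;

	mutex_enter(&ring->vr_lock);
	ring->vr_xfer_outstanding++;	/* holds off ring reset until freed */
	mutex_exit(&ring->vr_lock);

	return (mp);
}
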
+
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+
+#include <sys/dlpi.h>
+
+#include "viona_impl.h"
+
+
+#define VIONA_NAME "Virtio Network Accelerator"
+#define VIONA_CTL_MINOR 0
+#define VIONA_CLI_NAME "viona" /* MAC client name */
+
+
+/*
+ * Host capabilities.
+ */
+#define VIONA_S_HOSTCAPS ( \
+ VIRTIO_NET_F_GUEST_CSUM | \
+ VIRTIO_NET_F_MAC | \
+ VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_MRG_RXBUF | \
+ VIRTIO_NET_F_STATUS | \
+ VIRTIO_F_RING_NOTIFY_ON_EMPTY | \
+ VIRTIO_F_RING_INDIRECT_DESC)
+
+/* MAC_CAPAB_HCKSUM specifics of interest */
+#define VIONA_CAP_HCKSUM_INTEREST \
+ (HCKSUM_INET_PARTIAL | \
+ HCKSUM_INET_FULL_V4 | \
+ HCKSUM_INET_FULL_V6)
+
+static void *viona_state;
+static dev_info_t *viona_dip;
+static id_space_t *viona_minors;
+
+
+static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
+ void **result);
+static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
+static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
+static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
+ cred_t *credp, int *rval);
+static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp);
+
+static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
+static int viona_ioc_delete(viona_soft_state_t *, boolean_t);
+
+static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t);
+static int viona_ioc_ring_init(viona_link_t *, void *, int);
+static int viona_ioc_ring_reset(viona_link_t *, uint_t);
+static int viona_ioc_ring_kick(viona_link_t *, uint_t);
+static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
+static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
+static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
+
+static struct cb_ops viona_cb_ops = {
+ viona_open,
+ viona_close,
+ nodev,
+ nodev,
+ nodev,
+ nodev,
+ nodev,
+ viona_ioctl,
+ nodev,
+ nodev,
+ nodev,
+ viona_chpoll,
+ ddi_prop_op,
+ 0,
+ D_MP | D_NEW | D_HOTPLUG,
+ CB_REV,
+ nodev,
+ nodev
+};
+
+static struct dev_ops viona_ops = {
+ DEVO_REV,
+ 0,
+ viona_info,
+ nulldev,
+ nulldev,
+ viona_attach,
+ viona_detach,
+ nodev,
+ &viona_cb_ops,
+ NULL,
+ ddi_power,
+ ddi_quiesce_not_needed
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops,
+ VIONA_NAME,
+ &viona_ops,
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modldrv, NULL
+};
+
+int
+_init(void)
+{
+ int ret;
+
+ ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
+ if (ret != 0) {
+ return (ret);
+ }
+
+ viona_minors = id_space_create("viona_minors",
+ VIONA_CTL_MINOR + 1, UINT16_MAX);
+ viona_rx_init();
+ mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);
+
+ ret = mod_install(&modlinkage);
+ if (ret != 0) {
+ ddi_soft_state_fini(&viona_state);
+ id_space_destroy(viona_minors);
+ viona_rx_fini();
+ mutex_destroy(&viona_force_copy_lock);
+ }
+
+ return (ret);
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ ret = mod_remove(&modlinkage);
+ if (ret != 0) {
+ return (ret);
+ }
+
+ ddi_soft_state_fini(&viona_state);
+ id_space_destroy(viona_minors);
+ viona_rx_fini();
+ mutex_destroy(&viona_force_copy_lock);
+
+ return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/* ARGSUSED */
+static int
+viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
+{
+ int error;
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = (void *)viona_dip;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ break;
+ }
+ return (error);
+}
+
+static int
+viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ if (cmd != DDI_ATTACH) {
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
+ DDI_PSEUDO, 0) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ viona_neti_attach();
+
+ viona_dip = dip;
+ ddi_report_dev(viona_dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ dev_info_t *old_dip = viona_dip;
+
+ if (cmd != DDI_DETACH) {
+ return (DDI_FAILURE);
+ }
+
+ VERIFY(old_dip != NULL);
+
+ viona_neti_detach();
+ viona_dip = NULL;
+ ddi_remove_minor_node(old_dip, NULL);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
+{
+ int minor;
+ viona_soft_state_t *ss;
+
+ if (otype != OTYP_CHR) {
+ return (EINVAL);
+ }
+#if 0
+ /*
+ * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
+ * Should the check be at open() or ioctl()?
+ */
+ if (drv_priv(credp) != 0) {
+ return (EPERM);
+ }
+#endif
+ if (getminor(*devp) != VIONA_CTL_MINOR) {
+ return (ENXIO);
+ }
+
+ minor = id_alloc_nosleep(viona_minors);
+ if (minor == -1) {
+ /* All minors are busy */
+ return (EBUSY);
+ }
+ if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
+ id_free(viona_minors, minor);
+ return (ENOMEM);
+ }
+
+ ss = ddi_get_soft_state(viona_state, minor);
+ mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
+ *devp = makedevice(getmajor(*devp), minor);
+
+ return (0);
+}
+
+static int
+viona_close(dev_t dev, int flag, int otype, cred_t *credp)
+{
+ int minor;
+ viona_soft_state_t *ss;
+
+ if (otype != OTYP_CHR) {
+ return (EINVAL);
+ }
+
+ minor = getminor(dev);
+
+ ss = ddi_get_soft_state(viona_state, minor);
+ if (ss == NULL) {
+ return (ENXIO);
+ }
+
+ VERIFY0(viona_ioc_delete(ss, B_TRUE));
+ VERIFY(!list_link_active(&ss->ss_node));
+ ddi_soft_state_free(viona_state, minor);
+ id_free(viona_minors, minor);
+
+ return (0);
+}
+
+static int
+viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
+{
+ viona_soft_state_t *ss;
+ void *dptr = (void *)data;
+ int err = 0, val;
+ viona_link_t *link;
+
+ ss = ddi_get_soft_state(viona_state, getminor(dev));
+ if (ss == NULL) {
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ case VNA_IOC_CREATE:
+ return (viona_ioc_create(ss, dptr, md, cr));
+ case VNA_IOC_DELETE:
+ return (viona_ioc_delete(ss, B_FALSE));
+ default:
+ break;
+ }
+
+ mutex_enter(&ss->ss_lock);
+ if ((link = ss->ss_link) == NULL || link->l_destroyed ||
+ vmm_drv_release_reqd(link->l_vm_hold)) {
+ mutex_exit(&ss->ss_lock);
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ case VNA_IOC_GET_FEATURES:
+ val = VIONA_S_HOSTCAPS | link->l_features_hw;
+ if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
+ err = EFAULT;
+ }
+ break;
+ case VNA_IOC_SET_FEATURES:
+ if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
+ err = EFAULT;
+ break;
+ }
+ val &= (VIONA_S_HOSTCAPS | link->l_features_hw);
+
+ if ((val & VIRTIO_NET_F_CSUM) == 0)
+ val &= ~VIRTIO_NET_F_HOST_TSO4;
+
+ if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
+ val &= ~VIRTIO_NET_F_GUEST_TSO4;
+
+ link->l_features = val;
+ break;
+ case VNA_IOC_RING_INIT:
+ err = viona_ioc_ring_init(link, dptr, md);
+ break;
+ case VNA_IOC_RING_RESET:
+ err = viona_ioc_ring_reset(link, (uint_t)data);
+ break;
+ case VNA_IOC_RING_KICK:
+ err = viona_ioc_ring_kick(link, (uint_t)data);
+ break;
+ case VNA_IOC_RING_SET_MSI:
+ err = viona_ioc_ring_set_msi(link, dptr, md);
+ break;
+ case VNA_IOC_RING_INTR_CLR:
+ err = viona_ioc_ring_intr_clear(link, (uint_t)data);
+ break;
+ case VNA_IOC_INTR_POLL:
+ err = viona_ioc_intr_poll(link, dptr, md, rv);
+ break;
+ case VNA_IOC_SET_NOTIFY_IOP:
+ err = viona_ioc_set_notify_ioport(link, (uint_t)data);
+ break;
+ default:
+ err = ENOTTY;
+ break;
+ }
+
+ mutex_exit(&ss->ss_lock);
+ return (err);
+}
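As a concrete illustration of the masking above: if the guest accepts VIRTIO_NET_F_HOST_TSO4 while declining VIRTIO_NET_F_CSUM, the TSO4 bit is silently stripped, since host-side TCP segmentation offload is only offered together with the checksum-offload contract; the same dependency is applied to VIRTIO_NET_F_GUEST_TSO4 with respect to VIRTIO_NET_F_GUEST_CSUM.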
+
+static int
+viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ viona_soft_state_t *ss;
+ viona_link_t *link;
+
+ ss = ddi_get_soft_state(viona_state, getminor(dev));
+ if (ss == NULL) {
+ return (ENXIO);
+ }
+
+ mutex_enter(&ss->ss_lock);
+ if ((link = ss->ss_link) == NULL || link->l_destroyed) {
+ mutex_exit(&ss->ss_lock);
+ return (ENXIO);
+ }
+
+ *reventsp = 0;
+ if ((events & POLLRDBAND) != 0) {
+ for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
+ if (link->l_vrings[i].vr_intr_enabled != 0) {
+ *reventsp |= POLLRDBAND;
+ break;
+ }
+ }
+ }
+ if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
+ *phpp = &link->l_pollhead;
+ }
+ mutex_exit(&ss->ss_lock);
+
+ return (0);
+}
+
+static void
+viona_get_mac_capab(viona_link_t *link)
+{
+ mac_handle_t mh = link->l_mh;
+ uint32_t cap = 0;
+ mac_capab_lso_t lso_cap;
+
+ link->l_features_hw = 0;
+ if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
+ /*
+ * Only report HW checksum ability if the underlying MAC
+ * resource is capable of populating the L4 header.
+ */
+ if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
+ link->l_features_hw |= VIRTIO_NET_F_CSUM;
+ }
+ link->l_cap_csum = cap;
+ }
+
+ if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
+ mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
+ /*
+ * Virtio doesn't allow for negotiating a maximum LSO
+ * packet size. We have to assume that the guest may
+ * send a maximum length IP packet. Make sure the
+ * underlying MAC can handle an LSO of this size.
+ */
+ if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
+ lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
+ link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
+ }
+}
+
+static int
+viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
+{
+ vioc_create_t kvc;
+ viona_link_t *link = NULL;
+ char cli_name[MAXNAMELEN];
+ int err = 0;
+ file_t *fp;
+ vmm_hold_t *hold = NULL;
+ viona_neti_t *nip = NULL;
+ zoneid_t zid;
+
+ ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));
+
+ if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
+ return (EFAULT);
+ }
+
+ zid = crgetzoneid(cr);
+ nip = viona_neti_lookup_by_zid(zid);
+ if (nip == NULL) {
+ return (EIO);
+ }
+
+ if (!nip->vni_nethook.vnh_hooked) {
+ viona_neti_rele(nip);
+ return (EIO);
+ }
+
+ mutex_enter(&ss->ss_lock);
+ if (ss->ss_link != NULL) {
+ mutex_exit(&ss->ss_lock);
+ viona_neti_rele(nip);
+ return (EEXIST);
+ }
+
+ if ((fp = getf(kvc.c_vmfd)) == NULL) {
+ err = EBADF;
+ goto bail;
+ }
+ err = vmm_drv_hold(fp, cr, &hold);
+ releasef(kvc.c_vmfd);
+ if (err != 0) {
+ goto bail;
+ }
+
+ link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
+ link->l_linkid = kvc.c_linkid;
+ link->l_vm_hold = hold;
+
+ err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
+ if (err != 0) {
+ goto bail;
+ }
+
+ viona_get_mac_capab(link);
+
+ (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
+ link->l_linkid);
+ err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
+ if (err != 0) {
+ goto bail;
+ }
+
+ viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
+ viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
+
+ if ((err = viona_rx_set(link)) != 0) {
+ viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
+ viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
+ goto bail;
+ }
+
+ link->l_neti = nip;
+ ss->ss_link = link;
+ mutex_exit(&ss->ss_lock);
+
+ mutex_enter(&nip->vni_lock);
+ list_insert_tail(&nip->vni_dev_list, ss);
+ mutex_exit(&nip->vni_lock);
+
+ return (0);
+
+bail:
+ if (link != NULL) {
+ if (link->l_mch != NULL) {
+ mac_client_close(link->l_mch, 0);
+ }
+ if (link->l_mh != NULL) {
+ mac_close(link->l_mh);
+ }
+ kmem_free(link, sizeof (viona_link_t));
+ }
+ if (hold != NULL) {
+ vmm_drv_rele(hold);
+ }
+ viona_neti_rele(nip);
+
+ mutex_exit(&ss->ss_lock);
+ return (err);
+}
+
+static int
+viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
+{
+ viona_link_t *link;
+ viona_neti_t *nip = NULL;
+
+ mutex_enter(&ss->ss_lock);
+ if ((link = ss->ss_link) == NULL) {
+ /* Link destruction already complete */
+ mutex_exit(&ss->ss_lock);
+ return (0);
+ }
+
+ if (link->l_destroyed) {
+ /*
+ * Link destruction has been started by another thread, but has
+ * not completed. This condition should be impossible to
+ * encounter when performing the on-close destroy of the link,
+ * since racing ioctl accessors must necessarily be absent.
+ */
+ VERIFY(!on_close);
+ mutex_exit(&ss->ss_lock);
+ return (EAGAIN);
+ }
+ /*
+	 * The link deletion cannot fail after this point; it will proceed
+	 * until successful completion is reached.
+ */
+ link->l_destroyed = B_TRUE;
+
+ /*
+ * Tear down the IO port hook so it cannot be used to kick any of the
+ * rings which are about to be reset and stopped.
+ */
+ VERIFY0(viona_ioc_set_notify_ioport(link, 0));
+ mutex_exit(&ss->ss_lock);
+
+ /*
+ * Return the rings to their reset state, ignoring any possible
+ * interruptions from signals.
+ */
+ VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
+ VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));
+
+ mutex_enter(&ss->ss_lock);
+ if (link->l_mch != NULL) {
+ /* Unhook the receive callbacks and close out the client */
+ viona_rx_clear(link);
+ mac_client_close(link->l_mch, 0);
+ }
+ if (link->l_mh != NULL) {
+ mac_close(link->l_mh);
+ }
+ if (link->l_vm_hold != NULL) {
+ vmm_drv_rele(link->l_vm_hold);
+ link->l_vm_hold = NULL;
+ }
+
+ nip = link->l_neti;
+ link->l_neti = NULL;
+
+ viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
+ viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
+ pollhead_clean(&link->l_pollhead);
+ ss->ss_link = NULL;
+ mutex_exit(&ss->ss_lock);
+
+ mutex_enter(&nip->vni_lock);
+ list_remove(&nip->vni_dev_list, ss);
+ mutex_exit(&nip->vni_lock);
+
+ viona_neti_rele(nip);
+
+ kmem_free(link, sizeof (viona_link_t));
+ return (0);
+}
+
+static int
+viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
+{
+ vioc_ring_init_t kri;
+ int err;
+
+ if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
+ return (EFAULT);
+ }
+
+ err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr);
+
+ return (err);
+}
+
+static int
+viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
+{
+ viona_vring_t *ring;
+
+ if (idx >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+ ring = &link->l_vrings[idx];
+
+ return (viona_ring_reset(ring, B_TRUE));
+}
+
+static int
+viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
+{
+ viona_vring_t *ring;
+ int err;
+
+ if (idx >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+ ring = &link->l_vrings[idx];
+
+ mutex_enter(&ring->vr_lock);
+ switch (ring->vr_state) {
+ case VRS_SETUP:
+ /*
+ * An early kick to a ring which is starting its worker thread
+ * is fine. Once that thread is active, it will process the
+ * start-up request immediately.
+ */
+ /* FALLTHROUGH */
+ case VRS_INIT:
+ ring->vr_state_flags |= VRSF_REQ_START;
+ /* FALLTHROUGH */
+ case VRS_RUN:
+ cv_broadcast(&ring->vr_cv);
+ err = 0;
+ break;
+ default:
+ err = EBUSY;
+ break;
+ }
+ mutex_exit(&ring->vr_lock);
+
+ return (err);
+}
+
+static int
+viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
+{
+ vioc_ring_msi_t vrm;
+ viona_vring_t *ring;
+
+ if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
+ return (EFAULT);
+ }
+ if (vrm.rm_index >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+
+ ring = &link->l_vrings[vrm.rm_index];
+ mutex_enter(&ring->vr_lock);
+ ring->vr_msi_addr = vrm.rm_addr;
+ ring->vr_msi_msg = vrm.rm_msg;
+ mutex_exit(&ring->vr_lock);
+
+ return (0);
+}
+
+static int
+viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val)
+{
+ viona_link_t *link = (viona_link_t *)arg;
+ uint16_t vq = (uint16_t)val;
+
+ if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) {
+ return (EINVAL);
+ }
+ return (viona_ioc_ring_kick(link, vq));
+}
+
+static int
+viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport)
+{
+ int err = 0;
+
+ if (link->l_notify_ioport != 0) {
+ vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
+ link->l_notify_ioport = 0;
+ }
+
+ if (ioport != 0) {
+ err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL,
+ viona_notify_wcb, (void *)link, &link->l_notify_cookie);
+ if (err == 0) {
+ link->l_notify_ioport = ioport;
+ }
+ }
+ return (err);
+}
+
+static int
+viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
+{
+ if (idx >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+
+ link->l_vrings[idx].vr_intr_enabled = 0;
+ return (0);
+}
+
+static int
+viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
+{
+ uint_t cnt = 0;
+ vioc_intr_poll_t vip;
+
+ for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
+ uint_t val = link->l_vrings[i].vr_intr_enabled;
+
+ vip.vip_status[i] = val;
+ if (val != 0) {
+ cnt++;
+ }
+ }
+
+ if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
+ return (EFAULT);
+ }
+ *rv = (int)cnt;
+ return (0);
+}
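A minimal, hypothetical sketch of the interrupt slow path described in the overview, built from the chpoll and ioctl entry points above: a userspace consumer waits in poll(2), asks which rings want attention, and clears each one after servicing it. The header path, the fixed ring count of two, and the loop shape are assumptions; the actual assertion of a legacy interrupt toward the guest is elided.

#include <poll.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/viona_io.h>	/* assumed location of vioc_intr_poll_t et al. */

static void
viona_intr_loop(int vna_fd)
{
	struct pollfd pfd = { .fd = vna_fd, .events = POLLRDBAND };

	for (;;) {
		vioc_intr_poll_t vip;

		if (poll(&pfd, 1, -1) <= 0)
			continue;

		/* Which rings currently have an interrupt pending? */
		if (ioctl(vna_fd, VNA_IOC_INTR_POLL, &vip) < 0)
			continue;

		for (uint_t i = 0; i < 2; i++) {	/* 0 = RX, 1 = TX (assumed) */
			if (vip.vip_status[i] == 0)
				continue;
			/* ... assert the legacy interrupt toward the guest ... */
			(void) ioctl(vna_fd, VNA_IOC_RING_INTR_CLR, i);
		}
	}
}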
diff --git a/usr/src/uts/i86pc/io/viona/viona_ring.c b/usr/src/uts/i86pc/io/viona/viona_ring.c
new file mode 100644
index 0000000000..5ba6fad963
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_ring.c
@@ -0,0 +1,638 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+
+#include <sys/disp.h>
+
+#include "viona_impl.h"
+
+#define VRING_ALIGN 4096
+#define VRING_MAX_LEN 32768
+
+static boolean_t viona_ring_map(viona_vring_t *);
+static void viona_ring_unmap(viona_vring_t *);
+static kthread_t *viona_create_worker(viona_vring_t *);
+
+static void *
+viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len)
+{
+ ASSERT3P(ring->vr_lease, !=, NULL);
+
+ return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len));
+}
+
+static boolean_t
+viona_ring_lease_expire_cb(void *arg)
+{
+ viona_vring_t *ring = arg;
+
+ cv_broadcast(&ring->vr_cv);
+
+ /* The lease will be broken asynchronously. */
+ return (B_FALSE);
+}
+
+static void
+viona_ring_lease_drop(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ if (ring->vr_lease != NULL) {
+ vmm_hold_t *hold = ring->vr_link->l_vm_hold;
+
+ ASSERT(hold != NULL);
+
+ /*
+ * Without an active lease, the ring mappings cannot be
+ * considered valid.
+ */
+ viona_ring_unmap(ring);
+
+ vmm_drv_lease_break(hold, ring->vr_lease);
+ ring->vr_lease = NULL;
+ }
+}
+
+boolean_t
+viona_ring_lease_renew(viona_vring_t *ring)
+{
+ vmm_hold_t *hold = ring->vr_link->l_vm_hold;
+
+ ASSERT(hold != NULL);
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ viona_ring_lease_drop(ring);
+
+ /*
+ * Lease renewal will fail if the VM has requested that all holds be
+ * cleaned up.
+ */
+ ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
+ ring);
+ if (ring->vr_lease != NULL) {
+ /* A ring undergoing renewal will need valid guest mappings */
+ if (ring->vr_pa != 0 && ring->vr_size != 0) {
+ /*
+ * If new mappings cannot be established, consider the
+ * lease renewal a failure.
+ */
+ if (!viona_ring_map(ring)) {
+ viona_ring_lease_drop(ring);
+ return (B_FALSE);
+ }
+ }
+ }
+ return (ring->vr_lease != NULL);
+}
+
+void
+viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
+{
+ ring->vr_link = link;
+ mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
+ mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
+}
+
+static void
+viona_ring_misc_free(viona_vring_t *ring)
+{
+ const uint_t qsz = ring->vr_size;
+
+ viona_tx_ring_free(ring, qsz);
+}
+
+void
+viona_ring_free(viona_vring_t *ring)
+{
+ mutex_destroy(&ring->vr_lock);
+ cv_destroy(&ring->vr_cv);
+ mutex_destroy(&ring->vr_a_mutex);
+ mutex_destroy(&ring->vr_u_mutex);
+ ring->vr_link = NULL;
+}
+
+int
+viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa)
+{
+ viona_vring_t *ring;
+ kthread_t *t;
+ int err = 0;
+
+ if (idx >= VIONA_VQ_MAX) {
+ return (EINVAL);
+ }
+ if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
+ return (EINVAL);
+ }
+
+ ring = &link->l_vrings[idx];
+ mutex_enter(&ring->vr_lock);
+ if (ring->vr_state != VRS_RESET) {
+ mutex_exit(&ring->vr_lock);
+ return (EBUSY);
+ }
+ VERIFY(ring->vr_state_flags == 0);
+
+ ring->vr_lease = NULL;
+ if (!viona_ring_lease_renew(ring)) {
+ err = EBUSY;
+ goto fail;
+ }
+
+ ring->vr_size = qsz;
+ ring->vr_mask = (ring->vr_size - 1);
+ ring->vr_pa = pa;
+ if (!viona_ring_map(ring)) {
+ err = EINVAL;
+ goto fail;
+ }
+
+ /* Initialize queue indexes */
+ ring->vr_cur_aidx = 0;
+
+ if (idx == VIONA_VQ_TX) {
+ viona_tx_ring_alloc(ring, qsz);
+ }
+
+ /* Zero out MSI-X configuration */
+ ring->vr_msi_addr = 0;
+ ring->vr_msi_msg = 0;
+
+ /* Clear the stats */
+ bzero(&ring->vr_stats, sizeof (ring->vr_stats));
+
+ t = viona_create_worker(ring);
+ if (t == NULL) {
+ err = ENOMEM;
+ goto fail;
+ }
+ ring->vr_worker_thread = t;
+ ring->vr_state = VRS_SETUP;
+ cv_broadcast(&ring->vr_cv);
+ mutex_exit(&ring->vr_lock);
+ return (0);
+
+fail:
+ viona_ring_lease_drop(ring);
+ viona_ring_misc_free(ring);
+ ring->vr_size = 0;
+ ring->vr_mask = 0;
+ mutex_exit(&ring->vr_lock);
+ return (err);
+}
+
+int
+viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
+{
+ mutex_enter(&ring->vr_lock);
+ if (ring->vr_state == VRS_RESET) {
+ mutex_exit(&ring->vr_lock);
+ return (0);
+ }
+
+ if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
+ ring->vr_state_flags |= VRSF_REQ_STOP;
+ cv_broadcast(&ring->vr_cv);
+ }
+ while (ring->vr_state != VRS_RESET) {
+ if (!heed_signals) {
+ cv_wait(&ring->vr_cv, &ring->vr_lock);
+ } else {
+ int rs;
+
+ rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+ if (rs <= 0 && ring->vr_state != VRS_RESET) {
+ mutex_exit(&ring->vr_lock);
+ return (EINTR);
+ }
+ }
+ }
+ viona_ring_lease_drop(ring);
+ mutex_exit(&ring->vr_lock);
+ return (0);
+}
+
+static boolean_t
+viona_ring_map(viona_vring_t *ring)
+{
+ uint64_t pos = ring->vr_pa;
+ const uint16_t qsz = ring->vr_size;
+
+ ASSERT3U(qsz, !=, 0);
+ ASSERT3U(pos, !=, 0);
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ const size_t desc_sz = qsz * sizeof (struct virtio_desc);
+ ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz);
+ if (ring->vr_descr == NULL) {
+ goto fail;
+ }
+ pos += desc_sz;
+
+ const size_t avail_sz = (qsz + 3) * sizeof (uint16_t);
+ ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz);
+ if (ring->vr_avail_flags == NULL) {
+ goto fail;
+ }
+ ring->vr_avail_idx = ring->vr_avail_flags + 1;
+ ring->vr_avail_ring = ring->vr_avail_flags + 2;
+ ring->vr_avail_used_event = ring->vr_avail_ring + qsz;
+ pos += avail_sz;
+
+ const size_t used_sz = (qsz * sizeof (struct virtio_used)) +
+ (sizeof (uint16_t) * 3);
+ pos = P2ROUNDUP(pos, VRING_ALIGN);
+ ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz);
+ if (ring->vr_used_flags == NULL) {
+ goto fail;
+ }
+ ring->vr_used_idx = ring->vr_used_flags + 1;
+ ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2);
+ ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz);
+
+ return (B_TRUE);
+
+fail:
+ viona_ring_unmap(ring);
+ return (B_FALSE);
+}
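To make the layout arithmetic concrete (illustrative numbers only, assuming the usual 16-byte descriptors and 8-byte used entries): a queue size of 1024 maps a 1024 * 16 = 16384-byte descriptor table at vr_pa, a (1024 + 3) * 2 = 2054-byte available ring immediately after it, and a (1024 * 8) + 6 = 8198-byte used ring starting at the next VRING_ALIGN (4096-byte) boundary, matching the split-ring layout the guest's virtio driver established.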
+
+static void
+viona_ring_unmap(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ ring->vr_descr = NULL;
+ ring->vr_avail_flags = NULL;
+ ring->vr_avail_idx = NULL;
+ ring->vr_avail_ring = NULL;
+ ring->vr_avail_used_event = NULL;
+ ring->vr_used_flags = NULL;
+ ring->vr_used_idx = NULL;
+ ring->vr_used_ring = NULL;
+ ring->vr_used_avail_event = NULL;
+}
+
+void
+viona_intr_ring(viona_vring_t *ring)
+{
+ uint64_t addr;
+
+ mutex_enter(&ring->vr_lock);
+ /* Deliver the interrupt directly, if so configured. */
+ if ((addr = ring->vr_msi_addr) != 0) {
+ uint64_t msg = ring->vr_msi_msg;
+
+ mutex_exit(&ring->vr_lock);
+ (void) vmm_drv_msi(ring->vr_lease, addr, msg);
+ return;
+ }
+ mutex_exit(&ring->vr_lock);
+
+ if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
+ pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
+ }
+}
+
+static void
+viona_worker(void *arg)
+{
+ viona_vring_t *ring = (viona_vring_t *)arg;
+ viona_link_t *link = ring->vr_link;
+ proc_t *p = ttoproc(curthread);
+
+ mutex_enter(&ring->vr_lock);
+ VERIFY3U(ring->vr_state, ==, VRS_SETUP);
+
+ /* Bail immediately if ring shutdown or process exit was requested */
+ if (VRING_NEED_BAIL(ring, p)) {
+ goto cleanup;
+ }
+
+ /* Report worker thread as alive and notify creator */
+ ring->vr_state = VRS_INIT;
+ cv_broadcast(&ring->vr_cv);
+
+ while (ring->vr_state_flags == 0) {
+ /*
+ * Keeping lease renewals timely while waiting for the ring to
+ * be started is important for avoiding deadlocks.
+ */
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ if (!viona_ring_lease_renew(ring)) {
+ goto cleanup;
+ }
+ }
+
+ (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+
+ if (VRING_NEED_BAIL(ring, p)) {
+ goto cleanup;
+ }
+ }
+
+ ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
+ ring->vr_state = VRS_RUN;
+ ring->vr_state_flags &= ~VRSF_REQ_START;
+
+ /* Ensure ring lease is valid first */
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ if (!viona_ring_lease_renew(ring)) {
+ goto cleanup;
+ }
+ }
+
+ /* Process actual work */
+ if (ring == &link->l_vrings[VIONA_VQ_RX]) {
+ viona_worker_rx(ring, link);
+ } else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
+ viona_worker_tx(ring, link);
+ } else {
+ panic("unexpected ring: %p", (void *)ring);
+ }
+
+ VERIFY3U(ring->vr_state, ==, VRS_STOP);
+
+cleanup:
+ if (ring->vr_txdesb != NULL) {
+ /*
+ * Transmit activity must be entirely concluded before the
+ * associated descriptors can be cleaned up.
+ */
+ VERIFY(ring->vr_xfer_outstanding == 0);
+ }
+ viona_ring_misc_free(ring);
+
+ viona_ring_lease_drop(ring);
+ ring->vr_cur_aidx = 0;
+ ring->vr_state = VRS_RESET;
+ ring->vr_state_flags = 0;
+ ring->vr_worker_thread = NULL;
+ cv_broadcast(&ring->vr_cv);
+ mutex_exit(&ring->vr_lock);
+
+ mutex_enter(&ttoproc(curthread)->p_lock);
+ lwp_exit();
+}
+
+static kthread_t *
+viona_create_worker(viona_vring_t *ring)
+{
+ k_sigset_t hold_set;
+ proc_t *p = curproc;
+ kthread_t *t;
+ klwp_t *lwp;
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+ ASSERT(ring->vr_state == VRS_RESET);
+
+ sigfillset(&hold_set);
+ lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
+ minclsyspri - 1, &hold_set, curthread->t_cid, 0);
+ if (lwp == NULL) {
+ return (NULL);
+ }
+
+ t = lwptot(lwp);
+ mutex_enter(&p->p_lock);
+ t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
+ lwp_create_done(t);
+ mutex_exit(&p->p_lock);
+
+ return (t);
+}
+
+int
+vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
+ uint16_t *cookie)
+{
+ uint_t i, ndesc, idx, head, next;
+ struct virtio_desc vdir;
+ void *buf;
+
+ ASSERT(iov != NULL);
+ ASSERT(niov > 0 && niov < INT_MAX);
+
+ mutex_enter(&ring->vr_a_mutex);
+ idx = ring->vr_cur_aidx;
+ ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx);
+
+ if (ndesc == 0) {
+ mutex_exit(&ring->vr_a_mutex);
+ return (0);
+ }
+ if (ndesc > ring->vr_size) {
+ /*
+ * Despite the fact that the guest has provided an 'avail_idx'
+ * which indicates that an impossible number of descriptors are
+ * available, continue on and attempt to process the next one.
+ *
+ * The transgression will not escape the probe or stats though.
+ */
+ VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
+ uint16_t, ndesc);
+ VIONA_RING_STAT_INCR(ring, ndesc_too_high);
+ }
+
+ head = ring->vr_avail_ring[idx & ring->vr_mask];
+ next = head;
+
+ for (i = 0; i < niov; next = vdir.vd_next) {
+ if (next >= ring->vr_size) {
+ VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
+ uint16_t, next);
+ VIONA_RING_STAT_INCR(ring, bad_idx);
+ goto bail;
+ }
+
+ vdir = ring->vr_descr[next];
+ if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
+ if (vdir.vd_len == 0) {
+ VIONA_PROBE2(desc_bad_len,
+ viona_vring_t *, ring,
+ uint32_t, vdir.vd_len);
+ VIONA_RING_STAT_INCR(ring, desc_bad_len);
+ goto bail;
+ }
+ buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len);
+ if (buf == NULL) {
+ VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr);
+ VIONA_RING_STAT_INCR(ring, bad_ring_addr);
+ goto bail;
+ }
+ iov[i].iov_base = buf;
+ iov[i].iov_len = vdir.vd_len;
+ i++;
+ } else {
+ const uint_t nindir = vdir.vd_len / 16;
+ volatile struct virtio_desc *vindir;
+
+ if ((vdir.vd_len & 0xf) || nindir == 0) {
+ VIONA_PROBE2(indir_bad_len,
+ viona_vring_t *, ring,
+ uint32_t, vdir.vd_len);
+ VIONA_RING_STAT_INCR(ring, indir_bad_len);
+ goto bail;
+ }
+ vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len);
+ if (vindir == NULL) {
+ VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr);
+ VIONA_RING_STAT_INCR(ring, bad_ring_addr);
+ goto bail;
+ }
+ next = 0;
+ for (;;) {
+ struct virtio_desc vp;
+
+ /*
+ * A copy of the indirect descriptor is made
+ * here, rather than simply using a reference
+ * pointer. This prevents malicious or
+ * erroneous guest writes to the descriptor
+ * from fooling the flags/bounds verification
+ * through a race.
+ */
+ vp = vindir[next];
+ if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
+ VIONA_PROBE1(indir_bad_nest,
+ viona_vring_t *, ring);
+ VIONA_RING_STAT_INCR(ring,
+ indir_bad_nest);
+ goto bail;
+ } else if (vp.vd_len == 0) {
+ VIONA_PROBE2(desc_bad_len,
+ viona_vring_t *, ring,
+ uint32_t, vp.vd_len);
+ VIONA_RING_STAT_INCR(ring,
+ desc_bad_len);
+ goto bail;
+ }
+ buf = viona_gpa2kva(ring, vp.vd_addr,
+ vp.vd_len);
+ if (buf == NULL) {
+ VIONA_PROBE_BAD_RING_ADDR(ring,
+ vp.vd_addr);
+ VIONA_RING_STAT_INCR(ring,
+ bad_ring_addr);
+ goto bail;
+ }
+ iov[i].iov_base = buf;
+ iov[i].iov_len = vp.vd_len;
+ i++;
+
+ if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ if (i >= niov) {
+ goto loopy;
+ }
+
+ next = vp.vd_next;
+ if (next >= nindir) {
+ VIONA_PROBE3(indir_bad_next,
+ viona_vring_t *, ring,
+ uint16_t, next,
+ uint_t, nindir);
+ VIONA_RING_STAT_INCR(ring,
+ indir_bad_next);
+ goto bail;
+ }
+ }
+ }
+ if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
+ *cookie = head;
+ ring->vr_cur_aidx++;
+ mutex_exit(&ring->vr_a_mutex);
+ return (i);
+ }
+ }
+
+loopy:
+ VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
+ VIONA_RING_STAT_INCR(ring, too_many_desc);
+bail:
+ mutex_exit(&ring->vr_a_mutex);
+ return (-1);
+}
+
+void
+vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
+{
+ volatile struct virtio_used *vu;
+ uint_t uidx;
+
+ mutex_enter(&ring->vr_u_mutex);
+
+ uidx = *ring->vr_used_idx;
+ vu = &ring->vr_used_ring[uidx++ & ring->vr_mask];
+ vu->vu_idx = cookie;
+ vu->vu_tlen = len;
+ membar_producer();
+ *ring->vr_used_idx = uidx;
+
+ mutex_exit(&ring->vr_u_mutex);
+}
+
+void
+vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
+{
+ volatile struct virtio_used *vu;
+ uint_t uidx, i;
+
+ mutex_enter(&ring->vr_u_mutex);
+
+ uidx = *ring->vr_used_idx;
+ if (num_bufs == 1) {
+ vu = &ring->vr_used_ring[uidx++ & ring->vr_mask];
+ vu->vu_idx = elem[0].id;
+ vu->vu_tlen = elem[0].len;
+ } else {
+ for (i = 0; i < num_bufs; i++) {
+ vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask];
+ vu->vu_idx = elem[i].id;
+ vu->vu_tlen = elem[i].len;
+ }
+ uidx = uidx + num_bufs;
+ }
+ membar_producer();
+ *ring->vr_used_idx = uidx;
+
+ mutex_exit(&ring->vr_u_mutex);
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona_rx.c b/usr/src/uts/i86pc/io/viona/viona_rx.c
new file mode 100644
index 0000000000..1ccbaa63f1
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_rx.c
@@ -0,0 +1,718 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/strsubr.h>
+
+#include <sys/dlpi.h>
+#include <sys/pattr.h>
+#include <sys/vlan.h>
+
+#include "viona_impl.h"
+
+
+
+#define VTNET_MAXSEGS 32
+
+/* Min. octets in an ethernet frame minus FCS */
+#define MIN_BUF_SIZE 60
+#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ)
+
+static mblk_t *viona_vlan_pad_mp;
+
+void
+viona_rx_init(void)
+{
+ mblk_t *mp;
+
+ ASSERT(viona_vlan_pad_mp == NULL);
+
+ /* Create mblk for padding when VLAN tags are stripped */
+ mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL);
+ bzero(mp->b_rptr, VLAN_TAGSZ);
+ mp->b_wptr += VLAN_TAGSZ;
+ viona_vlan_pad_mp = mp;
+}
+
+void
+viona_rx_fini(void)
+{
+ mblk_t *mp;
+
+ /* Clean up the VLAN padding mblk */
+ mp = viona_vlan_pad_mp;
+ viona_vlan_pad_mp = NULL;
+ VERIFY(mp != NULL && mp->b_cont == NULL);
+ freemsg(mp);
+}
+
+void
+viona_worker_rx(viona_vring_t *ring, viona_link_t *link)
+{
+ proc_t *p = ttoproc(curthread);
+
+ (void) thread_vsetname(curthread, "viona_rx_%p", ring);
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+ ASSERT3U(ring->vr_state, ==, VRS_RUN);
+
+ *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
+
+ do {
+ if (vmm_drv_lease_expired(ring->vr_lease)) {
+ /*
+ * Set the renewal flag, causing incoming traffic to be
+ * dropped, and issue an RX barrier to ensure any
+ * threads in the RX callbacks will have finished.
+ * The vr_lock cannot be held across the barrier as it
+ * poses a deadlock risk.
+ */
+ ring->vr_state_flags |= VRSF_RENEW;
+ mutex_exit(&ring->vr_lock);
+ mac_rx_barrier(link->l_mch);
+ mutex_enter(&ring->vr_lock);
+
+ if (!viona_ring_lease_renew(ring)) {
+ break;
+ }
+ ring->vr_state_flags &= ~VRSF_RENEW;
+ }
+
+ /*
+ * For now, there is little to do in the RX worker as inbound
+ * data is delivered by MAC via the RX callbacks. If tap-like
+ * functionality is added later, this would be a convenient
+ * place to inject frames into the guest.
+ */
+ (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+ } while (!VRING_NEED_BAIL(ring, p));
+
+ ring->vr_state = VRS_STOP;
+
+ /*
+	 * The RX ring is stopping; before we start tearing it down, it
+	 * is imperative that we perform an RX barrier so that
+ * incoming packets are dropped at viona_rx_classified().
+ */
+ mutex_exit(&ring->vr_lock);
+ mac_rx_barrier(link->l_mch);
+ mutex_enter(&ring->vr_lock);
+
+ *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY;
+}
+
+static size_t
+viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len,
+ boolean_t *end)
+{
+ size_t copied = 0;
+ size_t off = 0;
+
+ /* Seek past already-consumed data */
+ while (seek > 0 && mp != NULL) {
+ const size_t chunk = MBLKL(mp);
+
+ if (chunk > seek) {
+ off = seek;
+ break;
+ }
+ mp = mp->b_cont;
+ seek -= chunk;
+ }
+
+ while (mp != NULL) {
+ const size_t chunk = MBLKL(mp) - off;
+ const size_t to_copy = MIN(chunk, len);
+
+ bcopy(mp->b_rptr + off, buf, to_copy);
+ copied += to_copy;
+ buf += to_copy;
+ len -= to_copy;
+
+ /*
+ * If all the remaining data in the mblk_t was copied, move on
+ * to the next one in the chain. Any seek offset applied to
+ * the first mblk copy is zeroed out for subsequent operations.
+ */
+ if (chunk == to_copy) {
+ mp = mp->b_cont;
+ off = 0;
+ }
+#ifdef DEBUG
+ else {
+ /*
+ * The only valid reason for the copy to consume less
+ * than the entire contents of the mblk_t is because
+ * the output buffer has been filled.
+ */
+ ASSERT0(len);
+ }
+#endif
+
+ /* Go no further if the buffer has been filled */
+ if (len == 0) {
+ break;
+ }
+ }
+ *end = (mp == NULL);
+ return (copied);
+}
+
+static int
+viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz)
+{
+ struct iovec iov[VTNET_MAXSEGS];
+ uint16_t cookie;
+ int n;
+ const size_t hdr_sz = sizeof (struct virtio_net_hdr);
+ struct virtio_net_hdr *hdr;
+ size_t len, copied = 0;
+ caddr_t buf = NULL;
+ boolean_t end = B_FALSE;
+ const uint32_t features = ring->vr_link->l_features;
+
+ ASSERT(msz >= MIN_BUF_SIZE);
+
+ n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
+ if (n <= 0) {
+ /* Without available buffers, the frame must be dropped. */
+ return (ENOSPC);
+ }
+ if (iov[0].iov_len < hdr_sz) {
+ /*
+ * There is little to do if there is not even space available
+ * for the sole header. Zero the buffer and bail out as a last
+ * act of desperation.
+ */
+ bzero(iov[0].iov_base, iov[0].iov_len);
+ goto bad_frame;
+ }
+
+ /* Grab the address of the header before anything else */
+ hdr = (struct virtio_net_hdr *)iov[0].iov_base;
+
+ /*
+ * If there is any space remaining in the first buffer after writing
+ * the header, fill it with frame data.
+ */
+ if (iov[0].iov_len > hdr_sz) {
+ buf = (caddr_t)iov[0].iov_base + hdr_sz;
+ len = iov[0].iov_len - hdr_sz;
+
+ copied += viona_copy_mblk(mp, copied, buf, len, &end);
+ }
+
+ /* Copy any remaining data into subsequent buffers, if present */
+ for (int i = 1; i < n && !end; i++) {
+ buf = (caddr_t)iov[i].iov_base;
+ len = iov[i].iov_len;
+
+ copied += viona_copy_mblk(mp, copied, buf, len, &end);
+ }
+
+ /* Was the expected amount of data copied? */
+ if (copied != msz) {
+ VIONA_PROBE5(too_short, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp, size_t, copied,
+ size_t, msz);
+ VIONA_RING_STAT_INCR(ring, too_short);
+ goto bad_frame;
+ }
+
+ /* Populate (read: zero) the header and account for it in the size */
+ bzero(hdr, hdr_sz);
+ copied += hdr_sz;
+
+ /* Add chksum bits, if needed */
+ if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+ uint32_t cksum_flags;
+
+ if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+ ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+ hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+ hdr->vrh_gso_size = DB_LSOMSS(mp);
+ }
+
+ mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
+ &cksum_flags);
+ if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
+ hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+ }
+ }
+
+ /* Release this chain */
+ vq_pushchain(ring, copied, cookie);
+ return (0);
+
+bad_frame:
+ VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie,
+ mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, bad_rx_frame);
+
+ vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie);
+ return (EINVAL);
+}
+
+static int
+viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz)
+{
+ struct iovec iov[VTNET_MAXSEGS];
+ used_elem_t uelem[VTNET_MAXSEGS];
+ int n, i = 0, buf_idx = 0, err = 0;
+ uint16_t cookie;
+ caddr_t buf;
+ size_t len, copied = 0, chunk = 0;
+ struct virtio_net_mrgrxhdr *hdr = NULL;
+ const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr);
+ boolean_t end = B_FALSE;
+ const uint32_t features = ring->vr_link->l_features;
+
+ ASSERT(msz >= MIN_BUF_SIZE);
+
+ n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
+ if (n <= 0) {
+ /* Without available buffers, the frame must be dropped. */
+ VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, no_space);
+ return (ENOSPC);
+ }
+ if (iov[0].iov_len < hdr_sz) {
+ /*
+ * There is little to do if there is not even space available
+ * for the sole header. Zero the buffer and bail out as a last
+ * act of desperation.
+ */
+ bzero(iov[0].iov_base, iov[0].iov_len);
+ uelem[0].id = cookie;
+ uelem[0].len = iov[0].iov_len;
+ err = EINVAL;
+ goto done;
+ }
+
+ /* Grab the address of the header and do initial population */
+ hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base;
+ bzero(hdr, hdr_sz);
+ hdr->vrh_bufs = 1;
+
+ /*
+ * If there is any space remaining in the first buffer after writing
+ * the header, fill it with frame data.
+ */
+ if (iov[0].iov_len > hdr_sz) {
+ buf = iov[0].iov_base + hdr_sz;
+ len = iov[0].iov_len - hdr_sz;
+
+ chunk += viona_copy_mblk(mp, copied, buf, len, &end);
+ copied += chunk;
+ }
+ i = 1;
+
+ do {
+ while (i < n && !end) {
+ buf = iov[i].iov_base;
+ len = iov[i].iov_len;
+
+ chunk += viona_copy_mblk(mp, copied, buf, len, &end);
+ copied += chunk;
+ i++;
+ }
+
+ uelem[buf_idx].id = cookie;
+ uelem[buf_idx].len = chunk;
+
+ /*
+ * Try to grab another buffer from the ring if the mblk has not
+ * yet been entirely copied out.
+ */
+ if (!end) {
+ if (buf_idx == (VTNET_MAXSEGS - 1)) {
+ /*
+ * Our arbitrary limit on the number of buffers
+ * to offer for merge has already been reached.
+ */
+ err = EOVERFLOW;
+ break;
+ }
+ n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie);
+ if (n <= 0) {
+ /*
+ * Without more immediate space to perform the
+ * copying, there is little choice left but to
+ * drop the packet.
+ */
+ err = EMSGSIZE;
+ break;
+ }
+ chunk = 0;
+ i = 0;
+ buf_idx++;
+ /*
+ * Keep the header up-to-date with the number of
+ * buffers, but never reference its value since the
+ * guest could meddle with it.
+ */
+ hdr->vrh_bufs++;
+ }
+ } while (!end && copied < msz);
+
+ /* Account for the header size in the first buffer */
+ uelem[0].len += hdr_sz;
+
+ /*
+	 * If no other errors were encountered during the copy, was the expected
+	 * amount of data transferred?
+ */
+ if (err == 0 && copied != msz) {
+ VIONA_PROBE5(too_short, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp, size_t, copied,
+ size_t, msz);
+ VIONA_RING_STAT_INCR(ring, too_short);
+ err = EINVAL;
+ }
+
+ /* Add chksum bits, if needed */
+ if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
+ uint32_t cksum_flags;
+
+ if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
+ ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
+ hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
+ hdr->vrh_gso_size = DB_LSOMSS(mp);
+ }
+
+ mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
+ &cksum_flags);
+ if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
+ hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+ }
+ }
+
+done:
+ switch (err) {
+ case 0:
+ /* Success can fall right through to ring delivery */
+ break;
+
+ case EMSGSIZE:
+ VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, rx_merge_underrun);
+ break;
+
+ case EOVERFLOW:
+ VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, rx_merge_overrun);
+ break;
+
+ default:
+ VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring,
+ uint16_t, cookie, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, bad_rx_frame);
+ }
+ vq_pushchain_many(ring, buf_idx + 1, uelem);
+ return (err);
+}
+
+static void
+viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback)
+{
+ viona_link_t *link = ring->vr_link;
+ mblk_t *mprx = NULL, **mprx_prevp = &mprx;
+ mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
+ const boolean_t do_merge =
+ ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);
+
+ size_t nrx = 0, ndrop = 0;
+
+ while (mp != NULL) {
+ mblk_t *next = mp->b_next;
+ mblk_t *pad = NULL;
+ size_t size = msgsize(mp);
+ int err = 0;
+
+ mp->b_next = NULL;
+
+ /*
+ * We treat both a 'drop' response and errors the same here
+ * and put the packet on the drop chain. As packets may be
+ * subject to different actions in ipf (which do not all
+ * return the same set of error values), an error processing
+ * one packet doesn't mean the next packet will also generate
+ * an error.
+ */
+ if (VNETHOOK_INTERESTED_IN(link->l_neti) &&
+ viona_hook(link, ring, &mp, B_FALSE) != 0) {
+ if (mp != NULL) {
+ *mpdrop_prevp = mp;
+ mpdrop_prevp = &mp->b_next;
+ } else {
+ /*
+ * If the hook consumer (e.g. ipf) already
+ * freed the mblk_t, update the drop count now.
+ */
+ ndrop++;
+ }
+ mp = next;
+ continue;
+ }
+
+ /*
+ * Ethernet frames are expected to be padded out in order to
+ * meet the minimum size.
+ *
+ * A special case is made for frames which are short by
+ * VLAN_TAGSZ, having been stripped of their VLAN tag while
+ * traversing MAC. A preallocated (and recycled) mblk is used
+ * for that specific condition.
+ *
+ * All other frames that fall short on length will have custom
+	 * zero-padding allocated and appended to them.
+ */
+ if (size == NEED_VLAN_PAD_SIZE) {
+ ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ);
+ ASSERT(viona_vlan_pad_mp->b_cont == NULL);
+
+ for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont)
+ ;
+
+ pad->b_cont = viona_vlan_pad_mp;
+ size += VLAN_TAGSZ;
+ } else if (size < MIN_BUF_SIZE) {
+ const size_t pad_size = MIN_BUF_SIZE - size;
+ mblk_t *zero_mp;
+
+ zero_mp = allocb(pad_size, BPRI_MED);
+ if (zero_mp == NULL) {
+ err = ENOMEM;
+ goto pad_drop;
+ }
+
+ VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring,
+ mblk_t *, mp, size_t, pad_size);
+ VIONA_RING_STAT_INCR(ring, rx_pad_short);
+ zero_mp->b_wptr += pad_size;
+ bzero(zero_mp->b_rptr, pad_size);
+ linkb(mp, zero_mp);
+ size += pad_size;
+ }
+
+ if (do_merge) {
+ err = viona_recv_merged(ring, mp, size);
+ } else {
+ err = viona_recv_plain(ring, mp, size);
+ }
+
+ /*
+ * The VLAN padding mblk is meant for continual reuse, so
+ * remove it from the chain to prevent it from being freed.
+ *
+ * Custom allocated padding does not require this treatment and
+ * is freed normally.
+ */
+ if (pad != NULL) {
+ pad->b_cont = NULL;
+ }
+
+pad_drop:
+ /*
+ * While an error during rx processing
+ * (viona_recv_{merged,plain}) does not free mp on error,
+ * hook processing might or might not free mp. Handle either
+ * scenario -- if mp is not yet free, it is queued up and
+ * freed after the guest has been notified. If mp is
+ * already NULL, just proceed on.
+ */
+ if (err != 0) {
+ *mpdrop_prevp = mp;
+ mpdrop_prevp = &mp->b_next;
+
+ /*
+ * If the available ring is empty, do not bother
+ * attempting to deliver any more frames. Count the
+ * rest as dropped too.
+ */
+ if (err == ENOSPC) {
+ mp->b_next = next;
+ break;
+ }
+ } else {
+ /* Chain successful mblks to be freed later */
+ *mprx_prevp = mp;
+ mprx_prevp = &mp->b_next;
+ nrx++;
+ }
+ mp = next;
+ }
+
+ membar_enter();
+ if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ viona_intr_ring(ring);
+ }
+
+ /* Free successfully received frames */
+ if (mprx != NULL) {
+ freemsgchain(mprx);
+ }
+
+ /* Free dropped frames, also tallying them */
+ mp = mpdrop;
+ while (mp != NULL) {
+ mblk_t *next = mp->b_next;
+
+ mp->b_next = NULL;
+ freemsg(mp);
+ mp = next;
+ ndrop++;
+ }
+ VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop);
+}
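By way of example (illustrative sizes only): a frame that arrives at 56 bytes because MAC stripped its 4-byte VLAN tag picks up the shared viona_vlan_pad_mp, while a 42-byte ARP request has an 18-byte zeroed mblk appended, so the guest always receives at least MIN_BUF_SIZE (60) bytes of frame data.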
+
+static void
+viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t is_loopback)
+{
+ viona_vring_t *ring = (viona_vring_t *)arg;
+
+ /* Drop traffic if ring is inactive or renewing its lease */
+ if (ring->vr_state != VRS_RUN ||
+ (ring->vr_state_flags & VRSF_RENEW) != 0) {
+ freemsgchain(mp);
+ return;
+ }
+
+ viona_rx_common(ring, mp, is_loopback);
+}
+
+static void
+viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t is_loopback)
+{
+ viona_vring_t *ring = (viona_vring_t *)arg;
+ mac_handle_t mh = ring->vr_link->l_mh;
+ mblk_t *mp_mcast_only = NULL;
+ mblk_t **mpp = &mp_mcast_only;
+
+ /* Drop traffic if ring is inactive or renewing its lease */
+ if (ring->vr_state != VRS_RUN ||
+ (ring->vr_state_flags & VRSF_RENEW) != 0) {
+ freemsgchain(mp);
+ return;
+ }
+
+ /*
+ * In addition to multicast traffic, broadcast packets will also arrive
+ * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback
+ * for fully-classified traffic has already delivered that broadcast
+ * traffic, so it should be suppressed here, rather than duplicating it
+ * to the guest.
+ */
+ while (mp != NULL) {
+ mblk_t *mp_next;
+ mac_header_info_t mhi;
+ int err;
+
+ mp_next = mp->b_next;
+ mp->b_next = NULL;
+
+ /* Determine the packet type */
+ err = mac_vlan_header_info(mh, mp, &mhi);
+ if (err != 0) {
+ mblk_t *pull;
+
+ /*
+ * It is possible that gathering of the header
+ * information was impeded by a leading mblk_t which
+ * was of inadequate length to reference the needed
+ * fields. Try again, in case that could be solved
+ * with a pull-up.
+ */
+ pull = msgpullup(mp, sizeof (struct ether_vlan_header));
+ if (pull == NULL) {
+ err = ENOMEM;
+ } else {
+ err = mac_vlan_header_info(mh, pull, &mhi);
+ freemsg(pull);
+ }
+
+ if (err != 0) {
+ VIONA_RING_STAT_INCR(ring, rx_mcast_check);
+ }
+ }
+
+ /* Chain up matching packets while discarding others */
+ if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) {
+ *mpp = mp;
+ mpp = &mp->b_next;
+ } else {
+ freemsg(mp);
+ }
+
+ mp = mp_next;
+ }
+
+ if (mp_mcast_only != NULL) {
+ viona_rx_common(ring, mp_mcast_only, is_loopback);
+ }
+}
+
+int
+viona_rx_set(viona_link_t *link)
+{
+ viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX];
+ int err;
+
+ mac_rx_set(link->l_mch, viona_rx_classified, ring);
+ err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI,
+ viona_rx_mcast, ring, &link->l_mph,
+ MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
+ if (err != 0) {
+ mac_rx_clear(link->l_mch);
+ }
+
+ return (err);
+}
+
+void
+viona_rx_clear(viona_link_t *link)
+{
+ mac_promisc_remove(link->l_mph);
+ mac_rx_clear(link->l_mch);
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona_tx.c b/usr/src/uts/i86pc/io/viona/viona_tx.c
new file mode 100644
index 0000000000..5dc645723c
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona_tx.c
@@ -0,0 +1,756 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+
+#include <sys/types.h>
+#include <sys/smt.h>
+#include <sys/strsubr.h>
+
+#include <sys/pattr.h>
+#include <sys/dlpi.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+
+#include "viona_impl.h"
+
+#define BNXE_NIC_DRIVER "bnxe"
+
+/*
+ * Copy TX packet data out of the virtio ring (rather than lending guest
+ * buffers) to avoid having to wait for packet transmission before the
+ * associated resources can be freed.
+ */
+kmutex_t viona_force_copy_lock;
+static enum viona_force_copy {
+ VFC_UNINITALIZED = 0,
+ VFC_COPY_UNEEDED = 1,
+ VFC_COPY_REQUIRED = 2,
+} viona_force_copy_state = VFC_UNINITALIZED;
+
+struct viona_desb {
+ frtn_t d_frtn;
+ viona_vring_t *d_ring;
+ uint_t d_ref;
+ uint32_t d_len;
+ uint16_t d_cookie;
+ uchar_t *d_headers;
+};
+
+static void viona_tx(viona_link_t *, viona_vring_t *);
+static void viona_desb_release(viona_desb_t *);
+
+/*
+ * Return the number of available descriptors in the vring taking care of the
+ * 16-bit index wraparound.
+ *
+ * Note: If the number of apparently available descriptors is larger than the
+ * ring size (due to guest misbehavior), this check will still report the
+ * positive count of descriptors.
+ */
+static inline uint_t
+viona_vr_num_avail(viona_vring_t *ring)
+{
+ uint16_t ndesc;
+
+ /*
+	 * We're just computing (a - b) mod 2^16.
+ *
+ * The only glitch here is that in standard C, uint16_t promotes to
+ * (signed) int when int has more than 16 bits (almost always now).
+ * A cast back to unsigned is necessary for proper operation.
+ */
+ ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx;
+
+ return (ndesc);
+}
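For example, if the guest's avail_idx has wrapped around to 2 while vr_cur_aidx still reads 65533, the unsigned 16-bit subtraction yields (2 - 65533) mod 2^16 = 5 pending descriptors.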
+
+static void
+viona_tx_wait_outstanding(viona_vring_t *ring)
+{
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ while (ring->vr_xfer_outstanding != 0) {
+ /*
+ * Paying heed to signals is counterproductive here. This is a
+ * very tight loop if pending transfers take an extended amount
+ * of time to be reclaimed while the host process is exiting.
+ */
+ cv_wait(&ring->vr_cv, &ring->vr_lock);
+ }
+}
+
+/*
+ * Check if full TX packet copying is needed. This should not be called from
+ * viona attach()/detach() context.
+ */
+static boolean_t
+viona_tx_copy_needed(void)
+{
+ boolean_t result;
+
+ mutex_enter(&viona_force_copy_lock);
+ if (viona_force_copy_state == VFC_UNINITALIZED) {
+ major_t bnxe_major;
+
+ /*
+ * The original code for viona featured an explicit check for
+ * the bnxe driver which, when found present, necessitated that
+ * all transmissions be copied into their own mblks instead of
+ * passing guest memory to the underlying device.
+ *
+ * The motivations for this are unclear, but until it can be
+ * proven unnecessary, the check lives on.
+ */
+ viona_force_copy_state = VFC_COPY_UNEEDED;
+ if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
+ != DDI_MAJOR_T_NONE) {
+ if (ddi_hold_installed_driver(bnxe_major) != NULL) {
+ viona_force_copy_state = VFC_COPY_REQUIRED;
+ ddi_rele_driver(bnxe_major);
+ }
+ }
+ }
+ result = (viona_force_copy_state == VFC_COPY_REQUIRED);
+ mutex_exit(&viona_force_copy_lock);
+
+ return (result);
+}
+
+void
+viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
+{
+ /* Allocate desb handles for TX ring if packet copying not disabled */
+ if (!viona_tx_copy_needed()) {
+ viona_desb_t *dp;
+
+ dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
+ ring->vr_txdesb = dp;
+ for (uint_t i = 0; i < qsz; i++, dp++) {
+ dp->d_frtn.free_func = viona_desb_release;
+ dp->d_frtn.free_arg = (void *)dp;
+ dp->d_ring = ring;
+ dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
+ KM_SLEEP);
+ }
+ }
+
+ /* Allocate ring-sized iovec buffers for TX */
+ ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
+}
+
+void
+viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
+{
+ if (ring->vr_txdesb != NULL) {
+ viona_desb_t *dp = ring->vr_txdesb;
+
+ for (uint_t i = 0; i < qsz; i++, dp++) {
+ kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
+ }
+ kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
+ ring->vr_txdesb = NULL;
+ }
+
+ if (ring->vr_txiov != NULL) {
+ kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
+ ring->vr_txiov = NULL;
+ }
+}
+
+static void
+viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
+{
+ vq_pushchain(ring, len, cookie);
+
+ membar_enter();
+ if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ viona_intr_ring(ring);
+ }
+}
+
+void
+viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
+{
+ proc_t *p = ttoproc(curthread);
+
+ (void) thread_vsetname(curthread, "viona_tx_%p", ring);
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+ ASSERT3U(ring->vr_state, ==, VRS_RUN);
+
+ mutex_exit(&ring->vr_lock);
+
+ for (;;) {
+ boolean_t bail = B_FALSE;
+ boolean_t renew = B_FALSE;
+ uint_t ntx = 0;
+
+ *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY;
+ while (viona_vr_num_avail(ring)) {
+ viona_tx(link, ring);
+
+ /*
+ * It is advantageous for throughput to keep this
+ * transmission loop tight, but periodic breaks to
+ * check for other events are of value too.
+ */
+ if (ntx++ >= ring->vr_size)
+ break;
+ }
+ *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+ VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);
+
+ /*
+ * Check for available descriptors on the ring once more in
+ * case a late addition raced with the NO_NOTIFY flag toggle.
+ *
+		 * The barrier ensures the store clearing vr_used_flags is
+		 * visible before the viona_vr_num_avail() check below.
+ */
+ membar_enter();
+ bail = VRING_NEED_BAIL(ring, p);
+ renew = vmm_drv_lease_expired(ring->vr_lease);
+ if (!bail && !renew && viona_vr_num_avail(ring)) {
+ continue;
+ }
+
+ if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
+ viona_intr_ring(ring);
+ }
+
+ mutex_enter(&ring->vr_lock);
+
+ while (!bail && !renew && !viona_vr_num_avail(ring)) {
+ (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
+ bail = VRING_NEED_BAIL(ring, p);
+ renew = vmm_drv_lease_expired(ring->vr_lease);
+ }
+
+ if (bail) {
+ break;
+ } else if (renew) {
+ ring->vr_state_flags |= VRSF_RENEW;
+ /*
+ * When renewing the lease for the ring, no TX
+ * frames may be outstanding, as they contain
+ * references to guest memory.
+ */
+ viona_tx_wait_outstanding(ring);
+
+ if (!viona_ring_lease_renew(ring)) {
+ break;
+ }
+ ring->vr_state_flags &= ~VRSF_RENEW;
+ }
+ mutex_exit(&ring->vr_lock);
+ }
+
+ ASSERT(MUTEX_HELD(&ring->vr_lock));
+
+ ring->vr_state = VRS_STOP;
+ viona_tx_wait_outstanding(ring);
+}
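
The notify-suppression dance in the loop above follows a common lost-wakeup
avoidance pattern. The compilable sketch below uses hypothetical stand-in names
(fake_ring, work_available, drain, worker_pass) rather than the driver's own;
it only illustrates the ordering of operations around the NO_NOTIFY-style flag.

#include <stdint.h>
#include <stdbool.h>

#define	SUPPRESS_NOTIFY	0x1

struct fake_ring {
	volatile uint16_t used_flags;	/* stand-in for *vr_used_flags */
	volatile uint16_t avail_idx;	/* stand-in for *vr_avail_idx */
	uint16_t cur_aidx;
};

static bool
work_available(struct fake_ring *r)
{
	return (r->avail_idx != r->cur_aidx);
}

static void
drain(struct fake_ring *r)
{
	while (work_available(r))
		r->cur_aidx++;		/* stand-in for viona_tx() */
}

void
worker_pass(struct fake_ring *r)
{
	r->used_flags |= SUPPRESS_NOTIFY;	/* guest need not kick us */
	drain(r);
	r->used_flags &= ~SUPPRESS_NOTIFY;	/* kicks wanted again */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);	/* like membar_enter() */
	if (work_available(r))
		drain(r);	/* catch a submission that raced the clear */
}
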
+
+static void
+viona_desb_release(viona_desb_t *dp)
+{
+ viona_vring_t *ring = dp->d_ring;
+ uint_t ref;
+ uint32_t len;
+ uint16_t cookie;
+
+ ref = atomic_dec_uint_nv(&dp->d_ref);
+ if (ref > 1) {
+ return;
+ }
+
+ /*
+ * The desb corresponding to this index must be ready for reuse before
+ * the descriptor is returned to the guest via the 'used' ring.
+ */
+ len = dp->d_len;
+ cookie = dp->d_cookie;
+ dp->d_len = 0;
+ dp->d_cookie = 0;
+ dp->d_ref = 0;
+
+ viona_tx_done(ring, len, cookie);
+
+ mutex_enter(&ring->vr_lock);
+ if ((--ring->vr_xfer_outstanding) == 0) {
+ cv_broadcast(&ring->vr_cv);
+ }
+ mutex_exit(&ring->vr_lock);
+}
+
+static boolean_t
+viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
+ mblk_t *mp, uint32_t len)
+{
+ viona_link_t *link = ring->vr_link;
+ const struct ether_header *eth;
+ uint_t eth_len = sizeof (struct ether_header);
+ ushort_t ftype;
+ ipha_t *ipha = NULL;
+ uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
+ uint16_t flags = 0;
+ const uint_t csum_start = hdr->vrh_csum_start;
+ const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;
+
+ /*
+ * Validate that the checksum offsets provided by the guest are within
+ * the bounds of the packet. Additionally, ensure that the checksum
+ * contents field is within the headers mblk copied by viona_tx().
+ */
+ if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
+ (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
+ VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum);
+ return (B_FALSE);
+ }
+
+ /*
+ * This is guaranteed to be safe thanks to the header copying
+ * done in viona_tx().
+ */
+ eth = (const struct ether_header *)mp->b_rptr;
+ ftype = ntohs(eth->ether_type);
+
+ if (ftype == ETHERTYPE_VLAN) {
+ const struct ether_vlan_header *veth;
+
+ /* punt on QinQ for now */
+ eth_len = sizeof (struct ether_vlan_header);
+ veth = (const struct ether_vlan_header *)eth;
+ ftype = ntohs(veth->ether_type);
+ }
+
+ if (ftype == ETHERTYPE_IP) {
+ ipha = (ipha_t *)(mp->b_rptr + eth_len);
+
+ ipproto = ipha->ipha_protocol;
+ } else if (ftype == ETHERTYPE_IPV6) {
+ ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);
+
+ ipproto = ip6h->ip6_nxt;
+ }
+
+ /*
+ * We ignore hdr_len because the spec says it can't be
+ * trusted. Besides, our own stack will determine the header
+ * boundary.
+ */
+ if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
+ (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
+ ftype == ETHERTYPE_IP) {
+ uint16_t *cksump;
+ uint32_t cksum;
+ ipaddr_t src = ipha->ipha_src;
+ ipaddr_t dst = ipha->ipha_dst;
+
+ /*
+ * Our native IP stack doesn't set the L4 length field
+ * of the pseudo header when LSO is in play. Other IP
+ * stacks, e.g. Linux, do include the length field.
+ * This is a problem because the hardware expects that
+ * the length field is not set. When it is set it will
+ * cause an incorrect TCP checksum to be generated.
+ * The reason this works in Linux is because Linux
+ * corrects the pseudo-header checksum in the driver
+ * code. In order to get the correct HW checksum we
+ * need to assume the guest's IP stack gave us a bogus
+ * TCP partial checksum and calculate it ourselves.
+ */
+ cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
+ cksum = IP_TCP_CSUM_COMP;
+ cksum += (dst >> 16) + (dst & 0xFFFF) +
+ (src >> 16) + (src & 0xFFFF);
+ cksum = (cksum & 0xFFFF) + (cksum >> 16);
+ *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
+
+ /*
+ * Since viona is a "legacy device", the data stored
+ * by the driver will be in the guest's native endian
+ * format (see sections 2.4.3 and 5.1.6.1 of the
+ * VIRTIO 1.0 spec for more info). At this time the
+ * only guests using viona are x86 and we can assume
+ * little-endian.
+ */
+ lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);
+
+ /*
+ * Hardware, like ixgbe, expects the client to request
+ * IP header checksum offload if it's sending LSO (see
+ * ixgbe_get_context()). Unfortunately, virtio makes
+ * no allowances for negotiating IP header checksum
+ * and HW offload, only TCP checksum. We add the flag
+ * and zero-out the checksum field. This mirrors the
+ * behavior of our native IP stack (which does this in
+ * the interest of HW that expects the field to be
+ * zero).
+ */
+ flags |= HCK_IPV4_HDRCKSUM;
+ ipha->ipha_hdr_checksum = 0;
+ }
+
+ /*
+ * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
+ * HW_LSO, if present, is not lost.
+ */
+ flags |= DB_CKSUMFLAGS(mp);
+
+ /*
+ * Partial checksum support from the NIC is ideal, since it most
+ * closely maps to the interface defined by virtio.
+ */
+ if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ /*
+ * MAC expects these offsets to be relative to the
+ * start of the L3 header rather than the L2 frame.
+ */
+ flags |= HCK_PARTIALCKSUM;
+ mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
+ len - eth_len, 0, flags);
+ return (B_TRUE);
+ }
+
+ /*
+ * Without partial checksum support, look to the L3/L4 protocol
+ * information to see if the NIC can handle it. If not, the
+	 * checksum will need to be calculated inline.
+ */
+ if (ftype == ETHERTYPE_IP) {
+ if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
+ *csump = 0;
+ flags |= HCK_FULLCKSUM;
+ mac_hcksum_set(mp, 0, 0, 0, 0, flags);
+ return (B_TRUE);
+ }
+
+ /* XXX: Implement manual fallback checksumming? */
+ VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum);
+ return (B_FALSE);
+ } else if (ftype == ETHERTYPE_IPV6) {
+ if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
+ (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
+ uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
+ *csump = 0;
+ flags |= HCK_FULLCKSUM;
+ mac_hcksum_set(mp, 0, 0, 0, 0, flags);
+ return (B_TRUE);
+ }
+
+ /* XXX: Implement manual fallback checksumming? */
+ VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum6);
+ return (B_FALSE);
+ }
+
+ /* Cannot even emulate hcksum for unrecognized protocols */
+ VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
+ VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
+ return (B_FALSE);
+}
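
As a worked example of the pseudo-header fix-up above: the addresses and the
protocol constant below are assumptions, not values from the driver, and
(matching the comment above) no L4 length is folded into the sum.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t src = 0xc0a80a01;	/* 192.168.10.1 */
	uint32_t dst = 0xc0a80a02;	/* 192.168.10.2 */
	uint32_t cksum = 6;		/* TCP protocol contribution, assumed */

	/* Fold source/destination into a 16-bit ones'-complement sum */
	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
	cksum = (cksum & 0xFFFF) + (cksum >> 16);
	cksum = (cksum & 0xFFFF) + (cksum >> 16);

	(void) printf("pseudo-header partial sum: 0x%04x\n", cksum);
	return (0);
}
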
+
+static void
+viona_tx(viona_link_t *link, viona_vring_t *ring)
+{
+ struct iovec *iov = ring->vr_txiov;
+ const uint_t max_segs = ring->vr_size;
+ uint16_t cookie;
+ int i, n;
+ uint32_t len, base_off = 0;
+ uint32_t min_copy = VIONA_MAX_HDRS_LEN;
+ mblk_t *mp_head, *mp_tail, *mp;
+ viona_desb_t *dp = NULL;
+ mac_client_handle_t link_mch = link->l_mch;
+ const struct virtio_net_hdr *hdr;
+
+ mp_head = mp_tail = NULL;
+
+ ASSERT(iov != NULL);
+
+ n = vq_popchain(ring, iov, max_segs, &cookie);
+ if (n == 0) {
+ VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
+ VIONA_RING_STAT_INCR(ring, tx_absent);
+ return;
+ } else if (n < 0) {
+ /*
+ * Any error encountered in vq_popchain has already resulted in
+ * specific probe and statistic handling. Further action here
+ * is unnecessary.
+ */
+ return;
+ }
+
+ /* Grab the header and ensure it is of adequate length */
+ hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
+ len = iov[0].iov_len;
+ if (len < sizeof (struct virtio_net_hdr)) {
+ goto drop_fail;
+ }
+
+ /* Make sure the packet headers are always in the first mblk. */
+ if (ring->vr_txdesb != NULL) {
+ dp = &ring->vr_txdesb[cookie];
+
+ /*
+ * If the guest driver is operating properly, each desb slot
+ * should be available for use when processing a TX descriptor
+ * from the 'avail' ring. In the case of drivers that reuse a
+ * descriptor before it has been posted to the 'used' ring, the
+ * data is simply dropped.
+ */
+ if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
+ dp = NULL;
+ goto drop_fail;
+ }
+
+ dp->d_cookie = cookie;
+ mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
+ &dp->d_frtn);
+
+ /* Account for the successful desballoc. */
+ if (mp_head != NULL)
+ dp->d_ref++;
+ } else {
+ mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
+ }
+
+ if (mp_head == NULL)
+ goto drop_fail;
+
+ mp_tail = mp_head;
+
+ /*
+ * We always copy enough of the guest data to cover the
+ * headers. This protects us from TOCTOU attacks and allows
+ * message block length assumptions to be made in subsequent
+ * code. In many cases, this means copying more data than
+ * strictly necessary. That's okay, as it is the larger packets
+ * (such as LSO) that really benefit from desballoc().
+ */
+ for (i = 1; i < n; i++) {
+ const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);
+
+ bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
+ mp_head->b_wptr += to_copy;
+ len += to_copy;
+ min_copy -= to_copy;
+
+ /*
+ * We've met the minimum copy requirement. The rest of
+ * the guest data can be referenced.
+ */
+ if (min_copy == 0) {
+ /*
+ * If we copied all contents of this
+ * descriptor then move onto the next one.
+ * Otherwise, record how far we are into the
+ * current descriptor.
+ */
+ if (iov[i].iov_len == to_copy)
+ i++;
+ else
+ base_off = to_copy;
+
+ break;
+ }
+ }
+
+ ASSERT3P(mp_head, !=, NULL);
+ ASSERT3P(mp_tail, !=, NULL);
+
+ for (; i < n; i++) {
+ uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
+ uint32_t chunk = iov[i].iov_len - base_off;
+
+ ASSERT3U(base_off, <, iov[i].iov_len);
+ ASSERT3U(chunk, >, 0);
+
+ if (dp != NULL) {
+ mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
+ if (mp == NULL) {
+ goto drop_fail;
+ }
+ dp->d_ref++;
+ } else {
+ mp = allocb(chunk, BPRI_MED);
+ if (mp == NULL) {
+ goto drop_fail;
+ }
+ bcopy((uchar_t *)base, mp->b_wptr, chunk);
+ }
+
+ base_off = 0;
+ len += chunk;
+ mp->b_wptr += chunk;
+ mp_tail->b_cont = mp;
+ mp_tail = mp;
+ }
+
+ if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
+ /*
+ * The hook consumer may elect to free the mblk_t and set
+ * our mblk_t ** to NULL. When using a viona_desb_t
+ * (dp != NULL), we do not want the corresponding cleanup to
+ * occur during the viona_hook() call. We instead want to
+ * reset and recycle dp for future use. To prevent cleanup
+ * during the viona_hook() call, we take a ref on dp (if being
+ * used), and release it on success. On failure, the
+ * freemsgchain() call will release all the refs taken earlier
+ * in viona_tx() (aside from the initial ref and the one we
+ * take), and drop_hook will reset dp for reuse.
+ */
+ if (dp != NULL)
+ dp->d_ref++;
+
+ /*
+ * Pass &mp instead of &mp_head so we don't lose track of
+	 * mp_head if the hook consumer (e.g. ipf) elects to free mp
+ * and set mp to NULL.
+ */
+ mp = mp_head;
+ if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
+ if (mp != NULL)
+ freemsgchain(mp);
+ goto drop_hook;
+ }
+
+ if (dp != NULL) {
+ dp->d_ref--;
+
+ /*
+ * It is possible that the hook(s) accepted the packet,
+ * but as part of its processing, it issued a pull-up
+ * which released all references to the desb. In that
+ * case, go back to acting like the packet is entirely
+ * copied (which it is).
+ */
+ if (dp->d_ref == 1) {
+ dp->d_cookie = 0;
+ dp->d_ref = 0;
+ dp = NULL;
+ }
+ }
+ }
+
+ /*
+ * Request hardware checksumming, if necessary. If the guest
+ * sent an LSO packet then it must have also negotiated and
+ * requested partial checksum; therefore the LSO logic is
+ * contained within viona_tx_csum().
+ */
+ if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
+ (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
+ if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
+ goto drop_fail;
+ }
+ }
+
+ if (dp != NULL) {
+ dp->d_len = len;
+ mutex_enter(&ring->vr_lock);
+ ring->vr_xfer_outstanding++;
+ mutex_exit(&ring->vr_lock);
+ } else {
+ /*
+ * If the data was cloned out of the ring, the descriptors can
+ * be marked as 'used' now, rather than deferring that action
+ * until after successful packet transmission.
+ */
+ viona_tx_done(ring, len, cookie);
+ }
+
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ smt_begin_unsafe();
+ mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
+ smt_end_unsafe();
+ return;
+
+drop_fail:
+ /*
+	 * On the off chance that memory is not available via the desballoc or
+	 * allocb calls, there is little choice but to fail and drop the frame
+	 * on the floor.
+ */
+
+ if (dp != NULL) {
+ /*
+ * Take an additional reference on the desb handle (if present)
+ * so any desballoc-sourced mblks can release their hold on it
+ * without the handle reaching its final state and executing
+ * its clean-up logic.
+ */
+ dp->d_ref++;
+ }
+
+ /*
+ * Free any already-allocated blocks and sum up the total length of the
+ * dropped data to be released to the used ring.
+ */
+ freemsgchain(mp_head);
+
+drop_hook:
+ len = 0;
+ for (uint_t i = 0; i < n; i++) {
+ len += iov[i].iov_len;
+ }
+
+ if (dp != NULL) {
+ VERIFY(dp->d_ref == 2);
+
+ /* Clean up the desb handle, releasing the extra hold. */
+ dp->d_len = 0;
+ dp->d_cookie = 0;
+ dp->d_ref = 0;
+ }
+
+ VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
+ uint16_t, cookie);
+ viona_tx_done(ring, len, cookie);
+}
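
The header-copy loop in viona_tx() splits a descriptor chain into a copied
prefix and a referenced remainder. The simplified, self-contained sketch below
ignores the leading virtio header descriptor; the segment sizes and the
header-length constant are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define	HDRS_LEN	256	/* stand-in for VIONA_MAX_HDRS_LEN */

int
main(void)
{
	uint32_t seg_len[] = { 100, 300, 1200 };	/* guest data segments */
	uint32_t min_copy = HDRS_LEN;
	uint32_t base_off = 0;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		uint32_t to_copy =
		    (min_copy < seg_len[i]) ? min_copy : seg_len[i];

		/* bcopy() of to_copy bytes into the headers mblk goes here */
		min_copy -= to_copy;
		if (min_copy == 0) {
			if (seg_len[i] == to_copy)
				i++;
			else
				base_off = to_copy;
			break;
		}
	}
	/* Segments from i onward are loaned by reference, not copied. */
	(void) printf("reference from segment %u at offset %u\n", i, base_off);
	return (0);
}
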
diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h
index a26cc00a55..46cc72eb06 100644
--- a/usr/src/uts/i86pc/sys/viona_io.h
+++ b/usr/src/uts/i86pc/sys/viona_io.h
@@ -11,36 +11,53 @@
/*
* Copyright 2013 Pluribus Networks Inc.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _VIONA_IO_H_
#define _VIONA_IO_H_
#define VNA_IOC (('V' << 16)|('C' << 8))
-#define VNA_IOC_CREATE (VNA_IOC | 1)
-#define VNA_IOC_DELETE (VNA_IOC | 2)
-#define VNA_IOC_RX_RING_INIT (VNA_IOC | 3)
-#define VNA_IOC_TX_RING_INIT (VNA_IOC | 4)
-#define VNA_IOC_RX_RING_RESET (VNA_IOC | 5)
-#define VNA_IOC_TX_RING_RESET (VNA_IOC | 6)
-#define VNA_IOC_RX_RING_KICK (VNA_IOC | 7)
-#define VNA_IOC_TX_RING_KICK (VNA_IOC | 8)
-#define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9)
-#define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10)
-#define VNA_IOC_SET_FEATURES (VNA_IOC | 11)
-#define VNA_IOC_GET_FEATURES (VNA_IOC | 12)
+#define VNA_IOC_CREATE (VNA_IOC | 0x01)
+#define VNA_IOC_DELETE (VNA_IOC | 0x02)
+
+#define VNA_IOC_RING_INIT (VNA_IOC | 0x10)
+#define VNA_IOC_RING_RESET (VNA_IOC | 0x11)
+#define VNA_IOC_RING_KICK (VNA_IOC | 0x12)
+#define VNA_IOC_RING_SET_MSI (VNA_IOC | 0x13)
+#define VNA_IOC_RING_INTR_CLR (VNA_IOC | 0x14)
+
+#define VNA_IOC_INTR_POLL (VNA_IOC | 0x20)
+#define VNA_IOC_SET_FEATURES (VNA_IOC | 0x21)
+#define VNA_IOC_GET_FEATURES (VNA_IOC | 0x22)
+#define VNA_IOC_SET_NOTIFY_IOP (VNA_IOC | 0x23)
typedef struct vioc_create {
datalink_id_t c_linkid;
- char c_vmname[64];
- size_t c_lomem_size;
- size_t c_himem_size;
+ int c_vmfd;
} vioc_create_t;
typedef struct vioc_ring_init {
+ uint16_t ri_index;
uint16_t ri_qsize;
uint64_t ri_qaddr;
} vioc_ring_init_t;
+typedef struct vioc_ring_msi {
+ uint16_t rm_index;
+ uint64_t rm_addr;
+ uint64_t rm_msg;
+} vioc_ring_msi_t;
+
+enum viona_vq_id {
+ VIONA_VQ_RX = 0,
+ VIONA_VQ_TX = 1,
+ VIONA_VQ_MAX = 2
+};
+
+typedef struct vioc_intr_poll {
+ uint32_t vip_status[VIONA_VQ_MAX];
+} vioc_intr_poll_t;
+
+
#endif /* _VIONA_IO_H_ */
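
For reference, the command values above compose arithmetically from the
VNA_IOC base; the tiny standalone check below is illustrative and not part of
the header.

#include <stdio.h>

#define	VNA_IOC			(('V' << 16) | ('C' << 8))
#define	VNA_IOC_RING_INIT	(VNA_IOC | 0x10)

int
main(void)
{
	/* 'V' is 0x56 and 'C' is 0x43, so the base is 0x564300 */
	(void) printf("VNA_IOC = 0x%x\n", VNA_IOC);
	(void) printf("VNA_IOC_RING_INIT = 0x%x\n", VNA_IOC_RING_INIT);
	return (0);
}
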
diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h
index 33fefc10ea..856b75e5cc 100644
--- a/usr/src/uts/i86pc/sys/vmm_drv.h
+++ b/usr/src/uts/i86pc/sys/vmm_drv.h
@@ -17,6 +17,9 @@
#define _VMM_DRV_H_
#ifdef _KERNEL
+
+#include <sys/file.h>
+
struct vmm_hold;
typedef struct vmm_hold vmm_hold_t;
diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile
index 4ede5bbd84..dac59c9a45 100644
--- a/usr/src/uts/i86pc/viona/Makefile
+++ b/usr/src/uts/i86pc/viona/Makefile
@@ -11,7 +11,7 @@
#
# Copyright 2013 Pluribus Networks Inc.
-# Copyright 2017 Joyent, Inc.
+# Copyright 2019 Joyent, Inc.
#
#
@@ -27,6 +27,7 @@ OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(USR_DRV_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona
+MAPFILE = $(UTSBASE)/i86pc/io/viona/viona.mapfile
#
# Include common rules.
@@ -49,8 +50,16 @@ LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2
LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2
+# needs work
+SMOFF += all_func_returns
+
+ALL_BUILDS = $(ALL_BUILDSONLY64)
+DEF_BUILDS = $(DEF_BUILDSONLY64)
+
CFLAGS += $(CCVERBOSE)
-LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm
+LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm -Nmisc/neti
+LDFLAGS += -Nmisc/hook
+LDFLAGS += -M $(MAPFILE)
#
# Default build targets.
diff --git a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 b/usr/src/uts/intel/ipf/ipf.global-objs.debug64
index 663613cee3..846011b4c5 100644
--- a/usr/src/uts/intel/ipf/ipf.global-objs.debug64
+++ b/usr/src/uts/intel/ipf/ipf.global-objs.debug64
@@ -22,13 +22,17 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# Copyright 2013 Joyent, Inc. All rights reserved
+# Copyright 2018 Joyent, Inc. All rights reserved
#
fr_availfuncs
fr_features
fr_objbytes
hdrsizes
+hook_viona_in
+hook_viona_in_gz
+hook_viona_out
+hook_viona_out_gz
hook4_in
hook4_in_gz
hook4_loop_in
@@ -58,6 +62,9 @@ ip6exthdr
ipf_cb_ops
ipf_dev_info
ipf_devfiles
+ipf_eth_bcast_addr
+ipf_eth_ipv4_mcast
+ipf_eth_ipv6_mcast
ipf_kstat_tmp
ipf_minor
ipf_ops