author     Patrick Mooney <pmooney@pfmooney.com>   2018-01-03 21:11:35 +0000
committer  Patrick Mooney <pmooney@pfmooney.com>   2020-05-19 15:55:57 +0000
commit     b22a70abf81f995ecc990b8444e63308bc389d5c
tree       5142f78f319737bcd44477e4e3cf578ccd0617e4 /usr/src
parent     d77e6e0f12d19668c0e9068c0fcd7a2123da5373
12679 want viona driver for bhyve
Portions contributed by: Ryan Zezeski <rpz@joyent.com>
Portions contributed by: John Levon <john.levon@joyent.com>
Portions contributed by: Jason King <jason.king@joyent.com>
Portions contributed by: Robert Mustacchi <rm@joyent.com>
Portions contributed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
29 files changed, 4689 insertions, 1639 deletions
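Usage note: the userland side of this change adds a "virtio-net-viona" PCI emulation to bhyve. Per pci_viona_parse_opts() in the diff below, the slot configuration string takes a bare vnic name plus optional vqsize (a power of two no larger than 32768) and feature_mask values. A hypothetical invocation, assuming a host vnic named vnic0 already exists, might pass

    -s 3,virtio-net-viona,vnic0,vqsize=1024,feature_mask=0x0

as one of the bhyve -s slot arguments; the slot number and vnic name here are illustrative only, not part of this change.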
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index e96868e006..2301e6c8a6 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -58,6 +58,7 @@ SRCS = acpi.c \ pci_virtio_console.c \ pci_virtio_net.c \ pci_virtio_rnd.c \ + pci_virtio_viona.c \ pci_xhci.c \ pm.c \ post.c \ @@ -120,7 +121,7 @@ CSTD= $(CSTD_GNU99) C99MODE= -xc99=%all C99LMODE= -Xc99=%all -$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -lmd -luuid -lvmmapi -lz +$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lmd -luuid -lvmmapi -lz $(MEVENT_TEST_PROG) := LDLIBS += -lsocket .KEEP_STATE: diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c index 5118b31534..a71cc528aa 100644 --- a/usr/src/cmd/bhyve/pci_emul.c +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -1597,6 +1597,11 @@ pci_lintr_update(struct pci_devinst *pi) pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); +#ifndef __FreeBSD__ + if (pi->pi_d->pe_lintrupdate != NULL) { + pi->pi_d->pe_lintrupdate(pi); + } +#endif /* __FreeBSD__ */ } int diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h index 853badaadb..0053caed99 100644 --- a/usr/src/cmd/bhyve/pci_emul.h +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -27,6 +27,9 @@ * * $FreeBSD$ */ +/* + * Copyright 2018 Joyent, Inc. + */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ @@ -71,6 +74,10 @@ struct pci_devemu { uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); + +#ifndef __FreeBSD__ + void (*pe_lintrupdate)(struct pci_devinst *pi); +#endif /* __FreeBSD__ */ }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c index e5a5cb584f..9cafa7b111 100644 --- a/usr/src/cmd/bhyve/pci_virtio_viona.c +++ b/usr/src/cmd/bhyve/pci_virtio_viona.c @@ -34,7 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/cdefs.h> @@ -85,18 +85,6 @@ #define VIONA_REGSZ VIONA_R_MAX+1 /* - * Host capabilities - */ -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ - -#define VIONA_S_HOSTCAPS \ - (VIRTIO_NET_F_MAC | \ - VIRTIO_NET_F_MRG_RXBUF | \ - VIRTIO_NET_F_STATUS) - -/* * Queue definitions. */ #define VIONA_RXQ 0 @@ -108,7 +96,7 @@ /* * Debug printf */ -static int pci_viona_debug; +static volatile int pci_viona_debug; #define DPRINTF(params) if (pci_viona_debug) printf params #define WPRINTF(params) printf params @@ -124,26 +112,20 @@ struct pci_viona_softc { int vsc_isr; datalink_id_t vsc_linkid; - char vsc_linkname[MAXLINKNAMELEN]; int vsc_vnafd; + /* Configurable parameters */ + char vsc_linkname[MAXLINKNAMELEN]; + uint32_t vsc_feature_mask; + uint16_t vsc_vq_size; + uint32_t vsc_features; uint8_t vsc_macaddr[6]; uint64_t vsc_pfn[VIONA_MAXQ]; uint16_t vsc_msix_table_idx[VIONA_MAXQ]; - /* - * Flag to see if host is already sending data out. - * If it is, no need to wait for lock and send interrupt to host - * for new data. 
- */ - boolean_t vsc_tx_kick_lock_held; - - pthread_t tx_tid; - pthread_mutex_t tx_mtx; - pthread_cond_t tx_cond; + boolean_t vsc_msix_active; }; -#define viona_ctx(sc) ((sc)->vsc_pi->pi_vmctx) /* * Return the size of IO BAR that maps virtio header and device specific @@ -160,47 +142,44 @@ pci_viona_iosize(struct pci_devinst *pi) } static uint16_t -pci_viona_qsize(int qnum) +pci_viona_qsize(struct pci_viona_softc *sc, int qnum) { /* XXX no ctl queue currently */ if (qnum == VIONA_CTLQ) { return (0); } - /* XXX fixed currently. Maybe different for tx/rx/ctl */ - return (VIONA_RINGSZ); + return (sc->vsc_vq_size); } static void pci_viona_ring_reset(struct pci_viona_softc *sc, int ring) { - int error; - assert(ring < VIONA_MAXQ); switch (ring) { case VIONA_RXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_RESET); - if (error != 0) { - WPRINTF(("ioctl viona rx ring reset failed %d\n", - error)); - } else { - sc->vsc_pfn[VIONA_RXQ] = 0; - } - break; case VIONA_TXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_RESET); - if (error != 0) { - WPRINTF(("ioctl viona tx ring reset failed %d\n", - error)); - } else { - sc->vsc_pfn[VIONA_TXQ] = 0; - } break; case VIONA_CTLQ: default: - break; + return; + } + + for (;;) { + int res; + + res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_RESET, ring); + if (res == 0) { + break; + } else if (errno != EINTR) { + WPRINTF(("ioctl viona ring %d reset failed %d\n", + ring, errno)); + return; + } } + + sc->vsc_pfn[ring] = 0; } static void @@ -220,11 +199,11 @@ static void * pci_viona_poll_thread(void *param) { struct pci_viona_softc *sc = param; - pollfd_t pollset; - int error; + pollfd_t pollset; + const int fd = sc->vsc_vnafd; - pollset.fd = sc->vsc_vnafd; - pollset.events = POLLIN | POLLOUT; + pollset.fd = fd; + pollset.events = POLLRDBAND; for (;;) { if (poll(&pollset, 1, -1) < 0) { @@ -236,23 +215,35 @@ pci_viona_poll_thread(void *param) break; } } - if (pollset.revents & POLLIN) { - pci_generate_msix(sc->vsc_pi, - sc->vsc_msix_table_idx[VIONA_RXQ]); - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_INTR_CLR); - if (error != 0) { - WPRINTF(("ioctl viona rx intr clear failed" - " %d\n", error)); + if (pollset.revents & POLLRDBAND) { + vioc_intr_poll_t vip; + uint_t i; + int res; + boolean_t assert_lintr = B_FALSE; + const boolean_t do_msix = pci_msix_enabled(sc->vsc_pi); + + res = ioctl(fd, VNA_IOC_INTR_POLL, &vip); + for (i = 0; res > 0 && i < VIONA_VQ_MAX; i++) { + if (vip.vip_status[i] == 0) { + continue; + } + if (do_msix) { + pci_generate_msix(sc->vsc_pi, + sc->vsc_msix_table_idx[i]); + } else { + assert_lintr = B_TRUE; + } + res = ioctl(fd, VNA_IOC_RING_INTR_CLR, i); + if (res != 0) { + WPRINTF(("ioctl viona vq %d intr " + "clear failed %d\n", i, errno)); + } } - } - - if (pollset.revents & POLLOUT) { - pci_generate_msix(sc->vsc_pi, - sc->vsc_msix_table_idx[VIONA_TXQ]); - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_INTR_CLR); - if (error != 0) { - WPRINTF(("ioctl viona tx intr clear failed" - " %d\n", error)); + if (assert_lintr) { + pthread_mutex_lock(&sc->vsc_mtx); + sc->vsc_isr |= VTCFG_ISR_QUEUES; + pci_lintr_assert(sc->vsc_pi); + pthread_mutex_unlock(&sc->vsc_mtx); } } } @@ -261,57 +252,6 @@ pci_viona_poll_thread(void *param) } static void -pci_viona_ping_rxq(struct pci_viona_softc *sc) -{ - int error; - - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_KICK); - if (error != 0) { - WPRINTF(("ioctl viona rx ring kick failed %d\n", error)); - } -} - -static void * -pci_viona_tx_thread(void *param) -{ - struct pci_viona_softc *sc = (struct pci_viona_softc *)param; - int 
error; - - pthread_mutex_lock(&sc->tx_mtx); - for (;;) { - error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); - assert(error == 0); - sc->vsc_tx_kick_lock_held = B_TRUE; - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_KICK); - if (error != 0) { - WPRINTF(("ioctl viona tx ring kick failed %d\n", - error)); - } - sc->vsc_tx_kick_lock_held = B_FALSE; - } - pthread_mutex_unlock(&sc->tx_mtx); - - return (NULL); -} - -static void -pci_viona_ping_txq(struct pci_viona_softc *sc) -{ - /* Signal the tx thread for processing */ - if (sc->vsc_tx_kick_lock_held) - return; - pthread_mutex_lock(&sc->tx_mtx); - pthread_cond_signal(&sc->tx_cond); - pthread_mutex_unlock(&sc->tx_mtx); -} - -static void -pci_viona_ping_ctlq(struct pci_viona_softc *sc) -{ - DPRINTF(("viona: control qnotify!\n\r")); -} - -static void pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn) { int qnum = sc->vsc_curq; @@ -320,29 +260,19 @@ pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn) assert(qnum < VIONA_MAXQ); + if (qnum == VIONA_CTLQ) { + return; + } + sc->vsc_pfn[qnum] = (pfn << VRING_PFN); - vna_ri.ri_qsize = pci_viona_qsize(qnum); + vna_ri.ri_index = qnum; + vna_ri.ri_qsize = pci_viona_qsize(sc, qnum); vna_ri.ri_qaddr = (pfn << VRING_PFN); + error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_INIT, &vna_ri); - switch (qnum) { - case VIONA_RXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_INIT, &vna_ri); - if (error != 0) { - WPRINTF(("ioctl viona rx ring init failed %d\n", - error)); - } - break; - case VIONA_TXQ: - error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_INIT, &vna_ri); - if (error != 0) { - WPRINTF(("ioctl viona tx ring init failed %d\n", - error)); - } - break; - case VIONA_CTLQ: - default: - break; + if (error != 0) { + WPRINTF(("ioctl viona ring %u init failed %d\n", qnum, errno)); } } @@ -350,30 +280,20 @@ static int pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) { vioc_create_t vna_create; -#if notyet - char devname[MAXNAMELEN]; - int ctlfd; -#endif int error; - sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL); + sc->vsc_vnafd = open("/dev/viona", O_RDWR | O_EXCL); if (sc->vsc_vnafd == -1) { - WPRINTF(("open viona ctl failed\n")); + WPRINTF(("open viona ctl failed: %d\n", errno)); return (-1); } vna_create.c_linkid = sc->vsc_linkid; - strlcpy(vna_create.c_vmname, vmname, - sizeof (vna_create.c_vmname)); -#if notyet - vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size, - NULL); - vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL), - &vna_create.c_himem_size, NULL); -#endif + vna_create.c_vmfd = vm_get_device_fd(ctx); error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create); if (error != 0) { - WPRINTF(("ioctl viona create failed %d\n", error)); + (void) close(sc->vsc_vnafd); + WPRINTF(("ioctl viona create failed %d\n", errno)); return (-1); } @@ -381,15 +301,99 @@ pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) } static int +pci_viona_parse_opts(struct pci_viona_softc *sc, char *opts) +{ + char *next, *cp, *vnic = NULL; + int err = 0; + + sc->vsc_vq_size = VIONA_RINGSZ; + sc->vsc_feature_mask = 0; + + for (; opts != NULL && *opts != '\0'; opts = next) { + char *val; + + if ((cp = strchr(opts, ',')) != NULL) { + *cp = '\0'; + next = cp + 1; + } else { + next = NULL; + } + + if ((cp = strchr(opts, '=')) == NULL) { + /* vnic chosen with bare name */ + if (vnic != NULL) { + fprintf(stderr, + "viona: unexpected vnic name '%s'", opts); + err = -1; + } else { + vnic = opts; + } + continue; + } + + /* <param>=<value> handling */ 
+ val = cp + 1; + *cp = '\0'; + if (strcmp(opts, "feature_mask") == 0) { + long num; + + errno = 0; + num = strtol(val, NULL, 0); + if (errno != 0 || num < 0) { + fprintf(stderr, + "viona: invalid mask '%s'", val); + } else { + sc->vsc_feature_mask = num; + } + } else if (strcmp(opts, "vqsize") == 0) { + long num; + + errno = 0; + num = strtol(val, NULL, 0); + if (errno != 0) { + fprintf(stderr, + "viona: invalid vsqize '%s'", val); + err = -1; + } else if (num <= 2 || num > 32768) { + fprintf(stderr, + "viona: vqsize out of range", num); + err = -1; + } else if ((1 << (ffs(num) - 1)) != num) { + fprintf(stderr, + "viona: vqsize must be power of 2", num); + err = -1; + } else { + sc->vsc_vq_size = num; + } + } else { + fprintf(stderr, + "viona: unrecognized option '%s'", opts); + err = -1; + } + } + if (vnic == NULL) { + fprintf(stderr, "viona: vnic name required"); + sc->vsc_linkname[0] = '\0'; + err = -1; + } else { + (void) strlcpy(sc->vsc_linkname, vnic, MAXLINKNAMELEN); + } + + DPRINTF(("viona=%p dev=%s vqsize=%x feature_mask=%x\n", sc, + sc->vsc_linkname, sc->vsc_vq_size, sc->vsc_feature_mask)); + return (err); +} + +static int pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { dladm_handle_t handle; dladm_status_t status; dladm_vnic_attr_t attr; char errmsg[DLADM_STRSIZE]; - int error; + int error, i; struct pci_viona_softc *sc; - int i; + uint64_t ioport; if (opts == NULL) { printf("virtio-viona: vnic required\n"); @@ -404,7 +408,10 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pthread_mutex_init(&sc->vsc_mtx, NULL); - strlcpy(sc->vsc_linkname, opts, MAXLINKNAMELEN); + if (pci_viona_parse_opts(sc, opts) != 0) { + free(sc); + return (1); + } if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) { WPRINTF(("could not open /dev/dld")); @@ -430,7 +437,6 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) return (1); } - sc->vsc_tx_kick_lock_held = B_FALSE; memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL); dladm_close(handle); @@ -449,42 +455,44 @@ pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); /* MSI-X support */ for (i = 0; i < VIONA_MAXQ; i++) sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR; - /* - * BAR 1 used to map MSI-X table and PBA - */ + /* BAR 1 used to map MSI-X table and PBA */ if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) { free(sc); return (1); } - pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ); + /* BAR 0 for legacy-style virtio register access. */ + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ); + if (error != 0) { + WPRINTF(("could not allocate virtio BAR\n")); + free(sc); + return (1); + } + + /* Install ioport hook for virtqueue notification */ + ioport = pi->pi_bar[0].addr + VTCFG_R_QNOTIFY; + error = ioctl(sc->vsc_vnafd, VNA_IOC_SET_NOTIFY_IOP, ioport); + if (error != 0) { + WPRINTF(("could not install ioport hook at %x\n", ioport)); + free(sc); + return (1); + } /* - * Initialize tx semaphore & spawn TX processing thread - * As of now, only one thread for TX desc processing is - * spawned. + * Need a legacy interrupt for virtio compliance, even though MSI-X + * operation is _strongly_ suggested for adequate performance. 
*/ - pthread_mutex_init(&sc->tx_mtx, NULL); - pthread_cond_init(&sc->tx_cond, NULL); - pthread_create(&sc->tx_tid, NULL, pci_viona_tx_thread, (void *)sc); + pci_lintr_request(pi); return (0); } -/* - * Function pointer array to handle queue notifications - */ -static void (*pci_viona_qnotify[VIONA_MAXQ])(struct pci_viona_softc *) = { - pci_viona_ping_rxq, - pci_viona_ping_txq, - pci_viona_ping_ctlq -}; - static uint64_t viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) { @@ -501,6 +509,109 @@ viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) } static void +pci_viona_ring_set_msix(struct pci_devinst *pi, uint_t ring) +{ + struct pci_viona_softc *sc = pi->pi_arg; + struct msix_table_entry mte; + uint16_t tab_index; + vioc_ring_msi_t vrm; + int res; + + assert(ring <= VIONA_VQ_TX); + + vrm.rm_index = ring; + vrm.rm_addr = 0; + vrm.rm_msg = 0; + tab_index = sc->vsc_msix_table_idx[ring]; + + if (tab_index != VIRTIO_MSI_NO_VECTOR && sc->vsc_msix_active) { + mte = pi->pi_msix.table[tab_index]; + if ((mte.vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + vrm.rm_addr = mte.addr; + vrm.rm_msg = mte.msg_data; + } + } + + res = ioctl(sc->vsc_vnafd, VNA_IOC_RING_SET_MSI, &vrm); + if (res != 0) { + WPRINTF(("ioctl viona set_msi %d failed %d\n", ring, errno)); + } +} + +static void +pci_viona_lintrupdate(struct pci_devinst *pi) +{ + struct pci_viona_softc *sc = pi->pi_arg; + boolean_t msix_on = B_FALSE; + + pthread_mutex_lock(&sc->vsc_mtx); + msix_on = pci_msix_enabled(pi) && (pi->pi_msix.function_mask == 0); + if ((sc->vsc_msix_active && !msix_on) || + (msix_on && !sc->vsc_msix_active)) { + uint_t i; + + sc->vsc_msix_active = msix_on; + /* Update in-kernel ring configs */ + for (i = 0; i <= VIONA_VQ_TX; i++) { + pci_viona_ring_set_msix(pi, i); + } + } + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_viona_msix_update(struct pci_devinst *pi, uint64_t offset) +{ + struct pci_viona_softc *sc = pi->pi_arg; + uint_t tab_index, i; + + pthread_mutex_lock(&sc->vsc_mtx); + if (!sc->vsc_msix_active) { + pthread_mutex_unlock(&sc->vsc_mtx); + return; + } + + /* + * Rather than update every possible MSI-X vector, cheat and use the + * offset to calculate the entry within the table. Since this should + * only be called when a write to the table succeeds, the index should + * be valid. 
+ */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + for (i = 0; i <= VIONA_VQ_TX; i++) { + if (sc->vsc_msix_table_idx[i] != tab_index) { + continue; + } + pci_viona_ring_set_msix(pi, i); + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +static void +pci_viona_qnotify(struct pci_viona_softc *sc, int ring) +{ + int error; + + switch (ring) { + case VIONA_TXQ: + case VIONA_RXQ: + error = ioctl(sc->vsc_vnafd, VNA_IOC_RING_KICK, ring); + if (error != 0) { + WPRINTF(("ioctl viona ring %d kick failed %d\n", + ring, errno)); + } + break; + case VIONA_CTLQ: + DPRINTF(("viona: control qnotify!\n")); + break; + default: + break; + } +} + +static void pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { @@ -510,7 +621,9 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { - pci_emul_msix_twrite(pi, offset, size, value); + if (pci_emul_msix_twrite(pi, offset, size, value) == 0) { + pci_viona_msix_update(pi, offset); + } return; } @@ -529,10 +642,14 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, switch (offset) { case VTCFG_R_GUESTCAP: assert(size == 4); + value &= ~(sc->vsc_feature_mask); err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value); - if (err != 0) + if (err != 0) { WPRINTF(("ioctl feature negotiation returned" - " err = %d\n", err)); + " err = %d\n", errno)); + } else { + sc->vsc_features = value; + } break; case VTCFG_R_PFN: assert(size == 4); @@ -546,7 +663,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, case VTCFG_R_QNOTIFY: assert(size == 2); assert(value < VIONA_MAXQ); - (*pci_viona_qnotify[value])(sc); + pci_viona_qnotify(sc, value); break; case VTCFG_R_STATUS: assert(size == 1); @@ -560,6 +677,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, assert(size == 2); assert(sc->vsc_curq != VIONA_CTLQ); sc->vsc_msix_table_idx[sc->vsc_curq] = value; + pci_viona_ring_set_msix(pi, sc->vsc_curq); break; case VIONA_R_CFG0: case VIONA_R_CFG1: @@ -597,7 +715,7 @@ pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, pthread_mutex_unlock(&sc->vsc_mtx); } -uint64_t +static uint64_t pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size) { @@ -627,9 +745,11 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, case VTCFG_R_HOSTCAP: assert(size == 4); err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value); - if (err != 0) + if (err != 0) { WPRINTF(("ioctl get host features returned" - " err = %d\n", err)); + " err = %d\n", errno)); + } + value &= ~sc->vsc_feature_mask; break; case VTCFG_R_GUESTCAP: assert(size == 4); @@ -641,7 +761,7 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, break; case VTCFG_R_QNUM: assert(size == 2); - value = pci_viona_qsize(sc->vsc_curq); + value = pci_viona_qsize(sc, sc->vsc_curq); break; case VTCFG_R_QSEL: assert(size == 2); @@ -659,6 +779,9 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, assert(size == 1); value = sc->vsc_isr; sc->vsc_isr = 0; /* a read clears this flag */ + if (value != 0) { + pci_lintr_deassert(pi); + } break; case VTCFG_R_CFGVEC: assert(size == 2); @@ -705,9 +828,10 @@ pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, } struct pci_devemu pci_de_viona = { - .pe_emu = "virtio-net-viona", + .pe_emu = "virtio-net-viona", .pe_init = pci_viona_init, .pe_barwrite = 
pci_viona_write, - .pe_barread = pci_viona_read + .pe_barread = pci_viona_read, + .pe_lintrupdate = pci_viona_lintrupdate }; PCI_EMUL_SET(pci_de_viona); diff --git a/usr/src/cmd/devfsadm/i386/misc_link_i386.c b/usr/src/cmd/devfsadm/i386/misc_link_i386.c index 4aeea7d294..0f8e64551d 100644 --- a/usr/src/cmd/devfsadm/i386/misc_link_i386.c +++ b/usr/src/cmd/devfsadm/i386/misc_link_i386.c @@ -85,6 +85,9 @@ static devfsadm_create_t misc_cbt[] = { { "pseudo", "ddi_pseudo", "ucode", TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, }, + { "pseudo", "ddi_pseudo", "viona", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, + }, { "pseudo", "ddi_pseudo", "vmm", TYPE_EXACT | DRV_EXACT, ILEVEL_0, vmmctl, } @@ -114,6 +117,9 @@ static devfsadm_remove_t misc_remove_cbt[] = { { "serial", "^tty[a-z]$", RM_ALWAYS | RM_PRE, ILEVEL_1, devfsadm_rm_all }, + { "pseudo", "^viona$", RM_ALWAYS | RM_PRE | RM_HOT, + ILEVEL_0, devfsadm_rm_all + }, { "pseudo", "^vmmctl$", RM_ALWAYS | RM_PRE | RM_HOT, ILEVEL_0, devfsadm_rm_all } diff --git a/usr/src/man/man9e/mac.9e b/usr/src/man/man9e/mac.9e index 3a3f2ae90a..d3d066a564 100644 --- a/usr/src/man/man9e/mac.9e +++ b/usr/src/man/man9e/mac.9e @@ -570,24 +570,28 @@ The following set of flags may be combined through a bitwise inclusive OR: .Bl -tag -width Ds .It Sy HCKSUM_INET_PARTIAL This indicates that the hardware can calculate a partial checksum for -both IPv4 and IPv6; however, it requires the pseudo-header checksum be -calculated for it. +both IPv4 and IPv6 UDP and TCP packets; however, it requires the pseudo-header +checksum be calculated for it. The pseudo-header checksum will be available for the mblk_t when calling .Xr mac_hcksum_get 9F . -Note this does not imply that the hardware is capable of calculating the -IPv4 header checksum. +Note this does not imply that the hardware is capable of calculating +the partial checksum for other L4 protocols or the IPv4 header checksum. That should be indicated with the .Sy HCKSUM_IPHDRCKSUM flag. .It Sy HCKSUM_INET_FULL_V4 -This indicates that the hardware will fully calculate the L4 checksum -for outgoing IPv4 packets and does not require a pseudo-header checksum. +This indicates that the hardware will fully calculate the L4 checksum for +outgoing IPv4 UDP or TCP packets only, and does not require a pseudo-header +checksum. Note this does not imply that the hardware is capable of calculating the -IPv4 header checksum. +checksum for other L4 protocols or the IPv4 header checksum. That should be indicated with the .Sy HCKSUM_IPHDRCKSUM . .It Sy HCKSUM_INET_FULL_V6 -This indicates that the hardware will fully calculate the L4 checksum -for outgoing IPv6 packets and does not require a pseudo-header checksum. +This indicates that the hardware will fully calculate the L4 checksum for +outgoing IPv6 UDP or TCP packets only, and does not require a pseudo-header +checksum. +Note this does not imply that the hardware is capable of calculating the +checksum for any other L4 protocols. .It Sy HCKSUM_IPHDRCKSUM This indicates that the hardware supports calculating the checksum for the IPv4 header itself. 
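The mac(9E) and dlpi.h updates above narrow what each HCKSUM_* capability flag promises: the partial and full checksum flags cover IPv4/IPv6 TCP and UDP only, and the IPv4 header checksum is offloaded only when HCKSUM_IPHDRCKSUM is also advertised. For context, a NIC driver reports these flags from its mc_getcapab(9E) entry point when the MAC layer queries MAC_CAPAB_HCKSUM; the sketch below is not part of this change and uses a hypothetical driver name, but shows the shape of such an advertisement.

    #include <sys/types.h>
    #include <sys/dlpi.h>           /* HCKSUM_* flag definitions */
    #include <sys/mac_provider.h>   /* mac_capab_t, MAC_CAPAB_HCKSUM */

    /*
     * Hypothetical mc_getcapab(9E) handler: claim full L4 checksum offload
     * for IPv4 TCP/UDP plus IPv4 header checksum offload.  IPv6 and other
     * L4 protocols would still be checksummed in software by the stack.
     */
    static boolean_t
    mydrv_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
    {
            switch (cap) {
            case MAC_CAPAB_HCKSUM: {
                    uint32_t *txflags = cap_data;

                    *txflags = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM;
                    return (B_TRUE);
            }
            default:
                    return (B_FALSE);
            }
    }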
diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf index 2a51d4fc22..7fdeb81254 100644 --- a/usr/src/pkg/manifests/system-bhyve.mf +++ b/usr/src/pkg/manifests/system-bhyve.mf @@ -35,8 +35,11 @@ dir path=usr group=sys dir path=usr/kernel/drv group=sys dir path=usr/kernel/drv/$(ARCH64) group=sys dir path=usr/sbin +driver name=viona driver name=vmm +file path=usr/kernel/drv/$(ARCH64)/viona file path=usr/kernel/drv/$(ARCH64)/vmm +file path=usr/kernel/drv/viona.conf file path=usr/kernel/drv/vmm.conf file path=usr/sbin/bhyve mode=0555 file path=usr/sbin/bhyvectl mode=0555 diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c index 6c5868ddde..143077ed32 100644 --- a/usr/src/uts/common/inet/ip/ip6_output.c +++ b/usr/src/uts/common/inet/ip/ip6_output.c @@ -23,6 +23,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -866,8 +867,16 @@ ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h, ixa->ixa_raw_cksum_offset); cksum = htons(protocol); } else if (protocol == IPPROTO_ICMPV6) { - cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); - cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ + /* + * Currently we assume no HW support for ICMP checksum calc. + * + * When HW support is advertised for ICMP, we'll want the + * following to be set: + * cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); + * cksum = IP_ICMPV6_CSUM_COMP; Pseudo-header cksum + */ + + return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); } else { ip_hdr_cksum: /* No IP header checksum for IPv6 */ diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c index 1017240521..a0157d3c48 100644 --- a/usr/src/uts/common/inet/ip/ip_output.c +++ b/usr/src/uts/common/inet/ip/ip_output.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -1738,6 +1739,13 @@ ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, #endif sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); goto ip_hdr_cksum; + } else if (protocol == IPPROTO_ICMP) { + /* + * Note that we always calculate a SW checksum for ICMP. In the + * future, if HW support for ICMP is advertised, we can change + * this. 
+ */ + return (ip_output_sw_cksum_v4(mp, ipha, ixa)); } else { ip_hdr_cksum: /* Calculate IPv4 header checksum */ diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index b80cf53882..2e55e6fab8 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -22,6 +22,7 @@ static const char rcsid[] = "@(#)$Id: ip_fil_solaris.c,v 2.62.2.19 2005/07/13 21 #include <sys/filio.h> #include <sys/systm.h> #include <sys/strsubr.h> +#include <sys/strsun.h> #include <sys/cred.h> #include <sys/ddi.h> #include <sys/sunddi.h> @@ -84,9 +85,19 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); + +static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *)); +static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t, + void *)); + extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); +static int ipf_hook_protocol_notify __P((hook_notify_cmd_t, void *, + const char *, const char *, const char *)); +static int ipf_hook_instance_notify __P((hook_notify_cmd_t, void *, + const char *, const char *, const char *)); + #if SOLARIS2 < 10 #if SOLARIS2 >= 7 u_int *ip_ttl_ptr = NULL; @@ -153,6 +164,12 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; char *hook6_loop_out = "ipfilter_hook6_loop_out"; char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +/* viona hook names */ +char *hook_viona_in = "ipfilter_hookviona_in"; +char *hook_viona_in_gz = "ipfilter_hookviona_in_gz"; +char *hook_viona_out = "ipfilter_hookviona_out"; +char *hook_viona_out_gz = "ipfilter_hookviona_out_gz"; + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. */ @@ -249,8 +266,40 @@ ipf_stack_t *ifs; ifs->ifs_ipf_ipv4 = NULL; } + /* + * Remove notification of viona hooks + */ + net_instance_notify_unregister(ifs->ifs_netid, + ipf_hook_instance_notify); + #undef UNDO_HOOK + /* + * Normally, viona will unregister itself before ipldetach() is called, + * so these will be no-ops, but out of caution, we try to make sure + * we've removed any of our references. + */ + (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, + NH_PHYSICAL_IN); + (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, + NH_PHYSICAL_OUT); + + { + char netidstr[12]; /* Large enough for INT_MAX + NUL */ + (void) snprintf(netidstr, sizeof (netidstr), "%d", + ifs->ifs_netid); + + /* + * The notify callbacks expect the netid value passed as a + * string in the third argument. To prevent confusion if + * traced, we pass the same value the nethook framework would + * pass, even though the callback does not currently use the + * value. + */ + (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr, + NULL, Hn_VIONA); + } + #ifdef IPFDEBUG cmn_err(CE_CONT, "ipldetach()\n"); #endif @@ -446,6 +495,21 @@ ipf_stack_t *ifs; } /* + * VIONA INET hooks. While the nethook framework allows us to register + * hooks for events that haven't been registered yet, we instead + * register and unregister our hooks in response to notifications + * about the viona hooks from the nethook framework. This prevents + * problems when the viona module gets unloaded while the ipf module + * does not. 
If we do not unregister our hooks after the viona module + * is unloaded, the viona module cannot later re-register them if it + * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded + * even on DEBUG kernels, they do not experience this issue. + */ + if (net_instance_notify_register(id, ipf_hook_instance_notify, + ifs) != 0) + goto hookup_failed; + + /* * Reacquire ipf_global, now it is safe. */ WRITE_ENTER(&ifs->ifs_ipf_global); @@ -508,6 +572,155 @@ hookup_failed: return -1; } +/* ------------------------------------------------------------------------ */ +/* + * Called whenever a nethook protocol is registered or unregistered. Currently + * only used to add or remove the hooks for viona. + * + * While the function signature requires returning int, nothing + * in usr/src/uts/common/io/hook.c that invokes the callbacks + * captures the return value (nor is there currently any documentation + * on what return values should be). For now at least, we'll return 0 + * on success (or 'not applicable') or an error value. Even if the + * nethook framework doesn't use the return address, it can be observed via + * dtrace if needed. + */ +static int +ipf_hook_protocol_notify(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy __unused, const char *he_name) +{ + ipf_stack_t *ifs = arg; + hook_t **hookpp; + char *hook_name, *hint_name; + hook_func_t hookfn; + boolean_t *hookedp; + hook_hint_t hint; + boolean_t out; + int ret = 0; + + const boolean_t gz = ifs->ifs_gz_controlled; + + /* We currently only care about viona hooks notifications */ + if (strcmp(name, Hn_VIONA) != 0) + return (0); + + if (strcmp(he_name, NH_PHYSICAL_IN) == 0) { + out = B_FALSE; + } else if (strcmp(he_name, NH_PHYSICAL_OUT) == 0) { + out = B_TRUE; + } else { + /* + * If we've added more hook events to viona, we must add + * the corresponding handling here (even if it's just to + * ignore it) to prevent the firewall from not working as + * intended. + */ + cmn_err(CE_PANIC, "%s: unhandled hook event %s", __func__, + he_name); + + return (0); + } + + if (out) { + hookpp = &ifs->ifs_ipfhookviona_out; + hookfn = ipf_hookviona_out; + hookedp = &ifs->ifs_hookviona_physical_out; + name = gz ? hook_viona_out_gz : hook_viona_out; + hint = gz ? HH_AFTER : HH_BEFORE; + hint_name = gz ? hook_viona_out : hook_viona_out_gz; + } else { + hookpp = &ifs->ifs_ipfhookviona_in; + hookfn = ipf_hookviona_in; + hookedp = &ifs->ifs_hookviona_physical_in; + name = gz ? hook_viona_in_gz : hook_viona_in; + hint = gz ? HH_BEFORE : HH_AFTER; + hint_name = gz ? hook_viona_in : hook_viona_in_gz; + } + + switch (command) { + default: + case HN_NONE: + break; + case HN_REGISTER: + HOOK_INIT(*hookpp, hookfn, (char *)name, ifs); + (*hookpp)->h_hint = hint; + (*hookpp)->h_hintvalue = (uintptr_t)hint_name; + ret = net_hook_register(ifs->ifs_ipf_viona, + (char *)he_name, *hookpp); + if (ret != 0) { + cmn_err(CE_NOTE, "%s: could not register hook " + "(hook family=%s hook=%s) err=%d", __func__, + name, he_name, ret); + *hookedp = B_FALSE; + return (ret); + } + *hookedp = B_TRUE; + break; + case HN_UNREGISTER: + if (ifs->ifs_ipf_viona == NULL) + break; + + ret = *hookedp ? net_hook_unregister(ifs->ifs_ipf_viona, + (char *)he_name, *hookpp) : 0; + if ((ret == 0 || ret == ENXIO)) { + if (*hookpp != NULL) { + hook_free(*hookpp); + *hookpp = NULL; + } + *hookedp = B_FALSE; + } + break; + } + + return (ret); +} + +/* + * Called whenever a new nethook instance is created. Currently only used + * with the Hn_VIONA nethooks. 
Similar to ipf_hook_protocol_notify, the out + * function signature must return an int, though the result is never used. + * We elect to return 0 on success (or not applicable) or a non-zero value + * on error. + */ +static int +ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy __unused, const char *instance) +{ + ipf_stack_t *ifs = arg; + int ret = 0; + + /* We currently only care about viona hooks */ + if (strcmp(instance, Hn_VIONA) != 0) + return (0); + + switch (command) { + case HN_NONE: + default: + return (0); + case HN_REGISTER: + ifs->ifs_ipf_viona = net_protocol_lookup(ifs->ifs_netid, + NHF_VIONA); + + if (ifs->ifs_ipf_viona == NULL) + return (EPROTONOSUPPORT); + + ret = net_protocol_notify_register(ifs->ifs_ipf_viona, + ipf_hook_protocol_notify, ifs); + VERIFY(ret == 0 || ret == ESHUTDOWN); + break; + case HN_UNREGISTER: + if (ifs->ifs_ipf_viona == NULL) + break; + VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona, + ipf_hook_protocol_notify)); + VERIFY0(net_protocol_release(ifs->ifs_ipf_viona)); + ifs->ifs_ipf_viona = NULL; + break; + } + + return (ret); +} + static int fr_setipfloopback(set, ifs) int set; ipf_stack_t *ifs; @@ -2043,6 +2256,124 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg) return ipf_hook6(info, 1, FI_NOCKSUM, arg); } +/* Static constants used by ipf_hook_ether */ +static uint8_t ipf_eth_bcast_addr[ETHERADDRL] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; +static uint8_t ipf_eth_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; +static uint8_t ipf_eth_ipv6_mcast[2] = { 0x33, 0x33 }; + +/* ------------------------------------------------------------------------ */ +/* Function: ipf_hook_ether */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: token(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The ipf_hook_ether hook is currently private to illumos. It represents */ +/* a layer 2 datapath generally used by virtual machines. Currently the */ +/* hook is only used by the viona driver to pass along L2 frames for */ +/* inspection. It requires that the L2 ethernet header is contained within */ +/* a single dblk_t (however layers above the L2 header have no restrctions */ +/* in ipf). ipf does not currently support filtering on L2 fields (e.g. */ +/* filtering on a MAC address or ethertype), however virtual machines do */ +/* not have native IP stack instances where ipf traditionally hooks in. */ +/* Instead this entry point is used to determine if the packet is unicast, */ +/* broadcast, or multicast. The IPv4 or IPv6 packet is then passed to the */ +/* traditional ip hooks for filtering. Non IPv4 or non IPv6 packets are */ +/* not subject to examination. */ +/* ------------------------------------------------------------------------ */ +int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg, + boolean_t out) +{ + struct ether_header *ethp; + hook_pkt_event_t *hpe = (hook_pkt_event_t *)info; + mblk_t *mp; + size_t offset, len; + uint16_t etype; + boolean_t v6; + + /* + * viona will only pass us mblks with the L2 header contained in a + * single data block. 
+ */ + mp = *hpe->hpe_mp; + len = MBLKL(mp); + + VERIFY3S(len, >=, sizeof (struct ether_header)); + + ethp = (struct ether_header *)mp->b_rptr; + if ((etype = ntohs(ethp->ether_type)) == ETHERTYPE_VLAN) { + struct ether_vlan_header *evh = + (struct ether_vlan_header *)ethp; + + VERIFY3S(len, >=, sizeof (struct ether_vlan_header)); + + etype = ntohs(evh->ether_type); + offset = sizeof (*evh); + } else { + offset = sizeof (*ethp); + } + + /* + * ipf only support filtering IPv4 and IPv6. Ignore other types. + */ + if (etype == ETHERTYPE_IP) + v6 = B_FALSE; + else if (etype == ETHERTYPE_IPV6) + v6 = B_TRUE; + else + return (0); + + if (bcmp(ipf_eth_bcast_addr, ethp, ETHERADDRL) == 0) + hpe->hpe_flags |= HPE_BROADCAST; + else if (bcmp(ipf_eth_ipv4_mcast, ethp, + sizeof (ipf_eth_ipv4_mcast)) == 0) + hpe->hpe_flags |= HPE_MULTICAST; + else if (bcmp(ipf_eth_ipv6_mcast, ethp, + sizeof (ipf_eth_ipv6_mcast)) == 0) + hpe->hpe_flags |= HPE_MULTICAST; + + /* Find the start of the IPv4 or IPv6 header */ + for (; offset >= len; len = MBLKL(mp)) { + offset -= len; + mp = mp->b_cont; + if (mp == NULL) { + freemsg(*hpe->hpe_mp); + *hpe->hpe_mp = NULL; + return (-1); + } + } + hpe->hpe_mb = mp; + hpe->hpe_hdr = mp->b_rptr + offset; + + return (v6 ? ipf_hook6(info, out, 0, arg) : + ipf_hook(info, out, 0, arg)); +} + +/* ------------------------------------------------------------------------ */ +/* Function: ipf_hookviona_{in,out} */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The viona hooks are private hooks to illumos. They represents a layer 2 */ +/* datapath generally used to implement virtual machines. */ +/* along L2 packets. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. */ +/* ------------------------------------------------------------------------ */ +int +ipf_hookviona_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return (ipf_hook_ether(token, info, arg, B_FALSE)); +} + +int +ipf_hookviona_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return (ipf_hook_ether(token, info, arg, B_TRUE)); +} + /* ------------------------------------------------------------------------ */ /* Function: ipf_hook4_loop_in */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ @@ -2386,7 +2717,7 @@ fr_info_t *fin; #ifdef USE_INET6 struct in6_addr tmp_src6; #endif - + ASSERT(fin->fin_p == IPPROTO_TCP); /* @@ -2428,7 +2759,7 @@ fr_info_t *fin; #endif if (tcp != NULL) { - /* + /* * Adjust TCP header: * swap ports, * set flags, diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index a239f1c1ca..0ceea1e921 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. All rights reserved. 
*/ #ifndef __IPF_STACK_H__ @@ -87,8 +87,8 @@ struct ipf_stack { #endif int ifs_ipf_locks_done; - ipftoken_t *ifs_ipftokenhead; - ipftoken_t **ifs_ipftokentail; + ipftoken_t *ifs_ipftokenhead; + ipftoken_t **ifs_ipftokentail; ipfmutex_t ifs_ipl_mutex; ipfmutex_t ifs_ipf_authmx; @@ -126,6 +126,9 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookviona_in; + hook_t *ifs_ipfhookviona_out; + /* flags to indicate whether hooks are registered. */ boolean_t ifs_hook4_physical_in; boolean_t ifs_hook4_physical_out; @@ -137,10 +140,13 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookviona_physical_in; + boolean_t ifs_hookviona_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_viona; /* ip_auth.c */ int ifs_fr_authsize; @@ -167,8 +173,8 @@ struct ipf_stack { ipfr_t **ifs_ipfr_nattail; ipfr_t **ifs_ipfr_nattab; - ipfr_t *ifs_ipfr_ipidlist; - ipfr_t **ifs_ipfr_ipidtail; + ipfr_t *ifs_ipfr_ipidlist; + ipfr_t **ifs_ipfr_ipidtail; ipfr_t **ifs_ipfr_ipidtab; ipfrstat_t ifs_ipfr_stats; diff --git a/usr/src/uts/common/io/hook.c b/usr/src/uts/common/io/hook.c index eb139a37e2..44af26e7c4 100644 --- a/usr/src/uts/common/io/hook.c +++ b/usr/src/uts/common/io/hook.c @@ -1050,7 +1050,7 @@ hook_family_free(hook_family_int_t *hfi, hook_stack_t *hks) /* Free container */ kmem_free(hfi, sizeof (*hfi)); - if (hks->hks_shutdown == 2) + if (hks != NULL && hks->hks_shutdown == 2) hook_stack_remove(hks); mutex_exit(&hook_stack_lock); diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 5bc2bd41c5..54aad9307a 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -673,11 +674,11 @@ typedef struct { #define HCKSUM_ENABLE 0x01 /* Set to enable hardware checksum */ /* capability */ #define HCKSUM_INET_PARTIAL 0x02 /* Partial 1's complement checksum */ - /* ability */ + /* ability for TCP/UDP packets. */ #define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */ - /* ability for IPv4 packets. */ + /* ability for IPv4 TCP/UDP packets. */ #define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */ - /* ability for IPv6 packets. */ + /* ability for IPv6 TCP/UDP packets. */ #define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */ /* capability */ #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/hook_impl.h b/usr/src/uts/common/sys/hook_impl.h index d8a15f0fe5..f3337bbacf 100644 --- a/usr/src/uts/common/sys/hook_impl.h +++ b/usr/src/uts/common/sys/hook_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018, Joyent, Inc. 
*/ /* @@ -171,7 +172,7 @@ typedef struct hook_family_int { cvwaitlock_t hfi_lock; SLIST_ENTRY(hook_family_int) hfi_entry; hook_event_int_head_t hfi_head; - hook_family_t hfi_family; + hook_family_t hfi_family; kstat_t *hfi_kstat; struct hook_stack *hfi_stack; hook_notify_head_t hfi_nhead; @@ -209,6 +210,7 @@ typedef struct hook_stack_head hook_stack_head_t; #define Hn_ARP "arp" #define Hn_IPV4 "inet" #define Hn_IPV6 "inet6" +#define Hn_VIONA "viona_inet" extern int hook_run(hook_family_int_t *, hook_event_token_t, hook_data_t); extern int hook_register(hook_family_int_t *, char *, hook_t *); diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index b21504109c..e7027f8ece 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018, Joyent, Inc. */ #ifndef _SYS_NETI_H @@ -46,6 +48,7 @@ struct msgb; /* avoiding sys/stream.h here */ #define NHF_INET "NHF_INET" #define NHF_INET6 "NHF_INET6" #define NHF_ARP "NHF_ARP" +#define NHF_VIONA "NHF_VIONA" /* * Event identification @@ -61,7 +64,7 @@ struct msgb; /* avoiding sys/stream.h here */ /* * Network NIC hardware checksum capability */ -#define NET_HCK_NONE 0x00 +#define NET_HCK_NONE 0x00 #define NET_HCK_L3_FULL 0x01 #define NET_HCK_L3_PART 0x02 #define NET_HCK_L4_FULL 0x10 diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index ca4ae0cd65..312c0f233d 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -276,7 +276,11 @@ VMM_OBJS += vmm.o \ vmm_support.o \ vmm_zsd.o -VIONA_OBJS += viona.o +VIONA_OBJS += viona_main.o \ + viona_ring.o \ + viona_rx.o \ + viona_tx.o \ + viona_hook.o \ # # Build up defines and paths. diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index b66b0ca2da..b60d24d82c 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -247,6 +247,7 @@ DRV_KMODS += ioat DRV_KMODS += fipe DRV_KMODS += imc imcstub DRV_KMODS += vmm +DRV_KMODS += viona DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c deleted file mode 100644 index 2371a2f3ae..0000000000 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * Copyright (c) 2013 Chris Torek <torek @ torek net> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2015 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. - */ - -#include <sys/conf.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/sunndi.h> -#include <sys/sysmacros.h> -#include <sys/strsubr.h> -#include <sys/strsun.h> -#include <vm/seg_kmem.h> - -#include <sys/dls.h> -#include <sys/mac_client.h> - -#include <sys/viona_io.h> - -#define MB (1024UL * 1024) -#define GB (1024UL * MB) - -/* - * Min. octets in an ethernet frame minus FCS - */ -#define MIN_BUF_SIZE 60 - -#define VIONA_NAME "Virtio Network Accelerator" - -#define VIONA_CTL_MINOR 0 -#define VIONA_CTL_NODE_NAME "ctl" - -#define VIONA_CLI_NAME "viona" - -#define VTNET_MAXSEGS 32 - -#define VRING_ALIGN 4096 - -#define VRING_DESC_F_NEXT (1 << 0) -#define VRING_DESC_F_WRITE (1 << 1) -#define VRING_DESC_F_INDIRECT (1 << 2) - -#define VRING_AVAIL_F_NO_INTERRUPT 1 - -#define VRING_USED_F_NO_NOTIFY 1 - -#define BCM_NIC_DRIVER "bnxe" -/* - * Host capabilities - */ -#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ - -#define VIONA_S_HOSTCAPS \ - (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | \ - VIRTIO_NET_F_STATUS) - -#pragma pack(1) -struct virtio_desc { - uint64_t vd_addr; - uint32_t vd_len; - uint16_t vd_flags; - uint16_t vd_next; -}; -#pragma pack() - -#pragma pack(1) -struct virtio_used { - uint32_t vu_idx; - uint32_t vu_tlen; -}; -#pragma pack() - -#pragma pack(1) -struct virtio_net_mrgrxhdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; - uint16_t vrh_bufs; -}; -struct virtio_net_hdr { - uint8_t vrh_flags; - uint8_t vrh_gso_type; - uint16_t vrh_hdr_len; - uint16_t vrh_gso_size; - uint16_t vrh_csum_start; - uint16_t vrh_csum_offset; -}; -#pragma pack() - -typedef struct viona_vring_hqueue { - /* Internal state */ - uint16_t hq_size; - kmutex_t hq_a_mutex; - kmutex_t hq_u_mutex; - uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ - - /* Host-context pointers to the queue */ - caddr_t hq_baseaddr; - uint16_t *hq_avail_flags; - uint16_t *hq_avail_idx; /* monotonically increasing */ - uint16_t *hq_avail_ring; - - uint16_t *hq_used_flags; - uint16_t *hq_used_idx; /* monotonically increasing */ - struct virtio_used *hq_used_ring; -} viona_vring_hqueue_t; - - -typedef struct viona_link { - datalink_id_t l_linkid; - - struct vm *l_vm; - size_t l_vm_lomemsize; - 
caddr_t l_vm_lomemaddr; - size_t l_vm_himemsize; - caddr_t l_vm_himemaddr; - - mac_handle_t l_mh; - mac_client_handle_t l_mch; - - kmem_cache_t *l_desb_kmc; - - pollhead_t l_pollhead; - - viona_vring_hqueue_t l_rx_vring; - uint_t l_rx_intr; - - viona_vring_hqueue_t l_tx_vring; - kcondvar_t l_tx_cv; - uint_t l_tx_intr; - kmutex_t l_tx_mutex; - int l_tx_outstanding; - uint32_t l_features; -} viona_link_t; - -typedef struct { - frtn_t d_frtn; - viona_link_t *d_link; - uint_t d_ref; - uint16_t d_cookie; - int d_len; -} viona_desb_t; - -typedef struct viona_soft_state { - viona_link_t *ss_link; -} viona_soft_state_t; - -typedef struct used_elem { - uint16_t id; - uint32_t len; -} used_elem_t; - -static void *viona_state; -static dev_info_t *viona_dip; -static id_space_t *viona_minor_ids; -/* - * copy tx mbufs from virtio ring to avoid necessitating a wait for packet - * transmission to free resources. - */ -static boolean_t copy_tx_mblks = B_TRUE; - -extern struct vm *vm_lookup_by_name(char *name); -extern uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len); - -static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); -static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); -static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); -static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); -static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, - cred_t *credp, int *rval); -static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, - struct pollhead **phpp); - -static int viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create); -static int viona_ioc_delete(viona_soft_state_t *ss); - -static int viona_vm_map(viona_link_t *link); -static caddr_t viona_gpa2kva(viona_link_t *link, uint64_t gpa); -static void viona_vm_unmap(viona_link_t *link); - -static int viona_ioc_rx_ring_init(viona_link_t *link, - vioc_ring_init_t *u_ri); -static int viona_ioc_tx_ring_init(viona_link_t *link, - vioc_ring_init_t *u_ri); -static int viona_ioc_rx_ring_reset(viona_link_t *link); -static int viona_ioc_tx_ring_reset(viona_link_t *link); -static void viona_ioc_rx_ring_kick(viona_link_t *link); -static void viona_ioc_tx_ring_kick(viona_link_t *link); -static int viona_ioc_rx_intr_clear(viona_link_t *link); -static int viona_ioc_tx_intr_clear(viona_link_t *link); - -static void viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback); -static void viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq); - -static struct cb_ops viona_cb_ops = { - viona_open, - viona_close, - nodev, - nodev, - nodev, - nodev, - nodev, - viona_ioctl, - nodev, - nodev, - nodev, - viona_chpoll, - ddi_prop_op, - 0, - D_MP | D_NEW | D_HOTPLUG, - CB_REV, - nodev, - nodev -}; - -static struct dev_ops viona_ops = { - DEVO_REV, - 0, - nodev, - nulldev, - nulldev, - viona_attach, - viona_detach, - nodev, - &viona_cb_ops, - NULL, - ddi_power, - ddi_quiesce_not_needed -}; - -static struct modldrv modldrv = { - &mod_driverops, - VIONA_NAME, - &viona_ops, -}; - -static struct modlinkage modlinkage = { - MODREV_1, &modldrv, NULL -}; - -int -_init(void) -{ - int ret; - - ret = ddi_soft_state_init(&viona_state, - sizeof (viona_soft_state_t), 0); - if (ret == 0) { - ret = mod_install(&modlinkage); - if (ret != 0) { - ddi_soft_state_fini(&viona_state); - return (ret); - } - } - - return (ret); -} - -int -_fini(void) -{ - int ret; - - ret = mod_remove(&modlinkage); - if (ret == 0) { - ddi_soft_state_fini(&viona_state); - } - - return (ret); -} - 
-int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} - -static void -set_viona_tx_mode() -{ - major_t bcm_nic_major; - if ((bcm_nic_major = ddi_name_to_major(BCM_NIC_DRIVER)) - != DDI_MAJOR_T_NONE) { - if (ddi_hold_installed_driver(bcm_nic_major) != NULL) { - copy_tx_mblks = B_FALSE; - ddi_rele_driver(bcm_nic_major); - } - } -} - -static int -viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) { - return (DDI_FAILURE); - } - - viona_minor_ids = id_space_create("viona_minor_id", - VIONA_CTL_MINOR + 1, UINT16_MAX); - - if (ddi_create_minor_node(dip, VIONA_CTL_NODE_NAME, - S_IFCHR, VIONA_CTL_MINOR, DDI_PSEUDO, 0) != DDI_SUCCESS) { - return (DDI_FAILURE); - } - - viona_dip = dip; - - set_viona_tx_mode(); - ddi_report_dev(viona_dip); - - return (DDI_SUCCESS); -} - -static int -viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - if (cmd != DDI_DETACH) { - return (DDI_FAILURE); - } - - id_space_destroy(viona_minor_ids); - - ddi_remove_minor_node(viona_dip, NULL); - - viona_dip = NULL; - - return (DDI_SUCCESS); -} - -static int -viona_open(dev_t *devp, int flag, int otype, cred_t *credp) -{ - int minor; - - if (otype != OTYP_CHR) { - return (EINVAL); - } - - if (drv_priv(credp) != 0) { - return (EPERM); - } - - if (getminor(*devp) != VIONA_CTL_MINOR) { - return (ENXIO); - } - - minor = id_alloc(viona_minor_ids); - if (minor == 0) { - /* All minors are busy */ - return (EBUSY); - } - - if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { - id_free(viona_minor_ids, minor); - } - - *devp = makedevice(getmajor(*devp), minor); - - return (0); -} - -static int -viona_close(dev_t dev, int flag, int otype, cred_t *credp) -{ - int minor; - viona_soft_state_t *ss; - - if (otype != OTYP_CHR) { - return (EINVAL); - } - - if (drv_priv(credp) != 0) { - return (EPERM); - } - - minor = getminor(dev); - - ss = ddi_get_soft_state(viona_state, minor); - if (ss == NULL) { - return (ENXIO); - } - - viona_ioc_delete(ss); - - ddi_soft_state_free(viona_state, minor); - - id_free(viona_minor_ids, minor); - - return (0); -} - -static int -viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, - cred_t *credp, int *rval) -{ - viona_soft_state_t *ss; - int err = 0; - - ss = ddi_get_soft_state(viona_state, getminor(dev)); - if (ss == NULL) { - return (ENXIO); - } - - switch (cmd) { - case VNA_IOC_CREATE: - err = viona_ioc_create(ss, (vioc_create_t *)data); - break; - case VNA_IOC_DELETE: - err = viona_ioc_delete(ss); - break; - case VNA_IOC_SET_FEATURES: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - ss->ss_link->l_features = *(int *)data & VIONA_S_HOSTCAPS; - break; - case VNA_IOC_GET_FEATURES: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - *(int *)data = VIONA_S_HOSTCAPS; - break; - case VNA_IOC_RX_RING_INIT: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_rx_ring_init(ss->ss_link, - (vioc_ring_init_t *)data); - break; - case VNA_IOC_RX_RING_RESET: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_rx_ring_reset(ss->ss_link); - break; - case VNA_IOC_RX_RING_KICK: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - viona_ioc_rx_ring_kick(ss->ss_link); - err = 0; - break; - case VNA_IOC_TX_RING_INIT: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_ring_init(ss->ss_link, - (vioc_ring_init_t *)data); - break; - case VNA_IOC_TX_RING_RESET: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_ring_reset(ss->ss_link); - break; - case 
VNA_IOC_TX_RING_KICK: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - viona_ioc_tx_ring_kick(ss->ss_link); - err = 0; - break; - case VNA_IOC_RX_INTR_CLR: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_rx_intr_clear(ss->ss_link); - break; - case VNA_IOC_TX_INTR_CLR: - if (ss->ss_link == NULL) { - return (ENOSYS); - } - err = viona_ioc_tx_intr_clear(ss->ss_link); - break; - default: - err = ENOTTY; - break; - } - - return (err); -} - -static int -viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, - struct pollhead **phpp) -{ - viona_soft_state_t *ss; - - ss = ddi_get_soft_state(viona_state, getminor(dev)); - if (ss == NULL || ss->ss_link == NULL) { - return (ENXIO); - } - - *reventsp = 0; - - if (ss->ss_link->l_rx_intr && (events & POLLIN)) { - *reventsp |= POLLIN; - } - - if (ss->ss_link->l_tx_intr && (events & POLLOUT)) { - *reventsp |= POLLOUT; - } - - if (*reventsp == 0 && !anyyet) { - *phpp = &ss->ss_link->l_pollhead; - } - - return (0); -} - -static int -viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create) -{ - vioc_create_t k_create; - viona_link_t *link; - char cli_name[MAXNAMELEN]; - int err; - - if (ss->ss_link != NULL) { - return (ENOSYS); - } - if (copyin(u_create, &k_create, sizeof (k_create)) != 0) { - return (EFAULT); - } - - link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); - - link->l_linkid = k_create.c_linkid; - link->l_vm = vm_lookup_by_name(k_create.c_vmname); - if (link->l_vm == NULL) { - err = ENXIO; - goto bail; - } - - link->l_vm_lomemsize = k_create.c_lomem_size; - link->l_vm_himemsize = k_create.c_himem_size; - err = viona_vm_map(link); - if (err != 0) { - goto bail; - } - - err = mac_open_by_linkid(link->l_linkid, &link->l_mh); - if (err != 0) { - cmn_err(CE_WARN, "viona create mac_open_by_linkid" - " returned %d\n", err); - goto bail; - } - - snprintf(cli_name, sizeof (cli_name), "%s-%d", - VIONA_CLI_NAME, link->l_linkid); - err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); - if (err != 0) { - cmn_err(CE_WARN, "viona create mac_client_open" - " returned %d\n", err); - goto bail; - } - - link->l_features = VIONA_S_HOSTCAPS; - link->l_desb_kmc = kmem_cache_create(cli_name, - sizeof (viona_desb_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_rx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&link->l_tx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL); - if (copy_tx_mblks) { - mutex_init(&link->l_tx_mutex, NULL, MUTEX_DRIVER, NULL); - cv_init(&link->l_tx_cv, NULL, CV_DRIVER, NULL); - } - ss->ss_link = link; - - return (0); - -bail: - if (link->l_mch != NULL) { - mac_client_close(link->l_mch, 0); - } - if (link->l_mh != NULL) { - mac_close(link->l_mh); - } - - kmem_free(link, sizeof (viona_link_t)); - - return (err); -} - -static int -viona_ioc_delete(viona_soft_state_t *ss) -{ - viona_link_t *link; - - link = ss->ss_link; - if (link == NULL) { - return (ENOSYS); - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - while (link->l_tx_outstanding != 0) { - cv_wait(&link->l_tx_cv, &link->l_tx_mutex); - } - mutex_exit(&link->l_tx_mutex); - } - if (link->l_mch != NULL) { - mac_rx_clear(link->l_mch); - mac_client_close(link->l_mch, 0); - } - if (link->l_mh != NULL) { - mac_close(link->l_mh); - } - - viona_vm_unmap(link); - mutex_destroy(&link->l_tx_vring.hq_a_mutex); - mutex_destroy(&link->l_tx_vring.hq_u_mutex); - 
mutex_destroy(&link->l_rx_vring.hq_a_mutex); - mutex_destroy(&link->l_rx_vring.hq_u_mutex); - if (copy_tx_mblks) { - mutex_destroy(&link->l_tx_mutex); - cv_destroy(&link->l_tx_cv); - } - - kmem_cache_destroy(link->l_desb_kmc); - - kmem_free(link, sizeof (viona_link_t)); - - ss->ss_link = NULL; - - return (0); -} - -static caddr_t -viona_mapin_vm_chunk(viona_link_t *link, uint64_t gpa, size_t len) -{ - caddr_t addr; - size_t offset; - pfn_t pfnum; - - if (len == 0) - return (NULL); - - addr = vmem_alloc(heap_arena, len, VM_SLEEP); - if (addr == NULL) - return (NULL); - - for (offset = 0; offset < len; offset += PAGESIZE) { - pfnum = btop(vm_gpa2hpa(link->l_vm, gpa + offset, PAGESIZE)); - ASSERT(pfnum); - hat_devload(kas.a_hat, addr + offset, PAGESIZE, pfnum, - PROT_READ | PROT_WRITE, HAT_LOAD_LOCK); - } - - return (addr); -} - -/* - * Map the guest physical address space into the kernel virtual address space. - */ -static int -viona_vm_map(viona_link_t *link) -{ - link->l_vm_lomemaddr = viona_mapin_vm_chunk(link, - 0, link->l_vm_lomemsize); - if (link->l_vm_lomemaddr == NULL) - return (-1); - link->l_vm_himemaddr = viona_mapin_vm_chunk(link, - 4 * (1024 * 1024 * 1024UL), link->l_vm_himemsize); - if (link->l_vm_himemsize && link->l_vm_himemaddr == NULL) - return (-1); - - return (0); -} - -/* - * Translate a guest physical address into a kernel virtual address. - */ -static caddr_t -viona_gpa2kva(viona_link_t *link, uint64_t gpa) -{ - if (gpa < link->l_vm_lomemsize) - return (link->l_vm_lomemaddr + gpa); - - gpa -= (4 * GB); - if (gpa < link->l_vm_himemsize) - return (link->l_vm_himemaddr + gpa); - - return (NULL); -} - -static void -viona_vm_unmap(viona_link_t *link) -{ - if (link->l_vm_lomemaddr) { - hat_unload(kas.a_hat, link->l_vm_lomemaddr, - link->l_vm_lomemsize, HAT_UNLOAD_UNLOCK); - vmem_free(heap_arena, link->l_vm_lomemaddr, - link->l_vm_lomemsize); - } - if (link->l_vm_himemaddr) { - hat_unload(kas.a_hat, link->l_vm_himemaddr, - link->l_vm_himemsize, HAT_UNLOAD_UNLOCK); - vmem_free(heap_arena, link->l_vm_himemaddr, - link->l_vm_himemsize); - } -} - -static int -viona_ioc_ring_init_common(viona_link_t *link, viona_vring_hqueue_t *hq, - vioc_ring_init_t *u_ri) -{ - vioc_ring_init_t k_ri; - - if (copyin(u_ri, &k_ri, sizeof (k_ri)) != 0) { - return (EFAULT); - } - - hq->hq_size = k_ri.ri_qsize; - hq->hq_baseaddr = viona_gpa2kva(link, k_ri.ri_qaddr); - if (hq->hq_baseaddr == NULL) - return (EINVAL); - - hq->hq_avail_flags = (uint16_t *)(viona_gpa2kva(link, - k_ri.ri_qaddr + hq->hq_size * sizeof (struct virtio_desc))); - if (hq->hq_avail_flags == NULL) - return (EINVAL); - hq->hq_avail_idx = hq->hq_avail_flags + 1; - hq->hq_avail_ring = hq->hq_avail_flags + 2; - - hq->hq_used_flags = (uint16_t *)(viona_gpa2kva(link, - P2ROUNDUP(k_ri.ri_qaddr + - hq->hq_size * sizeof (struct virtio_desc) + 2, VRING_ALIGN))); - if (hq->hq_used_flags == NULL) - return (EINVAL); - hq->hq_used_idx = hq->hq_used_flags + 1; - hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); - - /* - * Initialize queue indexes - */ - hq->hq_cur_aidx = 0; - - return (0); -} - -static int -viona_ioc_rx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) -{ - viona_vring_hqueue_t *hq; - int rval; - - hq = &link->l_rx_vring; - - rval = viona_ioc_ring_init_common(link, hq, u_ri); - if (rval != 0) { - return (rval); - } - - return (0); -} - -static int -viona_ioc_tx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri) -{ - viona_vring_hqueue_t *hq; - - hq = &link->l_tx_vring; - - return 
(viona_ioc_ring_init_common(link, hq, u_ri)); -} - -static int -viona_ioc_ring_reset_common(viona_vring_hqueue_t *hq) -{ - /* - * Reset all soft state - */ - hq->hq_cur_aidx = 0; - - return (0); -} - -static int -viona_ioc_rx_ring_reset(viona_link_t *link) -{ - viona_vring_hqueue_t *hq; - - mac_rx_clear(link->l_mch); - - hq = &link->l_rx_vring; - - return (viona_ioc_ring_reset_common(hq)); -} - -static int -viona_ioc_tx_ring_reset(viona_link_t *link) -{ - viona_vring_hqueue_t *hq; - - hq = &link->l_tx_vring; - - return (viona_ioc_ring_reset_common(hq)); -} - -static void -viona_ioc_rx_ring_kick(viona_link_t *link) -{ - viona_vring_hqueue_t *hq = &link->l_rx_vring; - - atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); - - mac_rx_set(link->l_mch, viona_rx, link); -} - -/* - * Return the number of available descriptors in the vring taking care - * of the 16-bit index wraparound. - */ -static inline int -viona_hq_num_avail(viona_vring_hqueue_t *hq) -{ - uint16_t ndesc; - - /* - * We're just computing (a-b) in GF(216). - * - * The only glitch here is that in standard C, - * uint16_t promotes to (signed) int when int has - * more than 16 bits (pretty much always now), so - * we have to force it back to unsigned. - */ - ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx; - - ASSERT(ndesc <= hq->hq_size); - - return (ndesc); -} - -static void -viona_ioc_tx_ring_kick(viona_link_t *link) -{ - viona_vring_hqueue_t *hq = &link->l_tx_vring; - - do { - atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY); - while (viona_hq_num_avail(hq)) { - viona_tx(link, hq); - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - if (link->l_tx_outstanding != 0) { - cv_wait_sig(&link->l_tx_cv, &link->l_tx_mutex); - } - mutex_exit(&link->l_tx_mutex); - } - atomic_and_16(hq->hq_used_flags, ~VRING_USED_F_NO_NOTIFY); - } while (viona_hq_num_avail(hq)); -} - -static int -viona_ioc_rx_intr_clear(viona_link_t *link) -{ - link->l_rx_intr = 0; - - return (0); -} - -static int -viona_ioc_tx_intr_clear(viona_link_t *link) -{ - link->l_tx_intr = 0; - - return (0); -} -#define VQ_MAX_DESCRIPTORS 512 - -static int -vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov, - int n_iov, uint16_t *cookie) -{ - int i; - int ndesc, nindir; - int idx, head, next; - struct virtio_desc *vdir, *vindir, *vp; - - idx = hq->hq_cur_aidx; - ndesc = (uint16_t)((unsigned)*hq->hq_avail_idx - (unsigned)idx); - - if (ndesc == 0) - return (0); - if (ndesc > hq->hq_size) { - cmn_err(CE_NOTE, "ndesc (%d) out of range\n", ndesc); - return (-1); - } - - head = hq->hq_avail_ring[idx & (hq->hq_size - 1)]; - next = head; - - for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { - if (next >= hq->hq_size) { - cmn_err(CE_NOTE, "descriptor index (%d)" - "out of range\n", next); - return (-1); - } - - vdir = (struct virtio_desc *)(hq->hq_baseaddr + - next * sizeof (struct virtio_desc)); - if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { - if (i > n_iov) - return (-1); - iov[i].iov_base = viona_gpa2kva(link, vdir->vd_addr); - if (iov[i].iov_base == NULL) { - cmn_err(CE_NOTE, "invalid guest physical" - " address 0x%"PRIx64"\n", vdir->vd_addr); - return (-1); - } - iov[i++].iov_len = vdir->vd_len; - } else { - nindir = vdir->vd_len / 16; - if ((vdir->vd_len & 0xf) || nindir == 0) { - cmn_err(CE_NOTE, "invalid indir len 0x%x\n", - vdir->vd_len); - return (-1); - } - vindir = (struct virtio_desc *) - viona_gpa2kva(link, vdir->vd_addr); - if (vindir == NULL) { - cmn_err(CE_NOTE, "invalid guest physical" - " 
address 0x%"PRIx64"\n", vdir->vd_addr); - return (-1); - } - next = 0; - for (;;) { - vp = &vindir[next]; - if (vp->vd_flags & VRING_DESC_F_INDIRECT) { - cmn_err(CE_NOTE, "indirect desc" - " has INDIR flag\n"); - return (-1); - } - if (i > n_iov) - return (-1); - iov[i].iov_base = - viona_gpa2kva(link, vp->vd_addr); - if (iov[i].iov_base == NULL) { - cmn_err(CE_NOTE, "invalid guest" - " physical address 0x%"PRIx64"\n", - vp->vd_addr); - return (-1); - } - iov[i++].iov_len = vp->vd_len; - - if (i > VQ_MAX_DESCRIPTORS) - goto loopy; - if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) - break; - - next = vp->vd_next; - if (next >= nindir) { - cmn_err(CE_NOTE, "invalid next" - " %d > %d\n", next, nindir); - return (-1); - } - } - } - if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) { - *cookie = head; - hq->hq_cur_aidx++; - return (i); - } - } - -loopy: - cmn_err(CE_NOTE, "%d > descriptor loop count\n", i); - - return (-1); -} - -static void -vq_pushchain(viona_vring_hqueue_t *hq, uint32_t len, uint16_t cookie) -{ - struct virtio_used *vu; - int uidx; - - uidx = *hq->hq_used_idx; - vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)]; - vu->vu_idx = cookie; - vu->vu_tlen = len; - membar_producer(); - *hq->hq_used_idx = uidx; -} - -static void -vq_pushchain_mrgrx(viona_vring_hqueue_t *hq, int num_bufs, used_elem_t *elem) -{ - struct virtio_used *vu; - int uidx; - int i; - - uidx = *hq->hq_used_idx; - if (num_bufs == 1) { - vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)]; - vu->vu_idx = elem[0].id; - vu->vu_tlen = elem[0].len; - } else { - for (i = 0; i < num_bufs; i++) { - vu = &hq->hq_used_ring[(uidx + i) & (hq->hq_size - 1)]; - vu->vu_idx = elem[i].id; - vu->vu_tlen = elem[i].len; - } - uidx = uidx + num_bufs; - } - membar_producer(); - *hq->hq_used_idx = uidx; -} - -/* - * Copy bytes from mp to iov. - * copied_buf: Total num_bytes copied from mblk to iov array. - * buf: pointer to iov_base. - * i: index of iov array. Mainly used to identify if we are - * dealing with first iov array element. - * rxhdr_size: Virtio header size. Two possibilities in case - * of MRGRX buf, header has 2 additional bytes. - * In case of mrgrx, virtio header should be part of iov[0]. - * In case of non-mrgrx, virtio header may or may not be part - * of iov[0]. - */ -static int -copy_in_mblk(mblk_t *mp, int copied_buf, caddr_t buf, struct iovec *iov, - int i, int rxhdr_size) -{ - int copied_chunk = 0; - mblk_t *ml; - int total_buf_len = iov->iov_len; - /* - * iov[0] might have header, adjust - * total_buf_len accordingly - */ - if (i == 0) { - total_buf_len = iov->iov_len - rxhdr_size; - } - for (ml = mp; ml != NULL; ml = ml->b_cont) { - size_t chunk = MBLKL(ml); - /* - * If chunk is less than - * copied_buf we should move - * to correct msgblk - */ - if (copied_buf != 0) { - if (copied_buf < chunk) { - chunk -= copied_buf; - } else { - copied_buf -= chunk; - continue; - } - } - /* - * iov[0] already has virtio header. - * and if copied chunk is length of iov_len break - */ - if (copied_chunk == total_buf_len) { - break; - } - /* - * Sometimes chunk is total mblk len, sometimes mblk is - * divided into multiple chunks. 
- */ - if (chunk > copied_buf) { - if (chunk > copied_chunk) { - if ((chunk + copied_chunk) > total_buf_len) - chunk = (size_t)total_buf_len - - copied_chunk; - } else { - if (chunk > (total_buf_len - copied_chunk)) - chunk = (size_t)((total_buf_len - - copied_chunk) - chunk); - } - bcopy(ml->b_rptr + copied_buf, buf, chunk); - } else { - if (chunk > (total_buf_len - copied_chunk)) { - chunk = (size_t)(total_buf_len - copied_chunk); - } - bcopy(ml->b_rptr + copied_buf, buf, chunk); - } - buf += chunk; - copied_chunk += chunk; - } - return (copied_chunk); -} - -static void -viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) -{ - viona_link_t *link = arg; - viona_vring_hqueue_t *hq = &link->l_rx_vring; - mblk_t *mp0 = mp; - - while (viona_hq_num_avail(hq)) { - struct iovec iov[VTNET_MAXSEGS]; - size_t mblklen; - int n, i = 0; - uint16_t cookie; - struct virtio_net_hdr *vrx = NULL; - struct virtio_net_mrgrxhdr *vmrgrx = NULL; -#if notyet - mblk_t *ml; -#endif - caddr_t buf = NULL; - int total_len = 0; - int copied_buf = 0; - int num_bufs = 0; - int num_pops = 0; - used_elem_t uelem[VTNET_MAXSEGS]; - - if (mp == NULL) { - break; - } - mblklen = msgsize(mp); - if (mblklen == 0) { - break; - } - - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - if (n <= 0) { - break; - } - num_pops++; - if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { - int total_n = n; - int mrgrxhdr_size = sizeof (struct virtio_net_mrgrxhdr); - /* - * Get a pointer to the rx header, and use the - * data immediately following it for the packet buffer. - */ - vmrgrx = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; - if (n == 1) { - buf = iov[0].iov_base + mrgrxhdr_size; - } - while (mblklen > copied_buf) { - if (total_n == i) { - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, &iov[i], - VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - if (n <= 0) { - freemsgchain(mp0); - return; - } - num_pops++; - total_n += n; - } - if (total_n > i) { - int copied_chunk = 0; - if (i != 0) { - buf = iov[i].iov_base; - } - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], i, - mrgrxhdr_size); - copied_buf += copied_chunk; - uelem[i].id = cookie; - uelem[i].len = copied_chunk; - if (i == 0) { - uelem[i].len += mrgrxhdr_size; - } - } - num_bufs++; - i++; - } - } else { - boolean_t virt_hdr_incl_iov = B_FALSE; - int rxhdr_size = sizeof (struct virtio_net_hdr); - /* First element is header */ - vrx = (struct virtio_net_hdr *)iov[0].iov_base; - if (n == 1 || iov[0].iov_len > rxhdr_size) { - buf = iov[0].iov_base + rxhdr_size; - virt_hdr_incl_iov = B_TRUE; - total_len += rxhdr_size; - if (iov[0].iov_len < rxhdr_size) { - // Buff too small to fit pkt. Drop it. - freemsgchain(mp0); - return; - } - } else { - total_len = iov[0].iov_len; - } - if (iov[0].iov_len == rxhdr_size) - i++; - while (mblklen > copied_buf) { - if (n > i) { - int copied_chunk = 0; - if (i != 0) { - buf = iov[i].iov_base; - } - /* - * In case of non-mrgrx buf, first - * descriptor always has header and - * rest of the descriptors have data. - * But it is not guaranteed that first - * descriptor will only have virtio - * header. It might also have data. 
- */ - if (virt_hdr_incl_iov) { - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], - i, rxhdr_size); - } else { - copied_chunk = copy_in_mblk(mp, - copied_buf, buf, &iov[i], - i, 0); - } - copied_buf += copied_chunk; - total_len += copied_chunk; - } else { - /* - * Drop packet as it cant fit - * in buf provided by guest. - */ - freemsgchain(mp0); - return; - } - i++; - } - } - /* - * The only valid field in the rx packet header is the - * number of buffers, which is always 1 without TSO - * support. - */ - if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) { - memset(vmrgrx, 0, sizeof (struct virtio_net_mrgrxhdr)); - vmrgrx->vrh_bufs = num_bufs; - /* - * Make sure iov[0].iov_len >= MIN_BUF_SIZE - * otherwise guest will consider it as invalid frame. - */ - if (num_bufs == 1 && uelem[0].len < MIN_BUF_SIZE) { - uelem[0].len = MIN_BUF_SIZE; - } - /* - * Release this chain and handle more chains. - */ - mutex_enter(&hq->hq_u_mutex); - vq_pushchain_mrgrx(hq, num_pops, uelem); - mutex_exit(&hq->hq_u_mutex); - } else { - memset(vrx, 0, sizeof (struct virtio_net_hdr)); - if (total_len < MIN_BUF_SIZE) { - total_len = MIN_BUF_SIZE; - } - /* - * Release this chain and handle more chains. - */ - mutex_enter(&hq->hq_u_mutex); - vq_pushchain(hq, total_len, cookie); - mutex_exit(&hq->hq_u_mutex); - } - - mp = mp->b_next; - } - - if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - if (atomic_cas_uint(&link->l_rx_intr, 0, 1) == 0) { - pollwakeup(&link->l_pollhead, POLLIN); - } - } - - freemsgchain(mp0); -} - -static void -viona_desb_free(viona_desb_t *dp) -{ - viona_link_t *link; - viona_vring_hqueue_t *hq; -#if notyet - struct virtio_used *vu; - int uidx; -#endif - uint_t ref; - - ref = atomic_dec_uint_nv(&dp->d_ref); - if (ref != 0) - return; - - link = dp->d_link; - hq = &link->l_tx_vring; - - mutex_enter(&hq->hq_u_mutex); - vq_pushchain(hq, dp->d_len, dp->d_cookie); - mutex_exit(&hq->hq_u_mutex); - - kmem_cache_free(link->l_desb_kmc, dp); - - if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - if (atomic_cas_uint(&link->l_tx_intr, 0, 1) == 0) { - pollwakeup(&link->l_pollhead, POLLOUT); - } - } - if (copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - if (--link->l_tx_outstanding == 0) { - cv_broadcast(&link->l_tx_cv); - } - mutex_exit(&link->l_tx_mutex); - } -} - -static void -viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq) -{ - struct iovec iov[VTNET_MAXSEGS]; - uint16_t cookie; - int i, n; - mblk_t *mp_head, *mp_tail, *mp; - viona_desb_t *dp; - mac_client_handle_t link_mch = link->l_mch; - - mp_head = mp_tail = NULL; - - mutex_enter(&hq->hq_a_mutex); - n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie); - mutex_exit(&hq->hq_a_mutex); - ASSERT(n != 0); - - dp = kmem_cache_alloc(link->l_desb_kmc, KM_SLEEP); - dp->d_frtn.free_func = viona_desb_free; - dp->d_frtn.free_arg = (void *)dp; - dp->d_link = link; - dp->d_cookie = cookie; - - dp->d_ref = 0; - dp->d_len = iov[0].iov_len; - - for (i = 1; i < n; i++) { - dp->d_ref++; - dp->d_len += iov[i].iov_len; - if (copy_tx_mblks) { - mp = desballoc((uchar_t *)iov[i].iov_base, - iov[i].iov_len, BPRI_MED, &dp->d_frtn); - ASSERT(mp); - } else { - mp = allocb(iov[i].iov_len, BPRI_MED); - ASSERT(mp); - bcopy((uchar_t *)iov[i].iov_base, mp->b_wptr, - iov[i].iov_len); - } - mp->b_wptr += iov[i].iov_len; - if (mp_head == NULL) { - ASSERT(mp_tail == NULL); - mp_head = mp; - } else { - ASSERT(mp_tail != NULL); - mp_tail->b_cont = mp; - } - mp_tail = mp; - } - if (copy_tx_mblks == B_FALSE) { - viona_desb_free(dp); - } - if 
(copy_tx_mblks) { - mutex_enter(&link->l_tx_mutex); - link->l_tx_outstanding++; - mutex_exit(&link->l_tx_mutex); - } - mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); -} diff --git a/usr/src/uts/i86pc/io/viona/viona.mapfile b/usr/src/uts/i86pc/io/viona/viona.mapfile new file mode 100644 index 0000000000..cece86348c --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona.mapfile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/viona/viona_hook.c b/usr/src/uts/i86pc/io/viona/viona_hook.c new file mode 100644 index 0000000000..4520be04b0 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_hook.c @@ -0,0 +1,438 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/hook.h> +#include <sys/hook_event.h> + +#include "viona_impl.h" + + +/* + * Global linked list of viona_neti_ts. Access is protected by viona_neti_lock + */ +static list_t viona_neti_list; +static kmutex_t viona_neti_lock; + +/* + * viona_neti is allocated and initialized during attach, and read-only + * until detach (where it's also freed) + */ +static net_instance_t *viona_neti; + + +/* + * Generate a hook event for the packet in *mpp headed in the direction + * indicated by 'out'. If the packet is accepted, 0 is returned. If the + * packet is rejected, an error is returned. The hook function may or may not + * alter or even free *mpp. The caller is expected to deal with either + * situation. + */ +int +viona_hook(viona_link_t *link, viona_vring_t *ring, mblk_t **mpp, boolean_t out) +{ + viona_neti_t *nip = link->l_neti; + viona_nethook_t *vnh = &nip->vni_nethook; + hook_pkt_event_t info; + hook_event_t he; + hook_event_token_t het; + int ret; + + he = out ? vnh->vnh_event_out : vnh->vnh_event_in; + het = out ? 
vnh->vnh_token_out : vnh->vnh_token_in; + + if (!he.he_interested) + return (0); + + info.hpe_protocol = vnh->vnh_neti; + info.hpe_ifp = (phy_if_t)link; + info.hpe_ofp = (phy_if_t)link; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + ret = hook_run(vnh->vnh_neti->netd_hooks, het, (hook_data_t)&info); + if (ret == 0) + return (0); + + if (out) { + VIONA_PROBE3(tx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, tx_hookdrop); + } else { + VIONA_PROBE3(rx_hook_drop, viona_vring_t *, ring, + mblk_t *, *mpp, int, ret); + VIONA_RING_STAT_INCR(ring, rx_hookdrop); + } + return (ret); +} + +/* + * netinfo stubs - required by the nethook framework, but otherwise unused + * + * Currently, all ipf rules are applied against all interfaces in a given + * netstack (e.g. all interfaces in a zone). In the future if we want to + * support being able to apply different rules to different interfaces, I + * believe we would need to implement some of these stubs to map an interface + * name in a rule (e.g. 'net0', back to an index or viona_link_t); + */ +static int +viona_neti_getifname(net_handle_t neti __unused, phy_if_t phy __unused, + char *buf __unused, const size_t len __unused) +{ + return (-1); +} + +static int +viona_neti_getmtu(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_getptmue(net_handle_t neti __unused) +{ + return (-1); +} + +static int +viona_neti_getlifaddr(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, size_t nelem __unused, + net_ifaddr_t type[] __unused, void *storage __unused) +{ + return (-1); +} + +static int +viona_neti_getlifzone(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, zoneid_t *zid __unused) +{ + return (-1); +} + +static int +viona_neti_getlifflags(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused, uint64_t *flags __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_phygetnext(net_handle_t neti __unused, phy_if_t phy __unused) +{ + return ((phy_if_t)-1); +} + +static phy_if_t +viona_neti_phylookup(net_handle_t neti __unused, const char *name __unused) +{ + return ((phy_if_t)-1); +} + +static lif_if_t +viona_neti_lifgetnext(net_handle_t neti __unused, phy_if_t phy __unused, + lif_if_t ifdata __unused) +{ + return (-1); +} + +static int +viona_neti_inject(net_handle_t neti __unused, inject_t style __unused, + net_inject_t *packet __unused) +{ + return (-1); +} + +static phy_if_t +viona_neti_route(net_handle_t neti __unused, struct sockaddr *address __unused, + struct sockaddr *next __unused) +{ + return ((phy_if_t)-1); +} + +static int +viona_neti_ispchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static int +viona_neti_isvchksum(net_handle_t neti __unused, mblk_t *mp __unused) +{ + return (-1); +} + +static net_protocol_t viona_netinfo = { + NETINFO_VERSION, + NHF_VIONA, + viona_neti_getifname, + viona_neti_getmtu, + viona_neti_getptmue, + viona_neti_getlifaddr, + viona_neti_getlifzone, + viona_neti_getlifflags, + viona_neti_phygetnext, + viona_neti_phylookup, + viona_neti_lifgetnext, + viona_neti_inject, + viona_neti_route, + viona_neti_ispchksum, + viona_neti_isvchksum +}; + +/* + * Create/register our nethooks + */ +static int +viona_nethook_init(netid_t nid, viona_nethook_t *vnh, char *nh_name, + net_protocol_t *netip) +{ + int ret; + + if ((vnh->vnh_neti = net_protocol_register(nid, netip)) == NULL) { + cmn_err(CE_NOTE, 
"%s: net_protocol_register failed " + "(netid=%d name=%s)", __func__, nid, nh_name); + goto fail_init_proto; + } + + HOOK_FAMILY_INIT(&vnh->vnh_family, nh_name); + if ((ret = net_family_register(vnh->vnh_neti, &vnh->vnh_family)) != 0) { + cmn_err(CE_NOTE, "%s: net_family_register failed " + "(netid=%d name=%s err=%d)", __func__, + nid, nh_name, ret); + goto fail_init_family; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_in, NH_PHYSICAL_IN); + if ((vnh->vnh_token_in = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_in)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_IN, nid, + nh_name); + goto fail_init_event_in; + } + + HOOK_EVENT_INIT(&vnh->vnh_event_out, NH_PHYSICAL_OUT); + if ((vnh->vnh_token_out = net_event_register(vnh->vnh_neti, + &vnh->vnh_event_out)) == NULL) { + cmn_err(CE_NOTE, "%s: net_event_register %s failed " + "(netid=%d name=%s)", __func__, NH_PHYSICAL_OUT, nid, + nh_name); + goto fail_init_event_out; + } + return (0); + + /* + * On failure, we undo all the steps that succeeded in the + * reverse order of initialization, starting at the last + * successful step (the labels denoting the failing step). + */ +fail_init_event_out: + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + vnh->vnh_token_in = NULL; + +fail_init_event_in: + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + +fail_init_family: + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; + +fail_init_proto: + return (1); +} + +/* + * Shutdown the nethooks for a protocol family. This triggers notification + * callbacks to anything that has registered interest to allow hook consumers + * to unhook prior to the removal of the hooks as well as makes them unavailable + * to any future consumers as the first step of removal. + */ +static void +viona_nethook_shutdown(viona_nethook_t *vnh) +{ + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_shutdown(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_shutdown(vnh->vnh_neti, &vnh->vnh_family)); +} + +/* + * Remove the nethooks for a protocol family. + */ +static void +viona_nethook_fini(viona_nethook_t *vnh) +{ + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_out)); + VERIFY0(net_event_unregister(vnh->vnh_neti, &vnh->vnh_event_in)); + VERIFY0(net_family_unregister(vnh->vnh_neti, &vnh->vnh_family)); + VERIFY0(net_protocol_unregister(vnh->vnh_neti)); + vnh->vnh_neti = NULL; +} + +/* + * Callback invoked by the neti module. This creates/registers our hooks + * {IPv4,IPv6}{in,out} with the nethook framework so they are available to + * interested consumers (e.g. ipf). + * + * During attach, viona_neti_create is called once for every netstack + * present on the system at the time of attach. Thereafter, it is called + * during the creation of additional netstack instances (i.e. zone boot). As a + * result, the viona_neti_t that is created during this call always occurs + * prior to any viona instances that will use it to send hook events. + * + * It should never return NULL. If we cannot register our hooks, we do not + * set vnh_hooked of the respective protocol family, which will prevent the + * creation of any viona instances on this netstack (see viona_ioc_create). 
+ * This can only occur if after a shutdown event (which means destruction is + * imminent) we are trying to create a new instance. + */ +static void * +viona_neti_create(const netid_t netid) +{ + viona_neti_t *nip; + + VERIFY(netid != -1); + + nip = kmem_zalloc(sizeof (*nip), KM_SLEEP); + nip->vni_netid = netid; + nip->vni_zid = net_getzoneidbynetid(netid); + mutex_init(&nip->vni_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nip->vni_dev_list, sizeof (viona_soft_state_t), + offsetof(viona_soft_state_t, ss_node)); + + if (viona_nethook_init(netid, &nip->vni_nethook, Hn_VIONA, + &viona_netinfo) == 0) + nip->vni_nethook.vnh_hooked = B_TRUE; + + mutex_enter(&viona_neti_lock); + list_insert_tail(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + return (nip); +} + +/* + * Called during netstack teardown by the neti module. During teardown, all + * the shutdown callbacks are invoked, allowing consumers to release any holds + * and otherwise quiesce themselves prior to destruction, followed by the + * actual destruction callbacks. + */ +static void +viona_neti_shutdown(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&viona_neti_lock); + list_remove(&viona_neti_list, nip); + mutex_exit(&viona_neti_lock); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_shutdown(&nip->vni_nethook); +} + +/* + * Called during netstack teardown by the neti module. Destroys the viona + * netinst data. This is invoked after all the netstack and neti shutdown + * callbacks have been invoked. + */ +static void +viona_neti_destroy(netid_t nid, void *arg) +{ + viona_neti_t *nip = arg; + + ASSERT(nip != NULL); + VERIFY(nid == nip->vni_netid); + + mutex_enter(&nip->vni_lock); + while (nip->vni_ref != 0) + cv_wait(&nip->vni_ref_change, &nip->vni_lock); + mutex_exit(&nip->vni_lock); + + VERIFY(!list_link_active(&nip->vni_node)); + + if (nip->vni_nethook.vnh_hooked) + viona_nethook_fini(&nip->vni_nethook); + + mutex_destroy(&nip->vni_lock); + list_destroy(&nip->vni_dev_list); + kmem_free(nip, sizeof (*nip)); +} + +/* + * Find the viona netinst data by zone id. This is only used during + * viona instance creation (and thus is only called by a zone that is running). 
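For illustration, the expected usage pattern around this lookup (mirroring viona_ioc_create() later in this change) pairs it with viona_neti_rele(); the fragment below is a sketch only, with `cr` standing in for the caller's cred_t, but the functions and the vni_ref protocol are the ones defined in this patch:

	viona_neti_t *nip;

	/* Take a reference on the netinst for the caller's zone. */
	nip = viona_neti_lookup_by_zid(crgetzoneid(cr));
	if (nip == NULL || !nip->vni_nethook.vnh_hooked) {
		if (nip != NULL)
			viona_neti_rele(nip);	/* drop the ref on failure */
		return (EIO);
	}
	/* ... create the viona link, stashing nip in link->l_neti ... */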
+ */ +viona_neti_t * +viona_neti_lookup_by_zid(zoneid_t zid) +{ + viona_neti_t *nip; + + mutex_enter(&viona_neti_lock); + for (nip = list_head(&viona_neti_list); nip != NULL; + nip = list_next(&viona_neti_list, nip)) { + if (nip->vni_zid == zid) { + mutex_enter(&nip->vni_lock); + nip->vni_ref++; + mutex_exit(&nip->vni_lock); + mutex_exit(&viona_neti_lock); + return (nip); + } + } + mutex_exit(&viona_neti_lock); + return (NULL); +} + +void +viona_neti_rele(viona_neti_t *nip) +{ + mutex_enter(&nip->vni_lock); + VERIFY3S(nip->vni_ref, >, 0); + nip->vni_ref--; + mutex_exit(&nip->vni_lock); + cv_broadcast(&nip->vni_ref_change); +} + +void +viona_neti_attach(void) +{ + mutex_init(&viona_neti_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&viona_neti_list, sizeof (viona_neti_t), + offsetof(viona_neti_t, vni_node)); + + /* This can only fail if NETINFO_VERSION is wrong */ + viona_neti = net_instance_alloc(NETINFO_VERSION); + VERIFY(viona_neti != NULL); + + viona_neti->nin_name = "viona"; + viona_neti->nin_create = viona_neti_create; + viona_neti->nin_shutdown = viona_neti_shutdown; + viona_neti->nin_destroy = viona_neti_destroy; + /* This can only fail if we've registered ourselves multiple times */ + VERIFY3S(net_instance_register(viona_neti), ==, DDI_SUCCESS); +} + +void +viona_neti_detach(void) +{ + /* This can only fail if we've not registered previously */ + VERIFY3S(net_instance_unregister(viona_neti), ==, DDI_SUCCESS); + net_instance_free(viona_neti); + viona_neti = NULL; + + list_destroy(&viona_neti_list); + mutex_destroy(&viona_neti_lock); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_impl.h b/usr/src/uts/i86pc/io/viona/viona_impl.h new file mode 100644 index 0000000000..5471b611a4 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_impl.h @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VIONA_IMPL_H +#define _VIONA_IMPL_H + +#include <sys/ddi.h> +#include <sys/list.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/uio.h> + +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/neti.h> +#include <inet/ip.h> +#include <inet/tcp.h> + +#include <sys/vmm_drv.h> +#include <sys/viona_io.h> + +struct viona_link; +typedef struct viona_link viona_link_t; +struct viona_desb; +typedef struct viona_desb viona_desb_t; +struct viona_net; +typedef struct viona_neti viona_neti_t; + +enum viona_ring_state { + VRS_RESET = 0x0, /* just allocated or reset */ + VRS_SETUP = 0x1, /* addrs setup and starting worker thread */ + VRS_INIT = 0x2, /* worker thread started & waiting to run */ + VRS_RUN = 0x3, /* running work routine */ + VRS_STOP = 0x4, /* worker is exiting */ +}; +enum viona_ring_state_flags { + VRSF_REQ_START = 0x1, /* start running from INIT state */ + VRSF_REQ_STOP = 0x2, /* stop running, clean up, goto RESET state */ + VRSF_RENEW = 0x4, /* ring renewing lease */ +}; + +typedef struct viona_vring { + viona_link_t *vr_link; + + kmutex_t vr_lock; + kcondvar_t vr_cv; + uint16_t vr_state; + uint16_t vr_state_flags; + uint_t vr_xfer_outstanding; + kthread_t *vr_worker_thread; + vmm_lease_t *vr_lease; + + /* ring-sized resources for TX activity */ + viona_desb_t *vr_txdesb; + struct iovec *vr_txiov; + + uint_t vr_intr_enabled; + uint64_t vr_msi_addr; + uint64_t vr_msi_msg; + + /* Internal ring-related state */ + kmutex_t vr_a_mutex; /* sync consumers of 'avail' */ + kmutex_t vr_u_mutex; /* sync consumers of 'used' */ + uint64_t vr_pa; + uint16_t vr_size; + uint16_t vr_mask; /* cached from vr_size */ + uint16_t vr_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + volatile struct virtio_desc *vr_descr; + + volatile uint16_t *vr_avail_flags; + volatile uint16_t *vr_avail_idx; + volatile uint16_t *vr_avail_ring; + volatile uint16_t *vr_avail_used_event; + + volatile uint16_t *vr_used_flags; + volatile uint16_t *vr_used_idx; + volatile struct virtio_used *vr_used_ring; + volatile uint16_t *vr_used_avail_event; + + /* Per-ring error condition statistics */ + struct viona_ring_stats { + uint64_t rs_ndesc_too_high; + uint64_t rs_bad_idx; + uint64_t rs_indir_bad_len; + uint64_t rs_indir_bad_nest; + uint64_t rs_indir_bad_next; + uint64_t rs_no_space; + uint64_t rs_too_many_desc; + uint64_t rs_desc_bad_len; + + uint64_t rs_bad_ring_addr; + + uint64_t rs_fail_hcksum; + uint64_t rs_fail_hcksum6; + uint64_t rs_fail_hcksum_proto; + + uint64_t rs_bad_rx_frame; + uint64_t rs_rx_merge_overrun; + uint64_t rs_rx_merge_underrun; + uint64_t rs_rx_pad_short; + uint64_t rs_rx_mcast_check; + uint64_t rs_too_short; + uint64_t rs_tx_absent; + + uint64_t rs_rx_hookdrop; + uint64_t rs_tx_hookdrop; + } vr_stats; +} viona_vring_t; + +struct viona_link { + vmm_hold_t *l_vm_hold; + boolean_t l_destroyed; + + viona_vring_t l_vrings[VIONA_VQ_MAX]; + + uint32_t l_features; + uint32_t l_features_hw; + uint32_t l_cap_csum; + + uintptr_t l_notify_ioport; + void *l_notify_cookie; + + datalink_id_t l_linkid; + mac_handle_t l_mh; + mac_client_handle_t l_mch; + mac_promisc_handle_t l_mph; + + pollhead_t l_pollhead; + + viona_neti_t *l_neti; +}; + +typedef struct viona_nethook { + 
net_handle_t vnh_neti; + hook_family_t vnh_family; + hook_event_t vnh_event_in; + hook_event_t vnh_event_out; + hook_event_token_t vnh_token_in; + hook_event_token_t vnh_token_out; + boolean_t vnh_hooked; +} viona_nethook_t; + +struct viona_neti { + list_node_t vni_node; + + netid_t vni_netid; + zoneid_t vni_zid; + + viona_nethook_t vni_nethook; + + kmutex_t vni_lock; /* Protects remaining members */ + kcondvar_t vni_ref_change; /* Protected by vni_lock */ + uint_t vni_ref; /* Protected by vni_lock */ + list_t vni_dev_list; /* Protected by vni_lock */ +}; + +typedef struct used_elem { + uint16_t id; + uint32_t len; +} used_elem_t; + +typedef struct viona_soft_state { + kmutex_t ss_lock; + viona_link_t *ss_link; + list_node_t ss_node; +} viona_soft_state_t; + +#pragma pack(1) +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +}; + +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +}; + +struct virtio_net_mrgrxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +}; + +struct virtio_net_hdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; +}; +#pragma pack() + +#define VRING_NEED_BAIL(ring, proc) \ + (((ring)->vr_state_flags & VRSF_REQ_STOP) != 0 || \ + ((proc)->p_flag & SEXITING) != 0) + + +#define VNETHOOK_INTERESTED_IN(neti) \ + (neti)->vni_nethook.vnh_event_in.he_interested +#define VNETHOOK_INTERESTED_OUT(neti) \ + (neti)->vni_nethook.vnh_event_out.he_interested + + +#define VIONA_PROBE(name) DTRACE_PROBE(viona__##name) +#define VIONA_PROBE1(name, arg1, arg2) \ + DTRACE_PROBE1(viona__##name, arg1, arg2) +#define VIONA_PROBE2(name, arg1, arg2, arg3, arg4) \ + DTRACE_PROBE2(viona__##name, arg1, arg2, arg3, arg4) +#define VIONA_PROBE3(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + DTRACE_PROBE3(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6) +#define VIONA_PROBE5(name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, \ + arg9, arg10) \ + DTRACE_PROBE5(viona__##name, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \ + arg8, arg9, arg10) +#define VIONA_PROBE_BAD_RING_ADDR(r, a) \ + VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a)) + +#define VIONA_RING_STAT_INCR(r, name) \ + (((r)->vr_stats.rs_ ## name)++) + + +#define VIONA_MAX_HDRS_LEN (sizeof (struct ether_vlan_header) + \ + IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 +#define VRING_USED_F_NO_NOTIFY 1 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VIRTIO_NET_HDR_F_NEEDS_CSUM (1 << 0) +#define VIRTIO_NET_HDR_F_DATA_VALID (1 << 1) + +#define VIRTIO_NET_HDR_GSO_NONE 0 +#define VIRTIO_NET_HDR_GSO_TCPV4 1 + +#define VIRTIO_NET_F_CSUM (1 << 0) +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can accept TSO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can accept TSO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX bufs */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* cfg status field present */ +#define VIRTIO_F_RING_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_F_RING_INDIRECT_DESC (1 << 28) +#define VIRTIO_F_RING_EVENT_IDX (1 << 29) + + +void viona_ring_alloc(viona_link_t *, viona_vring_t *); +void 
viona_ring_free(viona_vring_t *); +int viona_ring_reset(viona_vring_t *, boolean_t); +int viona_ring_init(viona_link_t *, uint16_t, uint16_t, uint64_t); +boolean_t viona_ring_lease_renew(viona_vring_t *); +int vq_popchain(viona_vring_t *, struct iovec *, uint_t, uint16_t *); +void vq_pushchain(viona_vring_t *, uint32_t, uint16_t); +void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *); +void viona_intr_ring(viona_vring_t *ring); + +void viona_rx_init(void); +void viona_rx_fini(void); +int viona_rx_set(viona_link_t *); +void viona_rx_clear(viona_link_t *); +void viona_worker_rx(viona_vring_t *, viona_link_t *); + +extern kmutex_t viona_force_copy_lock; +void viona_worker_tx(viona_vring_t *, viona_link_t *); +void viona_tx_ring_alloc(viona_vring_t *, const uint16_t); +void viona_tx_ring_free(viona_vring_t *, const uint16_t); + +void viona_neti_attach(void); +void viona_neti_detach(void); +viona_neti_t *viona_neti_lookup_by_zid(zoneid_t); +void viona_neti_rele(viona_neti_t *); +int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t); + +#endif /* _VIONA_IMPL_H */ diff --git a/usr/src/uts/i86pc/io/viona/viona_main.c b/usr/src/uts/i86pc/io/viona/viona_main.c new file mode 100644 index 0000000000..f51a1f9b12 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_main.c @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +/* + * viona - VirtIO-Net, Accelerated + * + * The purpose of viona is to provide high performance virtio-net devices to + * bhyve guests. It does so by sitting directly atop MAC, skipping all of the + * DLS/DLD stack. 
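To make the data path concrete, here is a rough sketch of how a ring worker consumes guest descriptors using the helpers declared in viona_impl.h above. The function name is hypothetical, and the real workers add statistics, error handling, and vmm lease management; only the vq_popchain()/vq_pushchain()/viona_intr_ring() calls and their signatures come from this patch:

	static void
	viona_ring_consume_sketch(viona_vring_t *ring)
	{
		struct iovec iov[8];
		uint16_t cookie;
		uint32_t len = 0;
		int n;

		/* Pop one chain of available descriptors posted by the guest. */
		n = vq_popchain(ring, iov, 8, &cookie);
		if (n <= 0)
			return;

		/* ... copy or map the data described by iov[0..n-1], accumulating len ... */

		/* Return the chain to the used ring and notify the guest. */
		vq_pushchain(ring, len, cookie);
		viona_intr_ring(ring);
	}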
+ * + * -------------------- + * General Architecture + * -------------------- + * + * A single viona instance is comprised of a "link" handle and two "rings". + * After opening the viona device, it must be associated with a MAC network + * interface and a bhyve (vmm) instance to form its link resource. This is + * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are + * passed in to perform the initialization. With the MAC client opened, and a + * driver handle to the vmm instance established, the device is ready to be + * configured by the guest. + * + * The userspace portion of bhyve, which interfaces with the PCI device + * emulation framework, is meant to stay out of the datapath if at all + * possible. Configuration changes made via PCI are mapped to actions which + * will steer the operation of the in-kernel logic. + * + * + * ----------- + * Ring Basics + * ----------- + * + * Each viona link has two viona_vring_t entities, RX and TX, for handling data + * transfers to and from the guest. They represent an interface to the + * standard virtio ring structures. When intiailized and active, each ring is + * backed by a kernel worker thread (parented to the bhyve process for the + * instance) which handles ring events. The RX worker has the simple task of + * watching for ring shutdown conditions. The TX worker does that in addition + * to processing all requests to transmit data. Data destined for the guest is + * delivered directly by MAC to viona_rx() when the ring is active. + * + * + * ----------- + * Ring States + * ----------- + * + * The viona_vring_t instances follow a simple path through the possible state + * values represented in virtio_vring_t`vr_state: + * + * +<--------------------------------------------+ + * | | + * V ^ + * +-----------+ This is the initial state when a link is created or + * | VRS_RESET | when the ring has been explicitly reset. + * +-----------+ + * | ^ + * |---* ioctl(VNA_IOC_RING_INIT) issued | + * | | + * | ^ + * V + * +-----------+ The ring parameters (size, guest physical addresses) + * | VRS_SETUP | have been set and start-up of the ring worker thread + * +-----------+ has begun. + * | ^ + * | | + * |---* ring worker thread begins execution | + * | | + * +-------------------------------------------->+ + * | | ^ + * | | + * | * If ring shutdown is requested (by ioctl or impending + * | bhyve process death) while the worker thread is + * | starting, the worker will transition the ring to + * | VRS_RESET and exit. + * | ^ + * | | + * | ^ + * V + * +-----------+ The worker thread associated with the ring has started + * | VRS_INIT | executing. It has allocated any extra resources needed + * +-----------+ for the ring to operate. + * | ^ + * | | + * +-------------------------------------------->+ + * | | ^ + * | | + * | * If ring shutdown is requested while the worker is + * | waiting in VRS_INIT, it will free any extra resources + * | and transition to VRS_RESET. + * | ^ + * | | + * |--* ioctl(VNA_IOC_RING_KICK) issued | + * | ^ + * V + * +-----------+ The worker thread associated with the ring is executing + * | VRS_RUN | workload specific to that ring. + * +-----------+ + * | ^ + * |---* ioctl(VNA_IOC_RING_RESET) issued | + * | (or bhyve process begins exit) ^ + * | + * +-----------+ The worker thread associated with the ring is in the + * | VRS_STOP | process of exiting. All outstanding TX and RX + * +-----------+ requests are allowed to complete, but new requests + * | must be ignored. 
+ * | ^ + * | | + * +-------------------------------------------->+ + * + * + * While the worker thread is not running, changes to vr_state are only made by + * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts + * the worker, and sets the ring state to VRS_SETUP. Once the worker thread + * has been started, only it may perform ring state transitions (still under + * the protection of vr_lock), when requested by outside consumers via + * vr_state_flags or when the containing bhyve process initiates an exit. + * + * + * ---------------------------- + * Transmission mblk_t Handling + * ---------------------------- + * + * For incoming frames destined for a bhyve guest, the data must first land in + * a host OS buffer from the physical NIC before it is copied into the awaiting + * guest buffer(s). Outbound frames transmitted by the guest are not bound by + * this limitation and can avoid extra copying before the buffers are accessed + * directly by the NIC. When a guest designates buffers to be transmitted, + * viona translates the guest-physical addresses contained in the ring + * descriptors to host-virtual addresses via vmm_dr_gpa2kva(). That pointer is + * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc(). + * Doing so increments vr_xfer_outstanding, preventing the ring from being + * reset (allowing the link to drop its vmm handle to the guest) until all + * transmit mblks referencing guest memory have been processed. Allocation of + * the viona_desb_t entries is done during the VRS_INIT stage of the ring + * worker thread. The ring size informs that allocation as the number of + * concurrent transmissions is limited by the number of descriptors in the + * ring. This minimizes allocation in the transmit hot-path by aqcuiring those + * fixed-size resources during initialization. + * + * This optimization depends on the underlying NIC driver freeing the mblks in + * a timely manner after they have been transmitted by the hardware. Some + * drivers have been found to flush TX descriptors only when new transmissions + * are initiated. This means that there is no upper bound to the time needed + * for an mblk to be flushed and can stall bhyve guests from shutting down + * since their memory must be free of viona TX references prior to clean-up. + * + * This expectation of deterministic mblk_t processing is likely the reason + * behind the notable exception to the zero-copy TX path: systems with 'bnxe' + * loaded will copy transmit data into fresh buffers rather than passing up + * zero-copy mblks. It is a hold-over from the original viona sources provided + * by Pluribus and its continued necessity has not been confirmed. + * + * + * ---------------------------- + * Ring Notification Fast-paths + * ---------------------------- + * + * Device operation for viona requires that notifications flow to and from the + * guest to indicate certain ring conditions. In order to minimize latency and + * processing overhead, the notification procedures are kept in-kernel whenever + * possible. + * + * Guest-to-host notifications, when new available descriptors have been placed + * in the ring, are posted via the 'queue notify' address in the virtio BAR. + * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to + * install a callback hook on an ioport address. Guest exits for accesses to + * viona-hooked ioport addresses will result in direct calls to notify the + * appropriate ring worker without a trip to userland. 
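As an illustration of what the in-kernel kick amounts to, a hypothetical callback body is sketched below; the committed hook callback carries port/size arguments and additional state checks, but the essential action is to wake the ring's worker thread without a trip to userland (vr_lock, vr_cv, and vr_state are the viona_vring_t fields defined in viona_impl.h):

	static void
	viona_ring_kick_sketch(viona_vring_t *ring)
	{
		mutex_enter(&ring->vr_lock);
		if (ring->vr_state == VRS_RUN) {
			/* Wake the worker thread waiting on this ring. */
			cv_broadcast(&ring->vr_cv);
		}
		mutex_exit(&ring->vr_lock);
	}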
+ * + * Host-to-guest notifications in the form of interrupts enjoy similar + * acceleration. Each viona ring can be configured to send MSI notifications + * to the guest as virtio conditions dictate. This in-kernel interrupt + * configuration is kept synchronized through viona ioctls which are utilized + * during writes to the associated PCI config registers or MSI-X BAR. + * + * Guests which do not utilize MSI-X will result in viona falling back to the + * slow path for interrupts. It will poll(2) the viona handle, receiving + * notification when ring events necessitate the assertion of an interrupt. + * + * + * --------------- + * Nethook Support + * --------------- + * + * Viona provides four nethook events that consumers (e.g. ipf) can hook into + * to intercept packets as they go up or down the stack. Unfortunately, + * the nethook framework does not understand raw packets, so we can only + * generate events (in, out) for IPv4 and IPv6 packets. At driver attach, + * we register callbacks with the neti (netinfo) module that will be invoked + * for each netstack already present, as well as for any additional netstack + * instances created as the system operates. These callbacks will + * register/unregister the hooks with the nethook framework for each + * netstack instance. This registration occurs prior to creating any + * viona instances for a given netstack, and the unregistration for a netstack + * instance occurs after all viona instances of the netstack instance have + * been deleted. + */ + +#include <sys/conf.h> +#include <sys/file.h> +#include <sys/stat.h> + +#include <sys/dlpi.h> + +#include "viona_impl.h" + + +#define VIONA_NAME "Virtio Network Accelerator" +#define VIONA_CTL_MINOR 0 +#define VIONA_CLI_NAME "viona" /* MAC client name */ + + +/* + * Host capabilities. 
+ */ +#define VIONA_S_HOSTCAPS ( \ + VIRTIO_NET_F_GUEST_CSUM | \ + VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_NET_F_STATUS | \ + VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ + VIRTIO_F_RING_INDIRECT_DESC) + +/* MAC_CAPAB_HCKSUM specifics of interest */ +#define VIONA_CAP_HCKSUM_INTEREST \ + (HCKSUM_INET_PARTIAL | \ + HCKSUM_INET_FULL_V4 | \ + HCKSUM_INET_FULL_V6) + +static void *viona_state; +static dev_info_t *viona_dip; +static id_space_t *viona_minors; + + +static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, + void **result); +static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); +static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); +static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, + cred_t *credp, int *rval); +static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); +static int viona_ioc_delete(viona_soft_state_t *, boolean_t); + +static int viona_ioc_set_notify_ioport(viona_link_t *, uint_t); +static int viona_ioc_ring_init(viona_link_t *, void *, int); +static int viona_ioc_ring_reset(viona_link_t *, uint_t); +static int viona_ioc_ring_kick(viona_link_t *, uint_t); +static int viona_ioc_ring_set_msi(viona_link_t *, void *, int); +static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t); +static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *); + +static struct cb_ops viona_cb_ops = { + viona_open, + viona_close, + nodev, + nodev, + nodev, + nodev, + nodev, + viona_ioctl, + nodev, + nodev, + nodev, + viona_chpoll, + ddi_prop_op, + 0, + D_MP | D_NEW | D_HOTPLUG, + CB_REV, + nodev, + nodev +}; + +static struct dev_ops viona_ops = { + DEVO_REV, + 0, + viona_info, + nulldev, + nulldev, + viona_attach, + viona_detach, + nodev, + &viona_cb_ops, + NULL, + ddi_power, + ddi_quiesce_not_needed +}; + +static struct modldrv modldrv = { + &mod_driverops, + VIONA_NAME, + &viona_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +int +_init(void) +{ + int ret; + + ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0); + if (ret != 0) { + return (ret); + } + + viona_minors = id_space_create("viona_minors", + VIONA_CTL_MINOR + 1, UINT16_MAX); + viona_rx_init(); + mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL); + + ret = mod_install(&modlinkage); + if (ret != 0) { + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + } + + return (ret); +} + +int +_fini(void) +{ + int ret; + + ret = mod_remove(&modlinkage); + if (ret != 0) { + return (ret); + } + + ddi_soft_state_fini(&viona_state); + id_space_destroy(viona_minors); + viona_rx_fini(); + mutex_destroy(&viona_force_copy_lock); + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* ARGSUSED */ +static int +viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)viona_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int 
+viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + viona_neti_attach(); + + viona_dip = dip; + ddi_report_dev(viona_dip); + + return (DDI_SUCCESS); +} + +static int +viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + dev_info_t *old_dip = viona_dip; + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + VERIFY(old_dip != NULL); + + viona_neti_detach(); + viona_dip = NULL; + ddi_remove_minor_node(old_dip, NULL); + + return (DDI_SUCCESS); +} + +static int +viona_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } +#if 0 + /* + * XXX-mg: drv_priv() is wrong, but I'm not sure what is right. + * Should the check be at open() or ioctl()? + */ + if (drv_priv(credp) != 0) { + return (EPERM); + } +#endif + if (getminor(*devp) != VIONA_CTL_MINOR) { + return (ENXIO); + } + + minor = id_alloc_nosleep(viona_minors); + if (minor == -1) { + /* All minors are busy */ + return (EBUSY); + } + if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) { + id_free(viona_minors, minor); + return (ENOMEM); + } + + ss = ddi_get_soft_state(viona_state, minor); + mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL); + *devp = makedevice(getmajor(*devp), minor); + + return (0); +} + +static int +viona_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + int minor; + viona_soft_state_t *ss; + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + minor = getminor(dev); + + ss = ddi_get_soft_state(viona_state, minor); + if (ss == NULL) { + return (ENXIO); + } + + VERIFY0(viona_ioc_delete(ss, B_TRUE)); + VERIFY(!list_link_active(&ss->ss_node)); + ddi_soft_state_free(viona_state, minor); + id_free(viona_minors, minor); + + return (0); +} + +static int +viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv) +{ + viona_soft_state_t *ss; + void *dptr = (void *)data; + int err = 0, val; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_CREATE: + return (viona_ioc_create(ss, dptr, md, cr)); + case VNA_IOC_DELETE: + return (viona_ioc_delete(ss, B_FALSE)); + default: + break; + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed || + vmm_drv_release_reqd(link->l_vm_hold)) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + switch (cmd) { + case VNA_IOC_GET_FEATURES: + val = VIONA_S_HOSTCAPS | link->l_features_hw; + if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) { + err = EFAULT; + } + break; + case VNA_IOC_SET_FEATURES: + if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) { + err = EFAULT; + break; + } + val &= (VIONA_S_HOSTCAPS | link->l_features_hw); + + if ((val & VIRTIO_NET_F_CSUM) == 0) + val &= ~VIRTIO_NET_F_HOST_TSO4; + + if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) + val &= ~VIRTIO_NET_F_GUEST_TSO4; + + link->l_features = val; + break; + case VNA_IOC_RING_INIT: + err = viona_ioc_ring_init(link, dptr, md); + break; + case VNA_IOC_RING_RESET: + err = viona_ioc_ring_reset(link, (uint_t)data); + break; + case VNA_IOC_RING_KICK: + err = viona_ioc_ring_kick(link, (uint_t)data); + break; + case VNA_IOC_RING_SET_MSI: + err = viona_ioc_ring_set_msi(link, dptr, md); + break; + case VNA_IOC_RING_INTR_CLR: + err = viona_ioc_ring_intr_clear(link, 
(uint_t)data); + break; + case VNA_IOC_INTR_POLL: + err = viona_ioc_intr_poll(link, dptr, md, rv); + break; + case VNA_IOC_SET_NOTIFY_IOP: + err = viona_ioc_set_notify_ioport(link, (uint_t)data); + break; + default: + err = ENOTTY; + break; + } + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + viona_soft_state_t *ss; + viona_link_t *link; + + ss = ddi_get_soft_state(viona_state, getminor(dev)); + if (ss == NULL) { + return (ENXIO); + } + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL || link->l_destroyed) { + mutex_exit(&ss->ss_lock); + return (ENXIO); + } + + *reventsp = 0; + if ((events & POLLRDBAND) != 0) { + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + if (link->l_vrings[i].vr_intr_enabled != 0) { + *reventsp |= POLLRDBAND; + break; + } + } + } + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = &link->l_pollhead; + } + mutex_exit(&ss->ss_lock); + + return (0); +} + +static void +viona_get_mac_capab(viona_link_t *link) +{ + mac_handle_t mh = link->l_mh; + uint32_t cap = 0; + mac_capab_lso_t lso_cap; + + link->l_features_hw = 0; + if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) { + /* + * Only report HW checksum ability if the underlying MAC + * resource is capable of populating the L4 header. + */ + if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) { + link->l_features_hw |= VIRTIO_NET_F_CSUM; + } + link->l_cap_csum = cap; + } + + if ((link->l_features_hw & VIRTIO_NET_F_CSUM) && + mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) { + /* + * Virtio doesn't allow for negotiating a maximum LSO + * packet size. We have to assume that the guest may + * send a maximum length IP packet. Make sure the + * underlying MAC can handle an LSO of this size. 
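+ * In practice this means lso_max must cover at least IP_MAXPACKET
+ * (65535 octets) before VIRTIO_NET_F_HOST_TSO4 is advertised to the
+ * guest.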
+ */ + if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) && + lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) + link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4; + } +} + +static int +viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr) +{ + vioc_create_t kvc; + viona_link_t *link = NULL; + char cli_name[MAXNAMELEN]; + int err = 0; + file_t *fp; + vmm_hold_t *hold = NULL; + viona_neti_t *nip = NULL; + zoneid_t zid; + + ASSERT(MUTEX_NOT_HELD(&ss->ss_lock)); + + if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) { + return (EFAULT); + } + + zid = crgetzoneid(cr); + nip = viona_neti_lookup_by_zid(zid); + if (nip == NULL) { + return (EIO); + } + + if (!nip->vni_nethook.vnh_hooked) { + viona_neti_rele(nip); + return (EIO); + } + + mutex_enter(&ss->ss_lock); + if (ss->ss_link != NULL) { + mutex_exit(&ss->ss_lock); + viona_neti_rele(nip); + return (EEXIST); + } + + if ((fp = getf(kvc.c_vmfd)) == NULL) { + err = EBADF; + goto bail; + } + err = vmm_drv_hold(fp, cr, &hold); + releasef(kvc.c_vmfd); + if (err != 0) { + goto bail; + } + + link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP); + link->l_linkid = kvc.c_linkid; + link->l_vm_hold = hold; + + err = mac_open_by_linkid(link->l_linkid, &link->l_mh); + if (err != 0) { + goto bail; + } + + viona_get_mac_capab(link); + + (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME, + link->l_linkid); + err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0); + if (err != 0) { + goto bail; + } + + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]); + viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]); + + if ((err = viona_rx_set(link)) != 0) { + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + goto bail; + } + + link->l_neti = nip; + ss->ss_link = link; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_insert_tail(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + return (0); + +bail: + if (link != NULL) { + if (link->l_mch != NULL) { + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + kmem_free(link, sizeof (viona_link_t)); + } + if (hold != NULL) { + vmm_drv_rele(hold); + } + viona_neti_rele(nip); + + mutex_exit(&ss->ss_lock); + return (err); +} + +static int +viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close) +{ + viona_link_t *link; + viona_neti_t *nip = NULL; + + mutex_enter(&ss->ss_lock); + if ((link = ss->ss_link) == NULL) { + /* Link destruction already complete */ + mutex_exit(&ss->ss_lock); + return (0); + } + + if (link->l_destroyed) { + /* + * Link destruction has been started by another thread, but has + * not completed. This condition should be impossible to + * encounter when performing the on-close destroy of the link, + * since racing ioctl accessors must necessarily be absent. + */ + VERIFY(!on_close); + mutex_exit(&ss->ss_lock); + return (EAGAIN); + } + /* + * The link deletion cannot fail after this point, continuing until its + * successful completion is reached. + */ + link->l_destroyed = B_TRUE; + + /* + * Tear down the IO port hook so it cannot be used to kick any of the + * rings which are about to be reset and stopped. + */ + VERIFY0(viona_ioc_set_notify_ioport(link, 0)); + mutex_exit(&ss->ss_lock); + + /* + * Return the rings to their reset state, ignoring any possible + * interruptions from signals. 
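+ * Passing B_FALSE for heed_signals makes viona_ring_reset() wait with
+ * cv_wait() rather than cv_wait_sig(), so the teardown cannot bail out
+ * early with EINTR.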
+ */ + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE)); + VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE)); + + mutex_enter(&ss->ss_lock); + if (link->l_mch != NULL) { + /* Unhook the receive callbacks and close out the client */ + viona_rx_clear(link); + mac_client_close(link->l_mch, 0); + } + if (link->l_mh != NULL) { + mac_close(link->l_mh); + } + if (link->l_vm_hold != NULL) { + vmm_drv_rele(link->l_vm_hold); + link->l_vm_hold = NULL; + } + + nip = link->l_neti; + link->l_neti = NULL; + + viona_ring_free(&link->l_vrings[VIONA_VQ_RX]); + viona_ring_free(&link->l_vrings[VIONA_VQ_TX]); + pollhead_clean(&link->l_pollhead); + ss->ss_link = NULL; + mutex_exit(&ss->ss_lock); + + mutex_enter(&nip->vni_lock); + list_remove(&nip->vni_dev_list, ss); + mutex_exit(&nip->vni_lock); + + viona_neti_rele(nip); + + kmem_free(link, sizeof (viona_link_t)); + return (0); +} + +static int +viona_ioc_ring_init(viona_link_t *link, void *udata, int md) +{ + vioc_ring_init_t kri; + int err; + + if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) { + return (EFAULT); + } + + err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr); + + return (err); +} + +static int +viona_ioc_ring_reset(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + return (viona_ring_reset(ring, B_TRUE)); +} + +static int +viona_ioc_ring_kick(viona_link_t *link, uint_t idx) +{ + viona_vring_t *ring; + int err; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + ring = &link->l_vrings[idx]; + + mutex_enter(&ring->vr_lock); + switch (ring->vr_state) { + case VRS_SETUP: + /* + * An early kick to a ring which is starting its worker thread + * is fine. Once that thread is active, it will process the + * start-up request immediately. 
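+ * The worker reports VRS_INIT once it is alive and then waits on vr_cv
+ * for VRSF_REQ_START before transitioning to VRS_RUN.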
+ */ + /* FALLTHROUGH */ + case VRS_INIT: + ring->vr_state_flags |= VRSF_REQ_START; + /* FALLTHROUGH */ + case VRS_RUN: + cv_broadcast(&ring->vr_cv); + err = 0; + break; + default: + err = EBUSY; + break; + } + mutex_exit(&ring->vr_lock); + + return (err); +} + +static int +viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md) +{ + vioc_ring_msi_t vrm; + viona_vring_t *ring; + + if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) { + return (EFAULT); + } + if (vrm.rm_index >= VIONA_VQ_MAX) { + return (EINVAL); + } + + ring = &link->l_vrings[vrm.rm_index]; + mutex_enter(&ring->vr_lock); + ring->vr_msi_addr = vrm.rm_addr; + ring->vr_msi_msg = vrm.rm_msg; + mutex_exit(&ring->vr_lock); + + return (0); +} + +static int +viona_notify_wcb(void *arg, uintptr_t ioport, uint_t sz, uint64_t val) +{ + viona_link_t *link = (viona_link_t *)arg; + uint16_t vq = (uint16_t)val; + + if (ioport != link->l_notify_ioport || sz != sizeof (uint16_t)) { + return (EINVAL); + } + return (viona_ioc_ring_kick(link, vq)); +} + +static int +viona_ioc_set_notify_ioport(viona_link_t *link, uint_t ioport) +{ + int err = 0; + + if (link->l_notify_ioport != 0) { + vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie); + link->l_notify_ioport = 0; + } + + if (ioport != 0) { + err = vmm_drv_ioport_hook(link->l_vm_hold, ioport, NULL, + viona_notify_wcb, (void *)link, &link->l_notify_cookie); + if (err == 0) { + link->l_notify_ioport = ioport; + } + } + return (err); +} + +static int +viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx) +{ + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + + link->l_vrings[idx].vr_intr_enabled = 0; + return (0); +} + +static int +viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv) +{ + uint_t cnt = 0; + vioc_intr_poll_t vip; + + for (uint_t i = 0; i < VIONA_VQ_MAX; i++) { + uint_t val = link->l_vrings[i].vr_intr_enabled; + + vip.vip_status[i] = val; + if (val != 0) { + cnt++; + } + } + + if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) { + return (EFAULT); + } + *rv = (int)cnt; + return (0); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_ring.c b/usr/src/uts/i86pc/io/viona/viona_ring.c new file mode 100644 index 0000000000..5ba6fad963 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_ring.c @@ -0,0 +1,638 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + + +#include <sys/disp.h> + +#include "viona_impl.h" + +#define VRING_ALIGN 4096 +#define VRING_MAX_LEN 32768 + +static boolean_t viona_ring_map(viona_vring_t *); +static void viona_ring_unmap(viona_vring_t *); +static kthread_t *viona_create_worker(viona_vring_t *); + +static void * +viona_gpa2kva(viona_vring_t *ring, uint64_t gpa, size_t len) +{ + ASSERT3P(ring->vr_lease, !=, NULL); + + return (vmm_drv_gpa2kva(ring->vr_lease, gpa, len)); +} + +static boolean_t +viona_ring_lease_expire_cb(void *arg) +{ + viona_vring_t *ring = arg; + + cv_broadcast(&ring->vr_cv); + + /* The lease will be broken asynchronously. */ + return (B_FALSE); +} + +static void +viona_ring_lease_drop(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + if (ring->vr_lease != NULL) { + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + + /* + * Without an active lease, the ring mappings cannot be + * considered valid. + */ + viona_ring_unmap(ring); + + vmm_drv_lease_break(hold, ring->vr_lease); + ring->vr_lease = NULL; + } +} + +boolean_t +viona_ring_lease_renew(viona_vring_t *ring) +{ + vmm_hold_t *hold = ring->vr_link->l_vm_hold; + + ASSERT(hold != NULL); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + viona_ring_lease_drop(ring); + + /* + * Lease renewal will fail if the VM has requested that all holds be + * cleaned up. + */ + ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, + ring); + if (ring->vr_lease != NULL) { + /* A ring undergoing renewal will need valid guest mappings */ + if (ring->vr_pa != 0 && ring->vr_size != 0) { + /* + * If new mappings cannot be established, consider the + * lease renewal a failure. 
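+ * The old mappings were torn down by viona_ring_lease_drop() above,
+ * so they must be re-established against the new lease before use.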
+ */ + if (!viona_ring_map(ring)) { + viona_ring_lease_drop(ring); + return (B_FALSE); + } + } + } + return (ring->vr_lease != NULL); +} + +void +viona_ring_alloc(viona_link_t *link, viona_vring_t *ring) +{ + ring->vr_link = link; + mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL); + mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL); +} + +static void +viona_ring_misc_free(viona_vring_t *ring) +{ + const uint_t qsz = ring->vr_size; + + viona_tx_ring_free(ring, qsz); +} + +void +viona_ring_free(viona_vring_t *ring) +{ + mutex_destroy(&ring->vr_lock); + cv_destroy(&ring->vr_cv); + mutex_destroy(&ring->vr_a_mutex); + mutex_destroy(&ring->vr_u_mutex); + ring->vr_link = NULL; +} + +int +viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa) +{ + viona_vring_t *ring; + kthread_t *t; + int err = 0; + + if (idx >= VIONA_VQ_MAX) { + return (EINVAL); + } + if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { + return (EINVAL); + } + + ring = &link->l_vrings[idx]; + mutex_enter(&ring->vr_lock); + if (ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EBUSY); + } + VERIFY(ring->vr_state_flags == 0); + + ring->vr_lease = NULL; + if (!viona_ring_lease_renew(ring)) { + err = EBUSY; + goto fail; + } + + ring->vr_size = qsz; + ring->vr_mask = (ring->vr_size - 1); + ring->vr_pa = pa; + if (!viona_ring_map(ring)) { + err = EINVAL; + goto fail; + } + + /* Initialize queue indexes */ + ring->vr_cur_aidx = 0; + + if (idx == VIONA_VQ_TX) { + viona_tx_ring_alloc(ring, qsz); + } + + /* Zero out MSI-X configuration */ + ring->vr_msi_addr = 0; + ring->vr_msi_msg = 0; + + /* Clear the stats */ + bzero(&ring->vr_stats, sizeof (ring->vr_stats)); + + t = viona_create_worker(ring); + if (t == NULL) { + err = ENOMEM; + goto fail; + } + ring->vr_worker_thread = t; + ring->vr_state = VRS_SETUP; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + return (0); + +fail: + viona_ring_lease_drop(ring); + viona_ring_misc_free(ring); + ring->vr_size = 0; + ring->vr_mask = 0; + mutex_exit(&ring->vr_lock); + return (err); +} + +int +viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) +{ + mutex_enter(&ring->vr_lock); + if (ring->vr_state == VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (0); + } + + if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) { + ring->vr_state_flags |= VRSF_REQ_STOP; + cv_broadcast(&ring->vr_cv); + } + while (ring->vr_state != VRS_RESET) { + if (!heed_signals) { + cv_wait(&ring->vr_cv, &ring->vr_lock); + } else { + int rs; + + rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + if (rs <= 0 && ring->vr_state != VRS_RESET) { + mutex_exit(&ring->vr_lock); + return (EINTR); + } + } + } + viona_ring_lease_drop(ring); + mutex_exit(&ring->vr_lock); + return (0); +} + +static boolean_t +viona_ring_map(viona_vring_t *ring) +{ + uint64_t pos = ring->vr_pa; + const uint16_t qsz = ring->vr_size; + + ASSERT3U(qsz, !=, 0); + ASSERT3U(pos, !=, 0); + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + const size_t desc_sz = qsz * sizeof (struct virtio_desc); + ring->vr_descr = viona_gpa2kva(ring, pos, desc_sz); + if (ring->vr_descr == NULL) { + goto fail; + } + pos += desc_sz; + + const size_t avail_sz = (qsz + 3) * sizeof (uint16_t); + ring->vr_avail_flags = viona_gpa2kva(ring, pos, avail_sz); + if (ring->vr_avail_flags == NULL) { + goto fail; + } + ring->vr_avail_idx = ring->vr_avail_flags + 1; + ring->vr_avail_ring = ring->vr_avail_flags + 
2; + ring->vr_avail_used_event = ring->vr_avail_ring + qsz; + pos += avail_sz; + + const size_t used_sz = (qsz * sizeof (struct virtio_used)) + + (sizeof (uint16_t) * 3); + pos = P2ROUNDUP(pos, VRING_ALIGN); + ring->vr_used_flags = viona_gpa2kva(ring, pos, used_sz); + if (ring->vr_used_flags == NULL) { + goto fail; + } + ring->vr_used_idx = ring->vr_used_flags + 1; + ring->vr_used_ring = (struct virtio_used *)(ring->vr_used_flags + 2); + ring->vr_used_avail_event = (uint16_t *)(ring->vr_used_ring + qsz); + + return (B_TRUE); + +fail: + viona_ring_unmap(ring); + return (B_FALSE); +} + +static void +viona_ring_unmap(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_descr = NULL; + ring->vr_avail_flags = NULL; + ring->vr_avail_idx = NULL; + ring->vr_avail_ring = NULL; + ring->vr_avail_used_event = NULL; + ring->vr_used_flags = NULL; + ring->vr_used_idx = NULL; + ring->vr_used_ring = NULL; + ring->vr_used_avail_event = NULL; +} + +void +viona_intr_ring(viona_vring_t *ring) +{ + uint64_t addr; + + mutex_enter(&ring->vr_lock); + /* Deliver the interrupt directly, if so configured. */ + if ((addr = ring->vr_msi_addr) != 0) { + uint64_t msg = ring->vr_msi_msg; + + mutex_exit(&ring->vr_lock); + (void) vmm_drv_msi(ring->vr_lease, addr, msg); + return; + } + mutex_exit(&ring->vr_lock); + + if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) { + pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND); + } +} + +static void +viona_worker(void *arg) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + viona_link_t *link = ring->vr_link; + proc_t *p = ttoproc(curthread); + + mutex_enter(&ring->vr_lock); + VERIFY3U(ring->vr_state, ==, VRS_SETUP); + + /* Bail immediately if ring shutdown or process exit was requested */ + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + + /* Report worker thread as alive and notify creator */ + ring->vr_state = VRS_INIT; + cv_broadcast(&ring->vr_cv); + + while (ring->vr_state_flags == 0) { + /* + * Keeping lease renewals timely while waiting for the ring to + * be started is important for avoiding deadlocks. + */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + + if (VRING_NEED_BAIL(ring, p)) { + goto cleanup; + } + } + + ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0); + ring->vr_state = VRS_RUN; + ring->vr_state_flags &= ~VRSF_REQ_START; + + /* Ensure ring lease is valid first */ + if (vmm_drv_lease_expired(ring->vr_lease)) { + if (!viona_ring_lease_renew(ring)) { + goto cleanup; + } + } + + /* Process actual work */ + if (ring == &link->l_vrings[VIONA_VQ_RX]) { + viona_worker_rx(ring, link); + } else if (ring == &link->l_vrings[VIONA_VQ_TX]) { + viona_worker_tx(ring, link); + } else { + panic("unexpected ring: %p", (void *)ring); + } + + VERIFY3U(ring->vr_state, ==, VRS_STOP); + +cleanup: + if (ring->vr_txdesb != NULL) { + /* + * Transmit activity must be entirely concluded before the + * associated descriptors can be cleaned up. 
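+ * vr_xfer_outstanding counts in-flight transmissions which still
+ * reference guest memory; each is retired via viona_desb_release().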
+ */ + VERIFY(ring->vr_xfer_outstanding == 0); + } + viona_ring_misc_free(ring); + + viona_ring_lease_drop(ring); + ring->vr_cur_aidx = 0; + ring->vr_state = VRS_RESET; + ring->vr_state_flags = 0; + ring->vr_worker_thread = NULL; + cv_broadcast(&ring->vr_cv); + mutex_exit(&ring->vr_lock); + + mutex_enter(&ttoproc(curthread)->p_lock); + lwp_exit(); +} + +static kthread_t * +viona_create_worker(viona_vring_t *ring) +{ + k_sigset_t hold_set; + proc_t *p = curproc; + kthread_t *t; + klwp_t *lwp; + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT(ring->vr_state == VRS_RESET); + + sigfillset(&hold_set); + lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED, + minclsyspri - 1, &hold_set, curthread->t_cid, 0); + if (lwp == NULL) { + return (NULL); + } + + t = lwptot(lwp); + mutex_enter(&p->p_lock); + t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; + lwp_create_done(t); + mutex_exit(&p->p_lock); + + return (t); +} + +int +vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov, + uint16_t *cookie) +{ + uint_t i, ndesc, idx, head, next; + struct virtio_desc vdir; + void *buf; + + ASSERT(iov != NULL); + ASSERT(niov > 0 && niov < INT_MAX); + + mutex_enter(&ring->vr_a_mutex); + idx = ring->vr_cur_aidx; + ndesc = (uint16_t)((unsigned)*ring->vr_avail_idx - (unsigned)idx); + + if (ndesc == 0) { + mutex_exit(&ring->vr_a_mutex); + return (0); + } + if (ndesc > ring->vr_size) { + /* + * Despite the fact that the guest has provided an 'avail_idx' + * which indicates that an impossible number of descriptors are + * available, continue on and attempt to process the next one. + * + * The transgression will not escape the probe or stats though. + */ + VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring, + uint16_t, ndesc); + VIONA_RING_STAT_INCR(ring, ndesc_too_high); + } + + head = ring->vr_avail_ring[idx & ring->vr_mask]; + next = head; + + for (i = 0; i < niov; next = vdir.vd_next) { + if (next >= ring->vr_size) { + VIONA_PROBE2(bad_idx, viona_vring_t *, ring, + uint16_t, next); + VIONA_RING_STAT_INCR(ring, bad_idx); + goto bail; + } + + vdir = ring->vr_descr[next]; + if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) { + if (vdir.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vdir.vd_len; + i++; + } else { + const uint_t nindir = vdir.vd_len / 16; + volatile struct virtio_desc *vindir; + + if ((vdir.vd_len & 0xf) || nindir == 0) { + VIONA_PROBE2(indir_bad_len, + viona_vring_t *, ring, + uint32_t, vdir.vd_len); + VIONA_RING_STAT_INCR(ring, indir_bad_len); + goto bail; + } + vindir = viona_gpa2kva(ring, vdir.vd_addr, vdir.vd_len); + if (vindir == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, vdir.vd_addr); + VIONA_RING_STAT_INCR(ring, bad_ring_addr); + goto bail; + } + next = 0; + for (;;) { + struct virtio_desc vp; + + /* + * A copy of the indirect descriptor is made + * here, rather than simply using a reference + * pointer. This prevents malicious or + * erroneous guest writes to the descriptor + * from fooling the flags/bounds verification + * through a race. 
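+ * The flags, length, and address checks below therefore consult only
+ * the local copy 'vp', never the shared vindir memory.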
+ */ + vp = vindir[next]; + if (vp.vd_flags & VRING_DESC_F_INDIRECT) { + VIONA_PROBE1(indir_bad_nest, + viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, + indir_bad_nest); + goto bail; + } else if (vp.vd_len == 0) { + VIONA_PROBE2(desc_bad_len, + viona_vring_t *, ring, + uint32_t, vp.vd_len); + VIONA_RING_STAT_INCR(ring, + desc_bad_len); + goto bail; + } + buf = viona_gpa2kva(ring, vp.vd_addr, + vp.vd_len); + if (buf == NULL) { + VIONA_PROBE_BAD_RING_ADDR(ring, + vp.vd_addr); + VIONA_RING_STAT_INCR(ring, + bad_ring_addr); + goto bail; + } + iov[i].iov_base = buf; + iov[i].iov_len = vp.vd_len; + i++; + + if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) + break; + if (i >= niov) { + goto loopy; + } + + next = vp.vd_next; + if (next >= nindir) { + VIONA_PROBE3(indir_bad_next, + viona_vring_t *, ring, + uint16_t, next, + uint_t, nindir); + VIONA_RING_STAT_INCR(ring, + indir_bad_next); + goto bail; + } + } + } + if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) { + *cookie = head; + ring->vr_cur_aidx++; + mutex_exit(&ring->vr_a_mutex); + return (i); + } + } + +loopy: + VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, too_many_desc); +bail: + mutex_exit(&ring->vr_a_mutex); + return (-1); +} + +void +vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + volatile struct virtio_used *vu; + uint_t uidx; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = cookie; + vu->vu_tlen = len; + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} + +void +vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem) +{ + volatile struct virtio_used *vu; + uint_t uidx, i; + + mutex_enter(&ring->vr_u_mutex); + + uidx = *ring->vr_used_idx; + if (num_bufs == 1) { + vu = &ring->vr_used_ring[uidx++ & ring->vr_mask]; + vu->vu_idx = elem[0].id; + vu->vu_tlen = elem[0].len; + } else { + for (i = 0; i < num_bufs; i++) { + vu = &ring->vr_used_ring[(uidx + i) & ring->vr_mask]; + vu->vu_idx = elem[i].id; + vu->vu_tlen = elem[i].len; + } + uidx = uidx + num_bufs; + } + membar_producer(); + *ring->vr_used_idx = uidx; + + mutex_exit(&ring->vr_u_mutex); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_rx.c b/usr/src/uts/i86pc/io/viona/viona_rx.c new file mode 100644 index 0000000000..1ccbaa63f1 --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_rx.c @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/strsubr.h> + +#include <sys/dlpi.h> +#include <sys/pattr.h> +#include <sys/vlan.h> + +#include "viona_impl.h" + + + +#define VTNET_MAXSEGS 32 + +/* Min. octets in an ethernet frame minus FCS */ +#define MIN_BUF_SIZE 60 +#define NEED_VLAN_PAD_SIZE (MIN_BUF_SIZE - VLAN_TAGSZ) + +static mblk_t *viona_vlan_pad_mp; + +void +viona_rx_init(void) +{ + mblk_t *mp; + + ASSERT(viona_vlan_pad_mp == NULL); + + /* Create mblk for padding when VLAN tags are stripped */ + mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL); + bzero(mp->b_rptr, VLAN_TAGSZ); + mp->b_wptr += VLAN_TAGSZ; + viona_vlan_pad_mp = mp; +} + +void +viona_rx_fini(void) +{ + mblk_t *mp; + + /* Clean up the VLAN padding mblk */ + mp = viona_vlan_pad_mp; + viona_vlan_pad_mp = NULL; + VERIFY(mp != NULL && mp->b_cont == NULL); + freemsg(mp); +} + +void +viona_worker_rx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_rx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; + + do { + if (vmm_drv_lease_expired(ring->vr_lease)) { + /* + * Set the renewal flag, causing incoming traffic to be + * dropped, and issue an RX barrier to ensure any + * threads in the RX callbacks will have finished. + * The vr_lock cannot be held across the barrier as it + * poses a deadlock risk. + */ + ring->vr_state_flags |= VRSF_RENEW; + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + + /* + * For now, there is little to do in the RX worker as inbound + * data is delivered by MAC via the RX callbacks. If tap-like + * functionality is added later, this would be a convenient + * place to inject frames into the guest. + */ + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + } while (!VRING_NEED_BAIL(ring, p)); + + ring->vr_state = VRS_STOP; + + /* + * The RX ring is stopping, before we start tearing it down it + * is imperative that we perform an RX barrier so that + * incoming packets are dropped at viona_rx_classified(). 
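+ * As noted above, vr_lock cannot be held across the barrier, so it is
+ * dropped and re-acquired around the call.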
+ */ + mutex_exit(&ring->vr_lock); + mac_rx_barrier(link->l_mch); + mutex_enter(&ring->vr_lock); + + *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; +} + +static size_t +viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len, + boolean_t *end) +{ + size_t copied = 0; + size_t off = 0; + + /* Seek past already-consumed data */ + while (seek > 0 && mp != NULL) { + const size_t chunk = MBLKL(mp); + + if (chunk > seek) { + off = seek; + break; + } + mp = mp->b_cont; + seek -= chunk; + } + + while (mp != NULL) { + const size_t chunk = MBLKL(mp) - off; + const size_t to_copy = MIN(chunk, len); + + bcopy(mp->b_rptr + off, buf, to_copy); + copied += to_copy; + buf += to_copy; + len -= to_copy; + + /* + * If all the remaining data in the mblk_t was copied, move on + * to the next one in the chain. Any seek offset applied to + * the first mblk copy is zeroed out for subsequent operations. + */ + if (chunk == to_copy) { + mp = mp->b_cont; + off = 0; + } +#ifdef DEBUG + else { + /* + * The only valid reason for the copy to consume less + * than the entire contents of the mblk_t is because + * the output buffer has been filled. + */ + ASSERT0(len); + } +#endif + + /* Go no further if the buffer has been filled */ + if (len == 0) { + break; + } + + } + *end = (mp == NULL); + return (copied); +} + +static int +viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + uint16_t cookie; + int n; + const size_t hdr_sz = sizeof (struct virtio_net_hdr); + struct virtio_net_hdr *hdr; + size_t len, copied = 0; + caddr_t buf = NULL; + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + goto bad_frame; + } + + /* Grab the address of the header before anything else */ + hdr = (struct virtio_net_hdr *)iov[0].iov_base; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = (caddr_t)iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Copy any remaining data into subsequent buffers, if present */ + for (int i = 1; i < n && !end; i++) { + buf = (caddr_t)iov[i].iov_base; + len = iov[i].iov_len; + + copied += viona_copy_mblk(mp, copied, buf, len, &end); + } + + /* Was the expected amount of data copied? 
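+ * A shortfall means the descriptor chain offered by the guest was too
+ * small for the frame; it is counted as too_short and treated as a bad
+ * frame.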
*/ + if (copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + goto bad_frame; + } + + /* Populate (read: zero) the header and account for it in the size */ + bzero(hdr, hdr_sz); + copied += hdr_sz; + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + + /* Release this chain */ + vq_pushchain(ring, copied, cookie); + return (0); + +bad_frame: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie, + mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + + vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie); + return (EINVAL); +} + +static int +viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz) +{ + struct iovec iov[VTNET_MAXSEGS]; + used_elem_t uelem[VTNET_MAXSEGS]; + int n, i = 0, buf_idx = 0, err = 0; + uint16_t cookie; + caddr_t buf; + size_t len, copied = 0, chunk = 0; + struct virtio_net_mrgrxhdr *hdr = NULL; + const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr); + boolean_t end = B_FALSE; + const uint32_t features = ring->vr_link->l_features; + + ASSERT(msz >= MIN_BUF_SIZE); + + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* Without available buffers, the frame must be dropped. */ + VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, no_space); + return (ENOSPC); + } + if (iov[0].iov_len < hdr_sz) { + /* + * There is little to do if there is not even space available + * for the sole header. Zero the buffer and bail out as a last + * act of desperation. + */ + bzero(iov[0].iov_base, iov[0].iov_len); + uelem[0].id = cookie; + uelem[0].len = iov[0].iov_len; + err = EINVAL; + goto done; + } + + /* Grab the address of the header and do initial population */ + hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base; + bzero(hdr, hdr_sz); + hdr->vrh_bufs = 1; + + /* + * If there is any space remaining in the first buffer after writing + * the header, fill it with frame data. + */ + if (iov[0].iov_len > hdr_sz) { + buf = iov[0].iov_base + hdr_sz; + len = iov[0].iov_len - hdr_sz; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + } + i = 1; + + do { + while (i < n && !end) { + buf = iov[i].iov_base; + len = iov[i].iov_len; + + chunk += viona_copy_mblk(mp, copied, buf, len, &end); + copied += chunk; + i++; + } + + uelem[buf_idx].id = cookie; + uelem[buf_idx].len = chunk; + + /* + * Try to grab another buffer from the ring if the mblk has not + * yet been entirely copied out. + */ + if (!end) { + if (buf_idx == (VTNET_MAXSEGS - 1)) { + /* + * Our arbitrary limit on the number of buffers + * to offer for merge has already been reached. + */ + err = EOVERFLOW; + break; + } + n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie); + if (n <= 0) { + /* + * Without more immediate space to perform the + * copying, there is little choice left but to + * drop the packet. 
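+ * The EMSGSIZE result is tallied below as an rx_merge_underrun.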
+ */ + err = EMSGSIZE; + break; + } + chunk = 0; + i = 0; + buf_idx++; + /* + * Keep the header up-to-date with the number of + * buffers, but never reference its value since the + * guest could meddle with it. + */ + hdr->vrh_bufs++; + } + } while (!end && copied < msz); + + /* Account for the header size in the first buffer */ + uelem[0].len += hdr_sz; + + /* + * If no other errors were encounted during the copy, was the expected + * amount of data transfered? + */ + if (err == 0 && copied != msz) { + VIONA_PROBE5(too_short, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp, size_t, copied, + size_t, msz); + VIONA_RING_STAT_INCR(ring, too_short); + err = EINVAL; + } + + /* Add chksum bits, if needed */ + if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) { + uint32_t cksum_flags; + + if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) && + ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) { + hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4; + hdr->vrh_gso_size = DB_LSOMSS(mp); + } + + mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL, + &cksum_flags); + if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) { + hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID; + } + } + +done: + switch (err) { + case 0: + /* Success can fall right through to ring delivery */ + break; + + case EMSGSIZE: + VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, rx_merge_underrun); + break; + + case EOVERFLOW: + VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, rx_merge_overrun); + break; + + default: + VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, + uint16_t, cookie, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, bad_rx_frame); + } + vq_pushchain_many(ring, buf_idx + 1, uelem); + return (err); +} + +static void +viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback) +{ + viona_link_t *link = ring->vr_link; + mblk_t *mprx = NULL, **mprx_prevp = &mprx; + mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop; + const boolean_t do_merge = + ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0); + + size_t nrx = 0, ndrop = 0; + + while (mp != NULL) { + mblk_t *next = mp->b_next; + mblk_t *pad = NULL; + size_t size = msgsize(mp); + int err = 0; + + mp->b_next = NULL; + + /* + * We treat both a 'drop' response and errors the same here + * and put the packet on the drop chain. As packets may be + * subject to different actions in ipf (which do not all + * return the same set of error values), an error processing + * one packet doesn't mean the next packet will also generate + * an error. + */ + if (VNETHOOK_INTERESTED_IN(link->l_neti) && + viona_hook(link, ring, &mp, B_FALSE) != 0) { + if (mp != NULL) { + *mpdrop_prevp = mp; + mpdrop_prevp = &mp->b_next; + } else { + /* + * If the hook consumer (e.g. ipf) already + * freed the mblk_t, update the drop count now. + */ + ndrop++; + } + mp = next; + continue; + } + + /* + * Ethernet frames are expected to be padded out in order to + * meet the minimum size. + * + * A special case is made for frames which are short by + * VLAN_TAGSZ, having been stripped of their VLAN tag while + * traversing MAC. A preallocated (and recycled) mblk is used + * for that specific condition. + * + * All other frames that fall short on length will have custom + * zero-padding allocated appended to them. 
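+ * (MIN_BUF_SIZE is the 60-octet Ethernet minimum sans FCS, so the VLAN
+ * case matches frames of exactly MIN_BUF_SIZE - VLAN_TAGSZ octets.)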
+ */ + if (size == NEED_VLAN_PAD_SIZE) { + ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ); + ASSERT(viona_vlan_pad_mp->b_cont == NULL); + + for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont) + ; + + pad->b_cont = viona_vlan_pad_mp; + size += VLAN_TAGSZ; + } else if (size < MIN_BUF_SIZE) { + const size_t pad_size = MIN_BUF_SIZE - size; + mblk_t *zero_mp; + + zero_mp = allocb(pad_size, BPRI_MED); + if (zero_mp == NULL) { + err = ENOMEM; + goto pad_drop; + } + + VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring, + mblk_t *, mp, size_t, pad_size); + VIONA_RING_STAT_INCR(ring, rx_pad_short); + zero_mp->b_wptr += pad_size; + bzero(zero_mp->b_rptr, pad_size); + linkb(mp, zero_mp); + size += pad_size; + } + + if (do_merge) { + err = viona_recv_merged(ring, mp, size); + } else { + err = viona_recv_plain(ring, mp, size); + } + + /* + * The VLAN padding mblk is meant for continual reuse, so + * remove it from the chain to prevent it from being freed. + * + * Custom allocated padding does not require this treatment and + * is freed normally. + */ + if (pad != NULL) { + pad->b_cont = NULL; + } + +pad_drop: + /* + * While an error during rx processing + * (viona_recv_{merged,plain}) does not free mp on error, + * hook processing might or might not free mp. Handle either + * scenario -- if mp is not yet free, it is queued up and + * freed after the guest has been notified. If mp is + * already NULL, just proceed on. + */ + if (err != 0) { + *mpdrop_prevp = mp; + mpdrop_prevp = &mp->b_next; + + /* + * If the available ring is empty, do not bother + * attempting to deliver any more frames. Count the + * rest as dropped too. + */ + if (err == ENOSPC) { + mp->b_next = next; + break; + } + } else { + /* Chain successful mblks to be freed later */ + *mprx_prevp = mp; + mprx_prevp = &mp->b_next; + nrx++; + } + mp = next; + } + + membar_enter(); + if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + viona_intr_ring(ring); + } + + /* Free successfully received frames */ + if (mprx != NULL) { + freemsgchain(mprx); + } + + /* Free dropped frames, also tallying them */ + mp = mpdrop; + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + freemsg(mp); + mp = next; + ndrop++; + } + VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop); +} + +static void +viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + viona_rx_common(ring, mp, is_loopback); +} + +static void +viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t is_loopback) +{ + viona_vring_t *ring = (viona_vring_t *)arg; + mac_handle_t mh = ring->vr_link->l_mh; + mblk_t *mp_mcast_only = NULL; + mblk_t **mpp = &mp_mcast_only; + + /* Drop traffic if ring is inactive or renewing its lease */ + if (ring->vr_state != VRS_RUN || + (ring->vr_state_flags & VRSF_RENEW) != 0) { + freemsgchain(mp); + return; + } + + /* + * In addition to multicast traffic, broadcast packets will also arrive + * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback + * for fully-classified traffic has already delivered that broadcast + * traffic, so it should be suppressed here, rather than duplicating it + * to the guest. 
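+ * Frames which mac_vlan_header_info() does not classify as
+ * MAC_ADDRTYPE_MULTICAST are therefore freed below rather than chained
+ * for delivery.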
+ */ + while (mp != NULL) { + mblk_t *mp_next; + mac_header_info_t mhi; + int err; + + mp_next = mp->b_next; + mp->b_next = NULL; + + /* Determine the packet type */ + err = mac_vlan_header_info(mh, mp, &mhi); + if (err != 0) { + mblk_t *pull; + + /* + * It is possible that gathering of the header + * information was impeded by a leading mblk_t which + * was of inadequate length to reference the needed + * fields. Try again, in case that could be solved + * with a pull-up. + */ + pull = msgpullup(mp, sizeof (struct ether_vlan_header)); + if (pull == NULL) { + err = ENOMEM; + } else { + err = mac_vlan_header_info(mh, pull, &mhi); + freemsg(pull); + } + + if (err != 0) { + VIONA_RING_STAT_INCR(ring, rx_mcast_check); + } + } + + /* Chain up matching packets while discarding others */ + if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) { + *mpp = mp; + mpp = &mp->b_next; + } else { + freemsg(mp); + } + + mp = mp_next; + } + + if (mp_mcast_only != NULL) { + viona_rx_common(ring, mp_mcast_only, is_loopback); + } +} + +int +viona_rx_set(viona_link_t *link) +{ + viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX]; + int err; + + mac_rx_set(link->l_mch, viona_rx_classified, ring); + err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI, + viona_rx_mcast, ring, &link->l_mph, + MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); + if (err != 0) { + mac_rx_clear(link->l_mch); + } + + return (err); +} + +void +viona_rx_clear(viona_link_t *link) +{ + mac_promisc_remove(link->l_mph); + mac_rx_clear(link->l_mch); +} diff --git a/usr/src/uts/i86pc/io/viona/viona_tx.c b/usr/src/uts/i86pc/io/viona/viona_tx.c new file mode 100644 index 0000000000..5dc645723c --- /dev/null +++ b/usr/src/uts/i86pc/io/viona/viona_tx.c @@ -0,0 +1,756 @@ +/* + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. + */ + + +#include <sys/types.h> +#include <sys/smt.h> +#include <sys/strsubr.h> + +#include <sys/pattr.h> +#include <sys/dlpi.h> +#include <inet/ip.h> +#include <inet/ip_impl.h> + +#include "viona_impl.h" + +#define BNXE_NIC_DRIVER "bnxe" + +/* + * copy tx mbufs from virtio ring to avoid necessitating a wait for packet + * transmission to free resources. + */ +kmutex_t viona_force_copy_lock; +static enum viona_force_copy { + VFC_UNINITALIZED = 0, + VFC_COPY_UNEEDED = 1, + VFC_COPY_REQUIRED = 2, +} viona_force_copy_state = VFC_UNINITALIZED; + +struct viona_desb { + frtn_t d_frtn; + viona_vring_t *d_ring; + uint_t d_ref; + uint32_t d_len; + uint16_t d_cookie; + uchar_t *d_headers; +}; + +static void viona_tx(viona_link_t *, viona_vring_t *); +static void viona_desb_release(viona_desb_t *); + +/* + * Return the number of available descriptors in the vring taking care of the + * 16-bit index wraparound. + * + * Note: If the number of apparently available descriptors is larger than the + * ring size (due to guest misbehavior), this check will still report the + * positive count of descriptors. + */ +static inline uint_t +viona_vr_num_avail(viona_vring_t *ring) +{ + uint16_t ndesc; + + /* + * We're just computing (a-b) in GF(216). + * + * The only glitch here is that in standard C, uint16_t promotes to + * (signed) int when int has more than 16 bits (almost always now). + * A cast back to unsigned is necessary for proper operation. + */ + ndesc = (unsigned)*ring->vr_avail_idx - (unsigned)ring->vr_cur_aidx; + + return (ndesc); +} + +static void +viona_tx_wait_outstanding(viona_vring_t *ring) +{ + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + while (ring->vr_xfer_outstanding != 0) { + /* + * Paying heed to signals is counterproductive here. This is a + * very tight loop if pending transfers take an extended amount + * of time to be reclaimed while the host process is exiting. + */ + cv_wait(&ring->vr_cv, &ring->vr_lock); + } +} + +/* + * Check if full TX packet copying is needed. This should not be called from + * viona attach()/detach() context. + */ +static boolean_t +viona_tx_copy_needed(void) +{ + boolean_t result; + + mutex_enter(&viona_force_copy_lock); + if (viona_force_copy_state == VFC_UNINITALIZED) { + major_t bnxe_major; + + /* + * The original code for viona featured an explicit check for + * the bnxe driver which, when found present, necessitated that + * all transmissions be copied into their own mblks instead of + * passing guest memory to the underlying device. + * + * The motivations for this are unclear, but until it can be + * proven unnecessary, the check lives on. 
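+ * The answer is computed once and latched under viona_force_copy_lock,
+ * so ddi_hold_installed_driver() is attempted at most one time.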
+ */ + viona_force_copy_state = VFC_COPY_UNEEDED; + if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER)) + != DDI_MAJOR_T_NONE) { + if (ddi_hold_installed_driver(bnxe_major) != NULL) { + viona_force_copy_state = VFC_COPY_REQUIRED; + ddi_rele_driver(bnxe_major); + } + } + } + result = (viona_force_copy_state == VFC_COPY_REQUIRED); + mutex_exit(&viona_force_copy_lock); + + return (result); +} + +void +viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz) +{ + /* Allocate desb handles for TX ring if packet copying not disabled */ + if (!viona_tx_copy_needed()) { + viona_desb_t *dp; + + dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP); + ring->vr_txdesb = dp; + for (uint_t i = 0; i < qsz; i++, dp++) { + dp->d_frtn.free_func = viona_desb_release; + dp->d_frtn.free_arg = (void *)dp; + dp->d_ring = ring; + dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN, + KM_SLEEP); + } + } + + /* Allocate ring-sized iovec buffers for TX */ + ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP); +} + +void +viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz) +{ + if (ring->vr_txdesb != NULL) { + viona_desb_t *dp = ring->vr_txdesb; + + for (uint_t i = 0; i < qsz; i++, dp++) { + kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN); + } + kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz); + ring->vr_txdesb = NULL; + } + + if (ring->vr_txiov != NULL) { + kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz); + ring->vr_txiov = NULL; + } +} + +static void +viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie) +{ + vq_pushchain(ring, len, cookie); + + membar_enter(); + if ((*ring->vr_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + viona_intr_ring(ring); + } +} + +void +viona_worker_tx(viona_vring_t *ring, viona_link_t *link) +{ + proc_t *p = ttoproc(curthread); + + (void) thread_vsetname(curthread, "viona_tx_%p", ring); + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + ASSERT3U(ring->vr_state, ==, VRS_RUN); + + mutex_exit(&ring->vr_lock); + + for (;;) { + boolean_t bail = B_FALSE; + boolean_t renew = B_FALSE; + uint_t ntx = 0; + + *ring->vr_used_flags |= VRING_USED_F_NO_NOTIFY; + while (viona_vr_num_avail(ring)) { + viona_tx(link, ring); + + /* + * It is advantageous for throughput to keep this + * transmission loop tight, but periodic breaks to + * check for other events are of value too. + */ + if (ntx++ >= ring->vr_size) + break; + } + *ring->vr_used_flags &= ~VRING_USED_F_NO_NOTIFY; + + VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx); + + /* + * Check for available descriptors on the ring once more in + * case a late addition raced with the NO_NOTIFY flag toggle. + * + * The barrier ensures that visibility of the vr_used_flags + * store does not cross the viona_vr_num_avail() check below. + */ + membar_enter(); + bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); + if (!bail && !renew && viona_vr_num_avail(ring)) { + continue; + } + + if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) { + viona_intr_ring(ring); + } + + mutex_enter(&ring->vr_lock); + + while (!bail && !renew && !viona_vr_num_avail(ring)) { + (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); + bail = VRING_NEED_BAIL(ring, p); + renew = vmm_drv_lease_expired(ring->vr_lease); + } + + if (bail) { + break; + } else if (renew) { + ring->vr_state_flags |= VRSF_RENEW; + /* + * When renewing the lease for the ring, no TX + * frames may be outstanding, as they contain + * references to guest memory. 
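+ * viona_tx_wait_outstanding() blocks, ignoring signals, until
+ * vr_xfer_outstanding drains to zero.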
+ */ + viona_tx_wait_outstanding(ring); + + if (!viona_ring_lease_renew(ring)) { + break; + } + ring->vr_state_flags &= ~VRSF_RENEW; + } + mutex_exit(&ring->vr_lock); + } + + ASSERT(MUTEX_HELD(&ring->vr_lock)); + + ring->vr_state = VRS_STOP; + viona_tx_wait_outstanding(ring); +} + +static void +viona_desb_release(viona_desb_t *dp) +{ + viona_vring_t *ring = dp->d_ring; + uint_t ref; + uint32_t len; + uint16_t cookie; + + ref = atomic_dec_uint_nv(&dp->d_ref); + if (ref > 1) { + return; + } + + /* + * The desb corresponding to this index must be ready for reuse before + * the descriptor is returned to the guest via the 'used' ring. + */ + len = dp->d_len; + cookie = dp->d_cookie; + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + + viona_tx_done(ring, len, cookie); + + mutex_enter(&ring->vr_lock); + if ((--ring->vr_xfer_outstanding) == 0) { + cv_broadcast(&ring->vr_cv); + } + mutex_exit(&ring->vr_lock); +} + +static boolean_t +viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, + mblk_t *mp, uint32_t len) +{ + viona_link_t *link = ring->vr_link; + const struct ether_header *eth; + uint_t eth_len = sizeof (struct ether_header); + ushort_t ftype; + ipha_t *ipha = NULL; + uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ + uint16_t flags = 0; + const uint_t csum_start = hdr->vrh_csum_start; + const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; + + /* + * Validate that the checksum offsets provided by the guest are within + * the bounds of the packet. Additionally, ensure that the checksum + * contents field is within the headers mblk copied by viona_tx(). + */ + if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || + (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } + + /* + * This is guaranteed to be safe thanks to the header copying + * done in viona_tx(). + */ + eth = (const struct ether_header *)mp->b_rptr; + ftype = ntohs(eth->ether_type); + + if (ftype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *veth; + + /* punt on QinQ for now */ + eth_len = sizeof (struct ether_vlan_header); + veth = (const struct ether_vlan_header *)eth; + ftype = ntohs(veth->ether_type); + } + + if (ftype == ETHERTYPE_IP) { + ipha = (ipha_t *)(mp->b_rptr + eth_len); + + ipproto = ipha->ipha_protocol; + } else if (ftype == ETHERTYPE_IPV6) { + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); + + ipproto = ip6h->ip6_nxt; + } + + /* + * We ignore hdr_len because the spec says it can't be + * trusted. Besides, our own stack will determine the header + * boundary. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && + ftype == ETHERTYPE_IP) { + uint16_t *cksump; + uint32_t cksum; + ipaddr_t src = ipha->ipha_src; + ipaddr_t dst = ipha->ipha_dst; + + /* + * Our native IP stack doesn't set the L4 length field + * of the pseudo header when LSO is in play. Other IP + * stacks, e.g. Linux, do include the length field. + * This is a problem because the hardware expects that + * the length field is not set. When it is set it will + * cause an incorrect TCP checksum to be generated. + * The reason this works in Linux is because Linux + * corrects the pseudo-header checksum in the driver + * code. In order to get the correct HW checksum we + * need to assume the guest's IP stack gave us a bogus + * TCP partial checksum and calculate it ourselves. 
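+ * Roughly: seed the checksum with IP_TCP_CSUM_COMP, add the folded
+ * source and destination addresses, and fold the sum back into 16
+ * bits, i.e. the pseudo-header sum with the length term deliberately
+ * omitted.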
+ */ + cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha)); + cksum = IP_TCP_CSUM_COMP; + cksum += (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Since viona is a "legacy device", the data stored + * by the driver will be in the guest's native endian + * format (see sections 2.4.3 and 5.1.6.1 of the + * VIRTIO 1.0 spec for more info). At this time the + * only guests using viona are x86 and we can assume + * little-endian. + */ + lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO); + + /* + * Hardware, like ixgbe, expects the client to request + * IP header checksum offload if it's sending LSO (see + * ixgbe_get_context()). Unfortunately, virtio makes + * no allowances for negotiating IP header checksum + * and HW offload, only TCP checksum. We add the flag + * and zero-out the checksum field. This mirrors the + * behavior of our native IP stack (which does this in + * the interest of HW that expects the field to be + * zero). + */ + flags |= HCK_IPV4_HDRCKSUM; + ipha->ipha_hdr_checksum = 0; + } + + /* + * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure + * HW_LSO, if present, is not lost. + */ + flags |= DB_CKSUMFLAGS(mp); + + /* + * Partial checksum support from the NIC is ideal, since it most + * closely maps to the interface defined by virtio. + */ + if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + /* + * MAC expects these offsets to be relative to the + * start of the L3 header rather than the L2 frame. + */ + flags |= HCK_PARTIALCKSUM; + mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len, + len - eth_len, 0, flags); + return (B_TRUE); + } + + /* + * Without partial checksum support, look to the L3/L4 protocol + * information to see if the NIC can handle it. If not, the + * checksum will need to calculated inline. + */ + if (ftype == ETHERTYPE_IP) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? */ + VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum); + return (B_FALSE); + } else if (ftype == ETHERTYPE_IPV6) { + if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 && + (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { + uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); + *csump = 0; + flags |= HCK_FULLCKSUM; + mac_hcksum_set(mp, 0, 0, 0, 0, flags); + return (B_TRUE); + } + + /* XXX: Implement manual fallback checksumming? 
*/ + VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum6); + return (B_FALSE); + } + + /* Cannot even emulate hcksum for unrecognized protocols */ + VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); + VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); + return (B_FALSE); +} + +static void +viona_tx(viona_link_t *link, viona_vring_t *ring) +{ + struct iovec *iov = ring->vr_txiov; + const uint_t max_segs = ring->vr_size; + uint16_t cookie; + int i, n; + uint32_t len, base_off = 0; + uint32_t min_copy = VIONA_MAX_HDRS_LEN; + mblk_t *mp_head, *mp_tail, *mp; + viona_desb_t *dp = NULL; + mac_client_handle_t link_mch = link->l_mch; + const struct virtio_net_hdr *hdr; + + mp_head = mp_tail = NULL; + + ASSERT(iov != NULL); + + n = vq_popchain(ring, iov, max_segs, &cookie); + if (n == 0) { + VIONA_PROBE1(tx_absent, viona_vring_t *, ring); + VIONA_RING_STAT_INCR(ring, tx_absent); + return; + } else if (n < 0) { + /* + * Any error encountered in vq_popchain has already resulted in + * specific probe and statistic handling. Further action here + * is unnecessary. + */ + return; + } + + /* Grab the header and ensure it is of adequate length */ + hdr = (const struct virtio_net_hdr *)iov[0].iov_base; + len = iov[0].iov_len; + if (len < sizeof (struct virtio_net_hdr)) { + goto drop_fail; + } + + /* Make sure the packet headers are always in the first mblk. */ + if (ring->vr_txdesb != NULL) { + dp = &ring->vr_txdesb[cookie]; + + /* + * If the guest driver is operating properly, each desb slot + * should be available for use when processing a TX descriptor + * from the 'avail' ring. In the case of drivers that reuse a + * descriptor before it has been posted to the 'used' ring, the + * data is simply dropped. + */ + if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { + dp = NULL; + goto drop_fail; + } + + dp->d_cookie = cookie; + mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, + &dp->d_frtn); + + /* Account for the successful desballoc. */ + if (mp_head != NULL) + dp->d_ref++; + } else { + mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); + } + + if (mp_head == NULL) + goto drop_fail; + + mp_tail = mp_head; + + /* + * We always copy enough of the guest data to cover the + * headers. This protects us from TOCTOU attacks and allows + * message block length assumptions to be made in subsequent + * code. In many cases, this means copying more data than + * strictly necessary. That's okay, as it is the larger packets + * (such as LSO) that really benefit from desballoc(). + */ + for (i = 1; i < n; i++) { + const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); + + bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); + mp_head->b_wptr += to_copy; + len += to_copy; + min_copy -= to_copy; + + /* + * We've met the minimum copy requirement. The rest of + * the guest data can be referenced. + */ + if (min_copy == 0) { + /* + * If we copied all contents of this + * descriptor then move onto the next one. + * Otherwise, record how far we are into the + * current descriptor. 
+ */ + if (iov[i].iov_len == to_copy) + i++; + else + base_off = to_copy; + + break; + } + } + + ASSERT3P(mp_head, !=, NULL); + ASSERT3P(mp_tail, !=, NULL); + + for (; i < n; i++) { + uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; + uint32_t chunk = iov[i].iov_len - base_off; + + ASSERT3U(base_off, <, iov[i].iov_len); + ASSERT3U(chunk, >, 0); + + if (dp != NULL) { + mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); + if (mp == NULL) { + goto drop_fail; + } + dp->d_ref++; + } else { + mp = allocb(chunk, BPRI_MED); + if (mp == NULL) { + goto drop_fail; + } + bcopy((uchar_t *)base, mp->b_wptr, chunk); + } + + base_off = 0; + len += chunk; + mp->b_wptr += chunk; + mp_tail->b_cont = mp; + mp_tail = mp; + } + + if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { + /* + * The hook consumer may elect to free the mblk_t and set + * our mblk_t ** to NULL. When using a viona_desb_t + * (dp != NULL), we do not want the corresponding cleanup to + * occur during the viona_hook() call. We instead want to + * reset and recycle dp for future use. To prevent cleanup + * during the viona_hook() call, we take a ref on dp (if being + * used), and release it on success. On failure, the + * freemsgchain() call will release all the refs taken earlier + * in viona_tx() (aside from the initial ref and the one we + * take), and drop_hook will reset dp for reuse. + */ + if (dp != NULL) + dp->d_ref++; + + /* + * Pass &mp instead of &mp_head so we don't lose track of + * mp_head if the hook consumer (i.e. ipf) elects to free mp + * and set mp to NULL. + */ + mp = mp_head; + if (viona_hook(link, ring, &mp, B_TRUE) != 0) { + if (mp != NULL) + freemsgchain(mp); + goto drop_hook; + } + + if (dp != NULL) { + dp->d_ref--; + + /* + * It is possible that the hook(s) accepted the packet, + * but as part of its processing, it issued a pull-up + * which released all references to the desb. In that + * case, go back to acting like the packet is entirely + * copied (which it is). + */ + if (dp->d_ref == 1) { + dp->d_cookie = 0; + dp->d_ref = 0; + dp = NULL; + } + } + } + + /* + * Request hardware checksumming, if necessary. If the guest + * sent an LSO packet then it must have also negotiated and + * requested partial checksum; therefore the LSO logic is + * contained within viona_tx_csum(). + */ + if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && + (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { + if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { + goto drop_fail; + } + } + + if (dp != NULL) { + dp->d_len = len; + mutex_enter(&ring->vr_lock); + ring->vr_xfer_outstanding++; + mutex_exit(&ring->vr_lock); + } else { + /* + * If the data was cloned out of the ring, the descriptors can + * be marked as 'used' now, rather than deferring that action + * until after successful packet transmission. + */ + viona_tx_done(ring, len, cookie); + } + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + smt_begin_unsafe(); + mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); + smt_end_unsafe(); + return; + +drop_fail: + /* + * On the off chance that memory is not available via the desballoc or + * allocb calls, there are few options left besides to fail and drop + * the frame on the floor. + */ + + if (dp != NULL) { + /* + * Take an additional reference on the desb handle (if present) + * so any desballoc-sourced mblks can release their hold on it + * without the handle reaching its final state and executing + * its clean-up logic. 
+ */ + dp->d_ref++; + } + + /* + * Free any already-allocated blocks and sum up the total length of the + * dropped data to be released to the used ring. + */ + freemsgchain(mp_head); + +drop_hook: + len = 0; + for (uint_t i = 0; i < n; i++) { + len += iov[i].iov_len; + } + + if (dp != NULL) { + VERIFY(dp->d_ref == 2); + + /* Clean up the desb handle, releasing the extra hold. */ + dp->d_len = 0; + dp->d_cookie = 0; + dp->d_ref = 0; + } + + VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, + uint16_t, cookie); + viona_tx_done(ring, len, cookie); +} diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h index a26cc00a55..46cc72eb06 100644 --- a/usr/src/uts/i86pc/sys/viona_io.h +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -11,36 +11,53 @@ /* * Copyright 2013 Pluribus Networks Inc. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _VIONA_IO_H_ #define _VIONA_IO_H_ #define VNA_IOC (('V' << 16)|('C' << 8)) -#define VNA_IOC_CREATE (VNA_IOC | 1) -#define VNA_IOC_DELETE (VNA_IOC | 2) -#define VNA_IOC_RX_RING_INIT (VNA_IOC | 3) -#define VNA_IOC_TX_RING_INIT (VNA_IOC | 4) -#define VNA_IOC_RX_RING_RESET (VNA_IOC | 5) -#define VNA_IOC_TX_RING_RESET (VNA_IOC | 6) -#define VNA_IOC_RX_RING_KICK (VNA_IOC | 7) -#define VNA_IOC_TX_RING_KICK (VNA_IOC | 8) -#define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9) -#define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10) -#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) -#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) +#define VNA_IOC_CREATE (VNA_IOC | 0x01) +#define VNA_IOC_DELETE (VNA_IOC | 0x02) + +#define VNA_IOC_RING_INIT (VNA_IOC | 0x10) +#define VNA_IOC_RING_RESET (VNA_IOC | 0x11) +#define VNA_IOC_RING_KICK (VNA_IOC | 0x12) +#define VNA_IOC_RING_SET_MSI (VNA_IOC | 0x13) +#define VNA_IOC_RING_INTR_CLR (VNA_IOC | 0x14) + +#define VNA_IOC_INTR_POLL (VNA_IOC | 0x20) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 0x21) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 0x22) +#define VNA_IOC_SET_NOTIFY_IOP (VNA_IOC | 0x23) typedef struct vioc_create { datalink_id_t c_linkid; - char c_vmname[64]; - size_t c_lomem_size; - size_t c_himem_size; + int c_vmfd; } vioc_create_t; typedef struct vioc_ring_init { + uint16_t ri_index; uint16_t ri_qsize; uint64_t ri_qaddr; } vioc_ring_init_t; +typedef struct vioc_ring_msi { + uint16_t rm_index; + uint64_t rm_addr; + uint64_t rm_msg; +} vioc_ring_msi_t; + +enum viona_vq_id { + VIONA_VQ_RX = 0, + VIONA_VQ_TX = 1, + VIONA_VQ_MAX = 2 +}; + +typedef struct vioc_intr_poll { + uint32_t vip_status[VIONA_VQ_MAX]; +} vioc_intr_poll_t; + + #endif /* _VIONA_IO_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h index 33fefc10ea..856b75e5cc 100644 --- a/usr/src/uts/i86pc/sys/vmm_drv.h +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -17,6 +17,9 @@ #define _VMM_DRV_H_ #ifdef _KERNEL + +#include <sys/file.h> + struct vmm_hold; typedef struct vmm_hold vmm_hold_t; diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile index 4ede5bbd84..dac59c9a45 100644 --- a/usr/src/uts/i86pc/viona/Makefile +++ b/usr/src/uts/i86pc/viona/Makefile @@ -11,7 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. -# Copyright 2017 Joyent, Inc. +# Copyright 2019 Joyent, Inc. # # @@ -27,6 +27,7 @@ OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona +MAPFILE = $(UTSBASE)/i86pc/io/viona/viona.mapfile # # Include common rules. 
@@ -49,8 +50,16 @@ LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
 LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2
 LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2
 
+# needs work
+SMOFF += all_func_returns
+
+ALL_BUILDS = $(ALL_BUILDSONLY64)
+DEF_BUILDS = $(DEF_BUILDSONLY64)
+
 CFLAGS += $(CCVERBOSE)
-LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm
+LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm -Nmisc/neti
+LDFLAGS += -Nmisc/hook
+LDFLAGS += -M $(MAPFILE)
 
 #
 # Default build targets.
diff --git a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 b/usr/src/uts/intel/ipf/ipf.global-objs.debug64
index 663613cee3..846011b4c5 100644
--- a/usr/src/uts/intel/ipf/ipf.global-objs.debug64
+++ b/usr/src/uts/intel/ipf/ipf.global-objs.debug64
@@ -22,13 +22,17 @@
 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
-# Copyright 2013 Joyent, Inc. All rights reserved
+# Copyright 2018 Joyent, Inc. All rights reserved
 #
 fr_availfuncs
 fr_features
 fr_objbytes
 hdrsizes
+hook_viona_in
+hook_viona_in_gz
+hook_viona_out
+hook_viona_out_gz
 hook4_in
 hook4_in_gz
 hook4_loop_in
@@ -58,6 +62,9 @@ ip6exthdr
 ipf_cb_ops
 ipf_dev_info
 ipf_devfiles
+ipf_eth_bcast_addr
+ipf_eth_ipv4_mcast
+ipf_eth_ipv6_mcast
 ipf_kstat_tmp
 ipf_minor
 ipf_ops
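
The viona_desb_release() and viona_tx() logic in the diff above leans on the STREAMS loaned-buffer pattern: every mblk built with desballoc() over guest memory takes a reference on a per-descriptor handle, and the descriptor can only be posted to the 'used' ring once the free routine has dropped the last reference. The sketch below shows that pattern in isolation; the loan_t, loan_block, and loan_release names are illustrative and not part of the patch, and viona's own accounting additionally carries an initial claim reference that this simplification omits.

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/atomic.h>

/*
 * Illustrative handle for guest memory loaned to the MAC layer.  One
 * loan_t covers one TX descriptor chain; l_ref counts the mblks that
 * still point into the guest buffers.
 */
typedef struct loan {
	frtn_t	l_frtn;	/* free routine registered with desballoc() */
	uint_t	l_ref;	/* mblks still referencing the guest buffer */
} loan_t;

static void
loan_release(caddr_t arg)
{
	loan_t *lp = (loan_t *)arg;

	if (atomic_dec_uint_nv(&lp->l_ref) == 0) {
		/* Last referencing mblk was freed; safe to post as 'used'. */
	}
}

static mblk_t *
loan_block(loan_t *lp, uchar_t *buf, size_t len)
{
	mblk_t *mp;

	lp->l_frtn.free_func = loan_release;
	lp->l_frtn.free_arg = (caddr_t)lp;

	mp = desballoc(buf, len, BPRI_MED, &lp->l_frtn);
	if (mp != NULL) {
		atomic_inc_uint(&lp->l_ref);
		mp->b_wptr += len;	/* guest data is already in place */
	}
	return (mp);
}

This is also why the ring teardown path waits on vr_xfer_outstanding: MAC may still hold transmitted mblks that reference guest memory after mac_tx() returns, so the ring cannot be stopped, and the guest cannot safely reuse those buffers, until every loaned block has been freed.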
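For the LSO case, viona_tx_csum() reseeds the TCP checksum field with an IPv4 pseudo-header sum that deliberately omits the L4 length, since the partial-checksum hardware expects the length to be excluded. The arithmetic reduces to the standalone sketch below; the helper name is illustrative, and proto_comp stands in for the driver's IP_TCP_CSUM_COMP seed value.

#include <stdint.h>

/*
 * Illustrative helper (not from the patch): compute the IPv4 pseudo-header
 * seed for a TCP/UDP checksum field, omitting the L4 length, mirroring the
 * folding done in viona_tx_csum().
 */
static uint16_t
pseudo_hdr_seed(uint32_t src, uint32_t dst, uint32_t proto_comp)
{
	uint32_t sum = proto_comp;

	/* Sum the source and destination addresses as 16-bit words. */
	sum += (src >> 16) + (src & 0xFFFF);
	sum += (dst >> 16) + (dst & 0xFFFF);

	/* Fold carries back into the low 16 bits; one fold can carry again. */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);

	return ((uint16_t)sum);
}

The folded seed, not its one's-complement, is what the driver writes into the checksum field; with HCKSUM_INET_PARTIAL the hardware then sums from the negotiated start offset over that seed and inserts the finalized checksum at the stuff offset.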
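The revised viona_io.h above collapses the per-direction ring ioctls into indexed ones (VIONA_VQ_RX / VIONA_VQ_TX) and ties instance creation to an open vmm file descriptor instead of explicit guest memory sizes. A hedged sketch of how a userspace consumer such as bhyve's pci_virtio_viona.c might drive the new interface follows; the /dev/viona path, the include set, and the wrapper itself are assumptions, while the ioctl names and structures are those declared in the header.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <libdladm.h>		/* datalink_id_t; exact include may vary */
#include <sys/viona_io.h>	/* ioctls and structs from this patch */

/*
 * Hypothetical consumer: bind a viona instance to a datalink and an open
 * vmm device fd, then initialize the guest TX ring.
 */
static int
viona_setup_tx(datalink_id_t linkid, int vmfd, uint16_t qsize, uint64_t qaddr)
{
	vioc_create_t create = { .c_linkid = linkid, .c_vmfd = vmfd };
	vioc_ring_init_t ring = {
		.ri_index = VIONA_VQ_TX,
		.ri_qsize = qsize,
		.ri_qaddr = qaddr
	};
	int fd;

	if ((fd = open("/dev/viona", O_RDWR)) < 0) {
		perror("open /dev/viona");
		return (-1);
	}
	if (ioctl(fd, VNA_IOC_CREATE, &create) != 0 ||
	    ioctl(fd, VNA_IOC_RING_INIT, &ring) != 0) {
		perror("viona ioctl");
		(void) close(fd);
		return (-1);
	}
	return (fd);
}

The RX ring would be initialized the same way with ri_index set to VIONA_VQ_RX, and per-ring interrupt plumbing would then go through VNA_IOC_RING_SET_MSI once the guest programs MSI-X.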